aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig104
-rw-r--r--arch/x86/Kconfig.cpu131
-rw-r--r--arch/x86/Kconfig.debug13
-rw-r--r--arch/x86/Makefile_32.cpu6
-rw-r--r--arch/x86/boot/Makefile5
-rw-r--r--arch/x86/boot/compressed/Makefile8
-rw-r--r--arch/x86/boot/cpu.c17
-rw-r--r--arch/x86/boot/edd.c7
-rw-r--r--arch/x86/boot/mkcpustr.c40
-rw-r--r--arch/x86/boot/video-vesa.c2
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/ia32/ia32_signal.c117
-rw-r--r--arch/x86/ia32/ia32entry.S26
-rw-r--r--arch/x86/kernel/Makefile17
-rw-r--r--arch/x86/kernel/acpi/boot.c20
-rw-r--r--arch/x86/kernel/alternative.c2
-rw-r--r--arch/x86/kernel/amd_iommu_init.c4
-rw-r--r--arch/x86/kernel/apic_32.c441
-rw-r--r--arch/x86/kernel/apic_64.c630
-rw-r--r--arch/x86/kernel/cpu/Makefile34
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c88
-rw-r--r--arch/x86/kernel/cpu/amd.c548
-rw-r--r--arch/x86/kernel/cpu/amd_64.c224
-rw-r--r--arch/x86/kernel/cpu/centaur.c4
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c6
-rw-r--r--arch/x86/kernel/cpu/cmpxchg.c72
-rw-r--r--arch/x86/kernel/cpu/common.c1001
-rw-r--r--arch/x86/kernel/cpu/common_64.c763
-rw-r--r--arch/x86/kernel/cpu/cpu.h19
-rw-r--r--arch/x86/kernel/cpu/cyrix.c23
-rw-r--r--arch/x86/kernel/cpu/feature_names.c84
-rw-r--r--arch/x86/kernel/cpu/intel.c364
-rw-r--r--arch/x86/kernel/cpu/intel_64.c95
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c169
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c2
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl32
-rw-r--r--arch/x86/kernel/cpu/powerflags.c20
-rw-r--r--arch/x86/kernel/cpu/transmeta.c32
-rw-r--r--arch/x86/kernel/cpu/umc.c3
-rw-r--r--arch/x86/kernel/doublefault_32.c2
-rw-r--r--arch/x86/kernel/dumpstack_32.c447
-rw-r--r--arch/x86/kernel/dumpstack_64.c573
-rw-r--r--arch/x86/kernel/e820.c28
-rw-r--r--arch/x86/kernel/early-quirks.c48
-rw-r--r--arch/x86/kernel/early_printk.c748
-rw-r--r--arch/x86/kernel/entry_32.S22
-rw-r--r--arch/x86/kernel/entry_64.S15
-rw-r--r--arch/x86/kernel/es7000_32.c (renamed from arch/x86/mach-es7000/es7000plat.c)115
-rw-r--r--arch/x86/kernel/genapic_64.c88
-rw-r--r--arch/x86/kernel/genapic_flat_64.c62
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c159
-rw-r--r--arch/x86/kernel/genx2apic_phys.c154
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c93
-rw-r--r--arch/x86/kernel/head.c1
-rw-r--r--arch/x86/kernel/hpet.c6
-rw-r--r--arch/x86/kernel/i387.c168
-rw-r--r--arch/x86/kernel/i8259.c24
-rw-r--r--arch/x86/kernel/io_apic_32.c47
-rw-r--r--arch/x86/kernel/io_apic_64.c639
-rw-r--r--arch/x86/kernel/irqinit_32.c49
-rw-r--r--arch/x86/kernel/irqinit_64.c43
-rw-r--r--arch/x86/kernel/ldt.c9
-rw-r--r--arch/x86/kernel/microcode.c853
-rw-r--r--arch/x86/kernel/microcode_amd.c435
-rw-r--r--arch/x86/kernel/microcode_core.c508
-rw-r--r--arch/x86/kernel/microcode_intel.c480
-rw-r--r--arch/x86/kernel/mpparse.c2
-rw-r--r--arch/x86/kernel/numaq_32.c7
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c37
-rw-r--r--arch/x86/kernel/paravirt.c29
-rw-r--r--arch/x86/kernel/process.c1
-rw-r--r--arch/x86/kernel/process_32.c43
-rw-r--r--arch/x86/kernel/process_64.c33
-rw-r--r--arch/x86/kernel/ptrace.c44
-rw-r--r--arch/x86/kernel/quirks.c41
-rw-r--r--arch/x86/kernel/setup.c208
-rw-r--r--arch/x86/kernel/sigframe.h14
-rw-r--r--arch/x86/kernel/signal_32.c261
-rw-r--r--arch/x86/kernel/signal_64.c292
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kernel/smpboot.c202
-rw-r--r--arch/x86/kernel/summit_32.c2
-rw-r--r--arch/x86/kernel/time_32.c7
-rw-r--r--arch/x86/kernel/time_64.c23
-rw-r--r--arch/x86/kernel/tlb_32.c8
-rw-r--r--arch/x86/kernel/traps.c (renamed from arch/x86/kernel/traps_32.c)896
-rw-r--r--arch/x86/kernel/traps_64.c1218
-rw-r--r--arch/x86/kernel/vmi_32.c4
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S9
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S9
-rw-r--r--arch/x86/kernel/xsave.c345
-rw-r--r--arch/x86/kvm/vmx.h15
-rw-r--r--arch/x86/lguest/boot.c38
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/usercopy_32.c7
-rw-r--r--arch/x86/mach-default/setup.c15
-rw-r--r--arch/x86/mach-es7000/Makefile5
-rw-r--r--arch/x86/mach-es7000/es7000.h114
-rw-r--r--arch/x86/mach-generic/Makefile1
-rw-r--r--arch/x86/mach-generic/bigsmp.c9
-rw-r--r--arch/x86/mach-generic/es7000.c33
-rw-r--r--arch/x86/mach-generic/numaq.c12
-rw-r--r--arch/x86/mach-generic/summit.c11
-rw-r--r--arch/x86/mm/Makefile6
-rw-r--r--arch/x86/mm/fault.c19
-rw-r--r--arch/x86/mm/gup.c10
-rw-r--r--arch/x86/mm/init_32.c5
-rw-r--r--arch/x86/mm/init_64.c66
-rw-r--r--arch/x86/mm/ioremap.c170
-rw-r--r--arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c)0
-rw-r--r--arch/x86/mm/srat_64.c2
-rw-r--r--arch/x86/oprofile/Makefile2
-rw-r--r--arch/x86/oprofile/nmi_int.c27
-rw-r--r--arch/x86/oprofile/op_model_amd.c543
-rw-r--r--arch/x86/oprofile/op_model_athlon.c190
-rw-r--r--arch/x86/oprofile/op_x86_model.h4
-rw-r--r--arch/x86/pci/acpi.c5
-rw-r--r--arch/x86/pci/fixup.c28
-rw-r--r--arch/x86/pci/i386.c3
-rw-r--r--arch/x86/pci/mmconfig-shared.c12
-rw-r--r--arch/x86/power/cpu_32.c7
-rw-r--r--arch/x86/power/cpu_64.c7
-rw-r--r--arch/x86/xen/Kconfig12
-rw-r--r--arch/x86/xen/Makefile12
-rw-r--r--arch/x86/xen/debugfs.c123
-rw-r--r--arch/x86/xen/debugfs.h10
-rw-r--r--arch/x86/xen/enlighten.c297
-rw-r--r--arch/x86/xen/irq.c143
-rw-r--r--arch/x86/xen/mmu.c314
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/multicalls.c115
-rw-r--r--arch/x86/xen/smp.c245
-rw-r--r--arch/x86/xen/spinlock.c428
-rw-r--r--arch/x86/xen/time.c12
-rw-r--r--arch/x86/xen/xen-asm_32.S2
-rw-r--r--arch/x86/xen/xen-asm_64.S22
-rw-r--r--arch/x86/xen/xen-ops.h8
138 files changed, 11301 insertions, 6548 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0d7cdbbfc1ee..f65c2744d573 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
18### Arch settings 18### Arch settings
19config X86 19config X86
20 def_bool y 20 def_bool y
21 select HAVE_AOUT if X86_32
21 select HAVE_UNSTABLE_SCHED_CLOCK 22 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 23 select HAVE_IDE
23 select HAVE_OPROFILE 24 select HAVE_OPROFILE
@@ -152,9 +153,6 @@ config AUDIT_ARCH
152 bool 153 bool
153 default X86_64 154 default X86_64
154 155
155config ARCH_SUPPORTS_AOUT
156 def_bool y
157
158config ARCH_SUPPORTS_OPTIMIZED_INLINING 156config ARCH_SUPPORTS_OPTIMIZED_INLINING
159 def_bool y 157 def_bool y
160 158
@@ -778,23 +776,45 @@ config X86_REBOOTFIXUPS
778 Say N otherwise. 776 Say N otherwise.
779 777
780config MICROCODE 778config MICROCODE
781 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" 779 tristate "/dev/cpu/microcode - microcode support"
782 select FW_LOADER 780 select FW_LOADER
783 ---help--- 781 ---help---
784 If you say Y here, you will be able to update the microcode on 782 If you say Y here, you will be able to update the microcode on
785 Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, 783 certain Intel and AMD processors. The Intel support is for the
786 Pentium III, Pentium 4, Xeon etc. You will obviously need the 784 IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
787 actual microcode binary data itself which is not shipped with the 785 Pentium 4, Xeon etc. The AMD support is for family 0x10 and
788 Linux kernel. 786 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
787 You will obviously need the actual microcode binary data itself
788 which is not shipped with the Linux kernel.
789 789
790 For latest news and information on obtaining all the required 790 This option selects the general module only, you need to select
791 ingredients for this driver, check: 791 at least one vendor specific module as well.
792 <http://www.urbanmyth.org/microcode/>.
793 792
794 To compile this driver as a module, choose M here: the 793 To compile this driver as a module, choose M here: the
795 module will be called microcode. 794 module will be called microcode.
796 795
797config MICROCODE_OLD_INTERFACE 796config MICROCODE_INTEL
797 bool "Intel microcode patch loading support"
798 depends on MICROCODE
799 default MICROCODE
800 select FW_LOADER
801 --help---
802 This options enables microcode patch loading support for Intel
803 processors.
804
805 For latest news and information on obtaining all the required
806 Intel ingredients for this driver, check:
807 <http://www.urbanmyth.org/microcode/>.
808
809config MICROCODE_AMD
810 bool "AMD microcode patch loading support"
811 depends on MICROCODE
812 select FW_LOADER
813 --help---
814 If you select this option, microcode patch loading support for AMD
815 processors will be enabled.
816
817 config MICROCODE_OLD_INTERFACE
798 def_bool y 818 def_bool y
799 depends on MICROCODE 819 depends on MICROCODE
800 820
@@ -1061,6 +1081,56 @@ config HIGHPTE
1061 low memory. Setting this option will put user-space page table 1081 low memory. Setting this option will put user-space page table
1062 entries in high memory. 1082 entries in high memory.
1063 1083
1084config X86_CHECK_BIOS_CORRUPTION
1085 bool "Check for low memory corruption"
1086 help
1087 Periodically check for memory corruption in low memory, which
1088 is suspected to be caused by BIOS. Even when enabled in the
1089 configuration, it is disabled at runtime. Enable it by
1090 setting "memory_corruption_check=1" on the kernel command
1091 line. By default it scans the low 64k of memory every 60
1092 seconds; see the memory_corruption_check_size and
1093 memory_corruption_check_period parameters in
1094 Documentation/kernel-parameters.txt to adjust this.
1095
1096 When enabled with the default parameters, this option has
1097 almost no overhead, as it reserves a relatively small amount
1098 of memory and scans it infrequently. It both detects corruption
1099 and prevents it from affecting the running system.
1100
1101 It is, however, intended as a diagnostic tool; if repeatable
1102 BIOS-originated corruption always affects the same memory,
1103 you can use memmap= to prevent the kernel from using that
1104 memory.
1105
1106config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1107 bool "Set the default setting of memory_corruption_check"
1108 depends on X86_CHECK_BIOS_CORRUPTION
1109 default y
1110 help
1111 Set whether the default state of memory_corruption_check is
1112 on or off.
1113
1114config X86_RESERVE_LOW_64K
1115 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
1116 default y
1117 help
1118 Reserve the first 64K of physical RAM on BIOSes that are known
1119 to potentially corrupt that memory range. A numbers of BIOSes are
1120 known to utilize this area during suspend/resume, so it must not
1121 be used by the kernel.
1122
1123 Set this to N if you are absolutely sure that you trust the BIOS
1124 to get all its memory reservations and usages right.
1125
1126 If you have doubts about the BIOS (e.g. suspend/resume does not
1127 work or there's kernel crashes after certain hardware hotplug
1128 events) and it's not AMI or Phoenix, then you might want to enable
1129 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
1130 corruption patterns.
1131
1132 Say Y if unsure.
1133
1064config MATH_EMULATION 1134config MATH_EMULATION
1065 bool 1135 bool
1066 prompt "Math emulation" if X86_32 1136 prompt "Math emulation" if X86_32
@@ -1689,6 +1759,14 @@ config DMAR_FLOPPY_WA
1689 workaround will setup a 1:1 mapping for the first 1759 workaround will setup a 1:1 mapping for the first
1690 16M to make floppy (an ISA device) work. 1760 16M to make floppy (an ISA device) work.
1691 1761
1762config INTR_REMAP
1763 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
1764 depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
1765 help
1766 Supports Interrupt remapping for IO-APIC and MSI devices.
1767 To use x2apic mode in the CPU's which support x2APIC enhancements or
1768 to support platforms with CPU's having > 8 bit APIC ID, say Y.
1769
1692source "drivers/pci/pcie/Kconfig" 1770source "drivers/pci/pcie/Kconfig"
1693 1771
1694source "drivers/pci/Kconfig" 1772source "drivers/pci/Kconfig"
@@ -1805,7 +1883,7 @@ config IA32_EMULATION
1805 1883
1806config IA32_AOUT 1884config IA32_AOUT
1807 tristate "IA32 a.out support" 1885 tristate "IA32 a.out support"
1808 depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT 1886 depends on IA32_EMULATION
1809 help 1887 help
1810 Support old a.out binaries in the 32bit emulation. 1888 Support old a.out binaries in the 32bit emulation.
1811 1889
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 60a85768cfcb..0b7c4a3f0651 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -38,8 +38,7 @@ config M386
38 - "Crusoe" for the Transmeta Crusoe series. 38 - "Crusoe" for the Transmeta Crusoe series.
39 - "Efficeon" for the Transmeta Efficeon series. 39 - "Efficeon" for the Transmeta Efficeon series.
40 - "Winchip-C6" for original IDT Winchip. 40 - "Winchip-C6" for original IDT Winchip.
41 - "Winchip-2" for IDT Winchip 2. 41 - "Winchip-2" for IDT Winchips with 3dNow! capabilities.
42 - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
43 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). 42 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
44 - "Geode GX/LX" For AMD Geode GX and LX processors. 43 - "Geode GX/LX" For AMD Geode GX and LX processors.
45 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. 44 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
@@ -194,19 +193,11 @@ config MWINCHIPC6
194 treat this chip as a 586TSC with some extended instructions 193 treat this chip as a 586TSC with some extended instructions
195 and alignment requirements. 194 and alignment requirements.
196 195
197config MWINCHIP2
198 bool "Winchip-2"
199 depends on X86_32
200 help
201 Select this for an IDT Winchip-2. Linux and GCC
202 treat this chip as a 586TSC with some extended instructions
203 and alignment requirements.
204
205config MWINCHIP3D 196config MWINCHIP3D
206 bool "Winchip-2A/Winchip-3" 197 bool "Winchip-2/Winchip-2A/Winchip-3"
207 depends on X86_32 198 depends on X86_32
208 help 199 help
209 Select this for an IDT Winchip-2A or 3. Linux and GCC 200 Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
210 treat this chip as a 586TSC with some extended instructions 201 treat this chip as a 586TSC with some extended instructions
211 and alignment requirements. Also enable out of order memory 202 and alignment requirements. Also enable out of order memory
212 stores for this CPU, which can increase performance of some 203 stores for this CPU, which can increase performance of some
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT
318 int 309 int
319 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC 310 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 311 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 312 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 313 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
323 314
324config X86_XADD 315config X86_XADD
@@ -360,7 +351,7 @@ config X86_POPAD_OK
360 351
361config X86_ALIGNMENT_16 352config X86_ALIGNMENT_16
362 def_bool y 353 def_bool y
363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 354 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
364 355
365config X86_INTEL_USERCOPY 356config X86_INTEL_USERCOPY
366 def_bool y 357 def_bool y
@@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY
368 359
369config X86_USE_PPRO_CHECKSUM 360config X86_USE_PPRO_CHECKSUM
370 def_bool y 361 def_bool y
371 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 362 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
372 363
373config X86_USE_3DNOW 364config X86_USE_3DNOW
374 def_bool y 365 def_bool y
@@ -376,7 +367,7 @@ config X86_USE_3DNOW
376 367
377config X86_OOSTORE 368config X86_OOSTORE
378 def_bool y 369 def_bool y
379 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR 370 depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
380 371
381# 372#
382# P6_NOPs are a relatively minor optimization that require a family >= 373# P6_NOPs are a relatively minor optimization that require a family >=
@@ -396,7 +387,7 @@ config X86_P6_NOP
396 387
397config X86_TSC 388config X86_TSC
398 def_bool y 389 def_bool y
399 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 390 depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
400 391
401config X86_CMPXCHG64 392config X86_CMPXCHG64
402 def_bool y 393 def_bool y
@@ -406,7 +397,7 @@ config X86_CMPXCHG64
406# generates cmov. 397# generates cmov.
407config X86_CMOV 398config X86_CMOV
408 def_bool y 399 def_bool y
409 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) 400 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64)
410 401
411config X86_MINIMUM_CPU_FAMILY 402config X86_MINIMUM_CPU_FAMILY
412 int 403 int
@@ -417,7 +408,109 @@ config X86_MINIMUM_CPU_FAMILY
417 408
418config X86_DEBUGCTLMSR 409config X86_DEBUGCTLMSR
419 def_bool y 410 def_bool y
420 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) 411 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
412
413menuconfig PROCESSOR_SELECT
414 bool "Supported processor vendors" if EMBEDDED
415 help
416 This lets you choose what x86 vendor support code your kernel
417 will include.
418
419config CPU_SUP_INTEL
420 default y
421 bool "Support Intel processors" if PROCESSOR_SELECT
422 help
423 This enables detection, tunings and quirks for Intel processors
424
425 You need this enabled if you want your kernel to run on an
426 Intel CPU. Disabling this option on other types of CPUs
427 makes the kernel a tiny bit smaller. Disabling it on an Intel
428 CPU might render the kernel unbootable.
429
430 If unsure, say N.
431
432config CPU_SUP_CYRIX_32
433 default y
434 bool "Support Cyrix processors" if PROCESSOR_SELECT
435 depends on !64BIT
436 help
437 This enables detection, tunings and quirks for Cyrix processors
438
439 You need this enabled if you want your kernel to run on a
440 Cyrix CPU. Disabling this option on other types of CPUs
441 makes the kernel a tiny bit smaller. Disabling it on a Cyrix
442 CPU might render the kernel unbootable.
443
444 If unsure, say N.
445
446config CPU_SUP_AMD
447 default y
448 bool "Support AMD processors" if PROCESSOR_SELECT
449 help
450 This enables detection, tunings and quirks for AMD processors
451
452 You need this enabled if you want your kernel to run on an
453 AMD CPU. Disabling this option on other types of CPUs
454 makes the kernel a tiny bit smaller. Disabling it on an AMD
455 CPU might render the kernel unbootable.
456
457 If unsure, say N.
458
459config CPU_SUP_CENTAUR_32
460 default y
461 bool "Support Centaur processors" if PROCESSOR_SELECT
462 depends on !64BIT
463 help
464 This enables detection, tunings and quirks for Centaur processors
465
466 You need this enabled if you want your kernel to run on a
467 Centaur CPU. Disabling this option on other types of CPUs
468 makes the kernel a tiny bit smaller. Disabling it on a Centaur
469 CPU might render the kernel unbootable.
470
471 If unsure, say N.
472
473config CPU_SUP_CENTAUR_64
474 default y
475 bool "Support Centaur processors" if PROCESSOR_SELECT
476 depends on 64BIT
477 help
478 This enables detection, tunings and quirks for Centaur processors
479
480 You need this enabled if you want your kernel to run on a
481 Centaur CPU. Disabling this option on other types of CPUs
482 makes the kernel a tiny bit smaller. Disabling it on a Centaur
483 CPU might render the kernel unbootable.
484
485 If unsure, say N.
486
487config CPU_SUP_TRANSMETA_32
488 default y
489 bool "Support Transmeta processors" if PROCESSOR_SELECT
490 depends on !64BIT
491 help
492 This enables detection, tunings and quirks for Transmeta processors
493
494 You need this enabled if you want your kernel to run on a
495 Transmeta CPU. Disabling this option on other types of CPUs
496 makes the kernel a tiny bit smaller. Disabling it on a Transmeta
497 CPU might render the kernel unbootable.
498
499 If unsure, say N.
500
501config CPU_SUP_UMC_32
502 default y
503 bool "Support UMC processors" if PROCESSOR_SELECT
504 depends on !64BIT
505 help
506 This enables detection, tunings and quirks for UMC processors
507
508 You need this enabled if you want your kernel to run on a
509 UMC CPU. Disabling this option on other types of CPUs
510 makes the kernel a tiny bit smaller. Disabling it on a UMC
511 CPU might render the kernel unbootable.
512
513 If unsure, say N.
421 514
422config X86_DS 515config X86_DS
423 bool "Debug Store support" 516 bool "Debug Store support"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 092f019e033a..2a3dfbd5e677 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,19 @@ config EARLY_PRINTK
43 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port"
48 default n
49 depends on EARLY_PRINTK && PCI
50 help
51 Write kernel log output directly into the EHCI debug port.
52
53 This is useful for kernel debugging when your machine crashes very
54 early before the console code is initialized. For normal operation
55 it is not recommended because it looks ugly and doesn't cooperate
56 with klogd/syslogd or the X server. You should normally N here,
57 unless you want to debug such a crash. You need usb debug device.
58
46config DEBUG_STACKOVERFLOW 59config DEBUG_STACKOVERFLOW
47 bool "Check for stack overflows" 60 bool "Check for stack overflows"
48 depends on DEBUG_KERNEL 61 depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b584e919..80177ec052f0 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) 30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
31cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 31cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 32cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 33cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
@@ -45,3 +44,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
45# cpu entries 44# cpu entries
46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 45cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
47 46
47# Bug fix for binutils: this option is required in order to keep
48# binutils from generating NOPL instructions against our will.
49ifneq ($(CONFIG_X86_P6_NOP),y)
50cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
51endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7ee102f9c4f8..cd48c7210016 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
72KBUILD_CFLAGS += $(call cc-option,-m32) 72KBUILD_CFLAGS += $(call cc-option,-m32)
73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
74 74
75$(obj)/zImage: IMAGE_OFFSET := 0x1000
76$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 75$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
77$(obj)/bzImage: IMAGE_OFFSET := 0x100000
78$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ 76$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
79$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 77$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
80$(obj)/bzImage: BUILDFLAGS := -b 78$(obj)/bzImage: BUILDFLAGS := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
117 $(call if_changed,objcopy) 115 $(call if_changed,objcopy)
118 116
119$(obj)/compressed/vmlinux: FORCE 117$(obj)/compressed/vmlinux: FORCE
120 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ 118 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
121 119
122# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 120# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
123FDARGS = 121FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
181 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ 179 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
182 -no-emul-boot -boot-load-size 4 -boot-info-table \ 180 -no-emul-boot -boot-load-size 4 -boot-info-table \
183 $(obj)/isoimage 181 $(obj)/isoimage
182 isohybrid $(obj)/image.iso 2>/dev/null || true
184 rm -rf $(obj)/isoimage 183 rm -rf $(obj)/isoimage
185 184
186zlilo: $(BOOTIMAGE) 185zlilo: $(BOOTIMAGE)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 92fdd35bd93e..1771c804e02f 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
27 $(call if_changed,objcopy) 27 $(call if_changed,objcopy)
28 28
29 29
30ifeq ($(CONFIG_X86_32),y) 30targets += vmlinux.bin.all vmlinux.relocs relocs
31targets += vmlinux.bin.all vmlinux.relocs 31hostprogs-$(CONFIG_X86_32) += relocs
32hostprogs-y := relocs
33 32
34quiet_cmd_relocs = RELOCS $@ 33quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 34 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE 42$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin) 43 $(call if_changed,relocbin)
45 44
45ifeq ($(CONFIG_X86_32),y)
46
46ifdef CONFIG_RELOCATABLE 47ifdef CONFIG_RELOCATABLE
47$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
48 $(call if_changed,gzip) 49 $(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
59LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
60endif 61endif
61 62
62
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
64 $(call if_changed,ld) 64 $(call if_changed,ld)
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 75298fe2edca..6ec6bb6e9957 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -59,17 +59,18 @@ int validate_cpu(void)
59 u32 e = err_flags[i]; 59 u32 e = err_flags[i];
60 60
61 for (j = 0; j < 32; j++) { 61 for (j = 0; j < 32; j++) {
62 int n = (i << 5)+j; 62 if (msg_strs[0] < i ||
63 if (*msg_strs < n) { 63 (msg_strs[0] == i && msg_strs[1] < j)) {
64 /* Skip to the next string */ 64 /* Skip to the next string */
65 do { 65 msg_strs += 2;
66 msg_strs++; 66 while (*msg_strs++)
67 } while (*msg_strs); 67 ;
68 msg_strs++;
69 } 68 }
70 if (e & 1) { 69 if (e & 1) {
71 if (*msg_strs == n && msg_strs[1]) 70 if (msg_strs[0] == i &&
72 printf("%s ", msg_strs+1); 71 msg_strs[1] == j &&
72 msg_strs[2])
73 printf("%s ", msg_strs+2);
73 else 74 else
74 printf("%d:%d ", i, j); 75 printf("%d:%d ", i, j);
75 } 76 }
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index d93cbc6464d0..1aae8f3e5ca1 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41 char *mbrbuf_ptr, *mbrbuf_end; 41 char *mbrbuf_ptr, *mbrbuf_end;
42 u32 buf_base, mbr_base; 42 u32 buf_base, mbr_base;
43 extern char _end[]; 43 extern char _end[];
44 u16 mbr_magic;
44 45
45 sector_size = ei->params.bytes_per_sector; 46 sector_size = ei->params.bytes_per_sector;
46 if (!sector_size) 47 if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
58 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) 59 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
59 return -1; 60 return -1;
60 61
62 memset(mbrbuf_ptr, 0, sector_size);
61 if (read_mbr(devno, mbrbuf_ptr)) 63 if (read_mbr(devno, mbrbuf_ptr))
62 return -1; 64 return -1;
63 65
64 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; 66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
65 return 0; 67 mbr_magic = *(u16 *)&mbrbuf_ptr[510];
68
69 /* check for valid MBR magic */
70 return mbr_magic == 0xAA55 ? 0 : -1;
66} 71}
67 72
68static int get_edd_info(u8 devno, struct edd_info *ei) 73static int get_edd_info(u8 devno, struct edd_info *ei)
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index bbe76953bae9..8ef60f20b371 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,33 +15,33 @@
15 15
16#include <stdio.h> 16#include <stdio.h>
17 17
18#include "../kernel/cpu/feature_names.c" 18#include "../kernel/cpu/capflags.c"
19
20#if NCAPFLAGS > 8
21# error "Need to adjust the boot code handling of CPUID strings"
22#endif
23 19
24int main(void) 20int main(void)
25{ 21{
26 int i; 22 int i, j;
27 const char *str; 23 const char *str;
28 24
29 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] = \n");
30 26
31 for (i = 0; i < NCAPINTS*32; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
32 str = x86_cap_flags[i]; 28 for (j = 0; j < 32; j++) {
33 29 str = x86_cap_flags[i*32+j];
34 if (i == NCAPINTS*32-1) { 30
35 /* The last entry must be unconditional; this 31 if (i == NCAPINTS-1 && j == 31) {
36 also consumes the compiler-added null character */ 32 /* The last entry must be unconditional; this
37 if (!str) 33 also consumes the compiler-added null
38 str = ""; 34 character */
39 printf("\t\"\\x%02x\"\"%s\"\n", i, str); 35 if (!str)
40 } else if (str) { 36 str = "";
41 printf("#if REQUIRED_MASK%d & (1 << %d)\n" 37 printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
42 "\t\"\\x%02x\"\"%s\\0\"\n" 38 i, j, str);
43 "#endif\n", 39 } else if (str) {
44 i >> 5, i & 31, i, str); 40 printf("#if REQUIRED_MASK%d & (1 << %d)\n"
41 "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
42 "#endif\n",
43 i, j, i, j, str);
44 }
45 } 45 }
46 } 46 }
47 printf("\t;\n"); 47 printf("\t;\n");
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 401ad998ad08..1e6fe0214c85 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -224,7 +224,7 @@ static void vesa_store_pm_info(void)
224static void vesa_store_mode_params_graphics(void) 224static void vesa_store_mode_params_graphics(void)
225{ 225{
226 /* Tell the kernel we're in VESA graphics mode */ 226 /* Tell the kernel we're in VESA graphics mode */
227 boot_params.screen_info.orig_video_isVGA = 0x23; 227 boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
228 228
229 /* Mode parameters */ 229 /* Mode parameters */
230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr; 230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index ef9a52005ec9..52d0359719d7 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -213,7 +213,6 @@ CONFIG_M686=y
213# CONFIG_MCRUSOE is not set 213# CONFIG_MCRUSOE is not set
214# CONFIG_MEFFICEON is not set 214# CONFIG_MEFFICEON is not set
215# CONFIG_MWINCHIPC6 is not set 215# CONFIG_MWINCHIPC6 is not set
216# CONFIG_MWINCHIP2 is not set
217# CONFIG_MWINCHIP3D is not set 216# CONFIG_MWINCHIP3D is not set
218# CONFIG_MGEODEGX1 is not set 217# CONFIG_MGEODEGX1 is not set
219# CONFIG_MGEODE_LX is not set 218# CONFIG_MGEODE_LX is not set
@@ -1535,7 +1534,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1535CONFIG_VGA_CONSOLE=y 1534CONFIG_VGA_CONSOLE=y
1536CONFIG_VGACON_SOFT_SCROLLBACK=y 1535CONFIG_VGACON_SOFT_SCROLLBACK=y
1537CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1536CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1538CONFIG_VIDEO_SELECT=y
1539CONFIG_DUMMY_CONSOLE=y 1537CONFIG_DUMMY_CONSOLE=y
1540# CONFIG_FRAMEBUFFER_CONSOLE is not set 1538# CONFIG_FRAMEBUFFER_CONSOLE is not set
1541CONFIG_LOGO=y 1539CONFIG_LOGO=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index e620ea6e2a7a..f0a03d7a7d63 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -210,7 +210,6 @@ CONFIG_X86_PC=y
210# CONFIG_MCRUSOE is not set 210# CONFIG_MCRUSOE is not set
211# CONFIG_MEFFICEON is not set 211# CONFIG_MEFFICEON is not set
212# CONFIG_MWINCHIPC6 is not set 212# CONFIG_MWINCHIPC6 is not set
213# CONFIG_MWINCHIP2 is not set
214# CONFIG_MWINCHIP3D is not set 213# CONFIG_MWINCHIP3D is not set
215# CONFIG_MGEODEGX1 is not set 214# CONFIG_MGEODEGX1 is not set
216# CONFIG_MGEODE_LX is not set 215# CONFIG_MGEODE_LX is not set
@@ -1505,7 +1504,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1505CONFIG_VGA_CONSOLE=y 1504CONFIG_VGA_CONSOLE=y
1506CONFIG_VGACON_SOFT_SCROLLBACK=y 1505CONFIG_VGACON_SOFT_SCROLLBACK=y
1507CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1506CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1508CONFIG_VIDEO_SELECT=y
1509CONFIG_DUMMY_CONSOLE=y 1507CONFIG_DUMMY_CONSOLE=y
1510# CONFIG_FRAMEBUFFER_CONSOLE is not set 1508# CONFIG_FRAMEBUFFER_CONSOLE is not set
1511CONFIG_LOGO=y 1509CONFIG_LOGO=y
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f1a2ac777faf..4bc02b23674b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -179,9 +179,10 @@ struct sigframe
179 u32 pretcode; 179 u32 pretcode;
180 int sig; 180 int sig;
181 struct sigcontext_ia32 sc; 181 struct sigcontext_ia32 sc;
182 struct _fpstate_ia32 fpstate; 182 struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
183 unsigned int extramask[_COMPAT_NSIG_WORDS-1]; 183 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
184 char retcode[8]; 184 char retcode[8];
185 /* fp state follows here */
185}; 186};
186 187
187struct rt_sigframe 188struct rt_sigframe
@@ -192,8 +193,8 @@ struct rt_sigframe
192 u32 puc; 193 u32 puc;
193 compat_siginfo_t info; 194 compat_siginfo_t info;
194 struct ucontext_ia32 uc; 195 struct ucontext_ia32 uc;
195 struct _fpstate_ia32 fpstate;
196 char retcode[8]; 196 char retcode[8];
197 /* fp state follows here */
197}; 198};
198 199
199#define COPY(x) { \ 200#define COPY(x) { \
@@ -215,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
215 unsigned int *peax) 216 unsigned int *peax)
216{ 217{
217 unsigned int tmpflags, gs, oldgs, err = 0; 218 unsigned int tmpflags, gs, oldgs, err = 0;
218 struct _fpstate_ia32 __user *buf; 219 void __user *buf;
219 u32 tmp; 220 u32 tmp;
220 221
221 /* Always make any pending restarted system calls return -EINTR */ 222 /* Always make any pending restarted system calls return -EINTR */
@@ -259,26 +260,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
259 260
260 err |= __get_user(tmp, &sc->fpstate); 261 err |= __get_user(tmp, &sc->fpstate);
261 buf = compat_ptr(tmp); 262 buf = compat_ptr(tmp);
262 if (buf) { 263 err |= restore_i387_xstate_ia32(buf);
263 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
264 goto badframe;
265 err |= restore_i387_ia32(buf);
266 } else {
267 struct task_struct *me = current;
268
269 if (used_math()) {
270 clear_fpu(me);
271 clear_used_math();
272 }
273 }
274 264
275 err |= __get_user(tmp, &sc->ax); 265 err |= __get_user(tmp, &sc->ax);
276 *peax = tmp; 266 *peax = tmp;
277 267
278 return err; 268 return err;
279
280badframe:
281 return 1;
282} 269}
283 270
284asmlinkage long sys32_sigreturn(struct pt_regs *regs) 271asmlinkage long sys32_sigreturn(struct pt_regs *regs)
@@ -350,7 +337,7 @@ badframe:
350 */ 337 */
351 338
352static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, 339static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
353 struct _fpstate_ia32 __user *fpstate, 340 void __user *fpstate,
354 struct pt_regs *regs, unsigned int mask) 341 struct pt_regs *regs, unsigned int mask)
355{ 342{
356 int tmp, err = 0; 343 int tmp, err = 0;
@@ -364,31 +351,28 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
364 savesegment(es, tmp); 351 savesegment(es, tmp);
365 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 352 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
366 353
367 err |= __put_user((u32)regs->di, &sc->di); 354 err |= __put_user(regs->di, &sc->di);
368 err |= __put_user((u32)regs->si, &sc->si); 355 err |= __put_user(regs->si, &sc->si);
369 err |= __put_user((u32)regs->bp, &sc->bp); 356 err |= __put_user(regs->bp, &sc->bp);
370 err |= __put_user((u32)regs->sp, &sc->sp); 357 err |= __put_user(regs->sp, &sc->sp);
371 err |= __put_user((u32)regs->bx, &sc->bx); 358 err |= __put_user(regs->bx, &sc->bx);
372 err |= __put_user((u32)regs->dx, &sc->dx); 359 err |= __put_user(regs->dx, &sc->dx);
373 err |= __put_user((u32)regs->cx, &sc->cx); 360 err |= __put_user(regs->cx, &sc->cx);
374 err |= __put_user((u32)regs->ax, &sc->ax); 361 err |= __put_user(regs->ax, &sc->ax);
375 err |= __put_user((u32)regs->cs, &sc->cs); 362 err |= __put_user(regs->cs, &sc->cs);
376 err |= __put_user((u32)regs->ss, &sc->ss); 363 err |= __put_user(regs->ss, &sc->ss);
377 err |= __put_user(current->thread.trap_no, &sc->trapno); 364 err |= __put_user(current->thread.trap_no, &sc->trapno);
378 err |= __put_user(current->thread.error_code, &sc->err); 365 err |= __put_user(current->thread.error_code, &sc->err);
379 err |= __put_user((u32)regs->ip, &sc->ip); 366 err |= __put_user(regs->ip, &sc->ip);
380 err |= __put_user((u32)regs->flags, &sc->flags); 367 err |= __put_user(regs->flags, &sc->flags);
381 err |= __put_user((u32)regs->sp, &sc->sp_at_signal); 368 err |= __put_user(regs->sp, &sc->sp_at_signal);
382 369
383 tmp = save_i387_ia32(fpstate); 370 tmp = save_i387_xstate_ia32(fpstate);
384 if (tmp < 0) 371 if (tmp < 0)
385 err = -EFAULT; 372 err = -EFAULT;
386 else { 373 else
387 clear_used_math();
388 stts();
389 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
390 &sc->fpstate); 375 &sc->fpstate);
391 }
392 376
393 /* non-iBCS2 extensions.. */ 377 /* non-iBCS2 extensions.. */
394 err |= __put_user(mask, &sc->oldmask); 378 err |= __put_user(mask, &sc->oldmask);
@@ -401,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
401 * Determine which stack to use.. 385 * Determine which stack to use..
402 */ 386 */
403static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 387static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
404 size_t frame_size) 388 size_t frame_size,
389 void **fpstate)
405{ 390{
406 unsigned long sp; 391 unsigned long sp;
407 392
@@ -420,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
420 ka->sa.sa_restorer) 405 ka->sa.sa_restorer)
421 sp = (unsigned long) ka->sa.sa_restorer; 406 sp = (unsigned long) ka->sa.sa_restorer;
422 407
408 if (used_math()) {
409 sp = sp - sig_xstate_ia32_size;
410 *fpstate = (struct _fpstate_ia32 *) sp;
411 }
412
423 sp -= frame_size; 413 sp -= frame_size;
424 /* Align the stack pointer according to the i386 ABI, 414 /* Align the stack pointer according to the i386 ABI,
425 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 415 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
@@ -433,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
433 struct sigframe __user *frame; 423 struct sigframe __user *frame;
434 void __user *restorer; 424 void __user *restorer;
435 int err = 0; 425 int err = 0;
426 void __user *fpstate = NULL;
436 427
437 /* copy_to_user optimizes that into a single 8 byte store */ 428 /* copy_to_user optimizes that into a single 8 byte store */
438 static const struct { 429 static const struct {
@@ -447,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
447 0, 438 0,
448 }; 439 };
449 440
450 frame = get_sigframe(ka, regs, sizeof(*frame)); 441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
451 442
452 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 443 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
453 goto give_sigsegv; 444 return -EFAULT;
454 445
455 err |= __put_user(sig, &frame->sig); 446 if (__put_user(sig, &frame->sig))
456 if (err) 447 return -EFAULT;
457 goto give_sigsegv;
458 448
459 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, 449 if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
460 set->sig[0]); 450 return -EFAULT;
461 if (err)
462 goto give_sigsegv;
463 451
464 if (_COMPAT_NSIG_WORDS > 1) { 452 if (_COMPAT_NSIG_WORDS > 1) {
465 err |= __copy_to_user(frame->extramask, &set->sig[1], 453 if (__copy_to_user(frame->extramask, &set->sig[1],
466 sizeof(frame->extramask)); 454 sizeof(frame->extramask)))
467 if (err) 455 return -EFAULT;
468 goto give_sigsegv;
469 } 456 }
470 457
471 if (ka->sa.sa_flags & SA_RESTORER) { 458 if (ka->sa.sa_flags & SA_RESTORER) {
@@ -486,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
486 */ 473 */
487 err |= __copy_to_user(frame->retcode, &code, 8); 474 err |= __copy_to_user(frame->retcode, &code, 8);
488 if (err) 475 if (err)
489 goto give_sigsegv; 476 return -EFAULT;
490 477
491 /* Set up registers for signal handler */ 478 /* Set up registers for signal handler */
492 regs->sp = (unsigned long) frame; 479 regs->sp = (unsigned long) frame;
@@ -509,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
509#endif 496#endif
510 497
511 return 0; 498 return 0;
512
513give_sigsegv:
514 force_sigsegv(sig, current);
515 return -EFAULT;
516} 499}
517 500
518int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -521,6 +504,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
521 struct rt_sigframe __user *frame; 504 struct rt_sigframe __user *frame;
522 void __user *restorer; 505 void __user *restorer;
523 int err = 0; 506 int err = 0;
507 void __user *fpstate = NULL;
524 508
525 /* __copy_to_user optimizes that into a single 8 byte store */ 509 /* __copy_to_user optimizes that into a single 8 byte store */
526 static const struct { 510 static const struct {
@@ -536,30 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
536 0, 520 0,
537 }; 521 };
538 522
539 frame = get_sigframe(ka, regs, sizeof(*frame)); 523 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
540 524
541 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 525 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
542 goto give_sigsegv; 526 return -EFAULT;
543 527
544 err |= __put_user(sig, &frame->sig); 528 err |= __put_user(sig, &frame->sig);
545 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 529 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
546 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 530 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
547 err |= copy_siginfo_to_user32(&frame->info, info); 531 err |= copy_siginfo_to_user32(&frame->info, info);
548 if (err) 532 if (err)
549 goto give_sigsegv; 533 return -EFAULT;
550 534
551 /* Create the ucontext. */ 535 /* Create the ucontext. */
552 err |= __put_user(0, &frame->uc.uc_flags); 536 if (cpu_has_xsave)
537 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
538 else
539 err |= __put_user(0, &frame->uc.uc_flags);
553 err |= __put_user(0, &frame->uc.uc_link); 540 err |= __put_user(0, &frame->uc.uc_link);
554 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 541 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
555 err |= __put_user(sas_ss_flags(regs->sp), 542 err |= __put_user(sas_ss_flags(regs->sp),
556 &frame->uc.uc_stack.ss_flags); 543 &frame->uc.uc_stack.ss_flags);
557 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 544 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
558 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 545 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
559 regs, set->sig[0]); 546 regs, set->sig[0]);
560 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 547 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
561 if (err) 548 if (err)
562 goto give_sigsegv; 549 return -EFAULT;
563 550
564 if (ka->sa.sa_flags & SA_RESTORER) 551 if (ka->sa.sa_flags & SA_RESTORER)
565 restorer = ka->sa.sa_restorer; 552 restorer = ka->sa.sa_restorer;
@@ -574,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
574 */ 561 */
575 err |= __copy_to_user(frame->retcode, &code, 8); 562 err |= __copy_to_user(frame->retcode, &code, 8);
576 if (err) 563 if (err)
577 goto give_sigsegv; 564 return -EFAULT;
578 565
579 /* Set up registers for signal handler */ 566 /* Set up registers for signal handler */
580 regs->sp = (unsigned long) frame; 567 regs->sp = (unsigned long) frame;
@@ -602,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
602#endif 589#endif
603 590
604 return 0; 591 return 0;
605
606give_sigsegv:
607 force_sigsegv(sig, current);
608 return -EFAULT;
609} 592}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffc1bb4fed7d..eb4314768bf7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -39,11 +39,11 @@
39 .endm 39 .endm
40 40
41 /* clobbers %eax */ 41 /* clobbers %eax */
42 .macro CLEAR_RREGS 42 .macro CLEAR_RREGS _r9=rax
43 xorl %eax,%eax 43 xorl %eax,%eax
44 movq %rax,R11(%rsp) 44 movq %rax,R11(%rsp)
45 movq %rax,R10(%rsp) 45 movq %rax,R10(%rsp)
46 movq %rax,R9(%rsp) 46 movq %\_r9,R9(%rsp)
47 movq %rax,R8(%rsp) 47 movq %rax,R8(%rsp)
48 .endm 48 .endm
49 49
@@ -52,11 +52,10 @@
52 * We don't reload %eax because syscall_trace_enter() returned 52 * We don't reload %eax because syscall_trace_enter() returned
53 * the value it wants us to use in the table lookup. 53 * the value it wants us to use in the table lookup.
54 */ 54 */
55 .macro LOAD_ARGS32 offset 55 .macro LOAD_ARGS32 offset, _r9=0
56 movl \offset(%rsp),%r11d 56 .if \_r9
57 movl \offset+8(%rsp),%r10d
58 movl \offset+16(%rsp),%r9d 57 movl \offset+16(%rsp),%r9d
59 movl \offset+24(%rsp),%r8d 58 .endif
60 movl \offset+40(%rsp),%ecx 59 movl \offset+40(%rsp),%ecx
61 movl \offset+48(%rsp),%edx 60 movl \offset+48(%rsp),%edx
62 movl \offset+56(%rsp),%esi 61 movl \offset+56(%rsp),%esi
@@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target)
145 SAVE_ARGS 0,0,1 144 SAVE_ARGS 0,0,1
146 /* no need to do an access_ok check here because rbp has been 145 /* no need to do an access_ok check here because rbp has been
147 32bit zero extended */ 146 32bit zero extended */
1481: movl (%rbp),%r9d 1471: movl (%rbp),%ebp
149 .section __ex_table,"a" 148 .section __ex_table,"a"
150 .quad 1b,ia32_badarg 149 .quad 1b,ia32_badarg
151 .previous 150 .previous
@@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target)
157 cmpl $(IA32_NR_syscalls-1),%eax 156 cmpl $(IA32_NR_syscalls-1),%eax
158 ja ia32_badsys 157 ja ia32_badsys
159sysenter_do_call: 158sysenter_do_call:
160 IA32_ARG_FIXUP 1 159 IA32_ARG_FIXUP
161sysenter_dispatch: 160sysenter_dispatch:
162 call *ia32_sys_call_table(,%rax,8) 161 call *ia32_sys_call_table(,%rax,8)
163 movq %rax,RAX-ARGOFFSET(%rsp) 162 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -234,20 +233,17 @@ sysexit_audit:
234#endif 233#endif
235 234
236sysenter_tracesys: 235sysenter_tracesys:
237 xchgl %r9d,%ebp
238#ifdef CONFIG_AUDITSYSCALL 236#ifdef CONFIG_AUDITSYSCALL
239 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 237 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
240 jz sysenter_auditsys 238 jz sysenter_auditsys
241#endif 239#endif
242 SAVE_REST 240 SAVE_REST
243 CLEAR_RREGS 241 CLEAR_RREGS
244 movq %r9,R9(%rsp)
245 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 242 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
246 movq %rsp,%rdi /* &pt_regs -> arg1 */ 243 movq %rsp,%rdi /* &pt_regs -> arg1 */
247 call syscall_trace_enter 244 call syscall_trace_enter
248 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 245 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
249 RESTORE_REST 246 RESTORE_REST
250 xchgl %ebp,%r9d
251 cmpl $(IA32_NR_syscalls-1),%eax 247 cmpl $(IA32_NR_syscalls-1),%eax
252 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 248 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
253 jmp sysenter_do_call 249 jmp sysenter_do_call
@@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target)
314 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) 310 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
315 CFI_REMEMBER_STATE 311 CFI_REMEMBER_STATE
316 jnz cstar_tracesys 312 jnz cstar_tracesys
317cstar_do_call:
318 cmpl $IA32_NR_syscalls-1,%eax 313 cmpl $IA32_NR_syscalls-1,%eax
319 ja ia32_badsys 314 ja ia32_badsys
315cstar_do_call:
320 IA32_ARG_FIXUP 1 316 IA32_ARG_FIXUP 1
321cstar_dispatch: 317cstar_dispatch:
322 call *ia32_sys_call_table(,%rax,8) 318 call *ia32_sys_call_table(,%rax,8)
@@ -357,15 +353,13 @@ cstar_tracesys:
357#endif 353#endif
358 xchgl %r9d,%ebp 354 xchgl %r9d,%ebp
359 SAVE_REST 355 SAVE_REST
360 CLEAR_RREGS 356 CLEAR_RREGS r9
361 movq %r9,R9(%rsp)
362 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 357 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
363 movq %rsp,%rdi /* &pt_regs -> arg1 */ 358 movq %rsp,%rdi /* &pt_regs -> arg1 */
364 call syscall_trace_enter 359 call syscall_trace_enter
365 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 360 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
366 RESTORE_REST 361 RESTORE_REST
367 xchgl %ebp,%r9d 362 xchgl %ebp,%r9d
368 movl RSP-ARGOFFSET(%rsp), %r8d
369 cmpl $(IA32_NR_syscalls-1),%eax 363 cmpl $(IA32_NR_syscalls-1),%eax
370 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 364 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
371 jmp cstar_do_call 365 jmp cstar_do_call
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3db651fc8ec5..0d41f0343dc0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
10# Do not profile debug and lowlevel utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14endif 14endif
15 15
16# 16#
@@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp)
23CFLAGS_tsc.o := $(nostackp) 23CFLAGS_tsc.o := $(nostackp)
24 24
25obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 25obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
26obj-y += traps_$(BITS).o irq_$(BITS).o 26obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o
27obj-y += time_$(BITS).o ioport.o ldt.o 27obj-y += time_$(BITS).o ioport.o ldt.o
28obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 28obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
29obj-$(CONFIG_X86_VISWS) += visws_quirks.o 29obj-$(CONFIG_X86_VISWS) += visws_quirks.o
@@ -38,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o
38 38
39obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 39obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
40obj-y += process.o 40obj-y += process.o
41obj-y += i387.o 41obj-y += i387.o xsave.o
42obj-y += ptrace.o 42obj-y += ptrace.o
43obj-y += ds.o 43obj-y += ds.o
44obj-$(CONFIG_X86_32) += tls.o 44obj-$(CONFIG_X86_32) += tls.o
@@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
51obj-$(CONFIG_MCA) += mca_32.o 51obj-$(CONFIG_MCA) += mca_32.o
52obj-$(CONFIG_X86_MSR) += msr.o 52obj-$(CONFIG_X86_MSR) += msr.o
53obj-$(CONFIG_X86_CPUID) += cpuid.o 53obj-$(CONFIG_X86_CPUID) += cpuid.o
54obj-$(CONFIG_MICROCODE) += microcode.o
55obj-$(CONFIG_PCI) += early-quirks.o 54obj-$(CONFIG_PCI) += early-quirks.o
56apm-y := apm_32.o 55apm-y := apm_32.o
57obj-$(CONFIG_APM) += apm.o 56obj-$(CONFIG_APM) += apm.o
@@ -69,6 +68,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 68obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 69obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
71obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 70obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
71obj-$(CONFIG_X86_ES7000) += es7000_32.o
72obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o 72obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
73obj-y += vsmp_64.o 73obj-y += vsmp_64.o
74obj-$(CONFIG_KPROBES) += kprobes.o 74obj-$(CONFIG_KPROBES) += kprobes.o
@@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
90obj-$(CONFIG_KVM_GUEST) += kvm.o 90obj-$(CONFIG_KVM_GUEST) += kvm.o
91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
94 94
95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -99,11 +99,18 @@ scx200-y += scx200_32.o
99 99
100obj-$(CONFIG_OLPC) += olpc.o 100obj-$(CONFIG_OLPC) += olpc.o
101 101
102microcode-y := microcode_core.o
103microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
104microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
105obj-$(CONFIG_MICROCODE) += microcode.o
106
102### 107###
103# 64 bit specific files 108# 64 bit specific files
104ifeq ($(CONFIG_X86_64),y) 109ifeq ($(CONFIG_X86_64),y)
105 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 110 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
106 obj-y += bios_uv.o 111 obj-y += bios_uv.o
112 obj-y += genx2apic_cluster.o
113 obj-y += genx2apic_phys.o
107 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 114 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
108 obj-$(CONFIG_AUDIT) += audit_64.o 115 obj-$(CONFIG_AUDIT) += audit_64.o
109 116
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7d40ef7b36e3..eb875cdc7367 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -252,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
252 return; 252 return;
253 } 253 }
254 254
255#ifdef CONFIG_X86_32
256 if (boot_cpu_physical_apicid != -1U) 255 if (boot_cpu_physical_apicid != -1U)
257 ver = apic_version[boot_cpu_physical_apicid]; 256 ver = apic_version[boot_cpu_physical_apicid];
258#endif
259 257
260 generic_processor_info(id, ver); 258 generic_processor_info(id, ver);
261} 259}
@@ -774,11 +772,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
774 772
775 set_fixmap_nocache(FIX_APIC_BASE, address); 773 set_fixmap_nocache(FIX_APIC_BASE, address);
776 if (boot_cpu_physical_apicid == -1U) { 774 if (boot_cpu_physical_apicid == -1U) {
777 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 775 boot_cpu_physical_apicid = read_apic_id();
778#ifdef CONFIG_X86_32
779 apic_version[boot_cpu_physical_apicid] = 776 apic_version[boot_cpu_physical_apicid] =
780 GET_APIC_VERSION(apic_read(APIC_LVR)); 777 GET_APIC_VERSION(apic_read(APIC_LVR));
781#endif
782 } 778 }
783} 779}
784 780
@@ -1350,7 +1346,9 @@ static void __init acpi_process_madt(void)
1350 acpi_ioapic = 1; 1346 acpi_ioapic = 1;
1351 1347
1352 smp_found_config = 1; 1348 smp_found_config = 1;
1349#ifdef CONFIG_X86_32
1353 setup_apic_routing(); 1350 setup_apic_routing();
1351#endif
1354 } 1352 }
1355 } 1353 }
1356 if (error == -EINVAL) { 1354 if (error == -EINVAL) {
@@ -1420,8 +1418,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
1420 */ 1418 */
1421static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1419static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1422{ 1420{
1423 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); 1421 /*
1424 acpi_skip_timer_override = 1; 1422 * The ati_ixp4x0_rev() early PCI quirk should have set
1423 * the acpi_skip_timer_override flag already:
1424 */
1425 if (!acpi_skip_timer_override) {
1426 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
1427 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1428 d->ident);
1429 acpi_skip_timer_override = 1;
1430 }
1425 return 0; 1431 return 0;
1426} 1432}
1427 1433
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index fb04e49776ba..a84ac7b570e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -444,7 +444,7 @@ void __init alternative_instructions(void)
444 _text, _etext); 444 _text, _etext);
445 445
446 /* Only switch to UP mode if we don't immediately boot others */ 446 /* Only switch to UP mode if we don't immediately boot others */
447 if (num_possible_cpus() == 1 || setup_max_cpus <= 1) 447 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
448 alternatives_smp_switch(0); 448 alternatives_smp_switch(0);
449 } 449 }
450#endif 450#endif
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 148fcfe22f17..4cd8083c58be 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -723,9 +723,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
723 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
724 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
725 725
726 pci_enable_device(iommu->dev); 726 return pci_enable_device(iommu->dev);
727
728 return 0;
729} 727}
730 728
731/* 729/*
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index f88bd0d982b0..21c831d96af3 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -60,10 +60,8 @@ unsigned long mp_lapic_addr;
60static int force_enable_local_apic; 60static int force_enable_local_apic;
61int disable_apic; 61int disable_apic;
62 62
63/* Local APIC timer verification ok */
64static int local_apic_timer_verify_ok;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 63/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 64static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 65/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 66int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 67EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -130,7 +128,11 @@ static inline int lapic_get_version(void)
130 */ 128 */
131static inline int lapic_is_integrated(void) 129static inline int lapic_is_integrated(void)
132{ 130{
131#ifdef CONFIG_X86_64
132 return 1;
133#else
133 return APIC_INTEGRATED(lapic_get_version()); 134 return APIC_INTEGRATED(lapic_get_version());
135#endif
134} 136}
135 137
136/* 138/*
@@ -145,13 +147,18 @@ static int modern_apic(void)
145 return lapic_get_version() >= 0x14; 147 return lapic_get_version() >= 0x14;
146} 148}
147 149
148void apic_wait_icr_idle(void) 150/*
151 * Paravirt kernels also might be using these below ops. So we still
152 * use generic apic_read()/apic_write(), which might be pointing to different
153 * ops in PARAVIRT case.
154 */
155void xapic_wait_icr_idle(void)
149{ 156{
150 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 157 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
151 cpu_relax(); 158 cpu_relax();
152} 159}
153 160
154u32 safe_apic_wait_icr_idle(void) 161u32 safe_xapic_wait_icr_idle(void)
155{ 162{
156 u32 send_status; 163 u32 send_status;
157 int timeout; 164 int timeout;
@@ -167,16 +174,48 @@ u32 safe_apic_wait_icr_idle(void)
167 return send_status; 174 return send_status;
168} 175}
169 176
177void xapic_icr_write(u32 low, u32 id)
178{
179 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
180 apic_write(APIC_ICR, low);
181}
182
183u64 xapic_icr_read(void)
184{
185 u32 icr1, icr2;
186
187 icr2 = apic_read(APIC_ICR2);
188 icr1 = apic_read(APIC_ICR);
189
190 return icr1 | ((u64)icr2 << 32);
191}
192
193static struct apic_ops xapic_ops = {
194 .read = native_apic_mem_read,
195 .write = native_apic_mem_write,
196 .icr_read = xapic_icr_read,
197 .icr_write = xapic_icr_write,
198 .wait_icr_idle = xapic_wait_icr_idle,
199 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
200};
201
202struct apic_ops __read_mostly *apic_ops = &xapic_ops;
203EXPORT_SYMBOL_GPL(apic_ops);
204
170/** 205/**
171 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 206 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
172 */ 207 */
173void __cpuinit enable_NMI_through_LVT0(void) 208void __cpuinit enable_NMI_through_LVT0(void)
174{ 209{
175 unsigned int v = APIC_DM_NMI; 210 unsigned int v;
176 211
177 /* Level triggered for 82489DX */ 212 /* unmask and set to NMI */
213 v = APIC_DM_NMI;
214
215 /* Level triggered for 82489DX (32bit mode) */
178 if (!lapic_is_integrated()) 216 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 217 v |= APIC_LVT_LEVEL_TRIGGER;
218
180 apic_write(APIC_LVT0, v); 219 apic_write(APIC_LVT0, v);
181} 220}
182 221
@@ -193,9 +232,13 @@ int get_physical_broadcast(void)
193 */ 232 */
194int lapic_get_maxlvt(void) 233int lapic_get_maxlvt(void)
195{ 234{
196 unsigned int v = apic_read(APIC_LVR); 235 unsigned int v;
197 236
198 /* 82489DXs do not report # of LVT entries. */ 237 v = apic_read(APIC_LVR);
238 /*
239 * - we always have APIC integrated on 64bit mode
240 * - 82489DXs do not report # of LVT entries
241 */
199 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; 242 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
200} 243}
201 244
@@ -203,8 +246,12 @@ int lapic_get_maxlvt(void)
203 * Local APIC timer 246 * Local APIC timer
204 */ 247 */
205 248
206/* Clock divisor is set to 16 */ 249/* Clock divisor */
250#ifdef CONFG_X86_64
251#define APIC_DIVISOR 1
252#else
207#define APIC_DIVISOR 16 253#define APIC_DIVISOR 16
254#endif
208 255
209/* 256/*
210 * This function sets up the local APIC timer, with a timeout of 257 * This function sets up the local APIC timer, with a timeout of
@@ -212,6 +259,9 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 259 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 260 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 261 * call this function only once, with the real, calibrated value.
262 *
263 * We do reads before writes even if unnecessary, to get around the
264 * P5 APIC double write bug.
215 */ 265 */
216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 266static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
217{ 267{
@@ -233,14 +283,48 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
233 */ 283 */
234 tmp_value = apic_read(APIC_TDCR); 284 tmp_value = apic_read(APIC_TDCR);
235 apic_write(APIC_TDCR, 285 apic_write(APIC_TDCR,
236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 286 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
237 APIC_TDR_DIV_16); 287 APIC_TDR_DIV_16);
238 288
239 if (!oneshot) 289 if (!oneshot)
240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR); 290 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
241} 291}
242 292
243/* 293/*
294 * Setup extended LVT, AMD specific (K8, family 10h)
295 *
296 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
297 * MCE interrupts are supported. Thus MCE offset must be set to 0.
298 *
299 * If mask=1, the LVT entry does not generate interrupts while mask=0
300 * enables the vector. See also the BKDGs.
301 */
302
303#define APIC_EILVT_LVTOFF_MCE 0
304#define APIC_EILVT_LVTOFF_IBS 1
305
306static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
307{
308 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
309 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
310
311 apic_write(reg, v);
312}
313
314u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
315{
316 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
317 return APIC_EILVT_LVTOFF_MCE;
318}
319
320u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
321{
322 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
323 return APIC_EILVT_LVTOFF_IBS;
324}
325EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
326
327/*
244 * Program the next event, relative to now 328 * Program the next event, relative to now
245 */ 329 */
246static int lapic_next_event(unsigned long delta, 330static int lapic_next_event(unsigned long delta,
@@ -259,8 +343,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
259 unsigned long flags; 343 unsigned long flags;
260 unsigned int v; 344 unsigned int v;
261 345
262 /* Lapic used for broadcast ? */ 346 /* Lapic used as dummy for broadcast ? */
263 if (!local_apic_timer_verify_ok) 347 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
264 return; 348 return;
265 349
266 local_irq_save(flags); 350 local_irq_save(flags);
@@ -473,7 +557,7 @@ static int __init calibrate_APIC_clock(void)
473 return -1; 557 return -1;
474 } 558 }
475 559
476 local_apic_timer_verify_ok = 1; 560 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
477 561
478 /* We trust the pm timer based calibration */ 562 /* We trust the pm timer based calibration */
479 if (!pm_referenced) { 563 if (!pm_referenced) {
@@ -507,11 +591,11 @@ static int __init calibrate_APIC_clock(void)
507 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 591 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
508 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 592 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
509 else 593 else
510 local_apic_timer_verify_ok = 0; 594 levt->features |= CLOCK_EVT_FEAT_DUMMY;
511 } else 595 } else
512 local_irq_enable(); 596 local_irq_enable();
513 597
514 if (!local_apic_timer_verify_ok) { 598 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
515 printk(KERN_WARNING 599 printk(KERN_WARNING
516 "APIC timer disabled due to verification failure.\n"); 600 "APIC timer disabled due to verification failure.\n");
517 return -1; 601 return -1;
@@ -533,7 +617,8 @@ void __init setup_boot_APIC_clock(void)
533 * timer as a dummy clock event source on SMP systems, so the 617 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it. 618 * broadcast mechanism is used. On UP systems simply ignore it.
535 */ 619 */
536 if (local_apic_timer_disabled) { 620 if (disable_apic_timer) {
621 printk(KERN_INFO "Disabling APIC timer\n");
537 /* No broadcast on UP ! */ 622 /* No broadcast on UP ! */
538 if (num_possible_cpus() > 1) { 623 if (num_possible_cpus() > 1) {
539 lapic_clockevent.mult = 1; 624 lapic_clockevent.mult = 1;
@@ -602,7 +687,11 @@ static void local_apic_timer_interrupt(void)
602 /* 687 /*
603 * the NMI deadlock-detector uses this. 688 * the NMI deadlock-detector uses this.
604 */ 689 */
690#ifdef CONFIG_X86_64
691 add_pda(apic_timer_irqs, 1);
692#else
605 per_cpu(irq_stat, cpu).apic_timer_irqs++; 693 per_cpu(irq_stat, cpu).apic_timer_irqs++;
694#endif
606 695
607 evt->event_handler(evt); 696 evt->event_handler(evt);
608} 697}
@@ -642,35 +731,6 @@ int setup_profiling_timer(unsigned int multiplier)
642} 731}
643 732
644/* 733/*
645 * Setup extended LVT, AMD specific (K8, family 10h)
646 *
647 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
648 * MCE interrupts are supported. Thus MCE offset must be set to 0.
649 */
650
651#define APIC_EILVT_LVTOFF_MCE 0
652#define APIC_EILVT_LVTOFF_IBS 1
653
654static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
655{
656 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
657 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
658 apic_write(reg, v);
659}
660
661u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
662{
663 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
664 return APIC_EILVT_LVTOFF_MCE;
665}
666
667u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
668{
669 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
670 return APIC_EILVT_LVTOFF_IBS;
671}
672
673/*
674 * Local APIC start and shutdown 734 * Local APIC start and shutdown
675 */ 735 */
676 736
@@ -715,7 +775,7 @@ void clear_local_APIC(void)
715 } 775 }
716 776
717 /* lets not touch this if we didn't frob it */ 777 /* lets not touch this if we didn't frob it */
718#ifdef CONFIG_X86_MCE_P4THERMAL 778#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
719 if (maxlvt >= 5) { 779 if (maxlvt >= 5) {
720 v = apic_read(APIC_LVTTHMR); 780 v = apic_read(APIC_LVTTHMR);
721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 781 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -732,10 +792,6 @@ void clear_local_APIC(void)
732 if (maxlvt >= 4) 792 if (maxlvt >= 4)
733 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 793 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
734 794
735#ifdef CONFIG_X86_MCE_P4THERMAL
736 if (maxlvt >= 5)
737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
738#endif
739 /* Integrated APIC (!82489DX) ? */ 795 /* Integrated APIC (!82489DX) ? */
740 if (lapic_is_integrated()) { 796 if (lapic_is_integrated()) {
741 if (maxlvt > 3) 797 if (maxlvt > 3)
@@ -750,7 +806,7 @@ void clear_local_APIC(void)
750 */ 806 */
751void disable_local_APIC(void) 807void disable_local_APIC(void)
752{ 808{
753 unsigned long value; 809 unsigned int value;
754 810
755 clear_local_APIC(); 811 clear_local_APIC();
756 812
@@ -762,6 +818,7 @@ void disable_local_APIC(void)
762 value &= ~APIC_SPIV_APIC_ENABLED; 818 value &= ~APIC_SPIV_APIC_ENABLED;
763 apic_write(APIC_SPIV, value); 819 apic_write(APIC_SPIV, value);
764 820
821#ifdef CONFIG_X86_32
765 /* 822 /*
766 * When LAPIC was disabled by the BIOS and enabled by the kernel, 823 * When LAPIC was disabled by the BIOS and enabled by the kernel,
767 * restore the disabled state. 824 * restore the disabled state.
@@ -773,6 +830,7 @@ void disable_local_APIC(void)
773 l &= ~MSR_IA32_APICBASE_ENABLE; 830 l &= ~MSR_IA32_APICBASE_ENABLE;
774 wrmsr(MSR_IA32_APICBASE, l, h); 831 wrmsr(MSR_IA32_APICBASE, l, h);
775 } 832 }
833#endif
776} 834}
777 835
778/* 836/*
@@ -789,11 +847,15 @@ void lapic_shutdown(void)
789 return; 847 return;
790 848
791 local_irq_save(flags); 849 local_irq_save(flags);
792 clear_local_APIC();
793 850
794 if (enabled_via_apicbase) 851#ifdef CONFIG_X86_32
852 if (!enabled_via_apicbase)
853 clear_local_APIC();
854 else
855#endif
795 disable_local_APIC(); 856 disable_local_APIC();
796 857
858
797 local_irq_restore(flags); 859 local_irq_restore(flags);
798} 860}
799 861
@@ -838,6 +900,12 @@ int __init verify_local_APIC(void)
838 */ 900 */
839 reg0 = apic_read(APIC_ID); 901 reg0 = apic_read(APIC_ID);
840 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 902 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
903 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
904 reg1 = apic_read(APIC_ID);
905 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
906 apic_write(APIC_ID, reg0);
907 if (reg1 != (reg0 ^ APIC_ID_MASK))
908 return 0;
841 909
842 /* 910 /*
843 * The next two are just to see if we have sane values. 911 * The next two are just to see if we have sane values.
@@ -863,14 +931,15 @@ void __init sync_Arb_IDs(void)
863 */ 931 */
864 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 932 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
865 return; 933 return;
934
866 /* 935 /*
867 * Wait for idle. 936 * Wait for idle.
868 */ 937 */
869 apic_wait_icr_idle(); 938 apic_wait_icr_idle();
870 939
871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 940 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
872 apic_write(APIC_ICR, 941 apic_write(APIC_ICR, APIC_DEST_ALLINC |
873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); 942 APIC_INT_LEVELTRIG | APIC_DM_INIT);
874} 943}
875 944
876/* 945/*
@@ -878,7 +947,7 @@ void __init sync_Arb_IDs(void)
878 */ 947 */
879void __init init_bsp_APIC(void) 948void __init init_bsp_APIC(void)
880{ 949{
881 unsigned long value; 950 unsigned int value;
882 951
883 /* 952 /*
884 * Don't do the setup now if we have a SMP BIOS as the 953 * Don't do the setup now if we have a SMP BIOS as the
@@ -899,11 +968,13 @@ void __init init_bsp_APIC(void)
899 value &= ~APIC_VECTOR_MASK; 968 value &= ~APIC_VECTOR_MASK;
900 value |= APIC_SPIV_APIC_ENABLED; 969 value |= APIC_SPIV_APIC_ENABLED;
901 970
971#ifdef CONFIG_X86_32
902 /* This bit is reserved on P4/Xeon and should be cleared */ 972 /* This bit is reserved on P4/Xeon and should be cleared */
903 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 973 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
904 (boot_cpu_data.x86 == 15)) 974 (boot_cpu_data.x86 == 15))
905 value &= ~APIC_SPIV_FOCUS_DISABLED; 975 value &= ~APIC_SPIV_FOCUS_DISABLED;
906 else 976 else
977#endif
907 value |= APIC_SPIV_FOCUS_DISABLED; 978 value |= APIC_SPIV_FOCUS_DISABLED;
908 value |= SPURIOUS_APIC_VECTOR; 979 value |= SPURIOUS_APIC_VECTOR;
909 apic_write(APIC_SPIV, value); 980 apic_write(APIC_SPIV, value);
@@ -922,6 +993,16 @@ static void __cpuinit lapic_setup_esr(void)
922{ 993{
923 unsigned long oldvalue, value, maxlvt; 994 unsigned long oldvalue, value, maxlvt;
924 if (lapic_is_integrated() && !esr_disable) { 995 if (lapic_is_integrated() && !esr_disable) {
996 if (esr_disable) {
997 /*
998 * Something untraceable is creating bad interrupts on
999 * secondary quads ... for the moment, just leave the
1000 * ESR disabled - we can't do anything useful with the
1001 * errors anyway - mbligh
1002 */
1003 printk(KERN_INFO "Leaving ESR disabled.\n");
1004 return;
1005 }
925 /* !82489DX */ 1006 /* !82489DX */
926 maxlvt = lapic_get_maxlvt(); 1007 maxlvt = lapic_get_maxlvt();
927 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1008 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -942,16 +1023,7 @@ static void __cpuinit lapic_setup_esr(void)
942 "vector: 0x%08lx after: 0x%08lx\n", 1023 "vector: 0x%08lx after: 0x%08lx\n",
943 oldvalue, value); 1024 oldvalue, value);
944 } else { 1025 } else {
945 if (esr_disable) 1026 printk(KERN_INFO "No ESR for 82489DX.\n");
946 /*
947 * Something untraceable is creating bad interrupts on
948 * secondary quads ... for the moment, just leave the
949 * ESR disabled - we can't do anything useful with the
950 * errors anyway - mbligh
951 */
952 printk(KERN_INFO "Leaving ESR disabled.\n");
953 else
954 printk(KERN_INFO "No ESR for 82489DX.\n");
955 } 1027 }
956} 1028}
957 1029
@@ -1089,13 +1161,17 @@ void __cpuinit setup_local_APIC(void)
1089 1161
1090void __cpuinit end_local_APIC_setup(void) 1162void __cpuinit end_local_APIC_setup(void)
1091{ 1163{
1092 unsigned long value;
1093
1094 lapic_setup_esr(); 1164 lapic_setup_esr();
1095 /* Disable the local apic timer */ 1165
1096 value = apic_read(APIC_LVTT); 1166#ifdef CONFIG_X86_32
1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1167 {
1098 apic_write(APIC_LVTT, value); 1168 unsigned int value;
1169 /* Disable the local apic timer */
1170 value = apic_read(APIC_LVTT);
1171 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1172 apic_write(APIC_LVTT, value);
1173 }
1174#endif
1099 1175
1100 setup_apic_nmi_watchdog(NULL); 1176 setup_apic_nmi_watchdog(NULL);
1101 apic_pm_activate(); 1177 apic_pm_activate();
@@ -1205,7 +1281,7 @@ void __init init_apic_mappings(void)
1205 * default configuration (or the MP table is broken). 1281 * default configuration (or the MP table is broken).
1206 */ 1282 */
1207 if (boot_cpu_physical_apicid == -1U) 1283 if (boot_cpu_physical_apicid == -1U)
1208 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1284 boot_cpu_physical_apicid = read_apic_id();
1209 1285
1210} 1286}
1211 1287
@@ -1242,7 +1318,7 @@ int __init APIC_init_uniprocessor(void)
1242 * might be zero if read from MP tables. Get it from LAPIC. 1318 * might be zero if read from MP tables. Get it from LAPIC.
1243 */ 1319 */
1244#ifdef CONFIG_CRASH_DUMP 1320#ifdef CONFIG_CRASH_DUMP
1245 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1321 boot_cpu_physical_apicid = read_apic_id();
1246#endif 1322#endif
1247 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1323 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1248 1324
@@ -1321,59 +1397,12 @@ void smp_error_interrupt(struct pt_regs *regs)
1321 irq_exit(); 1397 irq_exit();
1322} 1398}
1323 1399
1324#ifdef CONFIG_SMP
1325void __init smp_intr_init(void)
1326{
1327 /*
1328 * IRQ0 must be given a fixed assignment and initialized,
1329 * because it's used before the IO-APIC is set up.
1330 */
1331 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1332
1333 /*
1334 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1335 * IPI, driven by wakeup.
1336 */
1337 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1338
1339 /* IPI for invalidation */
1340 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1341
1342 /* IPI for generic function call */
1343 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1344
1345 /* IPI for single call function */
1346 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1347 call_function_single_interrupt);
1348}
1349#endif
1350
1351/*
1352 * Initialize APIC interrupts
1353 */
1354void __init apic_intr_init(void)
1355{
1356#ifdef CONFIG_SMP
1357 smp_intr_init();
1358#endif
1359 /* self generated IPI for local APIC timer */
1360 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1361
1362 /* IPI vectors for APIC spurious and error interrupts */
1363 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1364 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1365
1366 /* thermal monitor LVT interrupt */
1367#ifdef CONFIG_X86_MCE_P4THERMAL
1368 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1369#endif
1370}
1371
1372/** 1400/**
1373 * connect_bsp_APIC - attach the APIC to the interrupt system 1401 * connect_bsp_APIC - attach the APIC to the interrupt system
1374 */ 1402 */
1375void __init connect_bsp_APIC(void) 1403void __init connect_bsp_APIC(void)
1376{ 1404{
1405#ifdef CONFIG_X86_32
1377 if (pic_mode) { 1406 if (pic_mode) {
1378 /* 1407 /*
1379 * Do not trust the local APIC being empty at bootup. 1408 * Do not trust the local APIC being empty at bootup.
@@ -1388,6 +1417,7 @@ void __init connect_bsp_APIC(void)
1388 outb(0x70, 0x22); 1417 outb(0x70, 0x22);
1389 outb(0x01, 0x23); 1418 outb(0x01, 0x23);
1390 } 1419 }
1420#endif
1391 enable_apic_mode(); 1421 enable_apic_mode();
1392} 1422}
1393 1423
@@ -1400,6 +1430,9 @@ void __init connect_bsp_APIC(void)
1400 */ 1430 */
1401void disconnect_bsp_APIC(int virt_wire_setup) 1431void disconnect_bsp_APIC(int virt_wire_setup)
1402{ 1432{
1433 unsigned int value;
1434
1435#ifdef CONFIG_X86_32
1403 if (pic_mode) { 1436 if (pic_mode) {
1404 /* 1437 /*
1405 * Put the board back into PIC mode (has an effect only on 1438 * Put the board back into PIC mode (has an effect only on
@@ -1411,54 +1444,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1411 "entering PIC mode.\n"); 1444 "entering PIC mode.\n");
1412 outb(0x70, 0x22); 1445 outb(0x70, 0x22);
1413 outb(0x00, 0x23); 1446 outb(0x00, 0x23);
1414 } else { 1447 return;
1415 /* Go back to Virtual Wire compatibility mode */ 1448 }
1416 unsigned long value; 1449#endif
1417 1450
1418 /* For the spurious interrupt use vector F, and enable it */ 1451 /* Go back to Virtual Wire compatibility mode */
1419 value = apic_read(APIC_SPIV);
1420 value &= ~APIC_VECTOR_MASK;
1421 value |= APIC_SPIV_APIC_ENABLED;
1422 value |= 0xf;
1423 apic_write(APIC_SPIV, value);
1424 1452
1425 if (!virt_wire_setup) { 1453 /* For the spurious interrupt use vector F, and enable it */
1426 /* 1454 value = apic_read(APIC_SPIV);
1427 * For LVT0 make it edge triggered, active high, 1455 value &= ~APIC_VECTOR_MASK;
1428 * external and enabled 1456 value |= APIC_SPIV_APIC_ENABLED;
1429 */ 1457 value |= 0xf;
1430 value = apic_read(APIC_LVT0); 1458 apic_write(APIC_SPIV, value);
1431 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1432 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1436 apic_write(APIC_LVT0, value);
1437 } else {
1438 /* Disable LVT0 */
1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1440 }
1441 1459
1460 if (!virt_wire_setup) {
1442 /* 1461 /*
1443 * For LVT1 make it edge triggered, active high, nmi and 1462 * For LVT0 make it edge triggered, active high,
1444 * enabled 1463 * external and enabled
1445 */ 1464 */
1446 value = apic_read(APIC_LVT1); 1465 value = apic_read(APIC_LVT0);
1447 value &= ~( 1466 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1448 APIC_MODE_MASK | APIC_SEND_PENDING |
1449 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1467 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1468 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1469 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1470 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1453 apic_write(APIC_LVT1, value); 1471 apic_write(APIC_LVT0, value);
1472 } else {
1473 /* Disable LVT0 */
1474 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1454 } 1475 }
1476
1477 /*
1478 * For LVT1 make it edge triggered, active high,
1479 * nmi and enabled
1480 */
1481 value = apic_read(APIC_LVT1);
1482 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1483 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1484 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1485 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1486 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1487 apic_write(APIC_LVT1, value);
1455} 1488}
1456 1489
1457void __cpuinit generic_processor_info(int apicid, int version) 1490void __cpuinit generic_processor_info(int apicid, int version)
1458{ 1491{
1459 int cpu; 1492 int cpu;
1460 cpumask_t tmp_map; 1493 cpumask_t tmp_map;
1461 physid_mask_t phys_cpu;
1462 1494
1463 /* 1495 /*
1464 * Validate version 1496 * Validate version
@@ -1471,9 +1503,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1471 } 1503 }
1472 apic_version[apicid] = version; 1504 apic_version[apicid] = version;
1473 1505
1474 phys_cpu = apicid_to_cpu_present(apicid);
1475 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1476
1477 if (num_processors >= NR_CPUS) { 1506 if (num_processors >= NR_CPUS) {
1478 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1507 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1479 " Processor ignored.\n", NR_CPUS); 1508 " Processor ignored.\n", NR_CPUS);
@@ -1484,17 +1513,19 @@ void __cpuinit generic_processor_info(int apicid, int version)
1484 cpus_complement(tmp_map, cpu_present_map); 1513 cpus_complement(tmp_map, cpu_present_map);
1485 cpu = first_cpu(tmp_map); 1514 cpu = first_cpu(tmp_map);
1486 1515
1487 if (apicid == boot_cpu_physical_apicid) 1516 physid_set(apicid, phys_cpu_present_map);
1517 if (apicid == boot_cpu_physical_apicid) {
1488 /* 1518 /*
1489 * x86_bios_cpu_apicid is required to have processors listed 1519 * x86_bios_cpu_apicid is required to have processors listed
1490 * in same order as logical cpu numbers. Hence the first 1520 * in same order as logical cpu numbers. Hence the first
1491 * entry is BSP, and so on. 1521 * entry is BSP, and so on.
1492 */ 1522 */
1493 cpu = 0; 1523 cpu = 0;
1494 1524 }
1495 if (apicid > max_physical_apicid) 1525 if (apicid > max_physical_apicid)
1496 max_physical_apicid = apicid; 1526 max_physical_apicid = apicid;
1497 1527
1528#ifdef CONFIG_X86_32
1498 /* 1529 /*
1499 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1530 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1500 * but we need to work other dependencies like SMP_SUSPEND etc 1531 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1514,7 +1545,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1514 def_to_bigsmp = 1; 1545 def_to_bigsmp = 1;
1515 } 1546 }
1516 } 1547 }
1517#ifdef CONFIG_SMP 1548#endif
1549
1550#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1518 /* are we being called early in kernel startup? */ 1551 /* are we being called early in kernel startup? */
1519 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1552 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1520 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1553 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1527,6 +1560,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1527 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1560 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1528 } 1561 }
1529#endif 1562#endif
1563
1530 cpu_set(cpu, cpu_possible_map); 1564 cpu_set(cpu, cpu_possible_map);
1531 cpu_set(cpu, cpu_present_map); 1565 cpu_set(cpu, cpu_present_map);
1532} 1566}
@@ -1537,6 +1571,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
1537#ifdef CONFIG_PM 1571#ifdef CONFIG_PM
1538 1572
1539static struct { 1573static struct {
1574 /*
1575 * 'active' is true if the local APIC was enabled by us and
1576 * not the BIOS; this signifies that we are also responsible
1577 * for disabling it before entering apm/acpi suspend
1578 */
1540 int active; 1579 int active;
1541 /* r/w apic fields */ 1580 /* r/w apic fields */
1542 unsigned int apic_id; 1581 unsigned int apic_id;
@@ -1577,7 +1616,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1577 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1616 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1578 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1617 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1579 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1618 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1580#ifdef CONFIG_X86_MCE_P4THERMAL 1619#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1581 if (maxlvt >= 5) 1620 if (maxlvt >= 5)
1582 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1621 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1583#endif 1622#endif
@@ -1601,16 +1640,23 @@ static int lapic_resume(struct sys_device *dev)
1601 1640
1602 local_irq_save(flags); 1641 local_irq_save(flags);
1603 1642
1604 /* 1643#ifdef CONFIG_X86_64
1605 * Make sure the APICBASE points to the right address 1644 if (x2apic)
1606 * 1645 enable_x2apic();
1607 * FIXME! This will be wrong if we ever support suspend on 1646 else
1608 * SMP! We'll need to do this as part of the CPU restore! 1647#endif
1609 */ 1648 {
1610 rdmsr(MSR_IA32_APICBASE, l, h); 1649 /*
1611 l &= ~MSR_IA32_APICBASE_BASE; 1650 * Make sure the APICBASE points to the right address
1612 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1651 *
1613 wrmsr(MSR_IA32_APICBASE, l, h); 1652 * FIXME! This will be wrong if we ever support suspend on
1653 * SMP! We'll need to do this as part of the CPU restore!
1654 */
1655 rdmsr(MSR_IA32_APICBASE, l, h);
1656 l &= ~MSR_IA32_APICBASE_BASE;
1657 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1658 wrmsr(MSR_IA32_APICBASE, l, h);
1659 }
1614 1660
1615 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1661 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1616 apic_write(APIC_ID, apic_pm_state.apic_id); 1662 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1620,7 +1666,7 @@ static int lapic_resume(struct sys_device *dev)
1620 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1666 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1621 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1667 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1622 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1668 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1623#ifdef CONFIG_X86_MCE_P4THERMAL 1669#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1624 if (maxlvt >= 5) 1670 if (maxlvt >= 5)
1625 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1671 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1626#endif 1672#endif
@@ -1634,7 +1680,9 @@ static int lapic_resume(struct sys_device *dev)
1634 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1680 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1635 apic_write(APIC_ESR, 0); 1681 apic_write(APIC_ESR, 0);
1636 apic_read(APIC_ESR); 1682 apic_read(APIC_ESR);
1683
1637 local_irq_restore(flags); 1684 local_irq_restore(flags);
1685
1638 return 0; 1686 return 0;
1639} 1687}
1640 1688
@@ -1690,20 +1738,20 @@ static int __init parse_lapic(char *arg)
1690} 1738}
1691early_param("lapic", parse_lapic); 1739early_param("lapic", parse_lapic);
1692 1740
1693static int __init parse_nolapic(char *arg) 1741static int __init setup_disableapic(char *arg)
1694{ 1742{
1695 disable_apic = 1; 1743 disable_apic = 1;
1696 setup_clear_cpu_cap(X86_FEATURE_APIC); 1744 setup_clear_cpu_cap(X86_FEATURE_APIC);
1697 return 0; 1745 return 0;
1698} 1746}
1699early_param("nolapic", parse_nolapic); 1747early_param("disableapic", setup_disableapic);
1700 1748
1701static int __init parse_disable_lapic_timer(char *arg) 1749/* same as disableapic, for compatibility */
1750static int __init setup_nolapic(char *arg)
1702{ 1751{
1703 local_apic_timer_disabled = 1; 1752 return setup_disableapic(arg);
1704 return 0;
1705} 1753}
1706early_param("nolapic_timer", parse_disable_lapic_timer); 1754early_param("nolapic", setup_nolapic);
1707 1755
1708static int __init parse_lapic_timer_c2_ok(char *arg) 1756static int __init parse_lapic_timer_c2_ok(char *arg)
1709{ 1757{
@@ -1712,15 +1760,40 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1712} 1760}
1713early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1761early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1714 1762
1763static int __init parse_disable_apic_timer(char *arg)
1764{
1765 disable_apic_timer = 1;
1766 return 0;
1767}
1768early_param("noapictimer", parse_disable_apic_timer);
1769
1770static int __init parse_nolapic_timer(char *arg)
1771{
1772 disable_apic_timer = 1;
1773 return 0;
1774}
1775early_param("nolapic_timer", parse_nolapic_timer);
1776
1715static int __init apic_set_verbosity(char *arg) 1777static int __init apic_set_verbosity(char *arg)
1716{ 1778{
1717 if (!arg) 1779 if (!arg) {
1780#ifdef CONFIG_X86_64
1781 skip_ioapic_setup = 0;
1782 ioapic_force = 1;
1783 return 0;
1784#endif
1718 return -EINVAL; 1785 return -EINVAL;
1786 }
1719 1787
1720 if (strcmp(arg, "debug") == 0) 1788 if (strcmp("debug", arg) == 0)
1721 apic_verbosity = APIC_DEBUG; 1789 apic_verbosity = APIC_DEBUG;
1722 else if (strcmp(arg, "verbose") == 0) 1790 else if (strcmp("verbose", arg) == 0)
1723 apic_verbosity = APIC_VERBOSE; 1791 apic_verbosity = APIC_VERBOSE;
1792 else {
1793 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1794 " use apic=verbose or apic=debug\n", arg);
1795 return -EINVAL;
1796 }
1724 1797
1725 return 0; 1798 return 0;
1726} 1799}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 446c062e831c..94ddb69ae15e 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -27,6 +27,7 @@
27#include <linux/clockchips.h> 27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h> 28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/dmar.h>
30 31
31#include <asm/atomic.h> 32#include <asm/atomic.h>
32#include <asm/smp.h> 33#include <asm/smp.h>
@@ -39,13 +40,20 @@
39#include <asm/proto.h> 40#include <asm/proto.h>
40#include <asm/timex.h> 41#include <asm/timex.h>
41#include <asm/apic.h> 42#include <asm/apic.h>
43#include <asm/i8259.h>
42 44
43#include <mach_ipi.h> 45#include <mach_ipi.h>
44#include <mach_apic.h> 46#include <mach_apic.h>
45 47
48/* Disable local APIC timer from the kernel commandline or via dmi quirk */
46static int disable_apic_timer __cpuinitdata; 49static int disable_apic_timer __cpuinitdata;
47static int apic_calibrate_pmtmr __initdata; 50static int apic_calibrate_pmtmr __initdata;
48int disable_apic; 51int disable_apic;
52int disable_x2apic;
53int x2apic;
54
55/* x2apic enabled before OS handover */
56int x2apic_preenabled;
49 57
50/* Local APIC timer works in C2 */ 58/* Local APIC timer works in C2 */
51int local_apic_timer_c2_ok; 59int local_apic_timer_c2_ok;
@@ -73,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
73static void lapic_timer_broadcast(cpumask_t mask); 81static void lapic_timer_broadcast(cpumask_t mask);
74static void apic_pm_activate(void); 82static void apic_pm_activate(void);
75 83
84/*
85 * The local apic timer can be used for any function which is CPU local.
86 */
76static struct clock_event_device lapic_clockevent = { 87static struct clock_event_device lapic_clockevent = {
77 .name = "lapic", 88 .name = "lapic",
78 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT 89 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -99,11 +110,15 @@ static inline int lapic_get_version(void)
99} 110}
100 111
101/* 112/*
102 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
103 */ 114 */
104static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
105{ 116{
117#ifdef CONFIG_X86_64
106 return 1; 118 return 1;
119#else
120 return APIC_INTEGRATED(lapic_get_version());
121#endif
107} 122}
108 123
109/* 124/*
@@ -118,13 +133,18 @@ static int modern_apic(void)
118 return lapic_get_version() >= 0x14; 133 return lapic_get_version() >= 0x14;
119} 134}
120 135
121void apic_wait_icr_idle(void) 136/*
137 * Paravirt kernels also might be using these below ops. So we still
138 * use generic apic_read()/apic_write(), which might be pointing to different
139 * ops in PARAVIRT case.
140 */
141void xapic_wait_icr_idle(void)
122{ 142{
123 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 143 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
124 cpu_relax(); 144 cpu_relax();
125} 145}
126 146
127u32 safe_apic_wait_icr_idle(void) 147u32 safe_xapic_wait_icr_idle(void)
128{ 148{
129 u32 send_status; 149 u32 send_status;
130 int timeout; 150 int timeout;
@@ -140,6 +160,68 @@ u32 safe_apic_wait_icr_idle(void)
140 return send_status; 160 return send_status;
141} 161}
142 162
163void xapic_icr_write(u32 low, u32 id)
164{
165 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
166 apic_write(APIC_ICR, low);
167}
168
169u64 xapic_icr_read(void)
170{
171 u32 icr1, icr2;
172
173 icr2 = apic_read(APIC_ICR2);
174 icr1 = apic_read(APIC_ICR);
175
176 return icr1 | ((u64)icr2 << 32);
177}
178
179static struct apic_ops xapic_ops = {
180 .read = native_apic_mem_read,
181 .write = native_apic_mem_write,
182 .icr_read = xapic_icr_read,
183 .icr_write = xapic_icr_write,
184 .wait_icr_idle = xapic_wait_icr_idle,
185 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
186};
187
188struct apic_ops __read_mostly *apic_ops = &xapic_ops;
189EXPORT_SYMBOL_GPL(apic_ops);
190
191static void x2apic_wait_icr_idle(void)
192{
193 /* no need to wait for icr idle in x2apic */
194 return;
195}
196
197static u32 safe_x2apic_wait_icr_idle(void)
198{
199 /* no need to wait for icr idle in x2apic */
200 return 0;
201}
202
203void x2apic_icr_write(u32 low, u32 id)
204{
205 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
206}
207
208u64 x2apic_icr_read(void)
209{
210 unsigned long val;
211
212 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
213 return val;
214}
215
216static struct apic_ops x2apic_ops = {
217 .read = native_apic_msr_read,
218 .write = native_apic_msr_write,
219 .icr_read = x2apic_icr_read,
220 .icr_write = x2apic_icr_write,
221 .wait_icr_idle = x2apic_wait_icr_idle,
222 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
223};
224
143/** 225/**
144 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 226 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
145 */ 227 */
@@ -149,6 +231,11 @@ void __cpuinit enable_NMI_through_LVT0(void)
149 231
150 /* unmask and set to NMI */ 232 /* unmask and set to NMI */
151 v = APIC_DM_NMI; 233 v = APIC_DM_NMI;
234
235 /* Level triggered for 82489DX (32bit mode) */
236 if (!lapic_is_integrated())
237 v |= APIC_LVT_LEVEL_TRIGGER;
238
152 apic_write(APIC_LVT0, v); 239 apic_write(APIC_LVT0, v);
153} 240}
154 241
@@ -157,14 +244,28 @@ void __cpuinit enable_NMI_through_LVT0(void)
157 */ 244 */
158int lapic_get_maxlvt(void) 245int lapic_get_maxlvt(void)
159{ 246{
160 unsigned int v, maxlvt; 247 unsigned int v;
161 248
162 v = apic_read(APIC_LVR); 249 v = apic_read(APIC_LVR);
163 maxlvt = GET_APIC_MAXLVT(v); 250 /*
164 return maxlvt; 251 * - we always have APIC integrated on 64bit mode
252 * - 82489DXs do not report # of LVT entries
253 */
254 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
165} 255}
166 256
167/* 257/*
258 * Local APIC timer
259 */
260
261/* Clock divisor */
262#ifdef CONFG_X86_64
263#define APIC_DIVISOR 1
264#else
265#define APIC_DIVISOR 16
266#endif
267
268/*
168 * This function sets up the local APIC timer, with a timeout of 269 * This function sets up the local APIC timer, with a timeout of
169 * 'clocks' APIC bus clock. During calibration we actually call 270 * 'clocks' APIC bus clock. During calibration we actually call
170 * this function twice on the boot CPU, once with a bogus timeout 271 * this function twice on the boot CPU, once with a bogus timeout
@@ -174,7 +275,6 @@ int lapic_get_maxlvt(void)
174 * We do reads before writes even if unnecessary, to get around the 275 * We do reads before writes even if unnecessary, to get around the
175 * P5 APIC double write bug. 276 * P5 APIC double write bug.
176 */ 277 */
177
178static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 278static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
179{ 279{
180 unsigned int lvtt_value, tmp_value; 280 unsigned int lvtt_value, tmp_value;
@@ -182,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
182 lvtt_value = LOCAL_TIMER_VECTOR; 282 lvtt_value = LOCAL_TIMER_VECTOR;
183 if (!oneshot) 283 if (!oneshot)
184 lvtt_value |= APIC_LVT_TIMER_PERIODIC; 284 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
285 if (!lapic_is_integrated())
286 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
287
185 if (!irqen) 288 if (!irqen)
186 lvtt_value |= APIC_LVT_MASKED; 289 lvtt_value |= APIC_LVT_MASKED;
187 290
@@ -191,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
191 * Divide PICLK by 16 294 * Divide PICLK by 16
192 */ 295 */
193 tmp_value = apic_read(APIC_TDCR); 296 tmp_value = apic_read(APIC_TDCR);
194 apic_write(APIC_TDCR, (tmp_value 297 apic_write(APIC_TDCR,
195 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 298 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
196 | APIC_TDR_DIV_16); 299 APIC_TDR_DIV_16);
197 300
198 if (!oneshot) 301 if (!oneshot)
199 apic_write(APIC_TMICT, clocks); 302 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
200} 303}
201 304
202/* 305/*
@@ -204,6 +307,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
204 * 307 *
205 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and 308 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
206 * MCE interrupts are supported. Thus MCE offset must be set to 0. 309 * MCE interrupts are supported. Thus MCE offset must be set to 0.
310 *
311 * If mask=1, the LVT entry does not generate interrupts while mask=0
312 * enables the vector. See also the BKDGs.
207 */ 313 */
208 314
209#define APIC_EILVT_LVTOFF_MCE 0 315#define APIC_EILVT_LVTOFF_MCE 0
@@ -228,6 +334,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
228 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); 334 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
229 return APIC_EILVT_LVTOFF_IBS; 335 return APIC_EILVT_LVTOFF_IBS;
230} 336}
337EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
231 338
232/* 339/*
233 * Program the next event, relative to now 340 * Program the next event, relative to now
@@ -366,7 +473,7 @@ static int __init calibrate_APIC_clock(void)
366 lapic_clockevent.min_delta_ns = 473 lapic_clockevent.min_delta_ns =
367 clockevent_delta2ns(0xF, &lapic_clockevent); 474 clockevent_delta2ns(0xF, &lapic_clockevent);
368 475
369 calibration_result = result / HZ; 476 calibration_result = (result * APIC_DIVISOR) / HZ;
370 477
371 /* 478 /*
372 * Do a sanity check on the APIC calibration result 479 * Do a sanity check on the APIC calibration result
@@ -388,10 +495,10 @@ static int __init calibrate_APIC_clock(void)
388void __init setup_boot_APIC_clock(void) 495void __init setup_boot_APIC_clock(void)
389{ 496{
390 /* 497 /*
391 * The local apic timer can be disabled via the kernel commandline. 498 * The local apic timer can be disabled via the kernel
392 * Register the lapic timer as a dummy clock event source on SMP 499 * commandline or from the CPU detection code. Register the lapic
393 * systems, so the broadcast mechanism is used. On UP systems simply 500 * timer as a dummy clock event source on SMP systems, so the
394 * ignore it. 501 * broadcast mechanism is used. On UP systems simply ignore it.
395 */ 502 */
396 if (disable_apic_timer) { 503 if (disable_apic_timer) {
397 printk(KERN_INFO "Disabling APIC timer\n"); 504 printk(KERN_INFO "Disabling APIC timer\n");
@@ -403,7 +510,9 @@ void __init setup_boot_APIC_clock(void)
403 return; 510 return;
404 } 511 }
405 512
406 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 513 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
514 "calibrating APIC timer ...\n");
515
407 if (calibrate_APIC_clock()) { 516 if (calibrate_APIC_clock()) {
408 /* No broadcast on UP ! */ 517 /* No broadcast on UP ! */
409 if (num_possible_cpus() > 1) 518 if (num_possible_cpus() > 1)
@@ -422,6 +531,7 @@ void __init setup_boot_APIC_clock(void)
422 printk(KERN_WARNING "APIC timer registered as dummy," 531 printk(KERN_WARNING "APIC timer registered as dummy,"
423 " due to nmi_watchdog=%d!\n", nmi_watchdog); 532 " due to nmi_watchdog=%d!\n", nmi_watchdog);
424 533
534 /* Setup the lapic or request the broadcast */
425 setup_APIC_timer(); 535 setup_APIC_timer();
426} 536}
427 537
@@ -460,7 +570,11 @@ static void local_apic_timer_interrupt(void)
460 /* 570 /*
461 * the NMI deadlock-detector uses this. 571 * the NMI deadlock-detector uses this.
462 */ 572 */
573#ifdef CONFIG_X86_64
463 add_pda(apic_timer_irqs, 1); 574 add_pda(apic_timer_irqs, 1);
575#else
576 per_cpu(irq_stat, cpu).apic_timer_irqs++;
577#endif
464 578
465 evt->event_handler(evt); 579 evt->event_handler(evt);
466} 580}
@@ -491,6 +605,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
491 irq_enter(); 605 irq_enter();
492 local_apic_timer_interrupt(); 606 local_apic_timer_interrupt();
493 irq_exit(); 607 irq_exit();
608
494 set_irq_regs(old_regs); 609 set_irq_regs(old_regs);
495} 610}
496 611
@@ -544,6 +659,13 @@ void clear_local_APIC(void)
544 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); 659 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
545 } 660 }
546 661
662 /* lets not touch this if we didn't frob it */
663#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
664 if (maxlvt >= 5) {
665 v = apic_read(APIC_LVTTHMR);
666 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
667 }
668#endif
547 /* 669 /*
548 * Clean APIC state for other OSs: 670 * Clean APIC state for other OSs:
549 */ 671 */
@@ -554,8 +676,14 @@ void clear_local_APIC(void)
554 apic_write(APIC_LVTERR, APIC_LVT_MASKED); 676 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
555 if (maxlvt >= 4) 677 if (maxlvt >= 4)
556 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 678 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
557 apic_write(APIC_ESR, 0); 679
558 apic_read(APIC_ESR); 680 /* Integrated APIC (!82489DX) ? */
681 if (lapic_is_integrated()) {
682 if (maxlvt > 3)
683 /* Clear ESR due to Pentium errata 3AP and 11AP */
684 apic_write(APIC_ESR, 0);
685 apic_read(APIC_ESR);
686 }
559} 687}
560 688
561/** 689/**
@@ -574,8 +702,28 @@ void disable_local_APIC(void)
574 value = apic_read(APIC_SPIV); 702 value = apic_read(APIC_SPIV);
575 value &= ~APIC_SPIV_APIC_ENABLED; 703 value &= ~APIC_SPIV_APIC_ENABLED;
576 apic_write(APIC_SPIV, value); 704 apic_write(APIC_SPIV, value);
705
706#ifdef CONFIG_X86_32
707 /*
708 * When LAPIC was disabled by the BIOS and enabled by the kernel,
709 * restore the disabled state.
710 */
711 if (enabled_via_apicbase) {
712 unsigned int l, h;
713
714 rdmsr(MSR_IA32_APICBASE, l, h);
715 l &= ~MSR_IA32_APICBASE_ENABLE;
716 wrmsr(MSR_IA32_APICBASE, l, h);
717 }
718#endif
577} 719}
578 720
721/*
722 * If Linux enabled the LAPIC against the BIOS default disable it down before
723 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
724 * not power-off. Additionally clear all LVT entries before disable_local_APIC
725 * for the case where Linux didn't enable the LAPIC.
726 */
579void lapic_shutdown(void) 727void lapic_shutdown(void)
580{ 728{
581 unsigned long flags; 729 unsigned long flags;
@@ -585,7 +733,13 @@ void lapic_shutdown(void)
585 733
586 local_irq_save(flags); 734 local_irq_save(flags);
587 735
588 disable_local_APIC(); 736#ifdef CONFIG_X86_32
737 if (!enabled_via_apicbase)
738 clear_local_APIC();
739 else
740#endif
741 disable_local_APIC();
742
589 743
590 local_irq_restore(flags); 744 local_irq_restore(flags);
591} 745}
@@ -629,10 +783,10 @@ int __init verify_local_APIC(void)
629 /* 783 /*
630 * The ID register is read/write in a real APIC. 784 * The ID register is read/write in a real APIC.
631 */ 785 */
632 reg0 = read_apic_id(); 786 reg0 = apic_read(APIC_ID);
633 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 787 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
634 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); 788 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
635 reg1 = read_apic_id(); 789 reg1 = apic_read(APIC_ID);
636 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); 790 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
637 apic_write(APIC_ID, reg0); 791 apic_write(APIC_ID, reg0);
638 if (reg1 != (reg0 ^ APIC_ID_MASK)) 792 if (reg1 != (reg0 ^ APIC_ID_MASK))
@@ -656,8 +810,11 @@ int __init verify_local_APIC(void)
656 */ 810 */
657void __init sync_Arb_IDs(void) 811void __init sync_Arb_IDs(void)
658{ 812{
659 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 813 /*
660 if (modern_apic()) 814 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
815 * needed on AMD.
816 */
817 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
661 return; 818 return;
662 819
663 /* 820 /*
@@ -666,8 +823,8 @@ void __init sync_Arb_IDs(void)
666 apic_wait_icr_idle(); 823 apic_wait_icr_idle();
667 824
668 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 825 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
669 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 826 apic_write(APIC_ICR, APIC_DEST_ALLINC |
670 | APIC_DM_INIT); 827 APIC_INT_LEVELTRIG | APIC_DM_INIT);
671} 828}
672 829
673/* 830/*
@@ -684,8 +841,6 @@ void __init init_bsp_APIC(void)
684 if (smp_found_config || !cpu_has_apic) 841 if (smp_found_config || !cpu_has_apic)
685 return; 842 return;
686 843
687 value = apic_read(APIC_LVR);
688
689 /* 844 /*
690 * Do not trust the local APIC being empty at bootup. 845 * Do not trust the local APIC being empty at bootup.
691 */ 846 */
@@ -697,7 +852,15 @@ void __init init_bsp_APIC(void)
697 value = apic_read(APIC_SPIV); 852 value = apic_read(APIC_SPIV);
698 value &= ~APIC_VECTOR_MASK; 853 value &= ~APIC_VECTOR_MASK;
699 value |= APIC_SPIV_APIC_ENABLED; 854 value |= APIC_SPIV_APIC_ENABLED;
700 value |= APIC_SPIV_FOCUS_DISABLED; 855
856#ifdef CONFIG_X86_32
857 /* This bit is reserved on P4/Xeon and should be cleared */
858 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
859 (boot_cpu_data.x86 == 15))
860 value &= ~APIC_SPIV_FOCUS_DISABLED;
861 else
862#endif
863 value |= APIC_SPIV_FOCUS_DISABLED;
701 value |= SPURIOUS_APIC_VECTOR; 864 value |= SPURIOUS_APIC_VECTOR;
702 apic_write(APIC_SPIV, value); 865 apic_write(APIC_SPIV, value);
703 866
@@ -706,9 +869,50 @@ void __init init_bsp_APIC(void)
706 */ 869 */
707 apic_write(APIC_LVT0, APIC_DM_EXTINT); 870 apic_write(APIC_LVT0, APIC_DM_EXTINT);
708 value = APIC_DM_NMI; 871 value = APIC_DM_NMI;
872 if (!lapic_is_integrated()) /* 82489DX */
873 value |= APIC_LVT_LEVEL_TRIGGER;
709 apic_write(APIC_LVT1, value); 874 apic_write(APIC_LVT1, value);
710} 875}
711 876
877static void __cpuinit lapic_setup_esr(void)
878{
879 unsigned long oldvalue, value, maxlvt;
880 if (lapic_is_integrated() && !esr_disable) {
881 if (esr_disable) {
882 /*
883 * Something untraceable is creating bad interrupts on
884 * secondary quads ... for the moment, just leave the
885 * ESR disabled - we can't do anything useful with the
886 * errors anyway - mbligh
887 */
888 printk(KERN_INFO "Leaving ESR disabled.\n");
889 return;
890 }
891 /* !82489DX */
892 maxlvt = lapic_get_maxlvt();
893 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
894 apic_write(APIC_ESR, 0);
895 oldvalue = apic_read(APIC_ESR);
896
897 /* enables sending errors */
898 value = ERROR_APIC_VECTOR;
899 apic_write(APIC_LVTERR, value);
900 /*
901 * spec says clear errors after enabling vector.
902 */
903 if (maxlvt > 3)
904 apic_write(APIC_ESR, 0);
905 value = apic_read(APIC_ESR);
906 if (value != oldvalue)
907 apic_printk(APIC_VERBOSE, "ESR value before enabling "
908 "vector: 0x%08lx after: 0x%08lx\n",
909 oldvalue, value);
910 } else {
911 printk(KERN_INFO "No ESR for 82489DX.\n");
912 }
913}
914
915
712/** 916/**
713 * setup_local_APIC - setup the local APIC 917 * setup_local_APIC - setup the local APIC
714 */ 918 */
@@ -814,25 +1018,143 @@ void __cpuinit setup_local_APIC(void)
814 preempt_enable(); 1018 preempt_enable();
815} 1019}
816 1020
817static void __cpuinit lapic_setup_esr(void)
818{
819 unsigned maxlvt = lapic_get_maxlvt();
820
821 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
822 /*
823 * spec says clear errors after enabling vector.
824 */
825 if (maxlvt > 3)
826 apic_write(APIC_ESR, 0);
827}
828
829void __cpuinit end_local_APIC_setup(void) 1021void __cpuinit end_local_APIC_setup(void)
830{ 1022{
831 lapic_setup_esr(); 1023 lapic_setup_esr();
1024
1025#ifdef CONFIG_X86_32
1026 {
1027 unsigned int value;
1028 /* Disable the local apic timer */
1029 value = apic_read(APIC_LVTT);
1030 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1031 apic_write(APIC_LVTT, value);
1032 }
1033#endif
1034
832 setup_apic_nmi_watchdog(NULL); 1035 setup_apic_nmi_watchdog(NULL);
833 apic_pm_activate(); 1036 apic_pm_activate();
834} 1037}
835 1038
1039void check_x2apic(void)
1040{
1041 int msr, msr2;
1042
1043 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1044
1045 if (msr & X2APIC_ENABLE) {
1046 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1047 x2apic_preenabled = x2apic = 1;
1048 apic_ops = &x2apic_ops;
1049 }
1050}
1051
1052void enable_x2apic(void)
1053{
1054 int msr, msr2;
1055
1056 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1057 if (!(msr & X2APIC_ENABLE)) {
1058 printk("Enabling x2apic\n");
1059 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1060 }
1061}
1062
1063void enable_IR_x2apic(void)
1064{
1065#ifdef CONFIG_INTR_REMAP
1066 int ret;
1067 unsigned long flags;
1068
1069 if (!cpu_has_x2apic)
1070 return;
1071
1072 if (!x2apic_preenabled && disable_x2apic) {
1073 printk(KERN_INFO
1074 "Skipped enabling x2apic and Interrupt-remapping "
1075 "because of nox2apic\n");
1076 return;
1077 }
1078
1079 if (x2apic_preenabled && disable_x2apic)
1080 panic("Bios already enabled x2apic, can't enforce nox2apic");
1081
1082 if (!x2apic_preenabled && skip_ioapic_setup) {
1083 printk(KERN_INFO
1084 "Skipped enabling x2apic and Interrupt-remapping "
1085 "because of skipping io-apic setup\n");
1086 return;
1087 }
1088
1089 ret = dmar_table_init();
1090 if (ret) {
1091 printk(KERN_INFO
1092 "dmar_table_init() failed with %d:\n", ret);
1093
1094 if (x2apic_preenabled)
1095 panic("x2apic enabled by bios. But IR enabling failed");
1096 else
1097 printk(KERN_INFO
1098 "Not enabling x2apic,Intr-remapping\n");
1099 return;
1100 }
1101
1102 local_irq_save(flags);
1103 mask_8259A();
1104 save_mask_IO_APIC_setup();
1105
1106 ret = enable_intr_remapping(1);
1107
1108 if (ret && x2apic_preenabled) {
1109 local_irq_restore(flags);
1110 panic("x2apic enabled by bios. But IR enabling failed");
1111 }
1112
1113 if (ret)
1114 goto end;
1115
1116 if (!x2apic) {
1117 x2apic = 1;
1118 apic_ops = &x2apic_ops;
1119 enable_x2apic();
1120 }
1121end:
1122 if (ret)
1123 /*
1124 * IR enabling failed
1125 */
1126 restore_IO_APIC_setup();
1127 else
1128 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1129
1130 unmask_8259A();
1131 local_irq_restore(flags);
1132
1133 if (!ret) {
1134 if (!x2apic_preenabled)
1135 printk(KERN_INFO
1136 "Enabled x2apic and interrupt-remapping\n");
1137 else
1138 printk(KERN_INFO
1139 "Enabled Interrupt-remapping\n");
1140 } else
1141 printk(KERN_ERR
1142 "Failed to enable Interrupt-remapping and x2apic\n");
1143#else
1144 if (!cpu_has_x2apic)
1145 return;
1146
1147 if (x2apic_preenabled)
1148 panic("x2apic enabled prior OS handover,"
1149 " enable CONFIG_INTR_REMAP");
1150
1151 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1152 " and x2apic\n");
1153#endif
1154
1155 return;
1156}
1157
836/* 1158/*
837 * Detect and enable local APICs on non-SMP boards. 1159 * Detect and enable local APICs on non-SMP boards.
838 * Original code written by Keir Fraser. 1160 * Original code written by Keir Fraser.
@@ -872,7 +1194,7 @@ void __init early_init_lapic_mapping(void)
872 * Fetch the APIC ID of the BSP in case we have a 1194 * Fetch the APIC ID of the BSP in case we have a
873 * default configuration (or the MP table is broken). 1195 * default configuration (or the MP table is broken).
874 */ 1196 */
875 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1197 boot_cpu_physical_apicid = read_apic_id();
876} 1198}
877 1199
878/** 1200/**
@@ -880,6 +1202,11 @@ void __init early_init_lapic_mapping(void)
880 */ 1202 */
881void __init init_apic_mappings(void) 1203void __init init_apic_mappings(void)
882{ 1204{
1205 if (x2apic) {
1206 boot_cpu_physical_apicid = read_apic_id();
1207 return;
1208 }
1209
883 /* 1210 /*
884 * If no local APIC can be found then set up a fake all 1211 * If no local APIC can be found then set up a fake all
885 * zeroes page to simulate the local APIC and another 1212 * zeroes page to simulate the local APIC and another
@@ -899,13 +1226,15 @@ void __init init_apic_mappings(void)
899 * Fetch the APIC ID of the BSP in case we have a 1226 * Fetch the APIC ID of the BSP in case we have a
900 * default configuration (or the MP table is broken). 1227 * default configuration (or the MP table is broken).
901 */ 1228 */
902 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1229 boot_cpu_physical_apicid = read_apic_id();
903} 1230}
904 1231
905/* 1232/*
906 * This initializes the IO-APIC and APIC hardware if this is 1233 * This initializes the IO-APIC and APIC hardware if this is
907 * a UP kernel. 1234 * a UP kernel.
908 */ 1235 */
1236int apic_version[MAX_APICS];
1237
909int __init APIC_init_uniprocessor(void) 1238int __init APIC_init_uniprocessor(void)
910{ 1239{
911 if (disable_apic) { 1240 if (disable_apic) {
@@ -918,6 +1247,9 @@ int __init APIC_init_uniprocessor(void)
918 return -1; 1247 return -1;
919 } 1248 }
920 1249
1250 enable_IR_x2apic();
1251 setup_apic_routing();
1252
921 verify_local_APIC(); 1253 verify_local_APIC();
922 1254
923 connect_bsp_APIC(); 1255 connect_bsp_APIC();
@@ -1004,17 +1336,57 @@ asmlinkage void smp_error_interrupt(void)
1004} 1336}
1005 1337
1006/** 1338/**
1007 * * connect_bsp_APIC - attach the APIC to the interrupt system 1339 * connect_bsp_APIC - attach the APIC to the interrupt system
1008 * */ 1340 */
1009void __init connect_bsp_APIC(void) 1341void __init connect_bsp_APIC(void)
1010{ 1342{
1343#ifdef CONFIG_X86_32
1344 if (pic_mode) {
1345 /*
1346 * Do not trust the local APIC being empty at bootup.
1347 */
1348 clear_local_APIC();
1349 /*
1350 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1351 * local APIC to INT and NMI lines.
1352 */
1353 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1354 "enabling APIC mode.\n");
1355 outb(0x70, 0x22);
1356 outb(0x01, 0x23);
1357 }
1358#endif
1011 enable_apic_mode(); 1359 enable_apic_mode();
1012} 1360}
1013 1361
1362/**
1363 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1364 * @virt_wire_setup: indicates, whether virtual wire mode is selected
1365 *
1366 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1367 * APIC is disabled.
1368 */
1014void disconnect_bsp_APIC(int virt_wire_setup) 1369void disconnect_bsp_APIC(int virt_wire_setup)
1015{ 1370{
1371 unsigned int value;
1372
1373#ifdef CONFIG_X86_32
1374 if (pic_mode) {
1375 /*
1376 * Put the board back into PIC mode (has an effect only on
1377 * certain older boards). Note that APIC interrupts, including
1378 * IPIs, won't work beyond this point! The only exception are
1379 * INIT IPIs.
1380 */
1381 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1382 "entering PIC mode.\n");
1383 outb(0x70, 0x22);
1384 outb(0x00, 0x23);
1385 return;
1386 }
1387#endif
1388
1016 /* Go back to Virtual Wire compatibility mode */ 1389 /* Go back to Virtual Wire compatibility mode */
1017 unsigned long value;
1018 1390
1019 /* For the spurious interrupt use vector F, and enable it */ 1391 /* For the spurious interrupt use vector F, and enable it */
1020 value = apic_read(APIC_SPIV); 1392 value = apic_read(APIC_SPIV);
@@ -1040,7 +1412,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1040 apic_write(APIC_LVT0, APIC_LVT_MASKED); 1412 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1041 } 1413 }
1042 1414
1043 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 1415 /*
1416 * For LVT1 make it edge triggered, active high,
1417 * nmi and enabled
1418 */
1044 value = apic_read(APIC_LVT1); 1419 value = apic_read(APIC_LVT1);
1045 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1420 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1046 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1421 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1055,9 +1430,20 @@ void __cpuinit generic_processor_info(int apicid, int version)
1055 int cpu; 1430 int cpu;
1056 cpumask_t tmp_map; 1431 cpumask_t tmp_map;
1057 1432
1433 /*
1434 * Validate version
1435 */
1436 if (version == 0x0) {
1437 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
1438 "fixing up to 0x10. (tell your hw vendor)\n",
1439 version);
1440 version = 0x10;
1441 }
1442 apic_version[apicid] = version;
1443
1058 if (num_processors >= NR_CPUS) { 1444 if (num_processors >= NR_CPUS) {
1059 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1445 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1060 " Processor ignored.\n", NR_CPUS); 1446 " Processor ignored.\n", NR_CPUS);
1061 return; 1447 return;
1062 } 1448 }
1063 1449
@@ -1077,6 +1463,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1077 if (apicid > max_physical_apicid) 1463 if (apicid > max_physical_apicid)
1078 max_physical_apicid = apicid; 1464 max_physical_apicid = apicid;
1079 1465
1466#ifdef CONFIG_X86_32
1467 /*
1468 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1469 * but we need to work other dependencies like SMP_SUSPEND etc
1470 * before this can be done without some confusion.
1471 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1472 * - Ashok Raj <ashok.raj@intel.com>
1473 */
1474 if (max_physical_apicid >= 8) {
1475 switch (boot_cpu_data.x86_vendor) {
1476 case X86_VENDOR_INTEL:
1477 if (!APIC_XAPIC(version)) {
1478 def_to_bigsmp = 0;
1479 break;
1480 }
1481 /* If P4 and above fall through */
1482 case X86_VENDOR_AMD:
1483 def_to_bigsmp = 1;
1484 }
1485 }
1486#endif
1487
1488#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1080 /* are we being called early in kernel startup? */ 1489 /* are we being called early in kernel startup? */
1081 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1490 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1082 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1491 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1088,20 +1497,28 @@ void __cpuinit generic_processor_info(int apicid, int version)
1088 per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1497 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1089 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1498 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1090 } 1499 }
1500#endif
1091 1501
1092 cpu_set(cpu, cpu_possible_map); 1502 cpu_set(cpu, cpu_possible_map);
1093 cpu_set(cpu, cpu_present_map); 1503 cpu_set(cpu, cpu_present_map);
1094} 1504}
1095 1505
1506int hard_smp_processor_id(void)
1507{
1508 return read_apic_id();
1509}
1510
1096/* 1511/*
1097 * Power management 1512 * Power management
1098 */ 1513 */
1099#ifdef CONFIG_PM 1514#ifdef CONFIG_PM
1100 1515
1101static struct { 1516static struct {
1102 /* 'active' is true if the local APIC was enabled by us and 1517 /*
1103 not the BIOS; this signifies that we are also responsible 1518 * 'active' is true if the local APIC was enabled by us and
1104 for disabling it before entering apm/acpi suspend */ 1519 * not the BIOS; this signifies that we are also responsible
1520 * for disabling it before entering apm/acpi suspend
1521 */
1105 int active; 1522 int active;
1106 /* r/w apic fields */ 1523 /* r/w apic fields */
1107 unsigned int apic_id; 1524 unsigned int apic_id;
@@ -1129,7 +1546,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1129 1546
1130 maxlvt = lapic_get_maxlvt(); 1547 maxlvt = lapic_get_maxlvt();
1131 1548
1132 apic_pm_state.apic_id = read_apic_id(); 1549 apic_pm_state.apic_id = apic_read(APIC_ID);
1133 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 1550 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1134 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 1551 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1135 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 1552 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
@@ -1142,10 +1559,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1142 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1559 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1143 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1560 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1144 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1561 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1145#ifdef CONFIG_X86_MCE_INTEL 1562#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1146 if (maxlvt >= 5) 1563 if (maxlvt >= 5)
1147 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1564 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1148#endif 1565#endif
1566
1149 local_irq_save(flags); 1567 local_irq_save(flags);
1150 disable_local_APIC(); 1568 disable_local_APIC();
1151 local_irq_restore(flags); 1569 local_irq_restore(flags);
@@ -1164,10 +1582,25 @@ static int lapic_resume(struct sys_device *dev)
1164 maxlvt = lapic_get_maxlvt(); 1582 maxlvt = lapic_get_maxlvt();
1165 1583
1166 local_irq_save(flags); 1584 local_irq_save(flags);
1167 rdmsr(MSR_IA32_APICBASE, l, h); 1585
1168 l &= ~MSR_IA32_APICBASE_BASE; 1586#ifdef CONFIG_X86_64
1169 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1587 if (x2apic)
1170 wrmsr(MSR_IA32_APICBASE, l, h); 1588 enable_x2apic();
1589 else
1590#endif
1591 {
1592 /*
1593 * Make sure the APICBASE points to the right address
1594 *
1595 * FIXME! This will be wrong if we ever support suspend on
1596 * SMP! We'll need to do this as part of the CPU restore!
1597 */
1598 rdmsr(MSR_IA32_APICBASE, l, h);
1599 l &= ~MSR_IA32_APICBASE_BASE;
1600 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1601 wrmsr(MSR_IA32_APICBASE, l, h);
1602 }
1603
1171 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1604 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1172 apic_write(APIC_ID, apic_pm_state.apic_id); 1605 apic_write(APIC_ID, apic_pm_state.apic_id);
1173 apic_write(APIC_DFR, apic_pm_state.apic_dfr); 1606 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
@@ -1176,7 +1609,7 @@ static int lapic_resume(struct sys_device *dev)
1176 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1609 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1177 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1610 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1178 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1611 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1179#ifdef CONFIG_X86_MCE_INTEL 1612#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1180 if (maxlvt >= 5) 1613 if (maxlvt >= 5)
1181 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1614 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1182#endif 1615#endif
@@ -1190,10 +1623,17 @@ static int lapic_resume(struct sys_device *dev)
1190 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1623 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1191 apic_write(APIC_ESR, 0); 1624 apic_write(APIC_ESR, 0);
1192 apic_read(APIC_ESR); 1625 apic_read(APIC_ESR);
1626
1193 local_irq_restore(flags); 1627 local_irq_restore(flags);
1628
1194 return 0; 1629 return 0;
1195} 1630}
1196 1631
1632/*
1633 * This device has no shutdown method - fully functioning local APICs
1634 * are needed on every CPU up until machine_halt/restart/poweroff.
1635 */
1636
1197static struct sysdev_class lapic_sysclass = { 1637static struct sysdev_class lapic_sysclass = {
1198 .name = "lapic", 1638 .name = "lapic",
1199 .resume = lapic_resume, 1639 .resume = lapic_resume,
@@ -1307,31 +1747,19 @@ __cpuinit int apic_is_clustered_box(void)
1307 return (clusters > 2); 1747 return (clusters > 2);
1308} 1748}
1309 1749
1310/* 1750static __init int setup_nox2apic(char *str)
1311 * APIC command line parameters
1312 */
1313static int __init apic_set_verbosity(char *str)
1314{ 1751{
1315 if (str == NULL) { 1752 disable_x2apic = 1;
1316 skip_ioapic_setup = 0; 1753 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC);
1317 ioapic_force = 1;
1318 return 0;
1319 }
1320 if (strcmp("debug", str) == 0)
1321 apic_verbosity = APIC_DEBUG;
1322 else if (strcmp("verbose", str) == 0)
1323 apic_verbosity = APIC_VERBOSE;
1324 else {
1325 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1326 " use apic=verbose or apic=debug\n", str);
1327 return -EINVAL;
1328 }
1329
1330 return 0; 1754 return 0;
1331} 1755}
1332early_param("apic", apic_set_verbosity); 1756early_param("nox2apic", setup_nox2apic);
1757
1333 1758
1334static __init int setup_disableapic(char *str) 1759/*
1760 * APIC command line parameters
1761 */
1762static int __init setup_disableapic(char *arg)
1335{ 1763{
1336 disable_apic = 1; 1764 disable_apic = 1;
1337 setup_clear_cpu_cap(X86_FEATURE_APIC); 1765 setup_clear_cpu_cap(X86_FEATURE_APIC);
@@ -1340,9 +1768,9 @@ static __init int setup_disableapic(char *str)
1340early_param("disableapic", setup_disableapic); 1768early_param("disableapic", setup_disableapic);
1341 1769
1342/* same as disableapic, for compatibility */ 1770/* same as disableapic, for compatibility */
1343static __init int setup_nolapic(char *str) 1771static int __init setup_nolapic(char *arg)
1344{ 1772{
1345 return setup_disableapic(str); 1773 return setup_disableapic(arg);
1346} 1774}
1347early_param("nolapic", setup_nolapic); 1775early_param("nolapic", setup_nolapic);
1348 1776
@@ -1353,14 +1781,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1353} 1781}
1354early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1782early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1355 1783
1356static __init int setup_noapictimer(char *str) 1784static int __init parse_disable_apic_timer(char *arg)
1357{ 1785{
1358 if (str[0] != ' ' && str[0] != 0)
1359 return 0;
1360 disable_apic_timer = 1; 1786 disable_apic_timer = 1;
1361 return 1; 1787 return 0;
1362} 1788}
1363__setup("noapictimer", setup_noapictimer); 1789early_param("noapictimer", parse_disable_apic_timer);
1790
1791static int __init parse_nolapic_timer(char *arg)
1792{
1793 disable_apic_timer = 1;
1794 return 0;
1795}
1796early_param("nolapic_timer", parse_nolapic_timer);
1364 1797
1365static __init int setup_apicpmtimer(char *s) 1798static __init int setup_apicpmtimer(char *s)
1366{ 1799{
@@ -1370,6 +1803,31 @@ static __init int setup_apicpmtimer(char *s)
1370} 1803}
1371__setup("apicpmtimer", setup_apicpmtimer); 1804__setup("apicpmtimer", setup_apicpmtimer);
1372 1805
1806static int __init apic_set_verbosity(char *arg)
1807{
1808 if (!arg) {
1809#ifdef CONFIG_X86_64
1810 skip_ioapic_setup = 0;
1811 ioapic_force = 1;
1812 return 0;
1813#endif
1814 return -EINVAL;
1815 }
1816
1817 if (strcmp("debug", arg) == 0)
1818 apic_verbosity = APIC_DEBUG;
1819 else if (strcmp("verbose", arg) == 0)
1820 apic_verbosity = APIC_VERBOSE;
1821 else {
1822 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1823 " use apic=verbose or apic=debug\n", arg);
1824 return -EINVAL;
1825 }
1826
1827 return 0;
1828}
1829early_param("apic", apic_set_verbosity);
1830
1373static int __init lapic_insert_resource(void) 1831static int __init lapic_insert_resource(void)
1374{ 1832{
1375 if (!apic_phys) 1833 if (!apic_phys)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaad3001..7f0b45a5d788 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o feature_names.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10obj-$(CONFIG_X86_32) += amd.o 10
11obj-$(CONFIG_X86_64) += amd_64.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_X86_32) += cyrix.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_X86_32) += centaur.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
14obj-$(CONFIG_X86_64) += centaur_64.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
15obj-$(CONFIG_X86_32) += transmeta.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
16obj-$(CONFIG_X86_32) += intel.o 16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
17obj-$(CONFIG_X86_64) += intel_64.o 17obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
18obj-$(CONFIG_X86_32) += umc.o
19 18
20obj-$(CONFIG_X86_MCE) += mcheck/ 19obj-$(CONFIG_X86_MCE) += mcheck/
21obj-$(CONFIG_MTRR) += mtrr/ 20obj-$(CONFIG_MTRR) += mtrr/
22obj-$(CONFIG_CPU_FREQ) += cpufreq/ 21obj-$(CONFIG_CPU_FREQ) += cpufreq/
23 22
24obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 23obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
24
25quiet_cmd_mkcapflags = MKCAP $@
26 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
27
28cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h
29
30targets += capflags.c
31$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
32 $(call if_changed,mkcapflags)
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index a6ef672adbba..0d9c993aa93e 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h>
11
10struct cpuid_bit { 12struct cpuid_bit {
11 u16 feature; 13 u16 feature;
12 u8 reg; 14 u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
48 } 50 }
49} 51}
50 52
53/* leaf 0xb SMT level */
54#define SMT_LEVEL 0
55
56/* leaf 0xb sub-leaf types */
57#define INVALID_TYPE 0
58#define SMT_TYPE 1
59#define CORE_TYPE 2
60
61#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
62#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
63#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
64
65/*
66 * Check for extended topology enumeration cpuid leaf 0xb and if it
67 * exists, use it for populating initial_apicid and cpu topology
68 * detection.
69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{
72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings;
76
77 if (c->cpuid_level < 0xb)
78 return;
79
80 cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
81
82 /*
83 * check if the cpuid leaf 0xb is actually implemented.
84 */
85 if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
86 return;
87
88 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
89
90 /*
91 * initial apic id, which also represents 32-bit extended x2apic id.
92 */
93 c->initial_apicid = edx;
94
95 /*
96 * Populate HT related information from sub-leaf level 0.
97 */
98 core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
99 core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
100
101 sub_index = 1;
102 do {
103 cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
104
105 /*
106 * Check for the Core type in the implemented sub leaves.
107 */
108 if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
109 core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
110 core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
111 break;
112 }
113
114 sub_index++;
115 } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118
119#ifdef CONFIG_X86_32
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
126#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128
129
130 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
131 c->phys_proc_id);
132 if (c->x86_max_cores > 1)
133 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
134 c->cpu_core_id);
135 return;
136#endif
137}
138
51#ifdef CONFIG_X86_PAT 139#ifdef CONFIG_X86_PAT
52void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
53{ 141{
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 18514ed26104..32e73520adf7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,26 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28{ 37{
29 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
30 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
31 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
33 } 50 }
34
35 /* Set MTRR capability flag if appropriate */
36 if (c->x86_model == 13 || c->x86_model == 9 ||
37 (c->x86_model == 8 && c->x86_mask >= 8))
38 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
39} 51}
40 52
41static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
42{ 55{
43 u32 l, h; 56 u32 l, h;
44 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
45 int r;
46 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
47#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
48 unsigned long long value; 304 unsigned long long value;
49 305
@@ -54,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
54 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
55 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
56 */ 312 */
57 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
58 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
59 value |= 1 << 6; 315 value |= 1 << 6;
60 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -64,209 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
64 early_init_amd(c); 320 early_init_amd(c);
65 321
66 /* 322 /*
67 * FIXME: We should handle the K5 here. Set up the write
68 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
69 * no bus pipeline)
70 */
71
72 /*
73 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
74 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
75 */ 325 */
76 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
77 327
78 r = get_model_name(c); 328#ifdef CONFIG_X86_64
329 /* On C+ stepping K8 rep microcode works well for copy/memset */
330 if (c->x86 == 0xf) {
331 u32 level;
79 332
80 switch (c->x86) { 333 level = cpuid_eax(1);
81 case 4: 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
82 /* 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
83 * General Systems BIOSen alias the cpu frequency registers
84 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
85 * drivers subsequently pokes it, and changes the CPU speed.
86 * Workaround : Remove the unneeded alias.
87 */
88#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
89#define CBAR_ENB (0x80000000)
90#define CBAR_KEY (0X000000CB)
91 if (c->x86_model == 9 || c->x86_model == 10) {
92 if (inl (CBAR) & CBAR_ENB)
93 outl (0 | CBAR_KEY, CBAR);
94 }
95 break;
96 case 5:
97 if (c->x86_model < 6) {
98 /* Based on AMD doc 20734R - June 2000 */
99 if (c->x86_model == 0) {
100 clear_cpu_cap(c, X86_FEATURE_APIC);
101 set_cpu_cap(c, X86_FEATURE_PGE);
102 }
103 break;
104 }
105
106 if (c->x86_model == 6 && c->x86_mask == 1) {
107 const int K6_BUG_LOOP = 1000000;
108 int n;
109 void (*f_vide)(void);
110 unsigned long d, d2;
111
112 printk(KERN_INFO "AMD K6 stepping B detected - ");
113
114 /*
115 * It looks like AMD fixed the 2.6.2 bug and improved indirect
116 * calls at the same time.
117 */
118
119 n = K6_BUG_LOOP;
120 f_vide = vide;
121 rdtscl(d);
122 while (n--)
123 f_vide();
124 rdtscl(d2);
125 d = d2-d;
126
127 if (d > 20*K6_BUG_LOOP)
128 printk("system stability may be impaired when more than 32 MB are used.\n");
129 else
130 printk("probably OK (after B9730xxxx).\n");
131 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
132 }
133
134 /* K6 with old style WHCR */
135 if (c->x86_model < 8 ||
136 (c->x86_model == 8 && c->x86_mask < 8)) {
137 /* We can only write allocate on the low 508Mb */
138 if (mbytes > 508)
139 mbytes = 508;
140
141 rdmsr(MSR_K6_WHCR, l, h);
142 if ((l&0x0000FFFF) == 0) {
143 unsigned long flags;
144 l = (1<<0)|((mbytes/4)<<1);
145 local_irq_save(flags);
146 wbinvd();
147 wrmsr(MSR_K6_WHCR, l, h);
148 local_irq_restore(flags);
149 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
150 mbytes);
151 }
152 break;
153 }
154
155 if ((c->x86_model == 8 && c->x86_mask > 7) ||
156 c->x86_model == 9 || c->x86_model == 13) {
157 /* The more serious chips .. */
158
159 if (mbytes > 4092)
160 mbytes = 4092;
161
162 rdmsr(MSR_K6_WHCR, l, h);
163 if ((l&0xFFFF0000) == 0) {
164 unsigned long flags;
165 l = ((mbytes>>2)<<22)|(1<<16);
166 local_irq_save(flags);
167 wbinvd();
168 wrmsr(MSR_K6_WHCR, l, h);
169 local_irq_restore(flags);
170 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
171 mbytes);
172 }
173
174 break;
175 }
176
177 if (c->x86_model == 10) {
178 /* AMD Geode LX is model 10 */
179 /* placeholder for any needed mods */
180 break;
181 }
182 break;
183 case 6: /* An Athlon/Duron */
184
185 /*
186 * Bit 15 of Athlon specific MSR 15, needs to be 0
187 * to enable SSE on Palomino/Morgan/Barton CPU's.
188 * If the BIOS didn't enable it already, enable it here.
189 */
190 if (c->x86_model >= 6 && c->x86_model <= 10) {
191 if (!cpu_has(c, X86_FEATURE_XMM)) {
192 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
193 rdmsr(MSR_K7_HWCR, l, h);
194 l &= ~0x00008000;
195 wrmsr(MSR_K7_HWCR, l, h);
196 set_cpu_cap(c, X86_FEATURE_XMM);
197 }
198 }
199
200 /*
201 * It's been determined by AMD that Athlons since model 8 stepping 1
202 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
203 * As per AMD technical note 27212 0.2
204 */
205 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
206 rdmsr(MSR_K7_CLK_CTL, l, h);
207 if ((l & 0xfff00000) != 0x20000000) {
208 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
209 ((l & 0x000fffff)|0x20000000));
210 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
211 }
212 }
213 break;
214 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
215 346
216 switch (c->x86) { 347 switch (c->x86) {
217 case 15: 348 case 4:
218 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
219 case 0x10:
220 case 0x11:
221 set_cpu_cap(c, X86_FEATURE_K8);
222 break; 350 break;
223 case 6: 351 case 5:
224 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
225 break; 356 break;
226 } 357 }
358
359 /* K6s reports MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
227 if (c->x86 >= 6) 365 if (c->x86 >= 6)
228 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
229 367
230 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
231 369 switch (c->x86) {
232 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
233 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
234 377
235#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
236 /*
237 * On a AMD multi core setup the lower bits of the APIC id
238 * distinguish the cores.
239 */
240 if (c->x86_max_cores > 1) {
241 int cpu = smp_processor_id();
242 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
243 379
244 if (bits == 0) { 380 /* Multi core CPU? */
245 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
246 bits++; 382 amd_detect_cmp(c);
247 } 383 srat_detect_node(c);
248 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
249 c->phys_proc_id >>= bits;
250 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
251 cpu, c->x86_max_cores, c->cpu_core_id);
252 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
253#endif 388#endif
254 389
255 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
256 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
257 num_cache_leaves = 4; 392 num_cache_leaves = 4;
258 else 393 else
259 num_cache_leaves = 3; 394 num_cache_leaves = 3;
260 } 395 }
261 396
262 /* K6s reports MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
263 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
264 clear_cpu_cap(c, X86_FEATURE_MCE);
265 399
266 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
267 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
268} 433}
269 434
435#ifdef CONFIG_X86_32
270static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
271{ 437{
272 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -279,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
279 } 445 }
280 return size; 446 return size;
281} 447}
448#endif
282 449
283static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
284 .c_vendor = "AMD", 451 .c_vendor = "AMD",
285 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
286 .c_models = { 454 .c_models = {
287 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
288 { 456 {
@@ -295,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
295 } 463 }
296 }, 464 },
297 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
298 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
299 .c_init = init_amd, 469 .c_init = init_amd,
300 .c_size_cache = amd_size_cache, 470 .c_x86_vendor = X86_VENDOR_AMD,
301}; 471};
302 472
303cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); 473cpu_dev_register(amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1692b2a41ff..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
120}
121
122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
123{
124 unsigned level;
125
126#ifdef CONFIG_SMP
127 unsigned long value;
128
129 /*
130 * Disable TLB flush filter by setting HWCR.FFDIS on K8
131 * bit 6 of msr C001_0015
132 *
133 * Errata 63 for SH-B3 steppings
134 * Errata 122 for all steppings (F+ have it disabled by default)
135 */
136 if (c->x86 == 0xf) {
137 rdmsrl(MSR_K8_HWCR, value);
138 value |= 1 << 6;
139 wrmsrl(MSR_K8_HWCR, value);
140 }
141#endif
142
143 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
144 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
145 clear_cpu_cap(c, 0*32+31);
146
147 /* On C+ stepping K8 rep microcode works well for copy/memset */
148 if (c->x86 == 0xf) {
149 level = cpuid_eax(1);
150 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
151 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
152 }
153 if (c->x86 == 0x10 || c->x86 == 0x11)
154 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
155
156 /* Enable workaround for FXSAVE leak */
157 if (c->x86 >= 6)
158 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
159
160 level = get_model_name(c);
161 if (!level) {
162 switch (c->x86) {
163 case 0xf:
164 /* Should distinguish Models here, but this is only
165 a fallback anyways. */
166 strcpy(c->x86_model_id, "Hammer");
167 break;
168 }
169 }
170 display_cacheinfo(c);
171
172 /* Multi core CPU? */
173 if (c->extended_cpuid_level >= 0x80000008)
174 amd_detect_cmp(c);
175
176 if (c->extended_cpuid_level >= 0x80000006 &&
177 (cpuid_edx(0x80000006) & 0xf000))
178 num_cache_leaves = 4;
179 else
180 num_cache_leaves = 3;
181
182 if (c->x86 >= 0xf && c->x86 <= 0x11)
183 set_cpu_cap(c, X86_FEATURE_K8);
184
185 /* MFENCE stops RDTSC speculation */
186 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
187
188 if (c->x86 == 0x10) {
189 /* do this for boot cpu */
190 if (c == &boot_cpu_data)
191 check_enable_amd_mmconf_dmi();
192
193 fam10h_check_enable_mmcfg();
194 }
195
196 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
197 unsigned long long tseg;
198
199 /*
200 * Split up direct mapping around the TSEG SMM area.
201 * Don't do it for gbpages because there seems very little
202 * benefit in doing so.
203 */
204 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
205 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
206 if ((tseg>>PMD_SHIFT) <
207 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
208 ((tseg>>PMD_SHIFT) <
209 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
210 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
211 set_memory_4k((unsigned long)__va(tseg), 1);
212 }
213 }
214}
215
216static struct cpu_dev amd_cpu_dev __cpuinitdata = {
217 .c_vendor = "AMD",
218 .c_ident = { "AuthenticAMD" },
219 .c_early_init = early_init_amd,
220 .c_init = init_amd,
221};
222
223cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
224
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index a0534c04d38a..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
@@ -475,6 +474,7 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
475 .c_early_init = early_init_centaur, 474 .c_early_init = early_init_centaur,
476 .c_init = init_centaur, 475 .c_init = init_centaur,
477 .c_size_cache = centaur_size_cache, 476 .c_size_cache = centaur_size_cache,
477 .c_x86_vendor = X86_VENDOR_CENTAUR,
478}; 478};
479 479
480cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 480cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e1..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_ident = { "CentaurHauls" }, 30 .c_ident = { "CentaurHauls" },
30 .c_early_init = early_init_centaur, 31 .c_early_init = early_init_centaur,
31 .c_init = init_centaur, 32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
32}; 34};
33 35
34cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 36cpu_dev_register(centaur_cpu_dev);
35 37
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 000000000000..2056ccf572cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
1/*
2 * cmpxchg*() fallbacks for CPU not supporting these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e456bd955bb..25581dcb280e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,28 +1,62 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
16#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h>
17#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
18#include <asm/mpspec.h> 24#include <asm/mpspec.h>
19#include <asm/apic.h> 25#include <asm/apic.h>
20#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
21#endif 28#endif
22 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
23#include "cpu.h" 39#include "cpu.h"
24 40
41static struct cpu_dev *this_cpu __cpuinitdata;
42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
25DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
26 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
27 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
28 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 62 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -56,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
56 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
57 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
58} }; 92} };
93#endif
59EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
60 95
61__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 96#ifdef CONFIG_X86_32
62
63static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
64static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
65 99
66struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 100static int __init cachesize_setup(char *str)
101{
102 get_option(&str, &cachesize_override);
103 return 1;
104}
105__setup("cachesize=", cachesize_setup);
106
107static int __init x86_fxsr_setup(char *s)
108{
109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
110 setup_clear_cpu_cap(X86_FEATURE_XMM);
111 return 1;
112}
113__setup("nofxsr", x86_fxsr_setup);
114
115static int __init x86_sep_setup(char *s)
116{
117 setup_clear_cpu_cap(X86_FEATURE_SEP);
118 return 1;
119}
120__setup("nosep", x86_sep_setup);
121
122/* Standard macro to see if a specific flag is changeable */
123static inline int flag_is_changeable_p(u32 flag)
124{
125 u32 f1, f2;
126
127 /*
128 * Cyrix and IDT cpus allow disabling of CPUID
129 * so the code below may return different results
130 * when it is executed before and after enabling
131 * the CPUID. Add "volatile" to not allow gcc to
132 * optimize the subsequent calls to this function.
133 */
134 asm volatile ("pushfl\n\t"
135 "pushfl\n\t"
136 "popl %0\n\t"
137 "movl %0,%1\n\t"
138 "xorl %2,%0\n\t"
139 "pushl %0\n\t"
140 "popfl\n\t"
141 "pushfl\n\t"
142 "popl %0\n\t"
143 "popfl\n\t"
144 : "=&r" (f1), "=&r" (f2)
145 : "ir" (flag));
146
147 return ((f1^f2) & flag) != 0;
148}
149
150/* Probe for the CPUID instruction */
151static int __cpuinit have_cpuid_p(void)
152{
153 return flag_is_changeable_p(X86_EFLAGS_ID);
154}
155
156static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
157{
158 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
159 /* Disable processor serial number */
160 unsigned long lo, hi;
161 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
162 lo |= 0x200000;
163 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
164 printk(KERN_NOTICE "CPU serial number disabled.\n");
165 clear_cpu_cap(c, X86_FEATURE_PN);
166
167 /* Disabling the serial number may affect the cpuid level */
168 c->cpuid_level = cpuid_eax(0);
169 }
170}
171
172static int __init x86_serial_nr_setup(char *s)
173{
174 disable_x86_serial_nr = 0;
175 return 1;
176}
177__setup("serialnumber", x86_serial_nr_setup);
178#else
179static inline int flag_is_changeable_p(u32 flag)
180{
181 return 1;
182}
183/* Probe for the CPUID instruction */
184static inline int have_cpuid_p(void)
185{
186 return 1;
187}
188static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
189{
190}
191#endif
192
193/*
194 * Naming convention should be: <Name> [(<Codename>)]
195 * This table only is used unless init_<vendor>() below doesn't set it;
196 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
197 *
198 */
199
200/* Look up CPU names by table lookup. */
201static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
202{
203 struct cpu_model_info *info;
204
205 if (c->x86_model >= 16)
206 return NULL; /* Range check */
207
208 if (!this_cpu)
209 return NULL;
210
211 info = this_cpu->c_models;
212
213 while (info && info->family) {
214 if (info->family == c->x86)
215 return info->model_names[c->x86_model];
216 info++;
217 }
218 return NULL; /* Not found */
219}
220
221__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
222
223/* Current gdt points %fs at the "master" per-cpu area: after this,
224 * it's on the real one. */
225void switch_to_new_gdt(void)
226{
227 struct desc_ptr gdt_descr;
228
229 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
230 gdt_descr.size = GDT_SIZE - 1;
231 load_gdt(&gdt_descr);
232#ifdef CONFIG_X86_32
233 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
234#endif
235}
236
237static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
67 238
68static void __cpuinit default_init(struct cpuinfo_x86 *c) 239static void __cpuinit default_init(struct cpuinfo_x86 *c)
69{ 240{
241#ifdef CONFIG_X86_64
242 display_cacheinfo(c);
243#else
70 /* Not much we can do here... */ 244 /* Not much we can do here... */
71 /* Check if at least it has cpuid */ 245 /* Check if at least it has cpuid */
72 if (c->cpuid_level == -1) { 246 if (c->cpuid_level == -1) {
@@ -76,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
76 else if (c->x86 == 3) 250 else if (c->x86 == 3)
77 strcpy(c->x86_model_id, "386"); 251 strcpy(c->x86_model_id, "386");
78 } 252 }
253#endif
79} 254}
80 255
81static struct cpu_dev __cpuinitdata default_cpu = { 256static struct cpu_dev __cpuinitdata default_cpu = {
82 .c_init = default_init, 257 .c_init = default_init,
83 .c_vendor = "Unknown", 258 .c_vendor = "Unknown",
259 .c_x86_vendor = X86_VENDOR_UNKNOWN,
84}; 260};
85static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
86
87static int __init cachesize_setup(char *str)
88{
89 get_option(&str, &cachesize_override);
90 return 1;
91}
92__setup("cachesize=", cachesize_setup);
93 261
94int __cpuinit get_model_name(struct cpuinfo_x86 *c) 262static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
95{ 263{
96 unsigned int *v; 264 unsigned int *v;
97 char *p, *q; 265 char *p, *q;
98 266
99 if (cpuid_eax(0x80000000) < 0x80000004) 267 if (c->extended_cpuid_level < 0x80000004)
100 return 0; 268 return;
101 269
102 v = (unsigned int *) c->x86_model_id; 270 v = (unsigned int *) c->x86_model_id;
103 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 271 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -116,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
116 while (q <= &c->x86_model_id[48]) 284 while (q <= &c->x86_model_id[48])
117 *q++ = '\0'; /* Zero-pad the rest */ 285 *q++ = '\0'; /* Zero-pad the rest */
118 } 286 }
119
120 return 1;
121} 287}
122 288
123
124void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 289void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
125{ 290{
126 unsigned int n, dummy, ecx, edx, l2size; 291 unsigned int n, dummy, ebx, ecx, edx, l2size;
127 292
128 n = cpuid_eax(0x80000000); 293 n = c->extended_cpuid_level;
129 294
130 if (n >= 0x80000005) { 295 if (n >= 0x80000005) {
131 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); 296 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
132 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 297 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
133 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 298 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
134 c->x86_cache_size = (ecx>>24)+(edx>>24); 299 c->x86_cache_size = (ecx>>24) + (edx>>24);
300#ifdef CONFIG_X86_64
301 /* On K8 L1 TLB is inclusive, so don't count it */
302 c->x86_tlbsize = 0;
303#endif
135 } 304 }
136 305
137 if (n < 0x80000006) /* Some chips just has a large L1. */ 306 if (n < 0x80000006) /* Some chips just has a large L1. */
138 return; 307 return;
139 308
140 ecx = cpuid_ecx(0x80000006); 309 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
141 l2size = ecx >> 16; 310 l2size = ecx >> 16;
142 311
312#ifdef CONFIG_X86_64
313 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
314#else
143 /* do processor-specific cache resizing */ 315 /* do processor-specific cache resizing */
144 if (this_cpu->c_size_cache) 316 if (this_cpu->c_size_cache)
145 l2size = this_cpu->c_size_cache(c, l2size); 317 l2size = this_cpu->c_size_cache(c, l2size);
@@ -150,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
150 322
151 if (l2size == 0) 323 if (l2size == 0)
152 return; /* Again, no L2 cache is possible */ 324 return; /* Again, no L2 cache is possible */
325#endif
153 326
154 c->x86_cache_size = l2size; 327 c->x86_cache_size = l2size;
155 328
156 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 329 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
157 l2size, ecx & 0xFF); 330 l2size, ecx & 0xFF);
158} 331}
159 332
160/* 333void __cpuinit detect_ht(struct cpuinfo_x86 *c)
161 * Naming convention should be: <Name> [(<Codename>)]
162 * This table only is used unless init_<vendor>() below doesn't set it;
163 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
164 *
165 */
166
167/* Look up CPU names by table lookup. */
168static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
169{ 334{
170 struct cpu_model_info *info; 335#ifdef CONFIG_X86_HT
336 u32 eax, ebx, ecx, edx;
337 int index_msb, core_bits;
171 338
172 if (c->x86_model >= 16) 339 if (!cpu_has(c, X86_FEATURE_HT))
173 return NULL; /* Range check */ 340 return;
174 341
175 if (!this_cpu) 342 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
176 return NULL; 343 goto out;
177 344
178 info = this_cpu->c_models; 345 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
346 return;
179 347
180 while (info && info->family) { 348 cpuid(1, &eax, &ebx, &ecx, &edx);
181 if (info->family == c->x86) 349
182 return info->model_names[c->x86_model]; 350 smp_num_siblings = (ebx & 0xff0000) >> 16;
183 info++; 351
352 if (smp_num_siblings == 1) {
353 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
354 } else if (smp_num_siblings > 1) {
355
356 if (smp_num_siblings > NR_CPUS) {
357 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
358 smp_num_siblings);
359 smp_num_siblings = 1;
360 return;
361 }
362
363 index_msb = get_count_order(smp_num_siblings);
364#ifdef CONFIG_X86_64
365 c->phys_proc_id = phys_pkg_id(index_msb);
366#else
367 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
368#endif
369
370 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
371
372 index_msb = get_count_order(smp_num_siblings);
373
374 core_bits = get_count_order(c->x86_max_cores);
375
376#ifdef CONFIG_X86_64
377 c->cpu_core_id = phys_pkg_id(index_msb) &
378 ((1 << core_bits) - 1);
379#else
380 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
381 ((1 << core_bits) - 1);
382#endif
184 } 383 }
185 return NULL; /* Not found */
186}
187 384
385out:
386 if ((c->x86_max_cores * smp_num_siblings) > 1) {
387 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
388 c->phys_proc_id);
389 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
390 c->cpu_core_id);
391 }
392#endif
393}
188 394
189static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) 395static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
190{ 396{
191 char *v = c->x86_vendor_id; 397 char *v = c->x86_vendor_id;
192 int i; 398 int i;
193 static int printed; 399 static int printed;
194 400
195 for (i = 0; i < X86_VENDOR_NUM; i++) { 401 for (i = 0; i < X86_VENDOR_NUM; i++) {
196 if (cpu_devs[i]) { 402 if (!cpu_devs[i])
197 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 403 break;
198 (cpu_devs[i]->c_ident[1] && 404
199 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 405 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
200 c->x86_vendor = i; 406 (cpu_devs[i]->c_ident[1] &&
201 if (!early) 407 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
202 this_cpu = cpu_devs[i]; 408 this_cpu = cpu_devs[i];
203 return; 409 c->x86_vendor = this_cpu->c_x86_vendor;
204 } 410 return;
205 } 411 }
206 } 412 }
413
207 if (!printed) { 414 if (!printed) {
208 printed++; 415 printed++;
209 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 416 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
210 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 417 printk(KERN_ERR "CPU: Your system may be unstable.\n");
211 } 418 }
419
212 c->x86_vendor = X86_VENDOR_UNKNOWN; 420 c->x86_vendor = X86_VENDOR_UNKNOWN;
213 this_cpu = &default_cpu; 421 this_cpu = &default_cpu;
214} 422}
215 423
216 424void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
217static int __init x86_fxsr_setup(char *s)
218{
219 setup_clear_cpu_cap(X86_FEATURE_FXSR);
220 setup_clear_cpu_cap(X86_FEATURE_XMM);
221 return 1;
222}
223__setup("nofxsr", x86_fxsr_setup);
224
225
226static int __init x86_sep_setup(char *s)
227{
228 setup_clear_cpu_cap(X86_FEATURE_SEP);
229 return 1;
230}
231__setup("nosep", x86_sep_setup);
232
233
234/* Standard macro to see if a specific flag is changeable */
235static inline int flag_is_changeable_p(u32 flag)
236{
237 u32 f1, f2;
238
239 asm("pushfl\n\t"
240 "pushfl\n\t"
241 "popl %0\n\t"
242 "movl %0,%1\n\t"
243 "xorl %2,%0\n\t"
244 "pushl %0\n\t"
245 "popfl\n\t"
246 "pushfl\n\t"
247 "popl %0\n\t"
248 "popfl\n\t"
249 : "=&r" (f1), "=&r" (f2)
250 : "ir" (flag));
251
252 return ((f1^f2) & flag) != 0;
253}
254
255
256/* Probe for the CPUID instruction */
257static int __cpuinit have_cpuid_p(void)
258{
259 return flag_is_changeable_p(X86_EFLAGS_ID);
260}
261
262void __init cpu_detect(struct cpuinfo_x86 *c)
263{ 425{
264 /* Get vendor name */ 426 /* Get vendor name */
265 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 427 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -268,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
268 (unsigned int *)&c->x86_vendor_id[4]); 430 (unsigned int *)&c->x86_vendor_id[4]);
269 431
270 c->x86 = 4; 432 c->x86 = 4;
433 /* Intel-defined flags: level 0x00000001 */
271 if (c->cpuid_level >= 0x00000001) { 434 if (c->cpuid_level >= 0x00000001) {
272 u32 junk, tfms, cap0, misc; 435 u32 junk, tfms, cap0, misc;
273 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 436 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
274 c->x86 = (tfms >> 8) & 15; 437 c->x86 = (tfms >> 8) & 0xf;
275 c->x86_model = (tfms >> 4) & 15; 438 c->x86_model = (tfms >> 4) & 0xf;
439 c->x86_mask = tfms & 0xf;
276 if (c->x86 == 0xf) 440 if (c->x86 == 0xf)
277 c->x86 += (tfms >> 20) & 0xff; 441 c->x86 += (tfms >> 20) & 0xff;
278 if (c->x86 >= 0x6) 442 if (c->x86 >= 0x6)
279 c->x86_model += ((tfms >> 16) & 0xF) << 4; 443 c->x86_model += ((tfms >> 16) & 0xf) << 4;
280 c->x86_mask = tfms & 15;
281 if (cap0 & (1<<19)) { 444 if (cap0 & (1<<19)) {
282 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
283 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 445 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
446 c->x86_cache_alignment = c->x86_clflush_size;
284 } 447 }
285 } 448 }
286} 449}
287static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) 450
451static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
288{ 452{
289 u32 tfms, xlvl; 453 u32 tfms, xlvl;
290 unsigned int ebx; 454 u32 ebx;
291 455
292 memset(&c->x86_capability, 0, sizeof c->x86_capability); 456 /* Intel-defined flags: level 0x00000001 */
293 if (have_cpuid_p()) { 457 if (c->cpuid_level >= 0x00000001) {
294 /* Intel-defined flags: level 0x00000001 */ 458 u32 capability, excap;
295 if (c->cpuid_level >= 0x00000001) { 459 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
296 u32 capability, excap; 460 c->x86_capability[0] = capability;
297 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 461 c->x86_capability[4] = excap;
298 c->x86_capability[0] = capability; 462 }
299 c->x86_capability[4] = excap;
300 }
301 463
302 /* AMD-defined flags: level 0x80000001 */ 464 /* AMD-defined flags: level 0x80000001 */
303 xlvl = cpuid_eax(0x80000000); 465 xlvl = cpuid_eax(0x80000000);
304 if ((xlvl & 0xffff0000) == 0x80000000) { 466 c->extended_cpuid_level = xlvl;
305 if (xlvl >= 0x80000001) { 467 if ((xlvl & 0xffff0000) == 0x80000000) {
306 c->x86_capability[1] = cpuid_edx(0x80000001); 468 if (xlvl >= 0x80000001) {
307 c->x86_capability[6] = cpuid_ecx(0x80000001); 469 c->x86_capability[1] = cpuid_edx(0x80000001);
308 } 470 c->x86_capability[6] = cpuid_ecx(0x80000001);
309 } 471 }
472 }
473
474#ifdef CONFIG_X86_64
475 if (c->extended_cpuid_level >= 0x80000008) {
476 u32 eax = cpuid_eax(0x80000008);
310 477
478 c->x86_virt_bits = (eax >> 8) & 0xff;
479 c->x86_phys_bits = eax & 0xff;
311 } 480 }
481#endif
482
483 if (c->extended_cpuid_level >= 0x80000007)
484 c->x86_power = cpuid_edx(0x80000007);
312 485
313} 486}
314 487
488static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
489{
490#ifdef CONFIG_X86_32
491 int i;
492
493 /*
494 * First of all, decide if this is a 486 or higher
495 * It's a 486 if we can modify the AC flag
496 */
497 if (flag_is_changeable_p(X86_EFLAGS_AC))
498 c->x86 = 4;
499 else
500 c->x86 = 3;
501
502 for (i = 0; i < X86_VENDOR_NUM; i++)
503 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
504 c->x86_vendor_id[0] = 0;
505 cpu_devs[i]->c_identify(c);
506 if (c->x86_vendor_id[0]) {
507 get_cpu_vendor(c);
508 break;
509 }
510 }
511#endif
512}
513
315/* 514/*
316 * Do minimum CPU detection early. 515 * Do minimum CPU detection early.
317 * Fields really needed: vendor, cpuid_level, family, model, mask, 516 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -321,25 +520,61 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
321 * WARNING: this function is only called on the BP. Don't add code here 520 * WARNING: this function is only called on the BP. Don't add code here
322 * that is supposed to run on all CPUs. 521 * that is supposed to run on all CPUs.
323 */ 522 */
324static void __init early_cpu_detect(void) 523static void __init early_identify_cpu(struct cpuinfo_x86 *c)
325{ 524{
326 struct cpuinfo_x86 *c = &boot_cpu_data; 525#ifdef CONFIG_X86_64
327 526 c->x86_clflush_size = 64;
328 c->x86_cache_alignment = 32; 527#else
329 c->x86_clflush_size = 32; 528 c->x86_clflush_size = 32;
529#endif
530 c->x86_cache_alignment = c->x86_clflush_size;
531
532 memset(&c->x86_capability, 0, sizeof c->x86_capability);
533 c->extended_cpuid_level = 0;
534
535 if (!have_cpuid_p())
536 identify_cpu_without_cpuid(c);
330 537
538 /* cyrix could have cpuid enabled via c_identify()*/
331 if (!have_cpuid_p()) 539 if (!have_cpuid_p())
332 return; 540 return;
333 541
334 cpu_detect(c); 542 cpu_detect(c);
335 543
336 get_cpu_vendor(c, 1); 544 get_cpu_vendor(c);
545
546 get_cpu_cap(c);
337 547
338 early_get_cap(c); 548 if (this_cpu->c_early_init)
549 this_cpu->c_early_init(c);
339 550
340 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 551 validate_pat_support(c);
341 cpu_devs[c->x86_vendor]->c_early_init) 552}
342 cpu_devs[c->x86_vendor]->c_early_init(c); 553
554void __init early_cpu_init(void)
555{
556 struct cpu_dev **cdev;
557 int count = 0;
558
559 printk("KERNEL supported cpus:\n");
560 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
561 struct cpu_dev *cpudev = *cdev;
562 unsigned int j;
563
564 if (count >= X86_VENDOR_NUM)
565 break;
566 cpu_devs[count] = cpudev;
567 count++;
568
569 for (j = 0; j < 2; j++) {
570 if (!cpudev->c_ident[j])
571 continue;
572 printk(" %s %s\n", cpudev->c_vendor,
573 cpudev->c_ident[j]);
574 }
575 }
576
577 early_identify_cpu(&boot_cpu_data);
343} 578}
344 579
345/* 580/*
@@ -357,86 +592,41 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
357 592
358static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 593static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
359{ 594{
360 u32 tfms, xlvl; 595 c->extended_cpuid_level = 0;
361 unsigned int ebx;
362
363 if (have_cpuid_p()) {
364 /* Get vendor name */
365 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
366 (unsigned int *)&c->x86_vendor_id[0],
367 (unsigned int *)&c->x86_vendor_id[8],
368 (unsigned int *)&c->x86_vendor_id[4]);
369
370 get_cpu_vendor(c, 0);
371 /* Initialize the standard set of capabilities */
372 /* Note that the vendor-specific code below might override */
373 /* Intel-defined flags: level 0x00000001 */
374 if (c->cpuid_level >= 0x00000001) {
375 u32 capability, excap;
376 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
377 c->x86_capability[0] = capability;
378 c->x86_capability[4] = excap;
379 c->x86 = (tfms >> 8) & 15;
380 c->x86_model = (tfms >> 4) & 15;
381 if (c->x86 == 0xf)
382 c->x86 += (tfms >> 20) & 0xff;
383 if (c->x86 >= 0x6)
384 c->x86_model += ((tfms >> 16) & 0xF) << 4;
385 c->x86_mask = tfms & 15;
386 c->initial_apicid = (ebx >> 24) & 0xFF;
387#ifdef CONFIG_X86_HT
388 c->apicid = phys_pkg_id(c->initial_apicid, 0);
389 c->phys_proc_id = c->initial_apicid;
390#else
391 c->apicid = c->initial_apicid;
392#endif
393 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
394 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
395 } else {
396 /* Have CPUID level 0 only - unheard of */
397 c->x86 = 4;
398 }
399 596
400 /* AMD-defined flags: level 0x80000001 */ 597 if (!have_cpuid_p())
401 xlvl = cpuid_eax(0x80000000); 598 identify_cpu_without_cpuid(c);
402 if ((xlvl & 0xffff0000) == 0x80000000) {
403 if (xlvl >= 0x80000001) {
404 c->x86_capability[1] = cpuid_edx(0x80000001);
405 c->x86_capability[6] = cpuid_ecx(0x80000001);
406 }
407 if (xlvl >= 0x80000004)
408 get_model_name(c); /* Default name */
409 }
410 599
411 init_scattered_cpuid_features(c); 600 /* cyrix could have cpuid enabled via c_identify()*/
412 detect_nopl(c); 601 if (!have_cpuid_p())
413 } 602 return;
414}
415 603
416static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 604 cpu_detect(c);
417{
418 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
419 /* Disable processor serial number */
420 unsigned long lo, hi;
421 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
422 lo |= 0x200000;
423 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
424 printk(KERN_NOTICE "CPU serial number disabled.\n");
425 clear_cpu_cap(c, X86_FEATURE_PN);
426 605
427 /* Disabling the serial number may affect the cpuid level */ 606 get_cpu_vendor(c);
428 c->cpuid_level = cpuid_eax(0);
429 }
430}
431 607
432static int __init x86_serial_nr_setup(char *s) 608 get_cpu_cap(c);
433{ 609
434 disable_x86_serial_nr = 0; 610 if (c->cpuid_level >= 0x00000001) {
435 return 1; 611 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
436} 612#ifdef CONFIG_X86_32
437__setup("serialnumber", x86_serial_nr_setup); 613# ifdef CONFIG_X86_HT
614 c->apicid = phys_pkg_id(c->initial_apicid, 0);
615# else
616 c->apicid = c->initial_apicid;
617# endif
618#endif
438 619
620#ifdef CONFIG_X86_HT
621 c->phys_proc_id = c->initial_apicid;
622#endif
623 }
439 624
625 get_model_name(c); /* Default name */
626
627 init_scattered_cpuid_features(c);
628 detect_nopl(c);
629}
440 630
441/* 631/*
442 * This does the hard work of actually picking apart the CPU stuff... 632 * This does the hard work of actually picking apart the CPU stuff...
@@ -448,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
448 c->loops_per_jiffy = loops_per_jiffy; 638 c->loops_per_jiffy = loops_per_jiffy;
449 c->x86_cache_size = -1; 639 c->x86_cache_size = -1;
450 c->x86_vendor = X86_VENDOR_UNKNOWN; 640 c->x86_vendor = X86_VENDOR_UNKNOWN;
451 c->cpuid_level = -1; /* CPUID not detected */
452 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 641 c->x86_model = c->x86_mask = 0; /* So far unknown... */
453 c->x86_vendor_id[0] = '\0'; /* Unset */ 642 c->x86_vendor_id[0] = '\0'; /* Unset */
454 c->x86_model_id[0] = '\0'; /* Unset */ 643 c->x86_model_id[0] = '\0'; /* Unset */
455 c->x86_max_cores = 1; 644 c->x86_max_cores = 1;
645 c->x86_coreid_bits = 0;
646#ifdef CONFIG_X86_64
647 c->x86_clflush_size = 64;
648#else
649 c->cpuid_level = -1; /* CPUID not detected */
456 c->x86_clflush_size = 32; 650 c->x86_clflush_size = 32;
651#endif
652 c->x86_cache_alignment = c->x86_clflush_size;
457 memset(&c->x86_capability, 0, sizeof c->x86_capability); 653 memset(&c->x86_capability, 0, sizeof c->x86_capability);
458 654
459 if (!have_cpuid_p()) {
460 /*
461 * First of all, decide if this is a 486 or higher
462 * It's a 486 if we can modify the AC flag
463 */
464 if (flag_is_changeable_p(X86_EFLAGS_AC))
465 c->x86 = 4;
466 else
467 c->x86 = 3;
468 }
469
470 generic_identify(c); 655 generic_identify(c);
471 656
472 if (this_cpu->c_identify) 657 if (this_cpu->c_identify)
473 this_cpu->c_identify(c); 658 this_cpu->c_identify(c);
474 659
660#ifdef CONFIG_X86_64
661 c->apicid = phys_pkg_id(0);
662#endif
663
475 /* 664 /*
476 * Vendor-specific initialization. In this section we 665 * Vendor-specific initialization. In this section we
477 * canonicalize the feature flags, meaning if there are 666 * canonicalize the feature flags, meaning if there are
@@ -505,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
505 c->x86, c->x86_model); 694 c->x86, c->x86_model);
506 } 695 }
507 696
697#ifdef CONFIG_X86_64
698 detect_ht(c);
699#endif
700
508 /* 701 /*
509 * On SMP, boot_cpu_data holds the common feature set between 702 * On SMP, boot_cpu_data holds the common feature set between
510 * all CPUs; so make sure that we indicate which features are 703 * all CPUs; so make sure that we indicate which features are
@@ -513,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
513 */ 706 */
514 if (c != &boot_cpu_data) { 707 if (c != &boot_cpu_data) {
515 /* AND the already accumulated flags with these */ 708 /* AND the already accumulated flags with these */
516 for (i = 0 ; i < NCAPINTS ; i++) 709 for (i = 0; i < NCAPINTS; i++)
517 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 710 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
518 } 711 }
519 712
@@ -521,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
521 for (i = 0; i < NCAPINTS; i++) 714 for (i = 0; i < NCAPINTS; i++)
522 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 715 c->x86_capability[i] &= ~cleared_cpu_caps[i];
523 716
717#ifdef CONFIG_X86_MCE
524 /* Init Machine Check Exception if available. */ 718 /* Init Machine Check Exception if available. */
525 mcheck_init(c); 719 mcheck_init(c);
720#endif
526 721
527 select_idle_routine(c); 722 select_idle_routine(c);
723
724#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
725 numa_add_cpu(smp_processor_id());
726#endif
528} 727}
529 728
729#ifdef CONFIG_X86_64
730static void vgetcpu_set_mode(void)
731{
732 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
733 vgetcpu_mode = VGETCPU_RDTSCP;
734 else
735 vgetcpu_mode = VGETCPU_LSL;
736}
737#endif
738
530void __init identify_boot_cpu(void) 739void __init identify_boot_cpu(void)
531{ 740{
532 identify_cpu(&boot_cpu_data); 741 identify_cpu(&boot_cpu_data);
742#ifdef CONFIG_X86_32
533 sysenter_setup(); 743 sysenter_setup();
534 enable_sep_cpu(); 744 enable_sep_cpu();
745#else
746 vgetcpu_set_mode();
747#endif
535} 748}
536 749
537void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 750void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
538{ 751{
539 BUG_ON(c == &boot_cpu_data); 752 BUG_ON(c == &boot_cpu_data);
540 identify_cpu(c); 753 identify_cpu(c);
754#ifdef CONFIG_X86_32
541 enable_sep_cpu(); 755 enable_sep_cpu();
756#endif
542 mtrr_ap_init(); 757 mtrr_ap_init();
543} 758}
544 759
545#ifdef CONFIG_X86_HT 760struct msr_range {
546void __cpuinit detect_ht(struct cpuinfo_x86 *c) 761 unsigned min;
547{ 762 unsigned max;
548 u32 eax, ebx, ecx, edx; 763};
549 int index_msb, core_bits;
550
551 cpuid(1, &eax, &ebx, &ecx, &edx);
552
553 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
554 return;
555
556 smp_num_siblings = (ebx & 0xff0000) >> 16;
557 764
558 if (smp_num_siblings == 1) { 765static struct msr_range msr_range_array[] __cpuinitdata = {
559 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 766 { 0x00000000, 0x00000418},
560 } else if (smp_num_siblings > 1) { 767 { 0xc0000000, 0xc000040b},
768 { 0xc0010000, 0xc0010142},
769 { 0xc0011000, 0xc001103b},
770};
561 771
562 if (smp_num_siblings > NR_CPUS) { 772static void __cpuinit print_cpu_msr(void)
563 printk(KERN_WARNING "CPU: Unsupported number of the " 773{
564 "siblings %d", smp_num_siblings); 774 unsigned index;
565 smp_num_siblings = 1; 775 u64 val;
566 return; 776 int i;
777 unsigned index_min, index_max;
778
779 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
780 index_min = msr_range_array[i].min;
781 index_max = msr_range_array[i].max;
782 for (index = index_min; index < index_max; index++) {
783 if (rdmsrl_amd_safe(index, &val))
784 continue;
785 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
567 } 786 }
787 }
788}
568 789
569 index_msb = get_count_order(smp_num_siblings); 790static int show_msr __cpuinitdata;
570 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 791static __init int setup_show_msr(char *arg)
571 792{
572 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 793 int num;
573 c->phys_proc_id);
574
575 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
576
577 index_msb = get_count_order(smp_num_siblings) ;
578
579 core_bits = get_count_order(c->x86_max_cores);
580 794
581 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 795 get_option(&arg, &num);
582 ((1 << core_bits) - 1);
583 796
584 if (c->x86_max_cores > 1) 797 if (num > 0)
585 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 798 show_msr = num;
586 c->cpu_core_id); 799 return 1;
587 }
588} 800}
589#endif 801__setup("show_msr=", setup_show_msr);
590 802
591static __init int setup_noclflush(char *arg) 803static __init int setup_noclflush(char *arg)
592{ 804{
@@ -604,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
604 else if (c->cpuid_level >= 0) 816 else if (c->cpuid_level >= 0)
605 vendor = c->x86_vendor_id; 817 vendor = c->x86_vendor_id;
606 818
607 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 819 if (vendor && !strstr(c->x86_model_id, vendor))
608 printk("%s ", vendor); 820 printk(KERN_CONT "%s ", vendor);
609 821
610 if (!c->x86_model_id[0]) 822 if (c->x86_model_id[0])
611 printk("%d86", c->x86); 823 printk(KERN_CONT "%s", c->x86_model_id);
612 else 824 else
613 printk("%s", c->x86_model_id); 825 printk(KERN_CONT "%d86", c->x86);
614 826
615 if (c->x86_mask || c->cpuid_level >= 0) 827 if (c->x86_mask || c->cpuid_level >= 0)
616 printk(" stepping %02x\n", c->x86_mask); 828 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
617 else 829 else
618 printk("\n"); 830 printk(KERN_CONT "\n");
831
832#ifdef CONFIG_SMP
833 if (c->cpu_index < show_msr)
834 print_cpu_msr();
835#else
836 if (show_msr)
837 print_cpu_msr();
838#endif
619} 839}
620 840
621static __init int setup_disablecpuid(char *arg) 841static __init int setup_disablecpuid(char *arg)
@@ -631,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid);
631 851
632cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 852cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
633 853
634void __init early_cpu_init(void) 854#ifdef CONFIG_X86_64
855struct x8664_pda **_cpu_pda __read_mostly;
856EXPORT_SYMBOL(_cpu_pda);
857
858struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
859
860char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
861
862void __cpuinit pda_init(int cpu)
635{ 863{
636 struct cpu_vendor_dev *cvdev; 864 struct x8664_pda *pda = cpu_pda(cpu);
865
866 /* Setup up data that may be needed in __get_free_pages early */
867 loadsegment(fs, 0);
868 loadsegment(gs, 0);
869 /* Memory clobbers used to order PDA accessed */
870 mb();
871 wrmsrl(MSR_GS_BASE, pda);
872 mb();
873
874 pda->cpunumber = cpu;
875 pda->irqcount = -1;
876 pda->kernelstack = (unsigned long)stack_thread_info() -
877 PDA_STACKOFFSET + THREAD_SIZE;
878 pda->active_mm = &init_mm;
879 pda->mmu_state = 0;
880
881 if (cpu == 0) {
882 /* others are initialized in smpboot.c */
883 pda->pcurrent = &init_task;
884 pda->irqstackptr = boot_cpu_stack;
885 pda->irqstackptr += IRQSTACKSIZE - 64;
886 } else {
887 if (!pda->irqstackptr) {
888 pda->irqstackptr = (char *)
889 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
890 if (!pda->irqstackptr)
891 panic("cannot allocate irqstack for cpu %d",
892 cpu);
893 pda->irqstackptr += IRQSTACKSIZE - 64;
894 }
895
896 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
897 pda->nodenumber = cpu_to_node(cpu);
898 }
899}
900
901char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
902 DEBUG_STKSZ] __page_aligned_bss;
637 903
638 for (cvdev = __x86cpuvendor_start ; 904extern asmlinkage void ignore_sysret(void);
639 cvdev < __x86cpuvendor_end ;
640 cvdev++)
641 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
642 905
643 early_cpu_detect(); 906/* May not be marked __init: used by software suspend */
644 validate_pat_support(&boot_cpu_data); 907void syscall_init(void)
908{
909 /*
910 * LSTAR and STAR live in a bit strange symbiosis.
911 * They both write to the same internal register. STAR allows to
912 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
913 */
914 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
915 wrmsrl(MSR_LSTAR, system_call);
916 wrmsrl(MSR_CSTAR, ignore_sysret);
917
918#ifdef CONFIG_IA32_EMULATION
919 syscall32_cpu_init();
920#endif
921
922 /* Flags to clear on syscall */
923 wrmsrl(MSR_SYSCALL_MASK,
924 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
645} 925}
646 926
927unsigned long kernel_eflags;
928
929/*
930 * Copies of the original ist values from the tss are only accessed during
931 * debugging, no special alignment required.
932 */
933DEFINE_PER_CPU(struct orig_ist, orig_ist);
934
935#else
936
647/* Make sure %fs is initialized properly in idle threads */ 937/* Make sure %fs is initialized properly in idle threads */
648struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 938struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
649{ 939{
@@ -651,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
651 regs->fs = __KERNEL_PERCPU; 941 regs->fs = __KERNEL_PERCPU;
652 return regs; 942 return regs;
653} 943}
654 944#endif
655/* Current gdt points %fs at the "master" per-cpu area: after this,
656 * it's on the real one. */
657void switch_to_new_gdt(void)
658{
659 struct desc_ptr gdt_descr;
660
661 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
662 gdt_descr.size = GDT_SIZE - 1;
663 load_gdt(&gdt_descr);
664 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
665}
666 945
667/* 946/*
668 * cpu_init() initializes state that is per-CPU. Some data is already 947 * cpu_init() initializes state that is per-CPU. Some data is already
669 * initialized (naturally) in the bootstrap process, such as the GDT 948 * initialized (naturally) in the bootstrap process, such as the GDT
670 * and IDT. We reload them nevertheless, this function acts as a 949 * and IDT. We reload them nevertheless, this function acts as a
671 * 'CPU state barrier', nothing should get across. 950 * 'CPU state barrier', nothing should get across.
951 * A lot of state is already set up in PDA init for 64 bit
672 */ 952 */
953#ifdef CONFIG_X86_64
954void __cpuinit cpu_init(void)
955{
956 int cpu = stack_smp_processor_id();
957 struct tss_struct *t = &per_cpu(init_tss, cpu);
958 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
959 unsigned long v;
960 char *estacks = NULL;
961 struct task_struct *me;
962 int i;
963
964 /* CPU 0 is initialised in head64.c */
965 if (cpu != 0)
966 pda_init(cpu);
967 else
968 estacks = boot_exception_stacks;
969
970 me = current;
971
972 if (cpu_test_and_set(cpu, cpu_initialized))
973 panic("CPU#%d already initialized!\n", cpu);
974
975 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
976
977 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
978
979 /*
980 * Initialize the per-CPU GDT with the boot GDT,
981 * and set up the GDT descriptor:
982 */
983
984 switch_to_new_gdt();
985 load_idt((const struct desc_ptr *)&idt_descr);
986
987 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
988 syscall_init();
989
990 wrmsrl(MSR_FS_BASE, 0);
991 wrmsrl(MSR_KERNEL_GS_BASE, 0);
992 barrier();
993
994 check_efer();
995 if (cpu != 0 && x2apic)
996 enable_x2apic();
997
998 /*
999 * set up and load the per-CPU TSS
1000 */
1001 if (!orig_ist->ist[0]) {
1002 static const unsigned int order[N_EXCEPTION_STACKS] = {
1003 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1004 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1005 };
1006 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1007 if (cpu) {
1008 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1009 if (!estacks)
1010 panic("Cannot allocate exception "
1011 "stack %ld %d\n", v, cpu);
1012 }
1013 estacks += PAGE_SIZE << order[v];
1014 orig_ist->ist[v] = t->x86_tss.ist[v] =
1015 (unsigned long)estacks;
1016 }
1017 }
1018
1019 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1020 /*
1021 * <= is required because the CPU will access up to
1022 * 8 bits beyond the end of the IO permission bitmap.
1023 */
1024 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1025 t->io_bitmap[i] = ~0UL;
1026
1027 atomic_inc(&init_mm.mm_count);
1028 me->active_mm = &init_mm;
1029 if (me->mm)
1030 BUG();
1031 enter_lazy_tlb(&init_mm, me);
1032
1033 load_sp0(t, &current->thread);
1034 set_tss_desc(cpu, t);
1035 load_TR_desc();
1036 load_LDT(&init_mm.context);
1037
1038#ifdef CONFIG_KGDB
1039 /*
1040 * If the kgdb is connected no debug regs should be altered. This
1041 * is only applicable when KGDB and a KGDB I/O module are built
1042 * into the kernel and you are using early debugging with
1043 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1044 */
1045 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1046 arch_kgdb_ops.correct_hw_break();
1047 else {
1048#endif
1049 /*
1050 * Clear all 6 debug registers:
1051 */
1052
1053 set_debugreg(0UL, 0);
1054 set_debugreg(0UL, 1);
1055 set_debugreg(0UL, 2);
1056 set_debugreg(0UL, 3);
1057 set_debugreg(0UL, 6);
1058 set_debugreg(0UL, 7);
1059#ifdef CONFIG_KGDB
1060 /* If the kgdb is connected no debug regs should be altered. */
1061 }
1062#endif
1063
1064 fpu_init();
1065
1066 raw_local_save_flags(kernel_eflags);
1067
1068 if (is_uv_system())
1069 uv_cpu_init();
1070}
1071
1072#else
1073
673void __cpuinit cpu_init(void) 1074void __cpuinit cpu_init(void)
674{ 1075{
675 int cpu = smp_processor_id(); 1076 int cpu = smp_processor_id();
@@ -723,19 +1124,21 @@ void __cpuinit cpu_init(void)
723 /* 1124 /*
724 * Force FPU initialization: 1125 * Force FPU initialization:
725 */ 1126 */
726 current_thread_info()->status = 0; 1127 if (cpu_has_xsave)
1128 current_thread_info()->status = TS_XSAVE;
1129 else
1130 current_thread_info()->status = 0;
727 clear_used_math(); 1131 clear_used_math();
728 mxcsr_feature_mask_init(); 1132 mxcsr_feature_mask_init();
729}
730 1133
731#ifdef CONFIG_HOTPLUG_CPU 1134 /*
732void __cpuinit cpu_uninit(void) 1135 * Boot processor to setup the FP and extended state context info.
733{ 1136 */
734 int cpu = raw_smp_processor_id(); 1137 if (!smp_processor_id())
735 cpu_clear(cpu, cpu_initialized); 1138 init_thread_xstate();
736 1139
737 /* lazy TLB state */ 1140 xsave_init();
738 per_cpu(cpu_tlbstate, cpu).state = 0;
739 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
740} 1141}
1142
1143
741#endif 1144#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index 305b465889b0..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,763 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/delay.h>
11#include <linux/smp.h>
12#include <linux/percpu.h>
13#include <asm/i387.h>
14#include <asm/msr.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h>
18#include <asm/mtrr.h>
19#include <asm/mce.h>
20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
23#ifdef CONFIG_X86_LOCAL_APIC
24#include <asm/mpspec.h>
25#include <asm/apic.h>
26#include <mach_apic.h>
27#endif
28#include <asm/pda.h>
29#include <asm/pgtable.h>
30#include <asm/processor.h>
31#include <asm/desc.h>
32#include <asm/atomic.h>
33#include <asm/proto.h>
34#include <asm/sections.h>
35#include <asm/setup.h>
36#include <asm/genapic.h>
37
38#include "cpu.h"
39
40/* We need valid kernel segments for data and code in long mode too
41 * IRET will check the segment types kkeil 2000/10/28
42 * Also sysret mandates a special GDT layout
43 */
44/* The TLS descriptors are currently at a different place compared to i386.
45 Hopefully nobody expects them at a fixed place (Wine?) */
46DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
47 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
48 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
49 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
50 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
51 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
52 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
53} };
54EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
55
56__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
57
58/* Current gdt points %fs at the "master" per-cpu area: after this,
59 * it's on the real one. */
60void switch_to_new_gdt(void)
61{
62 struct desc_ptr gdt_descr;
63
64 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
65 gdt_descr.size = GDT_SIZE - 1;
66 load_gdt(&gdt_descr);
67}
68
69struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
70
71static void __cpuinit default_init(struct cpuinfo_x86 *c)
72{
73 display_cacheinfo(c);
74}
75
76static struct cpu_dev __cpuinitdata default_cpu = {
77 .c_init = default_init,
78 .c_vendor = "Unknown",
79};
80static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
81
82int __cpuinit get_model_name(struct cpuinfo_x86 *c)
83{
84 unsigned int *v;
85
86 if (c->extended_cpuid_level < 0x80000004)
87 return 0;
88
89 v = (unsigned int *) c->x86_model_id;
90 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
91 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
92 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
93 c->x86_model_id[48] = 0;
94 return 1;
95}
96
97
98void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
99{
100 unsigned int n, dummy, ebx, ecx, edx;
101
102 n = c->extended_cpuid_level;
103
104 if (n >= 0x80000005) {
105 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
106 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
107 "D cache %dK (%d bytes/line)\n",
108 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
109 c->x86_cache_size = (ecx>>24) + (edx>>24);
110 /* On K8 L1 TLB is inclusive, so don't count it */
111 c->x86_tlbsize = 0;
112 }
113
114 if (n >= 0x80000006) {
115 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
116 ecx = cpuid_ecx(0x80000006);
117 c->x86_cache_size = ecx >> 16;
118 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
119
120 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
121 c->x86_cache_size, ecx & 0xFF);
122 }
123}
124
125void __cpuinit detect_ht(struct cpuinfo_x86 *c)
126{
127#ifdef CONFIG_SMP
128 u32 eax, ebx, ecx, edx;
129 int index_msb, core_bits;
130
131 cpuid(1, &eax, &ebx, &ecx, &edx);
132
133
134 if (!cpu_has(c, X86_FEATURE_HT))
135 return;
136 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
137 goto out;
138
139 smp_num_siblings = (ebx & 0xff0000) >> 16;
140
141 if (smp_num_siblings == 1) {
142 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
143 } else if (smp_num_siblings > 1) {
144
145 if (smp_num_siblings > NR_CPUS) {
146 printk(KERN_WARNING "CPU: Unsupported number of "
147 "siblings %d", smp_num_siblings);
148 smp_num_siblings = 1;
149 return;
150 }
151
152 index_msb = get_count_order(smp_num_siblings);
153 c->phys_proc_id = phys_pkg_id(index_msb);
154
155 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
156
157 index_msb = get_count_order(smp_num_siblings);
158
159 core_bits = get_count_order(c->x86_max_cores);
160
161 c->cpu_core_id = phys_pkg_id(index_msb) &
162 ((1 << core_bits) - 1);
163 }
164out:
165 if ((c->x86_max_cores * smp_num_siblings) > 1) {
166 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
167 c->phys_proc_id);
168 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
169 c->cpu_core_id);
170 }
171
172#endif
173}
174
175static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
176{
177 char *v = c->x86_vendor_id;
178 int i;
179 static int printed;
180
181 for (i = 0; i < X86_VENDOR_NUM; i++) {
182 if (cpu_devs[i]) {
183 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
184 (cpu_devs[i]->c_ident[1] &&
185 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
186 c->x86_vendor = i;
187 this_cpu = cpu_devs[i];
188 return;
189 }
190 }
191 }
192 if (!printed) {
193 printed++;
194 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
195 printk(KERN_ERR "CPU: Your system may be unstable.\n");
196 }
197 c->x86_vendor = X86_VENDOR_UNKNOWN;
198}
199
200static void __init early_cpu_support_print(void)
201{
202 int i,j;
203 struct cpu_dev *cpu_devx;
204
205 printk("KERNEL supported cpus:\n");
206 for (i = 0; i < X86_VENDOR_NUM; i++) {
207 cpu_devx = cpu_devs[i];
208 if (!cpu_devx)
209 continue;
210 for (j = 0; j < 2; j++) {
211 if (!cpu_devx->c_ident[j])
212 continue;
213 printk(" %s %s\n", cpu_devx->c_vendor,
214 cpu_devx->c_ident[j]);
215 }
216 }
217}
218
219/*
220 * The NOPL instruction is supposed to exist on all CPUs with
221 * family >= 6, unfortunately, that's not true in practice because
222 * of early VIA chips and (more importantly) broken virtualizers that
223 * are not easy to detect. Hence, probe for it based on first
224 * principles.
225 *
226 * Note: no 64-bit chip is known to lack these, but put the code here
227 * for consistency with 32 bits, and to make it utterly trivial to
228 * diagnose the problem should it ever surface.
229 */
230static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
231{
232 const u32 nopl_signature = 0x888c53b1; /* Random number */
233 u32 has_nopl = nopl_signature;
234
235 clear_cpu_cap(c, X86_FEATURE_NOPL);
236 if (c->x86 >= 6) {
237 asm volatile("\n"
238 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
239 "2:\n"
240 " .section .fixup,\"ax\"\n"
241 "3: xor %0,%0\n"
242 " jmp 2b\n"
243 " .previous\n"
244 _ASM_EXTABLE(1b,3b)
245 : "+a" (has_nopl));
246
247 if (has_nopl == nopl_signature)
248 set_cpu_cap(c, X86_FEATURE_NOPL);
249 }
250}
251
252static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
253
254void __init early_cpu_init(void)
255{
256 struct cpu_vendor_dev *cvdev;
257
258 for (cvdev = __x86cpuvendor_start ;
259 cvdev < __x86cpuvendor_end ;
260 cvdev++)
261 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
262 early_cpu_support_print();
263 early_identify_cpu(&boot_cpu_data);
264}
265
266/* Do some early cpuid on the boot CPU to get some parameter that are
267 needed before check_bugs. Everything advanced is in identify_cpu
268 below. */
269static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
270{
271 u32 tfms, xlvl;
272
273 c->loops_per_jiffy = loops_per_jiffy;
274 c->x86_cache_size = -1;
275 c->x86_vendor = X86_VENDOR_UNKNOWN;
276 c->x86_model = c->x86_mask = 0; /* So far unknown... */
277 c->x86_vendor_id[0] = '\0'; /* Unset */
278 c->x86_model_id[0] = '\0'; /* Unset */
279 c->x86_clflush_size = 64;
280 c->x86_cache_alignment = c->x86_clflush_size;
281 c->x86_max_cores = 1;
282 c->x86_coreid_bits = 0;
283 c->extended_cpuid_level = 0;
284 memset(&c->x86_capability, 0, sizeof c->x86_capability);
285
286 /* Get vendor name */
287 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
288 (unsigned int *)&c->x86_vendor_id[0],
289 (unsigned int *)&c->x86_vendor_id[8],
290 (unsigned int *)&c->x86_vendor_id[4]);
291
292 get_cpu_vendor(c);
293
294 /* Initialize the standard set of capabilities */
295 /* Note that the vendor-specific code below might override */
296
297 /* Intel-defined flags: level 0x00000001 */
298 if (c->cpuid_level >= 0x00000001) {
299 __u32 misc;
300 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
301 &c->x86_capability[0]);
302 c->x86 = (tfms >> 8) & 0xf;
303 c->x86_model = (tfms >> 4) & 0xf;
304 c->x86_mask = tfms & 0xf;
305 if (c->x86 == 0xf)
306 c->x86 += (tfms >> 20) & 0xff;
307 if (c->x86 >= 0x6)
308 c->x86_model += ((tfms >> 16) & 0xF) << 4;
309 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
310 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
311 } else {
312 /* Have CPUID level 0 only - unheard of */
313 c->x86 = 4;
314 }
315
316 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
317#ifdef CONFIG_SMP
318 c->phys_proc_id = c->initial_apicid;
319#endif
320 /* AMD-defined flags: level 0x80000001 */
321 xlvl = cpuid_eax(0x80000000);
322 c->extended_cpuid_level = xlvl;
323 if ((xlvl & 0xffff0000) == 0x80000000) {
324 if (xlvl >= 0x80000001) {
325 c->x86_capability[1] = cpuid_edx(0x80000001);
326 c->x86_capability[6] = cpuid_ecx(0x80000001);
327 }
328 if (xlvl >= 0x80000004)
329 get_model_name(c); /* Default name */
330 }
331
332 /* Transmeta-defined flags: level 0x80860001 */
333 xlvl = cpuid_eax(0x80860000);
334 if ((xlvl & 0xffff0000) == 0x80860000) {
335 /* Don't set x86_cpuid_level here for now to not confuse. */
336 if (xlvl >= 0x80860001)
337 c->x86_capability[2] = cpuid_edx(0x80860001);
338 }
339
340 if (c->extended_cpuid_level >= 0x80000007)
341 c->x86_power = cpuid_edx(0x80000007);
342
343 if (c->extended_cpuid_level >= 0x80000008) {
344 u32 eax = cpuid_eax(0x80000008);
345
346 c->x86_virt_bits = (eax >> 8) & 0xff;
347 c->x86_phys_bits = eax & 0xff;
348 }
349
350 detect_nopl(c);
351
352 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
353 cpu_devs[c->x86_vendor]->c_early_init)
354 cpu_devs[c->x86_vendor]->c_early_init(c);
355
356 validate_pat_support(c);
357}
358
359/*
360 * This does the hard work of actually picking apart the CPU stuff...
361 */
362static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
363{
364 int i;
365
366 early_identify_cpu(c);
367
368 init_scattered_cpuid_features(c);
369
370 c->apicid = phys_pkg_id(0);
371
372 /*
373 * Vendor-specific initialization. In this section we
374 * canonicalize the feature flags, meaning if there are
375 * features a certain CPU supports which CPUID doesn't
376 * tell us, CPUID claiming incorrect flags, or other bugs,
377 * we handle them here.
378 *
379 * At the end of this section, c->x86_capability better
380 * indicate the features this CPU genuinely supports!
381 */
382 if (this_cpu->c_init)
383 this_cpu->c_init(c);
384
385 detect_ht(c);
386
387 /*
388 * On SMP, boot_cpu_data holds the common feature set between
389 * all CPUs; so make sure that we indicate which features are
390 * common between the CPUs. The first time this routine gets
391 * executed, c == &boot_cpu_data.
392 */
393 if (c != &boot_cpu_data) {
394 /* AND the already accumulated flags with these */
395 for (i = 0; i < NCAPINTS; i++)
396 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
397 }
398
399 /* Clear all flags overriden by options */
400 for (i = 0; i < NCAPINTS; i++)
401 c->x86_capability[i] &= ~cleared_cpu_caps[i];
402
403#ifdef CONFIG_X86_MCE
404 mcheck_init(c);
405#endif
406 select_idle_routine(c);
407
408#ifdef CONFIG_NUMA
409 numa_add_cpu(smp_processor_id());
410#endif
411
412}
413
414void __cpuinit identify_boot_cpu(void)
415{
416 identify_cpu(&boot_cpu_data);
417}
418
419void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
420{
421 BUG_ON(c == &boot_cpu_data);
422 identify_cpu(c);
423 mtrr_ap_init();
424}
425
426static __init int setup_noclflush(char *arg)
427{
428 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
429 return 1;
430}
431__setup("noclflush", setup_noclflush);
432
433struct msr_range {
434 unsigned min;
435 unsigned max;
436};
437
438static struct msr_range msr_range_array[] __cpuinitdata = {
439 { 0x00000000, 0x00000418},
440 { 0xc0000000, 0xc000040b},
441 { 0xc0010000, 0xc0010142},
442 { 0xc0011000, 0xc001103b},
443};
444
445static void __cpuinit print_cpu_msr(void)
446{
447 unsigned index;
448 u64 val;
449 int i;
450 unsigned index_min, index_max;
451
452 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
453 index_min = msr_range_array[i].min;
454 index_max = msr_range_array[i].max;
455 for (index = index_min; index < index_max; index++) {
456 if (rdmsrl_amd_safe(index, &val))
457 continue;
458 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
459 }
460 }
461}
462
463static int show_msr __cpuinitdata;
464static __init int setup_show_msr(char *arg)
465{
466 int num;
467
468 get_option(&arg, &num);
469
470 if (num > 0)
471 show_msr = num;
472 return 1;
473}
474__setup("show_msr=", setup_show_msr);
475
476void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
477{
478 if (c->x86_model_id[0])
479 printk(KERN_CONT "%s", c->x86_model_id);
480
481 if (c->x86_mask || c->cpuid_level >= 0)
482 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
483 else
484 printk(KERN_CONT "\n");
485
486#ifdef CONFIG_SMP
487 if (c->cpu_index < show_msr)
488 print_cpu_msr();
489#else
490 if (show_msr)
491 print_cpu_msr();
492#endif
493}
494
495static __init int setup_disablecpuid(char *arg)
496{
497 int bit;
498 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
499 setup_clear_cpu_cap(bit);
500 else
501 return 0;
502 return 1;
503}
504__setup("clearcpuid=", setup_disablecpuid);
505
506cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
507
508struct x8664_pda **_cpu_pda __read_mostly;
509EXPORT_SYMBOL(_cpu_pda);
510
511struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
512
513char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
514
515unsigned long __supported_pte_mask __read_mostly = ~0UL;
516EXPORT_SYMBOL_GPL(__supported_pte_mask);
517
518static int do_not_nx __cpuinitdata;
519
520/* noexec=on|off
521Control non executable mappings for 64bit processes.
522
523on Enable(default)
524off Disable
525*/
526static int __init nonx_setup(char *str)
527{
528 if (!str)
529 return -EINVAL;
530 if (!strncmp(str, "on", 2)) {
531 __supported_pte_mask |= _PAGE_NX;
532 do_not_nx = 0;
533 } else if (!strncmp(str, "off", 3)) {
534 do_not_nx = 1;
535 __supported_pte_mask &= ~_PAGE_NX;
536 }
537 return 0;
538}
539early_param("noexec", nonx_setup);
540
541int force_personality32;
542
543/* noexec32=on|off
544Control non executable heap for 32bit processes.
545To control the stack too use noexec=off
546
547on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
548off PROT_READ implies PROT_EXEC
549*/
550static int __init nonx32_setup(char *str)
551{
552 if (!strcmp(str, "on"))
553 force_personality32 &= ~READ_IMPLIES_EXEC;
554 else if (!strcmp(str, "off"))
555 force_personality32 |= READ_IMPLIES_EXEC;
556 return 1;
557}
558__setup("noexec32=", nonx32_setup);
559
560void pda_init(int cpu)
561{
562 struct x8664_pda *pda = cpu_pda(cpu);
563
564 /* Setup up data that may be needed in __get_free_pages early */
565 loadsegment(fs, 0);
566 loadsegment(gs, 0);
567 /* Memory clobbers used to order PDA accessed */
568 mb();
569 wrmsrl(MSR_GS_BASE, pda);
570 mb();
571
572 pda->cpunumber = cpu;
573 pda->irqcount = -1;
574 pda->kernelstack = (unsigned long)stack_thread_info() -
575 PDA_STACKOFFSET + THREAD_SIZE;
576 pda->active_mm = &init_mm;
577 pda->mmu_state = 0;
578
579 if (cpu == 0) {
580 /* others are initialized in smpboot.c */
581 pda->pcurrent = &init_task;
582 pda->irqstackptr = boot_cpu_stack;
583 pda->irqstackptr += IRQSTACKSIZE - 64;
584 } else {
585 if (!pda->irqstackptr) {
586 pda->irqstackptr = (char *)
587 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
588 if (!pda->irqstackptr)
589 panic("cannot allocate irqstack for cpu %d",
590 cpu);
591 pda->irqstackptr += IRQSTACKSIZE - 64;
592 }
593
594 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
595 pda->nodenumber = cpu_to_node(cpu);
596 }
597}
598
599char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
600 DEBUG_STKSZ] __page_aligned_bss;
601
602extern asmlinkage void ignore_sysret(void);
603
604/* May not be marked __init: used by software suspend */
605void syscall_init(void)
606{
607 /*
608 * LSTAR and STAR live in a bit strange symbiosis.
609 * They both write to the same internal register. STAR allows to
610 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
611 */
612 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
613 wrmsrl(MSR_LSTAR, system_call);
614 wrmsrl(MSR_CSTAR, ignore_sysret);
615
616#ifdef CONFIG_IA32_EMULATION
617 syscall32_cpu_init();
618#endif
619
620 /* Flags to clear on syscall */
621 wrmsrl(MSR_SYSCALL_MASK,
622 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
623}
624
625void __cpuinit check_efer(void)
626{
627 unsigned long efer;
628
629 rdmsrl(MSR_EFER, efer);
630 if (!(efer & EFER_NX) || do_not_nx)
631 __supported_pte_mask &= ~_PAGE_NX;
632}
633
634unsigned long kernel_eflags;
635
636/*
637 * Copies of the original ist values from the tss are only accessed during
638 * debugging, no special alignment required.
639 */
640DEFINE_PER_CPU(struct orig_ist, orig_ist);
641
642/*
643 * cpu_init() initializes state that is per-CPU. Some data is already
644 * initialized (naturally) in the bootstrap process, such as the GDT
645 * and IDT. We reload them nevertheless, this function acts as a
646 * 'CPU state barrier', nothing should get across.
647 * A lot of state is already set up in PDA init.
648 */
649void __cpuinit cpu_init(void)
650{
651 int cpu = stack_smp_processor_id();
652 struct tss_struct *t = &per_cpu(init_tss, cpu);
653 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
654 unsigned long v;
655 char *estacks = NULL;
656 struct task_struct *me;
657 int i;
658
659 /* CPU 0 is initialised in head64.c */
660 if (cpu != 0)
661 pda_init(cpu);
662 else
663 estacks = boot_exception_stacks;
664
665 me = current;
666
667 if (cpu_test_and_set(cpu, cpu_initialized))
668 panic("CPU#%d already initialized!\n", cpu);
669
670 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
671
672 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
673
674 /*
675 * Initialize the per-CPU GDT with the boot GDT,
676 * and set up the GDT descriptor:
677 */
678
679 switch_to_new_gdt();
680 load_idt((const struct desc_ptr *)&idt_descr);
681
682 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
683 syscall_init();
684
685 wrmsrl(MSR_FS_BASE, 0);
686 wrmsrl(MSR_KERNEL_GS_BASE, 0);
687 barrier();
688
689 check_efer();
690
691 /*
692 * set up and load the per-CPU TSS
693 */
694 if (!orig_ist->ist[0]) {
695 static const unsigned int order[N_EXCEPTION_STACKS] = {
696 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
697 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
698 };
699 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
700 if (cpu) {
701 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
702 if (!estacks)
703 panic("Cannot allocate exception "
704 "stack %ld %d\n", v, cpu);
705 }
706 estacks += PAGE_SIZE << order[v];
707 orig_ist->ist[v] = t->x86_tss.ist[v] =
708 (unsigned long)estacks;
709 }
710 }
711
712 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
713 /*
714 * <= is required because the CPU will access up to
715 * 8 bits beyond the end of the IO permission bitmap.
716 */
717 for (i = 0; i <= IO_BITMAP_LONGS; i++)
718 t->io_bitmap[i] = ~0UL;
719
720 atomic_inc(&init_mm.mm_count);
721 me->active_mm = &init_mm;
722 if (me->mm)
723 BUG();
724 enter_lazy_tlb(&init_mm, me);
725
726 load_sp0(t, &current->thread);
727 set_tss_desc(cpu, t);
728 load_TR_desc();
729 load_LDT(&init_mm.context);
730
731#ifdef CONFIG_KGDB
732 /*
733 * If the kgdb is connected no debug regs should be altered. This
734 * is only applicable when KGDB and a KGDB I/O module are built
735 * into the kernel and you are using early debugging with
736 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
737 */
738 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
739 arch_kgdb_ops.correct_hw_break();
740 else {
741#endif
742 /*
743 * Clear all 6 debug registers:
744 */
745
746 set_debugreg(0UL, 0);
747 set_debugreg(0UL, 1);
748 set_debugreg(0UL, 2);
749 set_debugreg(0UL, 3);
750 set_debugreg(0UL, 6);
751 set_debugreg(0UL, 7);
752#ifdef CONFIG_KGDB
753 /* If the kgdb is connected no debug regs should be altered. */
754 }
755#endif
756
757 fpu_init();
758
759 raw_local_save_flags(kernel_eflags);
760
761 if (is_uv_system())
762 uv_cpu_init();
763}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565fe..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 * c);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 * c);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
24 int c_x86_vendor;
24}; 25};
25 26
26extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX;
27 31
28struct cpu_vendor_dev { 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
29 int vendor;
30 struct cpu_dev *cpu_dev;
31};
32
33#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
34 static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
35 __attribute__((__section__(".x86cpuvendor.init"))) = \
36 { cpu_vendor_id, cpu_dev }
37
38extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
39 33
40extern int get_model_name(struct cpuinfo_x86 *c);
41extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
42 35
43#endif 36#endif
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 898a5a2002ed..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -121,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
122 122
123 /* Load/Store Serialize to mem access disable (=reorder it) */ 123 /* Load/Store Serialize to mem access disable (=reorder it) */
124 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); 124 setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
125 /* set load/store serialize from 1GB to 4GB */ 125 /* set load/store serialize from 1GB to 4GB */
126 ccr3 |= 0xe0; 126 ccr3 |= 0xe0;
127 setCx86(CX86_CCR3, ccr3); 127 setCx86(CX86_CCR3, ccr3);
@@ -132,11 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
133 133
134 /* CCR2 bit 2: unlock NW bit */ 134 /* CCR2 bit 2: unlock NW bit */
135 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 135 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
136 /* set 'Not Write-through' */ 136 /* set 'Not Write-through' */
137 write_cr0(read_cr0() | X86_CR0_NW); 137 write_cr0(read_cr0() | X86_CR0_NW);
138 /* CCR2 bit 2: lock NW bit and set WT1 */ 138 /* CCR2 bit 2: lock NW bit and set WT1 */
139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
140} 140}
141 141
142/* 142/*
@@ -150,14 +150,14 @@ static void __cpuinit geode_configure(void)
150 local_irq_save(flags); 150 local_irq_save(flags);
151 151
152 /* Suspend on halt power saving and enable #SUSP pin */ 152 /* Suspend on halt power saving and enable #SUSP pin */
153 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 153 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
154 154
155 ccr3 = getCx86(CX86_CCR3); 155 ccr3 = getCx86(CX86_CCR3);
156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
157 157
158 158
159 /* FPU fast, DTE cache, Mem bypass */ 159 /* FPU fast, DTE cache, Mem bypass */
160 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); 160 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
162 162
163 set_cx86_memwb(); 163 set_cx86_memwb();
@@ -291,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
291 /* GXm supports extended cpuid levels 'ala' AMD */ 291 /* GXm supports extended cpuid levels 'ala' AMD */
292 if (c->cpuid_level == 2) { 292 if (c->cpuid_level == 2) {
293 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 293 /* Enable cxMMX extensions (GX1 Datasheet 54) */
294 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); 294 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
295 295
296 /* 296 /*
297 * GXm : 0x30 ... 0x5f GXm datasheet 51 297 * GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -301,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
301 */ 301 */
302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
303 geode_configure(); 303 geode_configure();
304 get_model_name(c); /* get CPU marketing name */
305 return; 304 return;
306 } else { /* MediaGX */ 305 } else { /* MediaGX */
307 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -314,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
314 if (dir1 > 7) { 313 if (dir1 > 7) {
315 dir0_msn++; /* M II */ 314 dir0_msn++; /* M II */
316 /* Enable MMX extensions (App note 108) */ 315 /* Enable MMX extensions (App note 108) */
317 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 316 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
318 } else { 317 } else {
319 c->coma_bug = 1; /* 6x86MX, it has the bug. */ 318 c->coma_bug = 1; /* 6x86MX, it has the bug. */
320 } 319 }
@@ -429,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
429 local_irq_save(flags); 428 local_irq_save(flags);
430 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
431 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
432 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ 431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
433 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
434 local_irq_restore(flags); 433 local_irq_restore(flags);
435 } 434 }
@@ -442,14 +441,16 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
442 .c_early_init = early_init_cyrix, 441 .c_early_init = early_init_cyrix,
443 .c_init = init_cyrix, 442 .c_init = init_cyrix,
444 .c_identify = cyrix_identify, 443 .c_identify = cyrix_identify,
444 .c_x86_vendor = X86_VENDOR_CYRIX,
445}; 445};
446 446
447cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
448 448
449static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
450 .c_vendor = "NSC", 450 .c_vendor = "NSC",
451 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
452 .c_init = init_nsc, 452 .c_init = init_nsc,
453 .c_x86_vendor = X86_VENDOR_NSC,
453}; 454};
454 455
455cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); 456cpu_dev_register(nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index c9017799497c..000000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,84 +0,0 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL,
43 "nopl", NULL, NULL, NULL,
44 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
45
46 /* Intel-defined (#2) */
47 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
48 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
49 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
50 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
51
52 /* VIA/Cyrix/Centaur-defined */
53 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
54 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
57
58 /* AMD-defined (#2) */
59 "lahf_lm", "cmp_legacy", "svm", "extapic",
60 "cr8_legacy", "abm", "sse4a", "misalignsse",
61 "3dnowprefetch", "osvw", "ibs", "sse5",
62 "skinit", "wdt", NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
65
66 /* Auxiliary (Linux-defined) */
67 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
71};
72
73const char *const x86_power_flags[32] = {
74 "ts", /* temperature sensor */
75 "fid", /* frequency id control */
76 "vid", /* voltage id control */
77 "ttp", /* thermal trip */
78 "tm",
79 "stc",
80 "100mhzsteps",
81 "hwpstate",
82 "", /* tsc invariant mapped to constant_tsc */
83 /* nothing */
84};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f113ef4595f6..99468dbd08da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
23#include <mach_apic.h> 28#include <mach_apic.h>
24#endif 29#endif
25 30
26#ifdef CONFIG_X86_INTEL_USERCOPY
27/*
28 * Alignment at which movsl is preferred for bulk memory copies.
29 */
30struct movsl_mask movsl_mask __read_mostly;
31#endif
32
33static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
34{ 32{
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
41} 44}
42 45
46#ifdef CONFIG_X86_32
43/* 47/*
44 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
45 * 49 *
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
59 return 0; 63 return 0;
60} 64}
61 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
62 70
63/* 71 /*
64 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
65 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
66 */ 74 */
67static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
68{ 81{
69 unsigned long lo, hi; 82 unsigned long lo, hi;
70 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
71 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
72 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
73 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
77 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
78 } 121 }
79 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
80} 156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
160}
161#endif
81 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
82 180
83/* 181/*
84 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
85 */ 183 */
86static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
87{ 185{
88 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
89 187
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
98 return 1; 196 return 1;
99} 197}
100 198
101#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
102static void __cpuinit trap_init_f00f_bug(void)
103{ 200{
104 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
105 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
106 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
107 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
108 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
109 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
110 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
111 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
112} 235}
113#endif
114 236
115static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
116{ 238{
117 unsigned int l2 = 0; 239 unsigned int l2 = 0;
118 char *p = NULL;
119 240
120 early_init_intel(c); 241 early_init_intel(c);
121 242
122#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
123 /*
124 * All current models of Pentium and Pentium with MMX technology CPUs
125 * have the F0 0F bug, which lets nonprivileged users lock up the system.
126 * Note that the workaround only should be initialized once...
127 */
128 c->f00f_bug = 0;
129 if (!paravirt_enabled() && c->x86 == 5) {
130 static int f00f_workaround_enabled;
131
132 c->f00f_bug = 1;
133 if (!f00f_workaround_enabled) {
134 trap_init_f00f_bug();
135 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
136 f00f_workaround_enabled = 1;
137 }
138 }
139#endif
140 244
141 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
142 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
146 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
147 } 251 }
148 252
149 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
150 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
151 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
152 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
153 /* 271 /*
154 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
155 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
156 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
157 */ 275 */
158 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
159 switch (c->x86_model) { 279 switch (c->x86_model) {
160 case 5: 280 case 5:
161 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -178,71 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
178 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
179 break; 299 break;
180 } 300 }
181 }
182
183 if (p)
184 strcpy(c->x86_model_id, p);
185
186 c->x86_max_cores = num_cpu_cores(c);
187
188 detect_ht(c);
189 301
190 /* Work around errata */ 302 if (p)
191 Intel_errata_workarounds(c); 303 strcpy(c->x86_model_id, p);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 } 304 }
209#endif
210 305
211 if (cpu_has_xmm2) 306 if (c->x86 == 15)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
215 }
216 if (c->x86 == 6) 308 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 ds_init_intel(c);
226 }
227 310
228 if (cpu_has_bts) 311 if (cpu_has_bts)
229 ptrace_bts_init_intel(c); 312 ptrace_bts_init_intel(c);
230 313
231 /* 314#endif
232 * See if we have a good local APIC by checking for buggy Pentia,
233 * i.e. all B steppings and the C2 stepping of P54C when using their
234 * integrated APIC (see 11AP erratum in "Pentium Processor
235 * Specification Update").
236 */
237 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
238 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
239 set_cpu_cap(c, X86_FEATURE_11AP);
240 315
241#ifdef CONFIG_X86_NUMAQ 316 detect_extended_topology(c);
242 numaq_tsc_disable(); 317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
320 * detection.
321 */
322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
324 detect_ht(c);
243#endif 325#endif
326 }
327
328 /* Work around errata */
329 srat_detect_node();
330
331 if (cpu_has(c, X86_FEATURE_VMX))
332 detect_vmx_virtcap(c);
244} 333}
245 334
335#ifdef CONFIG_X86_32
246static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
247{ 337{
248 /* 338 /*
@@ -255,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
255 size = 256; 345 size = 256;
256 return size; 346 return size;
257} 347}
348#endif
258 349
259static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
260 .c_vendor = "Intel", 351 .c_vendor = "Intel",
261 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
262 .c_models = { 354 .c_models = {
263 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
264 { 356 {
@@ -308,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
308 } 400 }
309 }, 401 },
310 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
311 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
312 .c_init = init_intel, 406 .c_init = init_intel,
313 .c_size_cache = intel_size_cache, 407 .c_x86_vendor = X86_VENDOR_INTEL,
314}; 408};
315 409
316cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
317
318#ifndef CONFIG_X86_CMPXCHG
319unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
320{
321 u8 prev;
322 unsigned long flags;
323
324 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
325 local_irq_save(flags);
326 prev = *(u8 *)ptr;
327 if (prev == old)
328 *(u8 *)ptr = new;
329 local_irq_restore(flags);
330 return prev;
331}
332EXPORT_SYMBOL(cmpxchg_386_u8);
333
334unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
335{
336 u16 prev;
337 unsigned long flags;
338
339 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
340 local_irq_save(flags);
341 prev = *(u16 *)ptr;
342 if (prev == old)
343 *(u16 *)ptr = new;
344 local_irq_restore(flags);
345 return prev;
346}
347EXPORT_SYMBOL(cmpxchg_386_u16);
348
349unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
350{
351 u32 prev;
352 unsigned long flags;
353
354 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
355 local_irq_save(flags);
356 prev = *(u32 *)ptr;
357 if (prev == old)
358 *(u32 *)ptr = new;
359 local_irq_restore(flags);
360 return prev;
361}
362EXPORT_SYMBOL(cmpxchg_386_u32);
363#endif
364
365#ifndef CONFIG_X86_CMPXCHG64
366unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
367{
368 u64 prev;
369 unsigned long flags;
370
371 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
372 local_irq_save(flags);
373 prev = *(u64 *)ptr;
374 if (prev == old)
375 *(u64 *)ptr = new;
376 local_irq_restore(flags);
377 return prev;
378}
379EXPORT_SYMBOL(cmpxchg_486_u64);
380#endif
381
382/* arch_initcall(intel_cpu_init); */
383 411
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f0..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83 c->x86_max_cores = intel_num_cpu_cores(c);
84
85 srat_detect_node();
86}
87
88static struct cpu_dev intel_cpu_dev __cpuinitdata = {
89 .c_vendor = "Intel",
90 .c_ident = { "GenuineIntel" },
91 .c_early_init = early_init_intel,
92 .c_init = init_intel,
93};
94cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
95
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b0a10b002f1..3f46afbb1cf1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Routines to indentify caches on Intel CPU. 2 * Routines to indentify caches on Intel CPU.
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/pci.h>
16 17
17#include <asm/processor.h> 18#include <asm/processor.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
@@ -130,9 +131,18 @@ struct _cpuid4_info {
130 union _cpuid4_leaf_ebx ebx; 131 union _cpuid4_leaf_ebx ebx;
131 union _cpuid4_leaf_ecx ecx; 132 union _cpuid4_leaf_ecx ecx;
132 unsigned long size; 133 unsigned long size;
134 unsigned long can_disable;
133 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 135 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
134}; 136};
135 137
138#ifdef CONFIG_PCI
139static struct pci_device_id k8_nb_id[] = {
140 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
141 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
142 {}
143};
144#endif
145
136unsigned short num_cache_leaves; 146unsigned short num_cache_leaves;
137 147
138/* AMD doesn't have CPUID4. Emulate it here to report the same 148/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
182static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 192static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
183static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 193static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
184 194
185static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 195static void __cpuinit
186 union _cpuid4_leaf_ebx *ebx, 196amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
187 union _cpuid4_leaf_ecx *ecx) 197 union _cpuid4_leaf_ebx *ebx,
198 union _cpuid4_leaf_ecx *ecx)
188{ 199{
189 unsigned dummy; 200 unsigned dummy;
190 unsigned line_size, lines_per_tag, assoc, size_in_kb; 201 unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
251 (ebx->split.ways_of_associativity + 1) - 1; 262 (ebx->split.ways_of_associativity + 1) - 1;
252} 263}
253 264
254static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 265static void __cpuinit
266amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
267{
268 if (index < 3)
269 return;
270 this_leaf->can_disable = 1;
271}
272
273static int
274__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
255{ 275{
256 union _cpuid4_leaf_eax eax; 276 union _cpuid4_leaf_eax eax;
257 union _cpuid4_leaf_ebx ebx; 277 union _cpuid4_leaf_ebx ebx;
258 union _cpuid4_leaf_ecx ecx; 278 union _cpuid4_leaf_ecx ecx;
259 unsigned edx; 279 unsigned edx;
260 280
261 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 281 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
262 amd_cpuid4(index, &eax, &ebx, &ecx); 282 amd_cpuid4(index, &eax, &ebx, &ecx);
263 else 283 if (boot_cpu_data.x86 >= 0x10)
264 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 284 amd_check_l3_disable(index, this_leaf);
285 } else {
286 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
287 }
288
265 if (eax.split.type == CACHE_TYPE_NULL) 289 if (eax.split.type == CACHE_TYPE_NULL)
266 return -EIO; /* better error ? */ 290 return -EIO; /* better error ? */
267 291
268 this_leaf->eax = eax; 292 this_leaf->eax = eax;
269 this_leaf->ebx = ebx; 293 this_leaf->ebx = ebx;
270 this_leaf->ecx = ecx; 294 this_leaf->ecx = ecx;
271 this_leaf->size = (ecx.split.number_of_sets + 1) * 295 this_leaf->size = (ecx.split.number_of_sets + 1) *
272 (ebx.split.coherency_line_size + 1) * 296 (ebx.split.coherency_line_size + 1) *
273 (ebx.split.physical_line_partition + 1) * 297 (ebx.split.physical_line_partition + 1) *
274 (ebx.split.ways_of_associativity + 1); 298 (ebx.split.ways_of_associativity + 1);
275 return 0; 299 return 0;
276} 300}
277 301
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
453 477
454/* pointer to _cpuid4_info array (for each cache leaf) */ 478/* pointer to _cpuid4_info array (for each cache leaf) */
455static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 479static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
456#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 480#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
457 481
458#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
459static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 483static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -490,7 +514,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
490 514
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 515 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 516 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 517 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 518 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 519 }
496} 520}
@@ -572,7 +596,7 @@ struct _index_kobject {
572 596
573/* pointer to array of kobjects for cpuX/cache/indexY */ 597/* pointer to array of kobjects for cpuX/cache/indexY */
574static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 598static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
575#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 599#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
576 600
577#define show_one_plus(file_name, object, val) \ 601#define show_one_plus(file_name, object, val) \
578static ssize_t show_##file_name \ 602static ssize_t show_##file_name \
@@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
637 } 661 }
638} 662}
639 663
664#define to_object(k) container_of(k, struct _index_kobject, kobj)
665#define to_attr(a) container_of(a, struct _cache_attr, attr)
666
667#ifdef CONFIG_PCI
668static struct pci_dev *get_k8_northbridge(int node)
669{
670 struct pci_dev *dev = NULL;
671 int i;
672
673 for (i = 0; i <= node; i++) {
674 do {
675 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
676 if (!dev)
677 break;
678 } while (!pci_match_id(&k8_nb_id[0], dev));
679 if (!dev)
680 break;
681 }
682 return dev;
683}
684#else
685static struct pci_dev *get_k8_northbridge(int node)
686{
687 return NULL;
688}
689#endif
690
691static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
692{
693 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
694 struct pci_dev *dev = NULL;
695 ssize_t ret = 0;
696 int i;
697
698 if (!this_leaf->can_disable)
699 return sprintf(buf, "Feature not enabled\n");
700
701 dev = get_k8_northbridge(node);
702 if (!dev) {
703 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
704 return -EINVAL;
705 }
706
707 for (i = 0; i < 2; i++) {
708 unsigned int reg;
709
710 pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
711
712 ret += sprintf(buf, "%sEntry: %d\n", buf, i);
713 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
714 buf,
715 reg & 0x80000000 ? "Disabled" : "Allowed",
716 reg & 0x40000000 ? "Disabled" : "Allowed");
717 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
718 buf, (reg & 0x30000) >> 16, reg & 0xfff);
719 }
720 return ret;
721}
722
723static ssize_t
724store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
725 size_t count)
726{
727 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
728 struct pci_dev *dev = NULL;
729 unsigned int ret, index, val;
730
731 if (!this_leaf->can_disable)
732 return 0;
733
734 if (strlen(buf) > 15)
735 return -EINVAL;
736
737 ret = sscanf(buf, "%x %x", &index, &val);
738 if (ret != 2)
739 return -EINVAL;
740 if (index > 1)
741 return -EINVAL;
742
743 val |= 0xc0000000;
744 dev = get_k8_northbridge(node);
745 if (!dev) {
746 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
747 return -EINVAL;
748 }
749
750 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
751 wbinvd();
752 pci_write_config_dword(dev, 0x1BC + index * 4, val);
753
754 return 1;
755}
756
640struct _cache_attr { 757struct _cache_attr {
641 struct attribute attr; 758 struct attribute attr;
642 ssize_t (*show)(struct _cpuid4_info *, char *); 759 ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -657,6 +774,8 @@ define_one_ro(size);
657define_one_ro(shared_cpu_map); 774define_one_ro(shared_cpu_map);
658define_one_ro(shared_cpu_list); 775define_one_ro(shared_cpu_list);
659 776
777static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
778
660static struct attribute * default_attrs[] = { 779static struct attribute * default_attrs[] = {
661 &type.attr, 780 &type.attr,
662 &level.attr, 781 &level.attr,
@@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = {
667 &size.attr, 786 &size.attr,
668 &shared_cpu_map.attr, 787 &shared_cpu_map.attr,
669 &shared_cpu_list.attr, 788 &shared_cpu_list.attr,
789 &cache_disable.attr,
670 NULL 790 NULL
671}; 791};
672 792
673#define to_object(k) container_of(k, struct _index_kobject, kobj)
674#define to_attr(a) container_of(a, struct _cache_attr, attr)
675
676static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 793static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
677{ 794{
678 struct _cache_attr *fattr = to_attr(attr); 795 struct _cache_attr *fattr = to_attr(attr);
@@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
682 ret = fattr->show ? 799 ret = fattr->show ?
683 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 800 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
684 buf) : 801 buf) :
685 0; 802 0;
686 return ret; 803 return ret;
687} 804}
688 805
689static ssize_t store(struct kobject * kobj, struct attribute * attr, 806static ssize_t store(struct kobject * kobj, struct attribute * attr,
690 const char * buf, size_t count) 807 const char * buf, size_t count)
691{ 808{
692 return 0; 809 struct _cache_attr *fattr = to_attr(attr);
810 struct _index_kobject *this_leaf = to_object(kobj);
811 ssize_t ret;
812
813 ret = fattr->store ?
814 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
815 buf, count) :
816 0;
817 return ret;
693} 818}
694 819
695static struct sysfs_ops sysfs_ops = { 820static struct sysfs_ops sysfs_ops = {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 726a5fcdf341..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -860,7 +860,7 @@ error:
860 return err; 860 return err;
861} 861}
862 862
863static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
864{ 864{
865 int i; 865 int i;
866 866
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 000000000000..dfea390e1608
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
1#!/usr/bin/perl
2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4#
5
6($in, $out) = @ARGV;
7
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10
11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13
14while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1;
17 $feature = $2;
18 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1;
21 }
22
23 if ($feature ne '') {
24 printf OUT "\t%-32s = \"%s\",\n",
25 "[$macro]", "\L$feature";
26 }
27 }
28}
29print OUT "};\n";
30
31close(IN);
32close(OUT);
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 000000000000..5abbea297e0c
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
1/*
2 * Strings for the various x86 power flags
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9const char *const x86_power_flags[32] = {
10 "ts", /* temperature sensor */
11 "fid", /* frequency id control */
12 "vid", /* voltage id control */
13 "ttp", /* thermal trip */
14 "tm",
15 "stc",
16 "100mhzsteps",
17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */
20};
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8f..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
105}; 107};
106 108
107cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); 109cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d75..e777f79e0960 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
19 } 19 }
20 }, 20 },
21 }, 21 },
22 .c_x86_vendor = X86_VENDOR_UMC,
22}; 23};
23 24
24cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); 25cpu_dev_register(umc_cpu_dev);
25 26
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f07..b4f14c6c09d9 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __pa(swapper_pg_dir) 69 .__cr3 = __pa_nodebug(swapper_pg_dir),
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 000000000000..201ee359a1a9
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,447 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16
17#include <asm/stacktrace.h>
18
19#define STACKSLOTS_PER_LINE 8
20#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
21
22int panic_on_unrecovered_nmi;
23int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
24static unsigned int code_bytes = 64;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33static inline int valid_stack_ptr(struct thread_info *tinfo,
34 void *p, unsigned int size, void *end)
35{
36 void *t = tinfo;
37 if (end) {
38 if (p < end && p >= (end-THREAD_SIZE))
39 return 1;
40 else
41 return 0;
42 }
43 return p > t && p < t + THREAD_SIZE - size;
44}
45
46/* The form of the top of the frame on the stack */
47struct stack_frame {
48 struct stack_frame *next_frame;
49 unsigned long return_address;
50};
51
52static inline unsigned long
53print_context_stack(struct thread_info *tinfo,
54 unsigned long *stack, unsigned long bp,
55 const struct stacktrace_ops *ops, void *data,
56 unsigned long *end)
57{
58 struct stack_frame *frame = (struct stack_frame *)bp;
59
60 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
61 unsigned long addr;
62
63 addr = *stack;
64 if (__kernel_text_address(addr)) {
65 if ((unsigned long) stack == bp + sizeof(long)) {
66 ops->address(data, addr, 1);
67 frame = frame->next_frame;
68 bp = (unsigned long) frame;
69 } else {
70 ops->address(data, addr, bp == 0);
71 }
72 }
73 stack++;
74 }
75 return bp;
76}
77
78void dump_trace(struct task_struct *task, struct pt_regs *regs,
79 unsigned long *stack, unsigned long bp,
80 const struct stacktrace_ops *ops, void *data)
81{
82 if (!task)
83 task = current;
84
85 if (!stack) {
86 unsigned long dummy;
87 stack = &dummy;
88 if (task && task != current)
89 stack = (unsigned long *)task->thread.sp;
90 }
91
92#ifdef CONFIG_FRAME_POINTER
93 if (!bp) {
94 if (task == current) {
95 /* Grab bp right from our regs */
96 get_bp(bp);
97 } else {
98 /* bp is the last reg pushed by switch_to */
99 bp = *(unsigned long *) task->thread.sp;
100 }
101 }
102#endif
103
104 for (;;) {
105 struct thread_info *context;
106
107 context = (struct thread_info *)
108 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
109 bp = print_context_stack(context, stack, bp, ops, data, NULL);
110
111 stack = (unsigned long *)context->previous_esp;
112 if (!stack)
113 break;
114 if (ops->stack(data, "IRQ") < 0)
115 break;
116 touch_nmi_watchdog();
117 }
118}
119EXPORT_SYMBOL(dump_trace);
120
121static void
122print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
123{
124 printk(data);
125 print_symbol(msg, symbol);
126 printk("\n");
127}
128
129static void print_trace_warning(void *data, char *msg)
130{
131 printk("%s%s\n", (char *)data, msg);
132}
133
134static int print_trace_stack(void *data, char *name)
135{
136 printk("%s <%s> ", (char *)data, name);
137 return 0;
138}
139
140/*
141 * Print one address/symbol entries per line.
142 */
143static void print_trace_address(void *data, unsigned long addr, int reliable)
144{
145 touch_nmi_watchdog();
146 printk(data);
147 printk_address(addr, reliable);
148}
149
150static const struct stacktrace_ops print_trace_ops = {
151 .warning = print_trace_warning,
152 .warning_symbol = print_trace_warning_symbol,
153 .stack = print_trace_stack,
154 .address = print_trace_address,
155};
156
157static void
158show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
159 unsigned long *stack, unsigned long bp, char *log_lvl)
160{
161 printk("%sCall Trace:\n", log_lvl);
162 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
163}
164
165void show_trace(struct task_struct *task, struct pt_regs *regs,
166 unsigned long *stack, unsigned long bp)
167{
168 show_trace_log_lvl(task, regs, stack, bp, "");
169}
170
171static void
172show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
173 unsigned long *sp, unsigned long bp, char *log_lvl)
174{
175 unsigned long *stack;
176 int i;
177
178 if (sp == NULL) {
179 if (task)
180 sp = (unsigned long *)task->thread.sp;
181 else
182 sp = (unsigned long *)&sp;
183 }
184
185 stack = sp;
186 for (i = 0; i < kstack_depth_to_print; i++) {
187 if (kstack_end(stack))
188 break;
189 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
190 printk("\n%s", log_lvl);
191 printk(" %08lx", *stack++);
192 touch_nmi_watchdog();
193 }
194 printk("\n");
195 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
196}
197
198void show_stack(struct task_struct *task, unsigned long *sp)
199{
200 show_stack_log_lvl(task, NULL, sp, 0, "");
201}
202
203/*
204 * The architecture-independent dump_stack generator
205 */
206void dump_stack(void)
207{
208 unsigned long bp = 0;
209 unsigned long stack;
210
211#ifdef CONFIG_FRAME_POINTER
212 if (!bp)
213 get_bp(bp);
214#endif
215
216 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
217 current->pid, current->comm, print_tainted(),
218 init_utsname()->release,
219 (int)strcspn(init_utsname()->version, " "),
220 init_utsname()->version);
221 show_trace(NULL, NULL, &stack, bp);
222}
223
224EXPORT_SYMBOL(dump_stack);
225
226void show_registers(struct pt_regs *regs)
227{
228 int i;
229
230 print_modules();
231 __show_regs(regs, 0);
232
233 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
234 TASK_COMM_LEN, current->comm, task_pid_nr(current),
235 current_thread_info(), current, task_thread_info(current));
236 /*
237 * When in-kernel, we also print out the stack and code at the
238 * time of the fault..
239 */
240 if (!user_mode_vm(regs)) {
241 unsigned int code_prologue = code_bytes * 43 / 64;
242 unsigned int code_len = code_bytes;
243 unsigned char c;
244 u8 *ip;
245
246 printk(KERN_EMERG "Stack:\n");
247 show_stack_log_lvl(NULL, regs, &regs->sp,
248 0, KERN_EMERG);
249
250 printk(KERN_EMERG "Code: ");
251
252 ip = (u8 *)regs->ip - code_prologue;
253 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
254 /* try starting at IP */
255 ip = (u8 *)regs->ip;
256 code_len = code_len - code_prologue + 1;
257 }
258 for (i = 0; i < code_len; i++, ip++) {
259 if (ip < (u8 *)PAGE_OFFSET ||
260 probe_kernel_address(ip, c)) {
261 printk(" Bad EIP value.");
262 break;
263 }
264 if (ip == (u8 *)regs->ip)
265 printk("<%02x> ", c);
266 else
267 printk("%02x ", c);
268 }
269 }
270 printk("\n");
271}
272
273int is_valid_bugaddr(unsigned long ip)
274{
275 unsigned short ud2;
276
277 if (ip < PAGE_OFFSET)
278 return 0;
279 if (probe_kernel_address((unsigned short *)ip, ud2))
280 return 0;
281
282 return ud2 == 0x0b0f;
283}
284
285static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
286static int die_owner = -1;
287static unsigned int die_nest_count;
288
289unsigned __kprobes long oops_begin(void)
290{
291 unsigned long flags;
292
293 oops_enter();
294
295 if (die_owner != raw_smp_processor_id()) {
296 console_verbose();
297 raw_local_irq_save(flags);
298 __raw_spin_lock(&die_lock);
299 die_owner = smp_processor_id();
300 die_nest_count = 0;
301 bust_spinlocks(1);
302 } else {
303 raw_local_irq_save(flags);
304 }
305 die_nest_count++;
306 return flags;
307}
308
309void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
310{
311 bust_spinlocks(0);
312 die_owner = -1;
313 add_taint(TAINT_DIE);
314 __raw_spin_unlock(&die_lock);
315 raw_local_irq_restore(flags);
316
317 if (!regs)
318 return;
319
320 if (kexec_should_crash(current))
321 crash_kexec(regs);
322 if (in_interrupt())
323 panic("Fatal exception in interrupt");
324 if (panic_on_oops)
325 panic("Fatal exception");
326 oops_exit();
327 do_exit(signr);
328}
329
330int __kprobes __die(const char *str, struct pt_regs *regs, long err)
331{
332 unsigned short ss;
333 unsigned long sp;
334
335 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
336#ifdef CONFIG_PREEMPT
337 printk("PREEMPT ");
338#endif
339#ifdef CONFIG_SMP
340 printk("SMP ");
341#endif
342#ifdef CONFIG_DEBUG_PAGEALLOC
343 printk("DEBUG_PAGEALLOC");
344#endif
345 printk("\n");
346 if (notify_die(DIE_OOPS, str, regs, err,
347 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
348 return 1;
349
350 show_registers(regs);
351 /* Executive summary in case the oops scrolled away */
352 sp = (unsigned long) (&regs->sp);
353 savesegment(ss, ss);
354 if (user_mode(regs)) {
355 sp = regs->sp;
356 ss = regs->ss & 0xffff;
357 }
358 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
359 print_symbol("%s", regs->ip);
360 printk(" SS:ESP %04x:%08lx\n", ss, sp);
361 return 0;
362}
363
364/*
365 * This is gone through when something in the kernel has done something bad
366 * and is about to be terminated:
367 */
368void die(const char *str, struct pt_regs *regs, long err)
369{
370 unsigned long flags = oops_begin();
371
372 if (die_nest_count < 3) {
373 report_bug(regs->ip, regs);
374
375 if (__die(str, regs, err))
376 regs = NULL;
377 } else {
378 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
379 }
380
381 oops_end(flags, regs, SIGSEGV);
382}
383
384static DEFINE_SPINLOCK(nmi_print_lock);
385
386void notrace __kprobes
387die_nmi(char *str, struct pt_regs *regs, int do_panic)
388{
389 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
390 return;
391
392 spin_lock(&nmi_print_lock);
393 /*
394 * We are in trouble anyway, lets at least try
395 * to get a message out:
396 */
397 bust_spinlocks(1);
398 printk(KERN_EMERG "%s", str);
399 printk(" on CPU%d, ip %08lx, registers:\n",
400 smp_processor_id(), regs->ip);
401 show_registers(regs);
402 if (do_panic)
403 panic("Non maskable interrupt");
404 console_silent();
405 spin_unlock(&nmi_print_lock);
406 bust_spinlocks(0);
407
408 /*
409 * If we are in kernel we are probably nested up pretty bad
410 * and might aswell get out now while we still can:
411 */
412 if (!user_mode_vm(regs)) {
413 current->thread.trap_no = 2;
414 crash_kexec(regs);
415 }
416
417 do_exit(SIGSEGV);
418}
419
420static int __init oops_setup(char *s)
421{
422 if (!s)
423 return -EINVAL;
424 if (!strcmp(s, "panic"))
425 panic_on_oops = 1;
426 return 0;
427}
428early_param("oops", oops_setup);
429
430static int __init kstack_setup(char *s)
431{
432 if (!s)
433 return -EINVAL;
434 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
435 return 0;
436}
437early_param("kstack", kstack_setup);
438
439static int __init code_bytes_setup(char *s)
440{
441 code_bytes = simple_strtoul(s, NULL, 0);
442 if (code_bytes > 8192)
443 code_bytes = 8192;
444
445 return 1;
446}
447__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 000000000000..086cc8118e39
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,573 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16
17#include <asm/stacktrace.h>
18
19#define STACKSLOTS_PER_LINE 4
20#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
21
22int panic_on_unrecovered_nmi;
23int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
24static unsigned int code_bytes = 64;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
34 unsigned *usedp, char **idp)
35{
36 static char ids[][8] = {
37 [DEBUG_STACK - 1] = "#DB",
38 [NMI_STACK - 1] = "NMI",
39 [DOUBLEFAULT_STACK - 1] = "#DF",
40 [STACKFAULT_STACK - 1] = "#SS",
41 [MCE_STACK - 1] = "#MC",
42#if DEBUG_STKSZ > EXCEPTION_STKSZ
43 [N_EXCEPTION_STACKS ...
44 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
45#endif
46 };
47 unsigned k;
48
49 /*
50 * Iterate over all exception stacks, and figure out whether
51 * 'stack' is in one of them:
52 */
53 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
54 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
55 /*
56 * Is 'stack' above this exception frame's end?
57 * If yes then skip to the next frame.
58 */
59 if (stack >= end)
60 continue;
61 /*
62 * Is 'stack' above this exception frame's start address?
63 * If yes then we found the right frame.
64 */
65 if (stack >= end - EXCEPTION_STKSZ) {
66 /*
67 * Make sure we only iterate through an exception
68 * stack once. If it comes up for the second time
69 * then there's something wrong going on - just
70 * break out and return NULL:
71 */
72 if (*usedp & (1U << k))
73 break;
74 *usedp |= 1U << k;
75 *idp = ids[k];
76 return (unsigned long *)end;
77 }
78 /*
79 * If this is a debug stack, and if it has a larger size than
80 * the usual exception stacks, then 'stack' might still
81 * be within the lower portion of the debug stack:
82 */
83#if DEBUG_STKSZ > EXCEPTION_STKSZ
84 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
85 unsigned j = N_EXCEPTION_STACKS - 1;
86
87 /*
88 * Black magic. A large debug stack is composed of
89 * multiple exception stack entries, which we
90 * iterate through now. Dont look:
91 */
92 do {
93 ++j;
94 end -= EXCEPTION_STKSZ;
95 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
96 } while (stack < end - EXCEPTION_STKSZ);
97 if (*usedp & (1U << j))
98 break;
99 *usedp |= 1U << j;
100 *idp = ids[j];
101 return (unsigned long *)end;
102 }
103#endif
104 }
105 return NULL;
106}
107
108/*
109 * x86-64 can have up to three kernel stacks:
110 * process stack
111 * interrupt stack
112 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
113 */
114
115static inline int valid_stack_ptr(struct thread_info *tinfo,
116 void *p, unsigned int size, void *end)
117{
118 void *t = tinfo;
119 if (end) {
120 if (p < end && p >= (end-THREAD_SIZE))
121 return 1;
122 else
123 return 0;
124 }
125 return p > t && p < t + THREAD_SIZE - size;
126}
127
128/* The form of the top of the frame on the stack */
129struct stack_frame {
130 struct stack_frame *next_frame;
131 unsigned long return_address;
132};
133
134static inline unsigned long
135print_context_stack(struct thread_info *tinfo,
136 unsigned long *stack, unsigned long bp,
137 const struct stacktrace_ops *ops, void *data,
138 unsigned long *end)
139{
140 struct stack_frame *frame = (struct stack_frame *)bp;
141
142 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
143 unsigned long addr;
144
145 addr = *stack;
146 if (__kernel_text_address(addr)) {
147 if ((unsigned long) stack == bp + sizeof(long)) {
148 ops->address(data, addr, 1);
149 frame = frame->next_frame;
150 bp = (unsigned long) frame;
151 } else {
152 ops->address(data, addr, bp == 0);
153 }
154 }
155 stack++;
156 }
157 return bp;
158}
159
160void dump_trace(struct task_struct *task, struct pt_regs *regs,
161 unsigned long *stack, unsigned long bp,
162 const struct stacktrace_ops *ops, void *data)
163{
164 const unsigned cpu = get_cpu();
165 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
166 unsigned used = 0;
167 struct thread_info *tinfo;
168
169 if (!task)
170 task = current;
171
172 if (!stack) {
173 unsigned long dummy;
174 stack = &dummy;
175 if (task && task != current)
176 stack = (unsigned long *)task->thread.sp;
177 }
178
179#ifdef CONFIG_FRAME_POINTER
180 if (!bp) {
181 if (task == current) {
182 /* Grab bp right from our regs */
183 get_bp(bp);
184 } else {
185 /* bp is the last reg pushed by switch_to */
186 bp = *(unsigned long *) task->thread.sp;
187 }
188 }
189#endif
190
191 /*
192 * Print function call entries in all stacks, starting at the
193 * current stack address. If the stacks consist of nested
194 * exceptions
195 */
196 tinfo = task_thread_info(task);
197 for (;;) {
198 char *id;
199 unsigned long *estack_end;
200 estack_end = in_exception_stack(cpu, (unsigned long)stack,
201 &used, &id);
202
203 if (estack_end) {
204 if (ops->stack(data, id) < 0)
205 break;
206
207 bp = print_context_stack(tinfo, stack, bp, ops,
208 data, estack_end);
209 ops->stack(data, "<EOE>");
210 /*
211 * We link to the next stack via the
212 * second-to-last pointer (index -2 to end) in the
213 * exception stack:
214 */
215 stack = (unsigned long *) estack_end[-2];
216 continue;
217 }
218 if (irqstack_end) {
219 unsigned long *irqstack;
220 irqstack = irqstack_end -
221 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
222
223 if (stack >= irqstack && stack < irqstack_end) {
224 if (ops->stack(data, "IRQ") < 0)
225 break;
226 bp = print_context_stack(tinfo, stack, bp,
227 ops, data, irqstack_end);
228 /*
229 * We link to the next stack (which would be
230 * the process stack normally) the last
231 * pointer (index -1 to end) in the IRQ stack:
232 */
233 stack = (unsigned long *) (irqstack_end[-1]);
234 irqstack_end = NULL;
235 ops->stack(data, "EOI");
236 continue;
237 }
238 }
239 break;
240 }
241
242 /*
243 * This handles the process stack:
244 */
245 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
246 put_cpu();
247}
248EXPORT_SYMBOL(dump_trace);
249
250static void
251print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
252{
253 printk(data);
254 print_symbol(msg, symbol);
255 printk("\n");
256}
257
258static void print_trace_warning(void *data, char *msg)
259{
260 printk("%s%s\n", (char *)data, msg);
261}
262
263static int print_trace_stack(void *data, char *name)
264{
265 printk("%s <%s> ", (char *)data, name);
266 return 0;
267}
268
269/*
270 * Print one address/symbol entries per line.
271 */
272static void print_trace_address(void *data, unsigned long addr, int reliable)
273{
274 touch_nmi_watchdog();
275 printk(data);
276 printk_address(addr, reliable);
277}
278
279static const struct stacktrace_ops print_trace_ops = {
280 .warning = print_trace_warning,
281 .warning_symbol = print_trace_warning_symbol,
282 .stack = print_trace_stack,
283 .address = print_trace_address,
284};
285
286static void
287show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
288 unsigned long *stack, unsigned long bp, char *log_lvl)
289{
290 printk("%sCall Trace:\n", log_lvl);
291 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
292}
293
294void show_trace(struct task_struct *task, struct pt_regs *regs,
295 unsigned long *stack, unsigned long bp)
296{
297 show_trace_log_lvl(task, regs, stack, bp, "");
298}
299
300static void
301show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
302 unsigned long *sp, unsigned long bp, char *log_lvl)
303{
304 unsigned long *stack;
305 int i;
306 const int cpu = smp_processor_id();
307 unsigned long *irqstack_end =
308 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
309 unsigned long *irqstack =
310 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
311
312 /*
313 * debugging aid: "show_stack(NULL, NULL);" prints the
314 * back trace for this cpu.
315 */
316
317 if (sp == NULL) {
318 if (task)
319 sp = (unsigned long *)task->thread.sp;
320 else
321 sp = (unsigned long *)&sp;
322 }
323
324 stack = sp;
325 for (i = 0; i < kstack_depth_to_print; i++) {
326 if (stack >= irqstack && stack <= irqstack_end) {
327 if (stack == irqstack_end) {
328 stack = (unsigned long *) (irqstack_end[-1]);
329 printk(" <EOI> ");
330 }
331 } else {
332 if (((long) stack & (THREAD_SIZE-1)) == 0)
333 break;
334 }
335 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
336 printk("\n%s", log_lvl);
337 printk(" %016lx", *stack++);
338 touch_nmi_watchdog();
339 }
340 printk("\n");
341 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
342}
343
344void show_stack(struct task_struct *task, unsigned long *sp)
345{
346 show_stack_log_lvl(task, NULL, sp, 0, "");
347}
348
349/*
350 * The architecture-independent dump_stack generator
351 */
352void dump_stack(void)
353{
354 unsigned long bp = 0;
355 unsigned long stack;
356
357#ifdef CONFIG_FRAME_POINTER
358 if (!bp)
359 get_bp(bp);
360#endif
361
362 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
363 current->pid, current->comm, print_tainted(),
364 init_utsname()->release,
365 (int)strcspn(init_utsname()->version, " "),
366 init_utsname()->version);
367 show_trace(NULL, NULL, &stack, bp);
368}
369EXPORT_SYMBOL(dump_stack);
370
371void show_registers(struct pt_regs *regs)
372{
373 int i;
374 unsigned long sp;
375 const int cpu = smp_processor_id();
376 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
377
378 sp = regs->sp;
379 printk("CPU %d ", cpu);
380 __show_regs(regs, 1);
381 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
382 cur->comm, cur->pid, task_thread_info(cur), cur);
383
384 /*
385 * When in-kernel, we also print out the stack and code at the
386 * time of the fault..
387 */
388 if (!user_mode(regs)) {
389 unsigned int code_prologue = code_bytes * 43 / 64;
390 unsigned int code_len = code_bytes;
391 unsigned char c;
392 u8 *ip;
393
394 printk(KERN_EMERG "Stack:\n");
395 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
396 regs->bp, KERN_EMERG);
397
398 printk(KERN_EMERG "Code: ");
399
400 ip = (u8 *)regs->ip - code_prologue;
401 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
402 /* try starting at IP */
403 ip = (u8 *)regs->ip;
404 code_len = code_len - code_prologue + 1;
405 }
406 for (i = 0; i < code_len; i++, ip++) {
407 if (ip < (u8 *)PAGE_OFFSET ||
408 probe_kernel_address(ip, c)) {
409 printk(" Bad RIP value.");
410 break;
411 }
412 if (ip == (u8 *)regs->ip)
413 printk("<%02x> ", c);
414 else
415 printk("%02x ", c);
416 }
417 }
418 printk("\n");
419}
420
421int is_valid_bugaddr(unsigned long ip)
422{
423 unsigned short ud2;
424
425 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
426 return 0;
427
428 return ud2 == 0x0b0f;
429}
430
431static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
432static int die_owner = -1;
433static unsigned int die_nest_count;
434
435unsigned __kprobes long oops_begin(void)
436{
437 int cpu;
438 unsigned long flags;
439
440 oops_enter();
441
442 /* racy, but better than risking deadlock. */
443 raw_local_irq_save(flags);
444 cpu = smp_processor_id();
445 if (!__raw_spin_trylock(&die_lock)) {
446 if (cpu == die_owner)
447 /* nested oops. should stop eventually */;
448 else
449 __raw_spin_lock(&die_lock);
450 }
451 die_nest_count++;
452 die_owner = cpu;
453 console_verbose();
454 bust_spinlocks(1);
455 return flags;
456}
457
458void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
459{
460 die_owner = -1;
461 bust_spinlocks(0);
462 die_nest_count--;
463 if (!die_nest_count)
464 /* Nest count reaches zero, release the lock. */
465 __raw_spin_unlock(&die_lock);
466 raw_local_irq_restore(flags);
467 if (!regs) {
468 oops_exit();
469 return;
470 }
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473 if (panic_on_oops)
474 panic("Fatal exception");
475 oops_exit();
476 do_exit(signr);
477}
478
479int __kprobes __die(const char *str, struct pt_regs *regs, long err)
480{
481 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
482#ifdef CONFIG_PREEMPT
483 printk("PREEMPT ");
484#endif
485#ifdef CONFIG_SMP
486 printk("SMP ");
487#endif
488#ifdef CONFIG_DEBUG_PAGEALLOC
489 printk("DEBUG_PAGEALLOC");
490#endif
491 printk("\n");
492 if (notify_die(DIE_OOPS, str, regs, err,
493 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
494 return 1;
495
496 show_registers(regs);
497 add_taint(TAINT_DIE);
498 /* Executive summary in case the oops scrolled away */
499 printk(KERN_ALERT "RIP ");
500 printk_address(regs->ip, 1);
501 printk(" RSP <%016lx>\n", regs->sp);
502 if (kexec_should_crash(current))
503 crash_kexec(regs);
504 return 0;
505}
506
507void die(const char *str, struct pt_regs *regs, long err)
508{
509 unsigned long flags = oops_begin();
510
511 if (!user_mode(regs))
512 report_bug(regs->ip, regs);
513
514 if (__die(str, regs, err))
515 regs = NULL;
516 oops_end(flags, regs, SIGSEGV);
517}
518
519notrace __kprobes void
520die_nmi(char *str, struct pt_regs *regs, int do_panic)
521{
522 unsigned long flags;
523
524 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
525 return;
526
527 flags = oops_begin();
528 /*
529 * We are in trouble anyway, lets at least try
530 * to get a message out.
531 */
532 printk(KERN_EMERG "%s", str);
533 printk(" on CPU%d, ip %08lx, registers:\n",
534 smp_processor_id(), regs->ip);
535 show_registers(regs);
536 if (kexec_should_crash(current))
537 crash_kexec(regs);
538 if (do_panic || panic_on_oops)
539 panic("Non maskable interrupt");
540 oops_end(flags, NULL, SIGBUS);
541 nmi_exit();
542 local_irq_enable();
543 do_exit(SIGBUS);
544}
545
546static int __init oops_setup(char *s)
547{
548 if (!s)
549 return -EINVAL;
550 if (!strcmp(s, "panic"))
551 panic_on_oops = 1;
552 return 0;
553}
554early_param("oops", oops_setup);
555
556static int __init kstack_setup(char *s)
557{
558 if (!s)
559 return -EINVAL;
560 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
561 return 0;
562}
563early_param("kstack", kstack_setup);
564
565static int __init code_bytes_setup(char *s)
566{
567 code_bytes = simple_strtoul(s, NULL, 0);
568 if (code_bytes > 8192)
569 code_bytes = 8192;
570
571 return 1;
572}
573__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 66e48aa2dd1b..78e642feac30 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
148 case E820_NVS: 148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n"); 149 printk(KERN_CONT "(ACPI NVS)\n");
150 break; 150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
151 default: 154 default:
152 printk(KERN_CONT "type %u\n", e820.map[i].type); 155 printk(KERN_CONT "type %u\n", e820.map[i].type);
153 break; 156 break;
@@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
1260 case E820_RAM: return "System RAM"; 1263 case E820_RAM: return "System RAM";
1261 case E820_ACPI: return "ACPI Tables"; 1264 case E820_ACPI: return "ACPI Tables";
1262 case E820_NVS: return "ACPI Non-volatile Storage"; 1265 case E820_NVS: return "ACPI Non-volatile Storage";
1266 case E820_UNUSABLE: return "Unusable memory";
1263 default: return "reserved"; 1267 default: return "reserved";
1264 } 1268 }
1265} 1269}
@@ -1267,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
1267/* 1271/*
1268 * Mark e820 reserved areas as busy for the resource manager. 1272 * Mark e820 reserved areas as busy for the resource manager.
1269 */ 1273 */
1274static struct resource __initdata *e820_res;
1270void __init e820_reserve_resources(void) 1275void __init e820_reserve_resources(void)
1271{ 1276{
1272 int i; 1277 int i;
@@ -1274,6 +1279,7 @@ void __init e820_reserve_resources(void)
1274 u64 end; 1279 u64 end;
1275 1280
1276 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1281 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1282 e820_res = res;
1277 for (i = 0; i < e820.nr_map; i++) { 1283 for (i = 0; i < e820.nr_map; i++) {
1278 end = e820.map[i].addr + e820.map[i].size - 1; 1284 end = e820.map[i].addr + e820.map[i].size - 1;
1279#ifndef CONFIG_RESOURCES_64BIT 1285#ifndef CONFIG_RESOURCES_64BIT
@@ -1287,7 +1293,14 @@ void __init e820_reserve_resources(void)
1287 res->end = end; 1293 res->end = end;
1288 1294
1289 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 1295 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1290 insert_resource(&iomem_resource, res); 1296
1297 /*
1298 * don't register the region that could be conflicted with
1299 * pci device BAR resource and insert them later in
1300 * pcibios_resource_survey()
1301 */
1302 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
1303 insert_resource(&iomem_resource, res);
1291 res++; 1304 res++;
1292 } 1305 }
1293 1306
@@ -1299,6 +1312,19 @@ void __init e820_reserve_resources(void)
1299 } 1312 }
1300} 1313}
1301 1314
1315void __init e820_reserve_resources_late(void)
1316{
1317 int i;
1318 struct resource *res;
1319
1320 res = e820_res;
1321 for (i = 0; i < e820.nr_map; i++) {
1322 if (!res->parent && res->end)
1323 reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
1324 res++;
1325 }
1326}
1327
1302char *__init default_machine_specific_memory_setup(void) 1328char *__init default_machine_specific_memory_setup(void)
1303{ 1329{
1304 char *who = "BIOS-e820"; 1330 char *who = "BIOS-e820";
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 24bb5faf5efa..733c4f8d42ea 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,52 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98static u32 ati_ixp4x0_rev(int num, int slot, int func)
99{
100 u32 d;
101 u8 b;
102
103 b = read_pci_config_byte(num, slot, func, 0xac);
104 b &= ~(1<<5);
105 write_pci_config_byte(num, slot, func, 0xac, b);
106
107 d = read_pci_config(num, slot, func, 0x70);
108 d |= 1<<8;
109 write_pci_config(num, slot, func, 0x70, d);
110
111 d = read_pci_config(num, slot, func, 0x8);
112 d &= 0xff;
113 return d;
114}
115
116static void __init ati_bugs(int num, int slot, int func)
117{
118#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC)
119 u32 d;
120 u8 b;
121
122 if (acpi_use_timer_override)
123 return;
124
125 d = ati_ixp4x0_rev(num, slot, func);
126 if (d < 0x82)
127 acpi_skip_timer_override = 1;
128 else {
129 /* check for IRQ0 interrupt swap */
130 outb(0x72, 0xcd6); b = inb(0xcd7);
131 if (!(b & 0x2))
132 acpi_skip_timer_override = 1;
133 }
134
135 if (acpi_skip_timer_override) {
136 printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
137 printk(KERN_INFO "Ignoring ACPI timer override.\n");
138 printk(KERN_INFO "If you got timer trouble "
139 "try acpi_use_timer_override\n");
140 }
141#endif
142}
143
98#ifdef CONFIG_DMAR 144#ifdef CONFIG_DMAR
99static void __init intel_g33_dmar(int num, int slot, int func) 145static void __init intel_g33_dmar(int num, int slot, int func)
100{ 146{
@@ -128,6 +174,8 @@ static struct chipset early_qrk[] __initdata = {
128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 174 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
129 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 175 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
130 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 176 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
177 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
178 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
131#ifdef CONFIG_DMAR 179#ifdef CONFIG_DMAR
132 { PCI_VENDOR_ID_INTEL, 0x29c0, 180 { PCI_VENDOR_ID_INTEL, 0x29c0,
133 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, 181 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da54..34ad997d3834 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/screen_info.h> 5#include <linux/screen_info.h>
6#include <linux/usb/ch9.h>
7#include <linux/pci_regs.h>
8#include <linux/pci_ids.h>
9#include <linux/errno.h>
6#include <asm/io.h> 10#include <asm/io.h>
7#include <asm/processor.h> 11#include <asm/processor.h>
8#include <asm/fcntl.h> 12#include <asm/fcntl.h>
9#include <asm/setup.h> 13#include <asm/setup.h>
10#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h>
18#include <linux/usb/ehci_def.h>
11 19
12/* Simple VGA output */ 20/* Simple VGA output */
13#define VGABASE (__ISA_IO_base + 0xb8000) 21#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
78static int early_serial_putc(unsigned char ch) 86static int early_serial_putc(unsigned char ch)
79{ 87{
80 unsigned timeout = 0xffff; 88 unsigned timeout = 0xffff;
89
81 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 90 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
82 cpu_relax(); 91 cpu_relax();
83 outb(ch, early_serial_base + TXR); 92 outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
111 if (!strncmp(s, "0x", 2)) { 120 if (!strncmp(s, "0x", 2)) {
112 early_serial_base = simple_strtoul(s, &e, 16); 121 early_serial_base = simple_strtoul(s, &e, 16);
113 } else { 122 } else {
114 static int bases[] = { 0x3f8, 0x2f8 }; 123 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
115 124
116 if (!strncmp(s, "ttyS", 4)) 125 if (!strncmp(s, "ttyS", 4))
117 s += 4; 126 s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
151 .index = -1, 160 .index = -1,
152}; 161};
153 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
390 int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happend */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t set_debug_port = default_set_debug_port;
565
566static void nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
154/* Console interface to a host file on AMD's SimNow! */ 878/* Console interface to a host file on AMD's SimNow! */
155 879
156static int simnow_fd; 880static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
165static noinline long simnow(long cmd, long a, long b, long c) 889static noinline long simnow(long cmd, long a, long b, long c)
166{ 890{
167 long ret; 891 long ret;
892
168 asm volatile("cpuid" : 893 asm volatile("cpuid" :
169 "=a" (ret) : 894 "=a" (ret) :
170 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); 895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
174static void __init simnow_init(char *str) 899static void __init simnow_init(char *str)
175{ 900{
176 char *fn = "klog"; 901 char *fn = "klog";
902
177 if (*str == '=') 903 if (*str == '=')
178 fn = ++str; 904 fn = ++str;
179 /* error ignored */ 905 /* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
194 920
195/* Direct interface for emergencies */ 921/* Direct interface for emergencies */
196static struct console *early_console = &early_vga_console; 922static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 923static int __initdata early_console_initialized;
198 924
199asmlinkage void early_printk(const char *fmt, ...) 925asmlinkage void early_printk(const char *fmt, ...)
200{ 926{
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
208 va_end(ap); 934 va_end(ap);
209} 935}
210 936
211static int __initdata keep_early;
212 937
213static int __init setup_early_printk(char *buf) 938static int __init setup_early_printk(char *buf)
214{ 939{
940 int keep_early;
941
215 if (!buf) 942 if (!buf)
216 return 0; 943 return 0;
217 944
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
219 return 0; 946 return 0;
220 early_console_initialized = 1; 947 early_console_initialized = 1;
221 948
222 if (strstr(buf, "keep")) 949 keep_early = (strstr(buf, "keep") != NULL);
223 keep_early = 1;
224 950
225 if (!strncmp(buf, "serial", 6)) { 951 if (!strncmp(buf, "serial", 6)) {
226 early_serial_init(buf + 6); 952 early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
238 simnow_init(buf + 6); 964 simnow_init(buf + 6);
239 early_console = &simnow_console; 965 early_console = &simnow_console;
240 keep_early = 1; 966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0)
970 return 0;
971 early_console = &early_dbgp_console;
972 /*
973 * usb subsys will reset ehci controller, so don't keep
974 * that early console
975 */
976 keep_early = 0;
977#endif
241#ifdef CONFIG_HVC_XEN 978#ifdef CONFIG_HVC_XEN
242 } else if (!strncmp(buf, "xen", 3)) { 979 } else if (!strncmp(buf, "xen", 3)) {
243 early_console = &xenboot_console; 980 early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
251 register_console(early_console); 988 register_console(early_console);
252 return 0; 989 return 0;
253} 990}
991
254early_param("earlyprintk", setup_early_printk); 992early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 109792bc7cfa..b21fbfaffe39 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -730,6 +730,7 @@ error_code:
730 movl $(__USER_DS), %ecx 730 movl $(__USER_DS), %ecx
731 movl %ecx, %ds 731 movl %ecx, %ds
732 movl %ecx, %es 732 movl %ecx, %es
733 TRACE_IRQS_OFF
733 movl %esp,%eax # pt_regs pointer 734 movl %esp,%eax # pt_regs pointer
734 call *%edi 735 call *%edi
735 jmp ret_from_exception 736 jmp ret_from_exception
@@ -760,20 +761,9 @@ ENTRY(device_not_available)
760 RING0_INT_FRAME 761 RING0_INT_FRAME
761 pushl $-1 # mark this as an int 762 pushl $-1 # mark this as an int
762 CFI_ADJUST_CFA_OFFSET 4 763 CFI_ADJUST_CFA_OFFSET 4
763 SAVE_ALL 764 pushl $do_device_not_available
764 GET_CR0_INTO_EAX
765 testl $0x4, %eax # EM (math emulation bit)
766 jne device_not_available_emulate
767 preempt_stop(CLBR_ANY)
768 call math_state_restore
769 jmp ret_from_exception
770device_not_available_emulate:
771 pushl $0 # temporary storage for ORIG_EIP
772 CFI_ADJUST_CFA_OFFSET 4 765 CFI_ADJUST_CFA_OFFSET 4
773 call math_emulate 766 jmp error_code
774 addl $4, %esp
775 CFI_ADJUST_CFA_OFFSET -4
776 jmp ret_from_exception
777 CFI_ENDPROC 767 CFI_ENDPROC
778END(device_not_available) 768END(device_not_available)
779 769
@@ -814,6 +804,7 @@ debug_stack_correct:
814 pushl $-1 # mark this as an int 804 pushl $-1 # mark this as an int
815 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
816 SAVE_ALL 806 SAVE_ALL
807 TRACE_IRQS_OFF
817 xorl %edx,%edx # error code 0 808 xorl %edx,%edx # error code 0
818 movl %esp,%eax # pt_regs pointer 809 movl %esp,%eax # pt_regs pointer
819 call do_debug 810 call do_debug
@@ -858,6 +849,7 @@ nmi_stack_correct:
858 pushl %eax 849 pushl %eax
859 CFI_ADJUST_CFA_OFFSET 4 850 CFI_ADJUST_CFA_OFFSET 4
860 SAVE_ALL 851 SAVE_ALL
852 TRACE_IRQS_OFF
861 xorl %edx,%edx # zero error code 853 xorl %edx,%edx # zero error code
862 movl %esp,%eax # pt_regs pointer 854 movl %esp,%eax # pt_regs pointer
863 call do_nmi 855 call do_nmi
@@ -898,6 +890,7 @@ nmi_espfix_stack:
898 pushl %eax 890 pushl %eax
899 CFI_ADJUST_CFA_OFFSET 4 891 CFI_ADJUST_CFA_OFFSET 4
900 SAVE_ALL 892 SAVE_ALL
893 TRACE_IRQS_OFF
901 FIXUP_ESPFIX_STACK # %eax == %esp 894 FIXUP_ESPFIX_STACK # %eax == %esp
902 xorl %edx,%edx # zero error code 895 xorl %edx,%edx # zero error code
903 call do_nmi 896 call do_nmi
@@ -928,6 +921,7 @@ KPROBE_ENTRY(int3)
928 pushl $-1 # mark this as an int 921 pushl $-1 # mark this as an int
929 CFI_ADJUST_CFA_OFFSET 4 922 CFI_ADJUST_CFA_OFFSET 4
930 SAVE_ALL 923 SAVE_ALL
924 TRACE_IRQS_OFF
931 xorl %edx,%edx # zero error code 925 xorl %edx,%edx # zero error code
932 movl %esp,%eax # pt_regs pointer 926 movl %esp,%eax # pt_regs pointer
933 call do_int3 927 call do_int3
@@ -1030,7 +1024,7 @@ ENTRY(machine_check)
1030 RING0_INT_FRAME 1024 RING0_INT_FRAME
1031 pushl $0 1025 pushl $0
1032 CFI_ADJUST_CFA_OFFSET 4 1026 CFI_ADJUST_CFA_OFFSET 4
1033 pushl machine_check_vector 1027 pushl $do_machine_check
1034 CFI_ADJUST_CFA_OFFSET 4 1028 CFI_ADJUST_CFA_OFFSET 4
1035 jmp error_code 1029 jmp error_code
1036 CFI_ENDPROC 1030 CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cf3a0b2d0059..1db6ce4314e1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -667,6 +667,13 @@ END(stub_rt_sigreturn)
667 SAVE_ARGS 667 SAVE_ARGS
668 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 668 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
669 pushq %rbp 669 pushq %rbp
670 /*
671 * Save rbp twice: One is for marking the stack frame, as usual, and the
672 * other, to fill pt_regs properly. This is because bx comes right
673 * before the last saved register in that structure, and not bp. If the
674 * base pointer were in the place bx is today, this would not be needed.
675 */
676 movq %rbp, -8(%rsp)
670 CFI_ADJUST_CFA_OFFSET 8 677 CFI_ADJUST_CFA_OFFSET 8
671 CFI_REL_OFFSET rbp, 0 678 CFI_REL_OFFSET rbp, 0
672 movq %rsp,%rbp 679 movq %rsp,%rbp
@@ -932,6 +939,9 @@ END(spurious_interrupt)
932 .if \ist 939 .if \ist
933 movq %gs:pda_data_offset, %rbp 940 movq %gs:pda_data_offset, %rbp
934 .endif 941 .endif
942 .if \irqtrace
943 TRACE_IRQS_OFF
944 .endif
935 movq %rsp,%rdi 945 movq %rsp,%rdi
936 movq ORIG_RAX(%rsp),%rsi 946 movq ORIG_RAX(%rsp),%rsi
937 movq $-1,ORIG_RAX(%rsp) 947 movq $-1,ORIG_RAX(%rsp)
@@ -1058,7 +1068,8 @@ KPROBE_ENTRY(error_entry)
1058 je error_kernelspace 1068 je error_kernelspace
1059error_swapgs: 1069error_swapgs:
1060 SWAPGS 1070 SWAPGS
1061error_sti: 1071error_sti:
1072 TRACE_IRQS_OFF
1062 movq %rdi,RDI(%rsp) 1073 movq %rdi,RDI(%rsp)
1063 CFI_REL_OFFSET rdi,RDI 1074 CFI_REL_OFFSET rdi,RDI
1064 movq %rsp,%rdi 1075 movq %rsp,%rdi
@@ -1232,7 +1243,7 @@ ENTRY(simd_coprocessor_error)
1232END(simd_coprocessor_error) 1243END(simd_coprocessor_error)
1233 1244
1234ENTRY(device_not_available) 1245ENTRY(device_not_available)
1235 zeroentry math_state_restore 1246 zeroentry do_device_not_available
1236END(device_not_available) 1247END(device_not_available)
1237 1248
1238 /* runs on exception stack */ 1249 /* runs on exception stack */
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
index 50189af14b85..f454c78fcef6 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,94 @@
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/apicdef.h> 41#include <asm/apicdef.h>
42#include "es7000.h"
43#include <mach_mpparse.h> 42#include <mach_mpparse.h>
44 43
45/* 44/*
45 * ES7000 chipsets
46 */
47
48#define NON_UNISYS 0
49#define ES7000_CLASSIC 1
50#define ES7000_ZORRO 2
51
52
53#define MIP_REG 1
54#define MIP_PSAI_REG 4
55
56#define MIP_BUSY 1
57#define MIP_SPIN 0xf0000
58#define MIP_VALID 0x0100000000000000ULL
59#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
60
61#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
62
63struct mip_reg_info {
64 unsigned long long mip_info;
65 unsigned long long delivery_info;
66 unsigned long long host_reg;
67 unsigned long long mip_reg;
68};
69
70struct part_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char part_id;
74 unsigned char apic_mode;
75 unsigned long snum;
76 char ptype[16];
77 char sname[64];
78 char pname[64];
79};
80
81struct psai {
82 unsigned long long entry_type;
83 unsigned long long addr;
84 unsigned long long bep_addr;
85};
86
87struct es7000_mem_info {
88 unsigned char type;
89 unsigned char length;
90 unsigned char resv[6];
91 unsigned long long start;
92 unsigned long long size;
93};
94
95struct es7000_oem_table {
96 unsigned long long hdr;
97 struct mip_reg_info mip;
98 struct part_info pif;
99 struct es7000_mem_info shm;
100 struct psai psai;
101};
102
103#ifdef CONFIG_ACPI
104
105struct oem_table {
106 struct acpi_table_header Header;
107 u32 OEMTableAddr;
108 u32 OEMTableSize;
109};
110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
113#endif
114
115struct mip_reg {
116 unsigned long long off_0;
117 unsigned long long off_8;
118 unsigned long long off_10;
119 unsigned long long off_18;
120 unsigned long long off_20;
121 unsigned long long off_28;
122 unsigned long long off_30;
123 unsigned long long off_38;
124};
125
126#define MIP_SW_APIC 0x1020b
127#define MIP_FUNC(VALUE) (VALUE & 0xff)
128
129/*
46 * ES7000 Globals 130 * ES7000 Globals
47 */ 131 */
48 132
@@ -72,7 +156,7 @@ es7000_rename_gsi(int ioapic, int gsi)
72 base += nr_ioapic_registers[i]; 156 base += nr_ioapic_registers[i];
73 } 157 }
74 158
75 if (!ioapic && (gsi < 16)) 159 if (!ioapic && (gsi < 16))
76 gsi += base; 160 gsi += base;
77 return gsi; 161 return gsi;
78} 162}
@@ -160,21 +244,38 @@ parse_unisys_oem (char *oemptr)
160} 244}
161 245
162#ifdef CONFIG_ACPI 246#ifdef CONFIG_ACPI
163int __init 247static unsigned long oem_addrX;
164find_unisys_acpi_oem_table(unsigned long *oem_addr) 248static unsigned long oem_size;
249int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
165{ 250{
166 struct acpi_table_header *header = NULL; 251 struct acpi_table_header *header = NULL;
167 int i = 0; 252 int i = 0;
168 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { 253 acpi_size tbl_size;
254
255 while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
169 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { 256 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
170 struct oem_table *t = (struct oem_table *)header; 257 struct oem_table *t = (struct oem_table *)header;
171 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, 258
172 t->OEMTableSize); 259 oem_addrX = t->OEMTableAddr;
260 oem_size = t->OEMTableSize;
261 early_acpi_os_unmap_memory(header, tbl_size);
262
263 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
264 oem_size);
173 return 0; 265 return 0;
174 } 266 }
267 early_acpi_os_unmap_memory(header, tbl_size);
175 } 268 }
176 return -1; 269 return -1;
177} 270}
271
272void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
273{
274 if (!oem_addr)
275 return;
276
277 __acpi_unmap_table((char *)oem_addr, oem_size);
278}
178#endif 279#endif
179 280
180static void 281static void
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index eaff0bbb1444..6c9bfc9e1e95 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,87 +16,63 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/dmar.h>
19 20
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
23 24
24#ifdef CONFIG_ACPI 25extern struct genapic apic_flat;
25#include <acpi/acpi_bus.h> 26extern struct genapic apic_physflat;
26#endif 27extern struct genapic apic_x2xpic_uv_x;
27 28extern struct genapic apic_x2apic_phys;
28DEFINE_PER_CPU(int, x2apic_extra_bits); 29extern struct genapic apic_x2apic_cluster;
29 30
30struct genapic __read_mostly *genapic = &apic_flat; 31struct genapic __read_mostly *genapic = &apic_flat;
31 32
32static enum uv_system_type uv_system_type; 33static struct genapic *apic_probe[] __initdata = {
34 &apic_x2apic_uv_x,
35 &apic_x2apic_phys,
36 &apic_x2apic_cluster,
37 &apic_physflat,
38 NULL,
39};
33 40
34/* 41/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 42 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */ 43 */
37void __init setup_apic_routing(void) 44void __init setup_apic_routing(void)
38{ 45{
39 if (uv_system_type == UV_NON_UNIQUE_APIC) 46 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
40 genapic = &apic_x2apic_uv_x; 47 if (!intr_remapping_enabled)
41 else 48 genapic = &apic_flat;
42#ifdef CONFIG_ACPI 49 }
43 /*
44 * Quirk: some x86_64 machines can only use physical APIC mode
45 * regardless of how many processors are present (x86_64 ES7000
46 * is an example).
47 */
48 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
49 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
50 genapic = &apic_physflat;
51 else
52#endif
53
54 if (max_physical_apicid < 8)
55 genapic = &apic_flat;
56 else
57 genapic = &apic_physflat;
58 50
59 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 51 if (genapic == &apic_flat) {
52 if (max_physical_apicid >= 8)
53 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 }
60} 56}
61 57
62/* Same for both flat and physical. */ 58/* Same for both flat and physical. */
63 59
64void send_IPI_self(int vector) 60void apic_send_IPI_self(int vector)
65{ 61{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 62 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67} 63}
68 64
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 65int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{ 66{
71 if (!strcmp(oem_id, "SGI")) { 67 int i;
72 if (!strcmp(oem_table_id, "UVL")) 68
73 uv_system_type = UV_LEGACY_APIC; 69 for (i = 0; apic_probe[i]; ++i) {
74 else if (!strcmp(oem_table_id, "UVX")) 70 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 uv_system_type = UV_X2APIC; 71 genapic = apic_probe[i];
76 else if (!strcmp(oem_table_id, "UVH")) 72 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 uv_system_type = UV_NON_UNIQUE_APIC; 73 genapic->name);
74 return 1;
75 }
78 } 76 }
79 return 0; 77 return 0;
80} 78}
81
82unsigned int read_apic_id(void)
83{
84 unsigned int id;
85
86 WARN_ON(preemptible() && num_online_cpus() > 1);
87 id = apic_read(APIC_ID);
88 if (uv_system_type >= UV_X2APIC)
89 id |= __get_cpu_var(x2apic_extra_bits);
90 return id;
91}
92
93enum uv_system_type get_uv_system_type(void)
94{
95 return uv_system_type;
96}
97
98int is_uv_system(void)
99{
100 return uv_system_type != UV_NONE;
101}
102EXPORT_SYMBOL_GPL(is_uv_system);
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 786548a62d38..9eca5ba7a6b1 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
19#include <asm/ipi.h> 20#include <asm/ipi.h>
20#include <asm/genapic.h> 21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23
24#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h>
26#endif
27
28static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29{
30 return 1;
31}
21 32
22static cpumask_t flat_target_cpus(void) 33static cpumask_t flat_target_cpus(void)
23{ 34{
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
95 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
96} 107}
97 108
109static unsigned int get_apic_id(unsigned long x)
110{
111 unsigned int id;
112
113 id = (((x)>>24) & 0xFFu);
114 return id;
115}
116
117static unsigned long set_apic_id(unsigned int id)
118{
119 unsigned long x;
120
121 x = ((id & 0xFFu)<<24);
122 return x;
123}
124
125static unsigned int read_xapic_id(void)
126{
127 unsigned int id;
128
129 id = get_apic_id(apic_read(APIC_ID));
130 return id;
131}
132
98static int flat_apic_id_registered(void) 133static int flat_apic_id_registered(void)
99{ 134{
100 return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); 135 return physid_isset(read_xapic_id(), phys_cpu_present_map);
101} 136}
102 137
103static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
112 147
113struct genapic apic_flat = { 148struct genapic apic_flat = {
114 .name = "flat", 149 .name = "flat",
150 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
115 .int_delivery_mode = dest_LowestPrio, 151 .int_delivery_mode = dest_LowestPrio,
116 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 152 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
117 .target_cpus = flat_target_cpus, 153 .target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat = {
121 .send_IPI_all = flat_send_IPI_all, 157 .send_IPI_all = flat_send_IPI_all,
122 .send_IPI_allbutself = flat_send_IPI_allbutself, 158 .send_IPI_allbutself = flat_send_IPI_allbutself,
123 .send_IPI_mask = flat_send_IPI_mask, 159 .send_IPI_mask = flat_send_IPI_mask,
160 .send_IPI_self = apic_send_IPI_self,
124 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
125 .phys_pkg_id = phys_pkg_id, 162 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id,
165 .apic_id_mask = (0xFFu<<24),
126}; 166};
127 167
128/* 168/*
@@ -130,6 +170,21 @@ struct genapic apic_flat = {
130 * We cannot use logical delivery in this case because the mask 170 * We cannot use logical delivery in this case because the mask
131 * overflows, so use physical mode. 171 * overflows, so use physical mode.
132 */ 172 */
173static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
174{
175#ifdef CONFIG_ACPI
176 /*
177 * Quirk: some x86_64 machines can only use physical APIC mode
178 * regardless of how many processors are present (x86_64 ES7000
179 * is an example).
180 */
181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
183 return 1;
184#endif
185
186 return 0;
187}
133 188
134static cpumask_t physflat_target_cpus(void) 189static cpumask_t physflat_target_cpus(void)
135{ 190{
@@ -176,6 +231,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
176 231
177struct genapic apic_physflat = { 232struct genapic apic_physflat = {
178 .name = "physical flat", 233 .name = "physical flat",
234 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
179 .int_delivery_mode = dest_Fixed, 235 .int_delivery_mode = dest_Fixed,
180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 236 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
181 .target_cpus = physflat_target_cpus, 237 .target_cpus = physflat_target_cpus,
@@ -185,6 +241,10 @@ struct genapic apic_physflat = {
185 .send_IPI_all = physflat_send_IPI_all, 241 .send_IPI_all = physflat_send_IPI_all,
186 .send_IPI_allbutself = physflat_send_IPI_allbutself, 242 .send_IPI_allbutself = physflat_send_IPI_allbutself,
187 .send_IPI_mask = physflat_send_IPI_mask, 243 .send_IPI_mask = physflat_send_IPI_mask,
244 .send_IPI_self = apic_send_IPI_self,
188 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
189 .phys_pkg_id = phys_pkg_id, 246 .phys_pkg_id = phys_pkg_id,
247 .get_apic_id = get_apic_id,
248 .set_apic_id = set_apic_id,
249 .apic_id_mask = (0xFFu<<24),
190}; 250};
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 000000000000..e4bf2cc0d743
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 if (cpu_has_x2apic)
18 return 1;
19
20 return 0;
21}
22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24
25static cpumask_t x2apic_target_cpus(void)
26{
27 return cpumask_of_cpu(0);
28}
29
30/*
31 * for now each logical cpu is in its own vector allocation domain.
32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu)
34{
35 cpumask_t domain = CPU_MASK_NONE;
36 cpu_set(cpu, domain);
37 return domain;
38}
39
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
41 unsigned int dest)
42{
43 unsigned long cfg;
44
45 cfg = __prepare_ICR(0, vector, dest);
46
47 /*
48 * send the IPI.
49 */
50 x2apic_icr_write(cfg, apicid);
51}
52
53/*
54 * for now, we send the IPI's one by one in the cpumask.
55 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
56 * at once. We have 16 cpu's in a cluster. This will minimize IPI register
57 * writes.
58 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
60{
61 unsigned long flags;
62 unsigned long query_cpu;
63
64 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) {
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL);
68 }
69 local_irq_restore(flags);
70}
71
72static void x2apic_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75
76 cpu_clear(smp_processor_id(), mask);
77
78 if (!cpus_empty(mask))
79 x2apic_send_IPI_mask(mask, vector);
80}
81
82static void x2apic_send_IPI_all(int vector)
83{
84 x2apic_send_IPI_mask(cpu_online_map, vector);
85}
86
87static int x2apic_apic_id_registered(void)
88{
89 return 1;
90}
91
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
93{
94 int cpu;
95
96 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID.
98 * May as well be the first.
99 */
100 cpu = first_cpu(cpumask);
101 if ((unsigned)cpu < NR_CPUS)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else
104 return BAD_APICID;
105}
106
107static unsigned int get_apic_id(unsigned long x)
108{
109 unsigned int id;
110
111 id = x;
112 return id;
113}
114
115static unsigned long set_apic_id(unsigned int id)
116{
117 unsigned long x;
118
119 x = id;
120 return x;
121}
122
123static unsigned int phys_pkg_id(int index_msb)
124{
125 return current_cpu_data.initial_apicid >> index_msb;
126}
127
128static void x2apic_send_IPI_self(int vector)
129{
130 apic_write(APIC_SELF_IPI, vector);
131}
132
133static void init_x2apic_ldr(void)
134{
135 int cpu = smp_processor_id();
136
137 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
138 return;
139}
140
141struct genapic apic_x2apic_cluster = {
142 .name = "cluster x2apic",
143 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
144 .int_delivery_mode = dest_LowestPrio,
145 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
146 .target_cpus = x2apic_target_cpus,
147 .vector_allocation_domain = x2apic_vector_allocation_domain,
148 .apic_id_registered = x2apic_apic_id_registered,
149 .init_apic_ldr = init_x2apic_ldr,
150 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask,
153 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
155 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id,
158 .apic_id_mask = (0xFFFFFFFFu),
159};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 000000000000..8f1343df2627
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13static int x2apic_phys;
14
15static int set_x2apic_phys_mode(char *arg)
16{
17 x2apic_phys = 1;
18 return 0;
19}
20early_param("x2apic_phys", set_x2apic_phys_mode);
21
22static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{
24 if (cpu_has_x2apic && x2apic_phys)
25 return 1;
26
27 return 0;
28}
29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31
32static cpumask_t x2apic_target_cpus(void)
33{
34 return cpumask_of_cpu(0);
35}
36
37static cpumask_t x2apic_vector_allocation_domain(int cpu)
38{
39 cpumask_t domain = CPU_MASK_NONE;
40 cpu_set(cpu, domain);
41 return domain;
42}
43
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
45 unsigned int dest)
46{
47 unsigned long cfg;
48
49 cfg = __prepare_ICR(0, vector, dest);
50
51 /*
52 * send the IPI.
53 */
54 x2apic_icr_write(cfg, apicid);
55}
56
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
58{
59 unsigned long flags;
60 unsigned long query_cpu;
61
62 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL);
66 }
67 local_irq_restore(flags);
68}
69
70static void x2apic_send_IPI_allbutself(int vector)
71{
72 cpumask_t mask = cpu_online_map;
73
74 cpu_clear(smp_processor_id(), mask);
75
76 if (!cpus_empty(mask))
77 x2apic_send_IPI_mask(mask, vector);
78}
79
80static void x2apic_send_IPI_all(int vector)
81{
82 x2apic_send_IPI_mask(cpu_online_map, vector);
83}
84
85static int x2apic_apic_id_registered(void)
86{
87 return 1;
88}
89
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
91{
92 int cpu;
93
94 /*
95 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first.
97 */
98 cpu = first_cpu(cpumask);
99 if ((unsigned)cpu < NR_CPUS)
100 return per_cpu(x86_cpu_to_apicid, cpu);
101 else
102 return BAD_APICID;
103}
104
105static unsigned int get_apic_id(unsigned long x)
106{
107 unsigned int id;
108
109 id = x;
110 return id;
111}
112
113static unsigned long set_apic_id(unsigned int id)
114{
115 unsigned long x;
116
117 x = id;
118 return x;
119}
120
121static unsigned int phys_pkg_id(int index_msb)
122{
123 return current_cpu_data.initial_apicid >> index_msb;
124}
125
126void x2apic_send_IPI_self(int vector)
127{
128 apic_write(APIC_SELF_IPI, vector);
129}
130
131void init_x2apic_ldr(void)
132{
133 return;
134}
135
136struct genapic apic_x2apic_phys = {
137 .name = "physical x2apic",
138 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
139 .int_delivery_mode = dest_Fixed,
140 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
141 .target_cpus = x2apic_target_cpus,
142 .vector_allocation_domain = x2apic_vector_allocation_domain,
143 .apic_id_registered = x2apic_apic_id_registered,
144 .init_apic_ldr = init_x2apic_ldr,
145 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask,
148 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
150 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id,
153 .apic_id_mask = (0xFFFFFFFFu),
154};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index bfa837cb16be..33581d94a90e 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,12 +12,12 @@
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/sched.h> 17#include <linux/sched.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
@@ -26,6 +26,36 @@
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h> 27#include <asm/uv/bios.h>
28 28
29DEFINE_PER_CPU(int, x2apic_extra_bits);
30
31static enum uv_system_type uv_system_type;
32
33static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
34{
35 if (!strcmp(oem_id, "SGI")) {
36 if (!strcmp(oem_table_id, "UVL"))
37 uv_system_type = UV_LEGACY_APIC;
38 else if (!strcmp(oem_table_id, "UVX"))
39 uv_system_type = UV_X2APIC;
40 else if (!strcmp(oem_table_id, "UVH")) {
41 uv_system_type = UV_NON_UNIQUE_APIC;
42 return 1;
43 }
44 }
45 return 0;
46}
47
48enum uv_system_type get_uv_system_type(void)
49{
50 return uv_system_type;
51}
52
53int is_uv_system(void)
54{
55 return uv_system_type != UV_NONE;
56}
57EXPORT_SYMBOL_GPL(is_uv_system);
58
29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 59DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 60EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
31 61
@@ -84,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector)
84 unsigned long val, apicid, lapicid; 114 unsigned long val, apicid, lapicid;
85 int pnode; 115 int pnode;
86 116
87 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ 117 apicid = per_cpu(x86_cpu_to_apicid, cpu);
88 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 118 lapicid = apicid & 0x3f; /* ZZZ macro needed */
89 pnode = uv_apicid_to_pnode(apicid); 119 pnode = uv_apicid_to_pnode(apicid);
90 val = 120 val =
@@ -123,6 +153,10 @@ static int uv_apic_id_registered(void)
123 return 1; 153 return 1;
124} 154}
125 155
156static void uv_init_apic_ldr(void)
157{
158}
159
126static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 160static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
127{ 161{
128 int cpu; 162 int cpu;
@@ -138,31 +172,59 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
138 return BAD_APICID; 172 return BAD_APICID;
139} 173}
140 174
175static unsigned int get_apic_id(unsigned long x)
176{
177 unsigned int id;
178
179 WARN_ON(preemptible() && num_online_cpus() > 1);
180 id = x | __get_cpu_var(x2apic_extra_bits);
181
182 return id;
183}
184
185static unsigned long set_apic_id(unsigned int id)
186{
187 unsigned long x;
188
189 /* maskout x2apic_extra_bits ? */
190 x = id;
191 return x;
192}
193
194static unsigned int uv_read_apic_id(void)
195{
196
197 return get_apic_id(apic_read(APIC_ID));
198}
199
141static unsigned int phys_pkg_id(int index_msb) 200static unsigned int phys_pkg_id(int index_msb)
142{ 201{
143 return GET_APIC_ID(read_apic_id()) >> index_msb; 202 return uv_read_apic_id() >> index_msb;
144} 203}
145 204
146#ifdef ZZZ /* Needs x2apic patch */
147static void uv_send_IPI_self(int vector) 205static void uv_send_IPI_self(int vector)
148{ 206{
149 apic_write(APIC_SELF_IPI, vector); 207 apic_write(APIC_SELF_IPI, vector);
150} 208}
151#endif
152 209
153struct genapic apic_x2apic_uv_x = { 210struct genapic apic_x2apic_uv_x = {
154 .name = "UV large system", 211 .name = "UV large system",
212 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
155 .int_delivery_mode = dest_Fixed, 213 .int_delivery_mode = dest_Fixed,
156 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 214 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
157 .target_cpus = uv_target_cpus, 215 .target_cpus = uv_target_cpus,
158 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ 216 .vector_allocation_domain = uv_vector_allocation_domain,
159 .apic_id_registered = uv_apic_id_registered, 217 .apic_id_registered = uv_apic_id_registered,
218 .init_apic_ldr = uv_init_apic_ldr,
160 .send_IPI_all = uv_send_IPI_all, 219 .send_IPI_all = uv_send_IPI_all,
161 .send_IPI_allbutself = uv_send_IPI_allbutself, 220 .send_IPI_allbutself = uv_send_IPI_allbutself,
162 .send_IPI_mask = uv_send_IPI_mask, 221 .send_IPI_mask = uv_send_IPI_mask,
163 /* ZZZ.send_IPI_self = uv_send_IPI_self, */ 222 .send_IPI_self = uv_send_IPI_self,
164 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 223 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
165 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 224 .phys_pkg_id = phys_pkg_id,
225 .get_apic_id = get_apic_id,
226 .set_apic_id = set_apic_id,
227 .apic_id_mask = (0xFFFFFFFFu),
166}; 228};
167 229
168static __cpuinit void set_x2apic_extra_bits(int pnode) 230static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -222,12 +284,13 @@ static __init void map_low_mmrs(void)
222 284
223enum map_type {map_wb, map_uc}; 285enum map_type {map_wb, map_uc};
224 286
225static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) 287static __init void map_high(char *id, unsigned long base, int shift,
288 int max_pnode, enum map_type map_type)
226{ 289{
227 unsigned long bytes, paddr; 290 unsigned long bytes, paddr;
228 291
229 paddr = base << shift; 292 paddr = base << shift;
230 bytes = (1UL << shift); 293 bytes = (1UL << shift) * (max_pnode + 1);
231 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 294 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
232 paddr + bytes); 295 paddr + bytes);
233 if (map_type == map_uc) 296 if (map_type == map_uc)
@@ -243,7 +306,7 @@ static __init void map_gru_high(int max_pnode)
243 306
244 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 307 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
245 if (gru.s.enable) 308 if (gru.s.enable)
246 map_high("GRU", gru.s.base, shift, map_wb); 309 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
247} 310}
248 311
249static __init void map_config_high(int max_pnode) 312static __init void map_config_high(int max_pnode)
@@ -253,7 +316,7 @@ static __init void map_config_high(int max_pnode)
253 316
254 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); 317 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
255 if (cfg.s.enable) 318 if (cfg.s.enable)
256 map_high("CONFIG", cfg.s.base, shift, map_uc); 319 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
257} 320}
258 321
259static __init void map_mmr_high(int max_pnode) 322static __init void map_mmr_high(int max_pnode)
@@ -263,7 +326,7 @@ static __init void map_mmr_high(int max_pnode)
263 326
264 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 327 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
265 if (mmr.s.enable) 328 if (mmr.s.enable)
266 map_high("MMR", mmr.s.base, shift, map_uc); 329 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
267} 330}
268 331
269static __init void map_mmioh_high(int max_pnode) 332static __init void map_mmioh_high(int max_pnode)
@@ -273,7 +336,7 @@ static __init void map_mmioh_high(int max_pnode)
273 336
274 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 337 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
275 if (mmioh.s.enable) 338 if (mmioh.s.enable)
276 map_high("MMIOH", mmioh.s.base, shift, map_uc); 339 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
277} 340}
278 341
279static __init void uv_rtc_init(void) 342static __init void uv_rtc_init(void)
@@ -401,3 +464,5 @@ void __cpuinit uv_cpu_init(void)
401 if (get_uv_system_type() == UV_NON_UNIQUE_APIC) 464 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
402 set_x2apic_extra_bits(uv_hub_info->pnode); 465 set_x2apic_extra_bits(uv_hub_info->pnode);
403} 466}
467
468
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..1dcb0f13897e 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
38 39
39 /* Fixup: bios puts an EBDA in the top 64K segment */ 40 /* Fixup: bios puts an EBDA in the top 64K segment */
40 /* of conventional memory, but does not adjust lowmem. */ 41 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 73deaffadd03..acf62fc233da 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -115,13 +115,17 @@ static void hpet_reserve_platform_timers(unsigned long id)
115 hd.hd_phys_address = hpet_address; 115 hd.hd_phys_address = hpet_address;
116 hd.hd_address = hpet; 116 hd.hd_address = hpet;
117 hd.hd_nirqs = nrtimers; 117 hd.hd_nirqs = nrtimers;
118 hd.hd_flags = HPET_DATA_PLATFORM;
119 hpet_reserve_timer(&hd, 0); 118 hpet_reserve_timer(&hd, 0);
120 119
121#ifdef CONFIG_HPET_EMULATE_RTC 120#ifdef CONFIG_HPET_EMULATE_RTC
122 hpet_reserve_timer(&hd, 1); 121 hpet_reserve_timer(&hd, 1);
123#endif 122#endif
124 123
124 /*
125 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
126 * is wrong for i8259!) not the output IRQ. Many BIOS writers
127 * don't bother configuring *any* comparator interrupts.
128 */
125 hd.hd_irq[0] = HPET_LEGACY_8254; 129 hd.hd_irq[0] = HPET_LEGACY_8254;
126 hd.hd_irq[1] = HPET_LEGACY_RTC; 130 hd.hd_irq[1] = HPET_LEGACY_RTC;
127 131
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb82..1f20608d4ca8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
21# include <asm/sigcontext32.h> 21# include <asm/sigcontext32.h>
22# include <asm/user32.h> 22# include <asm/user32.h>
23#else 23#else
24# define save_i387_ia32 save_i387 24# define save_i387_xstate_ia32 save_i387_xstate
25# define restore_i387_ia32 restore_i387 25# define restore_i387_xstate_ia32 restore_i387_xstate
26# define _fpstate_ia32 _fpstate 26# define _fpstate_ia32 _fpstate
27# define _xstate_ia32 _xstate
28# define sig_xstate_ia32_size sig_xstate_size
29# define fx_sw_reserved_ia32 fx_sw_reserved
27# define user_i387_ia32_struct user_i387_struct 30# define user_i387_ia32_struct user_i387_struct
28# define user32_fxsr_struct user_fxsr_struct 31# define user32_fxsr_struct user_fxsr_struct
29#endif 32#endif
@@ -36,6 +39,7 @@
36 39
37static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 40static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
38unsigned int xstate_size; 41unsigned int xstate_size;
42unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
39static struct i387_fxsave_struct fx_scratch __cpuinitdata; 43static struct i387_fxsave_struct fx_scratch __cpuinitdata;
40 44
41void __cpuinit mxcsr_feature_mask_init(void) 45void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
61 return; 65 return;
62 } 66 }
63 67
68 if (cpu_has_xsave) {
69 xsave_cntxt_init();
70 return;
71 }
72
64 if (cpu_has_fxsr) 73 if (cpu_has_fxsr)
65 xstate_size = sizeof(struct i387_fxsave_struct); 74 xstate_size = sizeof(struct i387_fxsave_struct);
66#ifdef CONFIG_X86_32 75#ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
83 92
84 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 93 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
85 94
95 /*
96 * Boot processor to setup the FP and extended state context info.
97 */
98 if (!smp_processor_id())
99 init_thread_xstate();
100 xsave_init();
101
86 mxcsr_feature_mask_init(); 102 mxcsr_feature_mask_init();
87 /* clean state in init */ 103 /* clean state in init */
88 current_thread_info()->status = 0; 104 if (cpu_has_xsave)
105 current_thread_info()->status = TS_XSAVE;
106 else
107 current_thread_info()->status = 0;
89 clear_used_math(); 108 clear_used_math();
90} 109}
91#endif /* CONFIG_X86_64 */ 110#endif /* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
195 */ 214 */
196 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 215 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
197 216
217 /*
218 * update the header bits in the xsave header, indicating the
219 * presence of FP and SSE state.
220 */
221 if (cpu_has_xsave)
222 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
223
198 return ret; 224 return ret;
199} 225}
200 226
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
395 if (!ret) 421 if (!ret)
396 convert_to_fxsr(target, &env); 422 convert_to_fxsr(target, &env);
397 423
424 /*
425 * update the header bit in the xsave header, indicating the
426 * presence of FP.
427 */
428 if (cpu_has_xsave)
429 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
398 return ret; 430 return ret;
399} 431}
400 432
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
407 struct task_struct *tsk = current; 439 struct task_struct *tsk = current;
408 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 440 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
409 441
410 unlazy_fpu(tsk);
411 fp->status = fp->swd; 442 fp->status = fp->swd;
412 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 443 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
413 return -1; 444 return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
421 struct user_i387_ia32_struct env; 452 struct user_i387_ia32_struct env;
422 int err = 0; 453 int err = 0;
423 454
424 unlazy_fpu(tsk);
425
426 convert_from_fxsr(&env, tsk); 455 convert_from_fxsr(&env, tsk);
427 if (__copy_to_user(buf, &env, sizeof(env))) 456 if (__copy_to_user(buf, &env, sizeof(env)))
428 return -1; 457 return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
432 if (err) 461 if (err)
433 return -1; 462 return -1;
434 463
435 if (__copy_to_user(&buf->_fxsr_env[0], fx, 464 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
436 sizeof(struct i387_fxsave_struct))) 465 return -1;
466 return 1;
467}
468
469static int save_i387_xsave(void __user *buf)
470{
471 struct task_struct *tsk = current;
472 struct _fpstate_ia32 __user *fx = buf;
473 int err = 0;
474
475 /*
476 * For legacy compatible, we always set FP/SSE bits in the bit
477 * vector while saving the state to the user context.
478 * This will enable us capturing any changes(during sigreturn) to
479 * the FP/SSE bits by the legacy applications which don't touch
480 * xstate_bv in the xsave header.
481 *
482 * xsave aware applications can change the xstate_bv in the xsave
483 * header as well as change any contents in the memory layout.
484 * xrestore as part of sigreturn will capture all the changes.
485 */
486 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
487
488 if (save_i387_fxsave(fx) < 0)
489 return -1;
490
491 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
492 sizeof(struct _fpx_sw_bytes));
493 err |= __put_user(FP_XSTATE_MAGIC2,
494 (__u32 __user *) (buf + sig_xstate_ia32_size
495 - FP_XSTATE_MAGIC2_SIZE));
496 if (err)
437 return -1; 497 return -1;
498
438 return 1; 499 return 1;
439} 500}
440 501
441int save_i387_ia32(struct _fpstate_ia32 __user *buf) 502int save_i387_xstate_ia32(void __user *buf)
442{ 503{
504 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
505 struct task_struct *tsk = current;
506
443 if (!used_math()) 507 if (!used_math())
444 return 0; 508 return 0;
509
510 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
511 return -EACCES;
445 /* 512 /*
446 * This will cause a "finit" to be triggered by the next 513 * This will cause a "finit" to be triggered by the next
447 * attempted FPU operation by the 'current' process. 514 * attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
451 if (!HAVE_HWFP) { 518 if (!HAVE_HWFP) {
452 return fpregs_soft_get(current, NULL, 519 return fpregs_soft_get(current, NULL,
453 0, sizeof(struct user_i387_ia32_struct), 520 0, sizeof(struct user_i387_ia32_struct),
454 NULL, buf) ? -1 : 1; 521 NULL, fp) ? -1 : 1;
455 } 522 }
456 523
524 unlazy_fpu(tsk);
525
526 if (cpu_has_xsave)
527 return save_i387_xsave(fp);
457 if (cpu_has_fxsr) 528 if (cpu_has_fxsr)
458 return save_i387_fxsave(buf); 529 return save_i387_fxsave(fp);
459 else 530 else
460 return save_i387_fsave(buf); 531 return save_i387_fsave(fp);
461} 532}
462 533
463static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) 534static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
468 sizeof(struct i387_fsave_struct)); 539 sizeof(struct i387_fsave_struct));
469} 540}
470 541
471static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) 542static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
543 unsigned int size)
472{ 544{
473 struct task_struct *tsk = current; 545 struct task_struct *tsk = current;
474 struct user_i387_ia32_struct env; 546 struct user_i387_ia32_struct env;
475 int err; 547 int err;
476 548
477 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 549 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
478 sizeof(struct i387_fxsave_struct)); 550 size);
479 /* mxcsr reserved bits must be masked to zero for security reasons */ 551 /* mxcsr reserved bits must be masked to zero for security reasons */
480 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 552 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
481 if (err || __copy_from_user(&env, buf, sizeof(env))) 553 if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
485 return 0; 557 return 0;
486} 558}
487 559
488int restore_i387_ia32(struct _fpstate_ia32 __user *buf) 560static int restore_i387_xsave(void __user *buf)
561{
562 struct _fpx_sw_bytes fx_sw_user;
563 struct _fpstate_ia32 __user *fx_user =
564 ((struct _fpstate_ia32 __user *) buf);
565 struct i387_fxsave_struct __user *fx =
566 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
567 struct xsave_hdr_struct *xsave_hdr =
568 &current->thread.xstate->xsave.xsave_hdr;
569 u64 mask;
570 int err;
571
572 if (check_for_xstate(fx, buf, &fx_sw_user))
573 goto fx_only;
574
575 mask = fx_sw_user.xstate_bv;
576
577 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
578
579 xsave_hdr->xstate_bv &= pcntxt_mask;
580 /*
581 * These bits must be zero.
582 */
583 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
584
585 /*
586 * Init the state that is not present in the memory layout
587 * and enabled by the OS.
588 */
589 mask = ~(pcntxt_mask & ~mask);
590 xsave_hdr->xstate_bv &= mask;
591
592 return err;
593fx_only:
594 /*
595 * Couldn't find the extended state information in the memory
596 * layout. Restore the FP/SSE and init the other extended state
597 * enabled by the OS.
598 */
599 xsave_hdr->xstate_bv = XSTATE_FPSSE;
600 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
601}
602
603int restore_i387_xstate_ia32(void __user *buf)
489{ 604{
490 int err; 605 int err;
491 struct task_struct *tsk = current; 606 struct task_struct *tsk = current;
607 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
492 608
493 if (HAVE_HWFP) 609 if (HAVE_HWFP)
494 clear_fpu(tsk); 610 clear_fpu(tsk);
495 611
612 if (!buf) {
613 if (used_math()) {
614 clear_fpu(tsk);
615 clear_used_math();
616 }
617
618 return 0;
619 } else
620 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
621 return -EACCES;
622
496 if (!used_math()) { 623 if (!used_math()) {
497 err = init_fpu(tsk); 624 err = init_fpu(tsk);
498 if (err) 625 if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
500 } 627 }
501 628
502 if (HAVE_HWFP) { 629 if (HAVE_HWFP) {
503 if (cpu_has_fxsr) 630 if (cpu_has_xsave)
504 err = restore_i387_fxsave(buf); 631 err = restore_i387_xsave(buf);
632 else if (cpu_has_fxsr)
633 err = restore_i387_fxsave(fp, sizeof(struct
634 i387_fxsave_struct));
505 else 635 else
506 err = restore_i387_fsave(buf); 636 err = restore_i387_fsave(fp);
507 } else { 637 } else {
508 err = fpregs_soft_set(current, NULL, 638 err = fpregs_soft_set(current, NULL,
509 0, sizeof(struct user_i387_ia32_struct), 639 0, sizeof(struct user_i387_ia32_struct),
510 NULL, buf) != 0; 640 NULL, fp) != 0;
511 } 641 }
512 set_used_math(); 642 set_used_math();
513 643
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49d9204..4b8a53d841f7 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
282 282
283device_initcall(i8259A_init_sysfs); 283device_initcall(i8259A_init_sysfs);
284 284
285void mask_8259A(void)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 spin_unlock_irqrestore(&i8259A_lock, flags);
295}
296
297void unmask_8259A(void)
298{
299 unsigned long flags;
300
301 spin_lock_irqsave(&i8259A_lock, flags);
302
303 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
304 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
305
306 spin_unlock_irqrestore(&i8259A_lock, flags);
307}
308
285void init_8259A(int auto_eoi) 309void init_8259A(int auto_eoi)
286{ 310{
287 unsigned long flags; 311 unsigned long flags;
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 09cddb57bec4..e710289f673e 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -46,10 +46,13 @@
46#include <asm/nmi.h> 46#include <asm/nmi.h>
47#include <asm/msidef.h> 47#include <asm/msidef.h>
48#include <asm/hypertransport.h> 48#include <asm/hypertransport.h>
49#include <asm/setup.h>
49 50
50#include <mach_apic.h> 51#include <mach_apic.h>
51#include <mach_apicdef.h> 52#include <mach_apicdef.h>
52 53
54#define __apicdebuginit(type) static type __init
55
53int (*ioapic_renumber_irq)(int ioapic, int irq); 56int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count; 57atomic_t irq_mis_count;
55 58
@@ -1341,7 +1344,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1341 ioapic_write_entry(apic, pin, entry); 1344 ioapic_write_entry(apic, pin, entry);
1342} 1345}
1343 1346
1344void __init print_IO_APIC(void) 1347
1348__apicdebuginit(void) print_IO_APIC(void)
1345{ 1349{
1346 int apic, i; 1350 int apic, i;
1347 union IO_APIC_reg_00 reg_00; 1351 union IO_APIC_reg_00 reg_00;
@@ -1456,9 +1460,7 @@ void __init print_IO_APIC(void)
1456 return; 1460 return;
1457} 1461}
1458 1462
1459#if 0 1463__apicdebuginit(void) print_APIC_bitfield(int base)
1460
1461static void print_APIC_bitfield(int base)
1462{ 1464{
1463 unsigned int v; 1465 unsigned int v;
1464 int i, j; 1466 int i, j;
@@ -1479,9 +1481,10 @@ static void print_APIC_bitfield(int base)
1479 } 1481 }
1480} 1482}
1481 1483
1482void /*__init*/ print_local_APIC(void *dummy) 1484__apicdebuginit(void) print_local_APIC(void *dummy)
1483{ 1485{
1484 unsigned int v, ver, maxlvt; 1486 unsigned int v, ver, maxlvt;
1487 u64 icr;
1485 1488
1486 if (apic_verbosity == APIC_QUIET) 1489 if (apic_verbosity == APIC_QUIET)
1487 return; 1490 return;
@@ -1490,7 +1493,7 @@ void /*__init*/ print_local_APIC(void *dummy)
1490 smp_processor_id(), hard_smp_processor_id()); 1493 smp_processor_id(), hard_smp_processor_id());
1491 v = apic_read(APIC_ID); 1494 v = apic_read(APIC_ID);
1492 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, 1495 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1493 GET_APIC_ID(read_apic_id())); 1496 GET_APIC_ID(v));
1494 v = apic_read(APIC_LVR); 1497 v = apic_read(APIC_LVR);
1495 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1498 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1496 ver = GET_APIC_VERSION(v); 1499 ver = GET_APIC_VERSION(v);
@@ -1532,10 +1535,9 @@ void /*__init*/ print_local_APIC(void *dummy)
1532 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1535 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1533 } 1536 }
1534 1537
1535 v = apic_read(APIC_ICR); 1538 icr = apic_icr_read();
1536 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1539 printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
1537 v = apic_read(APIC_ICR2); 1540 printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
1538 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1539 1541
1540 v = apic_read(APIC_LVTT); 1542 v = apic_read(APIC_LVTT);
1541 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1543 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1563,12 +1565,12 @@ void /*__init*/ print_local_APIC(void *dummy)
1563 printk("\n"); 1565 printk("\n");
1564} 1566}
1565 1567
1566void print_all_local_APICs(void) 1568__apicdebuginit(void) print_all_local_APICs(void)
1567{ 1569{
1568 on_each_cpu(print_local_APIC, NULL, 1); 1570 on_each_cpu(print_local_APIC, NULL, 1);
1569} 1571}
1570 1572
1571void /*__init*/ print_PIC(void) 1573__apicdebuginit(void) print_PIC(void)
1572{ 1574{
1573 unsigned int v; 1575 unsigned int v;
1574 unsigned long flags; 1576 unsigned long flags;
@@ -1600,7 +1602,17 @@ void /*__init*/ print_PIC(void)
1600 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1602 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1601} 1603}
1602 1604
1603#endif /* 0 */ 1605__apicdebuginit(int) print_all_ICs(void)
1606{
1607 print_PIC();
1608 print_all_local_APICs();
1609 print_IO_APIC();
1610
1611 return 0;
1612}
1613
1614fs_initcall(print_all_ICs);
1615
1604 1616
1605static void __init enable_IO_APIC(void) 1617static void __init enable_IO_APIC(void)
1606{ 1618{
@@ -1698,8 +1710,7 @@ void disable_IO_APIC(void)
1698 entry.dest_mode = 0; /* Physical */ 1710 entry.dest_mode = 0; /* Physical */
1699 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1711 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1700 entry.vector = 0; 1712 entry.vector = 0;
1701 entry.dest.physical.physical_dest = 1713 entry.dest.physical.physical_dest = read_apic_id();
1702 GET_APIC_ID(read_apic_id());
1703 1714
1704 /* 1715 /*
1705 * Add it to the IO-APIC irq-routing table: 1716 * Add it to the IO-APIC irq-routing table:
@@ -1725,10 +1736,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
1725 unsigned char old_id; 1736 unsigned char old_id;
1726 unsigned long flags; 1737 unsigned long flags;
1727 1738
1728#ifdef CONFIG_X86_NUMAQ 1739 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1729 if (found_numaq)
1730 return; 1740 return;
1731#endif
1732 1741
1733 /* 1742 /*
1734 * Don't check I/O APIC IDs for xAPIC systems. They have 1743 * Don't check I/O APIC IDs for xAPIC systems. They have
@@ -2329,8 +2338,6 @@ void __init setup_IO_APIC(void)
2329 setup_IO_APIC_irqs(); 2338 setup_IO_APIC_irqs();
2330 init_IO_APIC_traps(); 2339 init_IO_APIC_traps();
2331 check_timer(); 2340 check_timer();
2332 if (!acpi_ioapic)
2333 print_IO_APIC();
2334} 2341}
2335 2342
2336/* 2343/*
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 61a83b70c18f..02063ae042f7 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -37,6 +37,7 @@
37#include <acpi/acpi_bus.h> 37#include <acpi/acpi_bus.h>
38#endif 38#endif
39#include <linux/bootmem.h> 39#include <linux/bootmem.h>
40#include <linux/dmar.h>
40 41
41#include <asm/idle.h> 42#include <asm/idle.h>
42#include <asm/io.h> 43#include <asm/io.h>
@@ -49,10 +50,13 @@
49#include <asm/nmi.h> 50#include <asm/nmi.h>
50#include <asm/msidef.h> 51#include <asm/msidef.h>
51#include <asm/hypertransport.h> 52#include <asm/hypertransport.h>
53#include <asm/irq_remapping.h>
52 54
53#include <mach_ipi.h> 55#include <mach_ipi.h>
54#include <mach_apic.h> 56#include <mach_apic.h>
55 57
58#define __apicdebuginit(type) static type __init
59
56struct irq_cfg { 60struct irq_cfg {
57 cpumask_t domain; 61 cpumask_t domain;
58 cpumask_t old_domain; 62 cpumask_t old_domain;
@@ -87,8 +91,6 @@ int first_system_vector = 0xfe;
87 91
88char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; 92char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
89 93
90#define __apicdebuginit __init
91
92int sis_apic_bug; /* not actually supported, dummy for compile */ 94int sis_apic_bug; /* not actually supported, dummy for compile */
93 95
94static int no_timer_check; 96static int no_timer_check;
@@ -108,6 +110,9 @@ static DEFINE_SPINLOCK(vector_lock);
108 */ 110 */
109int nr_ioapic_registers[MAX_IO_APICS]; 111int nr_ioapic_registers[MAX_IO_APICS];
110 112
113/* I/O APIC RTE contents at the OS boot up */
114struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
115
111/* I/O APIC entries */ 116/* I/O APIC entries */
112struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 117struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
113int nr_ioapics; 118int nr_ioapics;
@@ -303,7 +308,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
303 pin = entry->pin; 308 pin = entry->pin;
304 if (pin == -1) 309 if (pin == -1)
305 break; 310 break;
306 io_apic_write(apic, 0x11 + pin*2, dest); 311 /*
312 * With interrupt-remapping, destination information comes
313 * from interrupt-remapping table entry.
314 */
315 if (!irq_remapped(irq))
316 io_apic_write(apic, 0x11 + pin*2, dest);
307 reg = io_apic_read(apic, 0x10 + pin*2); 317 reg = io_apic_read(apic, 0x10 + pin*2);
308 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 318 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
309 reg |= vector; 319 reg |= vector;
@@ -440,6 +450,69 @@ static void clear_IO_APIC (void)
440 clear_IO_APIC_pin(apic, pin); 450 clear_IO_APIC_pin(apic, pin);
441} 451}
442 452
453/*
454 * Saves and masks all the unmasked IO-APIC RTE's
455 */
456int save_mask_IO_APIC_setup(void)
457{
458 union IO_APIC_reg_01 reg_01;
459 unsigned long flags;
460 int apic, pin;
461
462 /*
463 * The number of IO-APIC IRQ registers (== #pins):
464 */
465 for (apic = 0; apic < nr_ioapics; apic++) {
466 spin_lock_irqsave(&ioapic_lock, flags);
467 reg_01.raw = io_apic_read(apic, 1);
468 spin_unlock_irqrestore(&ioapic_lock, flags);
469 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
470 }
471
472 for (apic = 0; apic < nr_ioapics; apic++) {
473 early_ioapic_entries[apic] =
474 kzalloc(sizeof(struct IO_APIC_route_entry) *
475 nr_ioapic_registers[apic], GFP_KERNEL);
476 if (!early_ioapic_entries[apic])
477 return -ENOMEM;
478 }
479
480 for (apic = 0; apic < nr_ioapics; apic++)
481 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
482 struct IO_APIC_route_entry entry;
483
484 entry = early_ioapic_entries[apic][pin] =
485 ioapic_read_entry(apic, pin);
486 if (!entry.mask) {
487 entry.mask = 1;
488 ioapic_write_entry(apic, pin, entry);
489 }
490 }
491 return 0;
492}
493
494void restore_IO_APIC_setup(void)
495{
496 int apic, pin;
497
498 for (apic = 0; apic < nr_ioapics; apic++)
499 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
500 ioapic_write_entry(apic, pin,
501 early_ioapic_entries[apic][pin]);
502}
503
504void reinit_intr_remapped_IO_APIC(int intr_remapping)
505{
506 /*
507 * for now plain restore of previous settings.
508 * TBD: In the case of OS enabling interrupt-remapping,
509 * IO-APIC RTE's need to be setup to point to interrupt-remapping
510 * table entries. for now, do a plain restore, and wait for
511 * the setup_IO_APIC_irqs() to do proper initialization.
512 */
513 restore_IO_APIC_setup();
514}
515
443int skip_ioapic_setup; 516int skip_ioapic_setup;
444int ioapic_force; 517int ioapic_force;
445 518
@@ -839,18 +912,98 @@ void __setup_vector_irq(int cpu)
839} 912}
840 913
841static struct irq_chip ioapic_chip; 914static struct irq_chip ioapic_chip;
915#ifdef CONFIG_INTR_REMAP
916static struct irq_chip ir_ioapic_chip;
917#endif
842 918
843static void ioapic_register_intr(int irq, unsigned long trigger) 919static void ioapic_register_intr(int irq, unsigned long trigger)
844{ 920{
845 if (trigger) { 921 if (trigger)
846 irq_desc[irq].status |= IRQ_LEVEL; 922 irq_desc[irq].status |= IRQ_LEVEL;
847 set_irq_chip_and_handler_name(irq, &ioapic_chip, 923 else
848 handle_fasteoi_irq, "fasteoi");
849 } else {
850 irq_desc[irq].status &= ~IRQ_LEVEL; 924 irq_desc[irq].status &= ~IRQ_LEVEL;
925
926#ifdef CONFIG_INTR_REMAP
927 if (irq_remapped(irq)) {
928 irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
929 if (trigger)
930 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
931 handle_fasteoi_irq,
932 "fasteoi");
933 else
934 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
935 handle_edge_irq, "edge");
936 return;
937 }
938#endif
939 if (trigger)
940 set_irq_chip_and_handler_name(irq, &ioapic_chip,
941 handle_fasteoi_irq,
942 "fasteoi");
943 else
851 set_irq_chip_and_handler_name(irq, &ioapic_chip, 944 set_irq_chip_and_handler_name(irq, &ioapic_chip,
852 handle_edge_irq, "edge"); 945 handle_edge_irq, "edge");
946}
947
948static int setup_ioapic_entry(int apic, int irq,
949 struct IO_APIC_route_entry *entry,
950 unsigned int destination, int trigger,
951 int polarity, int vector)
952{
953 /*
954 * add it to the IO-APIC irq-routing table:
955 */
956 memset(entry,0,sizeof(*entry));
957
958#ifdef CONFIG_INTR_REMAP
959 if (intr_remapping_enabled) {
960 struct intel_iommu *iommu = map_ioapic_to_ir(apic);
961 struct irte irte;
962 struct IR_IO_APIC_route_entry *ir_entry =
963 (struct IR_IO_APIC_route_entry *) entry;
964 int index;
965
966 if (!iommu)
967 panic("No mapping iommu for ioapic %d\n", apic);
968
969 index = alloc_irte(iommu, irq, 1);
970 if (index < 0)
971 panic("Failed to allocate IRTE for ioapic %d\n", apic);
972
973 memset(&irte, 0, sizeof(irte));
974
975 irte.present = 1;
976 irte.dst_mode = INT_DEST_MODE;
977 irte.trigger_mode = trigger;
978 irte.dlvry_mode = INT_DELIVERY_MODE;
979 irte.vector = vector;
980 irte.dest_id = IRTE_DEST(destination);
981
982 modify_irte(irq, &irte);
983
984 ir_entry->index2 = (index >> 15) & 0x1;
985 ir_entry->zero = 0;
986 ir_entry->format = 1;
987 ir_entry->index = (index & 0x7fff);
988 } else
989#endif
990 {
991 entry->delivery_mode = INT_DELIVERY_MODE;
992 entry->dest_mode = INT_DEST_MODE;
993 entry->dest = destination;
853 } 994 }
995
996 entry->mask = 0; /* enable IRQ */
997 entry->trigger = trigger;
998 entry->polarity = polarity;
999 entry->vector = vector;
1000
1001 /* Mask level triggered irqs.
1002 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1003 */
1004 if (trigger)
1005 entry->mask = 1;
1006 return 0;
854} 1007}
855 1008
856static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1009static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
@@ -875,24 +1028,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
875 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1028 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
876 irq, trigger, polarity); 1029 irq, trigger, polarity);
877 1030
878 /*
879 * add it to the IO-APIC irq-routing table:
880 */
881 memset(&entry,0,sizeof(entry));
882
883 entry.delivery_mode = INT_DELIVERY_MODE;
884 entry.dest_mode = INT_DEST_MODE;
885 entry.dest = cpu_mask_to_apicid(mask);
886 entry.mask = 0; /* enable IRQ */
887 entry.trigger = trigger;
888 entry.polarity = polarity;
889 entry.vector = cfg->vector;
890 1031
891 /* Mask level triggered irqs. 1032 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
892 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1033 cpu_mask_to_apicid(mask), trigger, polarity,
893 */ 1034 cfg->vector)) {
894 if (trigger) 1035 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
895 entry.mask = 1; 1036 mp_ioapics[apic].mp_apicid, pin);
1037 __clear_irq_vector(irq);
1038 return;
1039 }
896 1040
897 ioapic_register_intr(irq, trigger); 1041 ioapic_register_intr(irq, trigger);
898 if (irq < 16) 1042 if (irq < 16)
@@ -944,6 +1088,9 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
944{ 1088{
945 struct IO_APIC_route_entry entry; 1089 struct IO_APIC_route_entry entry;
946 1090
1091 if (intr_remapping_enabled)
1092 return;
1093
947 memset(&entry, 0, sizeof(entry)); 1094 memset(&entry, 0, sizeof(entry));
948 1095
949 /* 1096 /*
@@ -970,7 +1117,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
970 ioapic_write_entry(apic, pin, entry); 1117 ioapic_write_entry(apic, pin, entry);
971} 1118}
972 1119
973void __apicdebuginit print_IO_APIC(void) 1120
1121__apicdebuginit(void) print_IO_APIC(void)
974{ 1122{
975 int apic, i; 1123 int apic, i;
976 union IO_APIC_reg_00 reg_00; 1124 union IO_APIC_reg_00 reg_00;
@@ -1064,9 +1212,7 @@ void __apicdebuginit print_IO_APIC(void)
1064 return; 1212 return;
1065} 1213}
1066 1214
1067#if 0 1215__apicdebuginit(void) print_APIC_bitfield(int base)
1068
1069static __apicdebuginit void print_APIC_bitfield (int base)
1070{ 1216{
1071 unsigned int v; 1217 unsigned int v;
1072 int i, j; 1218 int i, j;
@@ -1087,9 +1233,10 @@ static __apicdebuginit void print_APIC_bitfield (int base)
1087 } 1233 }
1088} 1234}
1089 1235
1090void __apicdebuginit print_local_APIC(void * dummy) 1236__apicdebuginit(void) print_local_APIC(void *dummy)
1091{ 1237{
1092 unsigned int v, ver, maxlvt; 1238 unsigned int v, ver, maxlvt;
1239 unsigned long icr;
1093 1240
1094 if (apic_verbosity == APIC_QUIET) 1241 if (apic_verbosity == APIC_QUIET)
1095 return; 1242 return;
@@ -1097,7 +1244,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1097 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1244 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1098 smp_processor_id(), hard_smp_processor_id()); 1245 smp_processor_id(), hard_smp_processor_id());
1099 v = apic_read(APIC_ID); 1246 v = apic_read(APIC_ID);
1100 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); 1247 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
1101 v = apic_read(APIC_LVR); 1248 v = apic_read(APIC_LVR);
1102 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1249 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1103 ver = GET_APIC_VERSION(v); 1250 ver = GET_APIC_VERSION(v);
@@ -1133,10 +1280,9 @@ void __apicdebuginit print_local_APIC(void * dummy)
1133 v = apic_read(APIC_ESR); 1280 v = apic_read(APIC_ESR);
1134 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1281 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1135 1282
1136 v = apic_read(APIC_ICR); 1283 icr = apic_icr_read();
1137 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1284 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
1138 v = apic_read(APIC_ICR2); 1285 printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
1139 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1140 1286
1141 v = apic_read(APIC_LVTT); 1287 v = apic_read(APIC_LVTT);
1142 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1288 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1164,12 +1310,12 @@ void __apicdebuginit print_local_APIC(void * dummy)
1164 printk("\n"); 1310 printk("\n");
1165} 1311}
1166 1312
1167void print_all_local_APICs (void) 1313__apicdebuginit(void) print_all_local_APICs(void)
1168{ 1314{
1169 on_each_cpu(print_local_APIC, NULL, 1); 1315 on_each_cpu(print_local_APIC, NULL, 1);
1170} 1316}
1171 1317
1172void __apicdebuginit print_PIC(void) 1318__apicdebuginit(void) print_PIC(void)
1173{ 1319{
1174 unsigned int v; 1320 unsigned int v;
1175 unsigned long flags; 1321 unsigned long flags;
@@ -1201,7 +1347,17 @@ void __apicdebuginit print_PIC(void)
1201 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1347 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1202} 1348}
1203 1349
1204#endif /* 0 */ 1350__apicdebuginit(int) print_all_ICs(void)
1351{
1352 print_PIC();
1353 print_all_local_APICs();
1354 print_IO_APIC();
1355
1356 return 0;
1357}
1358
1359fs_initcall(print_all_ICs);
1360
1205 1361
1206void __init enable_IO_APIC(void) 1362void __init enable_IO_APIC(void)
1207{ 1363{
@@ -1291,7 +1447,7 @@ void disable_IO_APIC(void)
1291 entry.dest_mode = 0; /* Physical */ 1447 entry.dest_mode = 0; /* Physical */
1292 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1448 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1293 entry.vector = 0; 1449 entry.vector = 0;
1294 entry.dest = GET_APIC_ID(read_apic_id()); 1450 entry.dest = read_apic_id();
1295 1451
1296 /* 1452 /*
1297 * Add it to the IO-APIC irq-routing table: 1453 * Add it to the IO-APIC irq-routing table:
@@ -1397,6 +1553,147 @@ static int ioapic_retrigger_irq(unsigned int irq)
1397 */ 1553 */
1398 1554
1399#ifdef CONFIG_SMP 1555#ifdef CONFIG_SMP
1556
1557#ifdef CONFIG_INTR_REMAP
1558static void ir_irq_migration(struct work_struct *work);
1559
1560static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
1561
1562/*
1563 * Migrate the IO-APIC irq in the presence of intr-remapping.
1564 *
1565 * For edge triggered, irq migration is a simple atomic update(of vector
1566 * and cpu destination) of IRTE and flush the hardware cache.
1567 *
1568 * For level triggered, we need to modify the io-apic RTE aswell with the update
1569 * vector information, along with modifying IRTE with vector and destination.
1570 * So irq migration for level triggered is little bit more complex compared to
1571 * edge triggered migration. But the good news is, we use the same algorithm
1572 * for level triggered migration as we have today, only difference being,
1573 * we now initiate the irq migration from process context instead of the
1574 * interrupt context.
1575 *
1576 * In future, when we do a directed EOI (combined with cpu EOI broadcast
1577 * suppression) to the IO-APIC, level triggered irq migration will also be
1578 * as simple as edge triggered migration and we can do the irq migration
1579 * with a simple atomic update to IO-APIC RTE.
1580 */
1581static void migrate_ioapic_irq(int irq, cpumask_t mask)
1582{
1583 struct irq_cfg *cfg = irq_cfg + irq;
1584 struct irq_desc *desc = irq_desc + irq;
1585 cpumask_t tmp, cleanup_mask;
1586 struct irte irte;
1587 int modify_ioapic_rte = desc->status & IRQ_LEVEL;
1588 unsigned int dest;
1589 unsigned long flags;
1590
1591 cpus_and(tmp, mask, cpu_online_map);
1592 if (cpus_empty(tmp))
1593 return;
1594
1595 if (get_irte(irq, &irte))
1596 return;
1597
1598 if (assign_irq_vector(irq, mask))
1599 return;
1600
1601 cpus_and(tmp, cfg->domain, mask);
1602 dest = cpu_mask_to_apicid(tmp);
1603
1604 if (modify_ioapic_rte) {
1605 spin_lock_irqsave(&ioapic_lock, flags);
1606 __target_IO_APIC_irq(irq, dest, cfg->vector);
1607 spin_unlock_irqrestore(&ioapic_lock, flags);
1608 }
1609
1610 irte.vector = cfg->vector;
1611 irte.dest_id = IRTE_DEST(dest);
1612
1613 /*
1614 * Modified the IRTE and flushes the Interrupt entry cache.
1615 */
1616 modify_irte(irq, &irte);
1617
1618 if (cfg->move_in_progress) {
1619 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1620 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
1621 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
1622 cfg->move_in_progress = 0;
1623 }
1624
1625 irq_desc[irq].affinity = mask;
1626}
1627
1628static int migrate_irq_remapped_level(int irq)
1629{
1630 int ret = -1;
1631
1632 mask_IO_APIC_irq(irq);
1633
1634 if (io_apic_level_ack_pending(irq)) {
1635 /*
1636 * Interrupt in progress. Migrating irq now will change the
1637 * vector information in the IO-APIC RTE and that will confuse
1638 * the EOI broadcast performed by cpu.
1639 * So, delay the irq migration to the next instance.
1640 */
1641 schedule_delayed_work(&ir_migration_work, 1);
1642 goto unmask;
1643 }
1644
1645 /* everthing is clear. we have right of way */
1646 migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
1647
1648 ret = 0;
1649 irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
1650 cpus_clear(irq_desc[irq].pending_mask);
1651
1652unmask:
1653 unmask_IO_APIC_irq(irq);
1654 return ret;
1655}
1656
1657static void ir_irq_migration(struct work_struct *work)
1658{
1659 int irq;
1660
1661 for (irq = 0; irq < NR_IRQS; irq++) {
1662 struct irq_desc *desc = irq_desc + irq;
1663 if (desc->status & IRQ_MOVE_PENDING) {
1664 unsigned long flags;
1665
1666 spin_lock_irqsave(&desc->lock, flags);
1667 if (!desc->chip->set_affinity ||
1668 !(desc->status & IRQ_MOVE_PENDING)) {
1669 desc->status &= ~IRQ_MOVE_PENDING;
1670 spin_unlock_irqrestore(&desc->lock, flags);
1671 continue;
1672 }
1673
1674 desc->chip->set_affinity(irq,
1675 irq_desc[irq].pending_mask);
1676 spin_unlock_irqrestore(&desc->lock, flags);
1677 }
1678 }
1679}
1680
1681/*
1682 * Migrates the IRQ destination in the process context.
1683 */
1684static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
1685{
1686 if (irq_desc[irq].status & IRQ_LEVEL) {
1687 irq_desc[irq].status |= IRQ_MOVE_PENDING;
1688 irq_desc[irq].pending_mask = mask;
1689 migrate_irq_remapped_level(irq);
1690 return;
1691 }
1692
1693 migrate_ioapic_irq(irq, mask);
1694}
1695#endif
1696
1400asmlinkage void smp_irq_move_cleanup_interrupt(void) 1697asmlinkage void smp_irq_move_cleanup_interrupt(void)
1401{ 1698{
1402 unsigned vector, me; 1699 unsigned vector, me;
@@ -1453,6 +1750,17 @@ static void irq_complete_move(unsigned int irq)
1453#else 1750#else
1454static inline void irq_complete_move(unsigned int irq) {} 1751static inline void irq_complete_move(unsigned int irq) {}
1455#endif 1752#endif
1753#ifdef CONFIG_INTR_REMAP
1754static void ack_x2apic_level(unsigned int irq)
1755{
1756 ack_x2APIC_irq();
1757}
1758
1759static void ack_x2apic_edge(unsigned int irq)
1760{
1761 ack_x2APIC_irq();
1762}
1763#endif
1456 1764
1457static void ack_apic_edge(unsigned int irq) 1765static void ack_apic_edge(unsigned int irq)
1458{ 1766{
@@ -1527,6 +1835,21 @@ static struct irq_chip ioapic_chip __read_mostly = {
1527 .retrigger = ioapic_retrigger_irq, 1835 .retrigger = ioapic_retrigger_irq,
1528}; 1836};
1529 1837
1838#ifdef CONFIG_INTR_REMAP
1839static struct irq_chip ir_ioapic_chip __read_mostly = {
1840 .name = "IR-IO-APIC",
1841 .startup = startup_ioapic_irq,
1842 .mask = mask_IO_APIC_irq,
1843 .unmask = unmask_IO_APIC_irq,
1844 .ack = ack_x2apic_edge,
1845 .eoi = ack_x2apic_level,
1846#ifdef CONFIG_SMP
1847 .set_affinity = set_ir_ioapic_affinity_irq,
1848#endif
1849 .retrigger = ioapic_retrigger_irq,
1850};
1851#endif
1852
1530static inline void init_IO_APIC_traps(void) 1853static inline void init_IO_APIC_traps(void)
1531{ 1854{
1532 int irq; 1855 int irq;
@@ -1712,6 +2035,8 @@ static inline void __init check_timer(void)
1712 * 8259A. 2035 * 8259A.
1713 */ 2036 */
1714 if (pin1 == -1) { 2037 if (pin1 == -1) {
2038 if (intr_remapping_enabled)
2039 panic("BIOS bug: timer not connected to IO-APIC");
1715 pin1 = pin2; 2040 pin1 = pin2;
1716 apic1 = apic2; 2041 apic1 = apic2;
1717 no_pin1 = 1; 2042 no_pin1 = 1;
@@ -1738,6 +2063,8 @@ static inline void __init check_timer(void)
1738 clear_IO_APIC_pin(0, pin1); 2063 clear_IO_APIC_pin(0, pin1);
1739 goto out; 2064 goto out;
1740 } 2065 }
2066 if (intr_remapping_enabled)
2067 panic("timer doesn't work through Interrupt-remapped IO-APIC");
1741 clear_IO_APIC_pin(apic1, pin1); 2068 clear_IO_APIC_pin(apic1, pin1);
1742 if (!no_pin1) 2069 if (!no_pin1)
1743 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " 2070 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -1854,8 +2181,6 @@ void __init setup_IO_APIC(void)
1854 setup_IO_APIC_irqs(); 2181 setup_IO_APIC_irqs();
1855 init_IO_APIC_traps(); 2182 init_IO_APIC_traps();
1856 check_timer(); 2183 check_timer();
1857 if (!acpi_ioapic)
1858 print_IO_APIC();
1859} 2184}
1860 2185
1861struct sysfs_ioapic_data { 2186struct sysfs_ioapic_data {
@@ -1977,6 +2302,9 @@ void destroy_irq(unsigned int irq)
1977 2302
1978 dynamic_irq_cleanup(irq); 2303 dynamic_irq_cleanup(irq);
1979 2304
2305#ifdef CONFIG_INTR_REMAP
2306 free_irte(irq);
2307#endif
1980 spin_lock_irqsave(&vector_lock, flags); 2308 spin_lock_irqsave(&vector_lock, flags);
1981 __clear_irq_vector(irq); 2309 __clear_irq_vector(irq);
1982 spin_unlock_irqrestore(&vector_lock, flags); 2310 spin_unlock_irqrestore(&vector_lock, flags);
@@ -1995,11 +2323,42 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
1995 2323
1996 tmp = TARGET_CPUS; 2324 tmp = TARGET_CPUS;
1997 err = assign_irq_vector(irq, tmp); 2325 err = assign_irq_vector(irq, tmp);
1998 if (!err) { 2326 if (err)
1999 cpus_and(tmp, cfg->domain, tmp); 2327 return err;
2000 dest = cpu_mask_to_apicid(tmp); 2328
2329 cpus_and(tmp, cfg->domain, tmp);
2330 dest = cpu_mask_to_apicid(tmp);
2331
2332#ifdef CONFIG_INTR_REMAP
2333 if (irq_remapped(irq)) {
2334 struct irte irte;
2335 int ir_index;
2336 u16 sub_handle;
2337
2338 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
2339 BUG_ON(ir_index == -1);
2340
2341 memset (&irte, 0, sizeof(irte));
2342
2343 irte.present = 1;
2344 irte.dst_mode = INT_DEST_MODE;
2345 irte.trigger_mode = 0; /* edge */
2346 irte.dlvry_mode = INT_DELIVERY_MODE;
2347 irte.vector = cfg->vector;
2348 irte.dest_id = IRTE_DEST(dest);
2349
2350 modify_irte(irq, &irte);
2001 2351
2002 msg->address_hi = MSI_ADDR_BASE_HI; 2352 msg->address_hi = MSI_ADDR_BASE_HI;
2353 msg->data = sub_handle;
2354 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
2355 MSI_ADDR_IR_SHV |
2356 MSI_ADDR_IR_INDEX1(ir_index) |
2357 MSI_ADDR_IR_INDEX2(ir_index);
2358 } else
2359#endif
2360 {
2361 msg->address_hi = MSI_ADDR_BASE_HI;
2003 msg->address_lo = 2362 msg->address_lo =
2004 MSI_ADDR_BASE_LO | 2363 MSI_ADDR_BASE_LO |
2005 ((INT_DEST_MODE == 0) ? 2364 ((INT_DEST_MODE == 0) ?
@@ -2049,6 +2408,55 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2049 write_msi_msg(irq, &msg); 2408 write_msi_msg(irq, &msg);
2050 irq_desc[irq].affinity = mask; 2409 irq_desc[irq].affinity = mask;
2051} 2410}
2411
2412#ifdef CONFIG_INTR_REMAP
2413/*
2414 * Migrate the MSI irq to another cpumask. This migration is
2415 * done in the process context using interrupt-remapping hardware.
2416 */
2417static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2418{
2419 struct irq_cfg *cfg = irq_cfg + irq;
2420 unsigned int dest;
2421 cpumask_t tmp, cleanup_mask;
2422 struct irte irte;
2423
2424 cpus_and(tmp, mask, cpu_online_map);
2425 if (cpus_empty(tmp))
2426 return;
2427
2428 if (get_irte(irq, &irte))
2429 return;
2430
2431 if (assign_irq_vector(irq, mask))
2432 return;
2433
2434 cpus_and(tmp, cfg->domain, mask);
2435 dest = cpu_mask_to_apicid(tmp);
2436
2437 irte.vector = cfg->vector;
2438 irte.dest_id = IRTE_DEST(dest);
2439
2440 /*
2441 * atomically update the IRTE with the new destination and vector.
2442 */
2443 modify_irte(irq, &irte);
2444
2445 /*
2446 * After this point, all the interrupts will start arriving
2447 * at the new destination. So, time to cleanup the previous
2448 * vector allocation.
2449 */
2450 if (cfg->move_in_progress) {
2451 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
2452 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2453 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2454 cfg->move_in_progress = 0;
2455 }
2456
2457 irq_desc[irq].affinity = mask;
2458}
2459#endif
2052#endif /* CONFIG_SMP */ 2460#endif /* CONFIG_SMP */
2053 2461
2054/* 2462/*
@@ -2066,26 +2474,157 @@ static struct irq_chip msi_chip = {
2066 .retrigger = ioapic_retrigger_irq, 2474 .retrigger = ioapic_retrigger_irq,
2067}; 2475};
2068 2476
2069int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) 2477#ifdef CONFIG_INTR_REMAP
2478static struct irq_chip msi_ir_chip = {
2479 .name = "IR-PCI-MSI",
2480 .unmask = unmask_msi_irq,
2481 .mask = mask_msi_irq,
2482 .ack = ack_x2apic_edge,
2483#ifdef CONFIG_SMP
2484 .set_affinity = ir_set_msi_irq_affinity,
2485#endif
2486 .retrigger = ioapic_retrigger_irq,
2487};
2488
2489/*
2490 * Map the PCI dev to the corresponding remapping hardware unit
2491 * and allocate 'nvec' consecutive interrupt-remapping table entries
2492 * in it.
2493 */
2494static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
2070{ 2495{
2496 struct intel_iommu *iommu;
2497 int index;
2498
2499 iommu = map_dev_to_ir(dev);
2500 if (!iommu) {
2501 printk(KERN_ERR
2502 "Unable to map PCI %s to iommu\n", pci_name(dev));
2503 return -ENOENT;
2504 }
2505
2506 index = alloc_irte(iommu, irq, nvec);
2507 if (index < 0) {
2508 printk(KERN_ERR
2509 "Unable to allocate %d IRTE for PCI %s\n", nvec,
2510 pci_name(dev));
2511 return -ENOSPC;
2512 }
2513 return index;
2514}
2515#endif
2516
2517static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
2518{
2519 int ret;
2071 struct msi_msg msg; 2520 struct msi_msg msg;
2521
2522 ret = msi_compose_msg(dev, irq, &msg);
2523 if (ret < 0)
2524 return ret;
2525
2526 set_irq_msi(irq, desc);
2527 write_msi_msg(irq, &msg);
2528
2529#ifdef CONFIG_INTR_REMAP
2530 if (irq_remapped(irq)) {
2531 struct irq_desc *desc = irq_desc + irq;
2532 /*
2533 * irq migration in process context
2534 */
2535 desc->status |= IRQ_MOVE_PCNTXT;
2536 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
2537 } else
2538#endif
2539 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
2540
2541 return 0;
2542}
2543
2544int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2545{
2072 int irq, ret; 2546 int irq, ret;
2547
2073 irq = create_irq(); 2548 irq = create_irq();
2074 if (irq < 0) 2549 if (irq < 0)
2075 return irq; 2550 return irq;
2076 2551
2077 ret = msi_compose_msg(dev, irq, &msg); 2552#ifdef CONFIG_INTR_REMAP
2553 if (!intr_remapping_enabled)
2554 goto no_ir;
2555
2556 ret = msi_alloc_irte(dev, irq, 1);
2557 if (ret < 0)
2558 goto error;
2559no_ir:
2560#endif
2561 ret = setup_msi_irq(dev, desc, irq);
2078 if (ret < 0) { 2562 if (ret < 0) {
2079 destroy_irq(irq); 2563 destroy_irq(irq);
2080 return ret; 2564 return ret;
2081 } 2565 }
2566 return 0;
2082 2567
2083 set_irq_msi(irq, desc); 2568#ifdef CONFIG_INTR_REMAP
2084 write_msi_msg(irq, &msg); 2569error:
2570 destroy_irq(irq);
2571 return ret;
2572#endif
2573}
2085 2574
2086 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 2575int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
2576{
2577 int irq, ret, sub_handle;
2578 struct msi_desc *desc;
2579#ifdef CONFIG_INTR_REMAP
2580 struct intel_iommu *iommu = 0;
2581 int index = 0;
2582#endif
2583
2584 sub_handle = 0;
2585 list_for_each_entry(desc, &dev->msi_list, list) {
2586 irq = create_irq();
2587 if (irq < 0)
2588 return irq;
2589#ifdef CONFIG_INTR_REMAP
2590 if (!intr_remapping_enabled)
2591 goto no_ir;
2087 2592
2593 if (!sub_handle) {
2594 /*
2595 * allocate the consecutive block of IRTE's
2596 * for 'nvec'
2597 */
2598 index = msi_alloc_irte(dev, irq, nvec);
2599 if (index < 0) {
2600 ret = index;
2601 goto error;
2602 }
2603 } else {
2604 iommu = map_dev_to_ir(dev);
2605 if (!iommu) {
2606 ret = -ENOENT;
2607 goto error;
2608 }
2609 /*
2610 * setup the mapping between the irq and the IRTE
2611 * base index, the sub_handle pointing to the
2612 * appropriate interrupt remap table entry.
2613 */
2614 set_irte_irq(irq, iommu, index, sub_handle);
2615 }
2616no_ir:
2617#endif
2618 ret = setup_msi_irq(dev, desc, irq);
2619 if (ret < 0)
2620 goto error;
2621 sub_handle++;
2622 }
2088 return 0; 2623 return 0;
2624
2625error:
2626 destroy_irq(irq);
2627 return ret;
2089} 2628}
2090 2629
2091void arch_teardown_msi_irq(unsigned int irq) 2630void arch_teardown_msi_irq(unsigned int irq)
@@ -2333,6 +2872,10 @@ void __init setup_ioapic_dest(void)
2333 setup_IO_APIC_irq(ioapic, pin, irq, 2872 setup_IO_APIC_irq(ioapic, pin, irq,
2334 irq_trigger(irq_entry), 2873 irq_trigger(irq_entry),
2335 irq_polarity(irq_entry)); 2874 irq_polarity(irq_entry));
2875#ifdef CONFIG_INTR_REMAP
2876 else if (intr_remapping_enabled)
2877 set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
2878#endif
2336 else 2879 else
2337 set_ioapic_affinity_irq(irq, TARGET_CPUS); 2880 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2338 } 2881 }
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee1..9200a1e2752d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -74,6 +74,15 @@ void __init init_ISA_irqs (void)
74 } 74 }
75} 75}
76 76
77/*
78 * IRQ2 is cascade interrupt to second interrupt controller
79 */
80static struct irqaction irq2 = {
81 .handler = no_action,
82 .mask = CPU_MASK_NONE,
83 .name = "cascade",
84};
85
77/* Overridden in paravirt.c */ 86/* Overridden in paravirt.c */
78void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 87void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
79 88
@@ -98,6 +107,46 @@ void __init native_init_IRQ(void)
98 set_intr_gate(vector, interrupt[i]); 107 set_intr_gate(vector, interrupt[i]);
99 } 108 }
100 109
110#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
111 /*
112 * IRQ0 must be given a fixed assignment and initialized,
113 * because it's used before the IO-APIC is set up.
114 */
115 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
116
117 /*
118 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
119 * IPI, driven by wakeup.
120 */
121 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
122
123 /* IPI for invalidation */
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for single call function */
130 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
131#endif
132
133#ifdef CONFIG_X86_LOCAL_APIC
134 /* self generated IPI for local APIC timer */
135 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
136
137 /* IPI vectors for APIC spurious and error interrupts */
138 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
139 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
140#endif
141
142#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
143 /* thermal monitor LVT interrupt */
144 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
145#endif
146
147 if (!acpi_ioapic)
148 setup_irq(2, &irq2);
149
101 /* setup after call gates are initialised (usually add in 150 /* setup after call gates are initialised (usually add in
102 * the architecture specific gates) 151 * the architecture specific gates)
103 */ 152 */
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1f26fd9ec4f4..5b5be9d43c2a 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -135,7 +135,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
136}; 136};
137 137
138static void __init init_ISA_irqs (void) 138void __init init_ISA_irqs(void)
139{ 139{
140 int i; 140 int i;
141 141
@@ -164,22 +164,8 @@ static void __init init_ISA_irqs (void)
164 164
165void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 165void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
166 166
167void __init native_init_IRQ(void) 167static void __init smp_intr_init(void)
168{ 168{
169 int i;
170
171 init_ISA_irqs();
172 /*
173 * Cover the whole vector space, no vector can escape
174 * us. (some of these will be overridden and become
175 * 'special' SMP interrupts)
176 */
177 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
178 int vector = FIRST_EXTERNAL_VECTOR + i;
179 if (vector != IA32_SYSCALL_VECTOR)
180 set_intr_gate(vector, interrupt[i]);
181 }
182
183#ifdef CONFIG_SMP 169#ifdef CONFIG_SMP
184 /* 170 /*
185 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 171 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -207,6 +193,12 @@ void __init native_init_IRQ(void)
207 /* Low priority IPI to cleanup after moving an irq */ 193 /* Low priority IPI to cleanup after moving an irq */
208 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 194 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
209#endif 195#endif
196}
197
198static void __init apic_intr_init(void)
199{
200 smp_intr_init();
201
210 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 202 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
211 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 203 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
212 204
@@ -216,6 +208,25 @@ void __init native_init_IRQ(void)
216 /* IPI vectors for APIC spurious and error interrupts */ 208 /* IPI vectors for APIC spurious and error interrupts */
217 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 209 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
218 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 210 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
211}
212
213void __init native_init_IRQ(void)
214{
215 int i;
216
217 init_ISA_irqs();
218 /*
219 * Cover the whole vector space, no vector can escape
220 * us. (some of these will be overridden and become
221 * 'special' SMP interrupts)
222 */
223 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
224 int vector = FIRST_EXTERNAL_VECTOR + i;
225 if (vector != IA32_SYSCALL_VECTOR)
226 set_intr_gate(vector, interrupt[i]);
227 }
228
229 apic_intr_init();
219 230
220 if (!acpi_ioapic) 231 if (!acpi_ioapic)
221 setup_irq(2, &irq2); 232 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0ed5f939b905..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -52,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
53 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
54 54
55 paravirt_alloc_ldt(newldt, mincount);
56
55#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
56 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
57 wmb(); 59 wmb();
@@ -74,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
74#endif 76#endif
75 } 77 }
76 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
77 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
78 vfree(oldldt); 81 vfree(oldldt);
79 else 82 else
@@ -85,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
85static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
86{ 89{
87 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
88 92
89 if (err < 0) 93 if (err < 0)
90 return err; 94 return err;
91 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
92 return 0; 98 return 0;
93} 99}
94 100
@@ -125,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
125 if (mm == current->active_mm) 131 if (mm == current->active_mm)
126 clear_LDT(); 132 clear_LDT();
127#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
128 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
129 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
130 else 137 else
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 652fa5c38ebe..000000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,853 +0,0 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to speculative
59 * nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73
74//#define DEBUG /* pr_debug */
75#include <linux/capability.h>
76#include <linux/kernel.h>
77#include <linux/init.h>
78#include <linux/sched.h>
79#include <linux/smp_lock.h>
80#include <linux/cpumask.h>
81#include <linux/module.h>
82#include <linux/slab.h>
83#include <linux/vmalloc.h>
84#include <linux/miscdevice.h>
85#include <linux/spinlock.h>
86#include <linux/mm.h>
87#include <linux/fs.h>
88#include <linux/mutex.h>
89#include <linux/cpu.h>
90#include <linux/firmware.h>
91#include <linux/platform_device.h>
92
93#include <asm/msr.h>
94#include <asm/uaccess.h>
95#include <asm/processor.h>
96
97MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
98MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
99MODULE_LICENSE("GPL");
100
101#define MICROCODE_VERSION "1.14a"
102
103#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
104#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
105#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
106#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
107#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
108#define DWSIZE (sizeof (u32))
109#define get_totalsize(mc) \
110 (((microcode_t *)mc)->hdr.totalsize ? \
111 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
112#define get_datasize(mc) \
113 (((microcode_t *)mc)->hdr.datasize ? \
114 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
115
116#define sigmatch(s1, s2, p1, p2) \
117 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
118
119#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
120
121/* serialize access to the physical write to MSR 0x79 */
122static DEFINE_SPINLOCK(microcode_update_lock);
123
124/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
125static DEFINE_MUTEX(microcode_mutex);
126
127static struct ucode_cpu_info {
128 int valid;
129 unsigned int sig;
130 unsigned int pf;
131 unsigned int rev;
132 microcode_t *mc;
133} ucode_cpu_info[NR_CPUS];
134
135static void collect_cpu_info(int cpu_num)
136{
137 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
138 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
139 unsigned int val[2];
140
141 /* We should bind the task to the CPU */
142 BUG_ON(raw_smp_processor_id() != cpu_num);
143 uci->pf = uci->rev = 0;
144 uci->mc = NULL;
145 uci->valid = 1;
146
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
150 "processor\n", cpu_num);
151 uci->valid = 0;
152 return;
153 }
154
155 uci->sig = cpuid_eax(0x00000001);
156
157 if ((c->x86_model >= 5) || (c->x86 > 6)) {
158 /* get processor flags from MSR 0x17 */
159 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
160 uci->pf = 1 << ((val[1] >> 18) & 7);
161 }
162
163 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
164 /* see notes above for revision 1.07. Apparent chip bug */
165 sync_core();
166 /* get the current revision from MSR 0x8B */
167 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
168 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
169 uci->sig, uci->pf, uci->rev);
170}
171
172static inline int microcode_update_match(int cpu_num,
173 microcode_header_t *mc_header, int sig, int pf)
174{
175 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
176
177 if (!sigmatch(sig, uci->sig, pf, uci->pf)
178 || mc_header->rev <= uci->rev)
179 return 0;
180 return 1;
181}
182
183static int microcode_sanity_check(void *mc)
184{
185 microcode_header_t *mc_header = mc;
186 struct extended_sigtable *ext_header = NULL;
187 struct extended_signature *ext_sig;
188 unsigned long total_size, data_size, ext_table_size;
189 int sum, orig_sum, ext_sigcount = 0, i;
190
191 total_size = get_totalsize(mc_header);
192 data_size = get_datasize(mc_header);
193 if (data_size + MC_HEADER_SIZE > total_size) {
194 printk(KERN_ERR "microcode: error! "
195 "Bad data size in microcode data file\n");
196 return -EINVAL;
197 }
198
199 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
200 printk(KERN_ERR "microcode: error! "
201 "Unknown microcode update format\n");
202 return -EINVAL;
203 }
204 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
205 if (ext_table_size) {
206 if ((ext_table_size < EXT_HEADER_SIZE)
207 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
208 printk(KERN_ERR "microcode: error! "
209 "Small exttable size in microcode data file\n");
210 return -EINVAL;
211 }
212 ext_header = mc + MC_HEADER_SIZE + data_size;
213 if (ext_table_size != exttable_size(ext_header)) {
214 printk(KERN_ERR "microcode: error! "
215 "Bad exttable size in microcode data file\n");
216 return -EFAULT;
217 }
218 ext_sigcount = ext_header->count;
219 }
220
221 /* check extended table checksum */
222 if (ext_table_size) {
223 int ext_table_sum = 0;
224 int *ext_tablep = (int *)ext_header;
225
226 i = ext_table_size / DWSIZE;
227 while (i--)
228 ext_table_sum += ext_tablep[i];
229 if (ext_table_sum) {
230 printk(KERN_WARNING "microcode: aborting, "
231 "bad extended signature table checksum\n");
232 return -EINVAL;
233 }
234 }
235
236 /* calculate the checksum */
237 orig_sum = 0;
238 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
239 while (i--)
240 orig_sum += ((int *)mc)[i];
241 if (orig_sum) {
242 printk(KERN_ERR "microcode: aborting, bad checksum\n");
243 return -EINVAL;
244 }
245 if (!ext_table_size)
246 return 0;
247 /* check extended signature checksum */
248 for (i = 0; i < ext_sigcount; i++) {
249 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
250 EXT_SIGNATURE_SIZE * i;
251 sum = orig_sum
252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
254 if (sum) {
255 printk(KERN_ERR "microcode: aborting, bad checksum\n");
256 return -EINVAL;
257 }
258 }
259 return 0;
260}
261
262/*
263 * return 0 - no update found
264 * return 1 - found update
265 * return < 0 - error
266 */
267static int get_maching_microcode(void *mc, int cpu)
268{
269 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
270 microcode_header_t *mc_header = mc;
271 struct extended_sigtable *ext_header;
272 unsigned long total_size = get_totalsize(mc_header);
273 int ext_sigcount, i;
274 struct extended_signature *ext_sig;
275 void *new_mc;
276
277 if (microcode_update_match(cpu, mc_header,
278 mc_header->sig, mc_header->pf))
279 goto find;
280
281 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
282 return 0;
283
284 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
285 ext_sigcount = ext_header->count;
286 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU%d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341 printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 printk(KERN_INFO "microcode: CPU%d updated from revision "
346 "0x%x to 0x%x, date = %08x \n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size; /* it's size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364 printk(KERN_ERR "microcode: error! Can not read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377 printk(KERN_ERR "microcode: error! Can not read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399 * It's possible the data file has multiple matching ucode,
400 * lets keep searching till the latest version
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed_ptr(current, &old);
422 return error;
423}
424
425static int microcode_open (struct inode *unused1, struct file *unused2)
426{
427 cycle_kernel_lock();
428 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
429}
430
431static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
432{
433 ssize_t ret;
434
435 if ((len >> PAGE_SHIFT) > num_physpages) {
436 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
437 return -EINVAL;
438 }
439
440 get_online_cpus();
441 mutex_lock(&microcode_mutex);
442
443 user_buffer = (void __user *) buf;
444 user_buffer_size = (int) len;
445
446 ret = do_microcode_update();
447 if (!ret)
448 ret = (ssize_t)len;
449
450 mutex_unlock(&microcode_mutex);
451 put_online_cpus();
452
453 return ret;
454}
455
456static const struct file_operations microcode_fops = {
457 .owner = THIS_MODULE,
458 .write = microcode_write,
459 .open = microcode_open,
460};
461
462static struct miscdevice microcode_dev = {
463 .minor = MICROCODE_MINOR,
464 .name = "microcode",
465 .fops = &microcode_fops,
466};
467
468static int __init microcode_dev_init (void)
469{
470 int error;
471
472 error = misc_register(&microcode_dev);
473 if (error) {
474 printk(KERN_ERR
475 "microcode: can't misc_register on minor=%d\n",
476 MICROCODE_MINOR);
477 return error;
478 }
479
480 return 0;
481}
482
483static void microcode_dev_exit (void)
484{
485 misc_deregister(&microcode_dev);
486}
487
488MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
489#else
490#define microcode_dev_init() 0
491#define microcode_dev_exit() do { } while(0)
492#endif
493
494static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
495 unsigned long size, long offset)
496{
497 microcode_header_t *mc_header;
498 unsigned long total_size;
499
500 /* No more data */
501 if (offset >= size)
502 return 0;
503 mc_header = (microcode_header_t *)(buf + offset);
504 total_size = get_totalsize(mc_header);
505
506 if (offset + total_size > size) {
507 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
508 return -EINVAL;
509 }
510
511 *mc = vmalloc(total_size);
512 if (!*mc) {
513 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
514 return -ENOMEM;
515 }
516 memcpy(*mc, buf + offset, total_size);
517 return offset + total_size;
518}
519
520/* fake device for request_firmware */
521static struct platform_device *microcode_pdev;
522
523static int cpu_request_microcode(int cpu)
524{
525 char name[30];
526 struct cpuinfo_x86 *c = &cpu_data(cpu);
527 const struct firmware *firmware;
528 const u8 *buf;
529 unsigned long size;
530 long offset = 0;
531 int error;
532 void *mc;
533
534 /* We should bind the task to the CPU */
535 BUG_ON(cpu != raw_smp_processor_id());
536 sprintf(name,"intel-ucode/%02x-%02x-%02x",
537 c->x86, c->x86_model, c->x86_mask);
538 error = request_firmware(&firmware, name, &microcode_pdev->dev);
539 if (error) {
540 pr_debug("microcode: data file %s load failed\n", name);
541 return error;
542 }
543 buf = firmware->data;
544 size = firmware->size;
545 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
546 > 0) {
547 error = microcode_sanity_check(mc);
548 if (error)
549 break;
550 error = get_maching_microcode(mc, cpu);
551 if (error < 0)
552 break;
553 /*
554 * It's possible the data file has multiple matching ucode,
555 * lets keep searching till the latest version
556 */
557 if (error == 1) {
558 apply_microcode(cpu);
559 error = 0;
560 }
561 vfree(mc);
562 }
563 if (offset > 0)
564 vfree(mc);
565 if (offset < 0)
566 error = offset;
567 release_firmware(firmware);
568
569 return error;
570}
571
572static int apply_microcode_check_cpu(int cpu)
573{
574 struct cpuinfo_x86 *c = &cpu_data(cpu);
575 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
576 cpumask_t old;
577 unsigned int val[2];
578 int err = 0;
579
580 /* Check if the microcode is available */
581 if (!uci->mc)
582 return 0;
583
584 old = current->cpus_allowed;
585 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
586
587 /* Check if the microcode we have in memory matches the CPU */
588 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
589 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
590 err = -EINVAL;
591
592 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
593 /* get processor flags from MSR 0x17 */
594 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
595 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
596 err = -EINVAL;
597 }
598
599 if (!err) {
600 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
601 /* see notes above for revision 1.07. Apparent chip bug */
602 sync_core();
603 /* get the current revision from MSR 0x8B */
604 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
605 if (uci->rev != val[1])
606 err = -EINVAL;
607 }
608
609 if (!err)
610 apply_microcode(cpu);
611 else
612 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
613 " sig=0x%x, pf=0x%x, rev=0x%x\n",
614 cpu, uci->sig, uci->pf, uci->rev);
615
616 set_cpus_allowed_ptr(current, &old);
617 return err;
618}
619
620static void microcode_init_cpu(int cpu, int resume)
621{
622 cpumask_t old;
623 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
624
625 old = current->cpus_allowed;
626
627 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
628 mutex_lock(&microcode_mutex);
629 collect_cpu_info(cpu);
630 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
631 cpu_request_microcode(cpu);
632 mutex_unlock(&microcode_mutex);
633 set_cpus_allowed_ptr(current, &old);
634}
635
636static void microcode_fini_cpu(int cpu)
637{
638 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
639
640 mutex_lock(&microcode_mutex);
641 uci->valid = 0;
642 vfree(uci->mc);
643 uci->mc = NULL;
644 mutex_unlock(&microcode_mutex);
645}
646
647static ssize_t reload_store(struct sys_device *dev,
648 struct sysdev_attribute *attr,
649 const char *buf, size_t sz)
650{
651 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
652 char *end;
653 unsigned long val = simple_strtoul(buf, &end, 0);
654 int err = 0;
655 int cpu = dev->id;
656
657 if (end == buf)
658 return -EINVAL;
659 if (val == 1) {
660 cpumask_t old = current->cpus_allowed;
661
662 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
664
665 mutex_lock(&microcode_mutex);
666 if (uci->valid)
667 err = cpu_request_microcode(cpu);
668 mutex_unlock(&microcode_mutex);
669 put_online_cpus();
670 set_cpus_allowed_ptr(current, &old);
671 }
672 if (err)
673 return err;
674 return sz;
675}
676
677static ssize_t version_show(struct sys_device *dev,
678 struct sysdev_attribute *attr, char *buf)
679{
680 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
681
682 return sprintf(buf, "0x%x\n", uci->rev);
683}
684
685static ssize_t pf_show(struct sys_device *dev,
686 struct sysdev_attribute *attr, char *buf)
687{
688 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
689
690 return sprintf(buf, "0x%x\n", uci->pf);
691}
692
693static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
694static SYSDEV_ATTR(version, 0400, version_show, NULL);
695static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
696
697static struct attribute *mc_default_attrs[] = {
698 &attr_reload.attr,
699 &attr_version.attr,
700 &attr_processor_flags.attr,
701 NULL
702};
703
704static struct attribute_group mc_attr_group = {
705 .attrs = mc_default_attrs,
706 .name = "microcode",
707};
708
709static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
710{
711 int err, cpu = sys_dev->id;
712 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
713
714 if (!cpu_online(cpu))
715 return 0;
716
717 pr_debug("microcode: CPU%d added\n", cpu);
718 memset(uci, 0, sizeof(*uci));
719
720 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
721 if (err)
722 return err;
723
724 microcode_init_cpu(cpu, resume);
725
726 return 0;
727}
728
729static int mc_sysdev_add(struct sys_device *sys_dev)
730{
731 return __mc_sysdev_add(sys_dev, 0);
732}
733
734static int mc_sysdev_remove(struct sys_device *sys_dev)
735{
736 int cpu = sys_dev->id;
737
738 if (!cpu_online(cpu))
739 return 0;
740
741 pr_debug("microcode: CPU%d removed\n", cpu);
742 microcode_fini_cpu(cpu);
743 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
744 return 0;
745}
746
747static int mc_sysdev_resume(struct sys_device *dev)
748{
749 int cpu = dev->id;
750
751 if (!cpu_online(cpu))
752 return 0;
753 pr_debug("microcode: CPU%d resumed\n", cpu);
754 /* only CPU 0 will apply ucode here */
755 apply_microcode(0);
756 return 0;
757}
758
759static struct sysdev_driver mc_sysdev_driver = {
760 .add = mc_sysdev_add,
761 .remove = mc_sysdev_remove,
762 .resume = mc_sysdev_resume,
763};
764
765static __cpuinit int
766mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
767{
768 unsigned int cpu = (unsigned long)hcpu;
769 struct sys_device *sys_dev;
770
771 sys_dev = get_cpu_sysdev(cpu);
772 switch (action) {
773 case CPU_UP_CANCELED_FROZEN:
774 /* The CPU refused to come up during a system resume */
775 microcode_fini_cpu(cpu);
776 break;
777 case CPU_ONLINE:
778 case CPU_DOWN_FAILED:
779 mc_sysdev_add(sys_dev);
780 break;
781 case CPU_ONLINE_FROZEN:
782 /* System-wide resume is in progress, try to apply microcode */
783 if (apply_microcode_check_cpu(cpu)) {
784 /* The application of microcode failed */
785 microcode_fini_cpu(cpu);
786 __mc_sysdev_add(sys_dev, 1);
787 break;
788 }
789 case CPU_DOWN_FAILED_FROZEN:
790 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
791 printk(KERN_ERR "microcode: Failed to create the sysfs "
792 "group for CPU%d\n", cpu);
793 break;
794 case CPU_DOWN_PREPARE:
795 mc_sysdev_remove(sys_dev);
796 break;
797 case CPU_DOWN_PREPARE_FROZEN:
798 /* Suspend is in progress, only remove the interface */
799 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
800 break;
801 }
802 return NOTIFY_OK;
803}
804
805static struct notifier_block __refdata mc_cpu_notifier = {
806 .notifier_call = mc_cpu_callback,
807};
808
809static int __init microcode_init (void)
810{
811 int error;
812
813 printk(KERN_INFO
814 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
815
816 error = microcode_dev_init();
817 if (error)
818 return error;
819 microcode_pdev = platform_device_register_simple("microcode", -1,
820 NULL, 0);
821 if (IS_ERR(microcode_pdev)) {
822 microcode_dev_exit();
823 return PTR_ERR(microcode_pdev);
824 }
825
826 get_online_cpus();
827 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
828 put_online_cpus();
829 if (error) {
830 microcode_dev_exit();
831 platform_device_unregister(microcode_pdev);
832 return error;
833 }
834
835 register_hotcpu_notifier(&mc_cpu_notifier);
836 return 0;
837}
838
839static void __exit microcode_exit (void)
840{
841 microcode_dev_exit();
842
843 unregister_hotcpu_notifier(&mc_cpu_notifier);
844
845 get_online_cpus();
846 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
847 put_online_cpus();
848
849 platform_device_unregister(microcode_pdev);
850}
851
852module_init(microcode_init)
853module_exit(microcode_exit)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 000000000000..7a1f8eeac2c7
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
1/*
2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 *
5 * Author: Peter Oruba <peter.oruba@amd.com>
6 *
7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 *
10 * This driver allows to upgrade microcode on AMD
11 * family 0x10 and 0x11 processors.
12 *
13 * Licensed unter the terms of the GNU General Public
14 * License version 2. See file COPYING for details.
15*/
16
17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h>
26#include <linux/spinlock.h>
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/mutex.h>
30#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h>
34#include <linux/pci_ids.h>
35
36#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h>
39#include <asm/microcode.h>
40
41MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
43MODULE_LICENSE("GPL v2");
44
45#define UCODE_MAGIC 0x00414d44
46#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
47#define UCODE_UCODE_TYPE 0x00000001
48
49struct equiv_cpu_entry {
50 unsigned int installed_cpu;
51 unsigned int fixed_errata_mask;
52 unsigned int fixed_errata_compare;
53 unsigned int equiv_cpu;
54};
55
56struct microcode_header_amd {
57 unsigned int data_code;
58 unsigned int patch_id;
59 unsigned char mc_patch_data_id[2];
60 unsigned char mc_patch_data_len;
61 unsigned char init_flag;
62 unsigned int mc_patch_data_checksum;
63 unsigned int nb_dev_id;
64 unsigned int sb_dev_id;
65 unsigned char processor_rev_id[2];
66 unsigned char nb_rev_id;
67 unsigned char sb_rev_id;
68 unsigned char bios_api_rev;
69 unsigned char reserved1[3];
70 unsigned int match_reg[8];
71};
72
73struct microcode_amd {
74 struct microcode_header_amd hdr;
75 unsigned int mpb[0];
76};
77
78#define UCODE_MAX_SIZE (2048)
79#define DEFAULT_UCODE_DATASIZE (896)
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87
88/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock);
90
91static struct equiv_cpu_entry *equiv_cpu_table;
92
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{
95 struct cpuinfo_x86 *c = &cpu_data(cpu);
96
97 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
101 cpu);
102 return -1;
103 }
104
105 asm volatile("movl %1, %%ecx; rdmsr"
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0;
113}
114
115static int get_matching_microcode(int cpu, void *mc, int rev)
116{
117 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00;
121 unsigned int i = 0;
122
123 BUG_ON(equiv_cpu_table == NULL);
124 current_cpu_id = cpuid_eax(0x00000001);
125
126 while (equiv_cpu_table[i].installed_cpu != 0) {
127 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
128 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
129 break;
130 }
131 i++;
132 }
133
134 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id "
136 "not found in equivalent cpu table \n", cpu);
137 return 0;
138 }
139
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
141 printk(KERN_ERR
142 "microcode: CPU%d patch does not match "
143 "(patch is %x, cpu extended is %x) \n",
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0;
147 }
148
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
150 printk(KERN_ERR "microcode: CPU%d patch does not match "
151 "(patch is %x, cpu base id is %x) \n",
152 cpu, mc_header->processor_rev_id[1],
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0;
156 }
157
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev)
187 return 0;
188
189 return 1;
190}
191
192static void apply_microcode_amd(int cpu)
193{
194 unsigned long flags;
195 unsigned int eax, edx;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201
202 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu);
204
205 if (mc_amd == NULL)
206 return;
207
208 spin_lock_irqsave(&microcode_update_lock, flags);
209
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr"
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags);
223
224 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision "
227 "0x%x to 0x%x failed\n", cpu_num,
228 mc_amd->hdr.patch_id, rev);
229 return;
230 }
231
232 printk(KERN_INFO "microcode: CPU%d updated from revision "
233 "0x%x to 0x%x \n",
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235
236 uci->cpu_sig.rev = rev;
237}
238
239static void * get_next_ucode(u8 *buf, unsigned int size,
240 int (*get_ucode_data)(void *, const void *, size_t),
241 unsigned int *mc_size)
242{
243 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc;
247
248 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
249 return NULL;
250
251 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! "
253 "Wrong microcode payload type field\n");
254 return NULL;
255 }
256
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258
259 printk(KERN_INFO "microcode: size %u, total_size %u\n",
260 size, total_size);
261
262 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
264 return NULL;
265 }
266
267 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
271 vfree(mc);
272 mc = NULL;
273 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc;
278}
279
280
281static int install_equiv_cpu_table(u8 *buf,
282 int (*get_ucode_data)(void *, const void *, size_t))
283{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size;
288
289 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
290 return 0;
291
292 size = buf_pos[2];
293
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! "
296 "Wrong microcode equivalnet cpu table\n");
297 return 0;
298 }
299
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
303 return 0;
304 }
305
306 buf += UCODE_CONTAINER_HEADER_SIZE;
307 if (get_ucode_data(equiv_cpu_table, buf, size)) {
308 vfree(equiv_cpu_table);
309 return 0;
310 }
311
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314}
315
316static void free_equiv_cpu_table(void)
317{
318 if (equiv_cpu_table) {
319 vfree(equiv_cpu_table);
320 equiv_cpu_table = NULL;
321 }
322}
323
324static int generic_load_microcode(int cpu, void *data, size_t size,
325 int (*get_ucode_data)(void *, const void *, size_t))
326{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
329 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover;
331 unsigned long offset;
332
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
334 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
336 return -EINVAL;
337 }
338
339 ucode_ptr += offset;
340 leftover = size - offset;
341
342 while (leftover) {
343 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header;
345
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
347 if (!mc)
348 break;
349
350 mc_header = (struct microcode_header_amd *)mc;
351 if (get_matching_microcode(cpu, mc, new_rev)) {
352 if (new_mc)
353 vfree(new_mc);
354 new_rev = mc_header->patch_id;
355 new_mc = mc;
356 } else
357 vfree(mc);
358
359 ucode_ptr += mc_size;
360 leftover -= mc_size;
361 }
362
363 if (new_mc) {
364 if (!leftover) {
365 if (uci->mc)
366 vfree(uci->mc);
367 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with"
369 " version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev);
371 } else
372 vfree(new_mc);
373 }
374
375 free_equiv_cpu_table();
376
377 return (int)leftover;
378}
379
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device)
387{
388 const char *fw_name = "amd-ucode/microcode_amd.bin";
389 const struct firmware *firmware;
390 int ret;
391
392 /* We should bind the task to the CPU */
393 BUG_ON(cpu != raw_smp_processor_id());
394
395 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
398 return ret;
399 }
400
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
402 &get_ucode_fw);
403
404 release_firmware(firmware);
405
406 return ret;
407}
408
409static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
412 "is not supported\n");
413 return -1;
414}
415
416static void microcode_fini_cpu_amd(int cpu)
417{
418 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
419
420 vfree(uci->mc);
421 uci->mc = NULL;
422}
423
424static struct microcode_ops microcode_amd_ops = {
425 .request_microcode_user = request_microcode_user,
426 .request_microcode_fw = request_microcode_fw,
427 .collect_cpu_info = collect_cpu_info_amd,
428 .apply_microcode = apply_microcode_amd,
429 .microcode_fini_cpu = microcode_fini_cpu_amd,
430};
431
432struct microcode_ops * __init init_amd_microcode(void)
433{
434 return &microcode_amd_ops;
435}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 000000000000..936d8d55f230
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100#define MICROCODE_VERSION "2.00"
101
102struct microcode_ops *microcode_ops;
103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex);
106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size)
112{
113 cpumask_t old;
114 int error = 0;
115 int cpu;
116
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121
122 if (!uci->valid)
123 continue;
124
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0)
128 goto out;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error;
135}
136
137static int microcode_open(struct inode *unused1, struct file *unused2)
138{
139 cycle_kernel_lock();
140 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
141}
142
143static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos)
145{
146 ssize_t ret;
147
148 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
150 num_physpages);
151 return -EINVAL;
152 }
153
154 get_online_cpus();
155 mutex_lock(&microcode_mutex);
156
157 ret = do_microcode_update(buf, len);
158 if (!ret)
159 ret = (ssize_t)len;
160
161 mutex_unlock(&microcode_mutex);
162 put_online_cpus();
163
164 return ret;
165}
166
167static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE,
169 .write = microcode_write,
170 .open = microcode_open,
171};
172
173static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR,
175 .name = "microcode",
176 .fops = &microcode_fops,
177};
178
179static int __init microcode_dev_init(void)
180{
181 int error;
182
183 error = misc_register(&microcode_dev);
184 if (error) {
185 printk(KERN_ERR
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error;
189 }
190
191 return 0;
192}
193
194static void microcode_dev_exit(void)
195{
196 misc_deregister(&microcode_dev);
197}
198
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else
201#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0)
203#endif
204
205/* fake device for request_firmware */
206struct platform_device *microcode_pdev;
207
208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr,
210 const char *buf, size_t sz)
211{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0;
216 int cpu = dev->id;
217
218 if (end == buf)
219 return -EINVAL;
220 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus();
224 if (cpu_online(cpu)) {
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus();
237 }
238 if (err)
239 return err;
240 return sz;
241}
242
243static ssize_t version_show(struct sys_device *dev,
244 struct sysdev_attribute *attr, char *buf)
245{
246 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
247
248 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
249}
250
251static ssize_t pf_show(struct sys_device *dev,
252 struct sysdev_attribute *attr, char *buf)
253{
254 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
255
256 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
257}
258
259static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
260static SYSDEV_ATTR(version, 0400, version_show, NULL);
261static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
262
263static struct attribute *mc_default_attrs[] = {
264 &attr_reload.attr,
265 &attr_version.attr,
266 &attr_processor_flags.attr,
267 NULL
268};
269
270static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs,
272 .name = "microcode",
273};
274
275static void microcode_fini_cpu(int cpu)
276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0;
282 mutex_unlock(&microcode_mutex);
283}
284
285static void collect_cpu_info(int cpu)
286{
287 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
288
289 memset(uci, 0, sizeof(*uci));
290 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
291 uci->valid = 1;
292}
293
294static int microcode_resume_cpu(int cpu)
295{
296 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
297 struct cpu_signature nsig;
298
299 pr_debug("microcode: CPU%d resumed\n", cpu);
300
301 if (!uci->mc)
302 return 1;
303
304 /*
305 * Let's verify that the 'cached' ucode does belong
306 * to this cpu (a bit of paranoia):
307 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu);
310 return -1;
311 }
312
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
314 microcode_fini_cpu(cpu);
315 /* Should we look for a new ucode here? */
316 return 1;
317 }
318
319 return 0;
320}
321
322void microcode_update_cpu(int cpu)
323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0;
326
327 /*
328 * Check if the system resume is in progress (uci->valid != NULL),
329 * otherwise just request a firmware:
330 */
331 if (uci->valid) {
332 err = microcode_resume_cpu(cpu);
333 } else {
334 collect_cpu_info(cpu);
335 if (uci->valid && system_state == SYSTEM_RUNNING)
336 err = microcode_ops->request_microcode_fw(cpu,
337 &microcode_pdev->dev);
338 }
339 if (!err)
340 microcode_ops->apply_microcode(cpu);
341}
342
343static void microcode_init_cpu(int cpu)
344{
345 cpumask_t old = current->cpus_allowed;
346
347 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
348 /* We should bind the task to the CPU */
349 BUG_ON(raw_smp_processor_id() != cpu);
350
351 mutex_lock(&microcode_mutex);
352 microcode_update_cpu(cpu);
353 mutex_unlock(&microcode_mutex);
354
355 set_cpus_allowed_ptr(current, &old);
356}
357
358static int mc_sysdev_add(struct sys_device *sys_dev)
359{
360 int err, cpu = sys_dev->id;
361 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
362
363 if (!cpu_online(cpu))
364 return 0;
365
366 pr_debug("microcode: CPU%d added\n", cpu);
367 memset(uci, 0, sizeof(*uci));
368
369 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
370 if (err)
371 return err;
372
373 microcode_init_cpu(cpu);
374 return 0;
375}
376
377static int mc_sysdev_remove(struct sys_device *sys_dev)
378{
379 int cpu = sys_dev->id;
380
381 if (!cpu_online(cpu))
382 return 0;
383
384 pr_debug("microcode: CPU%d removed\n", cpu);
385 microcode_fini_cpu(cpu);
386 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
387 return 0;
388}
389
390static int mc_sysdev_resume(struct sys_device *dev)
391{
392 int cpu = dev->id;
393
394 if (!cpu_online(cpu))
395 return 0;
396
397 /* only CPU 0 will apply ucode here */
398 microcode_update_cpu(0);
399 return 0;
400}
401
402static struct sysdev_driver mc_sysdev_driver = {
403 .add = mc_sysdev_add,
404 .remove = mc_sysdev_remove,
405 .resume = mc_sysdev_resume,
406};
407
408static __cpuinit int
409mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
410{
411 unsigned int cpu = (unsigned long)hcpu;
412 struct sys_device *sys_dev;
413
414 sys_dev = get_cpu_sysdev(cpu);
415 switch (action) {
416 case CPU_ONLINE:
417 case CPU_ONLINE_FROZEN:
418 microcode_init_cpu(cpu);
419 case CPU_DOWN_FAILED:
420 case CPU_DOWN_FAILED_FROZEN:
421 pr_debug("microcode: CPU%d added\n", cpu);
422 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
423 printk(KERN_ERR "microcode: Failed to create the sysfs "
424 "group for CPU%d\n", cpu);
425 break;
426 case CPU_DOWN_PREPARE:
427 case CPU_DOWN_PREPARE_FROZEN:
428 /* Suspend is in progress, only remove the interface */
429 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
430 pr_debug("microcode: CPU%d removed\n", cpu);
431 break;
432 case CPU_DEAD:
433 case CPU_UP_CANCELED_FROZEN:
434 /* The CPU refused to come up during a system resume */
435 microcode_fini_cpu(cpu);
436 break;
437 }
438 return NOTIFY_OK;
439}
440
441static struct notifier_block __refdata mc_cpu_notifier = {
442 .notifier_call = mc_cpu_callback,
443};
444
445static int __init microcode_init(void)
446{
447 struct cpuinfo_x86 *c = &cpu_data(0);
448 int error;
449
450 if (c->x86_vendor == X86_VENDOR_INTEL)
451 microcode_ops = init_intel_microcode();
452 else if (c->x86_vendor == X86_VENDOR_AMD)
453 microcode_ops = init_amd_microcode();
454
455 if (!microcode_ops) {
456 printk(KERN_ERR "microcode: no support for this CPU vendor\n");
457 return -ENODEV;
458 }
459
460 error = microcode_dev_init();
461 if (error)
462 return error;
463 microcode_pdev = platform_device_register_simple("microcode", -1,
464 NULL, 0);
465 if (IS_ERR(microcode_pdev)) {
466 microcode_dev_exit();
467 return PTR_ERR(microcode_pdev);
468 }
469
470 get_online_cpus();
471 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
472 put_online_cpus();
473 if (error) {
474 microcode_dev_exit();
475 platform_device_unregister(microcode_pdev);
476 return error;
477 }
478
479 register_hotcpu_notifier(&mc_cpu_notifier);
480
481 printk(KERN_INFO
482 "Microcode Update Driver: v" MICROCODE_VERSION
483 " <tigran@aivazian.fsnet.co.uk>"
484 " <peter.oruba@amd.com>\n");
485
486 return 0;
487}
488
489static void __exit microcode_exit(void)
490{
491 microcode_dev_exit();
492
493 unregister_hotcpu_notifier(&mc_cpu_notifier);
494
495 get_online_cpus();
496 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
497 put_online_cpus();
498
499 platform_device_unregister(microcode_pdev);
500
501 microcode_ops = NULL;
502
503 printk(KERN_INFO
504 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
505}
506
507module_init(microcode_init);
508module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 000000000000..622dc4a21784
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100struct microcode_header_intel {
101 unsigned int hdrver;
102 unsigned int rev;
103 unsigned int date;
104 unsigned int sig;
105 unsigned int cksum;
106 unsigned int ldrver;
107 unsigned int pf;
108 unsigned int datasize;
109 unsigned int totalsize;
110 unsigned int reserved[3];
111};
112
113struct microcode_intel {
114 struct microcode_header_intel hdr;
115 unsigned int bits[0];
116};
117
118/* microcode format is extended from prescott processors */
119struct extended_signature {
120 unsigned int sig;
121 unsigned int pf;
122 unsigned int cksum;
123};
124
125struct extended_sigtable {
126 unsigned int count;
127 unsigned int cksum;
128 unsigned int reserved[3];
129 struct extended_signature sigs[0];
130};
131
132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32))
138#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \
141 DEFAULT_UCODE_TOTALSIZE)
142
143#define get_datasize(mc) \
144 (((struct microcode_intel *)mc)->hdr.datasize ? \
145 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
146
147#define sigmatch(s1, s2, p1, p2) \
148 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
149
150#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151
152/* serialize access to the physical write to MSR 0x79 */
153static DEFINE_SPINLOCK(microcode_update_lock);
154
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned int val[2];
159
160 memset(csig, 0, sizeof(*csig));
161
162 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
163 cpu_has(c, X86_FEATURE_IA64)) {
164 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
165 "processor\n", cpu_num);
166 return -1;
167 }
168
169 csig->sig = cpuid_eax(0x00000001);
170
171 if ((c->x86_model >= 5) || (c->x86 > 6)) {
172 /* get processor flags from MSR 0x17 */
173 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
174 csig->pf = 1 << ((val[1] >> 18) & 7);
175 }
176
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core();
180 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev);
184
185 return 0;
186}
187
188static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
189{
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191}
192
193static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{
196 return (mc_header->rev <= rev) ? 0 : 1;
197}
198
199static int microcode_sanity_check(void *mc)
200{
201 struct microcode_header_intel *mc_header = mc;
202 struct extended_sigtable *ext_header = NULL;
203 struct extended_signature *ext_sig;
204 unsigned long total_size, data_size, ext_table_size;
205 int sum, orig_sum, ext_sigcount = 0, i;
206
207 total_size = get_totalsize(mc_header);
208 data_size = get_datasize(mc_header);
209 if (data_size + MC_HEADER_SIZE > total_size) {
210 printk(KERN_ERR "microcode: error! "
211 "Bad data size in microcode data file\n");
212 return -EINVAL;
213 }
214
215 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
216 printk(KERN_ERR "microcode: error! "
217 "Unknown microcode update format\n");
218 return -EINVAL;
219 }
220 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
221 if (ext_table_size) {
222 if ((ext_table_size < EXT_HEADER_SIZE)
223 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
224 printk(KERN_ERR "microcode: error! "
225 "Small exttable size in microcode data file\n");
226 return -EINVAL;
227 }
228 ext_header = mc + MC_HEADER_SIZE + data_size;
229 if (ext_table_size != exttable_size(ext_header)) {
230 printk(KERN_ERR "microcode: error! "
231 "Bad exttable size in microcode data file\n");
232 return -EFAULT;
233 }
234 ext_sigcount = ext_header->count;
235 }
236
237 /* check extended table checksum */
238 if (ext_table_size) {
239 int ext_table_sum = 0;
240 int *ext_tablep = (int *)ext_header;
241
242 i = ext_table_size / DWSIZE;
243 while (i--)
244 ext_table_sum += ext_tablep[i];
245 if (ext_table_sum) {
246 printk(KERN_WARNING "microcode: aborting, "
247 "bad extended signature table checksum\n");
248 return -EINVAL;
249 }
250 }
251
252 /* calculate the checksum */
253 orig_sum = 0;
254 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
255 while (i--)
256 orig_sum += ((int *)mc)[i];
257 if (orig_sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n");
259 return -EINVAL;
260 }
261 if (!ext_table_size)
262 return 0;
263 /* check extended signature checksum */
264 for (i = 0; i < ext_sigcount; i++) {
265 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
266 EXT_SIGNATURE_SIZE * i;
267 sum = orig_sum
268 - (mc_header->sig + mc_header->pf + mc_header->cksum)
269 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
270 if (sum) {
271 printk(KERN_ERR "microcode: aborting, bad checksum\n");
272 return -EINVAL;
273 }
274 }
275 return 0;
276}
277
278/*
279 * return 0 - no update found
280 * return 1 - found update
281 */
282static int
283get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
284{
285 struct microcode_header_intel *mc_header = mc;
286 struct extended_sigtable *ext_header;
287 unsigned long total_size = get_totalsize(mc_header);
288 int ext_sigcount, i;
289 struct extended_signature *ext_sig;
290
291 if (!update_match_revision(mc_header, rev))
292 return 0;
293
294 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
295 return 1;
296
297 /* Look for ext. headers: */
298 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
299 return 0;
300
301 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
302 ext_sigcount = ext_header->count;
303 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
304
305 for (i = 0; i < ext_sigcount; i++) {
306 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
307 return 1;
308 ext_sig++;
309 }
310 return 0;
311}
312
313static void apply_microcode(int cpu)
314{
315 unsigned long flags;
316 unsigned int val[2];
317 int cpu_num = raw_smp_processor_id();
318 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
319 struct microcode_intel *mc_intel = uci->mc;
320
321 /* We should bind the task to the CPU */
322 BUG_ON(cpu_num != cpu);
323
324 if (mc_intel == NULL)
325 return;
326
327 /* serialize access to the physical write to MSR 0x79 */
328 spin_lock_irqsave(&microcode_update_lock, flags);
329
330 /* write microcode via MSR 0x79 */
331 wrmsr(MSR_IA32_UCODE_WRITE,
332 (unsigned long) mc_intel->bits,
333 (unsigned long) mc_intel->bits >> 16 >> 16);
334 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
335
336 /* see notes above for revision 1.07. Apparent chip bug */
337 sync_core();
338
339 /* get the current revision from MSR 0x8B */
340 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
341
342 spin_unlock_irqrestore(&microcode_update_lock, flags);
343 if (val[1] != mc_intel->hdr.rev) {
344 printk(KERN_ERR "microcode: CPU%d update from revision "
345 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
346 return;
347 }
348 printk(KERN_INFO "microcode: CPU%d updated from revision "
349 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
350 cpu_num, uci->cpu_sig.rev, val[1],
351 mc_intel->hdr.date & 0xffff,
352 mc_intel->hdr.date >> 24,
353 (mc_intel->hdr.date >> 16) & 0xff);
354 uci->cpu_sig.rev = val[1];
355}
356
357static int generic_load_microcode(int cpu, void *data, size_t size,
358 int (*get_ucode_data)(void *, const void *, size_t))
359{
360 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
361 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
362 int new_rev = uci->cpu_sig.rev;
363 unsigned int leftover = size;
364
365 while (leftover) {
366 struct microcode_header_intel mc_header;
367 unsigned int mc_size;
368
369 if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
370 break;
371
372 mc_size = get_totalsize(&mc_header);
373 if (!mc_size || mc_size > leftover) {
374 printk(KERN_ERR "microcode: error!"
375 "Bad data in microcode data file\n");
376 break;
377 }
378
379 mc = vmalloc(mc_size);
380 if (!mc)
381 break;
382
383 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
384 microcode_sanity_check(mc) < 0) {
385 vfree(mc);
386 break;
387 }
388
389 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
390 if (new_mc)
391 vfree(new_mc);
392 new_rev = mc_header.rev;
393 new_mc = mc;
394 } else
395 vfree(mc);
396
397 ucode_ptr += mc_size;
398 leftover -= mc_size;
399 }
400
401 if (new_mc) {
402 if (!leftover) {
403 if (uci->mc)
404 vfree(uci->mc);
405 uci->mc = (struct microcode_intel *)new_mc;
406 pr_debug("microcode: CPU%d found a matching microcode update with"
407 " version 0x%x (current=0x%x)\n",
408 cpu, new_rev, uci->cpu_sig.rev);
409 } else
410 vfree(new_mc);
411 }
412
413 return (int)leftover;
414}
415
416static int get_ucode_fw(void *to, const void *from, size_t n)
417{
418 memcpy(to, from, n);
419 return 0;
420}
421
422static int request_microcode_fw(int cpu, struct device *device)
423{
424 char name[30];
425 struct cpuinfo_x86 *c = &cpu_data(cpu);
426 const struct firmware *firmware;
427 int ret;
428
429 /* We should bind the task to the CPU */
430 BUG_ON(cpu != raw_smp_processor_id());
431 sprintf(name, "intel-ucode/%02x-%02x-%02x",
432 c->x86, c->x86_model, c->x86_mask);
433 ret = request_firmware(&firmware, name, device);
434 if (ret) {
435 pr_debug("microcode: data file %s load failed\n", name);
436 return ret;
437 }
438
439 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
440 &get_ucode_fw);
441
442 release_firmware(firmware);
443
444 return ret;
445}
446
447static int get_ucode_user(void *to, const void *from, size_t n)
448{
449 return copy_from_user(to, from, n);
450}
451
452static int request_microcode_user(int cpu, const void __user *buf, size_t size)
453{
454 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id());
456
457 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
458}
459
460static void microcode_fini_cpu(int cpu)
461{
462 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
463
464 vfree(uci->mc);
465 uci->mc = NULL;
466}
467
468struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info,
472 .apply_microcode = apply_microcode,
473 .microcode_fini_cpu = microcode_fini_cpu,
474};
475
476struct microcode_ops * __init init_intel_microcode(void)
477{
478 return &microcode_intel_ops;
479}
480
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b3fb430725cb..f98f4e1dba09 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -397,7 +397,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
397 generic_bigsmp_probe(); 397 generic_bigsmp_probe();
398#endif 398#endif
399 399
400#ifdef CONFIG_X86_32
400 setup_apic_routing(); 401 setup_apic_routing();
402#endif
401 if (!num_processors) 403 if (!num_processors)
402 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
403 return num_processors; 405 return num_processors;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index eecc8c18f010..4caff39078e0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -229,6 +229,12 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
229 } 229 }
230} 230}
231 231
232static int __init numaq_setup_ioapic_ids(void)
233{
234 /* so can skip it */
235 return 1;
236}
237
232static struct x86_quirks numaq_x86_quirks __initdata = { 238static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init, 239 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL, 240 .arch_time_init = NULL,
@@ -243,6 +249,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
243 .mpc_oem_bus_info = mpc_oem_bus_info, 249 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus, 250 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem, 251 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids,
246}; 253};
247 254
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, 255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e2f43768723a..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
268 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
269} 269}
270 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
282struct pv_info pv_info = { 271struct pv_info pv_info = {
283 .name = "bare hardware", 272 .name = "bare hardware",
284 .paravirt_enabled = 0, 273 .paravirt_enabled = 0,
@@ -349,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
349 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
350 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
351 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
352 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
353 346
354#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -374,8 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
374 367
375struct pv_apic_ops pv_apic_ops = { 368struct pv_apic_ops pv_apic_ops = {
376#ifdef CONFIG_X86_LOCAL_APIC 369#ifdef CONFIG_X86_LOCAL_APIC
377 .apic_write = native_apic_write,
378 .apic_read = native_apic_read,
379 .setup_boot_clock = setup_boot_APIC_clock, 370 .setup_boot_clock = setup_boot_APIC_clock,
380 .setup_secondary_clock = setup_secondary_APIC_clock, 371 .setup_secondary_clock = setup_secondary_APIC_clock,
381 .startup_ipi_hook = paravirt_nop, 372 .startup_ipi_hook = paravirt_nop,
@@ -462,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = {
462 .set_fixmap = native_set_fixmap, 453 .set_fixmap = native_set_fixmap,
463}; 454};
464 455
465struct pv_lock_ops pv_lock_ops = {
466#ifdef CONFIG_SMP
467 .spin_is_locked = __ticket_spin_is_locked,
468 .spin_is_contended = __ticket_spin_is_contended,
469
470 .spin_lock = __ticket_spin_lock,
471 .spin_trylock = __ticket_spin_trylock,
472 .spin_unlock = __ticket_spin_unlock,
473#endif
474};
475EXPORT_SYMBOL(pv_lock_ops);
476
477EXPORT_SYMBOL_GPL(pv_time_ops); 456EXPORT_SYMBOL_GPL(pv_time_ops);
478EXPORT_SYMBOL (pv_cpu_ops); 457EXPORT_SYMBOL (pv_cpu_ops);
479EXPORT_SYMBOL (pv_mmu_ops); 458EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ec7a2ba9bce8..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
19 18
20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
21{ 20{
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 205188db9626..0a1302fe6d45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -76,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
76 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
77} 77}
78 78
79#ifdef CONFIG_HOTPLUG_CPU 79#ifndef CONFIG_SMP
80#include <asm/nmi.h>
81
82static void cpu_exit_clear(void)
83{
84 int cpu = raw_smp_processor_id();
85
86 idle_task_exit();
87
88 cpu_uninit();
89 irq_ctx_exit(cpu);
90
91 cpu_clear(cpu, cpu_callout_map);
92 cpu_clear(cpu, cpu_callin_map);
93
94 numa_remove_cpu(cpu);
95 c1e_remove_cpu(cpu);
96}
97
98/* We don't actually take CPU down, just spin without interrupts. */
99static inline void play_dead(void)
100{
101 /* This must be done before dead CPU ack */
102 cpu_exit_clear();
103 mb();
104 /* Ack it */
105 __get_cpu_var(cpu_state) = CPU_DEAD;
106
107 /*
108 * With physical CPU hotplug, we should halt the cpu
109 */
110 local_irq_disable();
111 /* mask all interrupts, flush any and all caches, and halt */
112 wbinvd_halt();
113}
114#else
115static inline void play_dead(void) 80static inline void play_dead(void)
116{ 81{
117 BUG(); 82 BUG();
118} 83}
119#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
120 85
121/* 86/*
122 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
@@ -158,7 +123,7 @@ void cpu_idle(void)
158 } 123 }
159} 124}
160 125
161void __show_registers(struct pt_regs *regs, int all) 126void __show_regs(struct pt_regs *regs, int all)
162{ 127{
163 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 128 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
164 unsigned long d0, d1, d2, d3, d6, d7; 129 unsigned long d0, d1, d2, d3, d6, d7;
@@ -224,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
224 189
225void show_regs(struct pt_regs *regs) 190void show_regs(struct pt_regs *regs)
226{ 191{
227 __show_registers(regs, 1); 192 __show_regs(regs, 1);
228 show_trace(NULL, regs, &regs->sp, regs->bp); 193 show_trace(NULL, regs, &regs->sp, regs->bp);
229} 194}
230 195
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 2a8ccb9238b4..cd8c0ed02b7e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -86,30 +86,12 @@ void exit_idle(void)
86 __exit_idle(); 86 __exit_idle();
87} 87}
88 88
89#ifdef CONFIG_HOTPLUG_CPU 89#ifndef CONFIG_SMP
90DECLARE_PER_CPU(int, cpu_state);
91
92#include <linux/nmi.h>
93/* We halt the CPU with physical CPU hotplug */
94static inline void play_dead(void)
95{
96 idle_task_exit();
97 c1e_remove_cpu(raw_smp_processor_id());
98
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
102
103 local_irq_disable();
104 /* mask all interrupts, flush any and all caches, and halt */
105 wbinvd_halt();
106}
107#else
108static inline void play_dead(void) 90static inline void play_dead(void)
109{ 91{
110 BUG(); 92 BUG();
111} 93}
112#endif /* CONFIG_HOTPLUG_CPU */ 94#endif
113 95
114/* 96/*
115 * The idle thread. There's no useful work to be 97 * The idle thread. There's no useful work to be
@@ -154,7 +136,7 @@ void cpu_idle(void)
154} 136}
155 137
156/* Prints also some state that isn't saved in the pt_regs */ 138/* Prints also some state that isn't saved in the pt_regs */
157void __show_regs(struct pt_regs *regs) 139void __show_regs(struct pt_regs *regs, int all)
158{ 140{
159 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 141 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
160 unsigned long d0, d1, d2, d3, d6, d7; 142 unsigned long d0, d1, d2, d3, d6, d7;
@@ -193,6 +175,9 @@ void __show_regs(struct pt_regs *regs)
193 rdmsrl(MSR_GS_BASE, gs); 175 rdmsrl(MSR_GS_BASE, gs);
194 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 176 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
195 177
178 if (!all)
179 return;
180
196 cr0 = read_cr0(); 181 cr0 = read_cr0();
197 cr2 = read_cr2(); 182 cr2 = read_cr2();
198 cr3 = read_cr3(); 183 cr3 = read_cr3();
@@ -218,7 +203,7 @@ void __show_regs(struct pt_regs *regs)
218void show_regs(struct pt_regs *regs) 203void show_regs(struct pt_regs *regs)
219{ 204{
220 printk(KERN_INFO "CPU %d:", smp_processor_id()); 205 printk(KERN_INFO "CPU %d:", smp_processor_id());
221 __show_regs(regs); 206 __show_regs(regs, 1);
222 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 207 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
223} 208}
224 209
@@ -754,12 +739,12 @@ unsigned long get_wchan(struct task_struct *p)
754 if (!p || p == current || p->state == TASK_RUNNING) 739 if (!p || p == current || p->state == TASK_RUNNING)
755 return 0; 740 return 0;
756 stack = (unsigned long)task_stack_page(p); 741 stack = (unsigned long)task_stack_page(p);
757 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 742 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
758 return 0; 743 return 0;
759 fp = *(u64 *)(p->thread.sp); 744 fp = *(u64 *)(p->thread.sp);
760 do { 745 do {
761 if (fp < (unsigned long)stack || 746 if (fp < (unsigned long)stack ||
762 fp > (unsigned long)stack+THREAD_SIZE) 747 fp >= (unsigned long)stack+THREAD_SIZE)
763 return 0; 748 return 0;
764 ip = *(u64 *)(fp+8); 749 ip = *(u64 *)(fp+8);
765 if (!in_sched_functions(ip)) 750 if (!in_sched_functions(ip))
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e375b658efc3..0a6d8c12e10d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -40,7 +40,9 @@ enum x86_regset {
40 REGSET_GENERAL, 40 REGSET_GENERAL,
41 REGSET_FP, 41 REGSET_FP,
42 REGSET_XFP, 42 REGSET_XFP,
43 REGSET_IOPERM64 = REGSET_XFP,
43 REGSET_TLS, 44 REGSET_TLS,
45 REGSET_IOPERM32,
44}; 46};
45 47
46/* 48/*
@@ -555,6 +557,29 @@ static int ptrace_set_debugreg(struct task_struct *child,
555 return 0; 557 return 0;
556} 558}
557 559
560/*
561 * These access the current or another (stopped) task's io permission
562 * bitmap for debugging or core dump.
563 */
564static int ioperm_active(struct task_struct *target,
565 const struct user_regset *regset)
566{
567 return target->thread.io_bitmap_max / regset->size;
568}
569
570static int ioperm_get(struct task_struct *target,
571 const struct user_regset *regset,
572 unsigned int pos, unsigned int count,
573 void *kbuf, void __user *ubuf)
574{
575 if (!target->thread.io_bitmap_ptr)
576 return -ENXIO;
577
578 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
579 target->thread.io_bitmap_ptr,
580 0, IO_BITMAP_BYTES);
581}
582
558#ifdef CONFIG_X86_PTRACE_BTS 583#ifdef CONFIG_X86_PTRACE_BTS
559/* 584/*
560 * The configuration for a particular BTS hardware implementation. 585 * The configuration for a particular BTS hardware implementation.
@@ -1385,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
1385 .size = sizeof(long), .align = sizeof(long), 1410 .size = sizeof(long), .align = sizeof(long),
1386 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1411 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1387 }, 1412 },
1413 [REGSET_IOPERM64] = {
1414 .core_note_type = NT_386_IOPERM,
1415 .n = IO_BITMAP_LONGS,
1416 .size = sizeof(long), .align = sizeof(long),
1417 .active = ioperm_active, .get = ioperm_get
1418 },
1388}; 1419};
1389 1420
1390static const struct user_regset_view user_x86_64_view = { 1421static const struct user_regset_view user_x86_64_view = {
@@ -1431,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
1431 .active = regset_tls_active, 1462 .active = regset_tls_active,
1432 .get = regset_tls_get, .set = regset_tls_set 1463 .get = regset_tls_get, .set = regset_tls_set
1433 }, 1464 },
1465 [REGSET_IOPERM32] = {
1466 .core_note_type = NT_386_IOPERM,
1467 .n = IO_BITMAP_BYTES / sizeof(u32),
1468 .size = sizeof(u32), .align = sizeof(u32),
1469 .active = ioperm_active, .get = ioperm_get
1470 },
1434}; 1471};
1435 1472
1436static const struct user_regset_view user_x86_32_view = { 1473static const struct user_regset_view user_x86_32_view = {
@@ -1452,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1452#endif 1489#endif
1453} 1490}
1454 1491
1455void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1492void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1493 int error_code, int si_code)
1456{ 1494{
1457 struct siginfo info; 1495 struct siginfo info;
1458 1496
@@ -1461,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1461 1499
1462 memset(&info, 0, sizeof(info)); 1500 memset(&info, 0, sizeof(info));
1463 info.si_signo = SIGTRAP; 1501 info.si_signo = SIGTRAP;
1464 info.si_code = TRAP_BRKPT; 1502 info.si_code = si_code;
1465 1503
1466 /* User-mode ip? */ 1504 /* User-mode ip? */
1467 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1505 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1548,5 +1586,5 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1548 */ 1586 */
1549 if (test_thread_flag(TIF_SINGLESTEP) && 1587 if (test_thread_flag(TIF_SINGLESTEP) &&
1550 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) 1588 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1551 send_sigtrap(current, regs, 0); 1589 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1552} 1590}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d13858818100..f6a11b9b1f98 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -354,9 +354,27 @@ static void ati_force_hpet_resume(void)
354 printk(KERN_DEBUG "Force enabled HPET at resume\n"); 354 printk(KERN_DEBUG "Force enabled HPET at resume\n");
355} 355}
356 356
357static u32 ati_ixp4x0_rev(struct pci_dev *dev)
358{
359 u32 d;
360 u8 b;
361
362 pci_read_config_byte(dev, 0xac, &b);
363 b &= ~(1<<5);
364 pci_write_config_byte(dev, 0xac, b);
365 pci_read_config_dword(dev, 0x70, &d);
366 d |= 1<<8;
367 pci_write_config_dword(dev, 0x70, d);
368 pci_read_config_dword(dev, 0x8, &d);
369 d &= 0xff;
370 dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
371 return d;
372}
373
357static void ati_force_enable_hpet(struct pci_dev *dev) 374static void ati_force_enable_hpet(struct pci_dev *dev)
358{ 375{
359 u32 uninitialized_var(val); 376 u32 d, val;
377 u8 b;
360 378
361 if (hpet_address || force_hpet_address) 379 if (hpet_address || force_hpet_address)
362 return; 380 return;
@@ -366,14 +384,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
366 return; 384 return;
367 } 385 }
368 386
387 d = ati_ixp4x0_rev(dev);
388 if (d < 0x82)
389 return;
390
391 /* base address */
369 pci_write_config_dword(dev, 0x14, 0xfed00000); 392 pci_write_config_dword(dev, 0x14, 0xfed00000);
370 pci_read_config_dword(dev, 0x14, &val); 393 pci_read_config_dword(dev, 0x14, &val);
394
395 /* enable interrupt */
396 outb(0x72, 0xcd6); b = inb(0xcd7);
397 b |= 0x1;
398 outb(0x72, 0xcd6); outb(b, 0xcd7);
399 outb(0x72, 0xcd6); b = inb(0xcd7);
400 if (!(b & 0x1))
401 return;
402 pci_read_config_dword(dev, 0x64, &d);
403 d |= (1<<10);
404 pci_write_config_dword(dev, 0x64, d);
405 pci_read_config_dword(dev, 0x64, &d);
406 if (!(d & (1<<10)))
407 return;
408
371 force_hpet_address = val; 409 force_hpet_address = val;
372 force_hpet_resume_type = ATI_FORCE_HPET_RESUME; 410 force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
373 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", 411 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
374 force_hpet_address); 412 force_hpet_address);
375 cached_dev = dev; 413 cached_dev = dev;
376 return;
377} 414}
378DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, 415DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
379 ati_force_enable_hpet); 416 ati_force_enable_hpet);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 141efab52400..2255782e8d4b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -302,7 +302,7 @@ static void __init relocate_initrd(void)
302 if (clen > MAX_MAP_CHUNK-slop) 302 if (clen > MAX_MAP_CHUNK-slop)
303 clen = MAX_MAP_CHUNK-slop; 303 clen = MAX_MAP_CHUNK-slop;
304 mapaddr = ramdisk_image & PAGE_MASK; 304 mapaddr = ramdisk_image & PAGE_MASK;
305 p = early_ioremap(mapaddr, clen+slop); 305 p = early_memremap(mapaddr, clen+slop);
306 memcpy(q, p+slop, clen); 306 memcpy(q, p+slop, clen);
307 early_iounmap(p, clen+slop); 307 early_iounmap(p, clen+slop);
308 q += clen; 308 q += clen;
@@ -379,7 +379,7 @@ static void __init parse_setup_data(void)
379 return; 379 return;
380 pa_data = boot_params.hdr.setup_data; 380 pa_data = boot_params.hdr.setup_data;
381 while (pa_data) { 381 while (pa_data) {
382 data = early_ioremap(pa_data, PAGE_SIZE); 382 data = early_memremap(pa_data, PAGE_SIZE);
383 switch (data->type) { 383 switch (data->type) {
384 case SETUP_E820_EXT: 384 case SETUP_E820_EXT:
385 parse_e820_ext(data, pa_data); 385 parse_e820_ext(data, pa_data);
@@ -402,7 +402,7 @@ static void __init e820_reserve_setup_data(void)
402 return; 402 return;
403 pa_data = boot_params.hdr.setup_data; 403 pa_data = boot_params.hdr.setup_data;
404 while (pa_data) { 404 while (pa_data) {
405 data = early_ioremap(pa_data, sizeof(*data)); 405 data = early_memremap(pa_data, sizeof(*data));
406 e820_update_range(pa_data, sizeof(*data)+data->len, 406 e820_update_range(pa_data, sizeof(*data)+data->len,
407 E820_RAM, E820_RESERVED_KERN); 407 E820_RAM, E820_RESERVED_KERN);
408 found = 1; 408 found = 1;
@@ -428,7 +428,7 @@ static void __init reserve_early_setup_data(void)
428 return; 428 return;
429 pa_data = boot_params.hdr.setup_data; 429 pa_data = boot_params.hdr.setup_data;
430 while (pa_data) { 430 while (pa_data) {
431 data = early_ioremap(pa_data, sizeof(*data)); 431 data = early_memremap(pa_data, sizeof(*data));
432 sprintf(buf, "setup data %x", data->type); 432 sprintf(buf, "setup data %x", data->type);
433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
434 pa_data = data->next; 434 pa_data = data->next;
@@ -582,6 +582,190 @@ static struct x86_quirks default_x86_quirks __initdata;
582struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 582struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
583 583
584/* 584/*
585 * Some BIOSes seem to corrupt the low 64k of memory during events
586 * like suspend/resume and unplugging an HDMI cable. Reserve all
587 * remaining free memory in that area and fill it with a distinct
588 * pattern.
589 */
590#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
591#define MAX_SCAN_AREAS 8
592
593static int __read_mostly memory_corruption_check = -1;
594
595static unsigned __read_mostly corruption_check_size = 64*1024;
596static unsigned __read_mostly corruption_check_period = 60; /* seconds */
597
598static struct e820entry scan_areas[MAX_SCAN_AREAS];
599static int num_scan_areas;
600
601
602static int set_corruption_check(char *arg)
603{
604 char *end;
605
606 memory_corruption_check = simple_strtol(arg, &end, 10);
607
608 return (*end == 0) ? 0 : -EINVAL;
609}
610early_param("memory_corruption_check", set_corruption_check);
611
612static int set_corruption_check_period(char *arg)
613{
614 char *end;
615
616 corruption_check_period = simple_strtoul(arg, &end, 10);
617
618 return (*end == 0) ? 0 : -EINVAL;
619}
620early_param("memory_corruption_check_period", set_corruption_check_period);
621
622static int set_corruption_check_size(char *arg)
623{
624 char *end;
625 unsigned size;
626
627 size = memparse(arg, &end);
628
629 if (*end == '\0')
630 corruption_check_size = size;
631
632 return (size == corruption_check_size) ? 0 : -EINVAL;
633}
634early_param("memory_corruption_check_size", set_corruption_check_size);
635
636
637static void __init setup_bios_corruption_check(void)
638{
639 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
640
641 if (memory_corruption_check == -1) {
642 memory_corruption_check =
643#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
644 1
645#else
646 0
647#endif
648 ;
649 }
650
651 if (corruption_check_size == 0)
652 memory_corruption_check = 0;
653
654 if (!memory_corruption_check)
655 return;
656
657 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
658
659 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
660 u64 size;
661 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
662
663 if (addr == 0)
664 break;
665
666 if ((addr + size) > corruption_check_size)
667 size = corruption_check_size - addr;
668
669 if (size == 0)
670 break;
671
672 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
673 scan_areas[num_scan_areas].addr = addr;
674 scan_areas[num_scan_areas].size = size;
675 num_scan_areas++;
676
677 /* Assume we've already mapped this early memory */
678 memset(__va(addr), 0, size);
679
680 addr += size;
681 }
682
683 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
684 num_scan_areas);
685 update_e820();
686}
687
688static struct timer_list periodic_check_timer;
689
690void check_for_bios_corruption(void)
691{
692 int i;
693 int corruption = 0;
694
695 if (!memory_corruption_check)
696 return;
697
698 for(i = 0; i < num_scan_areas; i++) {
699 unsigned long *addr = __va(scan_areas[i].addr);
700 unsigned long size = scan_areas[i].size;
701
702 for(; size; addr++, size -= sizeof(unsigned long)) {
703 if (!*addr)
704 continue;
705 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
706 addr, __pa(addr), *addr);
707 corruption = 1;
708 *addr = 0;
709 }
710 }
711
712 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
713}
714
715static void periodic_check_for_corruption(unsigned long data)
716{
717 check_for_bios_corruption();
718 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
719}
720
721void start_periodic_check_for_corruption(void)
722{
723 if (!memory_corruption_check || corruption_check_period == 0)
724 return;
725
726 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
727 corruption_check_period);
728
729 init_timer(&periodic_check_timer);
730 periodic_check_timer.function = &periodic_check_for_corruption;
731 periodic_check_for_corruption(0);
732}
733#endif
734
735static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
736{
737 printk(KERN_NOTICE
738 "%s detected: BIOS may corrupt low RAM, working it around.\n",
739 d->ident);
740
741 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
742 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
743
744 return 0;
745}
746
747/* List of systems that have known low memory corruption BIOS problems */
748static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
749#ifdef CONFIG_X86_RESERVE_LOW_64K
750 {
751 .callback = dmi_low_memory_corruption,
752 .ident = "AMI BIOS",
753 .matches = {
754 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
755 },
756 },
757 {
758 .callback = dmi_low_memory_corruption,
759 .ident = "Phoenix BIOS",
760 .matches = {
761 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
762 },
763 },
764#endif
765 {}
766};
767
768/*
585 * Determine if we were loaded by an EFI loader. If so, then we have also been 769 * Determine if we were loaded by an EFI loader. If so, then we have also been
586 * passed the efi memmap, systab, etc., so we should use these data structures 770 * passed the efi memmap, systab, etc., so we should use these data structures
587 * for initialization. Note, the efi init code path is determined by the 771 * for initialization. Note, the efi init code path is determined by the
@@ -715,6 +899,10 @@ void __init setup_arch(char **cmdline_p)
715 899
716 finish_e820_parsing(); 900 finish_e820_parsing();
717 901
902 dmi_scan_machine();
903
904 dmi_check_system(bad_bios_dmi_table);
905
718#ifdef CONFIG_X86_32 906#ifdef CONFIG_X86_32
719 probe_roms(); 907 probe_roms();
720#endif 908#endif
@@ -758,6 +946,8 @@ void __init setup_arch(char **cmdline_p)
758#else 946#else
759 num_physpages = max_pfn; 947 num_physpages = max_pfn;
760 948
949 if (cpu_has_x2apic)
950 check_x2apic();
761 951
762 /* How many end-of-memory variables you have, grandma! */ 952 /* How many end-of-memory variables you have, grandma! */
763 /* need this before calling reserve_initrd */ 953 /* need this before calling reserve_initrd */
@@ -769,6 +959,10 @@ void __init setup_arch(char **cmdline_p)
769 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 959 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
770#endif 960#endif
771 961
962#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
963 setup_bios_corruption_check();
964#endif
965
772 /* max_pfn_mapped is updated here */ 966 /* max_pfn_mapped is updated here */
773 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 967 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
774 max_pfn_mapped = max_low_pfn_mapped; 968 max_pfn_mapped = max_low_pfn_mapped;
@@ -797,8 +991,6 @@ void __init setup_arch(char **cmdline_p)
797 vsmp_init(); 991 vsmp_init();
798#endif 992#endif
799 993
800 dmi_scan_machine();
801
802 io_delay_init(); 994 io_delay_init();
803 995
804 /* 996 /*
@@ -806,6 +998,8 @@ void __init setup_arch(char **cmdline_p)
806 */ 998 */
807 acpi_boot_table_init(); 999 acpi_boot_table_init();
808 1000
1001 early_acpi_boot_init();
1002
809#ifdef CONFIG_ACPI_NUMA 1003#ifdef CONFIG_ACPI_NUMA
810 /* 1004 /*
811 * Parse SRAT to discover nodes. 1005 * Parse SRAT to discover nodes.
@@ -901,3 +1095,5 @@ void __init setup_arch(char **cmdline_p)
901#endif 1095#endif
902#endif 1096#endif
903} 1097}
1098
1099
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 8b4956e800ac..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
3 char __user *pretcode; 3 char __user *pretcode;
4 int sig; 4 int sig;
5 struct sigcontext sc; 5 struct sigcontext sc;
6 struct _fpstate fpstate; 6 /*
7 * fpstate is unused. fpstate is moved/allocated after
8 * retcode[] below. This movement allows to have the FP state and the
9 * future state extensions (xsave) stay together.
10 * And at the same time retaining the unused fpstate, prevents changing
11 * the offset of extramask[] in the sigframe and thus prevent any
12 * legacy application accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
7 unsigned long extramask[_NSIG_WORDS-1]; 15 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8]; 16 char retcode[8];
17 /* fp state follows here */
9}; 18};
10 19
11struct rt_sigframe { 20struct rt_sigframe {
@@ -15,14 +24,15 @@ struct rt_sigframe {
15 void __user *puc; 24 void __user *puc;
16 struct siginfo info; 25 struct siginfo info;
17 struct ucontext uc; 26 struct ucontext uc;
18 struct _fpstate fpstate;
19 char retcode[8]; 27 char retcode[8];
28 /* fp state follows here */
20}; 29};
21#else 30#else
22struct rt_sigframe { 31struct rt_sigframe {
23 char __user *pretcode; 32 char __user *pretcode;
24 struct ucontext uc; 33 struct ucontext uc;
25 struct siginfo info; 34 struct siginfo info;
35 /* fp state follows here */
26}; 36};
27 37
28int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 2a2435d3037d..d6dd057d0f22 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -27,6 +27,7 @@
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28#include <asm/i387.h> 28#include <asm/i387.h>
29#include <asm/vdso.h> 29#include <asm/vdso.h>
30#include <asm/syscall.h>
30#include <asm/syscalls.h> 31#include <asm/syscalls.h>
31 32
32#include "sigframe.h" 33#include "sigframe.h"
@@ -112,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
112 return do_sigaltstack(uss, uoss, regs->sp); 113 return do_sigaltstack(uss, uoss, regs->sp);
113} 114}
114 115
116#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \
118}
119
120#define COPY_SEG(seg) { \
121 unsigned short tmp; \
122 err |= __get_user(tmp, &sc->seg); \
123 regs->seg = tmp; \
124}
125
126#define COPY_SEG_STRICT(seg) { \
127 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \
130}
131
132#define GET_SEG(seg) { \
133 unsigned short tmp; \
134 err |= __get_user(tmp, &sc->seg); \
135 loadsegment(seg, tmp); \
136}
115 137
116/* 138/*
117 * Do a signal return; undo the signal stack. 139 * Do a signal return; undo the signal stack.
@@ -120,28 +142,13 @@ static int
120restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
121 unsigned long *pax) 143 unsigned long *pax)
122{ 144{
145 void __user *buf;
146 unsigned int tmpflags;
123 unsigned int err = 0; 147 unsigned int err = 0;
124 148
125 /* Always make any pending restarted system calls return -EINTR */ 149 /* Always make any pending restarted system calls return -EINTR */
126 current_thread_info()->restart_block.fn = do_no_restart_syscall; 150 current_thread_info()->restart_block.fn = do_no_restart_syscall;
127 151
128#define COPY(x) err |= __get_user(regs->x, &sc->x)
129
130#define COPY_SEG(seg) \
131 { unsigned short tmp; \
132 err |= __get_user(tmp, &sc->seg); \
133 regs->seg = tmp; }
134
135#define COPY_SEG_STRICT(seg) \
136 { unsigned short tmp; \
137 err |= __get_user(tmp, &sc->seg); \
138 regs->seg = tmp|3; }
139
140#define GET_SEG(seg) \
141 { unsigned short tmp; \
142 err |= __get_user(tmp, &sc->seg); \
143 loadsegment(seg, tmp); }
144
145 GET_SEG(gs); 152 GET_SEG(gs);
146 COPY_SEG(fs); 153 COPY_SEG(fs);
147 COPY_SEG(es); 154 COPY_SEG(es);
@@ -151,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
151 COPY_SEG_STRICT(cs); 158 COPY_SEG_STRICT(cs);
152 COPY_SEG_STRICT(ss); 159 COPY_SEG_STRICT(ss);
153 160
154 { 161 err |= __get_user(tmpflags, &sc->flags);
155 unsigned int tmpflags; 162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
156 163 regs->orig_ax = -1; /* disable syscall checks */
157 err |= __get_user(tmpflags, &sc->flags);
158 regs->flags = (regs->flags & ~FIX_EFLAGS) |
159 (tmpflags & FIX_EFLAGS);
160 regs->orig_ax = -1; /* disable syscall checks */
161 }
162
163 {
164 struct _fpstate __user *buf;
165 164
166 err |= __get_user(buf, &sc->fpstate); 165 err |= __get_user(buf, &sc->fpstate);
167 if (buf) { 166 err |= restore_i387_xstate(buf);
168 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
169 goto badframe;
170 err |= restore_i387(buf);
171 } else {
172 struct task_struct *me = current;
173
174 if (used_math()) {
175 clear_fpu(me);
176 clear_used_math();
177 }
178 }
179 }
180 167
181 err |= __get_user(*pax, &sc->ax); 168 err |= __get_user(*pax, &sc->ax);
182 return err; 169 return err;
183
184badframe:
185 return 1;
186} 170}
187 171
188asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -228,9 +212,8 @@ badframe:
228 return 0; 212 return 0;
229} 213}
230 214
231asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215static long do_rt_sigreturn(struct pt_regs *regs)
232{ 216{
233 struct pt_regs *regs = (struct pt_regs *)&__unused;
234 struct rt_sigframe __user *frame; 217 struct rt_sigframe __user *frame;
235 unsigned long ax; 218 unsigned long ax;
236 sigset_t set; 219 sigset_t set;
@@ -256,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
256 return ax; 239 return ax;
257 240
258badframe: 241badframe:
259 force_sig(SIGSEGV, current); 242 signal_fault(regs, frame, "rt_sigreturn");
260 return 0; 243 return 0;
261} 244}
262 245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
263/* 253/*
264 * Set up a signal frame. 254 * Set up a signal frame.
265 */ 255 */
266static int 256static int
267setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, 257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
268 struct pt_regs *regs, unsigned long mask) 258 struct pt_regs *regs, unsigned long mask)
269{ 259{
270 int tmp, err = 0; 260 int tmp, err = 0;
@@ -291,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
291 err |= __put_user(regs->sp, &sc->sp_at_signal); 281 err |= __put_user(regs->sp, &sc->sp_at_signal);
292 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
293 283
294 tmp = save_i387(fpstate); 284 tmp = save_i387_xstate(fpstate);
295 if (tmp < 0) 285 if (tmp < 0)
296 err = 1; 286 err = 1;
297 else 287 else
@@ -308,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
308 * Determine which stack to use.. 298 * Determine which stack to use..
309 */ 299 */
310static inline void __user * 300static inline void __user *
311get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) 301get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
302 void **fpstate)
312{ 303{
313 unsigned long sp; 304 unsigned long sp;
314 305
@@ -334,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
334 sp = (unsigned long) ka->sa.sa_restorer; 325 sp = (unsigned long) ka->sa.sa_restorer;
335 } 326 }
336 327
328 if (used_math()) {
329 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp;
331 }
332
337 sp -= frame_size; 333 sp -= frame_size;
338 /* 334 /*
339 * Align the stack pointer according to the i386 ABI, 335 * Align the stack pointer according to the i386 ABI,
@@ -345,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
345} 341}
346 342
347static int 343static int
348setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 344__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
349 struct pt_regs *regs) 345 struct pt_regs *regs)
350{ 346{
351 struct sigframe __user *frame; 347 struct sigframe __user *frame;
352 void __user *restorer; 348 void __user *restorer;
353 int err = 0; 349 int err = 0;
354 int usig; 350 void __user *fpstate = NULL;
355 351
356 frame = get_sigframe(ka, regs, sizeof(*frame)); 352 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
357 353
358 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 354 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
359 goto give_sigsegv; 355 return -EFAULT;
360 356
361 usig = current_thread_info()->exec_domain 357 if (__put_user(sig, &frame->sig))
362 && current_thread_info()->exec_domain->signal_invmap 358 return -EFAULT;
363 && sig < 32
364 ? current_thread_info()->exec_domain->signal_invmap[sig]
365 : sig;
366 359
367 err = __put_user(usig, &frame->sig); 360 if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
368 if (err) 361 return -EFAULT;
369 goto give_sigsegv;
370
371 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
372 if (err)
373 goto give_sigsegv;
374 362
375 if (_NSIG_WORDS > 1) { 363 if (_NSIG_WORDS > 1) {
376 err = __copy_to_user(&frame->extramask, &set->sig[1], 364 if (__copy_to_user(&frame->extramask, &set->sig[1],
377 sizeof(frame->extramask)); 365 sizeof(frame->extramask)))
378 if (err) 366 return -EFAULT;
379 goto give_sigsegv;
380 } 367 }
381 368
382 if (current->mm->context.vdso) 369 if (current->mm->context.vdso)
@@ -401,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
401 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); 388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
402 389
403 if (err) 390 if (err)
404 goto give_sigsegv; 391 return -EFAULT;
405 392
406 /* Set up registers for signal handler */ 393 /* Set up registers for signal handler */
407 regs->sp = (unsigned long)frame; 394 regs->sp = (unsigned long)frame;
@@ -416,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
416 regs->cs = __USER_CS; 403 regs->cs = __USER_CS;
417 404
418 return 0; 405 return 0;
419
420give_sigsegv:
421 force_sigsegv(sig, current);
422 return -EFAULT;
423} 406}
424 407
425static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
426 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
427{ 410{
428 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
429 void __user *restorer; 412 void __user *restorer;
430 int err = 0; 413 int err = 0;
431 int usig; 414 void __user *fpstate = NULL;
432 415
433 frame = get_sigframe(ka, regs, sizeof(*frame)); 416 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
434 417
435 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
436 goto give_sigsegv; 419 return -EFAULT;
437 420
438 usig = current_thread_info()->exec_domain 421 err |= __put_user(sig, &frame->sig);
439 && current_thread_info()->exec_domain->signal_invmap
440 && sig < 32
441 ? current_thread_info()->exec_domain->signal_invmap[sig]
442 : sig;
443
444 err |= __put_user(usig, &frame->sig);
445 err |= __put_user(&frame->info, &frame->pinfo); 422 err |= __put_user(&frame->info, &frame->pinfo);
446 err |= __put_user(&frame->uc, &frame->puc); 423 err |= __put_user(&frame->uc, &frame->puc);
447 err |= copy_siginfo_to_user(&frame->info, info); 424 err |= copy_siginfo_to_user(&frame->info, info);
448 if (err) 425 if (err)
449 goto give_sigsegv; 426 return -EFAULT;
450 427
451 /* Create the ucontext. */ 428 /* Create the ucontext. */
452 err |= __put_user(0, &frame->uc.uc_flags); 429 if (cpu_has_xsave)
430 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
431 else
432 err |= __put_user(0, &frame->uc.uc_flags);
453 err |= __put_user(0, &frame->uc.uc_link); 433 err |= __put_user(0, &frame->uc.uc_link);
454 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 434 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
455 err |= __put_user(sas_ss_flags(regs->sp), 435 err |= __put_user(sas_ss_flags(regs->sp),
456 &frame->uc.uc_stack.ss_flags); 436 &frame->uc.uc_stack.ss_flags);
457 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 437 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
458 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 438 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
459 regs, set->sig[0]); 439 regs, set->sig[0]);
460 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
461 if (err) 441 if (err)
462 goto give_sigsegv; 442 return -EFAULT;
463 443
464 /* Set up to return from userspace. */ 444 /* Set up to return from userspace. */
465 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 445 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -479,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
479 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); 459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
480 460
481 if (err) 461 if (err)
482 goto give_sigsegv; 462 return -EFAULT;
483 463
484 /* Set up registers for signal handler */ 464 /* Set up registers for signal handler */
485 regs->sp = (unsigned long)frame; 465 regs->sp = (unsigned long)frame;
486 regs->ip = (unsigned long)ka->sa.sa_handler; 466 regs->ip = (unsigned long)ka->sa.sa_handler;
487 regs->ax = (unsigned long)usig; 467 regs->ax = (unsigned long)sig;
488 regs->dx = (unsigned long)&frame->info; 468 regs->dx = (unsigned long)&frame->info;
489 regs->cx = (unsigned long)&frame->uc; 469 regs->cx = (unsigned long)&frame->uc;
490 470
@@ -494,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
494 regs->cs = __USER_CS; 474 regs->cs = __USER_CS;
495 475
496 return 0; 476 return 0;
497
498give_sigsegv:
499 force_sigsegv(sig, current);
500 return -EFAULT;
501} 477}
502 478
503/* 479/*
504 * OK, we're invoking a handler: 480 * OK, we're invoking a handler:
505 */ 481 */
482static int signr_convert(int sig)
483{
484 struct thread_info *info = current_thread_info();
485
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig];
488 return sig;
489}
490
491#define is_ia32 1
492#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame
494
495static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs)
498{
499 int usig = signr_convert(sig);
500 int ret;
501
502 /* Set up the stack frame */
503 if (is_ia32) {
504 if (ka->sa.sa_flags & SA_SIGINFO)
505 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
506 else
507 ret = ia32_setup_frame(usig, ka, set, regs);
508 } else
509 ret = __setup_rt_frame(sig, ka, info, set, regs);
510
511 if (ret) {
512 force_sigsegv(sig, current);
513 return -EFAULT;
514 }
515
516 return ret;
517}
518
506static int 519static int
507handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 520handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
508 sigset_t *oldset, struct pt_regs *regs) 521 sigset_t *oldset, struct pt_regs *regs)
@@ -510,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
510 int ret; 523 int ret;
511 524
512 /* Are we from a system call? */ 525 /* Are we from a system call? */
513 if ((long)regs->orig_ax >= 0) { 526 if (syscall_get_nr(current, regs) >= 0) {
514 /* If so, check system call restarting.. */ 527 /* If so, check system call restarting.. */
515 switch (regs->ax) { 528 switch (syscall_get_error(current, regs)) {
516 case -ERESTART_RESTARTBLOCK: 529 case -ERESTART_RESTARTBLOCK:
517 case -ERESTARTNOHAND: 530 case -ERESTARTNOHAND:
518 regs->ax = -EINTR; 531 regs->ax = -EINTR;
@@ -539,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
539 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 552 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
540 regs->flags &= ~X86_EFLAGS_TF; 553 regs->flags &= ~X86_EFLAGS_TF;
541 554
542 /* Set up the stack frame */ 555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
543 if (ka->sa.sa_flags & SA_SIGINFO)
544 ret = setup_rt_frame(sig, ka, info, oldset, regs);
545 else
546 ret = setup_frame(sig, ka, oldset, regs);
547 556
548 if (ret) 557 if (ret)
549 return ret; 558 return ret;
550 559
560#ifdef CONFIG_X86_64
561 /*
562 * This has nothing to do with segment registers,
563 * despite the name. This magic affects uaccess.h
564 * macros' behavior. Reset it to the normal setting.
565 */
566 set_fs(USER_DS);
567#endif
568
551 /* 569 /*
552 * Clear the direction flag as per the ABI for function entry. 570 * Clear the direction flag as per the ABI for function entry.
553 */ 571 */
@@ -574,6 +592,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
574 return 0; 592 return 0;
575} 593}
576 594
595#define NR_restart_syscall __NR_restart_syscall
577/* 596/*
578 * Note that 'init' is a special process: it doesn't get signals it doesn't 597 * Note that 'init' is a special process: it doesn't get signals it doesn't
579 * want to handle. Thus you cannot kill init even with a SIGKILL even by 598 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -626,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
626 } 645 }
627 646
628 /* Did we come from a system call? */ 647 /* Did we come from a system call? */
629 if ((long)regs->orig_ax >= 0) { 648 if (syscall_get_nr(current, regs) >= 0) {
630 /* Restart the system call - no handlers present */ 649 /* Restart the system call - no handlers present */
631 switch (regs->ax) { 650 switch (syscall_get_error(current, regs)) {
632 case -ERESTARTNOHAND: 651 case -ERESTARTNOHAND:
633 case -ERESTARTSYS: 652 case -ERESTARTSYS:
634 case -ERESTARTNOINTR: 653 case -ERESTARTNOINTR:
@@ -637,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
637 break; 656 break;
638 657
639 case -ERESTART_RESTARTBLOCK: 658 case -ERESTART_RESTARTBLOCK:
640 regs->ax = __NR_restart_syscall; 659 regs->ax = NR_restart_syscall;
641 regs->ip -= 2; 660 regs->ip -= 2;
642 break; 661 break;
643 } 662 }
@@ -660,6 +679,12 @@ static void do_signal(struct pt_regs *regs)
660void 679void
661do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 680do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
662{ 681{
682#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
683 /* notify userspace of pending MCEs */
684 if (thread_info_flags & _TIF_MCE_NOTIFY)
685 mce_notify_user();
686#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
687
663 /* deal with pending signal delivery */ 688 /* deal with pending signal delivery */
664 if (thread_info_flags & _TIF_SIGPENDING) 689 if (thread_info_flags & _TIF_SIGPENDING)
665 do_signal(regs); 690 do_signal(regs);
@@ -669,5 +694,23 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
669 tracehook_notify_resume(regs); 694 tracehook_notify_resume(regs);
670 } 695 }
671 696
697#ifdef CONFIG_X86_32
672 clear_thread_flag(TIF_IRET); 698 clear_thread_flag(TIF_IRET);
699#endif /* CONFIG_X86_32 */
700}
701
702void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
703{
704 struct task_struct *me = current;
705
706 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
709 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip);
712 printk(KERN_CONT "\n");
713 }
714
715 force_sig(SIGSEGV, me);
673} 716}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 694aa888bb19..a5c9627f4db9 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -52,67 +52,14 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
52 return do_sigaltstack(uss, uoss, regs->sp); 52 return do_sigaltstack(uss, uoss, regs->sp);
53} 53}
54 54
55/* 55#define COPY(x) { \
56 * Signal frame handlers. 56 err |= __get_user(regs->x, &sc->x); \
57 */
58
59static inline int save_i387(struct _fpstate __user *buf)
60{
61 struct task_struct *tsk = current;
62 int err = 0;
63
64 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
65 sizeof(tsk->thread.xstate->fxsave));
66
67 if ((unsigned long)buf % 16)
68 printk("save_i387: bad fpstate %p\n", buf);
69
70 if (!used_math())
71 return 0;
72 clear_used_math(); /* trigger finit */
73 if (task_thread_info(tsk)->status & TS_USEDFPU) {
74 err = save_i387_checking((struct i387_fxsave_struct __user *)
75 buf);
76 if (err)
77 return err;
78 task_thread_info(tsk)->status &= ~TS_USEDFPU;
79 stts();
80 } else {
81 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
82 sizeof(struct i387_fxsave_struct)))
83 return -1;
84 }
85 return 1;
86} 57}
87 58
88/* 59#define COPY_SEG_STRICT(seg) { \
89 * This restores directly out of user space. Exceptions are handled. 60 unsigned short tmp; \
90 */ 61 err |= __get_user(tmp, &sc->seg); \
91static inline int restore_i387(struct _fpstate __user *buf) 62 regs->seg = tmp | 3; \
92{
93 struct task_struct *tsk = current;
94 int err;
95
96 if (!used_math()) {
97 err = init_fpu(tsk);
98 if (err)
99 return err;
100 }
101
102 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
103 clts();
104 task_thread_info(current)->status |= TS_USEDFPU;
105 }
106 err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
107 if (unlikely(err)) {
108 /*
109 * Encountered an error while doing the restore from the
110 * user buffer, clear the fpu state.
111 */
112 clear_fpu(tsk);
113 clear_used_math();
114 }
115 return err;
116} 63}
117 64
118/* 65/*
@@ -122,13 +69,13 @@ static int
122restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
123 unsigned long *pax) 70 unsigned long *pax)
124{ 71{
72 void __user *buf;
73 unsigned int tmpflags;
125 unsigned int err = 0; 74 unsigned int err = 0;
126 75
127 /* Always make any pending restarted system calls return -EINTR */ 76 /* Always make any pending restarted system calls return -EINTR */
128 current_thread_info()->restart_block.fn = do_no_restart_syscall; 77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
129 78
130#define COPY(x) (err |= __get_user(regs->x, &sc->x))
131
132 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
133 COPY(dx); COPY(cx); COPY(ip); 80 COPY(dx); COPY(cx); COPY(ip);
134 COPY(r8); 81 COPY(r8);
@@ -143,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
143 /* Kernel saves and restores only the CS segment register on signals, 90 /* Kernel saves and restores only the CS segment register on signals,
144 * which is the bare minimum needed to allow mixed 32/64-bit code. 91 * which is the bare minimum needed to allow mixed 32/64-bit code.
145 * App's signal handler can save/restore other segments if needed. */ 92 * App's signal handler can save/restore other segments if needed. */
146 { 93 COPY_SEG_STRICT(cs);
147 unsigned cs;
148 err |= __get_user(cs, &sc->cs);
149 regs->cs = cs | 3; /* Force into user mode */
150 }
151 94
152 { 95 err |= __get_user(tmpflags, &sc->flags);
153 unsigned int tmpflags; 96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
154 err |= __get_user(tmpflags, &sc->flags); 97 regs->orig_ax = -1; /* disable syscall checks */
155 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
156 regs->orig_ax = -1; /* disable syscall checks */
157 }
158 98
159 { 99 err |= __get_user(buf, &sc->fpstate);
160 struct _fpstate __user *buf; 100 err |= restore_i387_xstate(buf);
161 err |= __get_user(buf, &sc->fpstate);
162
163 if (buf) {
164 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
165 goto badframe;
166 err |= restore_i387(buf);
167 } else {
168 struct task_struct *me = current;
169 if (used_math()) {
170 clear_fpu(me);
171 clear_used_math();
172 }
173 }
174 }
175 101
176 err |= __get_user(*pax, &sc->ax); 102 err |= __get_user(*pax, &sc->ax);
177 return err; 103 return err;
178
179badframe:
180 return 1;
181} 104}
182 105
183asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 106static long do_rt_sigreturn(struct pt_regs *regs)
184{ 107{
185 struct rt_sigframe __user *frame; 108 struct rt_sigframe __user *frame;
186 sigset_t set;
187 unsigned long ax; 109 unsigned long ax;
110 sigset_t set;
188 111
189 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
190 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -207,10 +130,15 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
207 return ax; 130 return ax;
208 131
209badframe: 132badframe:
210 signal_fault(regs, frame, "sigreturn"); 133 signal_fault(regs, frame, "rt_sigreturn");
211 return 0; 134 return 0;
212} 135}
213 136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
141
214/* 142/*
215 * Set up a signal frame. 143 * Set up a signal frame.
216 */ 144 */
@@ -269,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
269 sp = current->sas_ss_sp + current->sas_ss_size; 197 sp = current->sas_ss_sp + current->sas_ss_size;
270 } 198 }
271 199
272 return (void __user *)round_down(sp - size, 16); 200 return (void __user *)round_down(sp - size, 64);
273} 201}
274 202
275static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
276 sigset_t *set, struct pt_regs *regs) 204 sigset_t *set, struct pt_regs *regs)
277{ 205{
278 struct rt_sigframe __user *frame; 206 struct rt_sigframe __user *frame;
279 struct _fpstate __user *fp = NULL; 207 void __user *fp = NULL;
280 int err = 0; 208 int err = 0;
281 struct task_struct *me = current; 209 struct task_struct *me = current;
282 210
283 if (used_math()) { 211 if (used_math()) {
284 fp = get_stack(ka, regs, sizeof(struct _fpstate)); 212 fp = get_stack(ka, regs, sig_xstate_size);
285 frame = (void __user *)round_down( 213 frame = (void __user *)round_down(
286 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
287 215
288 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) 216 if (save_i387_xstate(fp) < 0)
289 goto give_sigsegv; 217 return -EFAULT;
290
291 if (save_i387(fp) < 0)
292 err |= -1;
293 } else 218 } else
294 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
295 220
296 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
297 goto give_sigsegv; 222 return -EFAULT;
298 223
299 if (ka->sa.sa_flags & SA_SIGINFO) { 224 if (ka->sa.sa_flags & SA_SIGINFO) {
300 err |= copy_siginfo_to_user(&frame->info, info); 225 if (copy_siginfo_to_user(&frame->info, info))
301 if (err) 226 return -EFAULT;
302 goto give_sigsegv;
303 } 227 }
304 228
305 /* Create the ucontext. */ 229 /* Create the ucontext. */
306 err |= __put_user(0, &frame->uc.uc_flags); 230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
307 err |= __put_user(0, &frame->uc.uc_link); 234 err |= __put_user(0, &frame->uc.uc_link);
308 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
309 err |= __put_user(sas_ss_flags(regs->sp), 236 err |= __put_user(sas_ss_flags(regs->sp),
@@ -324,11 +251,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
324 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
325 } else { 252 } else {
326 /* could use a vstub here */ 253 /* could use a vstub here */
327 goto give_sigsegv; 254 return -EFAULT;
328 } 255 }
329 256
330 if (err) 257 if (err)
331 goto give_sigsegv; 258 return -EFAULT;
332 259
333 /* Set up registers for signal handler */ 260 /* Set up registers for signal handler */
334 regs->di = sig; 261 regs->di = sig;
@@ -348,15 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
348 regs->cs = __USER_CS; 275 regs->cs = __USER_CS;
349 276
350 return 0; 277 return 0;
351
352give_sigsegv:
353 force_sigsegv(sig, current);
354 return -EFAULT;
355} 278}
356 279
357/* 280/*
358 * OK, we're invoking a handler 281 * OK, we're invoking a handler
359 */ 282 */
283static int signr_convert(int sig)
284{
285 return sig;
286}
287
288#ifdef CONFIG_IA32_EMULATION
289#define is_ia32 test_thread_flag(TIF_IA32)
290#else
291#define is_ia32 0
292#endif
293
294static int
295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
360 317
361static int 318static int
362handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -394,51 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
394 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
395 regs->flags &= ~X86_EFLAGS_TF; 352 regs->flags &= ~X86_EFLAGS_TF;
396 353
397#ifdef CONFIG_IA32_EMULATION
398 if (test_thread_flag(TIF_IA32)) {
399 if (ka->sa.sa_flags & SA_SIGINFO)
400 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
401 else
402 ret = ia32_setup_frame(sig, ka, oldset, regs);
403 } else
404#endif
405 ret = setup_rt_frame(sig, ka, info, oldset, regs); 354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
406 355
407 if (ret == 0) { 356 if (ret)
408 /* 357 return ret;
409 * This has nothing to do with segment registers,
410 * despite the name. This magic affects uaccess.h
411 * macros' behavior. Reset it to the normal setting.
412 */
413 set_fs(USER_DS);
414 358
415 /* 359#ifdef CONFIG_X86_64
416 * Clear the direction flag as per the ABI for function entry. 360 /*
417 */ 361 * This has nothing to do with segment registers,
418 regs->flags &= ~X86_EFLAGS_DF; 362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
419 367
420 /* 368 /*
421 * Clear TF when entering the signal handler, but 369 * Clear the direction flag as per the ABI for function entry.
422 * notify any tracer that was single-stepping it. 370 */
423 * The tracer may want to single-step inside the 371 regs->flags &= ~X86_EFLAGS_DF;
424 * handler too.
425 */
426 regs->flags &= ~X86_EFLAGS_TF;
427 372
428 spin_lock_irq(&current->sighand->siglock); 373 /*
429 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 374 * Clear TF when entering the signal handler, but
430 if (!(ka->sa.sa_flags & SA_NODEFER)) 375 * notify any tracer that was single-stepping it.
431 sigaddset(&current->blocked, sig); 376 * The tracer may want to single-step inside the
432 recalc_sigpending(); 377 * handler too.
433 spin_unlock_irq(&current->sighand->siglock); 378 */
379 regs->flags &= ~X86_EFLAGS_TF;
434 380
435 tracehook_signal_handler(sig, info, ka, regs, 381 spin_lock_irq(&current->sighand->siglock);
436 test_thread_flag(TIF_SINGLESTEP)); 382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
437 } 383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
438 387
439 return ret; 388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
440} 392}
441 393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
442/* 396/*
443 * Note that 'init' is a special process: it doesn't get signals it doesn't 397 * Note that 'init' is a special process: it doesn't get signals it doesn't
444 * want to handle. Thus you cannot kill init even with a SIGKILL even by 398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -468,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
468 422
469 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
470 if (signr > 0) { 424 if (signr > 0) {
471 /* Re-enable any watchpoints before delivering the 425 /*
426 * Re-enable any watchpoints before delivering the
472 * signal to user space. The processor register will 427 * signal to user space. The processor register will
473 * have been cleared if the watchpoint triggered 428 * have been cleared if the watchpoint triggered
474 * inside the kernel. 429 * inside the kernel.
@@ -476,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
476 if (current->thread.debugreg7) 431 if (current->thread.debugreg7)
477 set_debugreg(current->thread.debugreg7, 7); 432 set_debugreg(current->thread.debugreg7, 7);
478 433
479 /* Whee! Actually deliver the signal. */ 434 /* Whee! Actually deliver the signal. */
480 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
481 /* 436 /*
482 * A signal was successfully delivered; the saved 437 * A signal was successfully delivered; the saved
@@ -499,10 +454,9 @@ static void do_signal(struct pt_regs *regs)
499 regs->ax = regs->orig_ax; 454 regs->ax = regs->orig_ax;
500 regs->ip -= 2; 455 regs->ip -= 2;
501 break; 456 break;
457
502 case -ERESTART_RESTARTBLOCK: 458 case -ERESTART_RESTARTBLOCK:
503 regs->ax = test_thread_flag(TIF_IA32) ? 459 regs->ax = NR_restart_syscall;
504 __NR_ia32_restart_syscall :
505 __NR_restart_syscall;
506 regs->ip -= 2; 460 regs->ip -= 2;
507 break; 461 break;
508 } 462 }
@@ -518,14 +472,18 @@ static void do_signal(struct pt_regs *regs)
518 } 472 }
519} 473}
520 474
521void do_notify_resume(struct pt_regs *regs, void *unused, 475/*
522 __u32 thread_info_flags) 476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
523{ 481{
524#ifdef CONFIG_X86_MCE 482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
525 /* notify userspace of pending MCEs */ 483 /* notify userspace of pending MCEs */
526 if (thread_info_flags & _TIF_MCE_NOTIFY) 484 if (thread_info_flags & _TIF_MCE_NOTIFY)
527 mce_notify_user(); 485 mce_notify_user();
528#endif /* CONFIG_X86_MCE */ 486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
529 487
530 /* deal with pending signal delivery */ 488 /* deal with pending signal delivery */
531 if (thread_info_flags & _TIF_SIGPENDING) 489 if (thread_info_flags & _TIF_SIGPENDING)
@@ -535,17 +493,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
535 clear_thread_flag(TIF_NOTIFY_RESUME); 493 clear_thread_flag(TIF_NOTIFY_RESUME);
536 tracehook_notify_resume(regs); 494 tracehook_notify_resume(regs);
537 } 495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
538} 500}
539 501
540void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
541{ 503{
542 struct task_struct *me = current; 504 struct task_struct *me = current;
505
543 if (show_unhandled_signals && printk_ratelimit()) { 506 if (show_unhandled_signals && printk_ratelimit()) {
544 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 507 printk(KERN_INFO
545 me->comm, me->pid, where, frame, regs->ip, 508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
546 regs->sp, regs->orig_ax); 509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
547 print_vma_addr(" in ", regs->ip); 511 print_vma_addr(" in ", regs->ip);
548 printk("\n"); 512 printk(KERN_CONT "\n");
549 } 513 }
550 514
551 force_sig(SIGSEGV, me); 515 force_sig(SIGSEGV, me);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
214struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
216 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
217 .cpu_up = native_cpu_up,
218 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
219 218
220 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
221 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
222 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
223 .send_call_func_ipi = native_send_call_func_ipi, 227 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi, 228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
225}; 229};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4e7ccb0e2a9b..8c3aca7cb343 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -123,7 +124,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
123 124
124static atomic_t init_deasserted; 125static atomic_t init_deasserted;
125 126
126static int boot_cpu_logical_apicid;
127 127
128/* representing cpus for which sibling maps can be computed */ 128/* representing cpus for which sibling maps can be computed */
129static cpumask_t cpu_sibling_setup_map; 129static cpumask_t cpu_sibling_setup_map;
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
165#endif 165#endif
166 166
167#ifdef CONFIG_X86_32 167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
168u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = 170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
169 { [0 ... NR_CPUS-1] = BAD_APICID }; 171 { [0 ... NR_CPUS-1] = BAD_APICID };
170 172
@@ -210,7 +212,7 @@ static void __cpuinit smp_callin(void)
210 /* 212 /*
211 * (This works even if the APIC is not enabled.) 213 * (This works even if the APIC is not enabled.)
212 */ 214 */
213 phys_id = GET_APIC_ID(read_apic_id()); 215 phys_id = read_apic_id();
214 cpuid = smp_processor_id(); 216 cpuid = smp_processor_id();
215 if (cpu_isset(cpuid, cpu_callin_map)) { 217 if (cpu_isset(cpuid, cpu_callin_map)) {
216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 218 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
@@ -332,14 +334,17 @@ static void __cpuinit start_secondary(void *unused)
332 * does not change while we are assigning vectors to cpus. Holding 334 * does not change while we are assigning vectors to cpus. Holding
333 * this lock ensures we don't half assign or remove an irq from a cpu. 335 * this lock ensures we don't half assign or remove an irq from a cpu.
334 */ 336 */
335 ipi_call_lock_irq(); 337 ipi_call_lock();
336 lock_vector_lock(); 338 lock_vector_lock();
337 __setup_vector_irq(smp_processor_id()); 339 __setup_vector_irq(smp_processor_id());
338 cpu_set(smp_processor_id(), cpu_online_map); 340 cpu_set(smp_processor_id(), cpu_online_map);
339 unlock_vector_lock(); 341 unlock_vector_lock();
340 ipi_call_unlock_irq(); 342 ipi_call_unlock();
341 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 343 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
342 344
345 /* enable local interrupts */
346 local_irq_enable();
347
343 setup_secondary_clock(); 348 setup_secondary_clock();
344 349
345 wmb(); 350 wmb();
@@ -551,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid)
551 printk(KERN_CONT 556 printk(KERN_CONT
552 "a previous APIC delivery may have failed\n"); 557 "a previous APIC delivery may have failed\n");
553 558
554 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 559 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
555 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
556 560
557 timeout = 0; 561 timeout = 0;
558 do { 562 do {
@@ -584,11 +588,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
584 int maxlvt; 588 int maxlvt;
585 589
586 /* Target chip */ 590 /* Target chip */
587 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
588
589 /* Boot on the stack */ 591 /* Boot on the stack */
590 /* Kick the second */ 592 /* Kick the second */
591 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 593 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
592 594
593 pr_debug("Waiting for send to finish...\n"); 595 pr_debug("Waiting for send to finish...\n");
594 send_status = safe_apic_wait_icr_idle(); 596 send_status = safe_apic_wait_icr_idle();
@@ -597,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
597 * Give the other CPU some time to accept the IPI. 599 * Give the other CPU some time to accept the IPI.
598 */ 600 */
599 udelay(200); 601 udelay(200);
600 maxlvt = lapic_get_maxlvt(); 602 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
601 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 603 maxlvt = lapic_get_maxlvt();
602 apic_write(APIC_ESR, 0); 604 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 605 apic_write(APIC_ESR, 0);
606 accept_status = (apic_read(APIC_ESR) & 0xEF);
607 }
604 pr_debug("NMI sent.\n"); 608 pr_debug("NMI sent.\n");
605 609
606 if (send_status) 610 if (send_status)
@@ -641,13 +645,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
641 /* 645 /*
642 * Turn INIT on target chip 646 * Turn INIT on target chip
643 */ 647 */
644 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
645
646 /* 648 /*
647 * Send IPI 649 * Send IPI
648 */ 650 */
649 apic_write(APIC_ICR, 651 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
650 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); 652 phys_apicid);
651 653
652 pr_debug("Waiting for send to finish...\n"); 654 pr_debug("Waiting for send to finish...\n");
653 send_status = safe_apic_wait_icr_idle(); 655 send_status = safe_apic_wait_icr_idle();
@@ -657,10 +659,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
657 pr_debug("Deasserting INIT.\n"); 659 pr_debug("Deasserting INIT.\n");
658 660
659 /* Target chip */ 661 /* Target chip */
660 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
661
662 /* Send IPI */ 662 /* Send IPI */
663 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 663 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
664 664
665 pr_debug("Waiting for send to finish...\n"); 665 pr_debug("Waiting for send to finish...\n");
666 send_status = safe_apic_wait_icr_idle(); 666 send_status = safe_apic_wait_icr_idle();
@@ -703,11 +703,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
703 */ 703 */
704 704
705 /* Target chip */ 705 /* Target chip */
706 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707
708 /* Boot on the stack */ 706 /* Boot on the stack */
709 /* Kick the second */ 707 /* Kick the second */
710 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); 708 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
709 phys_apicid);
711 710
712 /* 711 /*
713 * Give the other CPU some time to accept the IPI. 712 * Give the other CPU some time to accept the IPI.
@@ -1176,10 +1175,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1176 * Setup boot CPU information 1175 * Setup boot CPU information
1177 */ 1176 */
1178 smp_store_cpu_info(0); /* Final full version of the data */ 1177 smp_store_cpu_info(0); /* Final full version of the data */
1178#ifdef CONFIG_X86_32
1179 boot_cpu_logical_apicid = logical_smp_processor_id(); 1179 boot_cpu_logical_apicid = logical_smp_processor_id();
1180#endif
1180 current_thread_info()->cpu = 0; /* needed? */ 1181 current_thread_info()->cpu = 0; /* needed? */
1181 set_cpu_sibling_map(0); 1182 set_cpu_sibling_map(0);
1182 1183
1184#ifdef CONFIG_X86_64
1185 enable_IR_x2apic();
1186 setup_apic_routing();
1187#endif
1188
1183 if (smp_sanity_check(max_cpus) < 0) { 1189 if (smp_sanity_check(max_cpus) < 0) {
1184 printk(KERN_INFO "SMP disabled\n"); 1190 printk(KERN_INFO "SMP disabled\n");
1185 disable_smp(); 1191 disable_smp();
@@ -1187,9 +1193,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1187 } 1193 }
1188 1194
1189 preempt_disable(); 1195 preempt_disable();
1190 if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { 1196 if (read_apic_id() != boot_cpu_physical_apicid) {
1191 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1197 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1192 GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); 1198 read_apic_id(), boot_cpu_physical_apicid);
1193 /* Or can we switch back to PIC here? */ 1199 /* Or can we switch back to PIC here? */
1194 } 1200 }
1195 preempt_enable(); 1201 preempt_enable();
@@ -1255,39 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1255 check_nmi_watchdog(); 1261 check_nmi_watchdog();
1256} 1262}
1257 1263
1258#ifdef CONFIG_HOTPLUG_CPU
1259
1260static void remove_siblinginfo(int cpu)
1261{
1262 int sibling;
1263 struct cpuinfo_x86 *c = &cpu_data(cpu);
1264
1265 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1266 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1267 /*/
1268 * last thread sibling in this cpu core going down
1269 */
1270 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1271 cpu_data(sibling).booted_cores--;
1272 }
1273
1274 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1275 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1276 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1277 cpus_clear(per_cpu(cpu_core_map, cpu));
1278 c->phys_proc_id = 0;
1279 c->cpu_core_id = 0;
1280 cpu_clear(cpu, cpu_sibling_setup_map);
1281}
1282
1283static int additional_cpus __initdata = -1;
1284
1285static __init int setup_additional_cpus(char *s)
1286{
1287 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1288}
1289early_param("additional_cpus", setup_additional_cpus);
1290
1291/* 1264/*
1292 * cpu_possible_map should be static, it cannot change as cpu's 1265 * cpu_possible_map should be static, it cannot change as cpu's
1293 * are onlined, or offlined. The reason is per-cpu data-structures 1266 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1307,21 +1280,13 @@ early_param("additional_cpus", setup_additional_cpus);
1307 */ 1280 */
1308__init void prefill_possible_map(void) 1281__init void prefill_possible_map(void)
1309{ 1282{
1310 int i; 1283 int i, possible;
1311 int possible;
1312 1284
1313 /* no processor from mptable or madt */ 1285 /* no processor from mptable or madt */
1314 if (!num_processors) 1286 if (!num_processors)
1315 num_processors = 1; 1287 num_processors = 1;
1316 1288
1317 if (additional_cpus == -1) { 1289 possible = num_processors + disabled_cpus;
1318 if (disabled_cpus > 0)
1319 additional_cpus = disabled_cpus;
1320 else
1321 additional_cpus = 0;
1322 }
1323
1324 possible = num_processors + additional_cpus;
1325 if (possible > NR_CPUS) 1290 if (possible > NR_CPUS)
1326 possible = NR_CPUS; 1291 possible = NR_CPUS;
1327 1292
@@ -1334,6 +1299,31 @@ __init void prefill_possible_map(void)
1334 nr_cpu_ids = possible; 1299 nr_cpu_ids = possible;
1335} 1300}
1336 1301
1302#ifdef CONFIG_HOTPLUG_CPU
1303
1304static void remove_siblinginfo(int cpu)
1305{
1306 int sibling;
1307 struct cpuinfo_x86 *c = &cpu_data(cpu);
1308
1309 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1310 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1311 /*/
1312 * last thread sibling in this cpu core going down
1313 */
1314 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1315 cpu_data(sibling).booted_cores--;
1316 }
1317
1318 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1319 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1320 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1321 cpus_clear(per_cpu(cpu_core_map, cpu));
1322 c->phys_proc_id = 0;
1323 c->cpu_core_id = 0;
1324 cpu_clear(cpu, cpu_sibling_setup_map);
1325}
1326
1337static void __ref remove_cpu_from_maps(int cpu) 1327static void __ref remove_cpu_from_maps(int cpu)
1338{ 1328{
1339 cpu_clear(cpu, cpu_online_map); 1329 cpu_clear(cpu, cpu_online_map);
@@ -1344,25 +1334,9 @@ static void __ref remove_cpu_from_maps(int cpu)
1344 numa_remove_cpu(cpu); 1334 numa_remove_cpu(cpu);
1345} 1335}
1346 1336
1347int __cpu_disable(void) 1337void cpu_disable_common(void)
1348{ 1338{
1349 int cpu = smp_processor_id(); 1339 int cpu = smp_processor_id();
1350
1351 /*
1352 * Perhaps use cpufreq to drop frequency, but that could go
1353 * into generic code.
1354 *
1355 * We won't take down the boot processor on i386 due to some
1356 * interrupts only being able to be serviced by the BSP.
1357 * Especially so if we're not using an IOAPIC -zwane
1358 */
1359 if (cpu == 0)
1360 return -EBUSY;
1361
1362 if (nmi_watchdog == NMI_LOCAL_APIC)
1363 stop_apic_nmi_watchdog(NULL);
1364 clear_local_APIC();
1365
1366 /* 1340 /*
1367 * HACK: 1341 * HACK:
1368 * Allow any queued timer interrupts to get serviced 1342 * Allow any queued timer interrupts to get serviced
@@ -1380,10 +1354,32 @@ int __cpu_disable(void)
1380 remove_cpu_from_maps(cpu); 1354 remove_cpu_from_maps(cpu);
1381 unlock_vector_lock(); 1355 unlock_vector_lock();
1382 fixup_irqs(cpu_online_map); 1356 fixup_irqs(cpu_online_map);
1357}
1358
1359int native_cpu_disable(void)
1360{
1361 int cpu = smp_processor_id();
1362
1363 /*
1364 * Perhaps use cpufreq to drop frequency, but that could go
1365 * into generic code.
1366 *
1367 * We won't take down the boot processor on i386 due to some
1368 * interrupts only being able to be serviced by the BSP.
1369 * Especially so if we're not using an IOAPIC -zwane
1370 */
1371 if (cpu == 0)
1372 return -EBUSY;
1373
1374 if (nmi_watchdog == NMI_LOCAL_APIC)
1375 stop_apic_nmi_watchdog(NULL);
1376 clear_local_APIC();
1377
1378 cpu_disable_common();
1383 return 0; 1379 return 0;
1384} 1380}
1385 1381
1386void __cpu_die(unsigned int cpu) 1382void native_cpu_die(unsigned int cpu)
1387{ 1383{
1388 /* We don't do anything here: idle task is faking death itself. */ 1384 /* We don't do anything here: idle task is faking death itself. */
1389 unsigned int i; 1385 unsigned int i;
@@ -1400,15 +1396,45 @@ void __cpu_die(unsigned int cpu)
1400 } 1396 }
1401 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1397 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1402} 1398}
1399
1400void play_dead_common(void)
1401{
1402 idle_task_exit();
1403 reset_lazy_tlbstate();
1404 irq_ctx_exit(raw_smp_processor_id());
1405 c1e_remove_cpu(raw_smp_processor_id());
1406
1407 mb();
1408 /* Ack it */
1409 __get_cpu_var(cpu_state) = CPU_DEAD;
1410
1411 /*
1412 * With physical CPU hotplug, we should halt the cpu
1413 */
1414 local_irq_disable();
1415}
1416
1417void native_play_dead(void)
1418{
1419 play_dead_common();
1420 wbinvd_halt();
1421}
1422
1403#else /* ... !CONFIG_HOTPLUG_CPU */ 1423#else /* ... !CONFIG_HOTPLUG_CPU */
1404int __cpu_disable(void) 1424int native_cpu_disable(void)
1405{ 1425{
1406 return -ENOSYS; 1426 return -ENOSYS;
1407} 1427}
1408 1428
1409void __cpu_die(unsigned int cpu) 1429void native_cpu_die(unsigned int cpu)
1410{ 1430{
1411 /* We said "no" in __cpu_disable */ 1431 /* We said "no" in __cpu_disable */
1412 BUG(); 1432 BUG();
1413} 1433}
1434
1435void native_play_dead(void)
1436{
1437 BUG();
1438}
1439
1414#endif 1440#endif
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f044ba..7b987852e876 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/bios_ebda.h> 32#include <asm/bios_ebda.h>
33#include <asm/mach-summit/mach_mpparse.h> 33#include <asm/summit/mpparse.h>
34 34
35static struct rio_table_hdr *rio_table_hdr __initdata; 35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index bbecf8b6bf96..77b400f06ea2 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -47,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
47 unsigned long pc = instruction_pointer(regs); 47 unsigned long pc = instruction_pointer(regs);
48 48
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && 50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
51 in_lock_functions(pc)) {
52#ifdef CONFIG_FRAME_POINTER 51#ifdef CONFIG_FRAME_POINTER
53 return *(unsigned long *)(regs->bp + 4); 52 return *(unsigned long *)(regs->bp + sizeof(long));
54#else 53#else
55 unsigned long *sp = (unsigned long *)&regs->sp; 54 unsigned long *sp = (unsigned long *)&regs->sp;
56 55
@@ -95,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
95 94
96 do_timer_interrupt_hook(); 95 do_timer_interrupt_hook();
97 96
97#ifdef CONFIG_MCA
98 if (MCA_bus) { 98 if (MCA_bus) {
99 /* The PS/2 uses level-triggered interrupts. You can't 99 /* The PS/2 uses level-triggered interrupts. You can't
100 turn them off, nor would you want to (any attempt to 100 turn them off, nor would you want to (any attempt to
@@ -108,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
108 u8 irq_v = inb_p( 0x61 ); /* read the current state */ 108 u8 irq_v = inb_p( 0x61 ); /* read the current state */
109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
110 } 110 }
111#endif
111 112
112 return IRQ_HANDLED; 113 return IRQ_HANDLED;
113} 114}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e3d49c553af2..cb19d650c216 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/mca.h>
19 20
20#include <asm/i8253.h> 21#include <asm/i8253.h>
21#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs)
33 /* Assume the lock function has either no stack frame or a copy 34 /* Assume the lock function has either no stack frame or a copy
34 of flags from PUSHF 35 of flags from PUSHF
35 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
36 if (!user_mode(regs) && in_lock_functions(pc)) { 37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
37 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp = (unsigned long *)regs->sp;
38 if (sp[0] >> 22) 42 if (sp[0] >> 22)
39 return sp[0]; 43 return sp[0];
40 if (sp[1] >> 22) 44 if (sp[1] >> 22)
41 return sp[1]; 45 return sp[1];
46#endif
42 } 47 }
43 return pc; 48 return pc;
44} 49}
45EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
46 51
47static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 52irqreturn_t timer_interrupt(int irq, void *dev_id)
48{ 53{
49 add_pda(irq0_irqs, 1); 54 add_pda(irq0_irqs, 1);
50 55
51 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
52 57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
53 return IRQ_HANDLED; 65 return IRQ_HANDLED;
54} 66}
55 67
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void)
100} 112}
101 113
102static struct irqaction irq0 = { 114static struct irqaction irq0 = {
103 .handler = timer_event_interrupt, 115 .handler = timer_interrupt,
104 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, 116 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
105 .mask = CPU_MASK_NONE, 117 .mask = CPU_MASK_NONE,
106 .name = "timer" 118 .name = "timer"
@@ -111,16 +123,13 @@ void __init hpet_time_init(void)
111 if (!hpet_enable()) 123 if (!hpet_enable())
112 setup_pit_timer(); 124 setup_pit_timer();
113 125
126 irq0.mask = cpumask_of_cpu(0);
114 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
115} 128}
116 129
117void __init time_init(void) 130void __init time_init(void)
118{ 131{
119 tsc_init(); 132 tsc_init();
120 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
121 vgetcpu_mode = VGETCPU_RDTSCP;
122 else
123 vgetcpu_mode = VGETCPU_LSL;
124 133
125 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
126} 135}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 03df8e45e5a1..e062974cce34 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -7,13 +7,11 @@
7 */ 7 */
8 8
9/* 9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * Handle hardware traps and faults.
11 * state in 'asm.s'.
12 */ 11 */
13#include <linux/interrupt.h> 12#include <linux/interrupt.h>
14#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16#include <linux/highmem.h>
17#include <linux/kprobes.h> 15#include <linux/kprobes.h>
18#include <linux/uaccess.h> 16#include <linux/uaccess.h>
19#include <linux/utsname.h> 17#include <linux/utsname.h>
@@ -32,6 +30,8 @@
32#include <linux/bug.h> 30#include <linux/bug.h>
33#include <linux/nmi.h> 31#include <linux/nmi.h>
34#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/smp.h>
34#include <linux/io.h>
35 35
36#ifdef CONFIG_EISA 36#ifdef CONFIG_EISA
37#include <linux/ioport.h> 37#include <linux/ioport.h>
@@ -46,21 +46,31 @@
46#include <linux/edac.h> 46#include <linux/edac.h>
47#endif 47#endif
48 48
49#include <asm/arch_hooks.h>
50#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
51#include <asm/processor.h> 50#include <asm/processor.h>
52#include <asm/debugreg.h> 51#include <asm/debugreg.h>
53#include <asm/atomic.h> 52#include <asm/atomic.h>
54#include <asm/system.h> 53#include <asm/system.h>
55#include <asm/unwind.h> 54#include <asm/unwind.h>
55#include <asm/traps.h>
56#include <asm/desc.h> 56#include <asm/desc.h>
57#include <asm/i387.h> 57#include <asm/i387.h>
58
59#include <mach_traps.h>
60
61#ifdef CONFIG_X86_64
62#include <asm/pgalloc.h>
63#include <asm/proto.h>
64#include <asm/pda.h>
65#else
66#include <asm/processor-flags.h>
67#include <asm/arch_hooks.h>
58#include <asm/nmi.h> 68#include <asm/nmi.h>
59#include <asm/smp.h> 69#include <asm/smp.h>
60#include <asm/io.h> 70#include <asm/io.h>
61#include <asm/traps.h> 71#include <asm/traps.h>
62 72
63#include "mach_traps.h" 73#include "cpu/mcheck/mce.h"
64 74
65DECLARE_BITMAP(used_vectors, NR_VECTORS); 75DECLARE_BITMAP(used_vectors, NR_VECTORS);
66EXPORT_SYMBOL_GPL(used_vectors); 76EXPORT_SYMBOL_GPL(used_vectors);
@@ -77,418 +87,104 @@ char ignore_fpu_irq;
77 */ 87 */
78gate_desc idt_table[256] 88gate_desc idt_table[256]
79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80
81int panic_on_unrecovered_nmi;
82int kstack_depth_to_print = 24;
83static unsigned int code_bytes = 64;
84static int ignore_nmis;
85static int die_counter;
86
87void printk_address(unsigned long address, int reliable)
88{
89#ifdef CONFIG_KALLSYMS
90 unsigned long offset = 0;
91 unsigned long symsize;
92 const char *symname;
93 char *modname;
94 char *delim = ":";
95 char namebuf[KSYM_NAME_LEN];
96 char reliab[4] = "";
97
98 symname = kallsyms_lookup(address, &symsize, &offset,
99 &modname, namebuf);
100 if (!symname) {
101 printk(" [<%08lx>]\n", address);
102 return;
103 }
104 if (!reliable)
105 strcpy(reliab, "? ");
106
107 if (!modname)
108 modname = delim = "";
109 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
110 address, reliab, delim, modname, delim, symname, offset, symsize);
111#else
112 printk(" [<%08lx>]\n", address);
113#endif 90#endif
114}
115
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size)
118{
119 void *t = tinfo;
120 return p > t && p <= t + THREAD_SIZE - size;
121}
122
123/* The form of the top of the frame on the stack */
124struct stack_frame {
125 struct stack_frame *next_frame;
126 unsigned long return_address;
127};
128
129static inline unsigned long
130print_context_stack(struct thread_info *tinfo,
131 unsigned long *stack, unsigned long bp,
132 const struct stacktrace_ops *ops, void *data)
133{
134 struct stack_frame *frame = (struct stack_frame *)bp;
135
136 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
137 unsigned long addr;
138
139 addr = *stack;
140 if (__kernel_text_address(addr)) {
141 if ((unsigned long) stack == bp + 4) {
142 ops->address(data, addr, 1);
143 frame = frame->next_frame;
144 bp = (unsigned long) frame;
145 } else {
146 ops->address(data, addr, bp == 0);
147 }
148 }
149 stack++;
150 }
151 return bp;
152}
153
154void dump_trace(struct task_struct *task, struct pt_regs *regs,
155 unsigned long *stack, unsigned long bp,
156 const struct stacktrace_ops *ops, void *data)
157{
158 if (!task)
159 task = current;
160
161 if (!stack) {
162 unsigned long dummy;
163 stack = &dummy;
164 if (task != current)
165 stack = (unsigned long *)task->thread.sp;
166 }
167
168#ifdef CONFIG_FRAME_POINTER
169 if (!bp) {
170 if (task == current) {
171 /* Grab bp right from our regs */
172 asm("movl %%ebp, %0" : "=r" (bp) :);
173 } else {
174 /* bp is the last reg pushed by switch_to */
175 bp = *(unsigned long *) task->thread.sp;
176 }
177 }
178#endif
179
180 for (;;) {
181 struct thread_info *context;
182
183 context = (struct thread_info *)
184 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
185 bp = print_context_stack(context, stack, bp, ops, data);
186 /*
187 * Should be after the line below, but somewhere
188 * in early boot context comes out corrupted and we
189 * can't reference it:
190 */
191 if (ops->stack(data, "IRQ") < 0)
192 break;
193 stack = (unsigned long *)context->previous_esp;
194 if (!stack)
195 break;
196 touch_nmi_watchdog();
197 }
198}
199EXPORT_SYMBOL(dump_trace);
200
201static void
202print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
203{
204 printk(data);
205 print_symbol(msg, symbol);
206 printk("\n");
207}
208
209static void print_trace_warning(void *data, char *msg)
210{
211 printk("%s%s\n", (char *)data, msg);
212}
213 91
214static int print_trace_stack(void *data, char *name) 92static int ignore_nmis;
215{
216 return 0;
217}
218
219/*
220 * Print one address/symbol entries per line.
221 */
222static void print_trace_address(void *data, unsigned long addr, int reliable)
223{
224 printk("%s [<%08lx>] ", (char *)data, addr);
225 if (!reliable)
226 printk("? ");
227 print_symbol("%s\n", addr);
228 touch_nmi_watchdog();
229}
230
231static const struct stacktrace_ops print_trace_ops = {
232 .warning = print_trace_warning,
233 .warning_symbol = print_trace_warning_symbol,
234 .stack = print_trace_stack,
235 .address = print_trace_address,
236};
237 93
238static void 94static inline void conditional_sti(struct pt_regs *regs)
239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 unsigned long *stack, unsigned long bp, char *log_lvl)
241{ 95{
242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 96 if (regs->flags & X86_EFLAGS_IF)
243 printk("%s =======================\n", log_lvl); 97 local_irq_enable();
244} 98}
245 99
246void show_trace(struct task_struct *task, struct pt_regs *regs, 100static inline void preempt_conditional_sti(struct pt_regs *regs)
247 unsigned long *stack, unsigned long bp)
248{ 101{
249 show_trace_log_lvl(task, regs, stack, bp, ""); 102 inc_preempt_count();
103 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_enable();
250} 105}
251 106
252static void 107static inline void preempt_conditional_cli(struct pt_regs *regs)
253show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
254 unsigned long *sp, unsigned long bp, char *log_lvl)
255{ 108{
256 unsigned long *stack; 109 if (regs->flags & X86_EFLAGS_IF)
257 int i; 110 local_irq_disable();
258 111 dec_preempt_count();
259 if (sp == NULL) {
260 if (task)
261 sp = (unsigned long *)task->thread.sp;
262 else
263 sp = (unsigned long *)&sp;
264 }
265
266 stack = sp;
267 for (i = 0; i < kstack_depth_to_print; i++) {
268 if (kstack_end(stack))
269 break;
270 if (i && ((i % 8) == 0))
271 printk("\n%s ", log_lvl);
272 printk("%08lx ", *stack++);
273 }
274 printk("\n%sCall Trace:\n", log_lvl);
275
276 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
277} 112}
278 113
279void show_stack(struct task_struct *task, unsigned long *sp) 114#ifdef CONFIG_X86_32
115static inline void
116die_if_kernel(const char *str, struct pt_regs *regs, long err)
280{ 117{
281 printk(" "); 118 if (!user_mode_vm(regs))
282 show_stack_log_lvl(task, NULL, sp, 0, ""); 119 die(str, regs, err);
283} 120}
284 121
285/* 122/*
286 * The architecture-independent dump_stack generator 123 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
124 * invalid offset set (the LAZY one) and the faulting thread has
125 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
126 * we set the offset field correctly and return 1.
287 */ 127 */
288void dump_stack(void) 128static int lazy_iobitmap_copy(void)
289{ 129{
290 unsigned long bp = 0; 130 struct thread_struct *thread;
291 unsigned long stack; 131 struct tss_struct *tss;
292 132 int cpu;
293#ifdef CONFIG_FRAME_POINTER
294 if (!bp)
295 asm("movl %%ebp, %0" : "=r" (bp):);
296#endif
297
298 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
299 current->pid, current->comm, print_tainted(),
300 init_utsname()->release,
301 (int)strcspn(init_utsname()->version, " "),
302 init_utsname()->version);
303
304 show_trace(current, NULL, &stack, bp);
305}
306
307EXPORT_SYMBOL(dump_stack);
308
309void show_registers(struct pt_regs *regs)
310{
311 int i;
312 133
313 print_modules(); 134 cpu = get_cpu();
314 __show_registers(regs, 0); 135 tss = &per_cpu(init_tss, cpu);
136 thread = &current->thread;
315 137
316 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 138 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
317 TASK_COMM_LEN, current->comm, task_pid_nr(current), 139 thread->io_bitmap_ptr) {
318 current_thread_info(), current, task_thread_info(current)); 140 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
319 /* 141 thread->io_bitmap_max);
320 * When in-kernel, we also print out the stack and code at the 142 /*
321 * time of the fault.. 143 * If the previously set map was extending to higher ports
322 */ 144 * than the current one, pad extra space with 0xff (no access).
323 if (!user_mode_vm(regs)) { 145 */
324 unsigned int code_prologue = code_bytes * 43 / 64; 146 if (thread->io_bitmap_max < tss->io_bitmap_max) {
325 unsigned int code_len = code_bytes; 147 memset((char *) tss->io_bitmap +
326 unsigned char c; 148 thread->io_bitmap_max, 0xff,
327 u8 *ip; 149 tss->io_bitmap_max - thread->io_bitmap_max);
328
329 printk("\n" KERN_EMERG "Stack: ");
330 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
331
332 printk(KERN_EMERG "Code: ");
333
334 ip = (u8 *)regs->ip - code_prologue;
335 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
336 /* try starting at EIP */
337 ip = (u8 *)regs->ip;
338 code_len = code_len - code_prologue + 1;
339 }
340 for (i = 0; i < code_len; i++, ip++) {
341 if (ip < (u8 *)PAGE_OFFSET ||
342 probe_kernel_address(ip, c)) {
343 printk(" Bad EIP value.");
344 break;
345 }
346 if (ip == (u8 *)regs->ip)
347 printk("<%02x> ", c);
348 else
349 printk("%02x ", c);
350 } 150 }
351 } 151 tss->io_bitmap_max = thread->io_bitmap_max;
352 printk("\n"); 152 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
353} 153 tss->io_bitmap_owner = thread;
354 154 put_cpu();
355int is_valid_bugaddr(unsigned long ip)
356{
357 unsigned short ud2;
358
359 if (ip < PAGE_OFFSET)
360 return 0;
361 if (probe_kernel_address((unsigned short *)ip, ud2))
362 return 0;
363
364 return ud2 == 0x0b0f;
365}
366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
416{
417 unsigned short ss;
418 unsigned long sp;
419 155
420 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
421#ifdef CONFIG_PREEMPT
422 printk("PREEMPT ");
423#endif
424#ifdef CONFIG_SMP
425 printk("SMP ");
426#endif
427#ifdef CONFIG_DEBUG_PAGEALLOC
428 printk("DEBUG_PAGEALLOC");
429#endif
430 printk("\n");
431 if (notify_die(DIE_OOPS, str, regs, err,
432 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
433 return 1; 156 return 1;
434
435 show_registers(regs);
436 /* Executive summary in case the oops scrolled away */
437 sp = (unsigned long) (&regs->sp);
438 savesegment(ss, ss);
439 if (user_mode(regs)) {
440 sp = regs->sp;
441 ss = regs->ss & 0xffff;
442 } 157 }
443 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 158 put_cpu();
444 print_symbol("%s", regs->ip);
445 printk(" SS:ESP %04x:%08lx\n", ss, sp);
446 return 0;
447}
448
449/*
450 * This is gone through when something in the kernel has done something bad
451 * and is about to be terminated:
452 */
453void die(const char *str, struct pt_regs *regs, long err)
454{
455 unsigned long flags = oops_begin();
456
457 if (die_nest_count < 3) {
458 report_bug(regs->ip, regs);
459
460 if (__die(str, regs, err))
461 regs = NULL;
462 } else {
463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
464 }
465
466 oops_end(flags, regs, SIGSEGV);
467}
468 159
469static inline void 160 return 0;
470die_if_kernel(const char *str, struct pt_regs *regs, long err)
471{
472 if (!user_mode_vm(regs))
473 die(str, regs, err);
474} 161}
162#endif
475 163
476static void __kprobes 164static void __kprobes
477do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, 165do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
478 long error_code, siginfo_t *info) 166 long error_code, siginfo_t *info)
479{ 167{
480 struct task_struct *tsk = current; 168 struct task_struct *tsk = current;
481 169
170#ifdef CONFIG_X86_32
482 if (regs->flags & X86_VM_MASK) { 171 if (regs->flags & X86_VM_MASK) {
483 if (vm86) 172 /*
173 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
174 * On nmi (interrupt 2), do_trap should not be called.
175 */
176 if (trapnr < 6)
484 goto vm86_trap; 177 goto vm86_trap;
485 goto trap_signal; 178 goto trap_signal;
486 } 179 }
180#endif
487 181
488 if (!user_mode(regs)) 182 if (!user_mode(regs))
489 goto kernel_trap; 183 goto kernel_trap;
490 184
185#ifdef CONFIG_X86_32
491trap_signal: 186trap_signal:
187#endif
492 /* 188 /*
493 * We want error_code and trap_no set for userspace faults and 189 * We want error_code and trap_no set for userspace faults and
494 * kernelspace faults which result in die(), but not 190 * kernelspace faults which result in die(), but not
@@ -501,6 +197,18 @@ trap_signal:
501 tsk->thread.error_code = error_code; 197 tsk->thread.error_code = error_code;
502 tsk->thread.trap_no = trapnr; 198 tsk->thread.trap_no = trapnr;
503 199
200#ifdef CONFIG_X86_64
201 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
202 printk_ratelimit()) {
203 printk(KERN_INFO
204 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
205 tsk->comm, tsk->pid, str,
206 regs->ip, regs->sp, error_code);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
210#endif
211
504 if (info) 212 if (info)
505 force_sig_info(signr, info, tsk); 213 force_sig_info(signr, info, tsk);
506 else 214 else
@@ -515,29 +223,29 @@ kernel_trap:
515 } 223 }
516 return; 224 return;
517 225
226#ifdef CONFIG_X86_32
518vm86_trap: 227vm86_trap:
519 if (handle_vm86_trap((struct kernel_vm86_regs *) regs, 228 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
520 error_code, trapnr)) 229 error_code, trapnr))
521 goto trap_signal; 230 goto trap_signal;
522 return; 231 return;
232#endif
523} 233}
524 234
525#define DO_ERROR(trapnr, signr, str, name) \ 235#define DO_ERROR(trapnr, signr, str, name) \
526void do_##name(struct pt_regs *regs, long error_code) \ 236dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
527{ \ 237{ \
528 trace_hardirqs_fixup(); \
529 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 238 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
530 == NOTIFY_STOP) \ 239 == NOTIFY_STOP) \
531 return; \ 240 return; \
532 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ 241 conditional_sti(regs); \
242 do_trap(trapnr, signr, str, regs, error_code, NULL); \
533} 243}
534 244
535#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ 245#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
536void do_##name(struct pt_regs *regs, long error_code) \ 246dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
537{ \ 247{ \
538 siginfo_t info; \ 248 siginfo_t info; \
539 if (irq) \
540 local_irq_enable(); \
541 info.si_signo = signr; \ 249 info.si_signo = signr; \
542 info.si_errno = 0; \ 250 info.si_errno = 0; \
543 info.si_code = sicode; \ 251 info.si_code = sicode; \
@@ -545,90 +253,68 @@ void do_##name(struct pt_regs *regs, long error_code) \
545 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 253 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
546 == NOTIFY_STOP) \ 254 == NOTIFY_STOP) \
547 return; \ 255 return; \
548 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 256 conditional_sti(regs); \
257 do_trap(trapnr, signr, str, regs, error_code, &info); \
549} 258}
550 259
551#define DO_VM86_ERROR(trapnr, signr, str, name) \ 260DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
552void do_##name(struct pt_regs *regs, long error_code) \ 261DO_ERROR(4, SIGSEGV, "overflow", overflow)
553{ \ 262DO_ERROR(5, SIGSEGV, "bounds", bounds)
554 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 263DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
555 == NOTIFY_STOP) \
556 return; \
557 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
558}
559
560#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
561void do_##name(struct pt_regs *regs, long error_code) \
562{ \
563 siginfo_t info; \
564 info.si_signo = signr; \
565 info.si_errno = 0; \
566 info.si_code = sicode; \
567 info.si_addr = (void __user *)siaddr; \
568 trace_hardirqs_fixup(); \
569 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
570 == NOTIFY_STOP) \
571 return; \
572 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
573}
574
575DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
576#ifndef CONFIG_KPROBES
577DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
578#endif
579DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
580DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
581DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
582DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 264DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
583DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 265DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
584DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 266DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
267#ifdef CONFIG_X86_32
585DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 268DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
586DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 269#endif
587DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) 270DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
271
272#ifdef CONFIG_X86_64
273/* Runs on IST stack */
274dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
275{
276 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
277 12, SIGBUS) == NOTIFY_STOP)
278 return;
279 preempt_conditional_sti(regs);
280 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
281 preempt_conditional_cli(regs);
282}
283
284dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
285{
286 static const char str[] = "double fault";
287 struct task_struct *tsk = current;
288
289 /* Return not checked because double check cannot be ignored */
290 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
588 291
589void __kprobes 292 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8;
294
295 /* This is always a kernel trap and never fixable (and thus must
296 never return). */
297 for (;;)
298 die(str, regs, error_code);
299}
300#endif
301
302dotraplinkage void __kprobes
590do_general_protection(struct pt_regs *regs, long error_code) 303do_general_protection(struct pt_regs *regs, long error_code)
591{ 304{
592 struct task_struct *tsk; 305 struct task_struct *tsk;
593 struct thread_struct *thread;
594 struct tss_struct *tss;
595 int cpu;
596 306
597 cpu = get_cpu(); 307 conditional_sti(regs);
598 tss = &per_cpu(init_tss, cpu);
599 thread = &current->thread;
600
601 /*
602 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
603 * invalid offset set (the LAZY one) and the faulting thread has
604 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
605 * and we set the offset field correctly. Then we let the CPU to
606 * restart the faulting instruction.
607 */
608 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
609 thread->io_bitmap_ptr) {
610 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
611 thread->io_bitmap_max);
612 /*
613 * If the previously set map was extending to higher ports
614 * than the current one, pad extra space with 0xff (no access).
615 */
616 if (thread->io_bitmap_max < tss->io_bitmap_max) {
617 memset((char *) tss->io_bitmap +
618 thread->io_bitmap_max, 0xff,
619 tss->io_bitmap_max - thread->io_bitmap_max);
620 }
621 tss->io_bitmap_max = thread->io_bitmap_max;
622 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
623 tss->io_bitmap_owner = thread;
624 put_cpu();
625 308
309#ifdef CONFIG_X86_32
310 if (lazy_iobitmap_copy()) {
311 /* restart the faulting instruction */
626 return; 312 return;
627 } 313 }
628 put_cpu();
629 314
630 if (regs->flags & X86_VM_MASK) 315 if (regs->flags & X86_VM_MASK)
631 goto gp_in_vm86; 316 goto gp_in_vm86;
317#endif
632 318
633 tsk = current; 319 tsk = current;
634 if (!user_mode(regs)) 320 if (!user_mode(regs))
@@ -650,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
650 force_sig(SIGSEGV, tsk); 336 force_sig(SIGSEGV, tsk);
651 return; 337 return;
652 338
339#ifdef CONFIG_X86_32
653gp_in_vm86: 340gp_in_vm86:
654 local_irq_enable(); 341 local_irq_enable();
655 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 342 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
656 return; 343 return;
344#endif
657 345
658gp_in_kernel: 346gp_in_kernel:
659 if (fixup_exception(regs)) 347 if (fixup_exception(regs))
@@ -690,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
690 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 378 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
691 379
692 /* Clear and disable the memory parity error line. */ 380 /* Clear and disable the memory parity error line. */
693 clear_mem_error(reason); 381 reason = (reason & 0xf) | 4;
382 outb(reason, 0x61);
694} 383}
695 384
696static notrace __kprobes void 385static notrace __kprobes void
@@ -716,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
716static notrace __kprobes void 405static notrace __kprobes void
717unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 406unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
718{ 407{
719 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 408 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
409 NOTIFY_STOP)
720 return; 410 return;
721#ifdef CONFIG_MCA 411#ifdef CONFIG_MCA
722 /* 412 /*
@@ -739,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
739 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 429 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
740} 430}
741 431
742static DEFINE_SPINLOCK(nmi_print_lock);
743
744void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
745{
746 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
747 return;
748
749 spin_lock(&nmi_print_lock);
750 /*
751 * We are in trouble anyway, lets at least try
752 * to get a message out:
753 */
754 bust_spinlocks(1);
755 printk(KERN_EMERG "%s", str);
756 printk(" on CPU%d, ip %08lx, registers:\n",
757 smp_processor_id(), regs->ip);
758 show_registers(regs);
759 if (do_panic)
760 panic("Non maskable interrupt");
761 console_silent();
762 spin_unlock(&nmi_print_lock);
763 bust_spinlocks(0);
764
765 /*
766 * If we are in kernel we are probably nested up pretty bad
767 * and might aswell get out now while we still can:
768 */
769 if (!user_mode_vm(regs)) {
770 current->thread.trap_no = 2;
771 crash_kexec(regs);
772 }
773
774 do_exit(SIGSEGV);
775}
776
777static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 432static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
778{ 433{
779 unsigned char reason = 0; 434 unsigned char reason = 0;
@@ -812,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
812 mem_parity_error(reason, regs); 467 mem_parity_error(reason, regs);
813 if (reason & 0x40) 468 if (reason & 0x40)
814 io_check_error(reason, regs); 469 io_check_error(reason, regs);
470#ifdef CONFIG_X86_32
815 /* 471 /*
816 * Reassert NMI in case it became active meanwhile 472 * Reassert NMI in case it became active meanwhile
817 * as it's edge-triggered: 473 * as it's edge-triggered:
818 */ 474 */
819 reassert_nmi(); 475 reassert_nmi();
476#endif
820} 477}
821 478
822notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) 479dotraplinkage notrace __kprobes void
480do_nmi(struct pt_regs *regs, long error_code)
823{ 481{
824 int cpu;
825
826 nmi_enter(); 482 nmi_enter();
827 483
828 cpu = smp_processor_id(); 484#ifdef CONFIG_X86_32
829 485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
830 ++nmi_count(cpu); 486#else
487 add_pda(__nmi_count, 1);
488#endif
831 489
832 if (!ignore_nmis) 490 if (!ignore_nmis)
833 default_do_nmi(regs); 491 default_do_nmi(regs);
@@ -847,21 +505,44 @@ void restart_nmi(void)
847 acpi_nmi_enable(); 505 acpi_nmi_enable();
848} 506}
849 507
850#ifdef CONFIG_KPROBES 508/* May run on IST stack. */
851void __kprobes do_int3(struct pt_regs *regs, long error_code) 509dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
852{ 510{
853 trace_hardirqs_fixup(); 511#ifdef CONFIG_KPROBES
854
855 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 512 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
856 == NOTIFY_STOP) 513 == NOTIFY_STOP)
857 return; 514 return;
858 /* 515#else
859 * This is an interrupt gate, because kprobes wants interrupts 516 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
860 * disabled. Normal trap handlers don't. 517 == NOTIFY_STOP)
861 */ 518 return;
862 restore_interrupts(regs); 519#endif
520
521 preempt_conditional_sti(regs);
522 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
523 preempt_conditional_cli(regs);
524}
863 525
864 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 526#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack
528 for scheduling or signal handling. The actual stack switch is done in
529 entry.S */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{
532 struct pt_regs *regs = eregs;
533 /* Did already sync */
534 if (eregs == (struct pt_regs *)eregs->sp)
535 ;
536 /* Exception from user space */
537 else if (user_mode(eregs))
538 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to
540 kernel process stack. */
541 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs)
544 *regs = *eregs;
545 return regs;
865} 546}
866#endif 547#endif
867 548
@@ -886,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
886 * about restoring all the debug state, and ptrace doesn't have to 567 * about restoring all the debug state, and ptrace doesn't have to
887 * find every occurrence of the TF bit that could be saved away even 568 * find every occurrence of the TF bit that could be saved away even
888 * by user code) 569 * by user code)
570 *
571 * May run on IST stack.
889 */ 572 */
890void __kprobes do_debug(struct pt_regs *regs, long error_code) 573dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
891{ 574{
892 struct task_struct *tsk = current; 575 struct task_struct *tsk = current;
893 unsigned int condition; 576 unsigned long condition;
894 577 int si_code;
895 trace_hardirqs_fixup();
896 578
897 get_debugreg(condition, 6); 579 get_debugreg(condition, 6);
898 580
@@ -905,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
905 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 587 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
906 SIGTRAP) == NOTIFY_STOP) 588 SIGTRAP) == NOTIFY_STOP)
907 return; 589 return;
590
908 /* It's safe to allow irq's after DR6 has been saved */ 591 /* It's safe to allow irq's after DR6 has been saved */
909 if (regs->flags & X86_EFLAGS_IF) 592 preempt_conditional_sti(regs);
910 local_irq_enable();
911 593
912 /* Mask out spurious debug traps due to lazy DR7 setting */ 594 /* Mask out spurious debug traps due to lazy DR7 setting */
913 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 595 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -915,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
915 goto clear_dr7; 597 goto clear_dr7;
916 } 598 }
917 599
600#ifdef CONFIG_X86_32
918 if (regs->flags & X86_VM_MASK) 601 if (regs->flags & X86_VM_MASK)
919 goto debug_vm86; 602 goto debug_vm86;
603#endif
920 604
921 /* Save debug status register where ptrace can see it */ 605 /* Save debug status register where ptrace can see it */
922 tsk->thread.debugreg6 = condition; 606 tsk->thread.debugreg6 = condition;
@@ -926,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
926 * kernel space (but re-enable TF when returning to user mode). 610 * kernel space (but re-enable TF when returning to user mode).
927 */ 611 */
928 if (condition & DR_STEP) { 612 if (condition & DR_STEP) {
929 /*
930 * We already checked v86 mode above, so we can
931 * check for kernel mode by just checking the CPL
932 * of CS.
933 */
934 if (!user_mode(regs)) 613 if (!user_mode(regs))
935 goto clear_TF_reenable; 614 goto clear_TF_reenable;
936 } 615 }
937 616
617 si_code = get_si_code(condition);
938 /* Ok, finally something we can handle */ 618 /* Ok, finally something we can handle */
939 send_sigtrap(tsk, regs, error_code); 619 send_sigtrap(tsk, regs, error_code, si_code);
940 620
941 /* 621 /*
942 * Disable additional traps. They'll be re-enabled when 622 * Disable additional traps. They'll be re-enabled when
@@ -944,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
944 */ 624 */
945clear_dr7: 625clear_dr7:
946 set_debugreg(0, 7); 626 set_debugreg(0, 7);
627 preempt_conditional_cli(regs);
947 return; 628 return;
948 629
630#ifdef CONFIG_X86_32
949debug_vm86: 631debug_vm86:
950 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 632 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
633 preempt_conditional_cli(regs);
951 return; 634 return;
635#endif
952 636
953clear_TF_reenable: 637clear_TF_reenable:
954 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 638 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
955 regs->flags &= ~X86_EFLAGS_TF; 639 regs->flags &= ~X86_EFLAGS_TF;
640 preempt_conditional_cli(regs);
956 return; 641 return;
957} 642}
958 643
644#ifdef CONFIG_X86_64
645static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
646{
647 if (fixup_exception(regs))
648 return 1;
649
650 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
651 /* Illegal floating point operation in the kernel */
652 current->thread.trap_no = trapnr;
653 die(str, regs, 0);
654 return 0;
655}
656#endif
657
959/* 658/*
960 * Note that we play around with the 'TS' bit in an attempt to get 659 * Note that we play around with the 'TS' bit in an attempt to get
961 * the correct behaviour even in the presence of the asynchronous 660 * the correct behaviour even in the presence of the asynchronous
@@ -992,7 +691,9 @@ void math_error(void __user *ip)
992 swd = get_fpu_swd(task); 691 swd = get_fpu_swd(task);
993 switch (swd & ~cwd & 0x3f) { 692 switch (swd & ~cwd & 0x3f) {
994 case 0x000: /* No unmasked exception */ 693 case 0x000: /* No unmasked exception */
694#ifdef CONFIG_X86_32
995 return; 695 return;
696#endif
996 default: /* Multiple exceptions */ 697 default: /* Multiple exceptions */
997 break; 698 break;
998 case 0x001: /* Invalid Op */ 699 case 0x001: /* Invalid Op */
@@ -1020,9 +721,18 @@ void math_error(void __user *ip)
1020 force_sig_info(SIGFPE, &info, task); 721 force_sig_info(SIGFPE, &info, task);
1021} 722}
1022 723
1023void do_coprocessor_error(struct pt_regs *regs, long error_code) 724dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
1024{ 725{
726 conditional_sti(regs);
727
728#ifdef CONFIG_X86_32
1025 ignore_fpu_irq = 1; 729 ignore_fpu_irq = 1;
730#else
731 if (!user_mode(regs) &&
732 kernel_math_error(regs, "kernel x87 math error", 16))
733 return;
734#endif
735
1026 math_error((void __user *)regs->ip); 736 math_error((void __user *)regs->ip);
1027} 737}
1028 738
@@ -1074,8 +784,12 @@ static void simd_math_error(void __user *ip)
1074 force_sig_info(SIGFPE, &info, task); 784 force_sig_info(SIGFPE, &info, task);
1075} 785}
1076 786
1077void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 787dotraplinkage void
788do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1078{ 789{
790 conditional_sti(regs);
791
792#ifdef CONFIG_X86_32
1079 if (cpu_has_xmm) { 793 if (cpu_has_xmm) {
1080 /* Handle SIMD FPU exceptions on PIII+ processors. */ 794 /* Handle SIMD FPU exceptions on PIII+ processors. */
1081 ignore_fpu_irq = 1; 795 ignore_fpu_irq = 1;
@@ -1094,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1094 current->thread.error_code = error_code; 808 current->thread.error_code = error_code;
1095 die_if_kernel("cache flush denied", regs, error_code); 809 die_if_kernel("cache flush denied", regs, error_code);
1096 force_sig(SIGSEGV, current); 810 force_sig(SIGSEGV, current);
811#else
812 if (!user_mode(regs) &&
813 kernel_math_error(regs, "kernel simd math error", 19))
814 return;
815 simd_math_error((void __user *)regs->ip);
816#endif
1097} 817}
1098 818
1099void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 819dotraplinkage void
820do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1100{ 821{
822 conditional_sti(regs);
1101#if 0 823#if 0
1102 /* No need to warn about this any longer. */ 824 /* No need to warn about this any longer. */
1103 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 825 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1104#endif 826#endif
1105} 827}
1106 828
829#ifdef CONFIG_X86_32
1107unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) 830unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1108{ 831{
1109 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); 832 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
@@ -1122,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1122 845
1123 return new_kesp; 846 return new_kesp;
1124} 847}
848#else
849asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
850{
851}
852
853asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
854{
855}
856#endif
1125 857
1126/* 858/*
1127 * 'math_state_restore()' saves the current math information in the 859 * 'math_state_restore()' saves the current math information in the
@@ -1154,14 +886,24 @@ asmlinkage void math_state_restore(void)
1154 } 886 }
1155 887
1156 clts(); /* Allow maths ops (or we recurse) */ 888 clts(); /* Allow maths ops (or we recurse) */
889#ifdef CONFIG_X86_32
1157 restore_fpu(tsk); 890 restore_fpu(tsk);
891#else
892 /*
893 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
894 */
895 if (unlikely(restore_fpu_checking(tsk))) {
896 stts();
897 force_sig(SIGSEGV, tsk);
898 return;
899 }
900#endif
1158 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 901 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1159 tsk->fpu_counter++; 902 tsk->fpu_counter++;
1160} 903}
1161EXPORT_SYMBOL_GPL(math_state_restore); 904EXPORT_SYMBOL_GPL(math_state_restore);
1162 905
1163#ifndef CONFIG_MATH_EMULATION 906#ifndef CONFIG_MATH_EMULATION
1164
1165asmlinkage void math_emulate(long arg) 907asmlinkage void math_emulate(long arg)
1166{ 908{
1167 printk(KERN_EMERG 909 printk(KERN_EMERG
@@ -1170,12 +912,54 @@ asmlinkage void math_emulate(long arg)
1170 force_sig(SIGFPE, current); 912 force_sig(SIGFPE, current);
1171 schedule(); 913 schedule();
1172} 914}
1173
1174#endif /* CONFIG_MATH_EMULATION */ 915#endif /* CONFIG_MATH_EMULATION */
1175 916
917dotraplinkage void __kprobes
918do_device_not_available(struct pt_regs *regs, long error)
919{
920#ifdef CONFIG_X86_32
921 if (read_cr0() & X86_CR0_EM) {
922 conditional_sti(regs);
923 math_emulate(0);
924 } else {
925 math_state_restore(); /* interrupts still off */
926 conditional_sti(regs);
927 }
928#else
929 math_state_restore();
930#endif
931}
932
933#ifdef CONFIG_X86_32
934#ifdef CONFIG_X86_MCE
935dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error)
936{
937 conditional_sti(regs);
938 machine_check_vector(regs, error);
939}
940#endif
941
942dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
943{
944 siginfo_t info;
945 local_irq_enable();
946
947 info.si_signo = SIGILL;
948 info.si_errno = 0;
949 info.si_code = ILL_BADSTK;
950 info.si_addr = 0;
951 if (notify_die(DIE_TRAP, "iret exception",
952 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
953 return;
954 do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
955}
956#endif
957
1176void __init trap_init(void) 958void __init trap_init(void)
1177{ 959{
960#ifdef CONFIG_X86_32
1178 int i; 961 int i;
962#endif
1179 963
1180#ifdef CONFIG_EISA 964#ifdef CONFIG_EISA
1181 void __iomem *p = early_ioremap(0x0FFFD9, 4); 965 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1185,29 +969,40 @@ void __init trap_init(void)
1185 early_iounmap(p, 4); 969 early_iounmap(p, 4);
1186#endif 970#endif
1187 971
1188 set_trap_gate(0, &divide_error); 972 set_intr_gate(0, &divide_error);
1189 set_intr_gate(1, &debug); 973 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1190 set_intr_gate(2, &nmi); 974 set_intr_gate_ist(2, &nmi, NMI_STACK);
1191 set_system_intr_gate(3, &int3); /* int3 can be called from all */ 975 /* int3 can be called from all */
1192 set_system_gate(4, &overflow); /* int4 can be called from all */ 976 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
1193 set_trap_gate(5, &bounds); 977 /* int4 can be called from all */
1194 set_trap_gate(6, &invalid_op); 978 set_system_intr_gate(4, &overflow);
1195 set_trap_gate(7, &device_not_available); 979 set_intr_gate(5, &bounds);
980 set_intr_gate(6, &invalid_op);
981 set_intr_gate(7, &device_not_available);
982#ifdef CONFIG_X86_32
1196 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); 983 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1197 set_trap_gate(9, &coprocessor_segment_overrun); 984#else
1198 set_trap_gate(10, &invalid_TSS); 985 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1199 set_trap_gate(11, &segment_not_present); 986#endif
1200 set_trap_gate(12, &stack_segment); 987 set_intr_gate(9, &coprocessor_segment_overrun);
1201 set_trap_gate(13, &general_protection); 988 set_intr_gate(10, &invalid_TSS);
989 set_intr_gate(11, &segment_not_present);
990 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
991 set_intr_gate(13, &general_protection);
1202 set_intr_gate(14, &page_fault); 992 set_intr_gate(14, &page_fault);
1203 set_trap_gate(15, &spurious_interrupt_bug); 993 set_intr_gate(15, &spurious_interrupt_bug);
1204 set_trap_gate(16, &coprocessor_error); 994 set_intr_gate(16, &coprocessor_error);
1205 set_trap_gate(17, &alignment_check); 995 set_intr_gate(17, &alignment_check);
1206#ifdef CONFIG_X86_MCE 996#ifdef CONFIG_X86_MCE
1207 set_trap_gate(18, &machine_check); 997 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1208#endif 998#endif
1209 set_trap_gate(19, &simd_coprocessor_error); 999 set_intr_gate(19, &simd_coprocessor_error);
1210 1000
1001#ifdef CONFIG_IA32_EMULATION
1002 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1003#endif
1004
1005#ifdef CONFIG_X86_32
1211 if (cpu_has_fxsr) { 1006 if (cpu_has_fxsr) {
1212 printk(KERN_INFO "Enabling fast FPU save and restore... "); 1007 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1213 set_in_cr4(X86_CR4_OSFXSR); 1008 set_in_cr4(X86_CR4_OSFXSR);
@@ -1220,37 +1015,20 @@ void __init trap_init(void)
1220 printk("done.\n"); 1015 printk("done.\n");
1221 } 1016 }
1222 1017
1223 set_system_gate(SYSCALL_VECTOR, &system_call); 1018 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1224 1019
1225 /* Reserve all the builtin and the syscall vector: */ 1020 /* Reserve all the builtin and the syscall vector: */
1226 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1021 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1227 set_bit(i, used_vectors); 1022 set_bit(i, used_vectors);
1228 1023
1229 set_bit(SYSCALL_VECTOR, used_vectors); 1024 set_bit(SYSCALL_VECTOR, used_vectors);
1230 1025#endif
1231 init_thread_xstate();
1232 /* 1026 /*
1233 * Should be a barrier for any external CPU state: 1027 * Should be a barrier for any external CPU state:
1234 */ 1028 */
1235 cpu_init(); 1029 cpu_init();
1236 1030
1031#ifdef CONFIG_X86_32
1237 trap_init_hook(); 1032 trap_init_hook();
1033#endif
1238} 1034}
1239
1240static int __init kstack_setup(char *s)
1241{
1242 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1243
1244 return 1;
1245}
1246__setup("kstack=", kstack_setup);
1247
1248static int __init code_bytes_setup(char *s)
1249{
1250 code_bytes = simple_strtoul(s, NULL, 0);
1251 if (code_bytes > 8192)
1252 code_bytes = 8192;
1253
1254 return 1;
1255}
1256__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index 7a31f104bef9..000000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1218 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'.
12 */
13#include <linux/moduleparam.h>
14#include <linux/interrupt.h>
15#include <linux/kallsyms.h>
16#include <linux/spinlock.h>
17#include <linux/kprobes.h>
18#include <linux/uaccess.h>
19#include <linux/utsname.h>
20#include <linux/kdebug.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/ptrace.h>
24#include <linux/string.h>
25#include <linux/unwind.h>
26#include <linux/delay.h>
27#include <linux/errno.h>
28#include <linux/kexec.h>
29#include <linux/sched.h>
30#include <linux/timer.h>
31#include <linux/init.h>
32#include <linux/bug.h>
33#include <linux/nmi.h>
34#include <linux/mm.h>
35#include <linux/smp.h>
36#include <linux/io.h>
37
38#if defined(CONFIG_EDAC)
39#include <linux/edac.h>
40#endif
41
42#include <asm/stacktrace.h>
43#include <asm/processor.h>
44#include <asm/debugreg.h>
45#include <asm/atomic.h>
46#include <asm/system.h>
47#include <asm/unwind.h>
48#include <asm/desc.h>
49#include <asm/i387.h>
50#include <asm/pgalloc.h>
51#include <asm/proto.h>
52#include <asm/pda.h>
53#include <asm/traps.h>
54
55#include <mach_traps.h>
56
57int panic_on_unrecovered_nmi;
58int kstack_depth_to_print = 12;
59static unsigned int code_bytes = 64;
60static int ignore_nmis;
61static int die_counter;
62
63static inline void conditional_sti(struct pt_regs *regs)
64{
65 if (regs->flags & X86_EFLAGS_IF)
66 local_irq_enable();
67}
68
69static inline void preempt_conditional_sti(struct pt_regs *regs)
70{
71 inc_preempt_count();
72 if (regs->flags & X86_EFLAGS_IF)
73 local_irq_enable();
74}
75
76static inline void preempt_conditional_cli(struct pt_regs *regs)
77{
78 if (regs->flags & X86_EFLAGS_IF)
79 local_irq_disable();
80 /* Make sure to not schedule here because we could be running
81 on an exception stack. */
82 dec_preempt_count();
83}
84
85void printk_address(unsigned long address, int reliable)
86{
87 printk(" [<%016lx>] %s%pS\n",
88 address, reliable ? "" : "? ", (void *) address);
89}
90
91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
92 unsigned *usedp, char **idp)
93{
94 static char ids[][8] = {
95 [DEBUG_STACK - 1] = "#DB",
96 [NMI_STACK - 1] = "NMI",
97 [DOUBLEFAULT_STACK - 1] = "#DF",
98 [STACKFAULT_STACK - 1] = "#SS",
99 [MCE_STACK - 1] = "#MC",
100#if DEBUG_STKSZ > EXCEPTION_STKSZ
101 [N_EXCEPTION_STACKS ...
102 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
103#endif
104 };
105 unsigned k;
106
107 /*
108 * Iterate over all exception stacks, and figure out whether
109 * 'stack' is in one of them:
110 */
111 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
112 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
113 /*
114 * Is 'stack' above this exception frame's end?
115 * If yes then skip to the next frame.
116 */
117 if (stack >= end)
118 continue;
119 /*
120 * Is 'stack' above this exception frame's start address?
121 * If yes then we found the right frame.
122 */
123 if (stack >= end - EXCEPTION_STKSZ) {
124 /*
125 * Make sure we only iterate through an exception
126 * stack once. If it comes up for the second time
127 * then there's something wrong going on - just
128 * break out and return NULL:
129 */
130 if (*usedp & (1U << k))
131 break;
132 *usedp |= 1U << k;
133 *idp = ids[k];
134 return (unsigned long *)end;
135 }
136 /*
137 * If this is a debug stack, and if it has a larger size than
138 * the usual exception stacks, then 'stack' might still
139 * be within the lower portion of the debug stack:
140 */
141#if DEBUG_STKSZ > EXCEPTION_STKSZ
142 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
143 unsigned j = N_EXCEPTION_STACKS - 1;
144
145 /*
146 * Black magic. A large debug stack is composed of
147 * multiple exception stack entries, which we
148 * iterate through now. Dont look:
149 */
150 do {
151 ++j;
152 end -= EXCEPTION_STKSZ;
153 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
154 } while (stack < end - EXCEPTION_STKSZ);
155 if (*usedp & (1U << j))
156 break;
157 *usedp |= 1U << j;
158 *idp = ids[j];
159 return (unsigned long *)end;
160 }
161#endif
162 }
163 return NULL;
164}
165
166/*
167 * x86-64 can have up to three kernel stacks:
168 * process stack
169 * interrupt stack
170 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
171 */
172
173static inline int valid_stack_ptr(struct thread_info *tinfo,
174 void *p, unsigned int size, void *end)
175{
176 void *t = tinfo;
177 if (end) {
178 if (p < end && p >= (end-THREAD_SIZE))
179 return 1;
180 else
181 return 0;
182 }
183 return p > t && p < t + THREAD_SIZE - size;
184}
185
186/* The form of the top of the frame on the stack */
187struct stack_frame {
188 struct stack_frame *next_frame;
189 unsigned long return_address;
190};
191
192static inline unsigned long
193print_context_stack(struct thread_info *tinfo,
194 unsigned long *stack, unsigned long bp,
195 const struct stacktrace_ops *ops, void *data,
196 unsigned long *end)
197{
198 struct stack_frame *frame = (struct stack_frame *)bp;
199
200 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
201 unsigned long addr;
202
203 addr = *stack;
204 if (__kernel_text_address(addr)) {
205 if ((unsigned long) stack == bp + 8) {
206 ops->address(data, addr, 1);
207 frame = frame->next_frame;
208 bp = (unsigned long) frame;
209 } else {
210 ops->address(data, addr, bp == 0);
211 }
212 }
213 stack++;
214 }
215 return bp;
216}
217
218void dump_trace(struct task_struct *task, struct pt_regs *regs,
219 unsigned long *stack, unsigned long bp,
220 const struct stacktrace_ops *ops, void *data)
221{
222 const unsigned cpu = get_cpu();
223 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
224 unsigned used = 0;
225 struct thread_info *tinfo;
226
227 if (!task)
228 task = current;
229
230 if (!stack) {
231 unsigned long dummy;
232 stack = &dummy;
233 if (task && task != current)
234 stack = (unsigned long *)task->thread.sp;
235 }
236
237#ifdef CONFIG_FRAME_POINTER
238 if (!bp) {
239 if (task == current) {
240 /* Grab bp right from our regs */
241 asm("movq %%rbp, %0" : "=r" (bp) : );
242 } else {
243 /* bp is the last reg pushed by switch_to */
244 bp = *(unsigned long *) task->thread.sp;
245 }
246 }
247#endif
248
249 /*
250 * Print function call entries in all stacks, starting at the
251 * current stack address. If the stacks consist of nested
252 * exceptions
253 */
254 tinfo = task_thread_info(task);
255 for (;;) {
256 char *id;
257 unsigned long *estack_end;
258 estack_end = in_exception_stack(cpu, (unsigned long)stack,
259 &used, &id);
260
261 if (estack_end) {
262 if (ops->stack(data, id) < 0)
263 break;
264
265 bp = print_context_stack(tinfo, stack, bp, ops,
266 data, estack_end);
267 ops->stack(data, "<EOE>");
268 /*
269 * We link to the next stack via the
270 * second-to-last pointer (index -2 to end) in the
271 * exception stack:
272 */
273 stack = (unsigned long *) estack_end[-2];
274 continue;
275 }
276 if (irqstack_end) {
277 unsigned long *irqstack;
278 irqstack = irqstack_end -
279 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
280
281 if (stack >= irqstack && stack < irqstack_end) {
282 if (ops->stack(data, "IRQ") < 0)
283 break;
284 bp = print_context_stack(tinfo, stack, bp,
285 ops, data, irqstack_end);
286 /*
287 * We link to the next stack (which would be
288 * the process stack normally) the last
289 * pointer (index -1 to end) in the IRQ stack:
290 */
291 stack = (unsigned long *) (irqstack_end[-1]);
292 irqstack_end = NULL;
293 ops->stack(data, "EOI");
294 continue;
295 }
296 }
297 break;
298 }
299
300 /*
301 * This handles the process stack:
302 */
303 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
304 put_cpu();
305}
306EXPORT_SYMBOL(dump_trace);
307
308static void
309print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
310{
311 print_symbol(msg, symbol);
312 printk("\n");
313}
314
315static void print_trace_warning(void *data, char *msg)
316{
317 printk("%s\n", msg);
318}
319
320static int print_trace_stack(void *data, char *name)
321{
322 printk(" <%s> ", name);
323 return 0;
324}
325
326static void print_trace_address(void *data, unsigned long addr, int reliable)
327{
328 touch_nmi_watchdog();
329 printk_address(addr, reliable);
330}
331
332static const struct stacktrace_ops print_trace_ops = {
333 .warning = print_trace_warning,
334 .warning_symbol = print_trace_warning_symbol,
335 .stack = print_trace_stack,
336 .address = print_trace_address,
337};
338
339static void
340show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
341 unsigned long *stack, unsigned long bp, char *log_lvl)
342{
343 printk("Call Trace:\n");
344 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
345}
346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
353static void
354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
355 unsigned long *sp, unsigned long bp, char *log_lvl)
356{
357 unsigned long *stack;
358 int i;
359 const int cpu = smp_processor_id();
360 unsigned long *irqstack_end =
361 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
362 unsigned long *irqstack =
363 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
364
365 /*
366 * debugging aid: "show_stack(NULL, NULL);" prints the
367 * back trace for this cpu.
368 */
369
370 if (sp == NULL) {
371 if (task)
372 sp = (unsigned long *)task->thread.sp;
373 else
374 sp = (unsigned long *)&sp;
375 }
376
377 stack = sp;
378 for (i = 0; i < kstack_depth_to_print; i++) {
379 if (stack >= irqstack && stack <= irqstack_end) {
380 if (stack == irqstack_end) {
381 stack = (unsigned long *) (irqstack_end[-1]);
382 printk(" <EOI> ");
383 }
384 } else {
385 if (((long) stack & (THREAD_SIZE-1)) == 0)
386 break;
387 }
388 if (i && ((i % 4) == 0))
389 printk("\n");
390 printk(" %016lx", *stack++);
391 touch_nmi_watchdog();
392 }
393 printk("\n");
394 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
395}
396
397void show_stack(struct task_struct *task, unsigned long *sp)
398{
399 show_stack_log_lvl(task, NULL, sp, 0, "");
400}
401
402/*
403 * The architecture-independent dump_stack generator
404 */
405void dump_stack(void)
406{
407 unsigned long bp = 0;
408 unsigned long stack;
409
410#ifdef CONFIG_FRAME_POINTER
411 if (!bp)
412 asm("movq %%rbp, %0" : "=r" (bp) : );
413#endif
414
415 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
416 current->pid, current->comm, print_tainted(),
417 init_utsname()->release,
418 (int)strcspn(init_utsname()->version, " "),
419 init_utsname()->version);
420 show_trace(NULL, NULL, &stack, bp);
421}
422EXPORT_SYMBOL(dump_stack);
423
424void show_registers(struct pt_regs *regs)
425{
426 int i;
427 unsigned long sp;
428 const int cpu = smp_processor_id();
429 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
430
431 sp = regs->sp;
432 printk("CPU %d ", cpu);
433 __show_regs(regs);
434 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
435 cur->comm, cur->pid, task_thread_info(cur), cur);
436
437 /*
438 * When in-kernel, we also print out the stack and code at the
439 * time of the fault..
440 */
441 if (!user_mode(regs)) {
442 unsigned int code_prologue = code_bytes * 43 / 64;
443 unsigned int code_len = code_bytes;
444 unsigned char c;
445 u8 *ip;
446
447 printk("Stack: ");
448 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
449 regs->bp, "");
450
451 printk(KERN_EMERG "Code: ");
452
453 ip = (u8 *)regs->ip - code_prologue;
454 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
455 /* try starting at RIP */
456 ip = (u8 *)regs->ip;
457 code_len = code_len - code_prologue + 1;
458 }
459 for (i = 0; i < code_len; i++, ip++) {
460 if (ip < (u8 *)PAGE_OFFSET ||
461 probe_kernel_address(ip, c)) {
462 printk(" Bad RIP value.");
463 break;
464 }
465 if (ip == (u8 *)regs->ip)
466 printk("<%02x> ", c);
467 else
468 printk("%02x ", c);
469 }
470 }
471 printk("\n");
472}
473
474int is_valid_bugaddr(unsigned long ip)
475{
476 unsigned short ud2;
477
478 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
479 return 0;
480
481 return ud2 == 0x0b0f;
482}
483
484static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
485static int die_owner = -1;
486static unsigned int die_nest_count;
487
488unsigned __kprobes long oops_begin(void)
489{
490 int cpu;
491 unsigned long flags;
492
493 oops_enter();
494
495 /* racy, but better than risking deadlock. */
496 raw_local_irq_save(flags);
497 cpu = smp_processor_id();
498 if (!__raw_spin_trylock(&die_lock)) {
499 if (cpu == die_owner)
500 /* nested oops. should stop eventually */;
501 else
502 __raw_spin_lock(&die_lock);
503 }
504 die_nest_count++;
505 die_owner = cpu;
506 console_verbose();
507 bust_spinlocks(1);
508 return flags;
509}
510
511void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
512{
513 die_owner = -1;
514 bust_spinlocks(0);
515 die_nest_count--;
516 if (!die_nest_count)
517 /* Nest count reaches zero, release the lock. */
518 __raw_spin_unlock(&die_lock);
519 raw_local_irq_restore(flags);
520 if (!regs) {
521 oops_exit();
522 return;
523 }
524 if (panic_on_oops)
525 panic("Fatal exception");
526 oops_exit();
527 do_exit(signr);
528}
529
530int __kprobes __die(const char *str, struct pt_regs *regs, long err)
531{
532 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
533#ifdef CONFIG_PREEMPT
534 printk("PREEMPT ");
535#endif
536#ifdef CONFIG_SMP
537 printk("SMP ");
538#endif
539#ifdef CONFIG_DEBUG_PAGEALLOC
540 printk("DEBUG_PAGEALLOC");
541#endif
542 printk("\n");
543 if (notify_die(DIE_OOPS, str, regs, err,
544 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
545 return 1;
546
547 show_registers(regs);
548 add_taint(TAINT_DIE);
549 /* Executive summary in case the oops scrolled away */
550 printk(KERN_ALERT "RIP ");
551 printk_address(regs->ip, 1);
552 printk(" RSP <%016lx>\n", regs->sp);
553 if (kexec_should_crash(current))
554 crash_kexec(regs);
555 return 0;
556}
557
558void die(const char *str, struct pt_regs *regs, long err)
559{
560 unsigned long flags = oops_begin();
561
562 if (!user_mode(regs))
563 report_bug(regs->ip, regs);
564
565 if (__die(str, regs, err))
566 regs = NULL;
567 oops_end(flags, regs, SIGSEGV);
568}
569
570notrace __kprobes void
571die_nmi(char *str, struct pt_regs *regs, int do_panic)
572{
573 unsigned long flags;
574
575 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
576 return;
577
578 flags = oops_begin();
579 /*
580 * We are in trouble anyway, lets at least try
581 * to get a message out.
582 */
583 printk(KERN_EMERG "%s", str);
584 printk(" on CPU%d, ip %08lx, registers:\n",
585 smp_processor_id(), regs->ip);
586 show_registers(regs);
587 if (kexec_should_crash(current))
588 crash_kexec(regs);
589 if (do_panic || panic_on_oops)
590 panic("Non maskable interrupt");
591 oops_end(flags, NULL, SIGBUS);
592 nmi_exit();
593 local_irq_enable();
594 do_exit(SIGBUS);
595}
596
597static void __kprobes
598do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
599 long error_code, siginfo_t *info)
600{
601 struct task_struct *tsk = current;
602
603 if (!user_mode(regs))
604 goto kernel_trap;
605
606 /*
607 * We want error_code and trap_no set for userspace faults and
608 * kernelspace faults which result in die(), but not
609 * kernelspace faults which are fixed up. die() gives the
610 * process no chance to handle the signal and notice the
611 * kernel fault information, so that won't result in polluting
612 * the information about previously queued, but not yet
613 * delivered, faults. See also do_general_protection below.
614 */
615 tsk->thread.error_code = error_code;
616 tsk->thread.trap_no = trapnr;
617
618 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
619 printk_ratelimit()) {
620 printk(KERN_INFO
621 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
622 tsk->comm, tsk->pid, str,
623 regs->ip, regs->sp, error_code);
624 print_vma_addr(" in ", regs->ip);
625 printk("\n");
626 }
627
628 if (info)
629 force_sig_info(signr, info, tsk);
630 else
631 force_sig(signr, tsk);
632 return;
633
634kernel_trap:
635 if (!fixup_exception(regs)) {
636 tsk->thread.error_code = error_code;
637 tsk->thread.trap_no = trapnr;
638 die(str, regs, error_code);
639 }
640 return;
641}
642
643#define DO_ERROR(trapnr, signr, str, name) \
644asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
645{ \
646 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
647 == NOTIFY_STOP) \
648 return; \
649 conditional_sti(regs); \
650 do_trap(trapnr, signr, str, regs, error_code, NULL); \
651}
652
653#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
654asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
655{ \
656 siginfo_t info; \
657 info.si_signo = signr; \
658 info.si_errno = 0; \
659 info.si_code = sicode; \
660 info.si_addr = (void __user *)siaddr; \
661 trace_hardirqs_fixup(); \
662 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
663 == NOTIFY_STOP) \
664 return; \
665 conditional_sti(regs); \
666 do_trap(trapnr, signr, str, regs, error_code, &info); \
667}
668
669DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
670DO_ERROR(4, SIGSEGV, "overflow", overflow)
671DO_ERROR(5, SIGSEGV, "bounds", bounds)
672DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
673DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
674DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
675DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
676DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
677
678/* Runs on IST stack */
679asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
680{
681 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
682 12, SIGBUS) == NOTIFY_STOP)
683 return;
684 preempt_conditional_sti(regs);
685 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
686 preempt_conditional_cli(regs);
687}
688
689asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
690{
691 static const char str[] = "double fault";
692 struct task_struct *tsk = current;
693
694 /* Return not checked because double check cannot be ignored */
695 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
696
697 tsk->thread.error_code = error_code;
698 tsk->thread.trap_no = 8;
699
700 /* This is always a kernel trap and never fixable (and thus must
701 never return). */
702 for (;;)
703 die(str, regs, error_code);
704}
705
706asmlinkage void __kprobes
707do_general_protection(struct pt_regs *regs, long error_code)
708{
709 struct task_struct *tsk;
710
711 conditional_sti(regs);
712
713 tsk = current;
714 if (!user_mode(regs))
715 goto gp_in_kernel;
716
717 tsk->thread.error_code = error_code;
718 tsk->thread.trap_no = 13;
719
720 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
721 printk_ratelimit()) {
722 printk(KERN_INFO
723 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
724 tsk->comm, tsk->pid,
725 regs->ip, regs->sp, error_code);
726 print_vma_addr(" in ", regs->ip);
727 printk("\n");
728 }
729
730 force_sig(SIGSEGV, tsk);
731 return;
732
733gp_in_kernel:
734 if (fixup_exception(regs))
735 return;
736
737 tsk->thread.error_code = error_code;
738 tsk->thread.trap_no = 13;
739 if (notify_die(DIE_GPF, "general protection fault", regs,
740 error_code, 13, SIGSEGV) == NOTIFY_STOP)
741 return;
742 die("general protection fault", regs, error_code);
743}
744
745static notrace __kprobes void
746mem_parity_error(unsigned char reason, struct pt_regs *regs)
747{
748 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
749 reason);
750 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
751
752#if defined(CONFIG_EDAC)
753 if (edac_handler_set()) {
754 edac_atomic_assert_error();
755 return;
756 }
757#endif
758
759 if (panic_on_unrecovered_nmi)
760 panic("NMI: Not continuing");
761
762 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
763
764 /* Clear and disable the memory parity error line. */
765 reason = (reason & 0xf) | 4;
766 outb(reason, 0x61);
767}
768
769static notrace __kprobes void
770io_check_error(unsigned char reason, struct pt_regs *regs)
771{
772 printk("NMI: IOCK error (debug interrupt?)\n");
773 show_registers(regs);
774
775 /* Re-enable the IOCK line, wait for a few seconds */
776 reason = (reason & 0xf) | 8;
777 outb(reason, 0x61);
778 mdelay(2000);
779 reason &= ~8;
780 outb(reason, 0x61);
781}
782
783static notrace __kprobes void
784unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
785{
786 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
787 NOTIFY_STOP)
788 return;
789 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
790 reason);
791 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
792
793 if (panic_on_unrecovered_nmi)
794 panic("NMI: Not continuing");
795
796 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
797}
798
799/* Runs on IST stack. This code must keep interrupts off all the time.
800 Nested NMIs are prevented by the CPU. */
801asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
802{
803 unsigned char reason = 0;
804 int cpu;
805
806 cpu = smp_processor_id();
807
808 /* Only the BSP gets external NMIs from the system. */
809 if (!cpu)
810 reason = get_nmi_reason();
811
812 if (!(reason & 0xc0)) {
813 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
814 == NOTIFY_STOP)
815 return;
816 /*
817 * Ok, so this is none of the documented NMI sources,
818 * so it must be the NMI watchdog.
819 */
820 if (nmi_watchdog_tick(regs, reason))
821 return;
822 if (!do_nmi_callback(regs, cpu))
823 unknown_nmi_error(reason, regs);
824
825 return;
826 }
827 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
828 return;
829
830 /* AK: following checks seem to be broken on modern chipsets. FIXME */
831 if (reason & 0x80)
832 mem_parity_error(reason, regs);
833 if (reason & 0x40)
834 io_check_error(reason, regs);
835}
836
837asmlinkage notrace __kprobes void
838do_nmi(struct pt_regs *regs, long error_code)
839{
840 nmi_enter();
841
842 add_pda(__nmi_count, 1);
843
844 if (!ignore_nmis)
845 default_do_nmi(regs);
846
847 nmi_exit();
848}
849
850void stop_nmi(void)
851{
852 acpi_nmi_disable();
853 ignore_nmis++;
854}
855
856void restart_nmi(void)
857{
858 ignore_nmis--;
859 acpi_nmi_enable();
860}
861
862/* runs on IST stack. */
863asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
864{
865 trace_hardirqs_fixup();
866
867 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
868 == NOTIFY_STOP)
869 return;
870
871 preempt_conditional_sti(regs);
872 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
873 preempt_conditional_cli(regs);
874}
875
876/* Help handler running on IST stack to switch back to user stack
877 for scheduling or signal handling. The actual stack switch is done in
878 entry.S */
879asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
880{
881 struct pt_regs *regs = eregs;
882 /* Did already sync */
883 if (eregs == (struct pt_regs *)eregs->sp)
884 ;
885 /* Exception from user space */
886 else if (user_mode(eregs))
887 regs = task_pt_regs(current);
888 /* Exception from kernel and interrupts are enabled. Move to
889 kernel process stack. */
890 else if (eregs->flags & X86_EFLAGS_IF)
891 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
892 if (eregs != regs)
893 *regs = *eregs;
894 return regs;
895}
896
897/* runs on IST stack. */
898asmlinkage void __kprobes do_debug(struct pt_regs *regs,
899 unsigned long error_code)
900{
901 struct task_struct *tsk = current;
902 unsigned long condition;
903 siginfo_t info;
904
905 trace_hardirqs_fixup();
906
907 get_debugreg(condition, 6);
908
909 /*
910 * The processor cleared BTF, so don't mark that we need it set.
911 */
912 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
913 tsk->thread.debugctlmsr = 0;
914
915 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
916 SIGTRAP) == NOTIFY_STOP)
917 return;
918
919 preempt_conditional_sti(regs);
920
921 /* Mask out spurious debug traps due to lazy DR7 setting */
922 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
923 if (!tsk->thread.debugreg7)
924 goto clear_dr7;
925 }
926
927 tsk->thread.debugreg6 = condition;
928
929 /*
930 * Single-stepping through TF: make sure we ignore any events in
931 * kernel space (but re-enable TF when returning to user mode).
932 */
933 if (condition & DR_STEP) {
934 if (!user_mode(regs))
935 goto clear_TF_reenable;
936 }
937
938 /* Ok, finally something we can handle */
939 tsk->thread.trap_no = 1;
940 tsk->thread.error_code = error_code;
941 info.si_signo = SIGTRAP;
942 info.si_errno = 0;
943 info.si_code = TRAP_BRKPT;
944 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
945 force_sig_info(SIGTRAP, &info, tsk);
946
947clear_dr7:
948 set_debugreg(0, 7);
949 preempt_conditional_cli(regs);
950 return;
951
952clear_TF_reenable:
953 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
954 regs->flags &= ~X86_EFLAGS_TF;
955 preempt_conditional_cli(regs);
956 return;
957}
958
959static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
960{
961 if (fixup_exception(regs))
962 return 1;
963
964 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
965 /* Illegal floating point operation in the kernel */
966 current->thread.trap_no = trapnr;
967 die(str, regs, 0);
968 return 0;
969}
970
971/*
972 * Note that we play around with the 'TS' bit in an attempt to get
973 * the correct behaviour even in the presence of the asynchronous
974 * IRQ13 behaviour
975 */
976asmlinkage void do_coprocessor_error(struct pt_regs *regs)
977{
978 void __user *ip = (void __user *)(regs->ip);
979 struct task_struct *task;
980 siginfo_t info;
981 unsigned short cwd, swd;
982
983 conditional_sti(regs);
984 if (!user_mode(regs) &&
985 kernel_math_error(regs, "kernel x87 math error", 16))
986 return;
987
988 /*
989 * Save the info for the exception handler and clear the error.
990 */
991 task = current;
992 save_init_fpu(task);
993 task->thread.trap_no = 16;
994 task->thread.error_code = 0;
995 info.si_signo = SIGFPE;
996 info.si_errno = 0;
997 info.si_code = __SI_FAULT;
998 info.si_addr = ip;
999 /*
1000 * (~cwd & swd) will mask out exceptions that are not set to unmasked
1001 * status. 0x3f is the exception bits in these regs, 0x200 is the
1002 * C1 reg you need in case of a stack fault, 0x040 is the stack
1003 * fault bit. We should only be taking one exception at a time,
1004 * so if this combination doesn't produce any single exception,
1005 * then we have a bad program that isn't synchronizing its FPU usage
1006 * and it will suffer the consequences since we won't be able to
1007 * fully reproduce the context of the exception
1008 */
1009 cwd = get_fpu_cwd(task);
1010 swd = get_fpu_swd(task);
1011 switch (swd & ~cwd & 0x3f) {
1012 case 0x000: /* No unmasked exception */
1013 default: /* Multiple exceptions */
1014 break;
1015 case 0x001: /* Invalid Op */
1016 /*
1017 * swd & 0x240 == 0x040: Stack Underflow
1018 * swd & 0x240 == 0x240: Stack Overflow
1019 * User must clear the SF bit (0x40) if set
1020 */
1021 info.si_code = FPE_FLTINV;
1022 break;
1023 case 0x002: /* Denormalize */
1024 case 0x010: /* Underflow */
1025 info.si_code = FPE_FLTUND;
1026 break;
1027 case 0x004: /* Zero Divide */
1028 info.si_code = FPE_FLTDIV;
1029 break;
1030 case 0x008: /* Overflow */
1031 info.si_code = FPE_FLTOVF;
1032 break;
1033 case 0x020: /* Precision */
1034 info.si_code = FPE_FLTRES;
1035 break;
1036 }
1037 force_sig_info(SIGFPE, &info, task);
1038}
1039
1040asmlinkage void bad_intr(void)
1041{
1042 printk("bad interrupt");
1043}
1044
1045asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1046{
1047 void __user *ip = (void __user *)(regs->ip);
1048 struct task_struct *task;
1049 siginfo_t info;
1050 unsigned short mxcsr;
1051
1052 conditional_sti(regs);
1053 if (!user_mode(regs) &&
1054 kernel_math_error(regs, "kernel simd math error", 19))
1055 return;
1056
1057 /*
1058 * Save the info for the exception handler and clear the error.
1059 */
1060 task = current;
1061 save_init_fpu(task);
1062 task->thread.trap_no = 19;
1063 task->thread.error_code = 0;
1064 info.si_signo = SIGFPE;
1065 info.si_errno = 0;
1066 info.si_code = __SI_FAULT;
1067 info.si_addr = ip;
1068 /*
1069 * The SIMD FPU exceptions are handled a little differently, as there
1070 * is only a single status/control register. Thus, to determine which
1071 * unmasked exception was caught we must mask the exception mask bits
1072 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1073 */
1074 mxcsr = get_fpu_mxcsr(task);
1075 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1076 case 0x000:
1077 default:
1078 break;
1079 case 0x001: /* Invalid Op */
1080 info.si_code = FPE_FLTINV;
1081 break;
1082 case 0x002: /* Denormalize */
1083 case 0x010: /* Underflow */
1084 info.si_code = FPE_FLTUND;
1085 break;
1086 case 0x004: /* Zero Divide */
1087 info.si_code = FPE_FLTDIV;
1088 break;
1089 case 0x008: /* Overflow */
1090 info.si_code = FPE_FLTOVF;
1091 break;
1092 case 0x020: /* Precision */
1093 info.si_code = FPE_FLTRES;
1094 break;
1095 }
1096 force_sig_info(SIGFPE, &info, task);
1097}
1098
1099asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
1100{
1101}
1102
1103asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1104{
1105}
1106
1107asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1108{
1109}
1110
1111/*
1112 * 'math_state_restore()' saves the current math information in the
1113 * old math state array, and gets the new ones from the current task
1114 *
1115 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1116 * Don't touch unless you *really* know how it works.
1117 */
1118asmlinkage void math_state_restore(void)
1119{
1120 struct task_struct *me = current;
1121
1122 if (!used_math()) {
1123 local_irq_enable();
1124 /*
1125 * does a slab alloc which can sleep
1126 */
1127 if (init_fpu(me)) {
1128 /*
1129 * ran out of memory!
1130 */
1131 do_group_exit(SIGKILL);
1132 return;
1133 }
1134 local_irq_disable();
1135 }
1136
1137 clts(); /* Allow maths ops (or we recurse) */
1138 /*
1139 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
1140 */
1141 if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
1142 stts();
1143 force_sig(SIGSEGV, me);
1144 return;
1145 }
1146 task_thread_info(me)->status |= TS_USEDFPU;
1147 me->fpu_counter++;
1148}
1149EXPORT_SYMBOL_GPL(math_state_restore);
1150
1151void __init trap_init(void)
1152{
1153 set_intr_gate(0, &divide_error);
1154 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1155 set_intr_gate_ist(2, &nmi, NMI_STACK);
1156 /* int3 can be called from all */
1157 set_system_gate_ist(3, &int3, DEBUG_STACK);
1158 /* int4 can be called from all */
1159 set_system_gate(4, &overflow);
1160 set_intr_gate(5, &bounds);
1161 set_intr_gate(6, &invalid_op);
1162 set_intr_gate(7, &device_not_available);
1163 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1164 set_intr_gate(9, &coprocessor_segment_overrun);
1165 set_intr_gate(10, &invalid_TSS);
1166 set_intr_gate(11, &segment_not_present);
1167 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1168 set_intr_gate(13, &general_protection);
1169 set_intr_gate(14, &page_fault);
1170 set_intr_gate(15, &spurious_interrupt_bug);
1171 set_intr_gate(16, &coprocessor_error);
1172 set_intr_gate(17, &alignment_check);
1173#ifdef CONFIG_X86_MCE
1174 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1175#endif
1176 set_intr_gate(19, &simd_coprocessor_error);
1177
1178#ifdef CONFIG_IA32_EMULATION
1179 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1180#endif
1181 /*
1182 * initialize the per thread extended state:
1183 */
1184 init_thread_xstate();
1185 /*
1186 * Should be a barrier for any external CPU state:
1187 */
1188 cpu_init();
1189}
1190
1191static int __init oops_setup(char *s)
1192{
1193 if (!s)
1194 return -EINVAL;
1195 if (!strcmp(s, "panic"))
1196 panic_on_oops = 1;
1197 return 0;
1198}
1199early_param("oops", oops_setup);
1200
1201static int __init kstack_setup(char *s)
1202{
1203 if (!s)
1204 return -EINVAL;
1205 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1206 return 0;
1207}
1208early_param("kstack", kstack_setup);
1209
1210static int __init code_bytes_setup(char *s)
1211{
1212 code_bytes = simple_strtoul(s, NULL, 0);
1213 if (code_bytes > 8192)
1214 code_bytes = 8192;
1215
1216 return 1;
1217}
1218__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 8c9ad02af5a2..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -905,8 +905,8 @@ static inline int __init activate_vmi(void)
905#endif 905#endif
906 906
907#ifdef CONFIG_X86_LOCAL_APIC 907#ifdef CONFIG_X86_LOCAL_APIC
908 para_fill(pv_apic_ops.apic_read, APICRead); 908 para_fill(apic_ops->read, APICRead);
909 para_fill(pv_apic_ops.apic_write, APICWrite); 909 para_fill(apic_ops->write, APICWrite);
910#endif 910#endif
911 911
912 /* 912 /*
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index af5bdad84604..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
140 *(.con_initcall.init) 140 *(.con_initcall.init)
141 __con_initcall_end = .; 141 __con_initcall_end = .;
142 } 142 }
143 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 143 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
144 __x86cpuvendor_start = .; 144 __x86_cpu_dev_start = .;
145 *(.x86cpuvendor.init) 145 *(.x86_cpu_dev.init)
146 __x86cpuvendor_end = .; 146 __x86_cpu_dev_end = .;
147 } 147 }
148 SECURITY_INIT 148 SECURITY_INIT
149 . = ALIGN(4); 149 . = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
180 . = ALIGN(PAGE_SIZE); 180 . = ALIGN(PAGE_SIZE);
181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
182 __per_cpu_start = .; 182 __per_cpu_start = .;
183 *(.data.percpu.page_aligned)
183 *(.data.percpu) 184 *(.data.percpu)
184 *(.data.percpu.shared_aligned) 185 *(.data.percpu.shared_aligned)
185 __per_cpu_end = .; 186 __per_cpu_end = .;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e88..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
168 *(.con_initcall.init) 168 *(.con_initcall.init)
169 } 169 }
170 __con_initcall_end = .; 170 __con_initcall_end = .;
171 . = ALIGN(16); 171 __x86_cpu_dev_start = .;
172 __x86cpuvendor_start = .; 172 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
173 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 173 *(.x86_cpu_dev.init)
174 *(.x86cpuvendor.init)
175 } 174 }
176 __x86cpuvendor_end = .; 175 __x86_cpu_dev_end = .;
177 SECURITY_INIT 176 SECURITY_INIT
178 177
179 . = ALIGN(8); 178 . = ALIGN(8);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 000000000000..9abac8a9d823
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
1/*
2 * xsave/xrstor support.
3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */
6#include <linux/bootmem.h>
7#include <linux/compat.h>
8#include <asm/i387.h>
9#ifdef CONFIG_IA32_EMULATION
10#include <asm/sigcontext32.h>
11#endif
12#include <asm/xcr.h>
13
14/*
15 * Supported feature mask by the CPU and the kernel.
16 */
17u64 pcntxt_mask;
18
19struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif
23
24/*
25 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext.
27 */
28int check_for_xstate(struct i387_fxsave_struct __user *buf,
29 void __user *fpstate,
30 struct _fpx_sw_bytes *fx_sw_user)
31{
32 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
33 sizeof(struct xsave_hdr_struct);
34 unsigned int magic2;
35 int err;
36
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes));
39
40 if (err)
41 return err;
42
43 /*
44 * First Magic check failed.
45 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1;
48
49 /*
50 * Check for error scenarios.
51 */
52 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1;
56
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE));
60 /*
61 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy
63 * fpstate layout with out copying the extended state information
64 * in the memory layout.
65 */
66 if (err || magic2 != FP_XSTATE_MAGIC2)
67 return -1;
68
69 return 0;
70}
71
72#ifdef CONFIG_X86_64
73/*
74 * Signal frame handlers.
75 */
76
77int save_i387_xstate(void __user *buf)
78{
79 struct task_struct *tsk = current;
80 int err = 0;
81
82 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
83 return -EACCES;
84
85 BUG_ON(sig_xstate_size < xstate_size);
86
87 if ((unsigned long)buf % 64)
88 printk("save_i387_xstate: bad fpstate %p\n", buf);
89
90 if (!used_math())
91 return 0;
92 clear_used_math(); /* trigger finit */
93 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (task_thread_info(tsk)->status & TS_XSAVE)
103 err = xsave_user(buf);
104 else
105 err = fxsave_user(buf);
106
107 if (err)
108 return err;
109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts();
111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
113 xstate_size))
114 return -1;
115 }
116
117 if (task_thread_info(tsk)->status & TS_XSAVE) {
118 struct _fpstate __user *fx = buf;
119 struct _xstate __user *x = buf;
120 u64 xstate_bv;
121
122 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
123 sizeof(struct _fpx_sw_bytes));
124
125 err |= __put_user(FP_XSTATE_MAGIC2,
126 (__u32 __user *) (buf + sig_xstate_size
127 - FP_XSTATE_MAGIC2_SIZE));
128
129 /*
130 * Read the xstate_bv which we copied (directly from the cpu or
131 * from the state in task struct) to the user buffers and
132 * set the FP/SSE bits.
133 */
134 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
135
136 /*
137 * For legacy compatible, we always set FP/SSE bits in the bit
138 * vector while saving the state to the user context. This will
139 * enable us capturing any changes(during sigreturn) to
140 * the FP/SSE bits by the legacy applications which don't touch
141 * xstate_bv in the xsave header.
142 *
143 * xsave aware apps can change the xstate_bv in the xsave
144 * header as well as change any contents in the memory layout.
145 * xrestore as part of sigreturn will capture all the changes.
146 */
147 xstate_bv |= XSTATE_FPSSE;
148
149 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
150
151 if (err)
152 return err;
153 }
154
155 return 1;
156}
157
158/*
159 * Restore the extended state if present. Otherwise, restore the FP/SSE
160 * state.
161 */
162int restore_user_xstate(void __user *buf)
163{
164 struct _fpx_sw_bytes fx_sw_user;
165 u64 mask;
166 int err;
167
168 if (((unsigned long)buf % 64) ||
169 check_for_xstate(buf, buf, &fx_sw_user))
170 goto fx_only;
171
172 mask = fx_sw_user.xstate_bv;
173
174 /*
175 * restore the state passed by the user.
176 */
177 err = xrestore_user(buf, mask);
178 if (err)
179 return err;
180
181 /*
182 * init the state skipped by the user.
183 */
184 mask = pcntxt_mask & ~mask;
185
186 xrstor_state(init_xstate_buf, mask);
187
188 return 0;
189
190fx_only:
191 /*
192 * couldn't find the extended state information in the
193 * memory layout. Restore just the FP/SSE and init all
194 * the other extended state.
195 */
196 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
197 return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
198}
199
200/*
201 * This restores directly out of user space. Exceptions are handled.
202 */
203int restore_i387_xstate(void __user *buf)
204{
205 struct task_struct *tsk = current;
206 int err = 0;
207
208 if (!buf) {
209 if (used_math())
210 goto clear;
211 return 0;
212 } else
213 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
214 return -EACCES;
215
216 if (!used_math()) {
217 err = init_fpu(tsk);
218 if (err)
219 return err;
220 }
221
222 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
223 clts();
224 task_thread_info(current)->status |= TS_USEDFPU;
225 }
226 if (task_thread_info(tsk)->status & TS_XSAVE)
227 err = restore_user_xstate(buf);
228 else
229 err = fxrstor_checking((__force struct i387_fxsave_struct *)
230 buf);
231 if (unlikely(err)) {
232 /*
233 * Encountered an error while doing the restore from the
234 * user buffer, clear the fpu state.
235 */
236clear:
237 clear_fpu(tsk);
238 clear_used_math();
239 }
240 return err;
241}
242#endif
243
244/*
245 * Prepare the SW reserved portion of the fxsave memory layout, indicating
246 * the presence of the extended state information in the memory layout
247 * pointed by the fpstate pointer in the sigcontext.
248 * This will be saved when ever the FP and extended state context is
249 * saved on the user stack during the signal handler delivery to the user.
250 */
251void prepare_fx_sw_frame(void)
252{
253 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
254 FP_XSTATE_MAGIC2_SIZE;
255
256 sig_xstate_size = sizeof(struct _fpstate) + size_extended;
257
258#ifdef CONFIG_IA32_EMULATION
259 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
260#endif
261
262 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
263
264 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
265 fx_sw_reserved.extended_size = sig_xstate_size;
266 fx_sw_reserved.xstate_bv = pcntxt_mask;
267 fx_sw_reserved.xstate_size = xstate_size;
268#ifdef CONFIG_IA32_EMULATION
269 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
270 sizeof(struct _fpx_sw_bytes));
271 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
272#endif
273}
274
275/*
276 * Represents init state for the supported extended state.
277 */
278struct xsave_struct *init_xstate_buf;
279
280#ifdef CONFIG_X86_64
281unsigned int sig_xstate_size = sizeof(struct _fpstate);
282#endif
283
284/*
285 * Enable the extended processor state save/restore feature
286 */
287void __cpuinit xsave_init(void)
288{
289 if (!cpu_has_xsave)
290 return;
291
292 set_in_cr4(X86_CR4_OSXSAVE);
293
294 /*
295 * Enable all the features that the HW is capable of
296 * and the Linux kernel is aware of.
297 */
298 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
299}
300
301/*
302 * setup the xstate image representing the init state
303 */
304static void __init setup_xstate_init(void)
305{
306 init_xstate_buf = alloc_bootmem(xstate_size);
307 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
308}
309
310/*
311 * Enable and initialize the xsave feature.
312 */
313void __init xsave_cntxt_init(void)
314{
315 unsigned int eax, ebx, ecx, edx;
316
317 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
318 pcntxt_mask = eax + ((u64)edx << 32);
319
320 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
321 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
322 pcntxt_mask);
323 BUG();
324 }
325
326 /*
327 * for now OS knows only about FP/SSE
328 */
329 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
330 xsave_init();
331
332 /*
333 * Recompute the context size for enabled features
334 */
335 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
336 xstate_size = ebx;
337
338 prepare_fx_sw_frame();
339
340 setup_xstate_init();
341
342 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
343 "cntxt size 0x%x\n",
344 pcntxt_mask, xstate_size);
345}
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 23e8373507ad..17e25995b65b 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,21 +331,6 @@ enum vmcs_field {
331 331
332#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
333 333
334#define MSR_IA32_VMX_BASIC 0x480
335#define MSR_IA32_VMX_PINBASED_CTLS 0x481
336#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
337#define MSR_IA32_VMX_EXIT_CTLS 0x483
338#define MSR_IA32_VMX_ENTRY_CTLS 0x484
339#define MSR_IA32_VMX_MISC 0x485
340#define MSR_IA32_VMX_CR0_FIXED0 0x486
341#define MSR_IA32_VMX_CR0_FIXED1 0x487
342#define MSR_IA32_VMX_CR4_FIXED0 0x488
343#define MSR_IA32_VMX_CR4_FIXED1 0x489
344#define MSR_IA32_VMX_VMCS_ENUM 0x48a
345#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
346#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
347
348#define MSR_IA32_FEATURE_CONTROL 0x3a
349#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 334#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
350#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 335#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
351 336
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d9249a882aa5..65f0b8a47bed 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,6 +55,7 @@
55#include <linux/lguest_launcher.h> 55#include <linux/lguest_launcher.h>
56#include <linux/virtio_console.h> 56#include <linux/virtio_console.h>
57#include <linux/pm.h> 57#include <linux/pm.h>
58#include <asm/apic.h>
58#include <asm/lguest.h> 59#include <asm/lguest.h>
59#include <asm/paravirt.h> 60#include <asm/paravirt.h>
60#include <asm/param.h> 61#include <asm/param.h>
@@ -783,14 +784,44 @@ static void lguest_wbinvd(void)
783 * code qualifies for Advanced. It will also never interrupt anything. It 784 * code qualifies for Advanced. It will also never interrupt anything. It
784 * does, however, allow us to get through the Linux boot code. */ 785 * does, however, allow us to get through the Linux boot code. */
785#ifdef CONFIG_X86_LOCAL_APIC 786#ifdef CONFIG_X86_LOCAL_APIC
786static void lguest_apic_write(unsigned long reg, u32 v) 787static void lguest_apic_write(u32 reg, u32 v)
787{ 788{
788} 789}
789 790
790static u32 lguest_apic_read(unsigned long reg) 791static u32 lguest_apic_read(u32 reg)
791{ 792{
792 return 0; 793 return 0;
793} 794}
795
796static u64 lguest_apic_icr_read(void)
797{
798 return 0;
799}
800
801static void lguest_apic_icr_write(u32 low, u32 id)
802{
803 /* Warn to see if there's any stray references */
804 WARN_ON(1);
805}
806
807static void lguest_apic_wait_icr_idle(void)
808{
809 return;
810}
811
812static u32 lguest_apic_safe_wait_icr_idle(void)
813{
814 return 0;
815}
816
817static struct apic_ops lguest_basic_apic_ops = {
818 .read = lguest_apic_read,
819 .write = lguest_apic_write,
820 .icr_read = lguest_apic_icr_read,
821 .icr_write = lguest_apic_icr_write,
822 .wait_icr_idle = lguest_apic_wait_icr_idle,
823 .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
824};
794#endif 825#endif
795 826
796/* STOP! Until an interrupt comes in. */ 827/* STOP! Until an interrupt comes in. */
@@ -990,8 +1021,7 @@ __init void lguest_init(void)
990 1021
991#ifdef CONFIG_X86_LOCAL_APIC 1022#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 1023 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 1024 apic_ops = &lguest_basic_apic_ops;
994 pv_apic_ops.apic_read = lguest_apic_read;
995#endif 1025#endif
996 1026
997 /* time operations */ 1027 /* time operations */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index aa3fa4119424..55e11aa6d66c 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y)
17 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 17 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
18else 18else
19 obj-y += io_64.o iomap_copy_64.o 19 obj-y += io_64.o iomap_copy_64.o
20
21 CFLAGS_csum-partial_64.o := -funroll-loops
22
23 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 20 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
24 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 21 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
25 lib-y += memmove_64.o memset_64.o 22 lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 24e60944971a..9e68075544f6 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -14,6 +14,13 @@
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15#include <asm/mmx.h> 15#include <asm/mmx.h>
16 16
17#ifdef CONFIG_X86_INTEL_USERCOPY
18/*
19 * Alignment at which movsl is preferred for bulk memory copies.
20 */
21struct movsl_mask movsl_mask __read_mostly;
22#endif
23
17static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) 24static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
18{ 25{
19#ifdef CONFIG_X86_INTEL_USERCOPY 26#ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 3f2cf11f201a..37b9ae4d44c5 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -38,15 +38,6 @@ void __init pre_intr_init_hook(void)
38 init_ISA_irqs(); 38 init_ISA_irqs();
39} 39}
40 40
41/*
42 * IRQ2 is cascade interrupt to second interrupt controller
43 */
44static struct irqaction irq2 = {
45 .handler = no_action,
46 .mask = CPU_MASK_NONE,
47 .name = "cascade",
48};
49
50/** 41/**
51 * intr_init_hook - post gate setup interrupt initialisation 42 * intr_init_hook - post gate setup interrupt initialisation
52 * 43 *
@@ -62,12 +53,6 @@ void __init intr_init_hook(void)
62 if (x86_quirks->arch_intr_init()) 53 if (x86_quirks->arch_intr_init())
63 return; 54 return;
64 } 55 }
65#ifdef CONFIG_X86_LOCAL_APIC
66 apic_intr_init();
67#endif
68
69 if (!acpi_ioapic)
70 setup_irq(2, &irq2);
71} 56}
72 57
73/** 58/**
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
deleted file mode 100644
index 3ef8b43b62fc..000000000000
--- a/arch/x86/mach-es7000/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
deleted file mode 100644
index c8d5aa132fa0..000000000000
--- a/arch/x86/mach-es7000/es7000.h
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27/*
28 * ES7000 chipsets
29 */
30
31#define NON_UNISYS 0
32#define ES7000_CLASSIC 1
33#define ES7000_ZORRO 2
34
35
36#define MIP_REG 1
37#define MIP_PSAI_REG 4
38
39#define MIP_BUSY 1
40#define MIP_SPIN 0xf0000
41#define MIP_VALID 0x0100000000000000ULL
42#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
43
44#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
45
46struct mip_reg_info {
47 unsigned long long mip_info;
48 unsigned long long delivery_info;
49 unsigned long long host_reg;
50 unsigned long long mip_reg;
51};
52
53struct part_info {
54 unsigned char type;
55 unsigned char length;
56 unsigned char part_id;
57 unsigned char apic_mode;
58 unsigned long snum;
59 char ptype[16];
60 char sname[64];
61 char pname[64];
62};
63
64struct psai {
65 unsigned long long entry_type;
66 unsigned long long addr;
67 unsigned long long bep_addr;
68};
69
70struct es7000_mem_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char resv[6];
74 unsigned long long start;
75 unsigned long long size;
76};
77
78struct es7000_oem_table {
79 unsigned long long hdr;
80 struct mip_reg_info mip;
81 struct part_info pif;
82 struct es7000_mem_info shm;
83 struct psai psai;
84};
85
86#ifdef CONFIG_ACPI
87
88struct oem_table {
89 struct acpi_table_header Header;
90 u32 OEMTableAddr;
91 u32 OEMTableSize;
92};
93
94extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
95#endif
96
97struct mip_reg {
98 unsigned long long off_0;
99 unsigned long long off_8;
100 unsigned long long off_10;
101 unsigned long long off_18;
102 unsigned long long off_20;
103 unsigned long long off_28;
104 unsigned long long off_30;
105 unsigned long long off_38;
106};
107
108#define MIP_SW_APIC 0x1020b
109#define MIP_FUNC(VALUE) (VALUE & 0xff)
110
111extern int parse_unisys_oem (char *oemptr);
112extern void setup_unisys(void);
113extern int es7000_start_cpu(int cpu, unsigned long eip);
114extern void es7000_sw_apic(void);
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 0dbd7803a1d5..6730f4e7c744 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o
9obj-$(CONFIG_X86_SUMMIT) += summit.o 9obj-$(CONFIG_X86_SUMMIT) += summit.o
10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o 10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
11obj-$(CONFIG_X86_ES7000) += es7000.o 11obj-$(CONFIG_X86_ES7000) += es7000.o
12obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 59d771714559..df37fc9d6a26 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -5,18 +5,17 @@
5#define APIC_DEFINITION 1 5#define APIC_DEFINITION 1
6#include <linux/threads.h> 6#include <linux/threads.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <asm/smp.h>
9#include <asm/mpspec.h> 8#include <asm/mpspec.h>
10#include <asm/genapic.h> 9#include <asm/genapic.h>
11#include <asm/fixmap.h> 10#include <asm/fixmap.h>
12#include <asm/apicdef.h> 11#include <asm/apicdef.h>
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <linux/dmi.h> 14#include <linux/dmi.h>
17#include <asm/mach-bigsmp/mach_apic.h> 15#include <asm/bigsmp/apicdef.h>
18#include <asm/mach-bigsmp/mach_apicdef.h> 16#include <linux/smp.h>
19#include <asm/mach-bigsmp/mach_ipi.h> 17#include <asm/bigsmp/apic.h>
18#include <asm/bigsmp/ipi.h>
20#include <asm/mach-default/mach_mpparse.h> 19#include <asm/mach-default/mach_mpparse.h>
21 20
22static int dmi_bigsmp; /* can be set by dmi scanners */ 21static int dmi_bigsmp; /* can be set by dmi scanners */
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 4742626f08c4..6513d41ea21e 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -4,20 +4,19 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-es7000/mach_apicdef.h> 14#include <asm/es7000/apicdef.h>
17#include <asm/mach-es7000/mach_apic.h> 15#include <linux/smp.h>
18#include <asm/mach-es7000/mach_ipi.h> 16#include <asm/es7000/apic.h>
19#include <asm/mach-es7000/mach_mpparse.h> 17#include <asm/es7000/ipi.h>
20#include <asm/mach-es7000/mach_wakecpu.h> 18#include <asm/es7000/mpparse.h>
19#include <asm/es7000/wakecpu.h>
21 20
22static int probe_es7000(void) 21static int probe_es7000(void)
23{ 22{
@@ -48,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
48/* Hook from generic ACPI tables.c */ 47/* Hook from generic ACPI tables.c */
49static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 48static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 49{
51 unsigned long oem_addr; 50 unsigned long oem_addr = 0;
51 int check_dsdt;
52 int ret = 0;
53
54 /* check dsdt at first to avoid clear fix_map for oem_addr */
55 check_dsdt = es7000_check_dsdt();
56
52 if (!find_unisys_acpi_oem_table(&oem_addr)) { 57 if (!find_unisys_acpi_oem_table(&oem_addr)) {
53 if (es7000_check_dsdt()) 58 if (check_dsdt)
54 return parse_unisys_oem((char *)oem_addr); 59 ret = parse_unisys_oem((char *)oem_addr);
55 else { 60 else {
56 setup_unisys(); 61 setup_unisys();
57 return 1; 62 ret = 1;
58 } 63 }
64 /*
65 * we need to unmap it
66 */
67 unmap_unisys_acpi_oem_table(oem_addr);
59 } 68 }
60 return 0; 69 return ret;
61} 70}
62#else 71#else
63static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 72static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8091e68764c4..8cf58394975e 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -4,7 +4,6 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <linux/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
@@ -12,11 +11,12 @@
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/init.h> 13#include <linux/init.h>
15#include <asm/mach-numaq/mach_apic.h> 14#include <asm/numaq/apicdef.h>
16#include <asm/mach-numaq/mach_apicdef.h> 15#include <linux/smp.h>
17#include <asm/mach-numaq/mach_ipi.h> 16#include <asm/numaq/apic.h>
18#include <asm/mach-numaq/mach_mpparse.h> 17#include <asm/numaq/ipi.h>
19#include <asm/mach-numaq/mach_wakecpu.h> 18#include <asm/numaq/mpparse.h>
19#include <asm/numaq/wakecpu.h>
20#include <asm/numaq.h> 20#include <asm/numaq.h>
21 21
22static int mps_oem_check(struct mp_config_table *mpc, char *oem, 22static int mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index a97ea0f35b1e..6ad6b67a723d 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -4,19 +4,18 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-summit/mach_apic.h> 14#include <asm/summit/apicdef.h>
17#include <asm/mach-summit/mach_apicdef.h> 15#include <linux/smp.h>
18#include <asm/mach-summit/mach_ipi.h> 16#include <asm/summit/apic.h>
19#include <asm/mach-summit/mach_mpparse.h> 17#include <asm/summit/ipi.h>
18#include <asm/summit/mpparse.h>
20 19
21static int probe_summit(void) 20static int probe_summit(void)
22{ 21{
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index dfb932dcf136..59f89b434b45 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o 13mmiotrace-y := pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15 15
16ifeq ($(CONFIG_X86_32),y) 16obj-$(CONFIG_NUMA) += numa_$(BITS).o
17obj-$(CONFIG_NUMA) += discontig_32.o
18else
19obj-$(CONFIG_NUMA) += numa_64.o
20obj-$(CONFIG_K8_NUMA) += k8topology_64.o 17obj-$(CONFIG_K8_NUMA) += k8topology_64.o
21endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 18obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23 19
24obj-$(CONFIG_MEMTEST) += memtest.o 20obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8f92cac4e6db..3f2b8962cbd0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -592,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
592 unsigned long flags; 592 unsigned long flags;
593#endif 593#endif
594 594
595 /*
596 * We can fault from pretty much anywhere, with unknown IRQ state.
597 */
598 trace_hardirqs_fixup();
599
600 tsk = current; 595 tsk = current;
601 mm = tsk->mm; 596 mm = tsk->mm;
602 prefetchw(&mm->mmap_sem); 597 prefetchw(&mm->mmap_sem);
@@ -914,15 +909,15 @@ LIST_HEAD(pgd_list);
914 909
915void vmalloc_sync_all(void) 910void vmalloc_sync_all(void)
916{ 911{
917#ifdef CONFIG_X86_32
918 unsigned long start = VMALLOC_START & PGDIR_MASK;
919 unsigned long address; 912 unsigned long address;
920 913
914#ifdef CONFIG_X86_32
921 if (SHARED_KERNEL_PMD) 915 if (SHARED_KERNEL_PMD)
922 return; 916 return;
923 917
924 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 918 for (address = VMALLOC_START & PMD_MASK;
925 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 919 address >= TASK_SIZE && address < FIXADDR_TOP;
920 address += PMD_SIZE) {
926 unsigned long flags; 921 unsigned long flags;
927 struct page *page; 922 struct page *page;
928 923
@@ -935,10 +930,8 @@ void vmalloc_sync_all(void)
935 spin_unlock_irqrestore(&pgd_lock, flags); 930 spin_unlock_irqrestore(&pgd_lock, flags);
936 } 931 }
937#else /* CONFIG_X86_64 */ 932#else /* CONFIG_X86_64 */
938 unsigned long start = VMALLOC_START & PGDIR_MASK; 933 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
939 unsigned long address; 934 address += PGDIR_SIZE) {
940
941 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
942 const pgd_t *pgd_ref = pgd_offset_k(address); 935 const pgd_t *pgd_ref = pgd_offset_k(address);
943 unsigned long flags; 936 unsigned long flags;
944 struct page *page; 937 struct page *page;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 007bb06c7504..4ba373c5b8c8 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
82 pte_t pte = gup_get_pte(ptep); 82 pte_t pte = gup_get_pte(ptep);
83 struct page *page; 83 struct page *page;
84 84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { 85 if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep); 86 pte_unmap(ptep);
87 return 0; 87 return 0;
88 } 88 }
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
116 mask = _PAGE_PRESENT|_PAGE_USER; 116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write) 117 if (write)
118 mask |= _PAGE_RW; 118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask) 119 if ((pte_flags(pte) & mask) != mask)
120 return 0; 120 return 0;
121 /* hugepages are never "special" */ 121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); 122 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124 124
125 refs = 0; 125 refs = 0;
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
173 mask = _PAGE_PRESENT|_PAGE_USER; 173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write) 174 if (write)
175 mask |= _PAGE_RW; 175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask) 176 if ((pte_flags(pte) & mask) != mask)
177 return 0; 177 return 0;
178 /* hugepages are never "special" */ 178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); 179 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181 181
182 refs = 0; 182 refs = 0;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c3789bb19308..8396868e82c5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/cpumask.h> 31#include <linux/cpumask.h>
32 32
33#include <asm/asm.h> 33#include <asm/asm.h>
34#include <asm/bios_ebda.h>
34#include <asm/processor.h> 35#include <asm/processor.h>
35#include <asm/system.h> 36#include <asm/system.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -557,7 +558,7 @@ void zap_low_mappings(void)
557 558
558int nx_enabled; 559int nx_enabled;
559 560
560pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); 561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
561EXPORT_SYMBOL_GPL(__supported_pte_mask); 562EXPORT_SYMBOL_GPL(__supported_pte_mask);
562 563
563#ifdef CONFIG_X86_PAE 564#ifdef CONFIG_X86_PAE
@@ -969,6 +970,8 @@ void __init mem_init(void)
969 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
970 int tmp; 971 int tmp;
971 972
973 start_periodic_check_for_corruption();
974
972#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
973 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
974#endif 977#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fb30486c82f7..b8e461d49412 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32 32
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
34#include <asm/system.h> 35#include <asm/system.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on);
88 89
89int after_bootmem; 90int after_bootmem;
90 91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask);
94
95static int do_not_nx __cpuinitdata;
96
97/*
98 * noexec=on|off
99 * Control non-executable mappings for 64-bit processes.
100 *
101 * on Enable (default)
102 * off Disable
103 */
104static int __init nonx_setup(char *str)
105{
106 if (!str)
107 return -EINVAL;
108 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0;
111 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX;
114 }
115 return 0;
116}
117early_param("noexec", nonx_setup);
118
119void __cpuinit check_efer(void)
120{
121 unsigned long efer;
122
123 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx)
125 __supported_pte_mask &= ~_PAGE_NX;
126}
127
128int force_personality32;
129
130/*
131 * noexec32=on|off
132 * Control non executable heap for 32bit processes.
133 * To control the stack too use noexec=off
134 *
135 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
136 * off PROT_READ implies PROT_EXEC
137 */
138static int __init nonx32_setup(char *str)
139{
140 if (!strcmp(str, "on"))
141 force_personality32 &= ~READ_IMPLIES_EXEC;
142 else if (!strcmp(str, "off"))
143 force_personality32 |= READ_IMPLIES_EXEC;
144 return 1;
145}
146__setup("noexec32=", nonx32_setup);
147
91/* 148/*
92 * NOTE: This function is marked __ref because it calls __init function 149 * NOTE: This function is marked __ref because it calls __init function
93 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 150 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
139 } 196 }
140 197
141 pte = pte_offset_kernel(pmd, vaddr); 198 pte = pte_offset_kernel(pmd, vaddr);
142 if (!pte_none(*pte) && pte_val(new_pte) &&
143 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
144 pte_ERROR(*pte);
145 set_pte(pte, new_pte); 199 set_pte(pte, new_pte);
146 200
147 /* 201 /*
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
256 if (pfn >= table_top) 310 if (pfn >= table_top)
257 panic("alloc_low_page: ran out of memory"); 311 panic("alloc_low_page: ran out of memory");
258 312
259 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
260 memset(adr, 0, PAGE_SIZE); 314 memset(adr, 0, PAGE_SIZE);
261 *phys = pfn * PAGE_SIZE; 315 *phys = pfn * PAGE_SIZE;
262 return adr; 316 return adr;
@@ -692,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
692 old_start = mr[i].start; 746 old_start = mr[i].start;
693 memmove(&mr[i], &mr[i+1], 747 memmove(&mr[i], &mr[i+1],
694 (nr_range - 1 - i) * sizeof (struct map_range)); 748 (nr_range - 1 - i) * sizeof (struct map_range));
695 mr[i].start = old_start; 749 mr[i--].start = old_start;
696 nr_range--; 750 nr_range--;
697 } 751 }
698 752
@@ -825,6 +879,8 @@ void __init mem_init(void)
825{ 879{
826 long codesize, reservedpages, datasize, initsize; 880 long codesize, reservedpages, datasize, initsize;
827 881
882 start_periodic_check_for_corruption();
883
828 pci_iommu_alloc(); 884 pci_iommu_alloc();
829 885
830 /* clear_bss() already clear the empty_zero_page */ 886 /* clear_bss() already clear the empty_zero_page */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 6ab3196d12b4..e4c43ec71b29 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,47 @@
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26 26
27static inline int phys_addr_valid(unsigned long addr)
28{
29 return addr < (1UL << boot_cpu_data.x86_phys_bits);
30}
31
27unsigned long __phys_addr(unsigned long x) 32unsigned long __phys_addr(unsigned long x)
28{ 33{
29 if (x >= __START_KERNEL_map) 34 if (x >= __START_KERNEL_map) {
30 return x - __START_KERNEL_map + phys_base; 35 x -= __START_KERNEL_map;
31 return x - PAGE_OFFSET; 36 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
37 x += phys_base;
38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
42 !phys_addr_valid(x));
43 }
44 return x;
32} 45}
33EXPORT_SYMBOL(__phys_addr); 46EXPORT_SYMBOL(__phys_addr);
34 47
35static inline int phys_addr_valid(unsigned long addr) 48bool __virt_addr_valid(unsigned long x)
36{ 49{
37 return addr < (1UL << boot_cpu_data.x86_phys_bits); 50 if (x >= __START_KERNEL_map) {
51 x -= __START_KERNEL_map;
52 if (x >= KERNEL_IMAGE_SIZE)
53 return false;
54 x += phys_base;
55 } else {
56 if (x < PAGE_OFFSET)
57 return false;
58 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ?
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false;
62 }
63 }
64
65 return pfn_valid(x >> PAGE_SHIFT);
38} 66}
67EXPORT_SYMBOL(__virt_addr_valid);
39 68
40#else 69#else
41 70
@@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr)
44 return 1; 73 return 1;
45} 74}
46 75
76#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x)
78{
79 /* VMALLOC_* aren't constants; not available at the boot time */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET;
84}
85EXPORT_SYMBOL(__phys_addr);
86#endif
87
88bool __virt_addr_valid(unsigned long x)
89{
90 if (x < PAGE_OFFSET)
91 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
93 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95}
96EXPORT_SYMBOL(__virt_addr_valid);
97
47#endif 98#endif
48 99
49int page_is_ram(unsigned long pagenr) 100int page_is_ram(unsigned long pagenr)
@@ -223,16 +274,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
223 switch (prot_val) { 274 switch (prot_val) {
224 case _PAGE_CACHE_UC: 275 case _PAGE_CACHE_UC:
225 default: 276 default:
226 prot = PAGE_KERNEL_NOCACHE; 277 prot = PAGE_KERNEL_IO_NOCACHE;
227 break; 278 break;
228 case _PAGE_CACHE_UC_MINUS: 279 case _PAGE_CACHE_UC_MINUS:
229 prot = PAGE_KERNEL_UC_MINUS; 280 prot = PAGE_KERNEL_IO_UC_MINUS;
230 break; 281 break;
231 case _PAGE_CACHE_WC: 282 case _PAGE_CACHE_WC:
232 prot = PAGE_KERNEL_WC; 283 prot = PAGE_KERNEL_IO_WC;
233 break; 284 break;
234 case _PAGE_CACHE_WB: 285 case _PAGE_CACHE_WB:
235 prot = PAGE_KERNEL; 286 prot = PAGE_KERNEL_IO;
236 break; 287 break;
237 } 288 }
238 289
@@ -549,12 +600,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
549} 600}
550 601
551static inline void __init early_set_fixmap(enum fixed_addresses idx, 602static inline void __init early_set_fixmap(enum fixed_addresses idx,
552 unsigned long phys) 603 unsigned long phys, pgprot_t prot)
553{ 604{
554 if (after_paging_init) 605 if (after_paging_init)
555 set_fixmap(idx, phys); 606 __set_fixmap(idx, phys, prot);
556 else 607 else
557 __early_set_fixmap(idx, phys, PAGE_KERNEL); 608 __early_set_fixmap(idx, phys, prot);
558} 609}
559 610
560static inline void __init early_clear_fixmap(enum fixed_addresses idx) 611static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -565,16 +616,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
565 __early_set_fixmap(idx, 0, __pgprot(0)); 616 __early_set_fixmap(idx, 0, __pgprot(0));
566} 617}
567 618
568 619static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
569static int __initdata early_ioremap_nested; 620static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
570
571static int __init check_early_ioremap_leak(void) 621static int __init check_early_ioremap_leak(void)
572{ 622{
573 if (!early_ioremap_nested) 623 int count = 0;
624 int i;
625
626 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
627 if (prev_map[i])
628 count++;
629
630 if (!count)
574 return 0; 631 return 0;
575 WARN(1, KERN_WARNING 632 WARN(1, KERN_WARNING
576 "Debug warning: early ioremap leak of %d areas detected.\n", 633 "Debug warning: early ioremap leak of %d areas detected.\n",
577 early_ioremap_nested); 634 count);
578 printk(KERN_WARNING 635 printk(KERN_WARNING
579 "please boot with early_ioremap_debug and report the dmesg.\n"); 636 "please boot with early_ioremap_debug and report the dmesg.\n");
580 637
@@ -582,18 +639,33 @@ static int __init check_early_ioremap_leak(void)
582} 639}
583late_initcall(check_early_ioremap_leak); 640late_initcall(check_early_ioremap_leak);
584 641
585void __init *early_ioremap(unsigned long phys_addr, unsigned long size) 642static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
586{ 643{
587 unsigned long offset, last_addr; 644 unsigned long offset, last_addr;
588 unsigned int nrpages, nesting; 645 unsigned int nrpages;
589 enum fixed_addresses idx0, idx; 646 enum fixed_addresses idx0, idx;
647 int i, slot;
590 648
591 WARN_ON(system_state != SYSTEM_BOOTING); 649 WARN_ON(system_state != SYSTEM_BOOTING);
592 650
593 nesting = early_ioremap_nested; 651 slot = -1;
652 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
653 if (!prev_map[i]) {
654 slot = i;
655 break;
656 }
657 }
658
659 if (slot < 0) {
660 printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
661 phys_addr, size);
662 WARN_ON(1);
663 return NULL;
664 }
665
594 if (early_ioremap_debug) { 666 if (early_ioremap_debug) {
595 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", 667 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
596 phys_addr, size, nesting); 668 phys_addr, size, slot);
597 dump_stack(); 669 dump_stack();
598 } 670 }
599 671
@@ -604,17 +676,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
604 return NULL; 676 return NULL;
605 } 677 }
606 678
607 if (nesting >= FIX_BTMAPS_NESTING) { 679 prev_size[slot] = size;
608 WARN_ON(1);
609 return NULL;
610 }
611 early_ioremap_nested++;
612 /* 680 /*
613 * Mappings have to be page-aligned 681 * Mappings have to be page-aligned
614 */ 682 */
615 offset = phys_addr & ~PAGE_MASK; 683 offset = phys_addr & ~PAGE_MASK;
616 phys_addr &= PAGE_MASK; 684 phys_addr &= PAGE_MASK;
617 size = PAGE_ALIGN(last_addr) - phys_addr; 685 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
618 686
619 /* 687 /*
620 * Mappings have to fit in the FIX_BTMAP area. 688 * Mappings have to fit in the FIX_BTMAP area.
@@ -628,10 +696,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
628 /* 696 /*
629 * Ok, go for it.. 697 * Ok, go for it..
630 */ 698 */
631 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 699 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
632 idx = idx0; 700 idx = idx0;
633 while (nrpages > 0) { 701 while (nrpages > 0) {
634 early_set_fixmap(idx, phys_addr); 702 early_set_fixmap(idx, phys_addr, prot);
635 phys_addr += PAGE_SIZE; 703 phys_addr += PAGE_SIZE;
636 --idx; 704 --idx;
637 --nrpages; 705 --nrpages;
@@ -639,7 +707,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
639 if (early_ioremap_debug) 707 if (early_ioremap_debug)
640 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 708 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
641 709
642 return (void *) (offset + fix_to_virt(idx0)); 710 prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
711 return prev_map[slot];
712}
713
714/* Remap an IO device */
715void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
716{
717 return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
718}
719
720/* Remap memory */
721void __init *early_memremap(unsigned long phys_addr, unsigned long size)
722{
723 return __early_ioremap(phys_addr, size, PAGE_KERNEL);
643} 724}
644 725
645void __init early_iounmap(void *addr, unsigned long size) 726void __init early_iounmap(void *addr, unsigned long size)
@@ -648,15 +729,33 @@ void __init early_iounmap(void *addr, unsigned long size)
648 unsigned long offset; 729 unsigned long offset;
649 unsigned int nrpages; 730 unsigned int nrpages;
650 enum fixed_addresses idx; 731 enum fixed_addresses idx;
651 int nesting; 732 int i, slot;
733
734 slot = -1;
735 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
736 if (prev_map[i] == addr) {
737 slot = i;
738 break;
739 }
740 }
652 741
653 nesting = --early_ioremap_nested; 742 if (slot < 0) {
654 if (WARN_ON(nesting < 0)) 743 printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
744 addr, size);
745 WARN_ON(1);
655 return; 746 return;
747 }
748
749 if (prev_size[slot] != size) {
750 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
751 addr, size, slot, prev_size[slot]);
752 WARN_ON(1);
753 return;
754 }
656 755
657 if (early_ioremap_debug) { 756 if (early_ioremap_debug) {
658 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 757 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
659 size, nesting); 758 size, slot);
660 dump_stack(); 759 dump_stack();
661 } 760 }
662 761
@@ -668,12 +767,13 @@ void __init early_iounmap(void *addr, unsigned long size)
668 offset = virt_addr & ~PAGE_MASK; 767 offset = virt_addr & ~PAGE_MASK;
669 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 768 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
670 769
671 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 770 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
672 while (nrpages > 0) { 771 while (nrpages > 0) {
673 early_clear_fixmap(idx); 772 early_clear_fixmap(idx);
674 --idx; 773 --idx;
675 --nrpages; 774 --nrpages;
676 } 775 }
776 prev_map[slot] = 0;
677} 777}
678 778
679void __this_fixmap_does_not_exist(void) 779void __this_fixmap_does_not_exist(void)
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 847c164725f4..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 1b4763e26ea9..51c0a2fc14fe 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
138 return; 138 return;
139 } 139 }
140 140
141 if (is_uv_system()) 141 if (get_uv_system_type() >= UV_X2APIC)
142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
143 else 143 else
144 apic_id = pa->apic_id; 144 apic_id = pa->apic_id;
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 30f3eb366667..446902b2a6b6 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
7 timer_int.o ) 7 timer_int.o )
8 8
9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o 9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ 10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
11 op_model_ppro.o op_model_p4.o 11 op_model_ppro.o op_model_p4.o
12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o 12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 8a5f1614a3d5..57f6c9088081 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,10 +1,11 @@
1/** 1/**
2 * @file nmi_int.c 2 * @file nmi_int.c
3 * 3 *
4 * @remark Copyright 2002 OProfile authors 4 * @remark Copyright 2002-2008 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * @author Robert Richter <robert.richter@amd.com>
8 */ 9 */
9 10
10#include <linux/init.h> 11#include <linux/init.h>
@@ -439,6 +440,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
439 __u8 vendor = boot_cpu_data.x86_vendor; 440 __u8 vendor = boot_cpu_data.x86_vendor;
440 __u8 family = boot_cpu_data.x86; 441 __u8 family = boot_cpu_data.x86;
441 char *cpu_type; 442 char *cpu_type;
443 int ret = 0;
442 444
443 if (!cpu_has_apic) 445 if (!cpu_has_apic)
444 return -ENODEV; 446 return -ENODEV;
@@ -451,19 +453,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
451 default: 453 default:
452 return -ENODEV; 454 return -ENODEV;
453 case 6: 455 case 6:
454 model = &op_athlon_spec; 456 model = &op_amd_spec;
455 cpu_type = "i386/athlon"; 457 cpu_type = "i386/athlon";
456 break; 458 break;
457 case 0xf: 459 case 0xf:
458 model = &op_athlon_spec; 460 model = &op_amd_spec;
459 /* Actually it could be i386/hammer too, but give 461 /* Actually it could be i386/hammer too, but give
460 user space an consistent name. */ 462 user space an consistent name. */
461 cpu_type = "x86-64/hammer"; 463 cpu_type = "x86-64/hammer";
462 break; 464 break;
463 case 0x10: 465 case 0x10:
464 model = &op_athlon_spec; 466 model = &op_amd_spec;
465 cpu_type = "x86-64/family10"; 467 cpu_type = "x86-64/family10";
466 break; 468 break;
469 case 0x11:
470 model = &op_amd_spec;
471 cpu_type = "x86-64/family11h";
472 break;
467 } 473 }
468 break; 474 break;
469 475
@@ -490,17 +496,24 @@ int __init op_nmi_init(struct oprofile_operations *ops)
490 return -ENODEV; 496 return -ENODEV;
491 } 497 }
492 498
493 init_sysfs();
494#ifdef CONFIG_SMP 499#ifdef CONFIG_SMP
495 register_cpu_notifier(&oprofile_cpu_nb); 500 register_cpu_notifier(&oprofile_cpu_nb);
496#endif 501#endif
497 using_nmi = 1; 502 /* default values, can be overwritten by model */
498 ops->create_files = nmi_create_files; 503 ops->create_files = nmi_create_files;
499 ops->setup = nmi_setup; 504 ops->setup = nmi_setup;
500 ops->shutdown = nmi_shutdown; 505 ops->shutdown = nmi_shutdown;
501 ops->start = nmi_start; 506 ops->start = nmi_start;
502 ops->stop = nmi_stop; 507 ops->stop = nmi_stop;
503 ops->cpu_type = cpu_type; 508 ops->cpu_type = cpu_type;
509
510 if (model->init)
511 ret = model->init(ops);
512 if (ret)
513 return ret;
514
515 init_sysfs();
516 using_nmi = 1;
504 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 517 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
505 return 0; 518 return 0;
506} 519}
@@ -513,4 +526,6 @@ void op_nmi_exit(void)
513 unregister_cpu_notifier(&oprofile_cpu_nb); 526 unregister_cpu_notifier(&oprofile_cpu_nb);
514#endif 527#endif
515 } 528 }
529 if (model->exit)
530 model->exit();
516} 531}
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
new file mode 100644
index 000000000000..d9faf607b3a6
--- /dev/null
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -0,0 +1,543 @@
1/*
2 * @file op_model_amd.c
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002-2008 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 * @author Robert Richter <robert.richter@amd.com>
12 * @author Barry Kasindorf
13*/
14
15#include <linux/oprofile.h>
16#include <linux/device.h>
17#include <linux/pci.h>
18
19#include <asm/ptrace.h>
20#include <asm/msr.h>
21#include <asm/nmi.h>
22
23#include "op_x86_model.h"
24#include "op_counter.h"
25
26#define NUM_COUNTERS 4
27#define NUM_CONTROLS 4
28
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
31#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
32#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
33
34#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
35#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
36#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
37#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
38#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
39#define CTRL_CLEAR_LO(x) (x &= (1<<21))
40#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
41#define CTRL_SET_ENABLE(val) (val |= 1<<20)
42#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
43#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
44#define CTRL_SET_UM(val, m) (val |= (m << 8))
45#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
46#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
47#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
48#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
49
50static unsigned long reset_value[NUM_COUNTERS];
51
52#ifdef CONFIG_OPROFILE_IBS
53
54/* IbsFetchCtl bits/masks */
55#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */
56#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */
57#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */
58
59/*IbsOpCtl bits */
60#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */
61#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */
62
63/* Codes used in cpu_buffer.c */
64/* This produces duplicate code, need to be fixed */
65#define IBS_FETCH_BEGIN 3
66#define IBS_OP_BEGIN 4
67
68/* The function interface needs to be fixed, something like add
69 data. Should then be added to linux/oprofile.h. */
70extern void oprofile_add_ibs_sample(struct pt_regs *const regs,
71 unsigned int * const ibs_sample, u8 code);
72
73struct ibs_fetch_sample {
74 /* MSRC001_1031 IBS Fetch Linear Address Register */
75 unsigned int ibs_fetch_lin_addr_low;
76 unsigned int ibs_fetch_lin_addr_high;
77 /* MSRC001_1030 IBS Fetch Control Register */
78 unsigned int ibs_fetch_ctl_low;
79 unsigned int ibs_fetch_ctl_high;
80 /* MSRC001_1032 IBS Fetch Physical Address Register */
81 unsigned int ibs_fetch_phys_addr_low;
82 unsigned int ibs_fetch_phys_addr_high;
83};
84
85struct ibs_op_sample {
86 /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */
87 unsigned int ibs_op_rip_low;
88 unsigned int ibs_op_rip_high;
89 /* MSRC001_1035 IBS Op Data Register */
90 unsigned int ibs_op_data1_low;
91 unsigned int ibs_op_data1_high;
92 /* MSRC001_1036 IBS Op Data 2 Register */
93 unsigned int ibs_op_data2_low;
94 unsigned int ibs_op_data2_high;
95 /* MSRC001_1037 IBS Op Data 3 Register */
96 unsigned int ibs_op_data3_low;
97 unsigned int ibs_op_data3_high;
98 /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */
99 unsigned int ibs_dc_linear_low;
100 unsigned int ibs_dc_linear_high;
101 /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */
102 unsigned int ibs_dc_phys_low;
103 unsigned int ibs_dc_phys_high;
104};
105
106/*
107 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
108*/
109static void clear_ibs_nmi(void);
110
111static int ibs_allowed; /* AMD Family10h and later */
112
113struct op_ibs_config {
114 unsigned long op_enabled;
115 unsigned long fetch_enabled;
116 unsigned long max_cnt_fetch;
117 unsigned long max_cnt_op;
118 unsigned long rand_en;
119 unsigned long dispatched_ops;
120};
121
122static struct op_ibs_config ibs_config;
123
124#endif
125
126/* functions for op_amd_spec */
127
128static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
129{
130 int i;
131
132 for (i = 0; i < NUM_COUNTERS; i++) {
133 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
134 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
135 else
136 msrs->counters[i].addr = 0;
137 }
138
139 for (i = 0; i < NUM_CONTROLS; i++) {
140 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
141 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
142 else
143 msrs->controls[i].addr = 0;
144 }
145}
146
147
148static void op_amd_setup_ctrs(struct op_msrs const * const msrs)
149{
150 unsigned int low, high;
151 int i;
152
153 /* clear all counters */
154 for (i = 0 ; i < NUM_CONTROLS; ++i) {
155 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
156 continue;
157 CTRL_READ(low, high, msrs, i);
158 CTRL_CLEAR_LO(low);
159 CTRL_CLEAR_HI(high);
160 CTRL_WRITE(low, high, msrs, i);
161 }
162
163 /* avoid a false detection of ctr overflows in NMI handler */
164 for (i = 0; i < NUM_COUNTERS; ++i) {
165 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
166 continue;
167 CTR_WRITE(1, msrs, i);
168 }
169
170 /* enable active counters */
171 for (i = 0; i < NUM_COUNTERS; ++i) {
172 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
173 reset_value[i] = counter_config[i].count;
174
175 CTR_WRITE(counter_config[i].count, msrs, i);
176
177 CTRL_READ(low, high, msrs, i);
178 CTRL_CLEAR_LO(low);
179 CTRL_CLEAR_HI(high);
180 CTRL_SET_ENABLE(low);
181 CTRL_SET_USR(low, counter_config[i].user);
182 CTRL_SET_KERN(low, counter_config[i].kernel);
183 CTRL_SET_UM(low, counter_config[i].unit_mask);
184 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
185 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
186 CTRL_SET_HOST_ONLY(high, 0);
187 CTRL_SET_GUEST_ONLY(high, 0);
188
189 CTRL_WRITE(low, high, msrs, i);
190 } else {
191 reset_value[i] = 0;
192 }
193 }
194}
195
196#ifdef CONFIG_OPROFILE_IBS
197
198static inline int
199op_amd_handle_ibs(struct pt_regs * const regs,
200 struct op_msrs const * const msrs)
201{
202 unsigned int low, high;
203 struct ibs_fetch_sample ibs_fetch;
204 struct ibs_op_sample ibs_op;
205
206 if (!ibs_allowed)
207 return 1;
208
209 if (ibs_config.fetch_enabled) {
210 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
211 if (high & IBS_FETCH_HIGH_VALID_BIT) {
212 ibs_fetch.ibs_fetch_ctl_high = high;
213 ibs_fetch.ibs_fetch_ctl_low = low;
214 rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high);
215 ibs_fetch.ibs_fetch_lin_addr_high = high;
216 ibs_fetch.ibs_fetch_lin_addr_low = low;
217 rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high);
218 ibs_fetch.ibs_fetch_phys_addr_high = high;
219 ibs_fetch.ibs_fetch_phys_addr_low = low;
220
221 oprofile_add_ibs_sample(regs,
222 (unsigned int *)&ibs_fetch,
223 IBS_FETCH_BEGIN);
224
225 /*reenable the IRQ */
226 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
227 high &= ~IBS_FETCH_HIGH_VALID_BIT;
228 high |= IBS_FETCH_HIGH_ENABLE;
229 low &= IBS_FETCH_LOW_MAX_CNT_MASK;
230 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
231 }
232 }
233
234 if (ibs_config.op_enabled) {
235 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
236 if (low & IBS_OP_LOW_VALID_BIT) {
237 rdmsr(MSR_AMD64_IBSOPRIP, low, high);
238 ibs_op.ibs_op_rip_low = low;
239 ibs_op.ibs_op_rip_high = high;
240 rdmsr(MSR_AMD64_IBSOPDATA, low, high);
241 ibs_op.ibs_op_data1_low = low;
242 ibs_op.ibs_op_data1_high = high;
243 rdmsr(MSR_AMD64_IBSOPDATA2, low, high);
244 ibs_op.ibs_op_data2_low = low;
245 ibs_op.ibs_op_data2_high = high;
246 rdmsr(MSR_AMD64_IBSOPDATA3, low, high);
247 ibs_op.ibs_op_data3_low = low;
248 ibs_op.ibs_op_data3_high = high;
249 rdmsr(MSR_AMD64_IBSDCLINAD, low, high);
250 ibs_op.ibs_dc_linear_low = low;
251 ibs_op.ibs_dc_linear_high = high;
252 rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high);
253 ibs_op.ibs_dc_phys_low = low;
254 ibs_op.ibs_dc_phys_high = high;
255
256 /* reenable the IRQ */
257 oprofile_add_ibs_sample(regs,
258 (unsigned int *)&ibs_op,
259 IBS_OP_BEGIN);
260 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
261 high = 0;
262 low &= ~IBS_OP_LOW_VALID_BIT;
263 low |= IBS_OP_LOW_ENABLE;
264 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
265 }
266 }
267
268 return 1;
269}
270
271#endif
272
273static int op_amd_check_ctrs(struct pt_regs * const regs,
274 struct op_msrs const * const msrs)
275{
276 unsigned int low, high;
277 int i;
278
279 for (i = 0 ; i < NUM_COUNTERS; ++i) {
280 if (!reset_value[i])
281 continue;
282 CTR_READ(low, high, msrs, i);
283 if (CTR_OVERFLOWED(low)) {
284 oprofile_add_sample(regs, i);
285 CTR_WRITE(reset_value[i], msrs, i);
286 }
287 }
288
289#ifdef CONFIG_OPROFILE_IBS
290 op_amd_handle_ibs(regs, msrs);
291#endif
292
293 /* See op_model_ppro.c */
294 return 1;
295}
296
297static void op_amd_start(struct op_msrs const * const msrs)
298{
299 unsigned int low, high;
300 int i;
301 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
302 if (reset_value[i]) {
303 CTRL_READ(low, high, msrs, i);
304 CTRL_SET_ACTIVE(low);
305 CTRL_WRITE(low, high, msrs, i);
306 }
307 }
308
309#ifdef CONFIG_OPROFILE_IBS
310 if (ibs_allowed && ibs_config.fetch_enabled) {
311 low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
312 high = IBS_FETCH_HIGH_ENABLE;
313 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
314 }
315
316 if (ibs_allowed && ibs_config.op_enabled) {
317 low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE;
318 high = 0;
319 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
320 }
321#endif
322}
323
324
325static void op_amd_stop(struct op_msrs const * const msrs)
326{
327 unsigned int low, high;
328 int i;
329
330 /* Subtle: stop on all counters to avoid race with
331 * setting our pm callback */
332 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
333 if (!reset_value[i])
334 continue;
335 CTRL_READ(low, high, msrs, i);
336 CTRL_SET_INACTIVE(low);
337 CTRL_WRITE(low, high, msrs, i);
338 }
339
340#ifdef CONFIG_OPROFILE_IBS
341 if (ibs_allowed && ibs_config.fetch_enabled) {
342 low = 0; /* clear max count and enable */
343 high = 0;
344 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
345 }
346
347 if (ibs_allowed && ibs_config.op_enabled) {
348 low = 0; /* clear max count and enable */
349 high = 0;
350 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
351 }
352#endif
353}
354
355static void op_amd_shutdown(struct op_msrs const * const msrs)
356{
357 int i;
358
359 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
360 if (CTR_IS_RESERVED(msrs, i))
361 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
362 }
363 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
364 if (CTRL_IS_RESERVED(msrs, i))
365 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
366 }
367}
368
369#ifndef CONFIG_OPROFILE_IBS
370
371/* no IBS support */
372
373static int op_amd_init(struct oprofile_operations *ops)
374{
375 return 0;
376}
377
378static void op_amd_exit(void) {}
379
380#else
381
382static u8 ibs_eilvt_off;
383
384static inline void apic_init_ibs_nmi_per_cpu(void *arg)
385{
386 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
387}
388
389static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
390{
391 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
392}
393
394static int pfm_amd64_setup_eilvt(void)
395{
396#define IBSCTL_LVTOFFSETVAL (1 << 8)
397#define IBSCTL 0x1cc
398 struct pci_dev *cpu_cfg;
399 int nodes;
400 u32 value = 0;
401
402 /* per CPU setup */
403 on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
404
405 nodes = 0;
406 cpu_cfg = NULL;
407 do {
408 cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
409 PCI_DEVICE_ID_AMD_10H_NB_MISC,
410 cpu_cfg);
411 if (!cpu_cfg)
412 break;
413 ++nodes;
414 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
415 | IBSCTL_LVTOFFSETVAL);
416 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
417 if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
418 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
419 "IBSCTL = 0x%08x", value);
420 return 1;
421 }
422 } while (1);
423
424 if (!nodes) {
425 printk(KERN_DEBUG "No CPU node configured for IBS");
426 return 1;
427 }
428
429#ifdef CONFIG_NUMA
430 /* Sanity check */
431 /* Works only for 64bit with proper numa implementation. */
432 if (nodes != num_possible_nodes()) {
433 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
434 "found: %d, expected %d",
435 nodes, num_possible_nodes());
436 return 1;
437 }
438#endif
439 return 0;
440}
441
442/*
443 * initialize the APIC for the IBS interrupts
444 * if available (AMD Family10h rev B0 and later)
445 */
446static void setup_ibs(void)
447{
448 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
449
450 if (!ibs_allowed)
451 return;
452
453 if (pfm_amd64_setup_eilvt()) {
454 ibs_allowed = 0;
455 return;
456 }
457
458 printk(KERN_INFO "oprofile: AMD IBS detected\n");
459}
460
461
462/*
463 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
464 * rev B0 and later */
465static void clear_ibs_nmi(void)
466{
467 if (ibs_allowed)
468 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
469}
470
471static int (*create_arch_files)(struct super_block * sb, struct dentry * root);
472
473static int setup_ibs_files(struct super_block * sb, struct dentry * root)
474{
475 char buf[12];
476 struct dentry *dir;
477 int ret = 0;
478
479 /* architecture specific files */
480 if (create_arch_files)
481 ret = create_arch_files(sb, root);
482
483 if (ret)
484 return ret;
485
486 if (!ibs_allowed)
487 return ret;
488
489 /* model specific files */
490
491 /* setup some reasonable defaults */
492 ibs_config.max_cnt_fetch = 250000;
493 ibs_config.fetch_enabled = 0;
494 ibs_config.max_cnt_op = 250000;
495 ibs_config.op_enabled = 0;
496 ibs_config.dispatched_ops = 1;
497 snprintf(buf, sizeof(buf), "ibs_fetch");
498 dir = oprofilefs_mkdir(sb, root, buf);
499 oprofilefs_create_ulong(sb, dir, "rand_enable",
500 &ibs_config.rand_en);
501 oprofilefs_create_ulong(sb, dir, "enable",
502 &ibs_config.fetch_enabled);
503 oprofilefs_create_ulong(sb, dir, "max_count",
504 &ibs_config.max_cnt_fetch);
505 snprintf(buf, sizeof(buf), "ibs_uops");
506 dir = oprofilefs_mkdir(sb, root, buf);
507 oprofilefs_create_ulong(sb, dir, "enable",
508 &ibs_config.op_enabled);
509 oprofilefs_create_ulong(sb, dir, "max_count",
510 &ibs_config.max_cnt_op);
511 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
512 &ibs_config.dispatched_ops);
513
514 return 0;
515}
516
517static int op_amd_init(struct oprofile_operations *ops)
518{
519 setup_ibs();
520 create_arch_files = ops->create_files;
521 ops->create_files = setup_ibs_files;
522 return 0;
523}
524
525static void op_amd_exit(void)
526{
527 clear_ibs_nmi();
528}
529
530#endif
531
532struct op_x86_model_spec const op_amd_spec = {
533 .init = op_amd_init,
534 .exit = op_amd_exit,
535 .num_counters = NUM_COUNTERS,
536 .num_controls = NUM_CONTROLS,
537 .fill_in_addresses = &op_amd_fill_in_addresses,
538 .setup_ctrs = &op_amd_setup_ctrs,
539 .check_ctrs = &op_amd_check_ctrs,
540 .start = &op_amd_start,
541 .stop = &op_amd_stop,
542 .shutdown = &op_amd_shutdown
543};
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
deleted file mode 100644
index 3d534879a9dc..000000000000
--- a/arch/x86/oprofile/op_model_athlon.c
+++ /dev/null
@@ -1,190 +0,0 @@
1/*
2 * @file op_model_athlon.h
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/nmi.h>
17
18#include "op_x86_model.h"
19#include "op_counter.h"
20
21#define NUM_COUNTERS 4
22#define NUM_CONTROLS 4
23
24#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
25#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
26#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
27#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
28
29#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
30#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
31#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
32#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
33#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
34#define CTRL_CLEAR_LO(x) (x &= (1<<21))
35#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
36#define CTRL_SET_ENABLE(val) (val |= 1<<20)
37#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
38#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
39#define CTRL_SET_UM(val, m) (val |= (m << 8))
40#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
41#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
42#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
43#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
44
45static unsigned long reset_value[NUM_COUNTERS];
46
47static void athlon_fill_in_addresses(struct op_msrs * const msrs)
48{
49 int i;
50
51 for (i = 0; i < NUM_COUNTERS; i++) {
52 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
53 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
54 else
55 msrs->counters[i].addr = 0;
56 }
57
58 for (i = 0; i < NUM_CONTROLS; i++) {
59 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
60 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
61 else
62 msrs->controls[i].addr = 0;
63 }
64}
65
66
67static void athlon_setup_ctrs(struct op_msrs const * const msrs)
68{
69 unsigned int low, high;
70 int i;
71
72 /* clear all counters */
73 for (i = 0 ; i < NUM_CONTROLS; ++i) {
74 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
75 continue;
76 CTRL_READ(low, high, msrs, i);
77 CTRL_CLEAR_LO(low);
78 CTRL_CLEAR_HI(high);
79 CTRL_WRITE(low, high, msrs, i);
80 }
81
82 /* avoid a false detection of ctr overflows in NMI handler */
83 for (i = 0; i < NUM_COUNTERS; ++i) {
84 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
85 continue;
86 CTR_WRITE(1, msrs, i);
87 }
88
89 /* enable active counters */
90 for (i = 0; i < NUM_COUNTERS; ++i) {
91 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
92 reset_value[i] = counter_config[i].count;
93
94 CTR_WRITE(counter_config[i].count, msrs, i);
95
96 CTRL_READ(low, high, msrs, i);
97 CTRL_CLEAR_LO(low);
98 CTRL_CLEAR_HI(high);
99 CTRL_SET_ENABLE(low);
100 CTRL_SET_USR(low, counter_config[i].user);
101 CTRL_SET_KERN(low, counter_config[i].kernel);
102 CTRL_SET_UM(low, counter_config[i].unit_mask);
103 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
104 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
105 CTRL_SET_HOST_ONLY(high, 0);
106 CTRL_SET_GUEST_ONLY(high, 0);
107
108 CTRL_WRITE(low, high, msrs, i);
109 } else {
110 reset_value[i] = 0;
111 }
112 }
113}
114
115
116static int athlon_check_ctrs(struct pt_regs * const regs,
117 struct op_msrs const * const msrs)
118{
119 unsigned int low, high;
120 int i;
121
122 for (i = 0 ; i < NUM_COUNTERS; ++i) {
123 if (!reset_value[i])
124 continue;
125 CTR_READ(low, high, msrs, i);
126 if (CTR_OVERFLOWED(low)) {
127 oprofile_add_sample(regs, i);
128 CTR_WRITE(reset_value[i], msrs, i);
129 }
130 }
131
132 /* See op_model_ppro.c */
133 return 1;
134}
135
136
137static void athlon_start(struct op_msrs const * const msrs)
138{
139 unsigned int low, high;
140 int i;
141 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
142 if (reset_value[i]) {
143 CTRL_READ(low, high, msrs, i);
144 CTRL_SET_ACTIVE(low);
145 CTRL_WRITE(low, high, msrs, i);
146 }
147 }
148}
149
150
151static void athlon_stop(struct op_msrs const * const msrs)
152{
153 unsigned int low, high;
154 int i;
155
156 /* Subtle: stop on all counters to avoid race with
157 * setting our pm callback */
158 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
159 if (!reset_value[i])
160 continue;
161 CTRL_READ(low, high, msrs, i);
162 CTRL_SET_INACTIVE(low);
163 CTRL_WRITE(low, high, msrs, i);
164 }
165}
166
167static void athlon_shutdown(struct op_msrs const * const msrs)
168{
169 int i;
170
171 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
172 if (CTR_IS_RESERVED(msrs, i))
173 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
174 }
175 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
176 if (CTRL_IS_RESERVED(msrs, i))
177 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
178 }
179}
180
181struct op_x86_model_spec const op_athlon_spec = {
182 .num_counters = NUM_COUNTERS,
183 .num_controls = NUM_CONTROLS,
184 .fill_in_addresses = &athlon_fill_in_addresses,
185 .setup_ctrs = &athlon_setup_ctrs,
186 .check_ctrs = &athlon_check_ctrs,
187 .start = &athlon_start,
188 .stop = &athlon_stop,
189 .shutdown = &athlon_shutdown
190};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 45b605fa71d0..05a0261ba0c3 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -32,6 +32,8 @@ struct pt_regs;
32 * various x86 CPU models' perfctr support. 32 * various x86 CPU models' perfctr support.
33 */ 33 */
34struct op_x86_model_spec { 34struct op_x86_model_spec {
35 int (*init)(struct oprofile_operations *ops);
36 void (*exit)(void);
35 unsigned int const num_counters; 37 unsigned int const num_counters;
36 unsigned int const num_controls; 38 unsigned int const num_controls;
37 void (*fill_in_addresses)(struct op_msrs * const msrs); 39 void (*fill_in_addresses)(struct op_msrs * const msrs);
@@ -46,6 +48,6 @@ struct op_x86_model_spec {
46extern struct op_x86_model_spec const op_ppro_spec; 48extern struct op_x86_model_spec const op_ppro_spec;
47extern struct op_x86_model_spec const op_p4_spec; 49extern struct op_x86_model_spec const op_p4_spec;
48extern struct op_x86_model_spec const op_p4_ht2_spec; 50extern struct op_x86_model_spec const op_p4_ht2_spec;
49extern struct op_x86_model_spec const op_athlon_spec; 51extern struct op_x86_model_spec const op_amd_spec;
50 52
51#endif /* OP_X86_MODEL_H */ 53#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 19af06927fbc..1d88d2b39771 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -250,10 +250,5 @@ int __init pci_acpi_init(void)
250 acpi_pci_irq_enable(dev); 250 acpi_pci_irq_enable(dev);
251 } 251 }
252 252
253#ifdef CONFIG_X86_IO_APIC
254 if (acpi_ioapic)
255 print_IO_APIC();
256#endif
257
258 return 0; 253 return 0;
259} 254}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 4bdaa590375d..3c27a809393b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); 511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); 512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); 513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
514
515/*
516 * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
517 * confusing the PCI engine:
518 */
519static void sb600_disable_hpet_bar(struct pci_dev *dev)
520{
521 u8 val;
522
523 /*
524 * The SB600 and SB700 both share the same device
525 * ID, but the PM register 0x55 does something different
526 * for the SB700, so make sure we are dealing with the
527 * SB600 before touching the bit:
528 */
529
530 pci_read_config_byte(dev, 0x08, &val);
531
532 if (val < 0x2F) {
533 outb(0x55, 0xCD6);
534 val = inb(0xCD7);
535
536 /* Set bit 7 in PM register 0x55 */
537 outb(0x55, 0xCD6);
538 outb(val | 0x80, 0xCD7);
539 }
540}
541DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 8791fc55e715..844df0cbbd3e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34 34
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h>
36 37
37#include "pci.h" 38#include "pci.h"
38 39
@@ -227,6 +228,8 @@ void __init pcibios_resource_survey(void)
227 pcibios_allocate_bus_resources(&pci_root_buses); 228 pcibios_allocate_bus_resources(&pci_root_buses);
228 pcibios_allocate_resources(0); 229 pcibios_allocate_resources(0);
229 pcibios_allocate_resources(1); 230 pcibios_allocate_resources(1);
231
232 e820_reserve_resources_late();
230} 233}
231 234
232/** 235/**
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index d9635764ce3d..654a2234f8f3 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
209 return name != NULL; 209 return name != NULL;
210} 210}
211 211
212static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) 212static void __init pci_mmcfg_insert_resources(void)
213{ 213{
214#define PCI_MMCFG_RESOURCE_NAME_LEN 19 214#define PCI_MMCFG_RESOURCE_NAME_LEN 19
215 int i; 215 int i;
@@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
233 cfg->pci_segment); 233 cfg->pci_segment);
234 res->start = cfg->address; 234 res->start = cfg->address;
235 res->end = res->start + (num_buses << 20) - 1; 235 res->end = res->start + (num_buses << 20) - 1;
236 res->flags = IORESOURCE_MEM | resource_flags; 236 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
237 insert_resource(&iomem_resource, res); 237 insert_resource(&iomem_resource, res);
238 names += PCI_MMCFG_RESOURCE_NAME_LEN; 238 names += PCI_MMCFG_RESOURCE_NAME_LEN;
239 } 239 }
@@ -434,11 +434,9 @@ static void __init __pci_mmcfg_init(int early)
434 (pci_mmcfg_config[0].address == 0)) 434 (pci_mmcfg_config[0].address == 0))
435 return; 435 return;
436 436
437 if (pci_mmcfg_arch_init()) { 437 if (pci_mmcfg_arch_init())
438 if (known_bridge)
439 pci_mmcfg_insert_resources(IORESOURCE_BUSY);
440 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 438 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
441 } else { 439 else {
442 /* 440 /*
443 * Signal not to attempt to insert mmcfg resources because 441 * Signal not to attempt to insert mmcfg resources because
444 * the architecture mmcfg setup could not initialize. 442 * the architecture mmcfg setup could not initialize.
@@ -475,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
475 * marked so it won't cause request errors when __request_region is 473 * marked so it won't cause request errors when __request_region is
476 * called. 474 * called.
477 */ 475 */
478 pci_mmcfg_insert_resources(0); 476 pci_mmcfg_insert_resources();
479 477
480 return 0; 478 return 0;
481} 479}
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index d3e083dea720..274d06082f48 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -11,6 +11,7 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <asm/mtrr.h> 12#include <asm/mtrr.h>
13#include <asm/mce.h> 13#include <asm/mce.h>
14#include <asm/xcr.h>
14 15
15static struct saved_context saved_context; 16static struct saved_context saved_context;
16 17
@@ -126,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
126 if (boot_cpu_has(X86_FEATURE_SEP)) 127 if (boot_cpu_has(X86_FEATURE_SEP))
127 enable_sep_cpu(); 128 enable_sep_cpu();
128 129
130 /*
131 * restore XCR0 for xsave capable cpu's.
132 */
133 if (cpu_has_xsave)
134 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
135
129 fix_processor_context(); 136 fix_processor_context();
130 do_fpu_end(); 137 do_fpu_end();
131 mtrr_ap_init(); 138 mtrr_ap_init();
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index 66bdfb591fd8..e3b6cf70d62c 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -14,6 +14,7 @@
14#include <asm/page.h> 14#include <asm/page.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/xcr.h>
17 18
18static void fix_processor_context(void); 19static void fix_processor_context(void);
19 20
@@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
122 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 123 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
123 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 124 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
124 125
126 /*
127 * restore XCR0 for xsave capable cpu's.
128 */
129 if (cpu_has_xsave)
130 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
131
125 fix_processor_context(); 132 fix_processor_context();
126 133
127 do_fpu_end(); 134 do_fpu_end();
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 3815e425f470..87b9ab166423 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY
26 26
27config XEN_SAVE_RESTORE 27config XEN_SAVE_RESTORE
28 bool 28 bool
29 depends on PM 29 depends on XEN && PM
30 default y \ No newline at end of file 30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 59c1e539aed2..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
2 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 10
4obj-$(CONFIG_SMP) += smp.o 11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
21struct array_data
22{
23 void *array;
24 unsigned elements;
25};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
33static size_t format_array(char *buf, size_t bufsize, const char *fmt,
34 u32 *array, unsigned array_size)
35{
36 size_t ret = 0;
37 unsigned i;
38
39 for(i = 0; i < array_size; i++) {
40 size_t len;
41
42 len = snprintf(buf, bufsize, fmt, array[i]);
43 len++; /* ' ' or '\n' */
44 ret += len;
45
46 if (buf) {
47 buf += len;
48 bufsize -= len;
49 buf[-1] = (i == array_size-1) ? '\n' : ' ';
50 }
51 }
52
53 ret++; /* \0 */
54 if (buf)
55 *buf = '\0';
56
57 return ret;
58}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7dcd321a0508..0013a729b41d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,12 +30,12 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
36#include <xen/hvc-console.h> 35#include <xen/hvc-console.h>
37 36
38#include <asm/paravirt.h> 37#include <asm/paravirt.h>
38#include <asm/apic.h>
39#include <asm/page.h> 39#include <asm/page.h>
40#include <asm/xen/hypercall.h> 40#include <asm/xen/hypercall.h>
41#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
@@ -57,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59 59
60enum xen_domain_type xen_domain_type = XEN_NATIVE;
61EXPORT_SYMBOL_GPL(xen_domain_type);
62
60/* 63/*
61 * Identity map, in addition to plain kernel map. This needs to be 64 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest. 65 * large enough to allocate page table pages to allocate the rest.
@@ -110,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
110 * 113 *
111 * 0: not available, 1: available 114 * 0: not available, 1: available
112 */ 115 */
113static int have_vcpu_info_placement = 1; 116static int have_vcpu_info_placement =
117#ifdef CONFIG_X86_32
118 1
119#else
120 0
121#endif
122 ;
123
114 124
115static void xen_vcpu_setup(int cpu) 125static void xen_vcpu_setup(int cpu)
116{ 126{
@@ -226,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
226 return HYPERVISOR_get_debugreg(reg); 236 return HYPERVISOR_get_debugreg(reg);
227} 237}
228 238
229static unsigned long xen_save_fl(void) 239static void xen_leave_lazy(void)
230{ 240{
231 struct vcpu_info *vcpu; 241 paravirt_leave_lazy(paravirt_get_lazy_mode());
232 unsigned long flags; 242 xen_mc_flush();
233
234 vcpu = x86_read_percpu(xen_vcpu);
235
236 /* flag has opposite sense of mask */
237 flags = !vcpu->evtchn_upcall_mask;
238
239 /* convert to IF type flag
240 -0 -> 0x00000000
241 -1 -> 0xffffffff
242 */
243 return (-flags) & X86_EFLAGS_IF;
244} 243}
245 244
246static void xen_restore_fl(unsigned long flags) 245static unsigned long xen_store_tr(void)
247{ 246{
248 struct vcpu_info *vcpu; 247 return 0;
249
250 /* convert from IF type flag */
251 flags = !(flags & X86_EFLAGS_IF);
252
253 /* There's a one instruction preempt window here. We need to
254 make sure we're don't switch CPUs between getting the vcpu
255 pointer and updating the mask. */
256 preempt_disable();
257 vcpu = x86_read_percpu(xen_vcpu);
258 vcpu->evtchn_upcall_mask = flags;
259 preempt_enable_no_resched();
260
261 /* Doesn't matter if we get preempted here, because any
262 pending event will get dealt with anyway. */
263
264 if (flags == 0) {
265 preempt_check_resched();
266 barrier(); /* unmask then check (avoid races) */
267 if (unlikely(vcpu->evtchn_upcall_pending))
268 force_evtchn_callback();
269 }
270} 248}
271 249
272static void xen_irq_disable(void) 250/*
251 * Set the page permissions for a particular virtual address. If the
252 * address is a vmalloc mapping (or other non-linear mapping), then
253 * find the linear mapping of the page and also set its protections to
254 * match.
255 */
256static void set_aliased_prot(void *v, pgprot_t prot)
273{ 257{
274 /* There's a one instruction preempt window here. We need to 258 int level;
275 make sure we're don't switch CPUs between getting the vcpu 259 pte_t *ptep;
276 pointer and updating the mask. */ 260 pte_t pte;
277 preempt_disable(); 261 unsigned long pfn;
278 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 262 struct page *page;
279 preempt_enable_no_resched();
280}
281 263
282static void xen_irq_enable(void) 264 ptep = lookup_address((unsigned long)v, &level);
283{ 265 BUG_ON(ptep == NULL);
284 struct vcpu_info *vcpu;
285 266
286 /* We don't need to worry about being preempted here, since 267 pfn = pte_pfn(*ptep);
287 either a) interrupts are disabled, so no preemption, or b) 268 page = pfn_to_page(pfn);
288 the caller is confused and is trying to re-enable interrupts
289 on an indeterminate processor. */
290 269
291 vcpu = x86_read_percpu(xen_vcpu); 270 pte = pfn_pte(pfn, prot);
292 vcpu->evtchn_upcall_mask = 0;
293 271
294 /* Doesn't matter if we get preempted here, because any 272 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
295 pending event will get dealt with anyway. */ 273 BUG();
296 274
297 barrier(); /* unmask then check (avoid races) */ 275 if (!PageHighMem(page)) {
298 if (unlikely(vcpu->evtchn_upcall_pending)) 276 void *av = __va(PFN_PHYS(pfn));
299 force_evtchn_callback();
300}
301 277
302static void xen_safe_halt(void) 278 if (av != v)
303{ 279 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
304 /* Blocking includes an implicit local_irq_enable(). */ 280 BUG();
305 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) 281 } else
306 BUG(); 282 kmap_flush_unused();
307} 283}
308 284
309static void xen_halt(void) 285static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
310{ 286{
311 if (irqs_disabled()) 287 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
312 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 288 int i;
313 else
314 xen_safe_halt();
315}
316 289
317static void xen_leave_lazy(void) 290 for(i = 0; i < entries; i += entries_per_page)
318{ 291 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
319 paravirt_leave_lazy(paravirt_get_lazy_mode());
320 xen_mc_flush();
321} 292}
322 293
323static unsigned long xen_store_tr(void) 294static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
324{ 295{
325 return 0; 296 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
297 int i;
298
299 for(i = 0; i < entries; i += entries_per_page)
300 set_aliased_prot(ldt + i, PAGE_KERNEL);
326} 301}
327 302
328static void xen_set_ldt(const void *addr, unsigned entries) 303static void xen_set_ldt(const void *addr, unsigned entries)
@@ -425,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx)
425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 400static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
426 const void *ptr) 401 const void *ptr)
427{ 402{
428 unsigned long lp = (unsigned long)&dt[entrynum]; 403 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
429 xmaddr_t mach_lp = virt_to_machine(lp);
430 u64 entry = *(u64 *)ptr; 404 u64 entry = *(u64 *)ptr;
431 405
432 preempt_disable(); 406 preempt_disable();
@@ -559,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
559} 533}
560 534
561static void xen_load_sp0(struct tss_struct *tss, 535static void xen_load_sp0(struct tss_struct *tss,
562 struct thread_struct *thread) 536 struct thread_struct *thread)
563{ 537{
564 struct multicall_space mcs = xen_mc_entry(0); 538 struct multicall_space mcs = xen_mc_entry(0);
565 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 539 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -580,16 +554,47 @@ static void xen_io_delay(void)
580} 554}
581 555
582#ifdef CONFIG_X86_LOCAL_APIC 556#ifdef CONFIG_X86_LOCAL_APIC
583static u32 xen_apic_read(unsigned long reg) 557static u32 xen_apic_read(u32 reg)
584{ 558{
585 return 0; 559 return 0;
586} 560}
587 561
588static void xen_apic_write(unsigned long reg, u32 val) 562static void xen_apic_write(u32 reg, u32 val)
589{ 563{
590 /* Warn to see if there's any stray references */ 564 /* Warn to see if there's any stray references */
591 WARN_ON(1); 565 WARN_ON(1);
592} 566}
567
568static u64 xen_apic_icr_read(void)
569{
570 return 0;
571}
572
573static void xen_apic_icr_write(u32 low, u32 id)
574{
575 /* Warn to see if there's any stray references */
576 WARN_ON(1);
577}
578
579static void xen_apic_wait_icr_idle(void)
580{
581 return;
582}
583
584static u32 xen_safe_apic_wait_icr_idle(void)
585{
586 return 0;
587}
588
589static struct apic_ops xen_basic_apic_ops = {
590 .read = xen_apic_read,
591 .write = xen_apic_write,
592 .icr_read = xen_apic_icr_read,
593 .icr_write = xen_apic_icr_write,
594 .wait_icr_idle = xen_apic_wait_icr_idle,
595 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
596};
597
593#endif 598#endif
594 599
595static void xen_flush_tlb(void) 600static void xen_flush_tlb(void)
@@ -803,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
803 ret = -EFAULT; 808 ret = -EFAULT;
804 break; 809 break;
805#endif 810#endif
811
812 case MSR_STAR:
813 case MSR_CSTAR:
814 case MSR_LSTAR:
815 case MSR_SYSCALL_MASK:
816 case MSR_IA32_SYSENTER_CS:
817 case MSR_IA32_SYSENTER_ESP:
818 case MSR_IA32_SYSENTER_EIP:
819 /* Fast syscall setup is all done in hypercalls, so
820 these are all ignored. Stub them out here to stop
821 Xen console noise. */
822 break;
823
806 default: 824 default:
807 ret = native_write_msr_safe(msr, low, high); 825 ret = native_write_msr_safe(msr, low, high);
808 } 826 }
@@ -846,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
846 SetPagePinned(page); 864 SetPagePinned(page);
847 865
848 if (!PageHighMem(page)) { 866 if (!PageHighMem(page)) {
849 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 867 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
850 if (level == PT_PTE) 868 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
851 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 869 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
852 } else 870 } else
853 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
@@ -915,7 +933,7 @@ static void xen_release_ptpage(unsigned long pfn, unsigned level)
915 933
916 if (PagePinned(page)) { 934 if (PagePinned(page)) {
917 if (!PageHighMem(page)) { 935 if (!PageHighMem(page)) {
918 if (level == PT_PTE) 936 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
919 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 937 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
920 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 938 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
921 } 939 }
@@ -962,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
962} 980}
963#endif 981#endif
964 982
983#ifdef CONFIG_X86_32
965static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 984static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
966{ 985{
967 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 986 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -980,6 +999,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
980 999
981 xen_set_pte(ptep, pte); 1000 xen_set_pte(ptep, pte);
982} 1001}
1002#endif
983 1003
984static __init void xen_pagetable_setup_start(pgd_t *base) 1004static __init void xen_pagetable_setup_start(pgd_t *base)
985{ 1005{
@@ -1046,7 +1066,6 @@ void xen_setup_vcpu_info_placement(void)
1046 1066
1047 /* xen_vcpu_setup managed to place the vcpu_info within the 1067 /* xen_vcpu_setup managed to place the vcpu_info within the
1048 percpu area for all cpus, so make use of it */ 1068 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
1050 if (have_vcpu_info_placement) { 1069 if (have_vcpu_info_placement) {
1051 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1070 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1052 1071
@@ -1056,7 +1075,6 @@ void xen_setup_vcpu_info_placement(void)
1056 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1075 pv_irq_ops.irq_enable = xen_irq_enable_direct;
1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1076 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1058 } 1077 }
1059#endif
1060} 1078}
1061 1079
1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1080static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1077,12 +1095,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1077 goto patch_site 1095 goto patch_site
1078 1096
1079 switch (type) { 1097 switch (type) {
1080#ifdef CONFIG_X86_32
1081 SITE(pv_irq_ops, irq_enable); 1098 SITE(pv_irq_ops, irq_enable);
1082 SITE(pv_irq_ops, irq_disable); 1099 SITE(pv_irq_ops, irq_disable);
1083 SITE(pv_irq_ops, save_fl); 1100 SITE(pv_irq_ops, save_fl);
1084 SITE(pv_irq_ops, restore_fl); 1101 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
1086#undef SITE 1102#undef SITE
1087 1103
1088 patch_site: 1104 patch_site:
@@ -1220,6 +1236,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1220 .load_gs_index = xen_load_gs_index, 1236 .load_gs_index = xen_load_gs_index,
1221#endif 1237#endif
1222 1238
1239 .alloc_ldt = xen_alloc_ldt,
1240 .free_ldt = xen_free_ldt,
1241
1223 .store_gdt = native_store_gdt, 1242 .store_gdt = native_store_gdt,
1224 .store_idt = native_store_idt, 1243 .store_idt = native_store_idt,
1225 .store_tr = xen_store_tr, 1244 .store_tr = xen_store_tr,
@@ -1241,40 +1260,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1241 }, 1260 },
1242}; 1261};
1243 1262
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1262 .init_IRQ = __xen_init_IRQ,
1263 .save_fl = xen_save_fl,
1264 .restore_fl = xen_restore_fl,
1265 .irq_disable = xen_irq_disable,
1266 .irq_enable = xen_irq_enable,
1267 .safe_halt = xen_safe_halt,
1268 .halt = xen_halt,
1269#ifdef CONFIG_X86_64
1270 .adjust_exception_frame = xen_adjust_exception_frame,
1271#endif
1272};
1273
1274static const struct pv_apic_ops xen_apic_ops __initdata = { 1263static const struct pv_apic_ops xen_apic_ops __initdata = {
1275#ifdef CONFIG_X86_LOCAL_APIC 1264#ifdef CONFIG_X86_LOCAL_APIC
1276 .apic_write = xen_apic_write,
1277 .apic_read = xen_apic_read,
1278 .setup_boot_clock = paravirt_nop, 1265 .setup_boot_clock = paravirt_nop,
1279 .setup_secondary_clock = paravirt_nop, 1266 .setup_secondary_clock = paravirt_nop,
1280 .startup_ipi_hook = paravirt_nop, 1267 .startup_ipi_hook = paravirt_nop,
@@ -1413,7 +1400,7 @@ static void __init xen_reserve_top(void)
1413 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1400 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1414 top = pp.virt_start; 1401 top = pp.virt_start;
1415 1402
1416 reserve_top_address(-top + 2 * PAGE_SIZE); 1403 reserve_top_address(-top);
1417#endif /* CONFIG_X86_32 */ 1404#endif /* CONFIG_X86_32 */
1418} 1405}
1419 1406
@@ -1447,48 +1434,11 @@ static void *m2v(phys_addr_t maddr)
1447 return __ka(m2p(maddr)); 1434 return __ka(m2p(maddr));
1448} 1435}
1449 1436
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot) 1437static void set_page_prot(void *addr, pgprot_t prot)
1484{ 1438{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1439 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot); 1440 pte_t pte = pfn_pte(pfn, prot);
1487 1441
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) 1442 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG(); 1443 BUG();
1494} 1444}
@@ -1664,6 +1614,8 @@ asmlinkage void __init xen_start_kernel(void)
1664 if (!xen_start_info) 1614 if (!xen_start_info)
1665 return; 1615 return;
1666 1616
1617 xen_domain_type = XEN_PV_DOMAIN;
1618
1667 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1619 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1668 1620
1669 xen_setup_features(); 1621 xen_setup_features();
@@ -1673,10 +1625,18 @@ asmlinkage void __init xen_start_kernel(void)
1673 pv_init_ops = xen_init_ops; 1625 pv_init_ops = xen_init_ops;
1674 pv_time_ops = xen_time_ops; 1626 pv_time_ops = xen_time_ops;
1675 pv_cpu_ops = xen_cpu_ops; 1627 pv_cpu_ops = xen_cpu_ops;
1676 pv_irq_ops = xen_irq_ops;
1677 pv_apic_ops = xen_apic_ops; 1628 pv_apic_ops = xen_apic_ops;
1678 pv_mmu_ops = xen_mmu_ops; 1629 pv_mmu_ops = xen_mmu_ops;
1679 1630
1631 xen_init_irq_ops();
1632
1633#ifdef CONFIG_X86_LOCAL_APIC
1634 /*
1635 * set up the basic apic ops.
1636 */
1637 apic_ops = &xen_basic_apic_ops;
1638#endif
1639
1680 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 1640 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1681 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; 1641 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1682 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; 1642 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1700,7 +1660,7 @@ asmlinkage void __init xen_start_kernel(void)
1700 1660
1701 /* Prevent unwanted bits from being set in PTEs. */ 1661 /* Prevent unwanted bits from being set in PTEs. */
1702 __supported_pte_mask &= ~_PAGE_GLOBAL; 1662 __supported_pte_mask &= ~_PAGE_GLOBAL;
1703 if (!is_initial_xendomain()) 1663 if (!xen_initial_domain())
1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); 1664 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1705 1665
1706 /* Don't do the full vcpu_info placement stuff until we have a 1666 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1735,7 +1695,7 @@ asmlinkage void __init xen_start_kernel(void)
1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1695 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); 1696 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1737 1697
1738 if (!is_initial_xendomain()) { 1698 if (!xen_initial_domain()) {
1739 add_preferred_console("xenboot", 0, NULL); 1699 add_preferred_console("xenboot", 0, NULL);
1740 add_preferred_console("tty", 0, NULL); 1700 add_preferred_console("tty", 0, NULL);
1741 add_preferred_console("hvc", 0, NULL); 1701 add_preferred_console("hvc", 0, NULL);
@@ -1743,15 +1703,6 @@ asmlinkage void __init xen_start_kernel(void)
1743 1703
1744 xen_raw_console_write("about to get started...\n"); 1704 xen_raw_console_write("about to get started...\n");
1745 1705
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1754
1755 /* Start the world */ 1706 /* Start the world */
1756#ifdef CONFIG_X86_32 1707#ifdef CONFIG_X86_32
1757 i386_start_kernel(); 1708 i386_start_kernel();
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..28b85ab8422e
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24#ifdef CONFIG_X86_64
25 int i;
26
27 /* Create identity vector->irq map */
28 for(i = 0; i < NR_VECTORS; i++) {
29 int cpu;
30
31 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i;
33 }
34#endif /* CONFIG_X86_64 */
35
36 xen_init_IRQ();
37}
38
39static unsigned long xen_save_fl(void)
40{
41 struct vcpu_info *vcpu;
42 unsigned long flags;
43
44 vcpu = x86_read_percpu(xen_vcpu);
45
46 /* flag has opposite sense of mask */
47 flags = !vcpu->evtchn_upcall_mask;
48
49 /* convert to IF type flag
50 -0 -> 0x00000000
51 -1 -> 0xffffffff
52 */
53 return (-flags) & X86_EFLAGS_IF;
54}
55
56static void xen_restore_fl(unsigned long flags)
57{
58 struct vcpu_info *vcpu;
59
60 /* convert from IF type flag */
61 flags = !(flags & X86_EFLAGS_IF);
62
63 /* There's a one instruction preempt window here. We need to
64 make sure we're don't switch CPUs between getting the vcpu
65 pointer and updating the mask. */
66 preempt_disable();
67 vcpu = x86_read_percpu(xen_vcpu);
68 vcpu->evtchn_upcall_mask = flags;
69 preempt_enable_no_resched();
70
71 /* Doesn't matter if we get preempted here, because any
72 pending event will get dealt with anyway. */
73
74 if (flags == 0) {
75 preempt_check_resched();
76 barrier(); /* unmask then check (avoid races) */
77 if (unlikely(vcpu->evtchn_upcall_pending))
78 xen_force_evtchn_callback();
79 }
80}
81
82static void xen_irq_disable(void)
83{
84 /* There's a one instruction preempt window here. We need to
85 make sure we're don't switch CPUs between getting the vcpu
86 pointer and updating the mask. */
87 preempt_disable();
88 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
89 preempt_enable_no_resched();
90}
91
92static void xen_irq_enable(void)
93{
94 struct vcpu_info *vcpu;
95
96 /* We don't need to worry about being preempted here, since
97 either a) interrupts are disabled, so no preemption, or b)
98 the caller is confused and is trying to re-enable interrupts
99 on an indeterminate processor. */
100
101 vcpu = x86_read_percpu(xen_vcpu);
102 vcpu->evtchn_upcall_mask = 0;
103
104 /* Doesn't matter if we get preempted here, because any
105 pending event will get dealt with anyway. */
106
107 barrier(); /* unmask then check (avoid races) */
108 if (unlikely(vcpu->evtchn_upcall_pending))
109 xen_force_evtchn_callback();
110}
111
112static void xen_safe_halt(void)
113{
114 /* Blocking includes an implicit local_irq_enable(). */
115 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
116 BUG();
117}
118
119static void xen_halt(void)
120{
121 if (irqs_disabled())
122 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
123 else
124 xen_safe_halt();
125}
126
127static const struct pv_irq_ops xen_irq_ops __initdata = {
128 .init_IRQ = __xen_init_IRQ,
129 .save_fl = xen_save_fl,
130 .restore_fl = xen_restore_fl,
131 .irq_disable = xen_irq_disable,
132 .irq_enable = xen_irq_enable,
133 .safe_halt = xen_safe_halt,
134 .halt = xen_halt,
135#ifdef CONFIG_X86_64
136 .adjust_exception_frame = xen_adjust_exception_frame,
137#endif
138};
139
140void __init xen_init_irq_ops()
141{
142 pv_irq_ops = xen_irq_ops;
143}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
57 58
58#include "multicalls.h" 59#include "multicalls.h"
59#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
60 116
61/* 117/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a 118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
229} 285}
230 286
231 287
232static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
233{ 289{
234 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
235 291
236 return PagePinned(page); 292 return PagePinned(page);
237} 293}
238 294
239static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
240{ 296{
241 struct multicall_space mcs; 297 struct multicall_space mcs;
242 struct mmu_update *u; 298 struct mmu_update *u;
243 299
244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245 301
246 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
247 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
248 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
249 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
251 } 317 }
252 318
253 u = mcs.args; 319 u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
265 /* ptr may be ioremapped for 64-bit pagetable setup */ 331 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
269 337
270 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
271 339
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
274 342
275void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{ 344{
345 ADD_STATS(pmd_update, 1);
346
277 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
278 directly */ 348 directly */
279 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
280 *ptr = val; 350 *ptr = val;
281 return; 351 return;
282 } 352 }
283 353
354 ADD_STATS(pmd_update_pinned, 1);
355
284 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
285} 357}
286 358
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 if (mm == &init_mm) 372 if (mm == &init_mm)
301 preempt_disable(); 373 preempt_disable();
302 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
303 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
304 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
305 struct multicall_space mcs; 382 struct multicall_space mcs;
306 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
307 384
308 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
309 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
310 goto out; 388 goto out;
311 } else 389 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
334 412
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
338 419
339 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
340} 421}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
400 /* ptr may be ioremapped for 64-bit pagetable setup */ 481 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
402 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
403 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
404 487
405 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
406 489
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
409 492
410void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
411{ 494{
495 ADD_STATS(pud_update, 1);
496
412 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
413 directly */ 498 directly */
414 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
415 *ptr = val; 500 *ptr = val;
416 return; 501 return;
417 } 502 }
418 503
504 ADD_STATS(pud_update_pinned, 1);
505
419 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
420} 507}
421 508
422void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
423{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
424#ifdef CONFIG_X86_PAE 515#ifdef CONFIG_X86_PAE
425 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
426 smp_wmb(); 517 smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
490 581
491 u.ptr = virt_to_machine(ptr).maddr; 582 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val); 583 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u); 584 xen_extend_mmu_update(&u);
494} 585}
495 586
496/* 587/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{ 608{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr); 609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519 610
611 ADD_STATS(pgd_update, 1);
612
520 /* If page is not pinned, we can just update the entry 613 /* If page is not pinned, we can just update the entry
521 directly */ 614 directly */
522 if (!page_pinned(ptr)) { 615 if (!xen_page_pinned(ptr)) {
523 *ptr = val; 616 *ptr = val;
524 if (user_ptr) { 617 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr)); 618 WARN_ON(xen_page_pinned(user_ptr));
526 *user_ptr = val; 619 *user_ptr = val;
527 } 620 }
528 return; 621 return;
529 } 622 }
530 623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
531 /* If it's pinned, then we can at least batch the kernel and 627 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */ 628 user updates together. */
533 xen_mc_batch(); 629 xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
555 * For 64-bit, we must skip the Xen hole in the middle of the address 651 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole. 652 * space, just after the big x86-64 virtual hole.
557 */ 653 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), 654static int xen_pgd_walk(struct mm_struct *mm,
559 unsigned long limit) 655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
560{ 658{
659 pgd_t *pgd = mm->pgd;
561 int flush = 0; 660 int flush = 0;
562 unsigned hole_low, hole_high; 661 unsigned hole_low, hole_high;
563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
590 pmdidx_limit = 0; 689 pmdidx_limit = 0;
591#endif 690#endif
592 691
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
596 pud_t *pud; 693 pud_t *pud;
597 694
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
604 pud = pud_offset(&pgd[pgdidx], 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
605 702
606 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
607 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
608 705
609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
610 pmd_t *pmd; 707 pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
619 pmd = pmd_offset(&pud[pudidx], 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
620 717
621 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
622 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
623 720
624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
625 struct page *pte; 722 struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
633 continue; 730 continue;
634 731
635 pte = pmd_page(pmd[pmdidx]); 732 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE); 733 flush |= (*func)(mm, pte, PT_PTE);
637 } 734 }
638 } 735 }
639 } 736 }
737
640out: 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
641 742
642 return flush; 743 return flush;
643} 744}
644 745
645static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
646{ 749{
647 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
648 751
649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
650 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
651 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
652#endif 755#endif
653 756
654 return ptl; 757 return ptl;
655} 758}
656 759
657static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
658{ 761{
659 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
660 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
672 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
673} 776}
674 777
675static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
676{ 780{
677 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
678 int flush; 782 int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
691 795
692 flush = 0; 796 flush = 0;
693 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
694 ptl = NULL; 818 ptl = NULL;
695 if (level == PT_PTE) 819 if (level == PT_PTE)
696 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
697 821
698 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
699 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
700 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
701 825
702 if (level == PT_PTE) 826 if (ptl) {
703 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
704 828
705 if (ptl) {
706 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
707 is completed. */ 830 is completed. */
708 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
709 } 832 }
710 } 833 }
711 834
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level)
715/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
716 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
717 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
718void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
719{ 842{
720 xen_mc_batch(); 843 xen_mc_batch();
721 844
722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
723 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
724 xen_mc_issue(0); 847 xen_mc_issue(0);
725 kmap_flush_unused(); 848 kmap_flush_unused();
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 857
735 if (user_pgd) { 858 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 859 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 860 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 861 }
739 } 862 }
740#else /* CONFIG_X86_32 */ 863#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 864#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 865 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 866 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
867 PT_PMD);
744#endif 868#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 869 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 870#endif /* CONFIG_X86_64 */
747 xen_mc_issue(0); 871 xen_mc_issue(0);
748} 872}
749 873
874static void xen_pgd_pin(struct mm_struct *mm)
875{
876 __xen_pgd_pin(mm, mm->pgd);
877}
878
750/* 879/*
751 * On save, we need to pin all pagetables to make sure they get their 880 * On save, we need to pin all pagetables to make sure they get their
752 * mfns turned into pfns. Search the list for any unpinned pgds and pin 881 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the 882 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction). 883 * process is under construction or destruction).
884 *
885 * Expected to be called in stop_machine() ("equivalent to taking
886 * every spinlock in the system"), so the locking doesn't really
887 * matter all that much.
755 */ 888 */
756void xen_mm_pin_all(void) 889void xen_mm_pin_all(void)
757{ 890{
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void)
762 895
763 list_for_each_entry(page, &pgd_list, lru) { 896 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) { 897 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page)); 898 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
766 SetPageSavePinned(page); 899 SetPageSavePinned(page);
767 } 900 }
768 } 901 }
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 908 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 909 * the book-keeping now.
777 */ 910 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 911static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
912 enum pt_level level)
779{ 913{
780 SetPagePinned(page); 914 SetPagePinned(page);
781 return 0; 915 return 0;
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 917
784void __init xen_mark_init_mm_pinned(void) 918void __init xen_mark_init_mm_pinned(void)
785{ 919{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 920 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
787} 921}
788 922
789static int unpin_page(struct page *page, enum pt_level level) 923static int xen_unpin_page(struct mm_struct *mm, struct page *page,
924 enum pt_level level)
790{ 925{
791 unsigned pgfl = TestClearPagePinned(page); 926 unsigned pgfl = TestClearPagePinned(page);
792 927
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 931 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 932 struct multicall_space mcs;
798 933
934 /*
935 * Do the converse to pin_page. If we're using split
936 * pte locks, we must be holding the lock for while
937 * the pte page is unpinned but still RO to prevent
938 * concurrent updates from seeing it in this
939 * partially-pinned state.
940 */
799 if (level == PT_PTE) { 941 if (level == PT_PTE) {
800 ptl = lock_pte(page); 942 ptl = xen_pte_lock(page, mm);
801 943
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 944 if (ptl)
945 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 946 }
804 947
805 mcs = __xen_mc_entry(0); 948 mcs = __xen_mc_entry(0);
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 953
811 if (ptl) { 954 if (ptl) {
812 /* unlock when batch completed */ 955 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 956 xen_mc_callback(xen_pte_unlock, ptl);
814 } 957 }
815 } 958 }
816 959
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level)
818} 961}
819 962
820/* Release a pagetables pages back as normal RW */ 963/* Release a pagetables pages back as normal RW */
821static void xen_pgd_unpin(pgd_t *pgd) 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
822{ 965{
823 xen_mc_batch(); 966 xen_mc_batch();
824 967
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 973
831 if (user_pgd) { 974 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 975 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 976 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
834 } 977 }
835 } 978 }
836#endif 979#endif
837 980
838#ifdef CONFIG_X86_PAE 981#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 982 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 983 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
984 PT_PMD);
841#endif 985#endif
842 986
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 987 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
844 988
845 xen_mc_issue(0); 989 xen_mc_issue(0);
846} 990}
847 991
992static void xen_pgd_unpin(struct mm_struct *mm)
993{
994 __xen_pgd_unpin(mm, mm->pgd);
995}
996
848/* 997/*
849 * On resume, undo any pinning done at save, so that the rest of the 998 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables. 999 * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void)
859 list_for_each_entry(page, &pgd_list, lru) { 1008 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) { 1009 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page)); 1010 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page)); 1011 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
863 ClearPageSavePinned(page); 1012 ClearPageSavePinned(page);
864 } 1013 }
865 } 1014 }
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void)
870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1019void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
871{ 1020{
872 spin_lock(&next->page_table_lock); 1021 spin_lock(&next->page_table_lock);
873 xen_pgd_pin(next->pgd); 1022 xen_pgd_pin(next);
874 spin_unlock(&next->page_table_lock); 1023 spin_unlock(&next->page_table_lock);
875} 1024}
876 1025
877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1026void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
878{ 1027{
879 spin_lock(&mm->page_table_lock); 1028 spin_lock(&mm->page_table_lock);
880 xen_pgd_pin(mm->pgd); 1029 xen_pgd_pin(mm);
881 spin_unlock(&mm->page_table_lock); 1030 spin_unlock(&mm->page_table_lock);
882} 1031}
883 1032
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
907 } 1056 }
908} 1057}
909 1058
910static void drop_mm_ref(struct mm_struct *mm) 1059static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1060{
912 cpumask_t mask; 1061 cpumask_t mask;
913 unsigned cpu; 1062 unsigned cpu;
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1086 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1087}
939#else 1088#else
940static void drop_mm_ref(struct mm_struct *mm) 1089static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1090{
942 if (current->active_mm == mm) 1091 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1092 load_cr3(swapper_pg_dir);
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1110void xen_exit_mmap(struct mm_struct *mm)
962{ 1111{
963 get_cpu(); /* make sure we don't move around */ 1112 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1113 xen_drop_mm_ref(mm);
965 put_cpu(); 1114 put_cpu();
966 1115
967 spin_lock(&mm->page_table_lock); 1116 spin_lock(&mm->page_table_lock);
968 1117
969 /* pgd may not be pinned in the error exit path of execve */ 1118 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1119 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1120 xen_pgd_unpin(mm);
972 1121
973 spin_unlock(&mm->page_table_lock); 1122 spin_unlock(&mm->page_table_lock);
974} 1123}
1124
1125#ifdef CONFIG_XEN_DEBUG_FS
1126
1127static struct dentry *d_mmu_debug;
1128
1129static int __init xen_mmu_debugfs(void)
1130{
1131 struct dentry *d_xen = xen_init_debugfs();
1132
1133 if (d_xen == NULL)
1134 return -ENOMEM;
1135
1136 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1137
1138 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1139
1140 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1141 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1142 &mmu_stats.pgd_update_pinned);
1143 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1144 &mmu_stats.pgd_update_pinned);
1145
1146 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1147 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1148 &mmu_stats.pud_update_pinned);
1149 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1150 &mmu_stats.pud_update_pinned);
1151
1152 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1153 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1154 &mmu_stats.pmd_update_pinned);
1155 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1156 &mmu_stats.pmd_update_pinned);
1157
1158 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1159// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1160// &mmu_stats.pte_update_pinned);
1161 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1162 &mmu_stats.pte_update_pinned);
1163
1164 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1165 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1166 &mmu_stats.mmu_update_extended);
1167 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1168 mmu_stats.mmu_update_histo, 20);
1169
1170 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1171 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1172 &mmu_stats.set_pte_at_batched);
1173 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1174 &mmu_stats.set_pte_at_current);
1175 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1176 &mmu_stats.set_pte_at_kernel);
1177
1178 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1179 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1180 &mmu_stats.prot_commit_batched);
1181
1182 return 0;
1183}
1184fs_initcall(xen_mmu_debugfs);
1185
1186#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 0f59bd03f9e3..98d71659da5a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
19void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
20 20
21void xen_pgd_pin(pgd_t *pgd);
22//void xen_pgd_unpin(pgd_t *pgd);
23
24pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
25pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
26pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c9776..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
33 36
37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
54/* flush reasons 0- slots, 1- args, 2- callbacks */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
115 191
116 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
117 (argidx + args) > MC_ARGS) { 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
118 xen_mc_flush(); 195 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64)); 196 argidx = roundup(b->argidx, sizeof(u64));
120 } 197 }
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
158 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
159 struct callback *cb; 236 struct callback *cb;
160 237
161 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
162 xen_mc_flush(); 240 xen_mc_flush();
241 }
163 242
164 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
165 cb->fn = fn; 244 cb->fn = fn;
166 cb->data = data; 245 cb->data = data;
167} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1d..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/kernel_stat.h>
19#include <linux/err.h> 16#include <linux/err.h>
20#include <linux/smp.h> 17#include <linux/smp.h>
21 18
@@ -36,8 +33,6 @@
36#include "xen-ops.h" 33#include "xen-ops.h"
37#include "mmu.h" 34#include "mmu.h"
38 35
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
41cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
42 37
43static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
64 return IRQ_HANDLED; 59 return IRQ_HANDLED;
65} 60}
66 61
67static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
68{ 63{
69 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
70 65
71 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
72 preempt_disable(); 68 preempt_disable();
73 69
74 xen_enable_sysenter(); 70 xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
89 local_irq_enable(); 85 local_irq_enable();
90 86
91 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
92 cpu_idle(); 93 cpu_idle();
93} 94}
94 95
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
212 213
213 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
214 } 215 }
215
216 //init_xenbus_allowed_cpumask();
217} 216}
218 217
219static __cpuinit int 218static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
281 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
282 int rc; 281 int rc;
283 282
284#if 0
285 rc = cpu_up_check(cpu);
286 if (rc)
287 return rc;
288#endif
289
290#ifdef CONFIG_X86_64 283#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */ 284 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0); 285 WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
339{ 332{
340} 333}
341 334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
345 return 0;
346}
347
348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
374{
375 return -ENOSYS;
376}
377
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
342static void stop_self(void *v) 389static void stop_self(void *v)
343{ 390{
344 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
419 return IRQ_HANDLED; 466 return IRQ_HANDLED;
420} 467}
421 468
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = { 469static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus, 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done, 472 .smp_cpus_done = xen_smp_cpus_done,
591 473
474 .cpu_up = xen_cpu_up,
475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
478
592 .smp_send_stop = xen_smp_send_stop, 479 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule, 480 .smp_send_reschedule = xen_smp_send_reschedule,
594 481
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..dd71e3a021cd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
215 /* check again make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_this_cpu.irqs[irq]++;
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
357 printk("cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc3..004ba86326ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void)
213 return xen_khz; 211 return xen_khz;
214} 212}
215 213
216static cycle_t xen_clocksource_read(void) 214cycle_t xen_clocksource_read(void)
217{ 215{
218 struct pvclock_vcpu_time_info *src; 216 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 217 cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 450 setup_runstate_info(cpu);
453} 451}
454 452
453void xen_teardown_timer(int cpu)
454{
455 struct clock_event_device *evt;
456 BUG_ON(cpu == 0);
457 evt = &per_cpu(xen_clock_events, cpu);
458 unbind_from_irqhandler(evt->irq, NULL);
459}
460
455void xen_setup_cpu_clockevents(void) 461void xen_setup_cpu_clockevents(void)
456{ 462{
457 BUG_ON(preemptible()); 463 BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb3..05794c566e87 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,8 +26,15 @@
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */ 26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000 27#define XEN_EFLAGS_NMI 0x80000000
28 28
29#if 0 29#if 1
30#include <asm/percpu.h> 30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
31 38
32/* 39/*
33 Enable events. This clears the event mask and tests the pending 40 Enable events. This clears the event mask and tests the pending
@@ -35,6 +42,8 @@
35 events, then enter the hypervisor to get them handled. 42 events, then enter the hypervisor to get them handled.
36 */ 43 */
37ENTRY(xen_irq_enable_direct) 44ENTRY(xen_irq_enable_direct)
45 BUG
46
38 /* Unmask events */ 47 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40 49
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct)
58 non-zero. 67 non-zero.
59 */ 68 */
60ENTRY(xen_irq_disable_direct) 69ENTRY(xen_irq_disable_direct)
70 BUG
71
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct) 73ENDPATCH(xen_irq_disable_direct)
63 ret 74 ret
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct)
74 Xen and x86 use opposite senses (mask vs enable). 85 Xen and x86 use opposite senses (mask vs enable).
75 */ 86 */
76ENTRY(xen_save_fl_direct) 87ENTRY(xen_save_fl_direct)
88 BUG
89
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah 91 setz %ah
79 addb %ah,%ah 92 addb %ah,%ah
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct)
91 if so. 104 if so.
92 */ 105 */
93ENTRY(xen_restore_fl_direct) 106ENTRY(xen_restore_fl_direct)
107 BUG
108
94 testb $X86_EFLAGS_IF>>8, %ah 109 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with 111 /* Preempt here doesn't matter because that will deal with
@@ -122,7 +137,7 @@ check_events:
122 push %r9 137 push %r9
123 push %r10 138 push %r10
124 push %r11 139 push %r11
125 call force_evtchn_callback 140 call xen_force_evtchn_callback
126 pop %r11 141 pop %r11
127 pop %r10 142 pop %r10
128 pop %r9 143 pop %r9
@@ -133,7 +148,6 @@ check_events:
133 pop %rcx 148 pop %rcx
134 pop %rax 149 pop %rax
135 ret 150 ret
136#endif
137 151
138ENTRY(xen_adjust_exception_frame) 152ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx 153 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2e..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/clocksource.h>
5#include <linux/irqreturn.h> 6#include <linux/irqreturn.h>
6#include <xen/xen-ops.h> 7#include <xen/xen-ops.h>
7 8
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);
31 32
32void __init xen_build_dynamic_phys_to_machine(void); 33void __init xen_build_dynamic_phys_to_machine(void);
33 34
35void xen_init_irq_ops(void);
34void xen_setup_timer(int cpu); 36void xen_setup_timer(int cpu);
37void xen_teardown_timer(int cpu);
38cycle_t xen_clocksource_read(void);
35void xen_setup_cpu_clockevents(void); 39void xen_setup_cpu_clockevents(void);
36unsigned long xen_tsc_khz(void); 40unsigned long xen_tsc_khz(void);
37void __init xen_time_init(void); 41void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);
50#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
51void xen_smp_init(void); 55void xen_smp_init(void);
52 56
57void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu);
60
53extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_t xen_cpu_initialized_map;
54#else 62#else
55static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}