author    Ingo Molnar <mingo@elte.hu>  2008-07-18 13:31:12 -0400
committer Ingo Molnar <mingo@elte.hu>  2008-07-18 13:31:12 -0400
commit    3e370b29d35fb01bfb92c2814d6f79bf6a2cb970 (patch)
tree      3b8fb467d60bfe6a34686f4abdc3a60050ba40a4 /arch/x86
parent    88d1dce3a74367291f65a757fbdcaf17f042f30c (diff)
parent    5b664cb235e97afbf34db9c4d77f08ebd725335e (diff)
Merge branch 'linus' into x86/pci-ioapic-boot-irq-quirks
Conflicts:

	drivers/pci/quirks.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig | 91
-rw-r--r-- arch/x86/Kconfig.cpu | 2
-rw-r--r-- arch/x86/Kconfig.debug | 27
-rw-r--r-- arch/x86/Makefile | 4
-rw-r--r-- arch/x86/ia32/ia32entry.S | 25
-rw-r--r-- arch/x86/kernel/Makefile | 12
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 57
-rw-r--r-- arch/x86/kernel/acpi/processor.c | 6
-rw-r--r-- arch/x86/kernel/acpi/sleep.c | 32
-rw-r--r-- arch/x86/kernel/alternative.c | 22
-rw-r--r-- arch/x86/kernel/apic_32.c | 29
-rw-r--r-- arch/x86/kernel/apm_32.c | 12
-rw-r--r-- arch/x86/kernel/asm-offsets_64.c | 2
-rw-r--r-- arch/x86/kernel/cpu/amd_64.c | 11
-rw-r--r-- arch/x86/kernel/cpu/centaur_64.c | 12
-rw-r--r-- arch/x86/kernel/cpu/common_64.c | 17
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 4
-rw-r--r-- arch/x86/kernel/cpu/intel_64.c | 12
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_64.c | 10
-rw-r--r-- arch/x86/kernel/cpu/mcheck/non-fatal.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mtrr/main.c | 4
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 4
-rw-r--r-- arch/x86/kernel/cpuid.c | 27
-rw-r--r-- arch/x86/kernel/e820.c | 161
-rw-r--r-- arch/x86/kernel/early-quirks.c | 26
-rw-r--r-- arch/x86/kernel/early_printk.c | 2
-rw-r--r-- arch/x86/kernel/efi.c | 2
-rw-r--r-- arch/x86/kernel/entry_32.S | 73
-rw-r--r-- arch/x86/kernel/entry_64.S | 139
-rw-r--r-- arch/x86/kernel/ftrace.c | 141
-rw-r--r-- arch/x86/kernel/genx2apic_uv_x.c | 75
-rw-r--r-- arch/x86/kernel/hpet.c | 20
-rw-r--r-- arch/x86/kernel/i386_ksyms_32.c | 9
-rw-r--r-- arch/x86/kernel/io_apic_32.c | 42
-rw-r--r-- arch/x86/kernel/io_apic_64.c | 61
-rw-r--r-- arch/x86/kernel/irqinit_64.c | 4
-rw-r--r-- arch/x86/kernel/ldt.c | 2
-rw-r--r-- arch/x86/kernel/machine_kexec_32.c | 4
-rw-r--r-- arch/x86/kernel/machine_kexec_64.c | 4
-rw-r--r-- arch/x86/kernel/microcode.c | 6
-rw-r--r-- arch/x86/kernel/mpparse.c | 20
-rw-r--r-- arch/x86/kernel/msr.c | 16
-rw-r--r-- arch/x86/kernel/nmi.c | 9
-rw-r--r-- arch/x86/kernel/numaq_32.c | 7
-rw-r--r-- arch/x86/kernel/paravirt.c | 2
-rw-r--r-- arch/x86/kernel/pci-gart_64.c | 8
-rw-r--r-- arch/x86/kernel/process.c | 30
-rw-r--r-- arch/x86/kernel/process_32.c | 3
-rw-r--r-- arch/x86/kernel/process_64.c | 3
-rw-r--r-- arch/x86/kernel/quirks.c | 2
-rw-r--r-- arch/x86/kernel/setup.c | 73
-rw-r--r-- arch/x86/kernel/setup_percpu.c | 8
-rw-r--r-- arch/x86/kernel/smp.c | 158
-rw-r--r-- arch/x86/kernel/smpboot.c | 8
-rw-r--r-- arch/x86/kernel/smpcommon.c | 56
-rw-r--r-- arch/x86/kernel/stacktrace.c | 2
-rw-r--r-- arch/x86/kernel/time_32.c | 3
-rw-r--r-- arch/x86/kernel/time_64.c | 18
-rw-r--r-- arch/x86/kernel/tlb_32.c | 2
-rw-r--r-- arch/x86/kernel/tlb_64.c | 2
-rw-r--r-- arch/x86/kernel/traps_32.c | 177
-rw-r--r-- arch/x86/kernel/traps_64.c | 512
-rw-r--r-- arch/x86/kernel/tsc.c | 535
-rw-r--r-- arch/x86/kernel/tsc_32.c | 455
-rw-r--r-- arch/x86/kernel/tsc_64.c | 357
-rw-r--r-- arch/x86/kernel/visws_quirks.c | 709
-rw-r--r-- arch/x86/kernel/vmi_32.c | 2
-rw-r--r-- arch/x86/kernel/vmiclock_32.c | 4
-rw-r--r-- arch/x86/kernel/vmlinux_32.lds.S | 8
-rw-r--r-- arch/x86/kernel/vmlinux_64.lds.S | 10
-rw-r--r-- arch/x86/kernel/vsyscall_64.c | 7
-rw-r--r-- arch/x86/kernel/x8664_ksyms_64.c | 11
-rw-r--r-- arch/x86/kvm/vmx.c | 4
-rw-r--r-- arch/x86/kvm/x86.c | 2
-rw-r--r-- arch/x86/lguest/Kconfig | 2
-rw-r--r-- arch/x86/lguest/boot.c | 4
-rw-r--r-- arch/x86/lib/Makefile | 5
-rw-r--r-- arch/x86/lib/copy_user_64.S | 429
-rw-r--r-- arch/x86/lib/copy_user_nocache_64.S | 283
-rw-r--r-- arch/x86/lib/delay.c (renamed from arch/x86/lib/delay_32.c) | 17
-rw-r--r-- arch/x86/lib/delay_64.c | 85
-rw-r--r-- arch/x86/lib/getuser.S (renamed from arch/x86/lib/getuser_64.S) | 87
-rw-r--r-- arch/x86/lib/getuser_32.S | 78
-rw-r--r-- arch/x86/lib/msr-on-cpu.c | 8
-rw-r--r-- arch/x86/lib/putuser.S (renamed from arch/x86/lib/putuser_32.S) | 73
-rw-r--r-- arch/x86/lib/putuser_64.S | 106
-rw-r--r-- arch/x86/lib/thunk_32.S | 47
-rw-r--r-- arch/x86/lib/thunk_64.S | 19
-rw-r--r-- arch/x86/lib/usercopy_64.c | 23
-rw-r--r-- arch/x86/mach-default/setup.c | 32
-rw-r--r-- arch/x86/mach-visws/Makefile | 8
-rw-r--r-- arch/x86/mach-visws/mpparse.c | 85
-rw-r--r-- arch/x86/mach-visws/reboot.c | 55
-rw-r--r-- arch/x86/mach-visws/setup.c | 183
-rw-r--r-- arch/x86/mach-visws/traps.c | 69
-rw-r--r-- arch/x86/mach-visws/visws_apic.c | 296
-rw-r--r-- arch/x86/mach-voyager/voyager_smp.c | 98
-rw-r--r-- arch/x86/mm/Makefile | 5
-rw-r--r-- arch/x86/mm/fault.c | 13
-rw-r--r-- arch/x86/mm/init_32.c | 5
-rw-r--r-- arch/x86/mm/init_64.c | 292
-rw-r--r-- arch/x86/mm/ioremap.c | 11
-rw-r--r-- arch/x86/mm/kmmio.c | 510
-rw-r--r-- arch/x86/mm/mmio-mod.c | 515
-rw-r--r-- arch/x86/mm/pageattr.c | 26
-rw-r--r-- arch/x86/mm/pat.c | 3
-rw-r--r-- arch/x86/mm/pf_in.c | 489
-rw-r--r-- arch/x86/mm/pf_in.h | 39
-rw-r--r-- arch/x86/mm/pgtable_32.c | 1
-rw-r--r-- arch/x86/mm/srat_32.c | 3
-rw-r--r-- arch/x86/mm/srat_64.c | 14
-rw-r--r-- arch/x86/mm/testmmiotrace.c | 71
-rw-r--r-- arch/x86/oprofile/nmi_int.c | 10
-rw-r--r-- arch/x86/pci/Makefile | 22
-rw-r--r-- arch/x86/pci/Makefile_32 | 26
-rw-r--r-- arch/x86/pci/Makefile_64 | 17
-rw-r--r-- arch/x86/pci/acpi.c | 3
-rw-r--r-- arch/x86/pci/amd_bus.c | 76
-rw-r--r-- arch/x86/pci/common.c | 34
-rw-r--r-- arch/x86/pci/early.c | 60
-rw-r--r-- arch/x86/pci/i386.c | 4
-rw-r--r-- arch/x86/pci/init.c | 4
-rw-r--r-- arch/x86/pci/irq.c | 125
-rw-r--r-- arch/x86/pci/legacy.c | 16
-rw-r--r-- arch/x86/pci/mp_bus_to_node.c | 23
-rw-r--r-- arch/x86/pci/numa.c | 4
-rw-r--r-- arch/x86/pci/pci.h | 13
-rw-r--r-- arch/x86/pci/visws.c | 28
-rw-r--r-- arch/x86/vdso/vclock_gettime.c | 15
-rw-r--r-- arch/x86/vdso/vdso32-setup.c | 11
-rw-r--r-- arch/x86/vdso/vgetcpu.c | 3
-rw-r--r-- arch/x86/xen/enlighten.c | 6
-rw-r--r-- arch/x86/xen/mmu.c | 2
-rw-r--r-- arch/x86/xen/smp.c | 135
-rw-r--r-- arch/x86/xen/time.c | 4
-rw-r--r-- arch/x86/xen/xen-ops.h | 11
136 files changed, 5474 insertions, 3685 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bb0c0d0f6db7..96e0c2ebc388 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,8 @@ config X86
23 select HAVE_OPROFILE 23 select HAVE_OPROFILE
24 select HAVE_KPROBES 24 select HAVE_KPROBES
25 select HAVE_KRETPROBES 25 select HAVE_KRETPROBES
26 select HAVE_DYNAMIC_FTRACE
27 select HAVE_FTRACE
26 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 28 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
27 select HAVE_ARCH_KGDB if !X86_VOYAGER 29 select HAVE_ARCH_KGDB if !X86_VOYAGER
28 30
@@ -168,6 +170,7 @@ config GENERIC_PENDING_IRQ
168config X86_SMP 170config X86_SMP
169 bool 171 bool
170 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) 172 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
173 select USE_GENERIC_SMP_HELPERS
171 default y 174 default y
172 175
173config X86_32_SMP 176config X86_32_SMP
@@ -181,12 +184,12 @@ config X86_64_SMP
181config X86_HT 184config X86_HT
182 bool 185 bool
183 depends on SMP 186 depends on SMP
184 depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64 187 depends on (X86_32 && !X86_VOYAGER) || X86_64
185 default y 188 default y
186 189
187config X86_BIOS_REBOOT 190config X86_BIOS_REBOOT
188 bool 191 bool
189 depends on !X86_VISWS && !X86_VOYAGER 192 depends on !X86_VOYAGER
190 default y 193 default y
191 194
192config X86_TRAMPOLINE 195config X86_TRAMPOLINE
@@ -232,13 +235,13 @@ config SMP
232 235
233config X86_FIND_SMP_CONFIG 236config X86_FIND_SMP_CONFIG
234 def_bool y 237 def_bool y
235 depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS 238 depends on X86_MPPARSE || X86_VOYAGER
236 239
237if ACPI 240if ACPI
238config X86_MPPARSE 241config X86_MPPARSE
239 def_bool y 242 def_bool y
240 bool "Enable MPS table" 243 bool "Enable MPS table"
241 depends on X86_LOCAL_APIC && !X86_VISWS 244 depends on X86_LOCAL_APIC
242 help 245 help
243 For old smp systems that do not have proper acpi support. Newer systems 246 For old smp systems that do not have proper acpi support. Newer systems
244 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it 247 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
@@ -247,7 +250,7 @@ endif
247if !ACPI 250if !ACPI
248config X86_MPPARSE 251config X86_MPPARSE
249 def_bool y 252 def_bool y
250 depends on X86_LOCAL_APIC && !X86_VISWS 253 depends on X86_LOCAL_APIC
251endif 254endif
252 255
253choice 256choice
@@ -281,18 +284,6 @@ config X86_VOYAGER
281 If you do not specifically know you have a Voyager based machine, 284 If you do not specifically know you have a Voyager based machine,
282 say N here, otherwise the kernel you build will not be bootable. 285 say N here, otherwise the kernel you build will not be bootable.
283 286
284config X86_VISWS
285 bool "SGI 320/540 (Visual Workstation)"
286 depends on X86_32 && !PCI
287 help
288 The SGI Visual Workstation series is an IA32-based workstation
289 based on SGI systems chips with some legacy PC hardware attached.
290
291 Say Y here to create a kernel to run on the SGI 320 or 540.
292
293 A kernel compiled for the Visual Workstation will not run on PCs
294 and vice versa. See <file:Documentation/sgi-visws.txt> for details.
295
296config X86_GENERICARCH 287config X86_GENERICARCH
297 bool "Generic architecture" 288 bool "Generic architecture"
298 depends on X86_32 289 depends on X86_32
@@ -355,7 +346,7 @@ config X86_RDC321X
355config X86_VSMP 346config X86_VSMP
356 bool "Support for ScaleMP vSMP" 347 bool "Support for ScaleMP vSMP"
357 select PARAVIRT 348 select PARAVIRT
358 depends on X86_64 && !PCI 349 depends on X86_64 && PCI
359 help 350 help
360 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is 351 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
361 supposed to run on these EM64T-based machines. Only choose this option 352 supposed to run on these EM64T-based machines. Only choose this option
@@ -363,6 +354,18 @@ config X86_VSMP
363 354
364endchoice 355endchoice
365 356
357config X86_VISWS
358 bool "SGI 320/540 (Visual Workstation)"
359 depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
360 help
361 The SGI Visual Workstation series is an IA32-based workstation
362 based on SGI systems chips with some legacy PC hardware attached.
363
364 Say Y here to create a kernel to run on the SGI 320 or 540.
365
366 A kernel compiled for the Visual Workstation will run on general
367 PCs as well. See <file:Documentation/sgi-visws.txt> for details.
368
366config SCHED_NO_NO_OMIT_FRAME_POINTER 369config SCHED_NO_NO_OMIT_FRAME_POINTER
367 def_bool y 370 def_bool y
368 prompt "Single-depth WCHAN output" 371 prompt "Single-depth WCHAN output"
@@ -391,7 +394,7 @@ config VMI
391 bool "VMI Guest support" 394 bool "VMI Guest support"
392 select PARAVIRT 395 select PARAVIRT
393 depends on X86_32 396 depends on X86_32
394 depends on !(X86_VISWS || X86_VOYAGER) 397 depends on !X86_VOYAGER
395 help 398 help
396 VMI provides a paravirtualized interface to the VMware ESX server 399 VMI provides a paravirtualized interface to the VMware ESX server
397 (it could be used by other hypervisors in theory too, but is not 400 (it could be used by other hypervisors in theory too, but is not
@@ -402,7 +405,7 @@ config KVM_CLOCK
402 bool "KVM paravirtualized clock" 405 bool "KVM paravirtualized clock"
403 select PARAVIRT 406 select PARAVIRT
404 select PARAVIRT_CLOCK 407 select PARAVIRT_CLOCK
405 depends on !(X86_VISWS || X86_VOYAGER) 408 depends on !X86_VOYAGER
406 help 409 help
407 Turning on this option will allow you to run a paravirtualized clock 410 Turning on this option will allow you to run a paravirtualized clock
408 when running over the KVM hypervisor. Instead of relying on a PIT 411 when running over the KVM hypervisor. Instead of relying on a PIT
@@ -413,7 +416,7 @@ config KVM_CLOCK
413config KVM_GUEST 416config KVM_GUEST
414 bool "KVM Guest support" 417 bool "KVM Guest support"
415 select PARAVIRT 418 select PARAVIRT
416 depends on !(X86_VISWS || X86_VOYAGER) 419 depends on !X86_VOYAGER
417 help 420 help
418 This option enables various optimizations for running under the KVM 421 This option enables various optimizations for running under the KVM
419 hypervisor. 422 hypervisor.
@@ -422,7 +425,7 @@ source "arch/x86/lguest/Kconfig"
422 425
423config PARAVIRT 426config PARAVIRT
424 bool "Enable paravirtualization code" 427 bool "Enable paravirtualization code"
425 depends on !(X86_VISWS || X86_VOYAGER) 428 depends on !X86_VOYAGER
426 help 429 help
427 This changes the kernel so it can modify itself when it is run 430 This changes the kernel so it can modify itself when it is run
428 under a hypervisor, potentially improving performance significantly 431 under a hypervisor, potentially improving performance significantly
@@ -445,7 +448,6 @@ config PARAVIRT_DEBUG
445config MEMTEST 448config MEMTEST
446 bool "Memtest" 449 bool "Memtest"
447 depends on X86_64 450 depends on X86_64
448 default y
449 help 451 help
450 This option adds a kernel parameter 'memtest', which allows memtest 452 This option adds a kernel parameter 'memtest', which allows memtest
451 to be set. 453 to be set.
@@ -453,7 +455,7 @@ config MEMTEST
453 memtest=1, mean do 1 test pattern; 455 memtest=1, mean do 1 test pattern;
454 ... 456 ...
455 memtest=4, mean do 4 test patterns. 457 memtest=4, mean do 4 test patterns.
456 If you are unsure how to answer this question, answer Y. 458 If you are unsure how to answer this question, answer N.
457 459
458config X86_SUMMIT_NUMA 460config X86_SUMMIT_NUMA
459 def_bool y 461 def_bool y
@@ -575,7 +577,7 @@ config SWIOTLB
575 3 GB of memory. If unsure, say Y. 577 3 GB of memory. If unsure, say Y.
576 578
577config IOMMU_HELPER 579config IOMMU_HELPER
578 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) 580 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
579config MAXSMP 581config MAXSMP
580 bool "Configure Maximum number of SMP Processors and NUMA Nodes" 582 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
581 depends on X86_64 && SMP 583 depends on X86_64 && SMP
@@ -628,7 +630,7 @@ source "kernel/Kconfig.preempt"
628 630
629config X86_UP_APIC 631config X86_UP_APIC
630 bool "Local APIC support on uniprocessors" 632 bool "Local APIC support on uniprocessors"
631 depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) 633 depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
632 help 634 help
633 A local APIC (Advanced Programmable Interrupt Controller) is an 635 A local APIC (Advanced Programmable Interrupt Controller) is an
634 integrated interrupt controller in the CPU. If you have a single-CPU 636 integrated interrupt controller in the CPU. If you have a single-CPU
@@ -653,11 +655,11 @@ config X86_UP_IOAPIC
653 655
654config X86_LOCAL_APIC 656config X86_LOCAL_APIC
655 def_bool y 657 def_bool y
656 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) 658 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
657 659
658config X86_IO_APIC 660config X86_IO_APIC
659 def_bool y 661 def_bool y
660 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) 662 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
661 663
662config X86_VISWS_APIC 664config X86_VISWS_APIC
663 def_bool y 665 def_bool y
@@ -711,7 +713,7 @@ config X86_MCE_NONFATAL
711 713
712config X86_MCE_P4THERMAL 714config X86_MCE_P4THERMAL
713 bool "check for P4 thermal throttling interrupt." 715 bool "check for P4 thermal throttling interrupt."
714 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS 716 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
715 help 717 help
716 Enabling this feature will cause a message to be printed when the P4 718 Enabling this feature will cause a message to be printed when the P4
717 enters thermal throttling. 719 enters thermal throttling.
@@ -1133,21 +1135,18 @@ config MTRR
1133 See <file:Documentation/mtrr.txt> for more information. 1135 See <file:Documentation/mtrr.txt> for more information.
1134 1136
1135config MTRR_SANITIZER 1137config MTRR_SANITIZER
1136 def_bool y 1138 bool
1137 prompt "MTRR cleanup support" 1139 prompt "MTRR cleanup support"
1138 depends on MTRR 1140 depends on MTRR
1139 help 1141 help
1140 Convert MTRR layout from continuous to discrete, so some X driver 1142 Convert MTRR layout from continuous to discrete, so X drivers can
1141 could add WB entries. 1143 add writeback entries.
1142
1143 Say N here if you see bootup problems (boot crash, boot hang,
1144 spontaneous reboots).
1145 1144
1146 Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size 1145 Can be disabled with disable_mtrr_cleanup on the kernel command line.
1147 could be used to send largest mtrr entry size for continuous block 1146 The largest mtrr entry size for a continous block can be set with
1148 to hold holes (aka. UC entries) 1147 mtrr_chunk_size.
1149 1148
1150 If unsure, say Y. 1149 If unsure, say N.
1151 1150
1152config MTRR_SANITIZER_ENABLE_DEFAULT 1151config MTRR_SANITIZER_ENABLE_DEFAULT
1153 int "MTRR cleanup enable value (0-1)" 1152 int "MTRR cleanup enable value (0-1)"
@@ -1164,7 +1163,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1164 depends on MTRR_SANITIZER 1163 depends on MTRR_SANITIZER
1165 help 1164 help
1166 mtrr cleanup spare entries default, it can be changed via 1165 mtrr cleanup spare entries default, it can be changed via
1167 mtrr_spare_reg_nr= 1166 mtrr_spare_reg_nr=N on the kernel command line.
1168 1167
1169config X86_PAT 1168config X86_PAT
1170 bool 1169 bool
@@ -1414,7 +1413,7 @@ config X86_APM_BOOT
1414 1413
1415menuconfig APM 1414menuconfig APM
1416 tristate "APM (Advanced Power Management) BIOS support" 1415 tristate "APM (Advanced Power Management) BIOS support"
1417 depends on X86_32 && PM_SLEEP && !X86_VISWS 1416 depends on X86_32 && PM_SLEEP
1418 ---help--- 1417 ---help---
1419 APM is a BIOS specification for saving power using several different 1418 APM is a BIOS specification for saving power using several different
1420 techniques. This is mostly useful for battery powered laptops with 1419 techniques. This is mostly useful for battery powered laptops with
@@ -1561,7 +1560,7 @@ config PCI
1561 1560
1562choice 1561choice
1563 prompt "PCI access mode" 1562 prompt "PCI access mode"
1564 depends on X86_32 && PCI && !X86_VISWS 1563 depends on X86_32 && PCI
1565 default PCI_GOANY 1564 default PCI_GOANY
1566 ---help--- 1565 ---help---
1567 On PCI systems, the BIOS can be used to detect the PCI devices and 1566 On PCI systems, the BIOS can be used to detect the PCI devices and
@@ -1598,12 +1597,12 @@ endchoice
1598 1597
1599config PCI_BIOS 1598config PCI_BIOS
1600 def_bool y 1599 def_bool y
1601 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) 1600 depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
1602 1601
1603# x86-64 doesn't support PCI BIOS access from long mode so always go direct. 1602# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1604config PCI_DIRECT 1603config PCI_DIRECT
1605 def_bool y 1604 def_bool y
1606 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC) || X86_VISWS) 1605 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
1607 1606
1608config PCI_MMCONFIG 1607config PCI_MMCONFIG
1609 def_bool y 1608 def_bool y
@@ -1663,7 +1662,7 @@ if X86_32
1663 1662
1664config ISA 1663config ISA
1665 bool "ISA support" 1664 bool "ISA support"
1666 depends on !(X86_VOYAGER || X86_VISWS) 1665 depends on !X86_VOYAGER
1667 help 1666 help
1668 Find out whether you have ISA slots on your motherboard. ISA is the 1667 Find out whether you have ISA slots on your motherboard. ISA is the
1669 name of a bus system, i.e. the way the CPU talks to the other stuff 1668 name of a bus system, i.e. the way the CPU talks to the other stuff
@@ -1690,7 +1689,7 @@ config EISA
1690source "drivers/eisa/Kconfig" 1689source "drivers/eisa/Kconfig"
1691 1690
1692config MCA 1691config MCA
1693 bool "MCA support" if !(X86_VISWS || X86_VOYAGER) 1692 bool "MCA support" if !X86_VOYAGER
1694 default y if X86_VOYAGER 1693 default y if X86_VOYAGER
1695 help 1694 help
1696 MicroChannel Architecture is found in some IBM PS/2 machines and 1695 MicroChannel Architecture is found in some IBM PS/2 machines and
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 3d22bb8175b4..abff1b84ed5b 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -344,7 +344,7 @@ config X86_F00F_BUG
344 344
345config X86_WP_WORKS_OK 345config X86_WP_WORKS_OK
346 def_bool y 346 def_bool y
347 depends on X86_32 && !M386 347 depends on !M386
348 348
349config X86_INVLPG 349config X86_INVLPG
350 def_bool y 350 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index acc0271920f2..ae36bfa814e5 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -171,6 +171,33 @@ config IOMMU_LEAK
171 Add a simple leak tracer to the IOMMU code. This is useful when you 171 Add a simple leak tracer to the IOMMU code. This is useful when you
172 are debugging a buggy device driver that leaks IOMMU mappings. 172 are debugging a buggy device driver that leaks IOMMU mappings.
173 173
174config MMIOTRACE_HOOKS
175 bool
176
177config MMIOTRACE
178 bool "Memory mapped IO tracing"
179 depends on DEBUG_KERNEL && PCI
180 select TRACING
181 select MMIOTRACE_HOOKS
182 help
183 Mmiotrace traces Memory Mapped I/O access and is meant for
184 debugging and reverse engineering. It is called from the ioremap
185 implementation and works via page faults. Tracing is disabled by
186 default and can be enabled at run-time.
187
188 See Documentation/tracers/mmiotrace.txt.
189 If you are not helping to develop drivers, say N.
190
191config MMIOTRACE_TEST
192 tristate "Test module for mmiotrace"
193 depends on MMIOTRACE && m
194 help
195 This is a dumb module for testing mmiotrace. It is very dangerous
196 as it will write garbage to IO memory starting at a given address.
197 However, it should be safe to use on e.g. unused portion of VRAM.
198
199 Say N, unless you absolutely know what you are doing.
200
174# 201#
175# IO delay types: 202# IO delay types:
176# 203#
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b03d24b44bf9..919ce21ea654 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -113,10 +113,6 @@ mcore-y := arch/x86/mach-default/
113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager 113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ 114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
115 115
116# VISWS subarch support
117mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws
118mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/
119
120# generic subarchitecture 116# generic subarchitecture
121mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic 117mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
122fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ 118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 24e4d4928d65..20371d0635e4 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -116,7 +116,7 @@ ENTRY(ia32_sysenter_target)
116 pushfq 116 pushfq
117 CFI_ADJUST_CFA_OFFSET 8 117 CFI_ADJUST_CFA_OFFSET 8
118 /*CFI_REL_OFFSET rflags,0*/ 118 /*CFI_REL_OFFSET rflags,0*/
119 movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d 119 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
120 CFI_REGISTER rip,r10 120 CFI_REGISTER rip,r10
121 pushq $__USER32_CS 121 pushq $__USER32_CS
122 CFI_ADJUST_CFA_OFFSET 8 122 CFI_ADJUST_CFA_OFFSET 8
@@ -136,8 +136,9 @@ ENTRY(ia32_sysenter_target)
136 .quad 1b,ia32_badarg 136 .quad 1b,ia32_badarg
137 .previous 137 .previous
138 GET_THREAD_INFO(%r10) 138 GET_THREAD_INFO(%r10)
139 orl $TS_COMPAT,threadinfo_status(%r10) 139 orl $TS_COMPAT,TI_status(%r10)
140 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 140 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
141 TI_flags(%r10)
141 CFI_REMEMBER_STATE 142 CFI_REMEMBER_STATE
142 jnz sysenter_tracesys 143 jnz sysenter_tracesys
143sysenter_do_call: 144sysenter_do_call:
@@ -149,9 +150,9 @@ sysenter_do_call:
149 GET_THREAD_INFO(%r10) 150 GET_THREAD_INFO(%r10)
150 DISABLE_INTERRUPTS(CLBR_NONE) 151 DISABLE_INTERRUPTS(CLBR_NONE)
151 TRACE_IRQS_OFF 152 TRACE_IRQS_OFF
152 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 153 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
153 jnz int_ret_from_sys_call 154 jnz int_ret_from_sys_call
154 andl $~TS_COMPAT,threadinfo_status(%r10) 155 andl $~TS_COMPAT,TI_status(%r10)
155 /* clear IF, that popfq doesn't enable interrupts early */ 156 /* clear IF, that popfq doesn't enable interrupts early */
156 andl $~0x200,EFLAGS-R11(%rsp) 157 andl $~0x200,EFLAGS-R11(%rsp)
157 movl RIP-R11(%rsp),%edx /* User %eip */ 158 movl RIP-R11(%rsp),%edx /* User %eip */
@@ -240,8 +241,9 @@ ENTRY(ia32_cstar_target)
240 .quad 1b,ia32_badarg 241 .quad 1b,ia32_badarg
241 .previous 242 .previous
242 GET_THREAD_INFO(%r10) 243 GET_THREAD_INFO(%r10)
243 orl $TS_COMPAT,threadinfo_status(%r10) 244 orl $TS_COMPAT,TI_status(%r10)
244 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 245 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
246 TI_flags(%r10)
245 CFI_REMEMBER_STATE 247 CFI_REMEMBER_STATE
246 jnz cstar_tracesys 248 jnz cstar_tracesys
247cstar_do_call: 249cstar_do_call:
@@ -253,9 +255,9 @@ cstar_do_call:
253 GET_THREAD_INFO(%r10) 255 GET_THREAD_INFO(%r10)
254 DISABLE_INTERRUPTS(CLBR_NONE) 256 DISABLE_INTERRUPTS(CLBR_NONE)
255 TRACE_IRQS_OFF 257 TRACE_IRQS_OFF
256 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 258 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
257 jnz int_ret_from_sys_call 259 jnz int_ret_from_sys_call
258 andl $~TS_COMPAT,threadinfo_status(%r10) 260 andl $~TS_COMPAT,TI_status(%r10)
259 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 261 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
260 movl RIP-ARGOFFSET(%rsp),%ecx 262 movl RIP-ARGOFFSET(%rsp),%ecx
261 CFI_REGISTER rip,rcx 263 CFI_REGISTER rip,rcx
@@ -333,8 +335,9 @@ ENTRY(ia32_syscall)
333 this could be a problem. */ 335 this could be a problem. */
334 SAVE_ARGS 0,0,1 336 SAVE_ARGS 0,0,1
335 GET_THREAD_INFO(%r10) 337 GET_THREAD_INFO(%r10)
336 orl $TS_COMPAT,threadinfo_status(%r10) 338 orl $TS_COMPAT,TI_status(%r10)
337 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 339 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
340 TI_flags(%r10)
338 jnz ia32_tracesys 341 jnz ia32_tracesys
339ia32_do_syscall: 342ia32_do_syscall:
340 cmpl $(IA32_NR_syscalls-1),%eax 343 cmpl $(IA32_NR_syscalls-1),%eax
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 54829e2b5160..da140611bb57 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -6,6 +6,12 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
6 6
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE
10# Do not profile debug utilities
11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg
13endif
14
9# 15#
10# vsyscalls (which work on the user stack) should have 16# vsyscalls (which work on the user stack) should have
11# no stack-protector checks: 17# no stack-protector checks:
@@ -13,12 +19,13 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
13nostackp := $(call cc-option, -fno-stack-protector) 19nostackp := $(call cc-option, -fno-stack-protector)
14CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 20CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
15CFLAGS_hpet.o := $(nostackp) 21CFLAGS_hpet.o := $(nostackp)
16CFLAGS_tsc_64.o := $(nostackp) 22CFLAGS_tsc.o := $(nostackp)
17 23
18obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 24obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
19obj-y += traps_$(BITS).o irq_$(BITS).o 25obj-y += traps_$(BITS).o irq_$(BITS).o
20obj-y += time_$(BITS).o ioport.o ldt.o 26obj-y += time_$(BITS).o ioport.o ldt.o
21obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 27obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
28obj-$(CONFIG_X86_VISWS) += visws_quirks.o
22obj-$(CONFIG_X86_32) += probe_roms_32.o 29obj-$(CONFIG_X86_32) += probe_roms_32.o
23obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 30obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
24obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 31obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
@@ -26,7 +33,7 @@ obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
26obj-y += bootflag.o e820.o 33obj-y += bootflag.o e820.o
27obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 34obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
28obj-y += alternative.o i8253.o pci-nommu.o 35obj-y += alternative.o i8253.o pci-nommu.o
29obj-y += tsc_$(BITS).o io_delay.o rtc.o 36obj-y += tsc.o io_delay.o rtc.o
30 37
31obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 38obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
32obj-y += process.o 39obj-y += process.o
@@ -56,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
56obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o 63obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
57obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o 64obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
58obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 65obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
66obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
59obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 67obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
60obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 68obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
61obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 69obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 5c0107602b62..f489d7a9be92 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,6 +37,7 @@
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/io_apic.h> 38#include <asm/io_apic.h>
39#include <asm/apic.h> 39#include <asm/apic.h>
40#include <asm/genapic.h>
40#include <asm/io.h> 41#include <asm/io.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
42#include <asm/smp.h> 43#include <asm/smp.h>
@@ -83,8 +84,6 @@ int acpi_lapic;
83int acpi_ioapic; 84int acpi_ioapic;
84int acpi_strict; 85int acpi_strict;
85 86
86static int disable_irq0_through_ioapic __initdata;
87
88u8 acpi_sci_flags __initdata; 87u8 acpi_sci_flags __initdata;
89int acpi_sci_override_gsi __initdata; 88int acpi_sci_override_gsi __initdata;
90int acpi_skip_timer_override __initdata; 89int acpi_skip_timer_override __initdata;
@@ -108,21 +107,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
108 */ 107 */
109enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; 108enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
110 109
111#ifdef CONFIG_X86_64
112
113/* rely on all ACPI tables being in the direct mapping */
114char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
115{
116 if (!phys_addr || !size)
117 return NULL;
118
119 if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE)
120 return __va(phys_addr);
121
122 return NULL;
123}
124
125#else
126 110
127/* 111/*
128 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, 112 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
@@ -141,11 +125,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
141 unsigned long base, offset, mapped_size; 125 unsigned long base, offset, mapped_size;
142 int idx; 126 int idx;
143 127
144 if (phys + size < 8 * 1024 * 1024) 128 if (!phys || !size)
129 return NULL;
130
131 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
145 return __va(phys); 132 return __va(phys);
146 133
147 offset = phys & (PAGE_SIZE - 1); 134 offset = phys & (PAGE_SIZE - 1);
148 mapped_size = PAGE_SIZE - offset; 135 mapped_size = PAGE_SIZE - offset;
136 clear_fixmap(FIX_ACPI_END);
149 set_fixmap(FIX_ACPI_END, phys); 137 set_fixmap(FIX_ACPI_END, phys);
150 base = fix_to_virt(FIX_ACPI_END); 138 base = fix_to_virt(FIX_ACPI_END);
151 139
@@ -157,13 +145,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
157 if (--idx < FIX_ACPI_BEGIN) 145 if (--idx < FIX_ACPI_BEGIN)
158 return NULL; /* cannot handle this */ 146 return NULL; /* cannot handle this */
159 phys += PAGE_SIZE; 147 phys += PAGE_SIZE;
148 clear_fixmap(idx);
160 set_fixmap(idx, phys); 149 set_fixmap(idx, phys);
161 mapped_size += PAGE_SIZE; 150 mapped_size += PAGE_SIZE;
162 } 151 }
163 152
164 return ((unsigned char *)base + offset); 153 return ((unsigned char *)base + offset);
165} 154}
166#endif
167 155
168#ifdef CONFIG_PCI_MMCONFIG 156#ifdef CONFIG_PCI_MMCONFIG
169/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 157/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
@@ -992,10 +980,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
992 int pin; 980 int pin;
993 struct mp_config_intsrc mp_irq; 981 struct mp_config_intsrc mp_irq;
994 982
995 /* Skip the 8254 timer interrupt (IRQ 0) if requested. */
996 if (bus_irq == 0 && disable_irq0_through_ioapic)
997 return;
998
999 /* 983 /*
1000 * Convert 'gsi' to 'ioapic.pin'. 984 * Convert 'gsi' to 'ioapic.pin'.
1001 */ 985 */
@@ -1062,10 +1046,6 @@ void __init mp_config_acpi_legacy_irqs(void)
1062 for (i = 0; i < 16; i++) { 1046 for (i = 0; i < 16; i++) {
1063 int idx; 1047 int idx;
1064 1048
1065 /* Skip the 8254 timer interrupt (IRQ 0) if requested. */
1066 if (i == 0 && disable_irq0_through_ioapic)
1067 continue;
1068
1069 for (idx = 0; idx < mp_irq_entries; idx++) { 1049 for (idx = 0; idx < mp_irq_entries; idx++) {
1070 struct mp_config_intsrc *irq = mp_irqs + idx; 1050 struct mp_config_intsrc *irq = mp_irqs + idx;
1071 1051
@@ -1373,8 +1353,6 @@ static void __init acpi_process_madt(void)
1373 return; 1353 return;
1374} 1354}
1375 1355
1376#ifdef __i386__
1377
1378static int __init disable_acpi_irq(const struct dmi_system_id *d) 1356static int __init disable_acpi_irq(const struct dmi_system_id *d)
1379{ 1357{
1380 if (!acpi_force) { 1358 if (!acpi_force) {
@@ -1425,13 +1403,12 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
1425} 1403}
1426 1404
1427/* 1405/*
1428 * Don't register any I/O APIC entries for the 8254 timer IRQ. 1406 * Force ignoring BIOS IRQ0 pin2 override
1429 */ 1407 */
1430static int __init 1408static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1431dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d)
1432{ 1409{
1433 pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident); 1410 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
1434 disable_irq0_through_ioapic = 1; 1411 acpi_skip_timer_override = 1;
1435 return 0; 1412 return 0;
1436} 1413}
1437 1414
@@ -1609,11 +1586,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1609 * is enabled. This input is incorrectly designated the 1586 * is enabled. This input is incorrectly designated the
1610 * ISA IRQ 0 via an interrupt source override even though 1587 * ISA IRQ 0 via an interrupt source override even though
1611 * it is wired to the output of the master 8259A and INTIN0 1588 * it is wired to the output of the master 8259A and INTIN0
1612 * is not connected at all. Abandon any attempts to route 1589 * is not connected at all. Force ignoring BIOS IRQ0 pin2
1613 * IRQ 0 through the I/O APIC therefore. 1590 * override in that cases.
1614 */ 1591 */
1615 { 1592 {
1616 .callback = dmi_disable_irq0_through_ioapic, 1593 .callback = dmi_ignore_irq0_timer_override,
1617 .ident = "HP NX6125 laptop", 1594 .ident = "HP NX6125 laptop",
1618 .matches = { 1595 .matches = {
1619 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1596 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1621,7 +1598,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1621 }, 1598 },
1622 }, 1599 },
1623 { 1600 {
1624 .callback = dmi_disable_irq0_through_ioapic, 1601 .callback = dmi_ignore_irq0_timer_override,
1625 .ident = "HP NX6325 laptop", 1602 .ident = "HP NX6325 laptop",
1626 .matches = { 1603 .matches = {
1627 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1604 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1631,8 +1608,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1631 {} 1608 {}
1632}; 1609};
1633 1610
1634#endif /* __i386__ */
1635
1636/* 1611/*
1637 * acpi_boot_table_init() and acpi_boot_init() 1612 * acpi_boot_table_init() and acpi_boot_init()
1638 * called from setup_arch(), always. 1613 * called from setup_arch(), always.
@@ -1660,9 +1635,7 @@ int __init acpi_boot_table_init(void)
1660{ 1635{
1661 int error; 1636 int error;
1662 1637
1663#ifdef __i386__
1664 dmi_check_system(acpi_dmi_table); 1638 dmi_check_system(acpi_dmi_table);
1665#endif
1666 1639
1667 /* 1640 /*
1668 * If acpi_disabled, bail out 1641 * If acpi_disabled, bail out
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index de2d2e4ebad9..7c074eec39fb 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
56 if (cpu_has(c, X86_FEATURE_ACPI)) 56 if (cpu_has(c, X86_FEATURE_ACPI))
57 buf[2] |= ACPI_PDC_T_FFH; 57 buf[2] |= ACPI_PDC_T_FFH;
58 58
59 /*
60 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
61 */
62 if (!cpu_has(c, X86_FEATURE_MWAIT))
63 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
64
59 obj->type = ACPI_TYPE_BUFFER; 65 obj->type = ACPI_TYPE_BUFFER;
60 obj->buffer.length = 12; 66 obj->buffer.length = 12;
61 obj->buffer.pointer = (u8 *) buf; 67 obj->buffer.pointer = (u8 *) buf;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index e6a4b564ccaa..868de3d5c39d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -23,6 +23,15 @@ static unsigned long acpi_realmode;
23static char temp_stack[10240]; 23static char temp_stack[10240];
24#endif 24#endif
25 25
26/* XXX: this macro should move to asm-x86/segment.h and be shared with the
27 boot code... */
28#define GDT_ENTRY(flags, base, limit) \
29 (((u64)(base & 0xff000000) << 32) | \
30 ((u64)flags << 40) | \
31 ((u64)(limit & 0x00ff0000) << 32) | \
32 ((u64)(base & 0x00ffffff) << 16) | \
33 ((u64)(limit & 0x0000ffff)))
34
26/** 35/**
27 * acpi_save_state_mem - save kernel state 36 * acpi_save_state_mem - save kernel state
28 * 37 *
@@ -51,18 +60,27 @@ int acpi_save_state_mem(void)
51 header->video_mode = saved_video_mode; 60 header->video_mode = saved_video_mode;
52 61
53 header->wakeup_jmp_seg = acpi_wakeup_address >> 4; 62 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
63
64 /*
65 * Set up the wakeup GDT. We set these up as Big Real Mode,
66 * that is, with limits set to 4 GB. At least the Lenovo
67 * Thinkpad X61 is known to need this for the video BIOS
68 * initialization quirk to work; this is likely to also
69 * be the case for other laptops or integrated video devices.
70 */
71
54 /* GDT[0]: GDT self-pointer */ 72 /* GDT[0]: GDT self-pointer */
55 header->wakeup_gdt[0] = 73 header->wakeup_gdt[0] =
56 (u64)(sizeof(header->wakeup_gdt) - 1) + 74 (u64)(sizeof(header->wakeup_gdt) - 1) +
57 ((u64)(acpi_wakeup_address + 75 ((u64)(acpi_wakeup_address +
58 ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) 76 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
59 << 16); 77 << 16);
60 /* GDT[1]: real-mode-like code segment */ 78 /* GDT[1]: big real mode-like code segment */
61 header->wakeup_gdt[1] = (0x009bULL << 40) + 79 header->wakeup_gdt[1] =
62 ((u64)acpi_wakeup_address << 16) + 0xffff; 80 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
63 /* GDT[2]: real-mode-like data segment */ 81 /* GDT[2]: big real mode-like data segment */
64 header->wakeup_gdt[2] = (0x0093ULL << 40) + 82 header->wakeup_gdt[2] =
65 ((u64)acpi_wakeup_address << 16) + 0xffff; 83 GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
66 84
67#ifndef CONFIG_64BIT 85#ifndef CONFIG_64BIT
68 store_gdt((struct desc_ptr *)&header->pmode_gdt); 86 store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -140,6 +158,8 @@ static int __init acpi_sleep_setup(char *str)
140 acpi_realmode_flags |= 2; 158 acpi_realmode_flags |= 2;
141 if (strncmp(str, "s3_beep", 7) == 0) 159 if (strncmp(str, "s3_beep", 7) == 0)
142 acpi_realmode_flags |= 4; 160 acpi_realmode_flags |= 4;
161 if (strncmp(str, "old_ordering", 12) == 0)
162 acpi_old_suspend_ordering();
143 str = strchr(str, ','); 163 str = strchr(str, ',');
144 if (str != NULL) 164 if (str != NULL)
145 str += strspn(str, ", \t"); 165 str += strspn(str, ", \t");
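
The sleep.c hunk above introduces a GDT_ENTRY() packing macro and uses it to build "big real mode" wakeup descriptors (flag bytes 0x809b/0x8093, limit 0xfffff with 4 KB granularity, i.e. a 4 GB limit). The following stand-alone C sketch is not part of the patch; it mirrors that macro with a made-up wakeup address just to show how the descriptor fields are laid out:

#include <stdint.h>
#include <stdio.h>

/* same field layout as the macro added in sleep.c:
 * limit[15:0] -> bits 0-15, base[23:0] -> bits 16-39,
 * access byte + flags -> bits 40-55 (limit[19:16] lands in bits 48-51),
 * base[31:24] -> bits 56-63 */
#define GDT_ENTRY(flags, base, limit)                        \
	((((uint64_t)((base)  & 0xff000000)) << 32) |        \
	 (((uint64_t)(flags)) << 40) |                        \
	 (((uint64_t)((limit) & 0x00ff0000)) << 32) |         \
	 (((uint64_t)((base)  & 0x00ffffff)) << 16) |         \
	 ((uint64_t)((limit) & 0x0000ffff)))

int main(void)
{
	uint32_t wakeup = 0x9a000;	/* hypothetical wakeup address */

	/* 0x809b/0x8093: present code/data segment with G=1, so the
	 * 0xfffff limit means 4 GB ("big real mode") */
	printf("code: %016llx\n",
	       (unsigned long long)GDT_ENTRY(0x809b, wakeup, 0xfffff));
	printf("data: %016llx\n",
	       (unsigned long long)GDT_ENTRY(0x8093, wakeup, 0xfffff));
	return 0;
}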
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65c7857a90dd..2763cb37b553 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,6 +1,6 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/spinlock.h> 3#include <linux/mutex.h>
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/kprobes.h> 5#include <linux/kprobes.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
143#ifdef CONFIG_X86_64 143#ifdef CONFIG_X86_64
144 144
145extern char __vsyscall_0; 145extern char __vsyscall_0;
146static inline const unsigned char*const * find_nop_table(void) 146const unsigned char *const *find_nop_table(void)
147{ 147{
148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || 148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; 149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
162 { -1, NULL } 162 { -1, NULL }
163}; 163};
164 164
165static const unsigned char*const * find_nop_table(void) 165const unsigned char *const *find_nop_table(void)
166{ 166{
167 const unsigned char *const *noptable = intel_nops; 167 const unsigned char *const *noptable = intel_nops;
168 int i; 168 int i;
@@ -279,7 +279,7 @@ struct smp_alt_module {
279 struct list_head next; 279 struct list_head next;
280}; 280};
281static LIST_HEAD(smp_alt_modules); 281static LIST_HEAD(smp_alt_modules);
282static DEFINE_SPINLOCK(smp_alt); 282static DEFINE_MUTEX(smp_alt);
283static int smp_mode = 1; /* protected by smp_alt */ 283static int smp_mode = 1; /* protected by smp_alt */
284 284
285void alternatives_smp_module_add(struct module *mod, char *name, 285void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
312 __func__, smp->locks, smp->locks_end, 312 __func__, smp->locks, smp->locks_end,
313 smp->text, smp->text_end, smp->name); 313 smp->text, smp->text_end, smp->name);
314 314
315 spin_lock(&smp_alt); 315 mutex_lock(&smp_alt);
316 list_add_tail(&smp->next, &smp_alt_modules); 316 list_add_tail(&smp->next, &smp_alt_modules);
317 if (boot_cpu_has(X86_FEATURE_UP)) 317 if (boot_cpu_has(X86_FEATURE_UP))
318 alternatives_smp_unlock(smp->locks, smp->locks_end, 318 alternatives_smp_unlock(smp->locks, smp->locks_end,
319 smp->text, smp->text_end); 319 smp->text, smp->text_end);
320 spin_unlock(&smp_alt); 320 mutex_unlock(&smp_alt);
321} 321}
322 322
323void alternatives_smp_module_del(struct module *mod) 323void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
327 if (smp_alt_once || noreplace_smp) 327 if (smp_alt_once || noreplace_smp)
328 return; 328 return;
329 329
330 spin_lock(&smp_alt); 330 mutex_lock(&smp_alt);
331 list_for_each_entry(item, &smp_alt_modules, next) { 331 list_for_each_entry(item, &smp_alt_modules, next) {
332 if (mod != item->mod) 332 if (mod != item->mod)
333 continue; 333 continue;
334 list_del(&item->next); 334 list_del(&item->next);
335 spin_unlock(&smp_alt); 335 mutex_unlock(&smp_alt);
336 DPRINTK("%s: %s\n", __func__, item->name); 336 DPRINTK("%s: %s\n", __func__, item->name);
337 kfree(item); 337 kfree(item);
338 return; 338 return;
339 } 339 }
340 spin_unlock(&smp_alt); 340 mutex_unlock(&smp_alt);
341} 341}
342 342
343void alternatives_smp_switch(int smp) 343void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
359 return; 359 return;
360 BUG_ON(!smp && (num_online_cpus() > 1)); 360 BUG_ON(!smp && (num_online_cpus() > 1));
361 361
362 spin_lock(&smp_alt); 362 mutex_lock(&smp_alt);
363 363
364 /* 364 /*
365 * Avoid unnecessary switches because it forces JIT based VMs to 365 * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
383 mod->text, mod->text_end); 383 mod->text, mod->text_end);
384 } 384 }
385 smp_mode = smp; 385 smp_mode = smp;
386 spin_unlock(&smp_alt); 386 mutex_unlock(&smp_alt);
387} 387}
388 388
389#endif 389#endif
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 6dea8306d8c0..a437d027f20b 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -82,6 +82,11 @@ int pic_mode;
82/* Have we found an MP table */ 82/* Have we found an MP table */
83int smp_found_config; 83int smp_found_config;
84 84
85static struct resource lapic_resource = {
86 .name = "Local APIC",
87 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
88};
89
85static unsigned int calibration_result; 90static unsigned int calibration_result;
86 91
87static int lapic_next_event(unsigned long delta, 92static int lapic_next_event(unsigned long delta,
@@ -969,7 +974,7 @@ void __cpuinit setup_local_APIC(void)
969 * Double-check whether this APIC is really registered. 974 * Double-check whether this APIC is really registered.
970 */ 975 */
971 if (!apic_id_registered()) 976 if (!apic_id_registered())
972 BUG(); 977 WARN_ON_ONCE(1);
973 978
974 /* 979 /*
975 * Intel recommends to set DFR, LDR and TPR before enabling 980 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -1335,6 +1340,10 @@ void __init smp_intr_init(void)
1335 1340
1336 /* IPI for generic function call */ 1341 /* IPI for generic function call */
1337 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 1342 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1343
1344 /* IPI for single call function */
1345 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1346 call_function_single_interrupt);
1338} 1347}
1339#endif 1348#endif
1340 1349
@@ -1720,3 +1729,21 @@ static int __init apic_set_verbosity(char *str)
1720} 1729}
1721__setup("apic=", apic_set_verbosity); 1730__setup("apic=", apic_set_verbosity);
1722 1731
1732static int __init lapic_insert_resource(void)
1733{
1734 if (!apic_phys)
1735 return -1;
1736
1737 /* Put local APIC into the resource map. */
1738 lapic_resource.start = apic_phys;
1739 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
1740 insert_resource(&iomem_resource, &lapic_resource);
1741
1742 return 0;
1743}
1744
1745/*
1746 * need call insert after e820_reserve_resources()
1747 * that is using request_resource
1748 */
1749late_initcall(lapic_insert_resource);
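
The apic_32.c hunk above adds a late_initcall, lapic_insert_resource(), that registers the local APIC MMIO page in the iomem resource map. A quick way to see the result from userspace (not part of the patch) is to scan /proc/iomem for the new "Local APIC" entry; the addresses are only visible to root:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/iomem", "r");

	if (!f) {
		perror("/proc/iomem");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* e.g. "fee00000-fee00fff : Local APIC" once registered */
		if (strstr(line, "Local APIC"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}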
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 00e6d1370954..bf9b441331e9 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,6 +204,7 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
207#include <linux/types.h> 208#include <linux/types.h>
208#include <linux/stddef.h> 209#include <linux/stddef.h>
209#include <linux/timer.h> 210#include <linux/timer.h>
@@ -1212,9 +1213,9 @@ static int suspend(int vetoable)
1212 if (err != APM_SUCCESS) 1213 if (err != APM_SUCCESS)
1213 apm_error("suspend", err); 1214 apm_error("suspend", err);
1214 err = (err == APM_SUCCESS) ? 0 : -EIO; 1215 err = (err == APM_SUCCESS) ? 0 : -EIO;
1215 device_power_up(); 1216 device_power_up(PMSG_RESUME);
1216 local_irq_enable(); 1217 local_irq_enable();
1217 device_resume(); 1218 device_resume(PMSG_RESUME);
1218 queue_event(APM_NORMAL_RESUME, NULL); 1219 queue_event(APM_NORMAL_RESUME, NULL);
1219 spin_lock(&user_list_lock); 1220 spin_lock(&user_list_lock);
1220 for (as = user_list; as != NULL; as = as->next) { 1221 for (as = user_list; as != NULL; as = as->next) {
@@ -1239,7 +1240,7 @@ static void standby(void)
1239 apm_error("standby", err); 1240 apm_error("standby", err);
1240 1241
1241 local_irq_disable(); 1242 local_irq_disable();
1242 device_power_up(); 1243 device_power_up(PMSG_RESUME);
1243 local_irq_enable(); 1244 local_irq_enable();
1244} 1245}
1245 1246
@@ -1325,7 +1326,7 @@ static void check_events(void)
1325 ignore_bounce = 1; 1326 ignore_bounce = 1;
1326 if ((event != APM_NORMAL_RESUME) 1327 if ((event != APM_NORMAL_RESUME)
1327 || (ignore_normal_resume == 0)) { 1328 || (ignore_normal_resume == 0)) {
1328 device_resume(); 1329 device_resume(PMSG_RESUME);
1329 queue_event(event, NULL); 1330 queue_event(event, NULL);
1330 } 1331 }
1331 ignore_normal_resume = 0; 1332 ignore_normal_resume = 0;
@@ -1549,10 +1550,12 @@ static int do_open(struct inode *inode, struct file *filp)
1549{ 1550{
1550 struct apm_user *as; 1551 struct apm_user *as;
1551 1552
1553 lock_kernel();
1552 as = kmalloc(sizeof(*as), GFP_KERNEL); 1554 as = kmalloc(sizeof(*as), GFP_KERNEL);
1553 if (as == NULL) { 1555 if (as == NULL) {
1554 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1556 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1555 sizeof(*as)); 1557 sizeof(*as));
1558 unlock_kernel();
1556 return -ENOMEM; 1559 return -ENOMEM;
1557 } 1560 }
1558 as->magic = APM_BIOS_MAGIC; 1561 as->magic = APM_BIOS_MAGIC;
@@ -1574,6 +1577,7 @@ static int do_open(struct inode *inode, struct file *filp)
1574 user_list = as; 1577 user_list = as;
1575 spin_unlock(&user_list_lock); 1578 spin_unlock(&user_list_lock);
1576 filp->private_data = as; 1579 filp->private_data = as;
1580 unlock_kernel();
1577 return 0; 1581 return 0;
1578} 1582}
1579 1583
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 3295e7c08fe7..bacf5deeec2d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -34,7 +34,7 @@ int main(void)
34 ENTRY(pid); 34 ENTRY(pid);
35 BLANK(); 35 BLANK();
36#undef ENTRY 36#undef ENTRY
37#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) 37#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
38 ENTRY(flags); 38 ENTRY(flags);
39 ENTRY(addr_limit); 39 ENTRY(addr_limit);
40 ENTRY(preempt_count); 40 ENTRY(preempt_count);
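
The asm-offsets hunk above only renames the generated constants from threadinfo_* to TI_* (matching the ia32entry.S changes earlier in this diff); the underlying technique is to derive assembler-visible offsets from the C structure layout with offsetof(). A minimal stand-alone sketch of that idea, using a cut-down stand-in for struct thread_info rather than the kernel's real definition:

#include <stddef.h>
#include <stdio.h>

/* simplified stand-in for the kernel's struct thread_info */
struct thread_info {
	unsigned long flags;
	unsigned long status;
	int preempt_count;
};

/* the kernel turns these into assembler constants at build time;
 * here we just print the generated names and offsets */
#define ENTRY(field) \
	printf("#define TI_%-14s %zu\n", #field, \
	       offsetof(struct thread_info, field))

int main(void)
{
	ENTRY(flags);
	ENTRY(status);
	ENTRY(preempt_count);
	return 0;
}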
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 958526d6a74a..7c36fb8a28d4 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -199,10 +199,15 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
199 * Don't do it for gbpages because there seems very little 199 * Don't do it for gbpages because there seems very little
200 * benefit in doing so. 200 * benefit in doing so.
201 */ 201 */
202 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && 202 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
203 (tseg >> PMD_SHIFT) < 203 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
204 (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) 204 if ((tseg>>PMD_SHIFT) <
205 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
206 ((tseg>>PMD_SHIFT) <
207 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
208 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
205 set_memory_4k((unsigned long)__va(tseg), 1); 209 set_memory_4k((unsigned long)__va(tseg), 1);
210 }
206 } 211 }
207} 212}
208 213
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 13526fd5cce1..1d181c40e2e1 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -10,20 +10,12 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{ 10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf) 11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
13} 15}
14 16
15static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
16{ 18{
17 /* Cache sizes */
18 unsigned n;
19
20 n = c->extended_cpuid_level;
21 if (n >= 0x80000008) {
22 unsigned eax = cpuid_eax(0x80000008);
23 c->x86_virt_bits = (eax >> 8) & 0xff;
24 c->x86_phys_bits = eax & 0xff;
25 }
26
27 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 19 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
28 c->x86_cache_alignment = c->x86_clflush_size * 2; 20 c->x86_cache_alignment = c->x86_clflush_size * 2;
29 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 751850235291..7b8cc72feb40 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -98,7 +98,7 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
98 98
99void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 99void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
100{ 100{
101 unsigned int n, dummy, eax, ebx, ecx, edx; 101 unsigned int n, dummy, ebx, ecx, edx;
102 102
103 n = c->extended_cpuid_level; 103 n = c->extended_cpuid_level;
104 104
@@ -121,11 +121,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
121 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 121 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
122 c->x86_cache_size, ecx & 0xFF); 122 c->x86_cache_size, ecx & 0xFF);
123 } 123 }
124 if (n >= 0x80000008) {
125 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
126 c->x86_virt_bits = (eax >> 8) & 0xff;
127 c->x86_phys_bits = eax & 0xff;
128 }
129} 124}
130 125
131void __cpuinit detect_ht(struct cpuinfo_x86 *c) 126void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -314,6 +309,16 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
314 if (c->extended_cpuid_level >= 0x80000007) 309 if (c->extended_cpuid_level >= 0x80000007)
315 c->x86_power = cpuid_edx(0x80000007); 310 c->x86_power = cpuid_edx(0x80000007);
316 311
312 if (c->extended_cpuid_level >= 0x80000008) {
313 u32 eax = cpuid_eax(0x80000008);
314
315 c->x86_virt_bits = (eax >> 8) & 0xff;
316 c->x86_phys_bits = eax & 0xff;
317 }
318
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
317 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 322 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
318 cpu_devs[c->x86_vendor]->c_early_init) 323 cpu_devs[c->x86_vendor]->c_early_init)
319 cpu_devs[c->x86_vendor]->c_early_init(c); 324 cpu_devs[c->x86_vendor]->c_early_init(c);
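
The common_64.c hunk above moves the CPUID leaf 0x80000008 query (physical/virtual address widths) into early_identify_cpu(), so the per-vendor files no longer duplicate it (see the centaur_64.c and intel_64.c hunks). The same leaf can be read from userspace; this stand-alone sketch, not part of the patch, uses GCC's <cpuid.h> helper and prints the values the kernel stores in x86_phys_bits and x86_virt_bits:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* leaf 0x80000008: EAX[7:0] = physical bits, EAX[15:8] = virtual bits */
	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 0x80000008 not available\n");
		return 1;
	}
	printf("physical address bits: %u\n", eax & 0xff);
	printf("virtual address bits:  %u\n", (eax >> 8) & 0xff);
	return 0;
}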
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fe9224c51d37..70609efdf1da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -226,6 +226,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
226 226
227 if (cpu_has_bts) 227 if (cpu_has_bts)
228 ds_init_intel(c); 228 ds_init_intel(c);
229
230#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable();
232#endif
229} 233}
230 234
231static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 235static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
index fcb1cc9d75ca..1019c58d39f0 100644
--- a/arch/x86/kernel/cpu/intel_64.c
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -12,6 +12,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15} 17}
16 18
17/* 19/*
@@ -52,9 +54,6 @@ static void __cpuinit srat_detect_node(void)
52 54
53static void __cpuinit init_intel(struct cpuinfo_x86 *c) 55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
54{ 56{
55 /* Cache sizes */
56 unsigned n;
57
58 init_intel_cacheinfo(c); 57 init_intel_cacheinfo(c);
59 if (c->cpuid_level > 9) { 58 if (c->cpuid_level > 9) {
60 unsigned eax = cpuid_eax(10); 59 unsigned eax = cpuid_eax(10);
@@ -76,13 +75,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
76 if (cpu_has_bts) 75 if (cpu_has_bts)
77 ds_init_intel(c); 76 ds_init_intel(c);
78 77
79 n = c->extended_cpuid_level;
80 if (n >= 0x80000008) {
81 unsigned eax = cpuid_eax(0x80000008);
82 c->x86_virt_bits = (eax >> 8) & 0xff;
83 c->x86_phys_bits = eax & 0xff;
84 }
85
86 if (c->x86 == 15) 78 if (c->x86 == 15)
87 c->x86_cache_alignment = c->x86_clflush_size * 2; 79 c->x86_cache_alignment = c->x86_clflush_size * 2;
88 if (c->x86 == 6) 80 if (c->x86 == 6)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 501ca1cea27d..c4a7ec31394c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -9,6 +9,7 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
12#include <linux/string.h> 13#include <linux/string.h>
13#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
14#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
@@ -363,7 +364,7 @@ static void mcheck_check_cpu(void *info)
363 364
364static void mcheck_timer(struct work_struct *work) 365static void mcheck_timer(struct work_struct *work)
365{ 366{
366 on_each_cpu(mcheck_check_cpu, NULL, 1, 1); 367 on_each_cpu(mcheck_check_cpu, NULL, 1);
367 368
368 /* 369 /*
369 * Alert userspace if needed. If we logged an MCE, reduce the 370 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -532,10 +533,12 @@ static int open_exclu; /* already open exclusive? */
532 533
533static int mce_open(struct inode *inode, struct file *file) 534static int mce_open(struct inode *inode, struct file *file)
534{ 535{
536 lock_kernel();
535 spin_lock(&mce_state_lock); 537 spin_lock(&mce_state_lock);
536 538
537 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 539 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
538 spin_unlock(&mce_state_lock); 540 spin_unlock(&mce_state_lock);
541 unlock_kernel();
539 return -EBUSY; 542 return -EBUSY;
540 } 543 }
541 544
@@ -544,6 +547,7 @@ static int mce_open(struct inode *inode, struct file *file)
544 open_count++; 547 open_count++;
545 548
546 spin_unlock(&mce_state_lock); 549 spin_unlock(&mce_state_lock);
550 unlock_kernel();
547 551
548 return nonseekable_open(inode, file); 552 return nonseekable_open(inode, file);
549} 553}
@@ -617,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
617 * Collect entries that were still getting written before the 621 * Collect entries that were still getting written before the
618 * synchronize. 622 * synchronize.
619 */ 623 */
620 on_each_cpu(collect_tscs, cpu_tsc, 1, 1); 624 on_each_cpu(collect_tscs, cpu_tsc, 1);
621 for (i = next; i < MCE_LOG_LEN; i++) { 625 for (i = next; i < MCE_LOG_LEN; i++) {
622 if (mcelog.entry[i].finished && 626 if (mcelog.entry[i].finished &&
623 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 627 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
@@ -742,7 +746,7 @@ static void mce_restart(void)
742 if (next_interval) 746 if (next_interval)
743 cancel_delayed_work(&mcheck_work); 747 cancel_delayed_work(&mcheck_work);
744 /* Timer race is harmless here */ 748 /* Timer race is harmless here */
745 on_each_cpu(mce_init, NULL, 1, 1); 749 on_each_cpu(mce_init, NULL, 1);
746 next_interval = check_interval * HZ; 750 next_interval = check_interval * HZ;
747 if (next_interval) 751 if (next_interval)
748 schedule_delayed_work(&mcheck_work, 752 schedule_delayed_work(&mcheck_work,
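
For illustration only (not part of this merge): the mce_64.c hunks above, like many hunks below, reflect the smp_call_function API cleanup in which the extra retry/nonatomic argument was dropped; callers of on_each_cpu(), smp_call_function() and smp_call_function_single() now pass only the wait flag. A minimal call-site sketch, assuming the <linux/smp.h> of that era and a hypothetical callback name:

    #include <linux/smp.h>

    /* Hypothetical per-CPU callback, used only for this sketch. */
    static void poke_cpu(void *info)
    {
            /* per-CPU work would go here */
    }

    static void run_on_all_cpus_sketch(void)
    {
            /* old: on_each_cpu(poke_cpu, NULL, 1, 1); */
            on_each_cpu(poke_cpu, NULL, 1);                 /* wait for completion   */

            /* old: smp_call_function_single(0, poke_cpu, NULL, 1, 1); */
            smp_call_function_single(0, poke_cpu, NULL, 1); /* run on CPU 0 and wait */
    }
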
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index 00ccb6c14ec2..cc1fccdd31e0 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
59 59
60static void mce_work_fn(struct work_struct *work) 60static void mce_work_fn(struct work_struct *work)
61{ 61{
62 on_each_cpu(mce_checkregs, NULL, 1, 1); 62 on_each_cpu(mce_checkregs, NULL, 1);
63 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 63 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
64} 64}
65 65
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 105afe12beb0..6f23969c8faf 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -223,7 +223,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
223 atomic_set(&data.gate,0); 223 atomic_set(&data.gate,0);
224 224
225 /* Start the ball rolling on other CPUs */ 225 /* Start the ball rolling on other CPUs */
226 if (smp_call_function(ipi_handler, &data, 1, 0) != 0) 226 if (smp_call_function(ipi_handler, &data, 0) != 0)
227 panic("mtrr: timed out waiting for other CPUs\n"); 227 panic("mtrr: timed out waiting for other CPUs\n");
228 228
229 local_irq_save(flags); 229 local_irq_save(flags);
@@ -1682,7 +1682,7 @@ void mtrr_ap_init(void)
1682 */ 1682 */
1683void mtrr_save_state(void) 1683void mtrr_save_state(void)
1684{ 1684{
1685 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); 1685 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
1686} 1686}
1687 1687
1688static int __init mtrr_init_finialize(void) 1688static int __init mtrr_init_finialize(void)
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 2e9bef6e3aa3..6d4bdc02388a 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -189,7 +189,7 @@ void disable_lapic_nmi_watchdog(void)
189 if (atomic_read(&nmi_active) <= 0) 189 if (atomic_read(&nmi_active) <= 0)
190 return; 190 return;
191 191
192 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); 192 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
193 193
194 if (wd_ops) 194 if (wd_ops)
195 wd_ops->unreserve(); 195 wd_ops->unreserve();
@@ -213,7 +213,7 @@ void enable_lapic_nmi_watchdog(void)
213 return; 213 return;
214 } 214 }
215 215
216 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); 216 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
217 touch_nmi_watchdog(); 217 touch_nmi_watchdog();
218} 218}
219 219
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index daff52a62248..2de5fa2bbf77 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,6 +33,7 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
36#include <linux/major.h> 37#include <linux/major.h>
37#include <linux/fs.h> 38#include <linux/fs.h>
38#include <linux/smp_lock.h> 39#include <linux/smp_lock.h>
@@ -95,7 +96,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
95 for (; count; count -= 16) { 96 for (; count; count -= 16) {
96 cmd.eax = pos; 97 cmd.eax = pos;
97 cmd.ecx = pos >> 32; 98 cmd.ecx = pos >> 32;
98 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); 99 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
99 if (copy_to_user(tmp, &cmd, 16)) 100 if (copy_to_user(tmp, &cmd, 16))
100 return -EFAULT; 101 return -EFAULT;
101 tmp += 16; 102 tmp += 16;
@@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
107 108
108static int cpuid_open(struct inode *inode, struct file *file) 109static int cpuid_open(struct inode *inode, struct file *file)
109{ 110{
110 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 111 unsigned int cpu;
111 struct cpuinfo_x86 *c = &cpu_data(cpu); 112 struct cpuinfo_x86 *c;
112 113 int ret = 0;
113 if (cpu >= NR_CPUS || !cpu_online(cpu)) 114
114 return -ENXIO; /* No such CPU */ 115 lock_kernel();
116
117 cpu = iminor(file->f_path.dentry->d_inode);
118 if (cpu >= NR_CPUS || !cpu_online(cpu)) {
119 ret = -ENXIO; /* No such CPU */
120 goto out;
121 }
122 c = &cpu_data(cpu);
115 if (c->cpuid_level < 0) 123 if (c->cpuid_level < 0)
116 return -EIO; /* CPUID not supported */ 124 ret = -EIO; /* CPUID not supported */
117 125out:
118 return 0; 126 unlock_kernel();
127 return ret;
119} 128}
120 129
121/* 130/*
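
For illustration only (not part of this merge): cpuid_open() above (and mce_open() earlier) now wrap their bodies in lock_kernel()/unlock_kernel(), apparently because the implicit BKL around character-device open() was being pushed down into the drivers during this cycle. A minimal sketch of the resulting open() pattern, with a hypothetical device check standing in for the real one:

    #include <linux/fs.h>
    #include <linux/smp_lock.h>
    #include <linux/types.h>

    static bool example_device_usable(void)         /* hypothetical check */
    {
            return true;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
            int ret = 0;

            lock_kernel();
            if (!example_device_usable())
                    ret = -ENXIO;
            /* every exit path drops the BKL again */
            unlock_kernel();
            return ret;
    }
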
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d0335853ff52..28c29180b380 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/pfn.h> 20#include <linux/pfn.h>
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/firmware-map.h>
22 23
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/page.h> 25#include <asm/page.h>
@@ -27,7 +28,22 @@
27#include <asm/setup.h> 28#include <asm/setup.h>
28#include <asm/trampoline.h> 29#include <asm/trampoline.h>
29 30
31/*
32 * The e820 map is the map that gets modified e.g. with command line parameters
33 * and that is also registered with modifications in the kernel resource tree
34 * with the iomem_resource as parent.
35 *
36 * The e820_saved is directly saved after the BIOS-provided memory map is
37 * copied. It doesn't get modified afterwards. It's registered for the
38 * /sys/firmware/memmap interface.
39 *
40 * That memory map is not modified and is used as base for kexec. The kexec'd
41 * kernel should get the same memory map as the firmware provides. Then the
42 * user can e.g. boot the original kernel with mem=1G while still booting the
43 * next kernel with full memory.
44 */
30struct e820map e820; 45struct e820map e820;
46struct e820map e820_saved;
31 47
32/* For PCI or other memory-mapped resources */ 48/* For PCI or other memory-mapped resources */
33unsigned long pci_mem_start = 0xaeedbabe; 49unsigned long pci_mem_start = 0xaeedbabe;
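
For illustration only (not part of this merge): the comment above introduces the split between the working e820 map and the untouched e820_saved snapshot. One consequence, using the e820_remove_range() call that parse_memopt() gains later in this diff: booting with mem=512M trims only the working copy, so /sys/firmware/memmap and a later kexec still see the full BIOS-provided map. A kernel-context sketch:

    /* Kernel-context sketch: trim the working map the way "mem=512M" would,
     * while leaving the saved firmware view alone on purpose. */
    static void __init mem_limit_sketch(void)
    {
            u64 limit = 512ULL << 20;

            /* working copy: everything above the limit stops being E820_RAM */
            e820_remove_range(limit, ULLONG_MAX - limit, E820_RAM, 1);

            /* e820_saved is deliberately not touched here */
    }
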
@@ -398,8 +414,9 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
398 return __append_e820_map(biosmap, nr_map); 414 return __append_e820_map(biosmap, nr_map);
399} 415}
400 416
401u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, 417static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
402 unsigned new_type) 418 u64 size, unsigned old_type,
419 unsigned new_type)
403{ 420{
404 int i; 421 int i;
405 u64 real_updated_size = 0; 422 u64 real_updated_size = 0;
@@ -410,7 +427,7 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
410 size = ULLONG_MAX - start; 427 size = ULLONG_MAX - start;
411 428
412 for (i = 0; i < e820.nr_map; i++) { 429 for (i = 0; i < e820.nr_map; i++) {
413 struct e820entry *ei = &e820.map[i]; 430 struct e820entry *ei = &e820x->map[i];
414 u64 final_start, final_end; 431 u64 final_start, final_end;
415 if (ei->type != old_type) 432 if (ei->type != old_type)
416 continue; 433 continue;
@@ -438,6 +455,19 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
438 return real_updated_size; 455 return real_updated_size;
439} 456}
440 457
458u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
459 unsigned new_type)
460{
461 return e820_update_range_map(&e820, start, size, old_type, new_type);
462}
463
464static u64 __init e820_update_range_saved(u64 start, u64 size,
465 unsigned old_type, unsigned new_type)
466{
467 return e820_update_range_map(&e820_saved, start, size, old_type,
468 new_type);
469}
470
441/* make e820 not cover the range */ 471/* make e820 not cover the range */
442u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, 472u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
443 int checktype) 473 int checktype)
@@ -487,6 +517,15 @@ void __init update_e820(void)
487 printk(KERN_INFO "modified physical RAM map:\n"); 517 printk(KERN_INFO "modified physical RAM map:\n");
488 e820_print_map("modified"); 518 e820_print_map("modified");
489} 519}
520static void __init update_e820_saved(void)
521{
522 int nr_map;
523
524 nr_map = e820_saved.nr_map;
525 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
526 return;
527 e820_saved.nr_map = nr_map;
528}
490#define MAX_GAP_END 0x100000000ull 529#define MAX_GAP_END 0x100000000ull
491/* 530/*
492 * Search for a gap in the e820 memory space from start_addr to end_addr. 531 * Search for a gap in the e820 memory space from start_addr to end_addr.
@@ -991,8 +1030,10 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
991 1030
992 addr = round_down(start + size - sizet, align); 1031 addr = round_down(start + size - sizet, align);
993 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 1032 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
1033 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
994 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 1034 printk(KERN_INFO "update e820 for early_reserve_e820\n");
995 update_e820(); 1035 update_e820();
1036 update_e820_saved();
996 1037
997 return addr; 1038 return addr;
998} 1039}
@@ -1008,30 +1049,51 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
1008#endif 1049#endif
1009 1050
1010/* 1051/*
1011 * Last pfn which the user wants to use.
1012 */
1013unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
1014
1015/*
1016 * Find the highest page frame number we have available 1052 * Find the highest page frame number we have available
1017 */ 1053 */
1018unsigned long __init e820_end_of_ram(void) 1054static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
1019{ 1055{
1020 unsigned long last_pfn; 1056 int i;
1057 unsigned long last_pfn = 0;
1021 unsigned long max_arch_pfn = MAX_ARCH_PFN; 1058 unsigned long max_arch_pfn = MAX_ARCH_PFN;
1022 1059
1023 last_pfn = find_max_pfn_with_active_regions(); 1060 for (i = 0; i < e820.nr_map; i++) {
1061 struct e820entry *ei = &e820.map[i];
1062 unsigned long start_pfn;
1063 unsigned long end_pfn;
1064
1065 if (ei->type != type)
1066 continue;
1067
1068 start_pfn = ei->addr >> PAGE_SHIFT;
1069 end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
1070
1071 if (start_pfn >= limit_pfn)
1072 continue;
1073 if (end_pfn > limit_pfn) {
1074 last_pfn = limit_pfn;
1075 break;
1076 }
1077 if (end_pfn > last_pfn)
1078 last_pfn = end_pfn;
1079 }
1024 1080
1025 if (last_pfn > max_arch_pfn) 1081 if (last_pfn > max_arch_pfn)
1026 last_pfn = max_arch_pfn; 1082 last_pfn = max_arch_pfn;
1027 if (last_pfn > end_user_pfn)
1028 last_pfn = end_user_pfn;
1029 1083
1030 printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", 1084 printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
1031 last_pfn, max_arch_pfn); 1085 last_pfn, max_arch_pfn);
1032 return last_pfn; 1086 return last_pfn;
1033} 1087}
1088unsigned long __init e820_end_of_ram_pfn(void)
1089{
1090 return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
1091}
1034 1092
1093unsigned long __init e820_end_of_low_ram_pfn(void)
1094{
1095 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
1096}
1035/* 1097/*
1036 * Finds an active region in the address range from start_pfn to last_pfn and 1098 * Finds an active region in the address range from start_pfn to last_pfn and
1037 * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 1099 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
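
For illustration only (not part of this merge): the new e820_end_pfn() above walks the map, considers only entries of the requested type, and clamps the result at the caller's limit, which is all that distinguishes e820_end_of_ram_pfn() from e820_end_of_low_ram_pfn(). A stand-alone user-space model of that clamping logic (simplified types, made-up map):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* same value as on x86 */

    struct entry {
            unsigned long long addr;
            unsigned long long size;
            int type;               /* 1 stands in for E820_RAM here */
    };

    static unsigned long end_pfn_of_type(const struct entry *map, int n,
                                         unsigned long limit_pfn, int type)
    {
            unsigned long last_pfn = 0;
            int i;

            for (i = 0; i < n; i++) {
                    unsigned long start = map[i].addr >> PAGE_SHIFT;
                    unsigned long end = (map[i].addr + map[i].size) >> PAGE_SHIFT;

                    if (map[i].type != type || start >= limit_pfn)
                            continue;
                    if (end > limit_pfn)
                            return limit_pfn;       /* clamp at the caller's limit */
                    if (end > last_pfn)
                            last_pfn = end;
            }
            return last_pfn;
    }

    int main(void)
    {
            struct entry map[] = {
                    { 0x00000000ULL,  0x0009f000ULL, 1 },   /* low RAM          */
                    { 0x00100000ULL,  0xbff00000ULL, 1 },   /* RAM up to ~3 GB  */
                    { 0x100000000ULL, 0x40000000ULL, 1 },   /* RAM above 4 GB   */
            };

            printf("end of RAM pfn:     %#lx\n",
                   end_pfn_of_type(map, 3, ~0UL, 1));
            printf("end of low RAM pfn: %#lx\n",
                   end_pfn_of_type(map, 3, 1UL << (32 - PAGE_SHIFT), 1));
            return 0;
    }
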
@@ -1062,12 +1124,6 @@ int __init e820_find_active_region(const struct e820entry *ei,
1062 if (*ei_endpfn > last_pfn) 1124 if (*ei_endpfn > last_pfn)
1063 *ei_endpfn = last_pfn; 1125 *ei_endpfn = last_pfn;
1064 1126
1065 /* Obey end_user_pfn to save on memmap */
1066 if (*ei_startpfn >= end_user_pfn)
1067 return 0;
1068 if (*ei_endpfn > end_user_pfn)
1069 *ei_endpfn = end_user_pfn;
1070
1071 return 1; 1127 return 1;
1072} 1128}
1073 1129
@@ -1113,6 +1169,8 @@ static void early_panic(char *msg)
1113 panic(msg); 1169 panic(msg);
1114} 1170}
1115 1171
1172static int userdef __initdata;
1173
1116/* "mem=nopentium" disables the 4MB page tables. */ 1174/* "mem=nopentium" disables the 4MB page tables. */
1117static int __init parse_memopt(char *p) 1175static int __init parse_memopt(char *p)
1118{ 1176{
@@ -1128,22 +1186,22 @@ static int __init parse_memopt(char *p)
1128 } 1186 }
1129#endif 1187#endif
1130 1188
1189 userdef = 1;
1131 mem_size = memparse(p, &p); 1190 mem_size = memparse(p, &p);
1132 end_user_pfn = mem_size>>PAGE_SHIFT; 1191 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
1133 e820_update_range(mem_size, ULLONG_MAX - mem_size,
1134 E820_RAM, E820_RESERVED);
1135 1192
1136 return 0; 1193 return 0;
1137} 1194}
1138early_param("mem", parse_memopt); 1195early_param("mem", parse_memopt);
1139 1196
1140static int userdef __initdata;
1141
1142static int __init parse_memmap_opt(char *p) 1197static int __init parse_memmap_opt(char *p)
1143{ 1198{
1144 char *oldp; 1199 char *oldp;
1145 u64 start_at, mem_size; 1200 u64 start_at, mem_size;
1146 1201
1202 if (!p)
1203 return -EINVAL;
1204
1147 if (!strcmp(p, "exactmap")) { 1205 if (!strcmp(p, "exactmap")) {
1148#ifdef CONFIG_CRASH_DUMP 1206#ifdef CONFIG_CRASH_DUMP
1149 /* 1207 /*
@@ -1151,9 +1209,7 @@ static int __init parse_memmap_opt(char *p)
1151 * the real mem size before original memory map is 1209 * the real mem size before original memory map is
1152 * reset. 1210 * reset.
1153 */ 1211 */
1154 e820_register_active_regions(0, 0, -1UL); 1212 saved_max_pfn = e820_end_of_ram_pfn();
1155 saved_max_pfn = e820_end_of_ram();
1156 remove_all_active_ranges();
1157#endif 1213#endif
1158 e820.nr_map = 0; 1214 e820.nr_map = 0;
1159 userdef = 1; 1215 userdef = 1;
@@ -1175,11 +1231,9 @@ static int __init parse_memmap_opt(char *p)
1175 } else if (*p == '$') { 1231 } else if (*p == '$') {
1176 start_at = memparse(p+1, &p); 1232 start_at = memparse(p+1, &p);
1177 e820_add_region(start_at, mem_size, E820_RESERVED); 1233 e820_add_region(start_at, mem_size, E820_RESERVED);
1178 } else { 1234 } else
1179 end_user_pfn = (mem_size >> PAGE_SHIFT); 1235 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
1180 e820_update_range(mem_size, ULLONG_MAX - mem_size, 1236
1181 E820_RAM, E820_RESERVED);
1182 }
1183 return *p == '\0' ? 0 : -EINVAL; 1237 return *p == '\0' ? 0 : -EINVAL;
1184} 1238}
1185early_param("memmap", parse_memmap_opt); 1239early_param("memmap", parse_memmap_opt);
@@ -1198,6 +1252,17 @@ void __init finish_e820_parsing(void)
1198 } 1252 }
1199} 1253}
1200 1254
1255static inline const char *e820_type_to_string(int e820_type)
1256{
1257 switch (e820_type) {
1258 case E820_RESERVED_KERN:
1259 case E820_RAM: return "System RAM";
1260 case E820_ACPI: return "ACPI Tables";
1261 case E820_NVS: return "ACPI Non-volatile Storage";
1262 default: return "reserved";
1263 }
1264}
1265
1201/* 1266/*
1202 * Mark e820 reserved areas as busy for the resource manager. 1267 * Mark e820 reserved areas as busy for the resource manager.
1203 */ 1268 */
@@ -1209,13 +1274,6 @@ void __init e820_reserve_resources(void)
1209 1274
1210 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1275 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1211 for (i = 0; i < e820.nr_map; i++) { 1276 for (i = 0; i < e820.nr_map; i++) {
1212 switch (e820.map[i].type) {
1213 case E820_RESERVED_KERN:
1214 case E820_RAM: res->name = "System RAM"; break;
1215 case E820_ACPI: res->name = "ACPI Tables"; break;
1216 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
1217 default: res->name = "reserved";
1218 }
1219 end = e820.map[i].addr + e820.map[i].size - 1; 1277 end = e820.map[i].addr + e820.map[i].size - 1;
1220#ifndef CONFIG_RESOURCES_64BIT 1278#ifndef CONFIG_RESOURCES_64BIT
1221 if (end > 0x100000000ULL) { 1279 if (end > 0x100000000ULL) {
@@ -1223,6 +1281,7 @@ void __init e820_reserve_resources(void)
1223 continue; 1281 continue;
1224 } 1282 }
1225#endif 1283#endif
1284 res->name = e820_type_to_string(e820.map[i].type);
1226 res->start = e820.map[i].addr; 1285 res->start = e820.map[i].addr;
1227 res->end = end; 1286 res->end = end;
1228 1287
@@ -1230,8 +1289,20 @@ void __init e820_reserve_resources(void)
1230 insert_resource(&iomem_resource, res); 1289 insert_resource(&iomem_resource, res);
1231 res++; 1290 res++;
1232 } 1291 }
1292
1293 for (i = 0; i < e820_saved.nr_map; i++) {
1294 struct e820entry *entry = &e820_saved.map[i];
1295 firmware_map_add_early(entry->addr,
1296 entry->addr + entry->size - 1,
1297 e820_type_to_string(entry->type));
1298 }
1233} 1299}
1234 1300
1301/*
1302 * Non-standard memory setup can be specified via this quirk:
1303 */
1304char * (*arch_memory_setup_quirk)(void);
1305
1235char *__init default_machine_specific_memory_setup(void) 1306char *__init default_machine_specific_memory_setup(void)
1236{ 1307{
1237 char *who = "BIOS-e820"; 1308 char *who = "BIOS-e820";
@@ -1272,6 +1343,12 @@ char *__init default_machine_specific_memory_setup(void)
1272 1343
1273char *__init __attribute__((weak)) machine_specific_memory_setup(void) 1344char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1274{ 1345{
1346 if (arch_memory_setup_quirk) {
1347 char *who = arch_memory_setup_quirk();
1348
1349 if (who)
1350 return who;
1351 }
1275 return default_machine_specific_memory_setup(); 1352 return default_machine_specific_memory_setup();
1276} 1353}
1277 1354
@@ -1283,8 +1360,12 @@ char * __init __attribute__((weak)) memory_setup(void)
1283 1360
1284void __init setup_memory_map(void) 1361void __init setup_memory_map(void)
1285{ 1362{
1363 char *who;
1364
1365 who = memory_setup();
1366 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1286 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1367 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1287 e820_print_map(memory_setup()); 1368 e820_print_map(who);
1288} 1369}
1289 1370
1290#ifdef CONFIG_X86_64 1371#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a4665f37cfc5..a0e11c0cc872 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -120,7 +120,18 @@ static struct chipset early_qrk[] __initdata = {
120 {} 120 {}
121}; 121};
122 122
123static void __init check_dev_quirk(int num, int slot, int func) 123/**
124 * check_dev_quirk - apply early quirks to a given PCI device
125 * @num: bus number
126 * @slot: slot number
127 * @func: PCI function
128 *
129 * Check the vendor & device ID against the early quirks table.
130 *
131 * If the device is single function, let early_quirks() know so we don't
132 * poke at this device again.
133 */
134static int __init check_dev_quirk(int num, int slot, int func)
124{ 135{
125 u16 class; 136 u16 class;
126 u16 vendor; 137 u16 vendor;
@@ -131,7 +142,7 @@ static void __init check_dev_quirk(int num, int slot, int func)
131 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); 142 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
132 143
133 if (class == 0xffff) 144 if (class == 0xffff)
134 return; 145 return -1; /* no class, treat as single function */
135 146
136 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); 147 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
137 148
@@ -154,7 +165,9 @@ static void __init check_dev_quirk(int num, int slot, int func)
154 type = read_pci_config_byte(num, slot, func, 165 type = read_pci_config_byte(num, slot, func,
155 PCI_HEADER_TYPE); 166 PCI_HEADER_TYPE);
156 if (!(type & 0x80)) 167 if (!(type & 0x80))
157 return; 168 return -1;
169
170 return 0;
158} 171}
159 172
160void __init early_quirks(void) 173void __init early_quirks(void)
@@ -167,6 +180,9 @@ void __init early_quirks(void)
167 /* Poor man's PCI discovery */ 180 /* Poor man's PCI discovery */
168 for (num = 0; num < 32; num++) 181 for (num = 0; num < 32; num++)
169 for (slot = 0; slot < 32; slot++) 182 for (slot = 0; slot < 32; slot++)
170 for (func = 0; func < 8; func++) 183 for (func = 0; func < 8; func++) {
171 check_dev_quirk(num, slot, func); 184 /* Only probe function 0 on single fn devices */
185 if (check_dev_quirk(num, slot, func))
186 break;
187 }
172} 188}
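
For illustration only (not part of this merge): the kernel-doc above documents the new return value, and the early_quirks() loop now stops probing functions 1-7 once function 0 reports a single-function device. The test itself is bit 7 of the PCI header-type register; a hypothetical helper in the same spirit, kernel context assumed, using the read_pci_config_byte() accessor the file already relies on:

    /* Hypothetical helper: nonzero when the device at bus/slot advertises
     * itself as multi-function (header-type register, bit 7 set). */
    static int __init pci_slot_is_multi_function(int num, int slot)
    {
            u8 type = read_pci_config_byte(num, slot, 0, PCI_HEADER_TYPE);

            return type & 0x80;
    }
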
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 643fd861b724..ff9e7350da54 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -196,7 +196,7 @@ static struct console simnow_console = {
196static struct console *early_console = &early_vga_console; 196static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 197static int early_console_initialized;
198 198
199void early_printk(const char *fmt, ...) 199asmlinkage void early_printk(const char *fmt, ...)
200{ 200{
201 char buf[512]; 201 char buf[512];
202 int n; 202 int n;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 94382faeadb6..06cc8d4254b1 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -473,7 +473,7 @@ void __init efi_enter_virtual_mode(void)
473 size = md->num_pages << EFI_PAGE_SHIFT; 473 size = md->num_pages << EFI_PAGE_SHIFT;
474 end = md->phys_addr + size; 474 end = md->phys_addr + size;
475 475
476 if (PFN_UP(end) <= max_pfn_mapped) 476 if (PFN_UP(end) <= max_low_pfn_mapped)
477 va = __va(md->phys_addr); 477 va = __va(md->phys_addr);
478 else 478 else
479 va = efi_ioremap(md->phys_addr, size); 479 va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 53393c306e11..6bc07f0f1202 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,6 +51,7 @@
51#include <asm/percpu.h> 51#include <asm/percpu.h>
52#include <asm/dwarf2.h> 52#include <asm/dwarf2.h>
53#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
54#include <asm/ftrace.h>
54#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
55 56
56/* 57/*
@@ -1024,6 +1025,7 @@ ENTRY(xen_sysenter_target)
1024 RING0_INT_FRAME 1025 RING0_INT_FRAME
1025 addl $5*4, %esp /* remove xen-provided frame */ 1026 addl $5*4, %esp /* remove xen-provided frame */
1026 jmp sysenter_past_esp 1027 jmp sysenter_past_esp
1028 CFI_ENDPROC
1027 1029
1028ENTRY(xen_hypervisor_callback) 1030ENTRY(xen_hypervisor_callback)
1029 CFI_STARTPROC 1031 CFI_STARTPROC
@@ -1110,6 +1112,77 @@ ENDPROC(xen_failsafe_callback)
1110 1112
1111#endif /* CONFIG_XEN */ 1113#endif /* CONFIG_XEN */
1112 1114
1115#ifdef CONFIG_FTRACE
1116#ifdef CONFIG_DYNAMIC_FTRACE
1117
1118ENTRY(mcount)
1119 pushl %eax
1120 pushl %ecx
1121 pushl %edx
1122 movl 0xc(%esp), %eax
1123 subl $MCOUNT_INSN_SIZE, %eax
1124
1125.globl mcount_call
1126mcount_call:
1127 call ftrace_stub
1128
1129 popl %edx
1130 popl %ecx
1131 popl %eax
1132
1133 ret
1134END(mcount)
1135
1136ENTRY(ftrace_caller)
1137 pushl %eax
1138 pushl %ecx
1139 pushl %edx
1140 movl 0xc(%esp), %eax
1141 movl 0x4(%ebp), %edx
1142 subl $MCOUNT_INSN_SIZE, %eax
1143
1144.globl ftrace_call
1145ftrace_call:
1146 call ftrace_stub
1147
1148 popl %edx
1149 popl %ecx
1150 popl %eax
1151
1152.globl ftrace_stub
1153ftrace_stub:
1154 ret
1155END(ftrace_caller)
1156
1157#else /* ! CONFIG_DYNAMIC_FTRACE */
1158
1159ENTRY(mcount)
1160 cmpl $ftrace_stub, ftrace_trace_function
1161 jnz trace
1162.globl ftrace_stub
1163ftrace_stub:
1164 ret
1165
1166 /* taken from glibc */
1167trace:
1168 pushl %eax
1169 pushl %ecx
1170 pushl %edx
1171 movl 0xc(%esp), %eax
1172 movl 0x4(%ebp), %edx
1173 subl $MCOUNT_INSN_SIZE, %eax
1174
1175 call *ftrace_trace_function
1176
1177 popl %edx
1178 popl %ecx
1179 popl %eax
1180
1181 jmp ftrace_stub
1182END(mcount)
1183#endif /* CONFIG_DYNAMIC_FTRACE */
1184#endif /* CONFIG_FTRACE */
1185
1113.section .rodata,"a" 1186.section .rodata,"a"
1114#include "syscall_table_32.S" 1187#include "syscall_table_32.S"
1115 1188
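
For illustration only (not part of this merge): the mcount/ftrace_caller stubs added above (and their 64-bit twins in entry_64.S just below) save only the call-clobbered registers, derive the instrumented call site from mcount's own return address, fetch the caller's return address from the saved frame pointer, and hand both to the active tracer. Roughly, in C, assuming the ftrace_func_t hook of that era:

    /* What the non-dynamic mcount stub amounts to, expressed as C.  ip is
     * mcount's return address; parent_ip is the instrumented function's own
     * return address (read via the frame pointer in the real stub). */
    static void mcount_sketch(unsigned long ip, unsigned long parent_ip)
    {
            if (ftrace_trace_function == ftrace_stub)
                    return;                         /* tracing disabled: fast path */

            ftrace_trace_function(ip - MCOUNT_INSN_SIZE, parent_ip);
    }
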
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 07d69f262337..ae63e584c340 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -51,9 +51,115 @@
51#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h>
54 55
55 .code64 56 .code64
56 57
58#ifdef CONFIG_FTRACE
59#ifdef CONFIG_DYNAMIC_FTRACE
60ENTRY(mcount)
61
62 subq $0x38, %rsp
63 movq %rax, (%rsp)
64 movq %rcx, 8(%rsp)
65 movq %rdx, 16(%rsp)
66 movq %rsi, 24(%rsp)
67 movq %rdi, 32(%rsp)
68 movq %r8, 40(%rsp)
69 movq %r9, 48(%rsp)
70
71 movq 0x38(%rsp), %rdi
72 subq $MCOUNT_INSN_SIZE, %rdi
73
74.globl mcount_call
75mcount_call:
76 call ftrace_stub
77
78 movq 48(%rsp), %r9
79 movq 40(%rsp), %r8
80 movq 32(%rsp), %rdi
81 movq 24(%rsp), %rsi
82 movq 16(%rsp), %rdx
83 movq 8(%rsp), %rcx
84 movq (%rsp), %rax
85 addq $0x38, %rsp
86
87 retq
88END(mcount)
89
90ENTRY(ftrace_caller)
91
92 /* taken from glibc */
93 subq $0x38, %rsp
94 movq %rax, (%rsp)
95 movq %rcx, 8(%rsp)
96 movq %rdx, 16(%rsp)
97 movq %rsi, 24(%rsp)
98 movq %rdi, 32(%rsp)
99 movq %r8, 40(%rsp)
100 movq %r9, 48(%rsp)
101
102 movq 0x38(%rsp), %rdi
103 movq 8(%rbp), %rsi
104 subq $MCOUNT_INSN_SIZE, %rdi
105
106.globl ftrace_call
107ftrace_call:
108 call ftrace_stub
109
110 movq 48(%rsp), %r9
111 movq 40(%rsp), %r8
112 movq 32(%rsp), %rdi
113 movq 24(%rsp), %rsi
114 movq 16(%rsp), %rdx
115 movq 8(%rsp), %rcx
116 movq (%rsp), %rax
117 addq $0x38, %rsp
118
119.globl ftrace_stub
120ftrace_stub:
121 retq
122END(ftrace_caller)
123
124#else /* ! CONFIG_DYNAMIC_FTRACE */
125ENTRY(mcount)
126 cmpq $ftrace_stub, ftrace_trace_function
127 jnz trace
128.globl ftrace_stub
129ftrace_stub:
130 retq
131
132trace:
133 /* taken from glibc */
134 subq $0x38, %rsp
135 movq %rax, (%rsp)
136 movq %rcx, 8(%rsp)
137 movq %rdx, 16(%rsp)
138 movq %rsi, 24(%rsp)
139 movq %rdi, 32(%rsp)
140 movq %r8, 40(%rsp)
141 movq %r9, 48(%rsp)
142
143 movq 0x38(%rsp), %rdi
144 movq 8(%rbp), %rsi
145 subq $MCOUNT_INSN_SIZE, %rdi
146
147 call *ftrace_trace_function
148
149 movq 48(%rsp), %r9
150 movq 40(%rsp), %r8
151 movq 32(%rsp), %rdi
152 movq 24(%rsp), %rsi
153 movq 16(%rsp), %rdx
154 movq 8(%rsp), %rcx
155 movq (%rsp), %rax
156 addq $0x38, %rsp
157
158 jmp ftrace_stub
159END(mcount)
160#endif /* CONFIG_DYNAMIC_FTRACE */
161#endif /* CONFIG_FTRACE */
162
57#ifndef CONFIG_PREEMPT 163#ifndef CONFIG_PREEMPT
58#define retint_kernel retint_restore_args 164#define retint_kernel retint_restore_args
59#endif 165#endif
@@ -168,13 +274,13 @@ ENTRY(ret_from_fork)
168 CFI_ADJUST_CFA_OFFSET -4 274 CFI_ADJUST_CFA_OFFSET -4
169 call schedule_tail 275 call schedule_tail
170 GET_THREAD_INFO(%rcx) 276 GET_THREAD_INFO(%rcx)
171 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) 277 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
172 jnz rff_trace 278 jnz rff_trace
173rff_action: 279rff_action:
174 RESTORE_REST 280 RESTORE_REST
175 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 281 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
176 je int_ret_from_sys_call 282 je int_ret_from_sys_call
177 testl $_TIF_IA32,threadinfo_flags(%rcx) 283 testl $_TIF_IA32,TI_flags(%rcx)
178 jnz int_ret_from_sys_call 284 jnz int_ret_from_sys_call
179 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET 285 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
180 jmp ret_from_sys_call 286 jmp ret_from_sys_call
@@ -243,7 +349,8 @@ ENTRY(system_call_after_swapgs)
243 movq %rcx,RIP-ARGOFFSET(%rsp) 349 movq %rcx,RIP-ARGOFFSET(%rsp)
244 CFI_REL_OFFSET rip,RIP-ARGOFFSET 350 CFI_REL_OFFSET rip,RIP-ARGOFFSET
245 GET_THREAD_INFO(%rcx) 351 GET_THREAD_INFO(%rcx)
246 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) 352 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
353 TI_flags(%rcx)
247 jnz tracesys 354 jnz tracesys
248 cmpq $__NR_syscall_max,%rax 355 cmpq $__NR_syscall_max,%rax
249 ja badsys 356 ja badsys
@@ -262,7 +369,7 @@ sysret_check:
262 GET_THREAD_INFO(%rcx) 369 GET_THREAD_INFO(%rcx)
263 DISABLE_INTERRUPTS(CLBR_NONE) 370 DISABLE_INTERRUPTS(CLBR_NONE)
264 TRACE_IRQS_OFF 371 TRACE_IRQS_OFF
265 movl threadinfo_flags(%rcx),%edx 372 movl TI_flags(%rcx),%edx
266 andl %edi,%edx 373 andl %edi,%edx
267 jnz sysret_careful 374 jnz sysret_careful
268 CFI_REMEMBER_STATE 375 CFI_REMEMBER_STATE
@@ -305,7 +412,7 @@ sysret_signal:
305 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 412 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
306 xorl %esi,%esi # oldset -> arg2 413 xorl %esi,%esi # oldset -> arg2
307 call ptregscall_common 414 call ptregscall_common
3081: movl $_TIF_NEED_RESCHED,%edi 4151: movl $_TIF_WORK_MASK,%edi
309 /* Use IRET because user could have changed frame. This 416 /* Use IRET because user could have changed frame. This
310 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 417 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
311 DISABLE_INTERRUPTS(CLBR_NONE) 418 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -347,10 +454,10 @@ int_ret_from_sys_call:
347int_with_check: 454int_with_check:
348 LOCKDEP_SYS_EXIT_IRQ 455 LOCKDEP_SYS_EXIT_IRQ
349 GET_THREAD_INFO(%rcx) 456 GET_THREAD_INFO(%rcx)
350 movl threadinfo_flags(%rcx),%edx 457 movl TI_flags(%rcx),%edx
351 andl %edi,%edx 458 andl %edi,%edx
352 jnz int_careful 459 jnz int_careful
353 andl $~TS_COMPAT,threadinfo_status(%rcx) 460 andl $~TS_COMPAT,TI_status(%rcx)
354 jmp retint_swapgs 461 jmp retint_swapgs
355 462
356 /* Either reschedule or signal or syscall exit tracking needed. */ 463 /* Either reschedule or signal or syscall exit tracking needed. */
@@ -393,7 +500,7 @@ int_signal:
393 movq %rsp,%rdi # &ptregs -> arg1 500 movq %rsp,%rdi # &ptregs -> arg1
394 xorl %esi,%esi # oldset -> arg2 501 xorl %esi,%esi # oldset -> arg2
395 call do_notify_resume 502 call do_notify_resume
3961: movl $_TIF_NEED_RESCHED,%edi 5031: movl $_TIF_WORK_MASK,%edi
397int_restore_rest: 504int_restore_rest:
398 RESTORE_REST 505 RESTORE_REST
399 DISABLE_INTERRUPTS(CLBR_NONE) 506 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -558,7 +665,7 @@ retint_with_reschedule:
558 movl $_TIF_WORK_MASK,%edi 665 movl $_TIF_WORK_MASK,%edi
559retint_check: 666retint_check:
560 LOCKDEP_SYS_EXIT_IRQ 667 LOCKDEP_SYS_EXIT_IRQ
561 movl threadinfo_flags(%rcx),%edx 668 movl TI_flags(%rcx),%edx
562 andl %edi,%edx 669 andl %edi,%edx
563 CFI_REMEMBER_STATE 670 CFI_REMEMBER_STATE
564 jnz retint_careful 671 jnz retint_careful
@@ -646,17 +753,16 @@ retint_signal:
646 RESTORE_REST 753 RESTORE_REST
647 DISABLE_INTERRUPTS(CLBR_NONE) 754 DISABLE_INTERRUPTS(CLBR_NONE)
648 TRACE_IRQS_OFF 755 TRACE_IRQS_OFF
649 movl $_TIF_NEED_RESCHED,%edi
650 GET_THREAD_INFO(%rcx) 756 GET_THREAD_INFO(%rcx)
651 jmp retint_check 757 jmp retint_with_reschedule
652 758
653#ifdef CONFIG_PREEMPT 759#ifdef CONFIG_PREEMPT
654 /* Returning to kernel space. Check if we need preemption */ 760 /* Returning to kernel space. Check if we need preemption */
655 /* rcx: threadinfo. interrupts off. */ 761 /* rcx: threadinfo. interrupts off. */
656ENTRY(retint_kernel) 762ENTRY(retint_kernel)
657 cmpl $0,threadinfo_preempt_count(%rcx) 763 cmpl $0,TI_preempt_count(%rcx)
658 jnz retint_restore_args 764 jnz retint_restore_args
659 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) 765 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
660 jnc retint_restore_args 766 jnc retint_restore_args
661 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ 767 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
662 jnc retint_restore_args 768 jnc retint_restore_args
@@ -710,6 +816,9 @@ END(invalidate_interrupt\num)
710ENTRY(call_function_interrupt) 816ENTRY(call_function_interrupt)
711 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt 817 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
712END(call_function_interrupt) 818END(call_function_interrupt)
819ENTRY(call_function_single_interrupt)
820 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
821END(call_function_single_interrupt)
713ENTRY(irq_move_cleanup_interrupt) 822ENTRY(irq_move_cleanup_interrupt)
714 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt 823 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
715END(irq_move_cleanup_interrupt) 824END(irq_move_cleanup_interrupt)
@@ -819,7 +928,7 @@ paranoid_restore\trace:
819 jmp irq_return 928 jmp irq_return
820paranoid_userspace\trace: 929paranoid_userspace\trace:
821 GET_THREAD_INFO(%rcx) 930 GET_THREAD_INFO(%rcx)
822 movl threadinfo_flags(%rcx),%ebx 931 movl TI_flags(%rcx),%ebx
823 andl $_TIF_WORK_MASK,%ebx 932 andl $_TIF_WORK_MASK,%ebx
824 jz paranoid_swapgs\trace 933 jz paranoid_swapgs\trace
825 movq %rsp,%rdi /* &pt_regs */ 934 movq %rsp,%rdi /* &pt_regs */
@@ -917,7 +1026,7 @@ error_exit:
917 testl %eax,%eax 1026 testl %eax,%eax
918 jne retint_kernel 1027 jne retint_kernel
919 LOCKDEP_SYS_EXIT_IRQ 1028 LOCKDEP_SYS_EXIT_IRQ
920 movl threadinfo_flags(%rcx),%edx 1029 movl TI_flags(%rcx),%edx
921 movl $_TIF_WORK_MASK,%edi 1030 movl $_TIF_WORK_MASK,%edi
922 andl %edi,%edx 1031 andl %edi,%edx
923 jnz retint_careful 1032 jnz retint_careful
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 000000000000..ab115cd15fdf
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,141 @@
1/*
2 * Code for replacing ftrace calls with jumps.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 *
6 * Thanks goes to Ingo Molnar, for suggesting the idea.
7 * Mathieu Desnoyers, for suggesting postponing the modifications.
8 * Arjan van de Ven, for keeping me straight, and explaining to me
9 * the dangers of modifying code on the run.
10 */
11
12#include <linux/spinlock.h>
13#include <linux/hardirq.h>
14#include <linux/ftrace.h>
15#include <linux/percpu.h>
16#include <linux/init.h>
17#include <linux/list.h>
18
19#include <asm/alternative.h>
20#include <asm/ftrace.h>
21
22
23/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop;
25
26union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE];
28 struct {
29 char e8;
30 int offset;
31 } __attribute__((packed));
32};
33
34
35static int notrace ftrace_calc_offset(long ip, long addr)
36{
37 return (int)(addr - ip);
38}
39
40notrace unsigned char *ftrace_nop_replace(void)
41{
42 return (char *)ftrace_nop;
43}
44
45notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{
47 static union ftrace_code_union calc;
48
49 calc.e8 = 0xe8;
50 calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
51
52 /*
53 * No locking needed, this must be called via kstop_machine
54 * which in essence is like running on a uniprocessor machine.
55 */
56 return calc.code;
57}
58
59notrace int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code)
62{
63 unsigned replaced;
64 unsigned old = *(unsigned *)old_code; /* 4 bytes */
65 unsigned new = *(unsigned *)new_code; /* 4 bytes */
66 unsigned char newch = new_code[4];
67 int faulted = 0;
68
69 /*
70 * Note: Due to modules and __init, code can
71 * disappear and change, we need to protect against faulting
72 * as well as code changing.
73 *
74 * No real locking needed, this code is run through
75 * kstop_machine.
76 */
77 asm volatile (
78 "1: lock\n"
79 " cmpxchg %3, (%2)\n"
80 " jnz 2f\n"
81 " movb %b4, 4(%2)\n"
82 "2:\n"
83 ".section .fixup, \"ax\"\n"
84 "3: movl $1, %0\n"
85 " jmp 2b\n"
86 ".previous\n"
87 _ASM_EXTABLE(1b, 3b)
88 : "=r"(faulted), "=a"(replaced)
89 : "r"(ip), "r"(new), "c"(newch),
90 "0"(faulted), "a"(old)
91 : "memory");
92 sync_core();
93
94 if (replaced != old && replaced != new)
95 faulted = 2;
96
97 return faulted;
98}
99
100notrace int ftrace_update_ftrace_func(ftrace_func_t func)
101{
102 unsigned long ip = (unsigned long)(&ftrace_call);
103 unsigned char old[MCOUNT_INSN_SIZE], *new;
104 int ret;
105
106 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
107 new = ftrace_call_replace(ip, (unsigned long)func);
108 ret = ftrace_modify_code(ip, old, new);
109
110 return ret;
111}
112
113notrace int ftrace_mcount_set(unsigned long *data)
114{
115 unsigned long ip = (long)(&mcount_call);
116 unsigned long *addr = data;
117 unsigned char old[MCOUNT_INSN_SIZE], *new;
118
119 /*
120 * Replace the mcount stub with a pointer to the
121 * ip recorder function.
122 */
123 memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
124 new = ftrace_call_replace(ip, *addr);
125 *addr = ftrace_modify_code(ip, old, new);
126
127 return 0;
128}
129
130int __init ftrace_dyn_arch_init(void *data)
131{
132 const unsigned char *const *noptable = find_nop_table();
133
134 /* This is running in kstop_machine */
135
136 ftrace_mcount_set(data);
137
138 ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
139
140 return 0;
141}
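
For illustration only (not part of this merge): ftrace_call_replace() above builds the 5-byte near call that dynamic ftrace patches over each mcount site: the 0xe8 opcode followed by a 32-bit displacement measured from the end of the instruction. A stand-alone user-space demo of that encoding (little-endian/x86 assumed, addresses made up):

    #include <stdio.h>

    #define MCOUNT_INSN_SIZE 5      /* size of the call instruction, as on x86 */

    union code {
            unsigned char bytes[MCOUNT_INSN_SIZE];
            struct {
                    unsigned char e8;       /* near-call opcode                  */
                    int offset;             /* rel32, relative to the next insn  */
            } __attribute__((packed));
    };

    int main(void)
    {
            unsigned long ip = 0xc0123450UL;        /* made-up call site            */
            unsigned long addr = 0xc0100000UL;      /* made-up target, e.g. tracer  */
            union code insn;
            int i;

            insn.e8 = 0xe8;
            insn.offset = (int)(addr - (ip + MCOUNT_INSN_SIZE));

            printf("call %#lx -> %#lx encodes as:", ip, addr);
            for (i = 0; i < MCOUNT_INSN_SIZE; i++)
                    printf(" %02x", insn.bytes[i]);
            printf("\n");
            return 0;
    }
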
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 45e84acca8a9..711f11c30b06 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -8,6 +8,7 @@
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#include <linux/kernel.h>
11#include <linux/threads.h> 12#include <linux/threads.h>
12#include <linux/cpumask.h> 13#include <linux/cpumask.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -20,6 +21,7 @@
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/pgtable.h>
23#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
25 27
@@ -208,14 +210,79 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
208 BUG(); 210 BUG();
209} 211}
210 212
213static __init void map_low_mmrs(void)
214{
215 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
216 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
217}
218
219enum map_type {map_wb, map_uc};
220
221static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
222{
223 unsigned long bytes, paddr;
224
225 paddr = base << shift;
226 bytes = (1UL << shift);
227 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
228 paddr + bytes);
229 if (map_type == map_uc)
230 init_extra_mapping_uc(paddr, bytes);
231 else
232 init_extra_mapping_wb(paddr, bytes);
233
234}
235static __init void map_gru_high(int max_pnode)
236{
237 union uvh_rh_gam_gru_overlay_config_mmr_u gru;
238 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
239
240 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
241 if (gru.s.enable)
242 map_high("GRU", gru.s.base, shift, map_wb);
243}
244
245static __init void map_config_high(int max_pnode)
246{
247 union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
248 int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
249
250 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
251 if (cfg.s.enable)
252 map_high("CONFIG", cfg.s.base, shift, map_uc);
253}
254
255static __init void map_mmr_high(int max_pnode)
256{
257 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
258 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
259
260 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
261 if (mmr.s.enable)
262 map_high("MMR", mmr.s.base, shift, map_uc);
263}
264
265static __init void map_mmioh_high(int max_pnode)
266{
267 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
268 int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
269
270 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
271 if (mmioh.s.enable)
272 map_high("MMIOH", mmioh.s.base, shift, map_uc);
273}
274
211static __init void uv_system_init(void) 275static __init void uv_system_init(void)
212{ 276{
213 union uvh_si_addr_map_config_u m_n_config; 277 union uvh_si_addr_map_config_u m_n_config;
214 union uvh_node_id_u node_id; 278 union uvh_node_id_u node_id;
215 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 279 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
216 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 280 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
281 int max_pnode = 0;
217 unsigned long mmr_base, present; 282 unsigned long mmr_base, present;
218 283
284 map_low_mmrs();
285
219 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 286 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
220 m_val = m_n_config.s.m_skt; 287 m_val = m_n_config.s.m_skt;
221 n_val = m_n_config.s.n_skt; 288 n_val = m_n_config.s.n_skt;
@@ -281,12 +348,18 @@ static __init void uv_system_init(void)
281 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ 348 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
282 uv_node_to_blade[nid] = blade; 349 uv_node_to_blade[nid] = blade;
283 uv_cpu_to_blade[cpu] = blade; 350 uv_cpu_to_blade[cpu] = blade;
351 max_pnode = max(pnode, max_pnode);
284 352
285 printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, " 353 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
286 "lcpu %d, blade %d\n", 354 "lcpu %d, blade %d\n",
287 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, 355 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
288 lcpu, blade); 356 lcpu, blade);
289 } 357 }
358
359 map_gru_high(max_pnode);
360 map_mmr_high(max_pnode);
361 map_config_high(max_pnode);
362 map_mmioh_high(max_pnode);
290} 363}
291 364
292/* 365/*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ea230ec69057..0ea6a19bfdfe 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a)
36} 36}
37 37
38#ifdef CONFIG_X86_64 38#ifdef CONFIG_X86_64
39
40#include <asm/pgtable.h> 39#include <asm/pgtable.h>
41 40#endif
42static inline void hpet_set_mapping(void)
43{
44 set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
45 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
46 hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
47}
48
49static inline void hpet_clear_mapping(void)
50{
51 hpet_virt_address = NULL;
52}
53
54#else
55 41
56static inline void hpet_set_mapping(void) 42static inline void hpet_set_mapping(void)
57{ 43{
58 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); 44 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
45#ifdef CONFIG_X86_64
46 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
47#endif
59} 48}
60 49
61static inline void hpet_clear_mapping(void) 50static inline void hpet_clear_mapping(void)
@@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void)
63 iounmap(hpet_virt_address); 52 iounmap(hpet_virt_address);
64 hpet_virt_address = NULL; 53 hpet_virt_address = NULL;
65} 54}
66#endif
67 55
68/* 56/*
69 * HPET command line enable / disable 57 * HPET command line enable / disable
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index deb43785e923..dd7ebee446af 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,7 +1,14 @@
1#include <linux/module.h> 1#include <linux/module.h>
2
2#include <asm/checksum.h> 3#include <asm/checksum.h>
3#include <asm/desc.h>
4#include <asm/pgtable.h> 4#include <asm/pgtable.h>
5#include <asm/desc.h>
6#include <asm/ftrace.h>
7
8#ifdef CONFIG_FTRACE
9/* mcount is defined in assembly */
10EXPORT_SYMBOL(mcount);
11#endif
5 12
6/* Networking helper routines. */ 13/* Networking helper routines. */
7EXPORT_SYMBOL(csum_partial_copy_generic); 14EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 337ec3438a8f..558abf4c796a 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1569,7 +1569,7 @@ void /*__init*/ print_local_APIC(void *dummy)
1569 1569
1570void print_all_local_APICs(void) 1570void print_all_local_APICs(void)
1571{ 1571{
1572 on_each_cpu(print_local_APIC, NULL, 1, 1); 1572 on_each_cpu(print_local_APIC, NULL, 1);
1573} 1573}
1574 1574
1575void /*__init*/ print_PIC(void) 1575void /*__init*/ print_PIC(void)
@@ -2020,7 +2020,7 @@ static inline void init_IO_APIC_traps(void)
2020 * The local APIC irq-chip implementation: 2020 * The local APIC irq-chip implementation:
2021 */ 2021 */
2022 2022
2023static void ack_apic(unsigned int irq) 2023static void ack_lapic_irq(unsigned int irq)
2024{ 2024{
2025 ack_APIC_irq(); 2025 ack_APIC_irq();
2026} 2026}
@@ -2045,9 +2045,17 @@ static struct irq_chip lapic_chip __read_mostly = {
2045 .name = "local-APIC", 2045 .name = "local-APIC",
2046 .mask = mask_lapic_irq, 2046 .mask = mask_lapic_irq,
2047 .unmask = unmask_lapic_irq, 2047 .unmask = unmask_lapic_irq,
2048 .eoi = ack_apic, 2048 .ack = ack_lapic_irq,
2049}; 2049};
2050 2050
2051static void lapic_register_intr(int irq, int vector)
2052{
2053 irq_desc[irq].status &= ~IRQ_LEVEL;
2054 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2055 "edge");
2056 set_intr_gate(vector, interrupt[irq]);
2057}
2058
2051static void __init setup_nmi(void) 2059static void __init setup_nmi(void)
2052{ 2060{
2053 /* 2061 /*
@@ -2247,8 +2255,7 @@ static inline void __init check_timer(void)
2247 2255
2248 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2256 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2249 2257
2250 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, 2258 lapic_register_intr(0, vector);
2251 "fasteoi");
2252 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2259 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2253 enable_8259A_irq(0); 2260 enable_8259A_irq(0);
2254 2261
@@ -2280,11 +2287,21 @@ out:
2280} 2287}
2281 2288
2282/* 2289/*
2283 * 2290 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2284 * IRQ's that are handled by the PIC in the MPS IOAPIC case. 2291 * to devices. However there may be an I/O APIC pin available for
2285 * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. 2292 * this interrupt regardless. The pin may be left unconnected, but
2286 * Linux doesn't really care, as it's not actually used 2293 * typically it will be reused as an ExtINT cascade interrupt for
2287 * for any interrupt handling anyway. 2294 * the master 8259A. In the MPS case such a pin will normally be
2295 * reported as an ExtINT interrupt in the MP table. With ACPI
2296 * there is no provision for ExtINT interrupts, and in the absence
2297 * of an override it would be treated as an ordinary ISA I/O APIC
2298 * interrupt, that is edge-triggered and unmasked by default. We
2299 * used to do this, but it caused problems on some systems because
2300 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2301 * the same ExtINT cascade interrupt to drive the local APIC of the
2302 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2303 * the I/O APIC in all cases now. No actual device should request
2304 * it anyway. --macro
2288 */ 2305 */
2289#define PIC_IRQS (1 << PIC_CASCADE_IR) 2306#define PIC_IRQS (1 << PIC_CASCADE_IR)
2290 2307
@@ -2298,10 +2315,7 @@ void __init setup_IO_APIC(void)
2298 2315
2299 enable_IO_APIC(); 2316 enable_IO_APIC();
2300 2317
2301 if (acpi_ioapic) 2318 io_apic_irqs = ~PIC_IRQS;
2302 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
2303 else
2304 io_apic_irqs = ~PIC_IRQS;
2305 2319
2306 printk("ENABLING IO-APIC IRQs\n"); 2320 printk("ENABLING IO-APIC IRQs\n");
2307 2321
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 2b4c40bc12c9..6510cde36b35 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1160,7 +1160,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1160 1160
1161void print_all_local_APICs (void) 1161void print_all_local_APICs (void)
1162{ 1162{
1163 on_each_cpu(print_local_APIC, NULL, 1, 1); 1163 on_each_cpu(print_local_APIC, NULL, 1);
1164} 1164}
1165 1165
1166void __apicdebuginit print_PIC(void) 1166void __apicdebuginit print_PIC(void)
@@ -1554,7 +1554,7 @@ static inline void init_IO_APIC_traps(void)
1554 } 1554 }
1555} 1555}
1556 1556
1557static void enable_lapic_irq (unsigned int irq) 1557static void unmask_lapic_irq(unsigned int irq)
1558{ 1558{
1559 unsigned long v; 1559 unsigned long v;
1560 1560
@@ -1562,7 +1562,7 @@ static void enable_lapic_irq (unsigned int irq)
1562 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 1562 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1563} 1563}
1564 1564
1565static void disable_lapic_irq (unsigned int irq) 1565static void mask_lapic_irq(unsigned int irq)
1566{ 1566{
1567 unsigned long v; 1567 unsigned long v;
1568 1568
@@ -1575,19 +1575,20 @@ static void ack_lapic_irq (unsigned int irq)
1575 ack_APIC_irq(); 1575 ack_APIC_irq();
1576} 1576}
1577 1577
1578static void end_lapic_irq (unsigned int i) { /* nothing */ } 1578static struct irq_chip lapic_chip __read_mostly = {
1579 1579 .name = "local-APIC",
1580static struct hw_interrupt_type lapic_irq_type __read_mostly = { 1580 .mask = mask_lapic_irq,
1581 .name = "local-APIC", 1581 .unmask = unmask_lapic_irq,
1582 .typename = "local-APIC-edge", 1582 .ack = ack_lapic_irq,
1583 .startup = NULL, /* startup_irq() not used for IRQ0 */
1584 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1585 .enable = enable_lapic_irq,
1586 .disable = disable_lapic_irq,
1587 .ack = ack_lapic_irq,
1588 .end = end_lapic_irq,
1589}; 1583};
1590 1584
1585static void lapic_register_intr(int irq)
1586{
1587 irq_desc[irq].status &= ~IRQ_LEVEL;
1588 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
1589 "edge");
1590}
1591
1591static void __init setup_nmi(void) 1592static void __init setup_nmi(void)
1592{ 1593{
1593 /* 1594 /*
@@ -1714,11 +1715,6 @@ static inline void __init check_timer(void)
1714 apic2 = apic1; 1715 apic2 = apic1;
1715 } 1716 }
1716 1717
1717 replace_pin_at_irq(0, 0, 0, apic1, pin1);
1718 apic1 = 0;
1719 pin1 = 0;
1720 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
1721
1722 if (pin1 != -1) { 1718 if (pin1 != -1) {
1723 /* 1719 /*
1724 * Ok, does IRQ0 through the IOAPIC work? 1720 * Ok, does IRQ0 through the IOAPIC work?
@@ -1779,7 +1775,7 @@ static inline void __init check_timer(void)
1779 1775
1780 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 1776 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
1781 1777
1782 irq_desc[0].chip = &lapic_irq_type; 1778 lapic_register_intr(0);
1783 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 1779 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1784 enable_8259A_irq(0); 1780 enable_8259A_irq(0);
1785 1781
@@ -1817,11 +1813,21 @@ static int __init notimercheck(char *s)
1817__setup("no_timer_check", notimercheck); 1813__setup("no_timer_check", notimercheck);
1818 1814
1819/* 1815/*
1820 * 1816 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
1821 * IRQs that are handled by the PIC in the MPS IOAPIC case. 1817 * to devices. However there may be an I/O APIC pin available for
1822 * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. 1818 * this interrupt regardless. The pin may be left unconnected, but
1823 * Linux doesn't really care, as it's not actually used 1819 * typically it will be reused as an ExtINT cascade interrupt for
1824 * for any interrupt handling anyway. 1820 * the master 8259A. In the MPS case such a pin will normally be
1821 * reported as an ExtINT interrupt in the MP table. With ACPI
1822 * there is no provision for ExtINT interrupts, and in the absence
1823 * of an override it would be treated as an ordinary ISA I/O APIC
1824 * interrupt, that is edge-triggered and unmasked by default. We
1825 * used to do this, but it caused problems on some systems because
1826 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
1827 * the same ExtINT cascade interrupt to drive the local APIC of the
1828 * bootstrap processor. Therefore we refrain from routing IRQ2 to
1829 * the I/O APIC in all cases now. No actual device should request
1830 * it anyway. --macro
1825 */ 1831 */
1826#define PIC_IRQS (1<<2) 1832#define PIC_IRQS (1<<2)
1827 1833
@@ -1832,10 +1838,7 @@ void __init setup_IO_APIC(void)
1832 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 1838 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1833 */ 1839 */
1834 1840
1835 if (acpi_ioapic) 1841 io_apic_irqs = ~PIC_IRQS;
1836 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1837 else
1838 io_apic_irqs = ~PIC_IRQS;
1839 1842
1840 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 1843 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1841 1844
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 31f49e8f46a7..0373e88de95a 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -199,6 +199,10 @@ void __init native_init_IRQ(void)
199 /* IPI for generic function call */ 199 /* IPI for generic function call */
200 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 200 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
201 201
202 /* IPI for generic single function call */
203 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
204 call_function_single_interrupt);
205
202 /* Low priority IPI to cleanup after moving an irq */ 206 /* Low priority IPI to cleanup after moving an irq */
203 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 207 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
204#endif 208#endif
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 21f2bae98c15..a8449571858a 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
68 load_LDT(pc); 68 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 69 mask = cpumask_of_cpu(smp_processor_id());
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 70 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
71 smp_call_function(flush_ldt, current->mm, 1, 1); 71 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 72 preempt_enable();
73#else 73#else
74 load_LDT(pc); 74 load_LDT(pc);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f4960171bc66..8864230d55af 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -11,6 +11,8 @@
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15
14#include <asm/pgtable.h> 16#include <asm/pgtable.h>
15#include <asm/pgalloc.h> 17#include <asm/pgalloc.h>
16#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
@@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
107 unsigned long page_list[PAGES_NR]; 109 unsigned long page_list[PAGES_NR];
108 void *control_page; 110 void *control_page;
109 111
112 tracer_disable();
113
110 /* Interrupts aren't acceptable while we reboot */ 114 /* Interrupts aren't acceptable while we reboot */
111 local_irq_disable(); 115 local_irq_disable();
112 116
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 7830dc4a8380..9dd9262693a3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -11,6 +11,8 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15
14#include <asm/pgtable.h> 16#include <asm/pgtable.h>
15#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
16#include <asm/mmu_context.h> 18#include <asm/mmu_context.h>
@@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
184 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
185 void *control_page; 187 void *control_page;
186 188
189 tracer_disable();
190
187 /* Interrupts aren't acceptable while we reboot */ 191 /* Interrupts aren't acceptable while we reboot */
188 local_irq_disable(); 192 local_irq_disable();
189 193
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 9758fea87c5b..56b933119a04 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -76,6 +76,7 @@
76#include <linux/kernel.h> 76#include <linux/kernel.h>
77#include <linux/init.h> 77#include <linux/init.h>
78#include <linux/sched.h> 78#include <linux/sched.h>
79#include <linux/smp_lock.h>
79#include <linux/cpumask.h> 80#include <linux/cpumask.h>
80#include <linux/module.h> 81#include <linux/module.h>
81#include <linux/slab.h> 82#include <linux/slab.h>
@@ -423,6 +424,7 @@ out:
423 424
424static int microcode_open (struct inode *unused1, struct file *unused2) 425static int microcode_open (struct inode *unused1, struct file *unused2)
425{ 426{
427 cycle_kernel_lock();
426 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 428 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
427} 429}
428 430
@@ -489,7 +491,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
489#define microcode_dev_exit() do { } while(0) 491#define microcode_dev_exit() do { } while(0)
490#endif 492#endif
491 493
492static long get_next_ucode_from_buffer(void **mc, void *buf, 494static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
493 unsigned long size, long offset) 495 unsigned long size, long offset)
494{ 496{
495 microcode_header_t *mc_header; 497 microcode_header_t *mc_header;
@@ -523,7 +525,7 @@ static int cpu_request_microcode(int cpu)
523 char name[30]; 525 char name[30];
524 struct cpuinfo_x86 *c = &cpu_data(cpu); 526 struct cpuinfo_x86 *c = &cpu_data(cpu);
525 const struct firmware *firmware; 527 const struct firmware *firmware;
526 void *buf; 528 const u8 *buf;
527 unsigned long size; 529 unsigned long size;
528 long offset = 0; 530 long offset = 0;
529 int error; 531 int error;
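The cycle_kernel_lock() call added to microcode_open() is part of the BKL pushdown of this era: the VFS no longer takes the big kernel lock around ->open(), so drivers that depended on that serialization take or cycle it themselves (msr.c below uses the explicit lock_kernel()/unlock_kernel() variant). A hedged sketch of the pattern for a simple character device, mirroring microcode_open(); the example_* names are placeholders, not from this patch:

	#include <linux/fs.h>
	#include <linux/module.h>
	#include <linux/capability.h>
	#include <linux/smp_lock.h>	/* cycle_kernel_lock(), 2.6.26-era BKL pushdown */

	static int example_open(struct inode *inode, struct file *file)
	{
		/*
		 * Wait for any current BKL holder to drop the lock; this is enough
		 * when open() only needed the old "not concurrent with other BKL
		 * users" guarantee and does no further locked work itself.
		 */
		cycle_kernel_lock();

		return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
	}

	static const struct file_operations example_fops = {
		.owner	= THIS_MODULE,
		.open	= example_open,
	};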
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 8b6b1e05c306..3b25e49380c6 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -726,12 +726,22 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 726static struct intel_mp_floating *mpf_found;
727 727
728/* 728/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
729 * Scan the memory blocks for an SMP configuration block. 735 * Scan the memory blocks for an SMP configuration block.
730 */ 736 */
731static void __init __get_smp_config(unsigned early) 737static void __init __get_smp_config(unsigned int early)
732{ 738{
733 struct intel_mp_floating *mpf = mpf_found; 739 struct intel_mp_floating *mpf = mpf_found;
734 740
741 if (mach_get_smp_config_quirk) {
742 if (mach_get_smp_config_quirk(early))
743 return;
744 }
735 if (acpi_lapic && early) 745 if (acpi_lapic && early)
736 return; 746 return;
737 /* 747 /*
@@ -889,10 +899,16 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
889 return 0; 899 return 0;
890} 900}
891 901
892static void __init __find_smp_config(unsigned reserve) 902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve)
893{ 905{
894 unsigned int address; 906 unsigned int address;
895 907
908 if (mach_find_smp_config_quirk) {
909 if (mach_find_smp_config_quirk(reserve))
910 return;
911 }
896 /* 912 /*
897 * FIXME: Linux assumes you have 640K of base ram.. 913 * FIXME: Linux assumes you have 640K of base ram..
898 * this continues the error... 914 * this continues the error...
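The two new function-pointer hooks let a subarchitecture take over SMP-configuration discovery before the generic MP-table scan runs; the VisWS quirk code added elsewhere in this merge is their first user. A hedged sketch of how a platform might install such quirks from its early-detection path — the example_* names are hypothetical, and the sketch assumes the pointer declarations are visible through the usual mpparse/setup headers:

	static int __init example_find_smp_config(unsigned int reserve)
	{
		/* Non-zero tells the generic code the MP-table scan is handled (or skipped). */
		return 1;
	}

	static int __init example_get_smp_config(unsigned int early)
	{
		/* Register I/O APICs and IRQ routing by hand instead of parsing an MP table. */
		return 1;
	}

	static void __init example_early_detect(void)
	{
		mach_find_smp_config_quirk = example_find_smp_config;
		mach_get_smp_config_quirk  = example_get_smp_config;
	}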
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1f3abe048e93..a153b3905f60 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file)
117{ 117{
118 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 118 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
119 struct cpuinfo_x86 *c = &cpu_data(cpu); 119 struct cpuinfo_x86 *c = &cpu_data(cpu);
120 int ret = 0;
120 121
121 if (cpu >= NR_CPUS || !cpu_online(cpu)) 122 lock_kernel();
122 return -ENXIO; /* No such CPU */ 123 cpu = iminor(file->f_path.dentry->d_inode);
123 if (!cpu_has(c, X86_FEATURE_MSR))
124 return -EIO; /* MSR not supported */
125 124
125 if (cpu >= NR_CPUS || !cpu_online(cpu)) {
126 ret = -ENXIO; /* No such CPU */
127 goto out;
128 }
129 c = &cpu_data(cpu);
130 if (!cpu_has(c, X86_FEATURE_MSR))
131 ret = -EIO; /* MSR not supported */
132out:
133 unlock_kernel();
126 return 0; 134 return 0;
127} 135}
128 136
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 8dfe9db87a9e..ec024b3baad0 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -130,7 +130,7 @@ int __init check_nmi_watchdog(void)
130 130
131#ifdef CONFIG_SMP 131#ifdef CONFIG_SMP
132 if (nmi_watchdog == NMI_LOCAL_APIC) 132 if (nmi_watchdog == NMI_LOCAL_APIC)
133 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 133 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
134#endif 134#endif
135 135
136 for_each_possible_cpu(cpu) 136 for_each_possible_cpu(cpu)
@@ -171,6 +171,9 @@ int __init check_nmi_watchdog(void)
171error: 171error:
172 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) 172 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
173 disable_8259A_irq(0); 173 disable_8259A_irq(0);
174#ifdef CONFIG_X86_32
175 timer_ack = 0;
176#endif
174 return -1; 177 return -1;
175} 178}
176 179
@@ -269,7 +272,7 @@ static void __acpi_nmi_enable(void *__unused)
269void acpi_nmi_enable(void) 272void acpi_nmi_enable(void)
270{ 273{
271 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) 274 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
272 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); 275 on_each_cpu(__acpi_nmi_enable, NULL, 1);
273} 276}
274 277
275static void __acpi_nmi_disable(void *__unused) 278static void __acpi_nmi_disable(void *__unused)
@@ -283,7 +286,7 @@ static void __acpi_nmi_disable(void *__unused)
283void acpi_nmi_disable(void) 286void acpi_nmi_disable(void)
284{ 287{
285 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) 288 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
286 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); 289 on_each_cpu(__acpi_nmi_disable, NULL, 1);
287} 290}
288 291
289void setup_apic_nmi_watchdog(void *unused) 292void setup_apic_nmi_watchdog(void *unused)
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index f0f1de1c4a1d..a23e8233b9ac 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -93,12 +93,13 @@ int __init get_memcfg_numaq(void)
93 return 1; 93 return 1;
94} 94}
95 95
96static int __init numaq_tsc_disable(void) 96void __init numaq_tsc_disable(void)
97{ 97{
98 if (!found_numaq)
99 return;
100
98 if (num_online_nodes() > 1) { 101 if (num_online_nodes() > 1) {
99 printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); 102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
100 setup_clear_cpu_cap(X86_FEATURE_TSC); 103 setup_clear_cpu_cap(X86_FEATURE_TSC);
101 } 104 }
102 return 0;
103} 105}
104arch_initcall(numaq_tsc_disable);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e7e5652f65bc..e0f571d58c19 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -285,7 +285,7 @@ struct pv_time_ops pv_time_ops = {
285 .get_wallclock = native_get_wallclock, 285 .get_wallclock = native_get_wallclock,
286 .set_wallclock = native_set_wallclock, 286 .set_wallclock = native_set_wallclock,
287 .sched_clock = native_sched_clock, 287 .sched_clock = native_sched_clock,
288 .get_cpu_khz = native_calculate_cpu_khz, 288 .get_tsc_khz = native_calibrate_tsc,
289}; 289};
290 290
291struct pv_irq_ops pv_irq_ops = { 291struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d0d18db5d2a4..c3fe78406d18 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -630,6 +630,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
630 struct pci_dev *dev; 630 struct pci_dev *dev;
631 void *gatt; 631 void *gatt;
632 int i, error; 632 int i, error;
633 unsigned long start_pfn, end_pfn;
633 634
634 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 635 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
635 aper_size = aper_base = info->aper_size = 0; 636 aper_size = aper_base = info->aper_size = 0;
@@ -674,6 +675,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
674 675
675 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 676 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
676 aper_base, aper_size>>10); 677 aper_base, aper_size>>10);
678
679 /* need to map that range */
680 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
681 if (end_pfn > max_low_pfn_mapped) {
682 start_pfn = (aper_base>>PAGE_SHIFT);
683 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
684 }
677 return 0; 685 return 0;
678 686
679 nommu: 687 nommu:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4061d63aabe7..4d629c62f4f8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,12 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/clockchips.h> 9#include <linux/clockchips.h>
10#include <asm/system.h>
11
12unsigned long idle_halt;
13EXPORT_SYMBOL(idle_halt);
14unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait);
10 16
11struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
12 18
@@ -132,7 +138,7 @@ void cpu_idle_wait(void)
132{ 138{
133 smp_mb(); 139 smp_mb();
134 /* kick all the CPUs so that they exit out of pm_idle */ 140 /* kick all the CPUs so that they exit out of pm_idle */
135 smp_call_function(do_nothing, NULL, 0, 1); 141 smp_call_function(do_nothing, NULL, 1);
136} 142}
137EXPORT_SYMBOL_GPL(cpu_idle_wait); 143EXPORT_SYMBOL_GPL(cpu_idle_wait);
138 144
@@ -325,7 +331,27 @@ static int __init idle_setup(char *str)
325 pm_idle = poll_idle; 331 pm_idle = poll_idle;
326 } else if (!strcmp(str, "mwait")) 332 } else if (!strcmp(str, "mwait"))
327 force_mwait = 1; 333 force_mwait = 1;
328 else 334 else if (!strcmp(str, "halt")) {
335 /*
336 * When the boot option of idle=halt is added, halt is
337 * forced to be used for CPU idle. In such case CPU C2/C3
338 * won't be used again.
339 * To continue to load the CPU idle driver, don't touch
340 * the boot_option_idle_override.
341 */
342 pm_idle = default_idle;
343 idle_halt = 1;
344 return 0;
345 } else if (!strcmp(str, "nomwait")) {
346 /*
347 * If the boot option of "idle=nomwait" is added,
348 * it means that mwait will be disabled for CPU C2/C3
349 * states. In such case it won't touch the variable
350 * of boot_option_idle_override.
351 */
352 idle_nomwait = 1;
353 return 0;
354 } else
329 return -1; 355 return -1;
330 356
331 boot_option_idle_override = 1; 357 boot_option_idle_override = 1;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9a139f6c9df3..0c3927accb00 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -142,7 +142,10 @@ void cpu_idle(void)
142 142
143 local_irq_disable(); 143 local_irq_disable();
144 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 144 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
145 /* Don't trace irqs off for idle */
146 stop_critical_timings();
145 pm_idle(); 147 pm_idle();
148 start_critical_timings();
146 } 149 }
147 tick_nohz_restart_sched_tick(); 150 tick_nohz_restart_sched_tick();
148 preempt_enable_no_resched(); 151 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index db5eb963e4df..a8e53626ac9a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -134,7 +134,10 @@ void cpu_idle(void)
134 */ 134 */
135 local_irq_disable(); 135 local_irq_disable();
136 enter_idle(); 136 enter_idle();
137 /* Don't trace irqs off for idle */
138 stop_critical_timings();
137 pm_idle(); 139 pm_idle();
140 start_critical_timings();
138 /* In many cases the interrupt that ended idle 141 /* In many cases the interrupt that ended idle
139 has already called exit_idle. But some idle 142 has already called exit_idle. But some idle
140 loops can be woken up without interrupt. */ 143 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 79bdcd11c66e..d13858818100 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -266,6 +266,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
266 hpet_print_force_info(); 266 hpet_print_force_info();
267} 267}
268 268
269DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
270 old_ich_force_enable_hpet_user);
269DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, 271DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
270 old_ich_force_enable_hpet_user); 272 old_ich_force_enable_hpet_user);
271DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, 273DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cfcfbefee0b9..531b55b8e81a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -394,11 +394,10 @@ static void __init parse_setup_data(void)
394 } 394 }
395} 395}
396 396
397static void __init reserve_setup_data(void) 397static void __init e820_reserve_setup_data(void)
398{ 398{
399 struct setup_data *data; 399 struct setup_data *data;
400 u64 pa_data; 400 u64 pa_data;
401 char buf[32];
402 int found = 0; 401 int found = 0;
403 402
404 if (boot_params.hdr.version < 0x0209) 403 if (boot_params.hdr.version < 0x0209)
@@ -406,8 +405,6 @@ static void __init reserve_setup_data(void)
406 pa_data = boot_params.hdr.setup_data; 405 pa_data = boot_params.hdr.setup_data;
407 while (pa_data) { 406 while (pa_data) {
408 data = early_ioremap(pa_data, sizeof(*data)); 407 data = early_ioremap(pa_data, sizeof(*data));
409 sprintf(buf, "setup data %x", data->type);
410 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
411 e820_update_range(pa_data, sizeof(*data)+data->len, 408 e820_update_range(pa_data, sizeof(*data)+data->len,
412 E820_RAM, E820_RESERVED_KERN); 409 E820_RAM, E820_RESERVED_KERN);
413 found = 1; 410 found = 1;
@@ -418,10 +415,29 @@ static void __init reserve_setup_data(void)
418 return; 415 return;
419 416
420 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 417 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
418 memcpy(&e820_saved, &e820, sizeof(struct e820map));
421 printk(KERN_INFO "extended physical RAM map:\n"); 419 printk(KERN_INFO "extended physical RAM map:\n");
422 e820_print_map("reserve setup_data"); 420 e820_print_map("reserve setup_data");
423} 421}
424 422
423static void __init reserve_early_setup_data(void)
424{
425 struct setup_data *data;
426 u64 pa_data;
427 char buf[32];
428
429 if (boot_params.hdr.version < 0x0209)
430 return;
431 pa_data = boot_params.hdr.setup_data;
432 while (pa_data) {
433 data = early_ioremap(pa_data, sizeof(*data));
434 sprintf(buf, "setup data %x", data->type);
435 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
436 pa_data = data->next;
437 early_iounmap(data, sizeof(*data));
438 }
439}
440
425/* 441/*
426 * --------- Crashkernel reservation ------------------------------ 442 * --------- Crashkernel reservation ------------------------------
427 */ 443 */
@@ -580,6 +596,7 @@ void __init setup_arch(char **cmdline_p)
580{ 596{
581#ifdef CONFIG_X86_32 597#ifdef CONFIG_X86_32
582 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 598 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
599 visws_early_detect();
583 pre_setup_arch_hook(); 600 pre_setup_arch_hook();
584 early_cpu_init(); 601 early_cpu_init();
585#else 602#else
@@ -626,6 +643,8 @@ void __init setup_arch(char **cmdline_p)
626 643
627 setup_memory_map(); 644 setup_memory_map();
628 parse_setup_data(); 645 parse_setup_data();
646 /* update the e820_saved too */
647 e820_reserve_setup_data();
629 648
630 copy_edd(); 649 copy_edd();
631 650
@@ -656,7 +675,7 @@ void __init setup_arch(char **cmdline_p)
656 parse_early_param(); 675 parse_early_param();
657 676
658 /* after early param, so could get panic from serial */ 677 /* after early param, so could get panic from serial */
659 reserve_setup_data(); 678 reserve_early_setup_data();
660 679
661 if (acpi_mps_check()) { 680 if (acpi_mps_check()) {
662#ifdef CONFIG_X86_LOCAL_APIC 681#ifdef CONFIG_X86_LOCAL_APIC
@@ -665,6 +684,11 @@ void __init setup_arch(char **cmdline_p)
665 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 684 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
666 } 685 }
667 686
687#ifdef CONFIG_PCI
688 if (pci_early_dump_regs)
689 early_dump_pci_devices();
690#endif
691
668 finish_e820_parsing(); 692 finish_e820_parsing();
669 693
670#ifdef CONFIG_X86_32 694#ifdef CONFIG_X86_32
@@ -691,22 +715,18 @@ void __init setup_arch(char **cmdline_p)
691 early_gart_iommu_check(); 715 early_gart_iommu_check();
692#endif 716#endif
693 717
694 e820_register_active_regions(0, 0, -1UL);
695 /* 718 /*
696 * partially used pages are not usable - thus 719 * partially used pages are not usable - thus
697 * we are rounding upwards: 720 * we are rounding upwards:
698 */ 721 */
699 max_pfn = e820_end_of_ram(); 722 max_pfn = e820_end_of_ram_pfn();
700 723
701 /* preallocate 4k for mptable mpc */ 724 /* preallocate 4k for mptable mpc */
702 early_reserve_e820_mpc_new(); 725 early_reserve_e820_mpc_new();
703 /* update e820 for memory not covered by WB MTRRs */ 726 /* update e820 for memory not covered by WB MTRRs */
704 mtrr_bp_init(); 727 mtrr_bp_init();
705 if (mtrr_trim_uncached_memory(max_pfn)) { 728 if (mtrr_trim_uncached_memory(max_pfn))
706 remove_all_active_ranges(); 729 max_pfn = e820_end_of_ram_pfn();
707 e820_register_active_regions(0, 0, -1UL);
708 max_pfn = e820_end_of_ram();
709 }
710 730
711#ifdef CONFIG_X86_32 731#ifdef CONFIG_X86_32
712 /* max_low_pfn get updated here */ 732 /* max_low_pfn get updated here */
@@ -718,12 +738,26 @@ void __init setup_arch(char **cmdline_p)
718 738
719 /* How many end-of-memory variables you have, grandma! */ 739 /* How many end-of-memory variables you have, grandma! */
720 /* need this before calling reserve_initrd */ 740 /* need this before calling reserve_initrd */
721 max_low_pfn = max_pfn; 741 if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
742 max_low_pfn = e820_end_of_low_ram_pfn();
743 else
744 max_low_pfn = max_pfn;
745
722 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 746 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
723#endif 747#endif
724 748
725 /* max_pfn_mapped is updated here */ 749 /* max_pfn_mapped is updated here */
726 max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); 750 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
751 max_pfn_mapped = max_low_pfn_mapped;
752
753#ifdef CONFIG_X86_64
754 if (max_pfn > max_low_pfn) {
755 max_pfn_mapped = init_memory_mapping(1UL<<32,
756 max_pfn<<PAGE_SHIFT);
 757 /* can we preserve max_low_pfn? */
758 max_low_pfn = max_pfn;
759 }
760#endif
727 761
728 /* 762 /*
729 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 763 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -749,9 +783,6 @@ void __init setup_arch(char **cmdline_p)
749 */ 783 */
750 acpi_boot_table_init(); 784 acpi_boot_table_init();
751 785
752 /* Remove active ranges so rediscovery with NUMA-awareness happens */
753 remove_all_active_ranges();
754
755#ifdef CONFIG_ACPI_NUMA 786#ifdef CONFIG_ACPI_NUMA
756 /* 787 /*
757 * Parse SRAT to discover nodes. 788 * Parse SRAT to discover nodes.
@@ -823,6 +854,14 @@ void __init setup_arch(char **cmdline_p)
823 init_cpu_to_node(); 854 init_cpu_to_node();
824#endif 855#endif
825 856
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
826 init_apic_mappings(); 865 init_apic_mappings();
827 ioapic_init_mappings(); 866 ioapic_init_mappings();
828 867
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5fc310f746fc..cac68430d31f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -343,23 +343,23 @@ static const cpumask_t cpu_mask_none;
343/* 343/*
344 * Returns a pointer to the bitmask of CPUs on Node 'node'. 344 * Returns a pointer to the bitmask of CPUs on Node 'node'.
345 */ 345 */
346cpumask_t *_node_to_cpumask_ptr(int node) 346const cpumask_t *_node_to_cpumask_ptr(int node)
347{ 347{
348 if (node_to_cpumask_map == NULL) { 348 if (node_to_cpumask_map == NULL) {
349 printk(KERN_WARNING 349 printk(KERN_WARNING
350 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", 350 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
351 node); 351 node);
352 dump_stack(); 352 dump_stack();
353 return &cpu_online_map; 353 return (const cpumask_t *)&cpu_online_map;
354 } 354 }
355 if (node >= nr_node_ids) { 355 if (node >= nr_node_ids) {
356 printk(KERN_WARNING 356 printk(KERN_WARNING
357 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", 357 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
358 node, nr_node_ids); 358 node, nr_node_ids);
359 dump_stack(); 359 dump_stack();
360 return (cpumask_t *)&cpu_mask_none; 360 return &cpu_mask_none;
361 } 361 }
362 return (cpumask_t *)&node_to_cpumask_map[node]; 362 return &node_to_cpumask_map[node];
363} 363}
364EXPORT_SYMBOL(_node_to_cpumask_ptr); 364EXPORT_SYMBOL(_node_to_cpumask_ptr);
365 365
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 0cb7aadc87cd..361b7a4c640c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu)
121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
122} 122}
123 123
124/* 124void native_send_call_func_single_ipi(int cpu)
125 * Structure and data for smp_call_function(). This is designed to minimise
126 * static memory requirements. It also looks cleaner.
127 */
128static DEFINE_SPINLOCK(call_lock);
129
130struct call_data_struct {
131 void (*func) (void *info);
132 void *info;
133 atomic_t started;
134 atomic_t finished;
135 int wait;
136};
137
138void lock_ipi_call_lock(void)
139{ 125{
140 spin_lock_irq(&call_lock); 126 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
141}
142
143void unlock_ipi_call_lock(void)
144{
145 spin_unlock_irq(&call_lock);
146}
147
148static struct call_data_struct *call_data;
149
150static void __smp_call_function(void (*func) (void *info), void *info,
151 int nonatomic, int wait)
152{
153 struct call_data_struct data;
154 int cpus = num_online_cpus() - 1;
155
156 if (!cpus)
157 return;
158
159 data.func = func;
160 data.info = info;
161 atomic_set(&data.started, 0);
162 data.wait = wait;
163 if (wait)
164 atomic_set(&data.finished, 0);
165
166 call_data = &data;
167 mb();
168
169 /* Send a message to all other CPUs and wait for them to respond */
170 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
171
172 /* Wait for response */
173 while (atomic_read(&data.started) != cpus)
174 cpu_relax();
175
176 if (wait)
177 while (atomic_read(&data.finished) != cpus)
178 cpu_relax();
179} 127}
180 128
181 129void native_send_call_func_ipi(cpumask_t mask)
182/**
183 * smp_call_function_mask(): Run a function on a set of other CPUs.
184 * @mask: The set of cpus to run on. Must not include the current cpu.
185 * @func: The function to run. This must be fast and non-blocking.
186 * @info: An arbitrary pointer to pass to the function.
187 * @wait: If true, wait (atomically) until function has completed on other CPUs.
188 *
189 * Returns 0 on success, else a negative status code.
190 *
191 * If @wait is true, then returns once @func has returned; otherwise
192 * it returns just before the target cpu calls @func.
193 *
194 * You must not call this function with disabled interrupts or from a
195 * hardware interrupt handler or from a bottom half handler.
196 */
197static int
198native_smp_call_function_mask(cpumask_t mask,
199 void (*func)(void *), void *info,
200 int wait)
201{ 130{
202 struct call_data_struct data;
203 cpumask_t allbutself; 131 cpumask_t allbutself;
204 int cpus;
205
206 /* Can deadlock when called with interrupts disabled */
207 WARN_ON(irqs_disabled());
208
209 /* Holding any lock stops cpus from going down. */
210 spin_lock(&call_lock);
211 132
212 allbutself = cpu_online_map; 133 allbutself = cpu_online_map;
213 cpu_clear(smp_processor_id(), allbutself); 134 cpu_clear(smp_processor_id(), allbutself);
214 135
215 cpus_and(mask, mask, allbutself);
216 cpus = cpus_weight(mask);
217
218 if (!cpus) {
219 spin_unlock(&call_lock);
220 return 0;
221 }
222
223 data.func = func;
224 data.info = info;
225 atomic_set(&data.started, 0);
226 data.wait = wait;
227 if (wait)
228 atomic_set(&data.finished, 0);
229
230 call_data = &data;
231 wmb();
232
233 /* Send a message to other CPUs */
234 if (cpus_equal(mask, allbutself) && 136 if (cpus_equal(mask, allbutself) &&
235 cpus_equal(cpu_online_map, cpu_callout_map)) 137 cpus_equal(cpu_online_map, cpu_callout_map))
236 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 138 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
237 else 139 else
238 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
239
240 /* Wait for response */
241 while (atomic_read(&data.started) != cpus)
242 cpu_relax();
243
244 if (wait)
245 while (atomic_read(&data.finished) != cpus)
246 cpu_relax();
247 spin_unlock(&call_lock);
248
249 return 0;
250} 141}
251 142
252static void stop_this_cpu(void *dummy) 143static void stop_this_cpu(void *dummy)
@@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy)
268 159
269static void native_smp_send_stop(void) 160static void native_smp_send_stop(void)
270{ 161{
271 int nolock;
272 unsigned long flags; 162 unsigned long flags;
273 163
274 if (reboot_force) 164 if (reboot_force)
275 return; 165 return;
276 166
277 /* Don't deadlock on the call lock in panic */ 167 smp_call_function(stop_this_cpu, NULL, 0);
278 nolock = !spin_trylock(&call_lock);
279 local_irq_save(flags); 168 local_irq_save(flags);
280 __smp_call_function(stop_this_cpu, NULL, 0, 0);
281 if (!nolock)
282 spin_unlock(&call_lock);
283 disable_local_APIC(); 169 disable_local_APIC();
284 local_irq_restore(flags); 170 local_irq_restore(flags);
285} 171}
@@ -301,33 +187,28 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
301 187
302void smp_call_function_interrupt(struct pt_regs *regs) 188void smp_call_function_interrupt(struct pt_regs *regs)
303{ 189{
304 void (*func) (void *info) = call_data->func;
305 void *info = call_data->info;
306 int wait = call_data->wait;
307
308 ack_APIC_irq(); 190 ack_APIC_irq();
309 /*
310 * Notify initiating CPU that I've grabbed the data and am
311 * about to execute the function
312 */
313 mb();
314 atomic_inc(&call_data->started);
315 /*
316 * At this point the info structure may be out of scope unless wait==1
317 */
318 irq_enter(); 191 irq_enter();
319 (*func)(info); 192 generic_smp_call_function_interrupt();
320#ifdef CONFIG_X86_32 193#ifdef CONFIG_X86_32
321 __get_cpu_var(irq_stat).irq_call_count++; 194 __get_cpu_var(irq_stat).irq_call_count++;
322#else 195#else
323 add_pda(irq_call_count, 1); 196 add_pda(irq_call_count, 1);
324#endif 197#endif
325 irq_exit(); 198 irq_exit();
199}
326 200
327 if (wait) { 201void smp_call_function_single_interrupt(struct pt_regs *regs)
328 mb(); 202{
329 atomic_inc(&call_data->finished); 203 ack_APIC_irq();
330 } 204 irq_enter();
205 generic_smp_call_function_single_interrupt();
206#ifdef CONFIG_X86_32
207 __get_cpu_var(irq_stat).irq_call_count++;
208#else
209 add_pda(irq_call_count, 1);
210#endif
211 irq_exit();
331} 212}
332 213
333struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
@@ -338,7 +219,8 @@ struct smp_ops smp_ops = {
338 219
339 .smp_send_stop = native_smp_send_stop, 220 .smp_send_stop = native_smp_send_stop,
340 .smp_send_reschedule = native_smp_send_reschedule, 221 .smp_send_reschedule = native_smp_send_reschedule,
341 .smp_call_function_mask = native_smp_call_function_mask, 222
223 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi,
342}; 225};
343EXPORT_SYMBOL_GPL(smp_ops); 226EXPORT_SYMBOL_GPL(smp_ops);
344
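After this conversion the arch code no longer keeps its own cross-call bookkeeping; it only sends the IPIs and the generic code in kernel/smp.c does the rest, which is why every caller touched in this merge drops the old nonatomic argument. A hedged sketch of the resulting call signatures; the do_something()/example() helpers are placeholders, not from this patch:

	#include <linux/smp.h>

	static void do_something(void *info)
	{
		/* Runs on each targeted CPU, in interrupt context. */
	}

	static void example(void *cookie)
	{
		/* Old 4-argument form was: smp_call_function(do_something, cookie, 0, 1); */

		/* Must not be called with interrupts disabled when wait == 1. */
		smp_call_function(do_something, cookie, 1);		/* all other CPUs, wait */
		smp_call_function_single(0, do_something, cookie, 1);	/* just CPU 0, wait */
		on_each_cpu(do_something, cookie, 1);			/* all CPUs incl. this one */
	}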
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e1200b202ed7..687376ab07e8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -327,12 +327,12 @@ static void __cpuinit start_secondary(void *unused)
327 * lock helps us to not include this cpu in a currently in progress 327 * lock helps us to not include this cpu in a currently in progress
328 * smp_call_function(). 328 * smp_call_function().
329 */ 329 */
330 lock_ipi_call_lock(); 330 ipi_call_lock_irq();
331#ifdef CONFIG_X86_IO_APIC 331#ifdef CONFIG_X86_IO_APIC
332 setup_vector_irq(smp_processor_id()); 332 setup_vector_irq(smp_processor_id());
333#endif 333#endif
334 cpu_set(smp_processor_id(), cpu_online_map); 334 cpu_set(smp_processor_id(), cpu_online_map);
335 unlock_ipi_call_lock(); 335 ipi_call_unlock_irq();
336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
337 337
338 setup_secondary_clock(); 338 setup_secondary_clock();
@@ -939,9 +939,9 @@ do_rest:
939 inquire_remote_apic(apicid); 939 inquire_remote_apic(apicid);
940 } 940 }
941 } 941 }
942 942#ifdef CONFIG_X86_64
943restore_state: 943restore_state:
944 944#endif
945 if (boot_error) { 945 if (boot_error) {
946 /* Try to put things back the way they were before ... */ 946 /* Try to put things back the way they were before ... */
947 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 947 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 3449064d141a..99941b37eca0 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -25,59 +25,3 @@ __cpuinit void init_gdt(int cpu)
25 per_cpu(cpu_number, cpu) = cpu; 25 per_cpu(cpu_number, cpu) = cpu;
26} 26}
27#endif 27#endif
28
29/**
30 * smp_call_function(): Run a function on all other CPUs.
31 * @func: The function to run. This must be fast and non-blocking.
32 * @info: An arbitrary pointer to pass to the function.
33 * @nonatomic: Unused.
34 * @wait: If true, wait (atomically) until function has completed on other CPUs.
35 *
36 * Returns 0 on success, else a negative status code.
37 *
38 * If @wait is true, then returns once @func has returned; otherwise
39 * it returns just before the target cpu calls @func.
40 *
41 * You must not call this function with disabled interrupts or from a
42 * hardware interrupt handler or from a bottom half handler.
43 */
44int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
45 int wait)
46{
47 return smp_call_function_mask(cpu_online_map, func, info, wait);
48}
49EXPORT_SYMBOL(smp_call_function);
50
51/**
52 * smp_call_function_single - Run a function on a specific CPU
53 * @cpu: The target CPU. Cannot be the calling CPU.
54 * @func: The function to run. This must be fast and non-blocking.
55 * @info: An arbitrary pointer to pass to the function.
56 * @nonatomic: Unused.
57 * @wait: If true, wait until function has completed on other CPUs.
58 *
59 * Returns 0 on success, else a negative status code.
60 *
61 * If @wait is true, then returns once @func has returned; otherwise
62 * it returns just before the target cpu calls @func.
63 */
64int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
65 int nonatomic, int wait)
66{
67 /* prevent preemption and reschedule on another processor */
68 int ret;
69 int me = get_cpu();
70 if (cpu == me) {
71 local_irq_disable();
72 func(info);
73 local_irq_enable();
74 put_cpu();
75 return 0;
76 }
77
78 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
79
80 put_cpu();
81 return ret;
82}
83EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c28c342c162f..a03e7f6d90c3 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace)
74 if (trace->nr_entries < trace->max_entries) 74 if (trace->nr_entries < trace->max_entries)
75 trace->entries[trace->nr_entries++] = ULONG_MAX; 75 trace->entries[trace->nr_entries++] = ULONG_MAX;
76} 76}
77EXPORT_SYMBOL_GPL(save_stack_trace);
77 78
78void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 79void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
79{ 80{
@@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81 if (trace->nr_entries < trace->max_entries) 82 if (trace->nr_entries < trace->max_entries)
82 trace->entries[trace->nr_entries++] = ULONG_MAX; 83 trace->entries[trace->nr_entries++] = ULONG_MAX;
83} 84}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 5f29f12da50c..059ca6ee59b4 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -39,9 +39,6 @@
39 39
40#include "do_timer.h" 40#include "do_timer.h"
41 41
42unsigned int cpu_khz; /* Detected as we calibrate the TSC */
43EXPORT_SYMBOL(cpu_khz);
44
45int timer_ack; 42int timer_ack;
46 43
47unsigned long profile_pc(struct pt_regs *regs) 44unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 39ae8511a137..e3d49c553af2 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
56/* calibrate_cpu is used on systems with fixed rate TSCs to determine 56/* calibrate_cpu is used on systems with fixed rate TSCs to determine
57 * processor frequency */ 57 * processor frequency */
58#define TICK_COUNT 100000000 58#define TICK_COUNT 100000000
59unsigned long __init native_calculate_cpu_khz(void) 59unsigned long __init calibrate_cpu(void)
60{ 60{
61 int tsc_start, tsc_now; 61 int tsc_start, tsc_now;
62 int i, no_ctr_free; 62 int i, no_ctr_free;
@@ -116,25 +116,11 @@ void __init hpet_time_init(void)
116 116
117void __init time_init(void) 117void __init time_init(void)
118{ 118{
119 tsc_calibrate(); 119 tsc_init();
120
121 cpu_khz = tsc_khz;
122 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
123 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
124 cpu_khz = calculate_cpu_khz();
125
126 lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ;
127
128 if (unsynchronized_tsc())
129 mark_tsc_unstable("TSCs unsynchronized");
130
131 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) 120 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
132 vgetcpu_mode = VGETCPU_RDTSCP; 121 vgetcpu_mode = VGETCPU_RDTSCP;
133 else 122 else
134 vgetcpu_mode = VGETCPU_LSL; 123 vgetcpu_mode = VGETCPU_LSL;
135 124
136 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
137 cpu_khz / 1000, cpu_khz % 1000);
138 init_tsc_clocksource();
139 late_time_init = choose_time_init(); 125 late_time_init = choose_time_init();
140} 126}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 9bb2363851af..fec1ecedc9b7 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info)
238 238
239void flush_tlb_all(void) 239void flush_tlb_all(void)
240{ 240{
241 on_each_cpu(do_flush_tlb_all, NULL, 1, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 5039d0f097a2..dcbf7a1159ea 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -275,5 +275,5 @@ static void do_flush_tlb_all(void *info)
275 275
276void flush_tlb_all(void) 276void flush_tlb_all(void)
277{ 277{
278 on_each_cpu(do_flush_tlb_all, NULL, 1, 1); 278 on_each_cpu(do_flush_tlb_all, NULL, 1);
279} 279}
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index d7cc292691ff..8a768973c4f0 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
3 * 4 *
4 * Pentium III FXSR, SSE support 5 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000 6 * Gareth Hughes <gareth@valinux.com>, May 2000
@@ -60,8 +61,6 @@
60 61
61#include "mach_traps.h" 62#include "mach_traps.h"
62 63
63int panic_on_unrecovered_nmi;
64
65DECLARE_BITMAP(used_vectors, NR_VECTORS); 64DECLARE_BITMAP(used_vectors, NR_VECTORS);
66EXPORT_SYMBOL_GPL(used_vectors); 65EXPORT_SYMBOL_GPL(used_vectors);
67 66
@@ -98,19 +97,22 @@ asmlinkage void alignment_check(void);
98asmlinkage void spurious_interrupt_bug(void); 97asmlinkage void spurious_interrupt_bug(void);
99asmlinkage void machine_check(void); 98asmlinkage void machine_check(void);
100 99
100int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24; 101int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 102static unsigned int code_bytes = 64;
103static int ignore_nmis;
104static int die_counter;
103 105
104void printk_address(unsigned long address, int reliable) 106void printk_address(unsigned long address, int reliable)
105{ 107{
106#ifdef CONFIG_KALLSYMS 108#ifdef CONFIG_KALLSYMS
107 char namebuf[KSYM_NAME_LEN];
108 unsigned long offset = 0; 109 unsigned long offset = 0;
109 unsigned long symsize; 110 unsigned long symsize;
110 const char *symname; 111 const char *symname;
111 char reliab[4] = "";
112 char *delim = ":";
113 char *modname; 112 char *modname;
113 char *delim = ":";
114 char namebuf[KSYM_NAME_LEN];
115 char reliab[4] = "";
114 116
115 symname = kallsyms_lookup(address, &symsize, &offset, 117 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf); 118 &modname, namebuf);
@@ -130,22 +132,23 @@ void printk_address(unsigned long address, int reliable)
130#endif 132#endif
131} 133}
132 134
133static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) 135static inline int valid_stack_ptr(struct thread_info *tinfo,
136 void *p, unsigned int size)
134{ 137{
135 return p > (void *)tinfo && 138 void *t = tinfo;
136 p <= (void *)tinfo + THREAD_SIZE - size; 139 return p > t && p <= t + THREAD_SIZE - size;
137} 140}
138 141
139/* The form of the top of the frame on the stack */ 142/* The form of the top of the frame on the stack */
140struct stack_frame { 143struct stack_frame {
141 struct stack_frame *next_frame; 144 struct stack_frame *next_frame;
142 unsigned long return_address; 145 unsigned long return_address;
143}; 146};
144 147
145static inline unsigned long 148static inline unsigned long
146print_context_stack(struct thread_info *tinfo, 149print_context_stack(struct thread_info *tinfo,
147 unsigned long *stack, unsigned long bp, 150 unsigned long *stack, unsigned long bp,
148 const struct stacktrace_ops *ops, void *data) 151 const struct stacktrace_ops *ops, void *data)
149{ 152{
150 struct stack_frame *frame = (struct stack_frame *)bp; 153 struct stack_frame *frame = (struct stack_frame *)bp;
151 154
@@ -167,8 +170,6 @@ print_context_stack(struct thread_info *tinfo,
167 return bp; 170 return bp;
168} 171}
169 172
170#define MSG(msg) ops->warning(data, msg)
171
172void dump_trace(struct task_struct *task, struct pt_regs *regs, 173void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 unsigned long *stack, unsigned long bp, 174 unsigned long *stack, unsigned long bp,
174 const struct stacktrace_ops *ops, void *data) 175 const struct stacktrace_ops *ops, void *data)
@@ -178,7 +179,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
178 179
179 if (!stack) { 180 if (!stack) {
180 unsigned long dummy; 181 unsigned long dummy;
181
182 stack = &dummy; 182 stack = &dummy;
183 if (task != current) 183 if (task != current)
184 stack = (unsigned long *)task->thread.sp; 184 stack = (unsigned long *)task->thread.sp;
@@ -196,7 +196,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
196 } 196 }
197#endif 197#endif
198 198
199 while (1) { 199 for (;;) {
200 struct thread_info *context; 200 struct thread_info *context;
201 201
202 context = (struct thread_info *) 202 context = (struct thread_info *)
@@ -248,10 +248,10 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
248} 248}
249 249
250static const struct stacktrace_ops print_trace_ops = { 250static const struct stacktrace_ops print_trace_ops = {
251 .warning = print_trace_warning, 251 .warning = print_trace_warning,
252 .warning_symbol = print_trace_warning_symbol, 252 .warning_symbol = print_trace_warning_symbol,
253 .stack = print_trace_stack, 253 .stack = print_trace_stack,
254 .address = print_trace_address, 254 .address = print_trace_address,
255}; 255};
256 256
257static void 257static void
@@ -351,15 +351,14 @@ void show_registers(struct pt_regs *regs)
351 printk(KERN_EMERG "Code: "); 351 printk(KERN_EMERG "Code: ");
352 352
353 ip = (u8 *)regs->ip - code_prologue; 353 ip = (u8 *)regs->ip - code_prologue;
354 if (ip < (u8 *)PAGE_OFFSET || 354 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
355 probe_kernel_address(ip, c)) {
356 /* try starting at EIP */ 355 /* try starting at EIP */
357 ip = (u8 *)regs->ip; 356 ip = (u8 *)regs->ip;
358 code_len = code_len - code_prologue + 1; 357 code_len = code_len - code_prologue + 1;
359 } 358 }
360 for (i = 0; i < code_len; i++, ip++) { 359 for (i = 0; i < code_len; i++, ip++) {
361 if (ip < (u8 *)PAGE_OFFSET || 360 if (ip < (u8 *)PAGE_OFFSET ||
362 probe_kernel_address(ip, c)) { 361 probe_kernel_address(ip, c)) {
363 printk(" Bad EIP value."); 362 printk(" Bad EIP value.");
364 break; 363 break;
365 } 364 }
@@ -384,8 +383,6 @@ int is_valid_bugaddr(unsigned long ip)
384 return ud2 == 0x0b0f; 383 return ud2 == 0x0b0f;
385} 384}
386 385
387static int die_counter;
388
389int __kprobes __die(const char *str, struct pt_regs *regs, long err) 386int __kprobes __die(const char *str, struct pt_regs *regs, long err)
390{ 387{
391 unsigned short ss; 388 unsigned short ss;
@@ -402,26 +399,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
402 printk("DEBUG_PAGEALLOC"); 399 printk("DEBUG_PAGEALLOC");
403#endif 400#endif
404 printk("\n"); 401 printk("\n");
405
406 if (notify_die(DIE_OOPS, str, regs, err, 402 if (notify_die(DIE_OOPS, str, regs, err,
407 current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { 403 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
408 404 return 1;
409 show_registers(regs);
410 /* Executive summary in case the oops scrolled away */
411 sp = (unsigned long) (&regs->sp);
412 savesegment(ss, ss);
413 if (user_mode(regs)) {
414 sp = regs->sp;
415 ss = regs->ss & 0xffff;
416 }
417 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
418 print_symbol("%s", regs->ip);
419 printk(" SS:ESP %04x:%08lx\n", ss, sp);
420 405
421 return 0; 406 show_registers(regs);
407 /* Executive summary in case the oops scrolled away */
408 sp = (unsigned long) (&regs->sp);
409 savesegment(ss, ss);
410 if (user_mode(regs)) {
411 sp = regs->sp;
412 ss = regs->ss & 0xffff;
422 } 413 }
423 414 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
424 return 1; 415 print_symbol("%s", regs->ip);
416 printk(" SS:ESP %04x:%08lx\n", ss, sp);
417 return 0;
425} 418}
426 419
427/* 420/*
@@ -546,7 +539,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
546{ \ 539{ \
547 trace_hardirqs_fixup(); \ 540 trace_hardirqs_fixup(); \
548 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 541 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
549 == NOTIFY_STOP) \ 542 == NOTIFY_STOP) \
550 return; \ 543 return; \
551 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ 544 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
552} 545}
@@ -562,7 +555,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
562 info.si_code = sicode; \ 555 info.si_code = sicode; \
563 info.si_addr = (void __user *)siaddr; \ 556 info.si_addr = (void __user *)siaddr; \
564 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 557 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
565 == NOTIFY_STOP) \ 558 == NOTIFY_STOP) \
566 return; \ 559 return; \
567 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 560 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
568} 561}
@@ -571,7 +564,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
571void do_##name(struct pt_regs *regs, long error_code) \ 564void do_##name(struct pt_regs *regs, long error_code) \
572{ \ 565{ \
573 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 566 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
574 == NOTIFY_STOP) \ 567 == NOTIFY_STOP) \
575 return; \ 568 return; \
576 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ 569 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
577} 570}
@@ -586,27 +579,29 @@ void do_##name(struct pt_regs *regs, long error_code) \
586 info.si_addr = (void __user *)siaddr; \ 579 info.si_addr = (void __user *)siaddr; \
587 trace_hardirqs_fixup(); \ 580 trace_hardirqs_fixup(); \
588 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 581 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
589 == NOTIFY_STOP) \ 582 == NOTIFY_STOP) \
590 return; \ 583 return; \
591 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 584 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
592} 585}
593 586
594DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 587DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
595#ifndef CONFIG_KPROBES 588#ifndef CONFIG_KPROBES
596DO_VM86_ERROR(3, SIGTRAP, "int3", int3) 589DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
597#endif 590#endif
598DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) 591DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
599DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) 592DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
600DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) 593DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
601DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 594DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
602DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 595DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
603DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 596DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
604DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 597DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
605DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 598DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
606DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) 599DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
607 600
608void __kprobes do_general_protection(struct pt_regs *regs, long error_code) 601void __kprobes
602do_general_protection(struct pt_regs *regs, long error_code)
609{ 603{
604 struct task_struct *tsk;
610 struct thread_struct *thread; 605 struct thread_struct *thread;
611 struct tss_struct *tss; 606 struct tss_struct *tss;
612 int cpu; 607 int cpu;
@@ -647,23 +642,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
647 if (regs->flags & X86_VM_MASK) 642 if (regs->flags & X86_VM_MASK)
648 goto gp_in_vm86; 643 goto gp_in_vm86;
649 644
645 tsk = current;
650 if (!user_mode(regs)) 646 if (!user_mode(regs))
651 goto gp_in_kernel; 647 goto gp_in_kernel;
652 648
653 current->thread.error_code = error_code; 649 tsk->thread.error_code = error_code;
654 current->thread.trap_no = 13; 650 tsk->thread.trap_no = 13;
655 651
656 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && 652 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
657 printk_ratelimit()) { 653 printk_ratelimit()) {
658 printk(KERN_INFO 654 printk(KERN_INFO
659 "%s[%d] general protection ip:%lx sp:%lx error:%lx", 655 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
660 current->comm, task_pid_nr(current), 656 tsk->comm, task_pid_nr(tsk),
661 regs->ip, regs->sp, error_code); 657 regs->ip, regs->sp, error_code);
662 print_vma_addr(" in ", regs->ip); 658 print_vma_addr(" in ", regs->ip);
663 printk("\n"); 659 printk("\n");
664 } 660 }
665 661
666 force_sig(SIGSEGV, current); 662 force_sig(SIGSEGV, tsk);
667 return; 663 return;
668 664
669gp_in_vm86: 665gp_in_vm86:
@@ -672,14 +668,15 @@ gp_in_vm86:
672 return; 668 return;
673 669
674gp_in_kernel: 670gp_in_kernel:
675 if (!fixup_exception(regs)) { 671 if (fixup_exception(regs))
676 current->thread.error_code = error_code; 672 return;
677 current->thread.trap_no = 13; 673
678 if (notify_die(DIE_GPF, "general protection fault", regs, 674 tsk->thread.error_code = error_code;
675 tsk->thread.trap_no = 13;
676 if (notify_die(DIE_GPF, "general protection fault", regs,
679 error_code, 13, SIGSEGV) == NOTIFY_STOP) 677 error_code, 13, SIGSEGV) == NOTIFY_STOP)
680 return; 678 return;
681 die("general protection fault", regs, error_code); 679 die("general protection fault", regs, error_code);
682 }
683} 680}
684 681
685static notrace __kprobes void 682static notrace __kprobes void
@@ -792,14 +789,17 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
792static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 789static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
793{ 790{
794 unsigned char reason = 0; 791 unsigned char reason = 0;
792 int cpu;
793
794 cpu = smp_processor_id();
795 795
796 /* Only the BSP gets external NMIs from the system: */ 796 /* Only the BSP gets external NMIs from the system. */
797 if (!smp_processor_id()) 797 if (!cpu)
798 reason = get_nmi_reason(); 798 reason = get_nmi_reason();
799 799
800 if (!(reason & 0xc0)) { 800 if (!(reason & 0xc0)) {
801 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 801 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
802 == NOTIFY_STOP) 802 == NOTIFY_STOP)
803 return; 803 return;
804#ifdef CONFIG_X86_LOCAL_APIC 804#ifdef CONFIG_X86_LOCAL_APIC
805 /* 805 /*
@@ -808,7 +808,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
808 */ 808 */
809 if (nmi_watchdog_tick(regs, reason)) 809 if (nmi_watchdog_tick(regs, reason))
810 return; 810 return;
811 if (!do_nmi_callback(regs, smp_processor_id())) 811 if (!do_nmi_callback(regs, cpu))
812 unknown_nmi_error(reason, regs); 812 unknown_nmi_error(reason, regs);
813#else 813#else
814 unknown_nmi_error(reason, regs); 814 unknown_nmi_error(reason, regs);
@@ -818,6 +818,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
818 } 818 }
819 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 819 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
820 return; 820 return;
821
822 /* AK: following checks seem to be broken on modern chipsets. FIXME */
821 if (reason & 0x80) 823 if (reason & 0x80)
822 mem_parity_error(reason, regs); 824 mem_parity_error(reason, regs);
823 if (reason & 0x40) 825 if (reason & 0x40)
@@ -829,8 +831,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
829 reassert_nmi(); 831 reassert_nmi();
830} 832}
831 833
832static int ignore_nmis;
833
834notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) 834notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
835{ 835{
836 int cpu; 836 int cpu;
@@ -915,7 +915,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
915 tsk->thread.debugctlmsr = 0; 915 tsk->thread.debugctlmsr = 0;
916 916
917 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 917 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
918 SIGTRAP) == NOTIFY_STOP) 918 SIGTRAP) == NOTIFY_STOP)
919 return; 919 return;
920 /* It's safe to allow irq's after DR6 has been saved */ 920 /* It's safe to allow irq's after DR6 has been saved */
921 if (regs->flags & X86_EFLAGS_IF) 921 if (regs->flags & X86_EFLAGS_IF)
@@ -976,9 +976,8 @@ clear_TF_reenable:
976void math_error(void __user *ip) 976void math_error(void __user *ip)
977{ 977{
978 struct task_struct *task; 978 struct task_struct *task;
979 unsigned short cwd;
980 unsigned short swd;
981 siginfo_t info; 979 siginfo_t info;
980 unsigned short cwd, swd;
982 981
983 /* 982 /*
984 * Save the info for the exception handler and clear the error. 983 * Save the info for the exception handler and clear the error.
@@ -997,7 +996,7 @@ void math_error(void __user *ip)
997 * C1 reg you need in case of a stack fault, 0x040 is the stack 996 * C1 reg you need in case of a stack fault, 0x040 is the stack
998 * fault bit. We should only be taking one exception at a time, 997 * fault bit. We should only be taking one exception at a time,
999 * so if this combination doesn't produce any single exception, 998 * so if this combination doesn't produce any single exception,
1000 * then we have a bad program that isn't syncronizing its FPU usage 999 * then we have a bad program that isn't synchronizing its FPU usage
1001 * and it will suffer the consequences since we won't be able to 1000 * and it will suffer the consequences since we won't be able to
1002 * fully reproduce the context of the exception 1001 * fully reproduce the context of the exception
1003 */ 1002 */
@@ -1006,7 +1005,7 @@ void math_error(void __user *ip)
1006 switch (swd & ~cwd & 0x3f) { 1005 switch (swd & ~cwd & 0x3f) {
1007 case 0x000: /* No unmasked exception */ 1006 case 0x000: /* No unmasked exception */
1008 return; 1007 return;
1009 default: /* Multiple exceptions */ 1008 default: /* Multiple exceptions */
1010 break; 1009 break;
1011 case 0x001: /* Invalid Op */ 1010 case 0x001: /* Invalid Op */
1012 /* 1011 /*
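The swd & ~cwd & 0x3f test used by math_error() keeps only the x87 exceptions the program has unmasked in the control word; a small decoding sketch with hypothetical register values (illustration only, not kernel code):

#include <stdio.h>

/* x87 exception bits shared by the status word (occurred) and the control
 * word (masked): IE=0x01, DE=0x02, ZE=0x04, OE=0x08, UE=0x10, PE=0x20. */
int main(void)
{
	unsigned short cwd = 0x037e;	/* default control word with IE unmasked */
	unsigned short swd = 0x0241;	/* IE set plus C1/SF bits, as after a stack fault */
	unsigned short unmasked = swd & ~cwd & 0x3f;

	switch (unmasked) {
	case 0x001:
		printf("unmasked invalid op (swd & 0x240 = %#x, 0x240 means stack overflow)\n",
		       swd & 0x240);
		break;
	case 0x004:
		printf("unmasked zero divide\n");
		break;
	default:
		printf("none or multiple unmasked exceptions: %#x\n", unmasked);
		break;
	}
	return 0;
}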
@@ -1042,8 +1041,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code)
1042static void simd_math_error(void __user *ip) 1041static void simd_math_error(void __user *ip)
1043{ 1042{
1044 struct task_struct *task; 1043 struct task_struct *task;
1045 unsigned short mxcsr;
1046 siginfo_t info; 1044 siginfo_t info;
1045 unsigned short mxcsr;
1047 1046
1048 /* 1047 /*
1049 * Save the info for the exception handler and clear the error. 1048 * Save the info for the exception handler and clear the error.
@@ -1198,16 +1197,16 @@ void __init trap_init(void)
1198 early_iounmap(p, 4); 1197 early_iounmap(p, 4);
1199#endif 1198#endif
1200 1199
1201 set_trap_gate(0, &divide_error); 1200 set_trap_gate(0, &divide_error);
1202 set_intr_gate(1, &debug); 1201 set_intr_gate(1, &debug);
1203 set_intr_gate(2, &nmi); 1202 set_intr_gate(2, &nmi);
1204 set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ 1203 set_system_intr_gate(3, &int3); /* int3 can be called from all */
1205 set_system_gate(4, &overflow); 1204 set_system_gate(4, &overflow); /* int4 can be called from all */
1206 set_trap_gate(5, &bounds); 1205 set_trap_gate(5, &bounds);
1207 set_trap_gate(6, &invalid_op); 1206 set_trap_gate(6, &invalid_op);
1208 set_trap_gate(7, &device_not_available); 1207 set_trap_gate(7, &device_not_available);
1209 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); 1208 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1210 set_trap_gate(9, &coprocessor_segment_overrun); 1209 set_trap_gate(9, &coprocessor_segment_overrun);
1211 set_trap_gate(10, &invalid_TSS); 1210 set_trap_gate(10, &invalid_TSS);
1212 set_trap_gate(11, &segment_not_present); 1211 set_trap_gate(11, &segment_not_present);
1213 set_trap_gate(12, &stack_segment); 1212 set_trap_gate(12, &stack_segment);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 80ba6d37bfe0..2696a6837782 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -10,49 +10,49 @@
10 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'. 11 * state in 'entry.S'.
12 */ 12 */
13#include <linux/sched.h> 13#include <linux/moduleparam.h>
14#include <linux/interrupt.h>
15#include <linux/kallsyms.h>
16#include <linux/spinlock.h>
17#include <linux/kprobes.h>
18#include <linux/uaccess.h>
19#include <linux/utsname.h>
20#include <linux/kdebug.h>
14#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/ptrace.h>
15#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/unwind.h>
26#include <linux/delay.h>
16#include <linux/errno.h> 27#include <linux/errno.h>
17#include <linux/ptrace.h> 28#include <linux/kexec.h>
29#include <linux/sched.h>
18#include <linux/timer.h> 30#include <linux/timer.h>
19#include <linux/mm.h>
20#include <linux/init.h> 31#include <linux/init.h>
21#include <linux/delay.h>
22#include <linux/spinlock.h>
23#include <linux/interrupt.h>
24#include <linux/kallsyms.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/nmi.h>
28#include <linux/kprobes.h>
29#include <linux/kexec.h>
30#include <linux/unwind.h>
31#include <linux/uaccess.h>
32#include <linux/bug.h> 32#include <linux/bug.h>
33#include <linux/kdebug.h> 33#include <linux/nmi.h>
34#include <linux/utsname.h> 34#include <linux/mm.h>
35
36#include <mach_traps.h>
37 35
38#if defined(CONFIG_EDAC) 36#if defined(CONFIG_EDAC)
39#include <linux/edac.h> 37#include <linux/edac.h>
40#endif 38#endif
41 39
42#include <asm/system.h> 40#include <asm/stacktrace.h>
43#include <asm/io.h> 41#include <asm/processor.h>
44#include <asm/atomic.h>
45#include <asm/debugreg.h> 42#include <asm/debugreg.h>
43#include <asm/atomic.h>
44#include <asm/system.h>
45#include <asm/unwind.h>
46#include <asm/desc.h> 46#include <asm/desc.h>
47#include <asm/i387.h> 47#include <asm/i387.h>
48#include <asm/processor.h> 48#include <asm/nmi.h>
49#include <asm/unwind.h>
50#include <asm/smp.h> 49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
52#include <asm/pda.h>
53#include <asm/proto.h> 52#include <asm/proto.h>
54#include <asm/nmi.h> 53#include <asm/pda.h>
55#include <asm/stacktrace.h> 54
55#include <mach_traps.h>
56 56
57asmlinkage void divide_error(void); 57asmlinkage void divide_error(void);
58asmlinkage void debug(void); 58asmlinkage void debug(void);
@@ -72,12 +72,14 @@ asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void); 72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void); 73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void); 74asmlinkage void alignment_check(void);
75asmlinkage void machine_check(void);
76asmlinkage void spurious_interrupt_bug(void); 75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77 77
78int panic_on_unrecovered_nmi; 78int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12;
79static unsigned int code_bytes = 64; 80static unsigned int code_bytes = 64;
80static unsigned ignore_nmis; 81static int ignore_nmis;
82static int die_counter;
81 83
82static inline void conditional_sti(struct pt_regs *regs) 84static inline void conditional_sti(struct pt_regs *regs)
83{ 85{
@@ -101,34 +103,9 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
101 dec_preempt_count(); 103 dec_preempt_count();
102} 104}
103 105
104int kstack_depth_to_print = 12;
105
106void printk_address(unsigned long address, int reliable) 106void printk_address(unsigned long address, int reliable)
107{ 107{
108#ifdef CONFIG_KALLSYMS 108 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
109 unsigned long offset = 0, symsize;
110 const char *symname;
111 char *modname;
112 char *delim = ":";
113 char namebuf[KSYM_NAME_LEN];
114 char reliab[4] = "";
115
116 symname = kallsyms_lookup(address, &symsize, &offset,
117 &modname, namebuf);
118 if (!symname) {
119 printk(" [<%016lx>]\n", address);
120 return;
121 }
122 if (!reliable)
123 strcpy(reliab, "? ");
124
125 if (!modname)
126 modname = delim = "";
127 printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
128 address, reliab, delim, modname, delim, symname, offset, symsize);
129#else
130 printk(" [<%016lx>]\n", address);
131#endif
132} 109}
133 110
134static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 111static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -205,8 +182,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
205 return NULL; 182 return NULL;
206} 183}
207 184
208#define MSG(txt) ops->warning(data, txt)
209
210/* 185/*
211 * x86-64 can have up to three kernel stacks: 186 * x86-64 can have up to three kernel stacks:
212 * process stack 187 * process stack
@@ -233,11 +208,11 @@ struct stack_frame {
233 unsigned long return_address; 208 unsigned long return_address;
234}; 209};
235 210
236 211static inline unsigned long
237static inline unsigned long print_context_stack(struct thread_info *tinfo, 212print_context_stack(struct thread_info *tinfo,
238 unsigned long *stack, unsigned long bp, 213 unsigned long *stack, unsigned long bp,
239 const struct stacktrace_ops *ops, void *data, 214 const struct stacktrace_ops *ops, void *data,
240 unsigned long *end) 215 unsigned long *end)
241{ 216{
242 struct stack_frame *frame = (struct stack_frame *)bp; 217 struct stack_frame *frame = (struct stack_frame *)bp;
243 218
@@ -259,7 +234,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
259 return bp; 234 return bp;
260} 235}
261 236
262void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 237void dump_trace(struct task_struct *task, struct pt_regs *regs,
263 unsigned long *stack, unsigned long bp, 238 unsigned long *stack, unsigned long bp,
264 const struct stacktrace_ops *ops, void *data) 239 const struct stacktrace_ops *ops, void *data)
265{ 240{
@@ -268,36 +243,34 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
268 unsigned used = 0; 243 unsigned used = 0;
269 struct thread_info *tinfo; 244 struct thread_info *tinfo;
270 245
271 if (!tsk) 246 if (!task)
272 tsk = current; 247 task = current;
273 tinfo = task_thread_info(tsk);
274 248
275 if (!stack) { 249 if (!stack) {
276 unsigned long dummy; 250 unsigned long dummy;
277 stack = &dummy; 251 stack = &dummy;
278 if (tsk && tsk != current) 252 if (task && task != current)
279 stack = (unsigned long *)tsk->thread.sp; 253 stack = (unsigned long *)task->thread.sp;
280 } 254 }
281 255
282#ifdef CONFIG_FRAME_POINTER 256#ifdef CONFIG_FRAME_POINTER
283 if (!bp) { 257 if (!bp) {
284 if (tsk == current) { 258 if (task == current) {
285 /* Grab bp right from our regs */ 259 /* Grab bp right from our regs */
286 asm("movq %%rbp, %0" : "=r" (bp):); 260 asm("movq %%rbp, %0" : "=r" (bp) :);
287 } else { 261 } else {
288 /* bp is the last reg pushed by switch_to */ 262 /* bp is the last reg pushed by switch_to */
289 bp = *(unsigned long *) tsk->thread.sp; 263 bp = *(unsigned long *) task->thread.sp;
290 } 264 }
291 } 265 }
292#endif 266#endif
293 267
294
295
296 /* 268 /*
297 * Print function call entries in all stacks, starting at the 269 * Print function call entries in all stacks, starting at the
298 * current stack address. If the stacks consist of nested 270 * current stack address. If the stacks consist of nested
299 * exceptions 271 * exceptions
300 */ 272 */
273 tinfo = task_thread_info(task);
301 for (;;) { 274 for (;;) {
302 char *id; 275 char *id;
303 unsigned long *estack_end; 276 unsigned long *estack_end;
@@ -382,18 +355,17 @@ static const struct stacktrace_ops print_trace_ops = {
382 .address = print_trace_address, 355 .address = print_trace_address,
383}; 356};
384 357
385void 358void show_trace(struct task_struct *task, struct pt_regs *regs,
386show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, 359 unsigned long *stack, unsigned long bp)
387 unsigned long bp)
388{ 360{
389 printk("\nCall Trace:\n"); 361 printk("\nCall Trace:\n");
390 dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); 362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
391 printk("\n"); 363 printk("\n");
392} 364}
393 365
394static void 366static void
395_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, 367_show_stack(struct task_struct *task, struct pt_regs *regs,
396 unsigned long bp) 368 unsigned long *sp, unsigned long bp)
397{ 369{
398 unsigned long *stack; 370 unsigned long *stack;
399 int i; 371 int i;
@@ -405,14 +377,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
405 // back trace for this cpu. 377 // back trace for this cpu.
406 378
407 if (sp == NULL) { 379 if (sp == NULL) {
408 if (tsk) 380 if (task)
409 sp = (unsigned long *)tsk->thread.sp; 381 sp = (unsigned long *)task->thread.sp;
410 else 382 else
411 sp = (unsigned long *)&sp; 383 sp = (unsigned long *)&sp;
412 } 384 }
413 385
414 stack = sp; 386 stack = sp;
415 for(i=0; i < kstack_depth_to_print; i++) { 387 for (i = 0; i < kstack_depth_to_print; i++) {
416 if (stack >= irqstack && stack <= irqstack_end) { 388 if (stack >= irqstack && stack <= irqstack_end) {
417 if (stack == irqstack_end) { 389 if (stack == irqstack_end) {
418 stack = (unsigned long *) (irqstack_end[-1]); 390 stack = (unsigned long *) (irqstack_end[-1]);
@@ -427,12 +399,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
427 printk(" %016lx", *stack++); 399 printk(" %016lx", *stack++);
428 touch_nmi_watchdog(); 400 touch_nmi_watchdog();
429 } 401 }
430 show_trace(tsk, regs, sp, bp); 402 show_trace(task, regs, sp, bp);
431} 403}
432 404
433void show_stack(struct task_struct *tsk, unsigned long * sp) 405void show_stack(struct task_struct *task, unsigned long *sp)
434{ 406{
435 _show_stack(tsk, NULL, sp, 0); 407 _show_stack(task, NULL, sp, 0);
436} 408}
437 409
438/* 410/*
@@ -440,8 +412,8 @@ void show_stack(struct task_struct *tsk, unsigned long * sp)
440 */ 412 */
441void dump_stack(void) 413void dump_stack(void)
442{ 414{
443 unsigned long dummy;
444 unsigned long bp = 0; 415 unsigned long bp = 0;
416 unsigned long stack;
445 417
446#ifdef CONFIG_FRAME_POINTER 418#ifdef CONFIG_FRAME_POINTER
447 if (!bp) 419 if (!bp)
@@ -453,7 +425,7 @@ void dump_stack(void)
453 init_utsname()->release, 425 init_utsname()->release,
454 (int)strcspn(init_utsname()->version, " "), 426 (int)strcspn(init_utsname()->version, " "),
455 init_utsname()->version); 427 init_utsname()->version);
456 show_trace(NULL, NULL, &dummy, bp); 428 show_trace(NULL, NULL, &stack, bp);
457} 429}
458 430
459EXPORT_SYMBOL(dump_stack); 431EXPORT_SYMBOL(dump_stack);
@@ -464,12 +436,8 @@ void show_registers(struct pt_regs *regs)
464 unsigned long sp; 436 unsigned long sp;
465 const int cpu = smp_processor_id(); 437 const int cpu = smp_processor_id();
466 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 438 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
467 u8 *ip;
468 unsigned int code_prologue = code_bytes * 43 / 64;
469 unsigned int code_len = code_bytes;
470 439
471 sp = regs->sp; 440 sp = regs->sp;
472 ip = (u8 *) regs->ip - code_prologue;
473 printk("CPU %d ", cpu); 441 printk("CPU %d ", cpu);
474 __show_regs(regs); 442 __show_regs(regs);
475 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 443 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -480,15 +448,21 @@ void show_registers(struct pt_regs *regs)
480 * time of the fault.. 448 * time of the fault..
481 */ 449 */
482 if (!user_mode(regs)) { 450 if (!user_mode(regs)) {
451 unsigned int code_prologue = code_bytes * 43 / 64;
452 unsigned int code_len = code_bytes;
483 unsigned char c; 453 unsigned char c;
454 u8 *ip;
455
484 printk("Stack: "); 456 printk("Stack: ");
485 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); 457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
486 printk("\n"); 458 printk("\n");
487 459
488 printk(KERN_EMERG "Code: "); 460 printk(KERN_EMERG "Code: ");
461
462 ip = (u8 *)regs->ip - code_prologue;
489 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 463 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
490 /* try starting at RIP */ 464 /* try starting at RIP */
491 ip = (u8 *) regs->ip; 465 ip = (u8 *)regs->ip;
492 code_len = code_len - code_prologue + 1; 466 code_len = code_len - code_prologue + 1;
493 } 467 }
494 for (i = 0; i < code_len; i++, ip++) { 468 for (i = 0; i < code_len; i++, ip++) {
@@ -504,7 +478,7 @@ void show_registers(struct pt_regs *regs)
504 } 478 }
505 } 479 }
506 printk("\n"); 480 printk("\n");
507} 481}
508 482
509int is_valid_bugaddr(unsigned long ip) 483int is_valid_bugaddr(unsigned long ip)
510{ 484{
@@ -562,10 +536,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
562 do_exit(signr); 536 do_exit(signr);
563} 537}
564 538
565int __kprobes __die(const char * str, struct pt_regs * regs, long err) 539int __kprobes __die(const char *str, struct pt_regs *regs, long err)
566{ 540{
567 static int die_counter; 541 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
568 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
569#ifdef CONFIG_PREEMPT 542#ifdef CONFIG_PREEMPT
570 printk("PREEMPT "); 543 printk("PREEMPT ");
571#endif 544#endif
@@ -576,8 +549,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
576 printk("DEBUG_PAGEALLOC"); 549 printk("DEBUG_PAGEALLOC");
577#endif 550#endif
578 printk("\n"); 551 printk("\n");
579 if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) 552 if (notify_die(DIE_OOPS, str, regs, err,
553 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
580 return 1; 554 return 1;
555
581 show_registers(regs); 556 show_registers(regs);
582 add_taint(TAINT_DIE); 557 add_taint(TAINT_DIE);
583 /* Executive summary in case the oops scrolled away */ 558 /* Executive summary in case the oops scrolled away */
@@ -589,7 +564,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
589 return 0; 564 return 0;
590} 565}
591 566
592void die(const char * str, struct pt_regs * regs, long err) 567void die(const char *str, struct pt_regs *regs, long err)
593{ 568{
594 unsigned long flags = oops_begin(); 569 unsigned long flags = oops_begin();
595 570
@@ -606,8 +581,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
606{ 581{
607 unsigned long flags; 582 unsigned long flags;
608 583
609 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == 584 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
610 NOTIFY_STOP)
611 return; 585 return;
612 586
613 flags = oops_begin(); 587 flags = oops_begin();
@@ -629,44 +603,44 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
629 do_exit(SIGBUS); 603 do_exit(SIGBUS);
630} 604}
631 605
632static void __kprobes do_trap(int trapnr, int signr, char *str, 606static void __kprobes
633 struct pt_regs * regs, long error_code, 607do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
634 siginfo_t *info) 608 long error_code, siginfo_t *info)
635{ 609{
636 struct task_struct *tsk = current; 610 struct task_struct *tsk = current;
637 611
638 if (user_mode(regs)) { 612 if (!user_mode(regs))
639 /* 613 goto kernel_trap;
640 * We want error_code and trap_no set for userspace
641 * faults and kernelspace faults which result in
642 * die(), but not kernelspace faults which are fixed
643 * up. die() gives the process no chance to handle
644 * the signal and notice the kernel fault information,
645 * so that won't result in polluting the information
646 * about previously queued, but not yet delivered,
647 * faults. See also do_general_protection below.
648 */
649 tsk->thread.error_code = error_code;
650 tsk->thread.trap_no = trapnr;
651
652 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
653 printk_ratelimit()) {
654 printk(KERN_INFO
655 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
656 tsk->comm, tsk->pid, str,
657 regs->ip, regs->sp, error_code);
658 print_vma_addr(" in ", regs->ip);
659 printk("\n");
660 }
661 614
662 if (info) 615 /*
663 force_sig_info(signr, info, tsk); 616 * We want error_code and trap_no set for userspace faults and
664 else 617 * kernelspace faults which result in die(), but not
665 force_sig(signr, tsk); 618 * kernelspace faults which are fixed up. die() gives the
666 return; 619 * process no chance to handle the signal and notice the
620 * kernel fault information, so that won't result in polluting
621 * the information about previously queued, but not yet
622 * delivered, faults. See also do_general_protection below.
623 */
624 tsk->thread.error_code = error_code;
625 tsk->thread.trap_no = trapnr;
626
627 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
628 printk_ratelimit()) {
629 printk(KERN_INFO
630 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
631 tsk->comm, tsk->pid, str,
632 regs->ip, regs->sp, error_code);
633 print_vma_addr(" in ", regs->ip);
634 printk("\n");
667 } 635 }
668 636
637 if (info)
638 force_sig_info(signr, info, tsk);
639 else
640 force_sig(signr, tsk);
641 return;
669 642
643kernel_trap:
670 if (!fixup_exception(regs)) { 644 if (!fixup_exception(regs)) {
671 tsk->thread.error_code = error_code; 645 tsk->thread.error_code = error_code;
672 tsk->thread.trap_no = trapnr; 646 tsk->thread.trap_no = trapnr;
@@ -676,38 +650,38 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
676} 650}
677 651
678#define DO_ERROR(trapnr, signr, str, name) \ 652#define DO_ERROR(trapnr, signr, str, name) \
679asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 653asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
680{ \ 654{ \
681 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 655 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
682 == NOTIFY_STOP) \ 656 == NOTIFY_STOP) \
683 return; \ 657 return; \
684 conditional_sti(regs); \ 658 conditional_sti(regs); \
685 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 659 do_trap(trapnr, signr, str, regs, error_code, NULL); \
686} 660}
687 661
688#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 662#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
689asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 663asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
690{ \ 664{ \
691 siginfo_t info; \ 665 siginfo_t info; \
692 info.si_signo = signr; \ 666 info.si_signo = signr; \
693 info.si_errno = 0; \ 667 info.si_errno = 0; \
694 info.si_code = sicode; \ 668 info.si_code = sicode; \
695 info.si_addr = (void __user *)siaddr; \ 669 info.si_addr = (void __user *)siaddr; \
696 trace_hardirqs_fixup(); \ 670 trace_hardirqs_fixup(); \
697 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 671 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
698 == NOTIFY_STOP) \ 672 == NOTIFY_STOP) \
699 return; \ 673 return; \
700 conditional_sti(regs); \ 674 conditional_sti(regs); \
701 do_trap(trapnr, signr, str, regs, error_code, &info); \ 675 do_trap(trapnr, signr, str, regs, error_code, &info); \
702} 676}
703 677
704DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 678DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
705DO_ERROR( 4, SIGSEGV, "overflow", overflow) 679DO_ERROR(4, SIGSEGV, "overflow", overflow)
706DO_ERROR( 5, SIGSEGV, "bounds", bounds) 680DO_ERROR(5, SIGSEGV, "bounds", bounds)
707DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) 681DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
708DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 682DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
709DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 683DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
710DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 684DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
711DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) 685DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
712 686
713/* Runs on IST stack */ 687/* Runs on IST stack */
@@ -738,31 +712,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
738 die(str, regs, error_code); 712 die(str, regs, error_code);
739} 713}
740 714
741asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, 715asmlinkage void __kprobes
742 long error_code) 716do_general_protection(struct pt_regs *regs, long error_code)
743{ 717{
744 struct task_struct *tsk = current; 718 struct task_struct *tsk;
745 719
746 conditional_sti(regs); 720 conditional_sti(regs);
747 721
748 if (user_mode(regs)) { 722 tsk = current;
749 tsk->thread.error_code = error_code; 723 if (!user_mode(regs))
750 tsk->thread.trap_no = 13; 724 goto gp_in_kernel;
751
752 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
753 printk_ratelimit()) {
754 printk(KERN_INFO
755 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
756 tsk->comm, tsk->pid,
757 regs->ip, regs->sp, error_code);
758 print_vma_addr(" in ", regs->ip);
759 printk("\n");
760 }
761 725
762 force_sig(SIGSEGV, tsk); 726 tsk->thread.error_code = error_code;
763 return; 727 tsk->thread.trap_no = 13;
764 }
765 728
729 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
730 printk_ratelimit()) {
731 printk(KERN_INFO
732 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
733 tsk->comm, tsk->pid,
734 regs->ip, regs->sp, error_code);
735 print_vma_addr(" in ", regs->ip);
736 printk("\n");
737 }
738
739 force_sig(SIGSEGV, tsk);
740 return;
741
742gp_in_kernel:
766 if (fixup_exception(regs)) 743 if (fixup_exception(regs))
767 return; 744 return;
768 745
@@ -775,14 +752,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
775} 752}
776 753
777static notrace __kprobes void 754static notrace __kprobes void
778mem_parity_error(unsigned char reason, struct pt_regs * regs) 755mem_parity_error(unsigned char reason, struct pt_regs *regs)
779{ 756{
780 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", 757 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
781 reason); 758 reason);
782 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); 759 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
783 760
784#if defined(CONFIG_EDAC) 761#if defined(CONFIG_EDAC)
785 if(edac_handler_set()) { 762 if (edac_handler_set()) {
786 edac_atomic_assert_error(); 763 edac_atomic_assert_error();
787 return; 764 return;
788 } 765 }
@@ -799,7 +776,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
799} 776}
800 777
801static notrace __kprobes void 778static notrace __kprobes void
802io_check_error(unsigned char reason, struct pt_regs * regs) 779io_check_error(unsigned char reason, struct pt_regs *regs)
803{ 780{
804 printk("NMI: IOCK error (debug interrupt?)\n"); 781 printk("NMI: IOCK error (debug interrupt?)\n");
805 show_registers(regs); 782 show_registers(regs);
@@ -829,14 +806,14 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
829 806
830/* Runs on IST stack. This code must keep interrupts off all the time. 807/* Runs on IST stack. This code must keep interrupts off all the time.
831 Nested NMIs are prevented by the CPU. */ 808 Nested NMIs are prevented by the CPU. */
832asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) 809asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
833{ 810{
834 unsigned char reason = 0; 811 unsigned char reason = 0;
835 int cpu; 812 int cpu;
836 813
837 cpu = smp_processor_id(); 814 cpu = smp_processor_id();
838 815
839 /* Only the BSP gets external NMIs from the system. */ 816 /* Only the BSP gets external NMIs from the system. */
840 if (!cpu) 817 if (!cpu)
841 reason = get_nmi_reason(); 818 reason = get_nmi_reason();
842 819
@@ -848,18 +825,17 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
848 * Ok, so this is none of the documented NMI sources, 825 * Ok, so this is none of the documented NMI sources,
849 * so it must be the NMI watchdog. 826 * so it must be the NMI watchdog.
850 */ 827 */
851 if (nmi_watchdog_tick(regs,reason)) 828 if (nmi_watchdog_tick(regs, reason))
852 return; 829 return;
853 if (!do_nmi_callback(regs,cpu)) 830 if (!do_nmi_callback(regs, cpu))
854 unknown_nmi_error(reason, regs); 831 unknown_nmi_error(reason, regs);
855 832
856 return; 833 return;
857 } 834 }
858 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 835 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
859 return; 836 return;
860 837
861 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 838 /* AK: following checks seem to be broken on modern chipsets. FIXME */
862
863 if (reason & 0x80) 839 if (reason & 0x80)
864 mem_parity_error(reason, regs); 840 mem_parity_error(reason, regs);
865 if (reason & 0x40) 841 if (reason & 0x40)
@@ -870,9 +846,12 @@ asmlinkage notrace __kprobes void
870do_nmi(struct pt_regs *regs, long error_code) 846do_nmi(struct pt_regs *regs, long error_code)
871{ 847{
872 nmi_enter(); 848 nmi_enter();
849
873 add_pda(__nmi_count, 1); 850 add_pda(__nmi_count, 1);
851
874 if (!ignore_nmis) 852 if (!ignore_nmis)
875 default_do_nmi(regs); 853 default_do_nmi(regs);
854
876 nmi_exit(); 855 nmi_exit();
877} 856}
878 857
@@ -889,13 +868,14 @@ void restart_nmi(void)
889} 868}
890 869
891/* runs on IST stack. */ 870/* runs on IST stack. */
892asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) 871asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
893{ 872{
894 trace_hardirqs_fixup(); 873 trace_hardirqs_fixup();
895 874
896 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { 875 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
876 == NOTIFY_STOP)
897 return; 877 return;
898 } 878
899 preempt_conditional_sti(regs); 879 preempt_conditional_sti(regs);
900 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); 880 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
901 preempt_conditional_cli(regs); 881 preempt_conditional_cli(regs);
@@ -926,8 +906,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
926asmlinkage void __kprobes do_debug(struct pt_regs * regs, 906asmlinkage void __kprobes do_debug(struct pt_regs * regs,
927 unsigned long error_code) 907 unsigned long error_code)
928{ 908{
929 unsigned long condition;
930 struct task_struct *tsk = current; 909 struct task_struct *tsk = current;
910 unsigned long condition;
931 siginfo_t info; 911 siginfo_t info;
932 912
933 trace_hardirqs_fixup(); 913 trace_hardirqs_fixup();
@@ -948,21 +928,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
948 928
949 /* Mask out spurious debug traps due to lazy DR7 setting */ 929 /* Mask out spurious debug traps due to lazy DR7 setting */
950 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 930 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
951 if (!tsk->thread.debugreg7) { 931 if (!tsk->thread.debugreg7)
952 goto clear_dr7; 932 goto clear_dr7;
953 }
954 } 933 }
955 934
956 tsk->thread.debugreg6 = condition; 935 tsk->thread.debugreg6 = condition;
957 936
958
959 /* 937 /*
960 * Single-stepping through TF: make sure we ignore any events in 938 * Single-stepping through TF: make sure we ignore any events in
961 * kernel space (but re-enable TF when returning to user mode). 939 * kernel space (but re-enable TF when returning to user mode).
962 */ 940 */
963 if (condition & DR_STEP) { 941 if (condition & DR_STEP) {
964 if (!user_mode(regs)) 942 if (!user_mode(regs))
965 goto clear_TF_reenable; 943 goto clear_TF_reenable;
966 } 944 }
967 945
968 /* Ok, finally something we can handle */ 946 /* Ok, finally something we can handle */
@@ -975,7 +953,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
975 force_sig_info(SIGTRAP, &info, tsk); 953 force_sig_info(SIGTRAP, &info, tsk);
976 954
977clear_dr7: 955clear_dr7:
978 set_debugreg(0UL, 7); 956 set_debugreg(0, 7);
979 preempt_conditional_cli(regs); 957 preempt_conditional_cli(regs);
980 return; 958 return;
981 959
@@ -983,6 +961,7 @@ clear_TF_reenable:
983 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 961 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
984 regs->flags &= ~X86_EFLAGS_TF; 962 regs->flags &= ~X86_EFLAGS_TF;
985 preempt_conditional_cli(regs); 963 preempt_conditional_cli(regs);
964 return;
986} 965}
987 966
988static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) 967static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
@@ -1005,7 +984,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
1005asmlinkage void do_coprocessor_error(struct pt_regs *regs) 984asmlinkage void do_coprocessor_error(struct pt_regs *regs)
1006{ 985{
1007 void __user *ip = (void __user *)(regs->ip); 986 void __user *ip = (void __user *)(regs->ip);
1008 struct task_struct * task; 987 struct task_struct *task;
1009 siginfo_t info; 988 siginfo_t info;
1010 unsigned short cwd, swd; 989 unsigned short cwd, swd;
1011 990
@@ -1038,30 +1017,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
1038 cwd = get_fpu_cwd(task); 1017 cwd = get_fpu_cwd(task);
1039 swd = get_fpu_swd(task); 1018 swd = get_fpu_swd(task);
1040 switch (swd & ~cwd & 0x3f) { 1019 switch (swd & ~cwd & 0x3f) {
1041 case 0x000: 1020 case 0x000: /* No unmasked exception */
1042 default: 1021 default: /* Multiple exceptions */
1043 break; 1022 break;
1044 case 0x001: /* Invalid Op */ 1023 case 0x001: /* Invalid Op */
1045 /* 1024 /*
1046 * swd & 0x240 == 0x040: Stack Underflow 1025 * swd & 0x240 == 0x040: Stack Underflow
1047 * swd & 0x240 == 0x240: Stack Overflow 1026 * swd & 0x240 == 0x240: Stack Overflow
1048 * User must clear the SF bit (0x40) if set 1027 * User must clear the SF bit (0x40) if set
1049 */ 1028 */
1050 info.si_code = FPE_FLTINV; 1029 info.si_code = FPE_FLTINV;
1051 break; 1030 break;
1052 case 0x002: /* Denormalize */ 1031 case 0x002: /* Denormalize */
1053 case 0x010: /* Underflow */ 1032 case 0x010: /* Underflow */
1054 info.si_code = FPE_FLTUND; 1033 info.si_code = FPE_FLTUND;
1055 break; 1034 break;
1056 case 0x004: /* Zero Divide */ 1035 case 0x004: /* Zero Divide */
1057 info.si_code = FPE_FLTDIV; 1036 info.si_code = FPE_FLTDIV;
1058 break; 1037 break;
1059 case 0x008: /* Overflow */ 1038 case 0x008: /* Overflow */
1060 info.si_code = FPE_FLTOVF; 1039 info.si_code = FPE_FLTOVF;
1061 break; 1040 break;
1062 case 0x020: /* Precision */ 1041 case 0x020: /* Precision */
1063 info.si_code = FPE_FLTRES; 1042 info.si_code = FPE_FLTRES;
1064 break; 1043 break;
1065 } 1044 }
1066 force_sig_info(SIGFPE, &info, task); 1045 force_sig_info(SIGFPE, &info, task);
1067} 1046}
@@ -1074,7 +1053,7 @@ asmlinkage void bad_intr(void)
1074asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1053asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1075{ 1054{
1076 void __user *ip = (void __user *)(regs->ip); 1055 void __user *ip = (void __user *)(regs->ip);
1077 struct task_struct * task; 1056 struct task_struct *task;
1078 siginfo_t info; 1057 siginfo_t info;
1079 unsigned short mxcsr; 1058 unsigned short mxcsr;
1080 1059
@@ -1102,25 +1081,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1102 */ 1081 */
1103 mxcsr = get_fpu_mxcsr(task); 1082 mxcsr = get_fpu_mxcsr(task);
1104 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { 1083 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1105 case 0x000: 1084 case 0x000:
1106 default: 1085 default:
1107 break; 1086 break;
1108 case 0x001: /* Invalid Op */ 1087 case 0x001: /* Invalid Op */
1109 info.si_code = FPE_FLTINV; 1088 info.si_code = FPE_FLTINV;
1110 break; 1089 break;
1111 case 0x002: /* Denormalize */ 1090 case 0x002: /* Denormalize */
1112 case 0x010: /* Underflow */ 1091 case 0x010: /* Underflow */
1113 info.si_code = FPE_FLTUND; 1092 info.si_code = FPE_FLTUND;
1114 break; 1093 break;
1115 case 0x004: /* Zero Divide */ 1094 case 0x004: /* Zero Divide */
1116 info.si_code = FPE_FLTDIV; 1095 info.si_code = FPE_FLTDIV;
1117 break; 1096 break;
1118 case 0x008: /* Overflow */ 1097 case 0x008: /* Overflow */
1119 info.si_code = FPE_FLTOVF; 1098 info.si_code = FPE_FLTOVF;
1120 break; 1099 break;
1121 case 0x020: /* Precision */ 1100 case 0x020: /* Precision */
1122 info.si_code = FPE_FLTRES; 1101 info.si_code = FPE_FLTRES;
1123 break; 1102 break;
1124 } 1103 }
1125 force_sig_info(SIGFPE, &info, task); 1104 force_sig_info(SIGFPE, &info, task);
1126} 1105}
@@ -1138,7 +1117,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1138} 1117}
1139 1118
1140/* 1119/*
1141 * 'math_state_restore()' saves the current math information in the 1120 * 'math_state_restore()' saves the current math information in the
1142 * old math state array, and gets the new ones from the current task 1121 * old math state array, and gets the new ones from the current task
1143 * 1122 *
1144 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 1123 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1163,7 +1142,7 @@ asmlinkage void math_state_restore(void)
1163 local_irq_disable(); 1142 local_irq_disable();
1164 } 1143 }
1165 1144
1166 clts(); /* Allow maths ops (or we recurse) */ 1145 clts(); /* Allow maths ops (or we recurse) */
1167 restore_fpu_checking(&me->thread.xstate->fxsave); 1146 restore_fpu_checking(&me->thread.xstate->fxsave);
1168 task_thread_info(me)->status |= TS_USEDFPU; 1147 task_thread_info(me)->status |= TS_USEDFPU;
1169 me->fpu_counter++; 1148 me->fpu_counter++;
@@ -1172,64 +1151,61 @@ EXPORT_SYMBOL_GPL(math_state_restore);
1172 1151
1173void __init trap_init(void) 1152void __init trap_init(void)
1174{ 1153{
1175 set_intr_gate(0,&divide_error); 1154 set_intr_gate(0, &divide_error);
1176 set_intr_gate_ist(1,&debug,DEBUG_STACK); 1155 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1177 set_intr_gate_ist(2,&nmi,NMI_STACK); 1156 set_intr_gate_ist(2, &nmi, NMI_STACK);
1178 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ 1157 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
1179 set_system_gate(4,&overflow); /* int4 can be called from all */ 1158 set_system_gate(4, &overflow); /* int4 can be called from all */
1180 set_intr_gate(5,&bounds); 1159 set_intr_gate(5, &bounds);
1181 set_intr_gate(6,&invalid_op); 1160 set_intr_gate(6, &invalid_op);
1182 set_intr_gate(7,&device_not_available); 1161 set_intr_gate(7, &device_not_available);
1183 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); 1162 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1184 set_intr_gate(9,&coprocessor_segment_overrun); 1163 set_intr_gate(9, &coprocessor_segment_overrun);
1185 set_intr_gate(10,&invalid_TSS); 1164 set_intr_gate(10, &invalid_TSS);
1186 set_intr_gate(11,&segment_not_present); 1165 set_intr_gate(11, &segment_not_present);
1187 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); 1166 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1188 set_intr_gate(13,&general_protection); 1167 set_intr_gate(13, &general_protection);
1189 set_intr_gate(14,&page_fault); 1168 set_intr_gate(14, &page_fault);
1190 set_intr_gate(15,&spurious_interrupt_bug); 1169 set_intr_gate(15, &spurious_interrupt_bug);
1191 set_intr_gate(16,&coprocessor_error); 1170 set_intr_gate(16, &coprocessor_error);
1192 set_intr_gate(17,&alignment_check); 1171 set_intr_gate(17, &alignment_check);
1193#ifdef CONFIG_X86_MCE 1172#ifdef CONFIG_X86_MCE
1194 set_intr_gate_ist(18,&machine_check, MCE_STACK); 1173 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1195#endif 1174#endif
1196 set_intr_gate(19,&simd_coprocessor_error); 1175 set_intr_gate(19, &simd_coprocessor_error);
1197 1176
1198#ifdef CONFIG_IA32_EMULATION 1177#ifdef CONFIG_IA32_EMULATION
1199 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 1178 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1200#endif 1179#endif
1201
1202 /* 1180 /*
1203 * initialize the per thread extended state: 1181 * initialize the per thread extended state:
1204 */ 1182 */
1205 init_thread_xstate(); 1183 init_thread_xstate();
1206 /* 1184 /*
1207 * Should be a barrier for any external CPU state. 1185 * Should be a barrier for any external CPU state:
1208 */ 1186 */
1209 cpu_init(); 1187 cpu_init();
1210} 1188}
1211 1189
1212
1213static int __init oops_setup(char *s) 1190static int __init oops_setup(char *s)
1214{ 1191{
1215 if (!s) 1192 if (!s)
1216 return -EINVAL; 1193 return -EINVAL;
1217 if (!strcmp(s, "panic")) 1194 if (!strcmp(s, "panic"))
1218 panic_on_oops = 1; 1195 panic_on_oops = 1;
1219 return 0; 1196 return 0;
1220} 1197}
1221early_param("oops", oops_setup); 1198early_param("oops", oops_setup);
1222 1199
1223static int __init kstack_setup(char *s) 1200static int __init kstack_setup(char *s)
1224{ 1201{
1225 if (!s) 1202 if (!s)
1226 return -EINVAL; 1203 return -EINVAL;
1227 kstack_depth_to_print = simple_strtoul(s,NULL,0); 1204 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1228 return 0; 1205 return 0;
1229} 1206}
1230early_param("kstack", kstack_setup); 1207early_param("kstack", kstack_setup);
1231 1208
1232
1233static int __init code_bytes_setup(char *s) 1209static int __init code_bytes_setup(char *s)
1234{ 1210{
1235 code_bytes = simple_strtoul(s, NULL, 0); 1211 code_bytes = simple_strtoul(s, NULL, 0);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
new file mode 100644
index 000000000000..7603c0553909
--- /dev/null
+++ b/arch/x86/kernel/tsc.c
@@ -0,0 +1,535 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/timer.h>
6#include <linux/acpi_pmtmr.h>
7#include <linux/cpufreq.h>
8#include <linux/dmi.h>
9#include <linux/delay.h>
10#include <linux/clocksource.h>
11#include <linux/percpu.h>
12
13#include <asm/hpet.h>
14#include <asm/timer.h>
15#include <asm/vgtod.h>
16#include <asm/time.h>
17#include <asm/delay.h>
18
19unsigned int cpu_khz; /* TSC clocks / usec, not used here */
20EXPORT_SYMBOL(cpu_khz);
21unsigned int tsc_khz;
22EXPORT_SYMBOL(tsc_khz);
23
24/*
25 * TSC can be unstable due to cpufreq or due to unsynced TSCs
26 */
27static int tsc_unstable;
28
29/* native_sched_clock() is called before tsc_init(), so
30 we must start with the TSC soft disabled to prevent
31 erroneous rdtsc usage on !cpu_has_tsc processors */
32static int tsc_disabled = -1;
33
34/*
35 * Scheduler clock - returns current time in nanosec units.
36 */
37u64 native_sched_clock(void)
38{
39 u64 this_offset;
40
41 /*
42 * Fall back to jiffies if there's no TSC available:
43 * ( But note that we still use it if the TSC is marked
44 * unstable. We do this because unlike Time Of Day,
45 * the scheduler clock tolerates small errors and it's
46 * very important for it to be as fast as the platform
 47 * can achieve it. )

48 */
49 if (unlikely(tsc_disabled)) {
50 /* No locking but a rare wrong value is not a big deal: */
51 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
52 }
53
54 /* read the Time Stamp Counter: */
55 rdtscll(this_offset);
56
57 /* return the value in ns */
58 return cycles_2_ns(this_offset);
59}
60
61/* We need to define a real function for sched_clock, to override the
62 weak default version */
63#ifdef CONFIG_PARAVIRT
64unsigned long long sched_clock(void)
65{
66 return paravirt_sched_clock();
67}
68#else
69unsigned long long
70sched_clock(void) __attribute__((alias("native_sched_clock")));
71#endif
72
73int check_tsc_unstable(void)
74{
75 return tsc_unstable;
76}
77EXPORT_SYMBOL_GPL(check_tsc_unstable);
78
79#ifdef CONFIG_X86_TSC
80int __init notsc_setup(char *str)
81{
82 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
83 "cannot disable TSC completely.\n");
84 tsc_disabled = 1;
85 return 1;
86}
87#else
88/*
89 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
90 * in cpu/common.c
91 */
92int __init notsc_setup(char *str)
93{
94 setup_clear_cpu_cap(X86_FEATURE_TSC);
95 return 1;
96}
97#endif
98
99__setup("notsc", notsc_setup);
100
101#define MAX_RETRIES 5
102#define SMI_TRESHOLD 50000
103
104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */
107static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
108{
109 u64 t1, t2;
110 int i;
111
112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles();
114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else
117 *pm = acpi_pm_read_early();
118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2;
121 }
122 return ULLONG_MAX;
123}
124
125/**
126 * native_calibrate_tsc - calibrate the tsc on boot
127 */
128unsigned long native_calibrate_tsc(void)
129{
130 unsigned long flags;
131 u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
132 int hpet = is_hpet_enabled();
133 unsigned int tsc_khz_val = 0;
134
135 local_irq_save(flags);
136
137 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
138
139 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
140
141 outb(0xb0, 0x43);
142 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
143 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
144 tr1 = get_cycles();
145 while ((inb(0x61) & 0x20) == 0);
146 tr2 = get_cycles();
147
148 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
149
150 local_irq_restore(flags);
151
152 /*
153 * Preset the result with the raw and inaccurate PIT
154 * calibration value
155 */
156 delta = (tr2 - tr1);
157 do_div(delta, 50);
158 tsc_khz_val = delta;
159
160 /* hpet or pmtimer available ? */
161 if (!hpet && !pm1 && !pm2) {
162 printk(KERN_INFO "TSC calibrated against PIT\n");
163 goto out;
164 }
165
 166 /* Check whether the sampling was disturbed by an SMI */
167 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) {
168 printk(KERN_WARNING "TSC calibration disturbed by SMI, "
169 "using PIT calibration result\n");
170 goto out;
171 }
172
173 tsc2 = (tsc2 - tsc1) * 1000000LL;
174
175 if (hpet) {
176 printk(KERN_INFO "TSC calibrated against HPET\n");
177 if (hpet2 < hpet1)
178 hpet2 += 0x100000000ULL;
179 hpet2 -= hpet1;
180 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
181 do_div(tsc1, 1000000);
182 } else {
183 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
184 if (pm2 < pm1)
185 pm2 += (u64)ACPI_PM_OVRRUN;
186 pm2 -= pm1;
187 tsc1 = pm2 * 1000000000LL;
188 do_div(tsc1, PMTMR_TICKS_PER_SEC);
189 }
190
191 do_div(tsc2, tsc1);
192 tsc_khz_val = tsc2;
193
194out:
195 return tsc_khz_val;
196}
197
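As a rough worked example of the calibration arithmetic in native_calibrate_tsc() (hypothetical numbers for a ~2 GHz part, not output from this code): the PIT gate is programmed for 50 ms, so the coarse estimate is simply the TSC delta divided by 50, and the PM-timer/HPET path then rescales the TSC delta by the measured reference interval:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Coarse PIT estimate: cycles counted across the 50 ms gate / 50. */
	uint64_t tr1 = 0, tr2 = 100100000;
	uint64_t pit_khz = (tr2 - tr1) / 50;		/* ~2002000 kHz */

	/* Refinement against the ACPI PM timer (3.579545 MHz), same window. */
	uint64_t tsc_delta  = 100000000;		/* cycles between tsc_read_refs() calls */
	uint64_t pm_ticks   = 178977;			/* ~50 ms of PM-timer ticks */
	uint64_t elapsed_ns = pm_ticks * 1000000000ULL / 3579545;
	uint64_t ref_khz    = tsc_delta * 1000000ULL / elapsed_ns;

	printf("PIT estimate: %llu kHz, reference estimate: %llu kHz\n",
	       (unsigned long long)pit_khz, (unsigned long long)ref_khz);
	return 0;
}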
198
199#ifdef CONFIG_X86_32
200/* Only called from the Powernow K7 cpu freq driver */
201int recalibrate_cpu_khz(void)
202{
203#ifndef CONFIG_SMP
204 unsigned long cpu_khz_old = cpu_khz;
205
206 if (cpu_has_tsc) {
207 tsc_khz = calibrate_tsc();
208 cpu_khz = tsc_khz;
209 cpu_data(0).loops_per_jiffy =
210 cpufreq_scale(cpu_data(0).loops_per_jiffy,
211 cpu_khz_old, cpu_khz);
212 return 0;
213 } else
214 return -ENODEV;
215#else
216 return -ENODEV;
217#endif
218}
219
220EXPORT_SYMBOL(recalibrate_cpu_khz);
221
222#endif /* CONFIG_X86_32 */
223
224/* Accelerators for sched_clock()
225 * convert from cycles(64bits) => nanoseconds (64bits)
226 * basic equation:
227 * ns = cycles / (freq / ns_per_sec)
228 * ns = cycles * (ns_per_sec / freq)
229 * ns = cycles * (10^9 / (cpu_khz * 10^3))
230 * ns = cycles * (10^6 / cpu_khz)
231 *
232 * Then we use scaling math (suggested by george@mvista.com) to get:
233 * ns = cycles * (10^6 * SC / cpu_khz) / SC
234 * ns = cycles * cyc2ns_scale / SC
235 *
236 * And since SC is a constant power of two, we can convert the div
237 * into a shift.
238 *
239 * We can use khz divisor instead of mhz to keep a better precision, since
240 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
241 * (mathieu.desnoyers@polymtl.ca)
242 *
243 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
244 */
245
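A minimal stand-alone sketch of the fixed-point conversion described above, assuming CYC2NS_SCALE_FACTOR is 10 (SC = 2^10) as in the x86 timer headers of this period; the CPU speed and cycle count are illustrative only:

#include <stdio.h>
#include <stdint.h>

#define CYC2NS_SCALE_FACTOR 10			/* assumption: SC = 2^10 */

int main(void)
{
	unsigned long cpu_khz = 2000000;	/* hypothetical 2 GHz CPU */
	/* scale = 10^6 * SC / cpu_khz, as computed in set_cyc2ns_scale() below */
	unsigned long scale = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;	/* 512 */

	uint64_t cycles = 4000000000ULL;	/* two seconds of cycles at 2 GHz */
	uint64_t ns = (cycles * scale) >> CYC2NS_SCALE_FACTOR;

	printf("scale=%lu: %llu cycles -> %llu ns\n",
	       scale, (unsigned long long)cycles, (unsigned long long)ns);
	return 0;
}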
246DEFINE_PER_CPU(unsigned long, cyc2ns);
247
248static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
249{
250 unsigned long long tsc_now, ns_now;
251 unsigned long flags, *scale;
252
253 local_irq_save(flags);
254 sched_clock_idle_sleep_event();
255
256 scale = &per_cpu(cyc2ns, cpu);
257
258 rdtscll(tsc_now);
259 ns_now = __cycles_2_ns(tsc_now);
260
261 if (cpu_khz)
262 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
263
264 sched_clock_idle_wakeup_event(0);
265 local_irq_restore(flags);
266}
267
268#ifdef CONFIG_CPU_FREQ
269
270/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
271 * changes.
272 *
273 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
274 * not that important because current Opteron setups do not support
275 * scaling on SMP anyroads.
276 *
277 * Should fix up last_tsc too. Currently gettimeofday in the
278 * first tick after the change will be slightly wrong.
279 */
280
281static unsigned int ref_freq;
282static unsigned long loops_per_jiffy_ref;
283static unsigned long tsc_khz_ref;
284
285static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
286 void *data)
287{
288 struct cpufreq_freqs *freq = data;
289 unsigned long *lpj, dummy;
290
291 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
292 return 0;
293
294 lpj = &dummy;
295 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
296#ifdef CONFIG_SMP
297 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
298#else
299 lpj = &boot_cpu_data.loops_per_jiffy;
300#endif
301
302 if (!ref_freq) {
303 ref_freq = freq->old;
304 loops_per_jiffy_ref = *lpj;
305 tsc_khz_ref = tsc_khz;
306 }
307 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
308 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
309 (val == CPUFREQ_RESUMECHANGE)) {
310 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
311
312 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
313 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
314 mark_tsc_unstable("cpufreq changes");
315 }
316
317 set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
318
319 return 0;
320}
321
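The notifier above relies on cpufreq_scale() to rescale both loops_per_jiffy and tsc_khz in proportion to the frequency change; a rough stand-alone sketch of that proportion (hypothetical frequencies, plain integer math instead of the kernel helper):

#include <stdio.h>

/* Sketch of the old * new_freq / ref_freq rescaling done via cpufreq_scale(). */
static unsigned long rescale(unsigned long old, unsigned int ref_freq,
			     unsigned int new_freq)
{
	return (unsigned long)((unsigned long long)old * new_freq / ref_freq);
}

int main(void)
{
	unsigned int ref_freq = 2000000;	/* kHz at boot */
	unsigned int new_freq = 1000000;	/* kHz after a P-state change */
	unsigned long tsc_khz_ref = 2000000;
	unsigned long lpj_ref = 9973760;	/* hypothetical loops_per_jiffy */

	printf("tsc_khz: %lu -> %lu\n", tsc_khz_ref, rescale(tsc_khz_ref, ref_freq, new_freq));
	printf("lpj:     %lu -> %lu\n", lpj_ref, rescale(lpj_ref, ref_freq, new_freq));
	return 0;
}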
322static struct notifier_block time_cpufreq_notifier_block = {
323 .notifier_call = time_cpufreq_notifier
324};
325
326static int __init cpufreq_tsc(void)
327{
328 cpufreq_register_notifier(&time_cpufreq_notifier_block,
329 CPUFREQ_TRANSITION_NOTIFIER);
330 return 0;
331}
332
333core_initcall(cpufreq_tsc);
334
335#endif /* CONFIG_CPU_FREQ */
336
337/* clocksource code */
338
339static struct clocksource clocksource_tsc;
340
341/*
342 * We compare the TSC to the cycle_last value in the clocksource
343 * structure to avoid a nasty time-warp. This can be observed in a
344 * very small window right after one CPU updated cycle_last under
345 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
346 * is smaller than the cycle_last reference value due to a TSC which
 347 * is slightly behind. This delta is nowhere else observable, but in
348 * that case it results in a forward time jump in the range of hours
349 * due to the unsigned delta calculation of the time keeping core
350 * code, which is necessary to support wrapping clocksources like pm
351 * timer.
352 */
353static cycle_t read_tsc(void)
354{
355 cycle_t ret = (cycle_t)get_cycles();
356
357 return ret >= clocksource_tsc.cycle_last ?
358 ret : clocksource_tsc.cycle_last;
359}
360
361#ifdef CONFIG_X86_64
362static cycle_t __vsyscall_fn vread_tsc(void)
363{
364 cycle_t ret = (cycle_t)vget_cycles();
365
366 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
367 ret : __vsyscall_gtod_data.clock.cycle_last;
368}
369#endif
370
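To see why the clamping in read_tsc()/vread_tsc() matters, consider the unsigned delta the timekeeping core computes: if this CPU's TSC reads a few cycles behind the freshly stored cycle_last, the subtraction wraps to a huge positive value. A tiny sketch of the failure mode and the clamp, with made-up counter values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cycle_last = 1000000;	/* just written by another CPU */
	uint64_t now = 999990;		/* this CPU's TSC is 10 cycles behind */

	uint64_t raw_delta = now - cycle_last;	/* wraps to an enormous value */
	uint64_t clamped   = (now >= cycle_last ? now : cycle_last) - cycle_last;

	printf("raw delta:     %llu cycles (spurious forward jump)\n",
	       (unsigned long long)raw_delta);
	printf("clamped delta: %llu cycles\n", (unsigned long long)clamped);
	return 0;
}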
371static struct clocksource clocksource_tsc = {
372 .name = "tsc",
373 .rating = 300,
374 .read = read_tsc,
375 .mask = CLOCKSOURCE_MASK(64),
376 .shift = 22,
377 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
378 CLOCK_SOURCE_MUST_VERIFY,
379#ifdef CONFIG_X86_64
380 .vread = vread_tsc,
381#endif
382};
383
384void mark_tsc_unstable(char *reason)
385{
386 if (!tsc_unstable) {
387 tsc_unstable = 1;
388 printk("Marking TSC unstable due to %s\n", reason);
389 /* Change only the rating, when not registered */
390 if (clocksource_tsc.mult)
391 clocksource_change_rating(&clocksource_tsc, 0);
392 else
393 clocksource_tsc.rating = 0;
394 }
395}
396
397EXPORT_SYMBOL_GPL(mark_tsc_unstable);
398
399static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
400{
401 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
402 d->ident);
403 tsc_unstable = 1;
404 return 0;
405}
406
407/* List of systems that have known TSC problems */
408static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
409 {
410 .callback = dmi_mark_tsc_unstable,
411 .ident = "IBM Thinkpad 380XD",
412 .matches = {
413 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
414 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
415 },
416 },
417 {}
418};
419
420/*
 421 * Geode_LX - the OLPC CPU has a possibly very reliable TSC
422 */
423#ifdef CONFIG_MGEODE_LX
424/* RTSC counts during suspend */
425#define RTSC_SUSP 0x100
426
427static void __init check_geode_tsc_reliable(void)
428{
429 unsigned long res_low, res_high;
430
431 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
432 if (res_low & RTSC_SUSP)
433 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
434}
435#else
436static inline void check_geode_tsc_reliable(void) { }
437#endif
438
439/*
440 * Make an educated guess if the TSC is trustworthy and synchronized
441 * over all CPUs.
442 */
443__cpuinit int unsynchronized_tsc(void)
444{
445 if (!cpu_has_tsc || tsc_unstable)
446 return 1;
447
448#ifdef CONFIG_SMP
449 if (apic_is_clustered_box())
450 return 1;
451#endif
452
453 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
454 return 0;
455 /*
456 * Intel systems are normally all synchronized.
457 * Exceptions must mark TSC as unstable:
458 */
459 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
460 /* assume multi socket systems are not synchronized: */
461 if (num_possible_cpus() > 1)
462 tsc_unstable = 1;
463 }
464
465 return tsc_unstable;
466}
467
468static void __init init_tsc_clocksource(void)
469{
470 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
471 clocksource_tsc.shift);
 472 /* lower the rating if we already know it's unstable: */
473 if (check_tsc_unstable()) {
474 clocksource_tsc.rating = 0;
475 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
476 }
477 clocksource_register(&clocksource_tsc);
478}
479
480void __init tsc_init(void)
481{
482 u64 lpj;
483 int cpu;
484
485 if (!cpu_has_tsc)
486 return;
487
488 tsc_khz = calibrate_tsc();
489 cpu_khz = tsc_khz;
490
491 if (!tsc_khz) {
492 mark_tsc_unstable("could not calculate TSC khz");
493 return;
494 }
495
496#ifdef CONFIG_X86_64
497 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
498 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
499 cpu_khz = calibrate_cpu();
500#endif
501
502 lpj = ((u64)tsc_khz * 1000);
503 do_div(lpj, HZ);
504 lpj_fine = lpj;
505
506 printk("Detected %lu.%03lu MHz processor.\n",
507 (unsigned long)cpu_khz / 1000,
508 (unsigned long)cpu_khz % 1000);
509
510 /*
511 * Secondary CPUs do not run through tsc_init(), so set up
512 * all the scale factors for all CPUs, assuming the same
513 * speed as the bootup CPU. (cpufreq notifiers will fix this
514 * up if their speed diverges)
515 */
516 for_each_possible_cpu(cpu)
517 set_cyc2ns_scale(cpu_khz, cpu);
518
519 if (tsc_disabled > 0)
520 return;
521
522 /* now allow native_sched_clock() to use rdtsc */
523 tsc_disabled = 0;
524
525 use_tsc_delay();
526 /* Check and install the TSC clocksource */
527 dmi_check_system(bad_tsc_dmi_table);
528
529 if (unsynchronized_tsc())
530 mark_tsc_unstable("TSCs unsynchronized");
531
532 check_geode_tsc_reliable();
533 init_tsc_clocksource();
534}
535
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
deleted file mode 100644
index 6240922e497c..000000000000
--- a/arch/x86/kernel/tsc_32.c
+++ /dev/null
@@ -1,455 +0,0 @@
1#include <linux/sched.h>
2#include <linux/clocksource.h>
3#include <linux/workqueue.h>
4#include <linux/delay.h>
5#include <linux/cpufreq.h>
6#include <linux/jiffies.h>
7#include <linux/init.h>
8#include <linux/dmi.h>
9#include <linux/percpu.h>
10
11#include <asm/delay.h>
12#include <asm/tsc.h>
13#include <asm/io.h>
14#include <asm/timer.h>
15
16#include "mach_timer.h"
17
18/* native_sched_clock() is called before tsc_init(), so
19 we must start with the TSC soft disabled to prevent
20 erroneous rdtsc usage on !cpu_has_tsc processors */
21static int tsc_disabled = -1;
22
23/*
24 * On some systems the TSC frequency does not
25 * change with the cpu frequency. So we need
26 * an extra value to store the TSC freq
27 */
28unsigned int tsc_khz;
29EXPORT_SYMBOL_GPL(tsc_khz);
30
31#ifdef CONFIG_X86_TSC
32static int __init tsc_setup(char *str)
33{
34 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
35 "cannot disable TSC completely.\n");
36 tsc_disabled = 1;
37 return 1;
38}
39#else
40/*
41 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
42 * in cpu/common.c
43 */
44static int __init tsc_setup(char *str)
45{
46 setup_clear_cpu_cap(X86_FEATURE_TSC);
47 return 1;
48}
49#endif
50
51__setup("notsc", tsc_setup);
52
53/*
54 * code to mark and check if the TSC is unstable
55 * due to cpufreq or due to unsynced TSCs
56 */
57static int tsc_unstable;
58
59int check_tsc_unstable(void)
60{
61 return tsc_unstable;
62}
63EXPORT_SYMBOL_GPL(check_tsc_unstable);
64
65/* Accelerators for sched_clock()
66 * convert from cycles(64bits) => nanoseconds (64bits)
67 * basic equation:
68 * ns = cycles / (freq / ns_per_sec)
69 * ns = cycles * (ns_per_sec / freq)
70 * ns = cycles * (10^9 / (cpu_khz * 10^3))
71 * ns = cycles * (10^6 / cpu_khz)
72 *
73 * Then we use scaling math (suggested by george@mvista.com) to get:
74 * ns = cycles * (10^6 * SC / cpu_khz) / SC
75 * ns = cycles * cyc2ns_scale / SC
76 *
77 * And since SC is a constant power of two, we can convert the div
78 * into a shift.
79 *
80 * We can use khz divisor instead of mhz to keep a better precision, since
81 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
82 * (mathieu.desnoyers@polymtl.ca)
83 *
84 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
85 */
86
87DEFINE_PER_CPU(unsigned long, cyc2ns);
88
89static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
90{
91 unsigned long long tsc_now, ns_now;
92 unsigned long flags, *scale;
93
94 local_irq_save(flags);
95 sched_clock_idle_sleep_event();
96
97 scale = &per_cpu(cyc2ns, cpu);
98
99 rdtscll(tsc_now);
100 ns_now = __cycles_2_ns(tsc_now);
101
102 if (cpu_khz)
103 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
104
105 /*
106 * Start smoothly with the new frequency:
107 */
108 sched_clock_idle_wakeup_event(0);
109 local_irq_restore(flags);
110}
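
For reference, here is a minimal user-space sketch of the cycles-to-nanoseconds scaling described in the comment block above; CYC2NS_SCALE_FACTOR = 10 and the sample frequency are assumptions for illustration, not values taken from this diff:

#include <stdio.h>
#include <stdint.h>

#define EX_CYC2NS_SCALE_FACTOR 10	/* assumed; the kernel value lives in asm/timer.h */

int main(void)
{
	unsigned long cpu_khz = 2400000;		/* e.g. a 2.4 GHz CPU */
	unsigned long scale = (1000000UL << EX_CYC2NS_SCALE_FACTOR) / cpu_khz;
	uint64_t cycles = 4800000000ULL;		/* roughly 2 seconds of TSC ticks */
	uint64_t ns = (cycles * scale) >> EX_CYC2NS_SCALE_FACTOR;

	printf("scale=%lu, %llu cycles ~ %llu ns\n", scale,
	       (unsigned long long)cycles, (unsigned long long)ns);
	return 0;
}

The shift by EX_CYC2NS_SCALE_FACTOR replaces the division, exactly as the "convert the div into a shift" note above explains.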
111
112/*
113 * Scheduler clock - returns current time in nanosec units.
114 */
115unsigned long long native_sched_clock(void)
116{
117 unsigned long long this_offset;
118
119 /*
120 * Fall back to jiffies if there's no TSC available:
121 * ( But note that we still use it if the TSC is marked
122 * unstable. We do this because unlike Time Of Day,
123 * the scheduler clock tolerates small errors and it's
124 * very important for it to be as fast as the platform
125	 * can achieve it. )
126 */
127 if (unlikely(tsc_disabled))
128 /* No locking but a rare wrong value is not a big deal: */
129 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
130
131 /* read the Time Stamp Counter: */
132 rdtscll(this_offset);
133
134 /* return the value in ns */
135 return cycles_2_ns(this_offset);
136}
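
A quick sketch of the jiffies fallback used when the TSC is disabled; HZ = 250 is assumed here purely for the example (HZ is a kernel config option, not fixed by this diff):

#include <stdio.h>
#include <stdint.h>

#define EX_HZ 250				/* assumed; depends on kernel config */

int main(void)
{
	uint64_t jiffies_since_boot = 12500;	/* 50 seconds of ticks at HZ=250 */
	uint64_t ns = jiffies_since_boot * (1000000000ULL / EX_HZ);

	printf("fallback sched_clock ~ %llu ns\n", (unsigned long long)ns);
	return 0;
}

Its resolution is a whole tick (4 ms here), far coarser than the TSC path, which the scheduler clock tolerates.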
137
138/* We need to define a real function for sched_clock, to override the
139 weak default version */
140#ifdef CONFIG_PARAVIRT
141unsigned long long sched_clock(void)
142{
143 return paravirt_sched_clock();
144}
145#else
146unsigned long long sched_clock(void)
147 __attribute__((alias("native_sched_clock")));
148#endif
149
150unsigned long native_calculate_cpu_khz(void)
151{
152 unsigned long long start, end;
153 unsigned long count;
154 u64 delta64 = (u64)ULLONG_MAX;
155 int i;
156 unsigned long flags;
157
158 local_irq_save(flags);
159
160 /* run 3 times to ensure the cache is warm and to get an accurate reading */
161 for (i = 0; i < 3; i++) {
162 mach_prepare_counter();
163 rdtscll(start);
164 mach_countup(&count);
165 rdtscll(end);
166
167 /*
168 * Error: ECTCNEVERSET
169 * The CTC wasn't reliable: we got a hit on the very first read,
170 * or the CPU was so fast/slow that the quotient wouldn't fit in
171 * 32 bits..
172 */
173 if (count <= 1)
174 continue;
175
176 /* cpu freq too slow: */
177 if ((end - start) <= CALIBRATE_TIME_MSEC)
178 continue;
179
180 /*
181 * We want the minimum time of all runs in case one of them
182 * is inaccurate due to SMI or other delay
183 */
184 delta64 = min(delta64, (end - start));
185 }
186
187 /* cpu freq too fast (or every run was bad): */
188 if (delta64 > (1ULL<<32))
189 goto err;
190
191 delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
192 do_div(delta64,CALIBRATE_TIME_MSEC);
193
194 local_irq_restore(flags);
195 return (unsigned long)delta64;
196err:
197 local_irq_restore(flags);
198 return 0;
199}
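
A back-of-the-envelope illustration of the PIT-based calibration above: the measured TSC delta over the calibration window, divided by the window length in milliseconds, yields kHz. The window length comes from CALIBRATE_TIME_MSEC in mach_timer.h; 30 ms is assumed here only for the example:

#include <stdio.h>
#include <stdint.h>

#define EX_CALIBRATE_TIME_MSEC 30	/* assumed example value */

int main(void)
{
	uint64_t tsc_delta = 72000000;	/* TSC ticks counted during the window */
	uint64_t khz = (tsc_delta + EX_CALIBRATE_TIME_MSEC / 2) / EX_CALIBRATE_TIME_MSEC;

	printf("cpu_khz = %llu (%.3f MHz)\n",
	       (unsigned long long)khz, khz / 1000.0);
	return 0;
}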
200
201int recalibrate_cpu_khz(void)
202{
203#ifndef CONFIG_SMP
204 unsigned long cpu_khz_old = cpu_khz;
205
206 if (cpu_has_tsc) {
207 cpu_khz = calculate_cpu_khz();
208 tsc_khz = cpu_khz;
209 cpu_data(0).loops_per_jiffy =
210 cpufreq_scale(cpu_data(0).loops_per_jiffy,
211 cpu_khz_old, cpu_khz);
212 return 0;
213 } else
214 return -ENODEV;
215#else
216 return -ENODEV;
217#endif
218}
219
220EXPORT_SYMBOL(recalibrate_cpu_khz);
221
222#ifdef CONFIG_CPU_FREQ
223
224/*
225 * if the CPU frequency is scaled, TSC-based delays will need a different
226 * loops_per_jiffy value to function properly.
227 */
228static unsigned int ref_freq;
229static unsigned long loops_per_jiffy_ref;
230static unsigned long cpu_khz_ref;
231
232static int
233time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
234{
235 struct cpufreq_freqs *freq = data;
236
237 if (!ref_freq) {
238 if (!freq->old){
239 ref_freq = freq->new;
240 return 0;
241 }
242 ref_freq = freq->old;
243 loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy;
244 cpu_khz_ref = cpu_khz;
245 }
246
247 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
248 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
249 (val == CPUFREQ_RESUMECHANGE)) {
250 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
251 cpu_data(freq->cpu).loops_per_jiffy =
252 cpufreq_scale(loops_per_jiffy_ref,
253 ref_freq, freq->new);
254
255 if (cpu_khz) {
256
257 if (num_online_cpus() == 1)
258 cpu_khz = cpufreq_scale(cpu_khz_ref,
259 ref_freq, freq->new);
260 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
261 tsc_khz = cpu_khz;
262 set_cyc2ns_scale(cpu_khz, freq->cpu);
263 /*
264 * TSC based sched_clock turns
265 * to junk w/ cpufreq
266 */
267 mark_tsc_unstable("cpufreq changes");
268 }
269 }
270 }
271
272 return 0;
273}
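
The notifier above rescales loops_per_jiffy and cpu_khz with cpufreq_scale(), which is roughly value * new_freq / ref_freq. A hedged stand-alone sketch (the helper below is illustrative, not the kernel's implementation):

#include <stdio.h>
#include <stdint.h>

static unsigned long ex_cpufreq_scale(unsigned long ref_val,
				      unsigned int ref_khz, unsigned int new_khz)
{
	return (unsigned long)(((uint64_t)ref_val * new_khz) / ref_khz);
}

int main(void)
{
	unsigned long lpj_ref = 4000000;	/* loops_per_jiffy measured at 2.0 GHz */

	printf("lpj at 1.0 GHz: %lu\n", ex_cpufreq_scale(lpj_ref, 2000000, 1000000));
	printf("lpj at 2.4 GHz: %lu\n", ex_cpufreq_scale(lpj_ref, 2000000, 2400000));
	return 0;
}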
274
275static struct notifier_block time_cpufreq_notifier_block = {
276 .notifier_call = time_cpufreq_notifier
277};
278
279static int __init cpufreq_tsc(void)
280{
281 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
282 CPUFREQ_TRANSITION_NOTIFIER);
283}
284core_initcall(cpufreq_tsc);
285
286#endif
287
288/* clock source code */
289
290static struct clocksource clocksource_tsc;
291
292/*
293 * We compare the TSC to the cycle_last value in the clocksource
294 * structure to avoid a nasty time-warp issue. This can be observed in
295 * a very small window right after one CPU updated cycle_last under
296 * xtime lock and the other CPU reads a TSC value which is smaller
297	 * than the cycle_last reference value due to a TSC which is slightly
298 * behind. This delta is nowhere else observable, but in that case it
299 * results in a forward time jump in the range of hours due to the
300 * unsigned delta calculation of the time keeping core code, which is
301 * necessary to support wrapping clocksources like pm timer.
302 */
303static cycle_t read_tsc(void)
304{
305 cycle_t ret;
306
307 rdtscll(ret);
308
309 return ret >= clocksource_tsc.cycle_last ?
310 ret : clocksource_tsc.cycle_last;
311}
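
To see why the clamp against cycle_last matters, note what an unsigned subtraction does when this CPU's TSC reads a few cycles behind the last recorded value; a standalone sketch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cycle_last = 1000000;
	uint64_t tsc = 999990;			/* this CPU reads 10 cycles behind */
	uint64_t raw_delta = tsc - cycle_last;	/* wraps to an enormous value */
	uint64_t clamped = tsc >= cycle_last ? tsc : cycle_last;

	printf("raw delta     = %llu cycles\n", (unsigned long long)raw_delta);
	printf("clamped delta = %llu cycles\n",
	       (unsigned long long)(clamped - cycle_last));
	return 0;
}

The wrapped delta, once converted to nanoseconds by the timekeeping core, is what produces the forward jump the comment above warns about.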
312
313static struct clocksource clocksource_tsc = {
314 .name = "tsc",
315 .rating = 300,
316 .read = read_tsc,
317 .mask = CLOCKSOURCE_MASK(64),
318 .mult = 0, /* to be set */
319 .shift = 22,
320 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
321 CLOCK_SOURCE_MUST_VERIFY,
322};
323
324void mark_tsc_unstable(char *reason)
325{
326 if (!tsc_unstable) {
327 tsc_unstable = 1;
328 printk("Marking TSC unstable due to: %s.\n", reason);
329 /* Can be called before registration */
330 if (clocksource_tsc.mult)
331 clocksource_change_rating(&clocksource_tsc, 0);
332 else
333 clocksource_tsc.rating = 0;
334 }
335}
336EXPORT_SYMBOL_GPL(mark_tsc_unstable);
337
338static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
339{
340 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
341 d->ident);
342 tsc_unstable = 1;
343 return 0;
344}
345
346/* List of systems that have known TSC problems */
347static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
348 {
349 .callback = dmi_mark_tsc_unstable,
350 .ident = "IBM Thinkpad 380XD",
351 .matches = {
352 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
353 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
354 },
355 },
356 {}
357};
358
359/*
360 * Make an educated guess if the TSC is trustworthy and synchronized
361 * over all CPUs.
362 */
363__cpuinit int unsynchronized_tsc(void)
364{
365 if (!cpu_has_tsc || tsc_unstable)
366 return 1;
367
368 /* Anything with constant TSC should be synchronized */
369 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
370 return 0;
371
372 /*
373 * Intel systems are normally all synchronized.
374 * Exceptions must mark TSC as unstable:
375 */
376 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
377 /* assume multi socket systems are not synchronized: */
378 if (num_possible_cpus() > 1)
379 tsc_unstable = 1;
380 }
381 return tsc_unstable;
382}
383
384/*
385	 * Geode_LX - the OLPC CPU possibly has a very reliable TSC
386 */
387#ifdef CONFIG_MGEODE_LX
388/* RTSC counts during suspend */
389#define RTSC_SUSP 0x100
390
391static void __init check_geode_tsc_reliable(void)
392{
393 unsigned long res_low, res_high;
394
395 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
396 if (res_low & RTSC_SUSP)
397 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
398}
399#else
400static inline void check_geode_tsc_reliable(void) { }
401#endif
402
403
404void __init tsc_init(void)
405{
406 int cpu;
407 u64 lpj;
408
409 if (!cpu_has_tsc || tsc_disabled > 0)
410 return;
411
412 cpu_khz = calculate_cpu_khz();
413 tsc_khz = cpu_khz;
414
415 if (!cpu_khz) {
416 mark_tsc_unstable("could not calculate TSC khz");
417 return;
418 }
419
420 lpj = ((u64)tsc_khz * 1000);
421 do_div(lpj, HZ);
422 lpj_fine = lpj;
423
424 /* now allow native_sched_clock() to use rdtsc */
425 tsc_disabled = 0;
426
427 printk("Detected %lu.%03lu MHz processor.\n",
428 (unsigned long)cpu_khz / 1000,
429 (unsigned long)cpu_khz % 1000);
430
431 /*
432 * Secondary CPUs do not run through tsc_init(), so set up
433 * all the scale factors for all CPUs, assuming the same
434 * speed as the bootup CPU. (cpufreq notifiers will fix this
435 * up if their speed diverges)
436 */
437 for_each_possible_cpu(cpu)
438 set_cyc2ns_scale(cpu_khz, cpu);
439
440 use_tsc_delay();
441
442 /* Check and install the TSC clocksource */
443 dmi_check_system(bad_tsc_dmi_table);
444
445 unsynchronized_tsc();
446 check_geode_tsc_reliable();
447 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
448 clocksource_tsc.shift);
449	 /* lower the rating if we already know it's unstable: */
450 if (check_tsc_unstable()) {
451 clocksource_tsc.rating = 0;
452 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
453 }
454 clocksource_register(&clocksource_tsc);
455}
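
The mult value computed by clocksource_khz2mult() above pairs with the fixed shift of 22 so that (cycles * mult) >> shift yields nanoseconds. A rough stand-alone equivalent, assuming the usual mult = (10^6 << shift) / khz formula with rounding:

#include <stdio.h>
#include <stdint.h>

static uint32_t ex_khz2mult(uint32_t khz, uint32_t shift)
{
	uint64_t tmp = (uint64_t)1000000 << shift;

	tmp += khz / 2;				/* round to nearest */
	return (uint32_t)(tmp / khz);
}

int main(void)
{
	uint32_t mult = ex_khz2mult(2400000, 22);
	uint64_t cycles = 2400000000ULL;	/* one second at 2.4 GHz */
	uint64_t ns = (cycles * mult) >> 22;

	printf("mult = %u, one second of cycles -> %llu ns\n",
	       mult, (unsigned long long)ns);
	return 0;
}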
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
deleted file mode 100644
index 9898fb01edfd..000000000000
--- a/arch/x86/kernel/tsc_64.c
+++ /dev/null
@@ -1,357 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/interrupt.h>
4#include <linux/init.h>
5#include <linux/clocksource.h>
6#include <linux/time.h>
7#include <linux/acpi.h>
8#include <linux/cpufreq.h>
9#include <linux/acpi_pmtmr.h>
10
11#include <asm/hpet.h>
12#include <asm/timex.h>
13#include <asm/timer.h>
14#include <asm/vgtod.h>
15
16static int notsc __initdata = 0;
17
18unsigned int cpu_khz; /* TSC clocks / usec, not used here */
19EXPORT_SYMBOL(cpu_khz);
20unsigned int tsc_khz;
21EXPORT_SYMBOL(tsc_khz);
22
23/* Accelerators for sched_clock()
24 * convert from cycles(64bits) => nanoseconds (64bits)
25 * basic equation:
26 * ns = cycles / (freq / ns_per_sec)
27 * ns = cycles * (ns_per_sec / freq)
28 * ns = cycles * (10^9 / (cpu_khz * 10^3))
29 * ns = cycles * (10^6 / cpu_khz)
30 *
31 * Then we use scaling math (suggested by george@mvista.com) to get:
32 * ns = cycles * (10^6 * SC / cpu_khz) / SC
33 * ns = cycles * cyc2ns_scale / SC
34 *
35 * And since SC is a constant power of two, we can convert the div
36 * into a shift.
37 *
38 * We can use khz divisor instead of mhz to keep a better precision, since
39 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
40 * (mathieu.desnoyers@polymtl.ca)
41 *
42 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
43 */
44DEFINE_PER_CPU(unsigned long, cyc2ns);
45
46static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
47{
48 unsigned long long tsc_now, ns_now;
49 unsigned long flags, *scale;
50
51 local_irq_save(flags);
52 sched_clock_idle_sleep_event();
53
54 scale = &per_cpu(cyc2ns, cpu);
55
56 rdtscll(tsc_now);
57 ns_now = __cycles_2_ns(tsc_now);
58
59 if (cpu_khz)
60 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
61
62 sched_clock_idle_wakeup_event(0);
63 local_irq_restore(flags);
64}
65
66unsigned long long native_sched_clock(void)
67{
68 unsigned long a = 0;
69
70 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
71	 * which means it is not completely exact and may not be monotonic
72 * between CPUs. But the errors should be too small to matter for
73 * scheduling purposes.
74 */
75
76 rdtscll(a);
77 return cycles_2_ns(a);
78}
79
80/* We need to define a real function for sched_clock, to override the
81 weak default version */
82#ifdef CONFIG_PARAVIRT
83unsigned long long sched_clock(void)
84{
85 return paravirt_sched_clock();
86}
87#else
88unsigned long long
89sched_clock(void) __attribute__((alias("native_sched_clock")));
90#endif
91
92
93static int tsc_unstable;
94
95int check_tsc_unstable(void)
96{
97 return tsc_unstable;
98}
99EXPORT_SYMBOL_GPL(check_tsc_unstable);
100
101#ifdef CONFIG_CPU_FREQ
102
103/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
104 * changes.
105 *
106 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
107 * not that important because current Opteron setups do not support
108	 * scaling on SMP anyway.
109 *
110 * Should fix up last_tsc too. Currently gettimeofday in the
111 * first tick after the change will be slightly wrong.
112 */
113
114static unsigned int ref_freq;
115static unsigned long loops_per_jiffy_ref;
116static unsigned long tsc_khz_ref;
117
118static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
119 void *data)
120{
121 struct cpufreq_freqs *freq = data;
122 unsigned long *lpj, dummy;
123
124 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
125 return 0;
126
127 lpj = &dummy;
128 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
129#ifdef CONFIG_SMP
130 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
131#else
132 lpj = &boot_cpu_data.loops_per_jiffy;
133#endif
134
135 if (!ref_freq) {
136 ref_freq = freq->old;
137 loops_per_jiffy_ref = *lpj;
138 tsc_khz_ref = tsc_khz;
139 }
140 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
141 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
142 (val == CPUFREQ_RESUMECHANGE)) {
143 *lpj =
144 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
145
146 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
147 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
148 mark_tsc_unstable("cpufreq changes");
149 }
150
151 set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
152
153 return 0;
154}
155
156static struct notifier_block time_cpufreq_notifier_block = {
157 .notifier_call = time_cpufreq_notifier
158};
159
160static int __init cpufreq_tsc(void)
161{
162 cpufreq_register_notifier(&time_cpufreq_notifier_block,
163 CPUFREQ_TRANSITION_NOTIFIER);
164 return 0;
165}
166
167core_initcall(cpufreq_tsc);
168
169#endif
170
171#define MAX_RETRIES 5
172#define SMI_TRESHOLD 50000
173
174/*
175 * Read TSC and the reference counters. Take care of SMI disturbance
176 */
177static unsigned long __init tsc_read_refs(unsigned long *pm,
178 unsigned long *hpet)
179{
180 unsigned long t1, t2;
181 int i;
182
183 for (i = 0; i < MAX_RETRIES; i++) {
184 t1 = get_cycles();
185 if (hpet)
186 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
187 else
188 *pm = acpi_pm_read_early();
189 t2 = get_cycles();
190 if ((t2 - t1) < SMI_TRESHOLD)
191 return t2;
192 }
193 return ULONG_MAX;
194}
195
196/**
197 * tsc_calibrate - calibrate the tsc on boot
198 */
199void __init tsc_calibrate(void)
200{
201 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
202 int hpet = is_hpet_enabled(), cpu;
203
204 local_irq_save(flags);
205
206 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
207
208 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
209
210 outb(0xb0, 0x43);
211 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
212 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
213 tr1 = get_cycles();
214 while ((inb(0x61) & 0x20) == 0);
215 tr2 = get_cycles();
216
217 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
218
219 local_irq_restore(flags);
220
221 /*
222 * Preset the result with the raw and inaccurate PIT
223 * calibration value
224 */
225 tsc_khz = (tr2 - tr1) / 50;
226
227 /* hpet or pmtimer available ? */
228 if (!hpet && !pm1 && !pm2) {
229 printk(KERN_INFO "TSC calibrated against PIT\n");
230 goto out;
231 }
232
233 /* Check, whether the sampling was disturbed by an SMI */
234 if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
235 printk(KERN_WARNING "TSC calibration disturbed by SMI, "
236 "using PIT calibration result\n");
237 goto out;
238 }
239
240 tsc2 = (tsc2 - tsc1) * 1000000L;
241
242 if (hpet) {
243 printk(KERN_INFO "TSC calibrated against HPET\n");
244 if (hpet2 < hpet1)
245 hpet2 += 0x100000000UL;
246 hpet2 -= hpet1;
247 tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
248 } else {
249 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
250 if (pm2 < pm1)
251 pm2 += ACPI_PM_OVRRUN;
252 pm2 -= pm1;
253 tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC;
254 }
255
256 tsc_khz = tsc2 / tsc1;
257
258out:
259 for_each_possible_cpu(cpu)
260 set_cyc2ns_scale(tsc_khz, cpu);
261}
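
The cross-calibration above boils down to: TSC kHz equals TSC cycles times 10^6 divided by the elapsed time in nanoseconds measured with HPET or the ACPI PM timer. A small worked example with assumed sample values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t tsc_cycles = 120000000;	/* TSC delta over the sample window */
	uint64_t ref_ns = 50000000;		/* 50 ms measured via HPET/PM timer */
	uint64_t tsc_khz = tsc_cycles * 1000000ULL / ref_ns;

	printf("tsc_khz = %llu\n", (unsigned long long)tsc_khz);
	return 0;
}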
262
263/*
264 * Make an educated guess if the TSC is trustworthy and synchronized
265 * over all CPUs.
266 */
267__cpuinit int unsynchronized_tsc(void)
268{
269 if (tsc_unstable)
270 return 1;
271
272#ifdef CONFIG_SMP
273 if (apic_is_clustered_box())
274 return 1;
275#endif
276
277 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
278 return 0;
279
280 /* Assume multi socket systems are not synchronized */
281 return num_present_cpus() > 1;
282}
283
284int __init notsc_setup(char *s)
285{
286 notsc = 1;
287 return 1;
288}
289
290__setup("notsc", notsc_setup);
291
292static struct clocksource clocksource_tsc;
293
294/*
295 * We compare the TSC to the cycle_last value in the clocksource
296 * structure to avoid a nasty time-warp. This can be observed in a
297 * very small window right after one CPU updated cycle_last under
298 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
299 * is smaller than the cycle_last reference value due to a TSC which
300	 * is slightly behind. This delta is nowhere else observable, but in
301 * that case it results in a forward time jump in the range of hours
302 * due to the unsigned delta calculation of the time keeping core
303 * code, which is necessary to support wrapping clocksources like pm
304 * timer.
305 */
306static cycle_t read_tsc(void)
307{
308 cycle_t ret = (cycle_t)get_cycles();
309
310 return ret >= clocksource_tsc.cycle_last ?
311 ret : clocksource_tsc.cycle_last;
312}
313
314static cycle_t __vsyscall_fn vread_tsc(void)
315{
316 cycle_t ret = (cycle_t)vget_cycles();
317
318 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
319 ret : __vsyscall_gtod_data.clock.cycle_last;
320}
321
322static struct clocksource clocksource_tsc = {
323 .name = "tsc",
324 .rating = 300,
325 .read = read_tsc,
326 .mask = CLOCKSOURCE_MASK(64),
327 .shift = 22,
328 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
329 CLOCK_SOURCE_MUST_VERIFY,
330 .vread = vread_tsc,
331};
332
333void mark_tsc_unstable(char *reason)
334{
335 if (!tsc_unstable) {
336 tsc_unstable = 1;
337 printk("Marking TSC unstable due to %s\n", reason);
338 /* Change only the rating, when not registered */
339 if (clocksource_tsc.mult)
340 clocksource_change_rating(&clocksource_tsc, 0);
341 else
342 clocksource_tsc.rating = 0;
343 }
344}
345EXPORT_SYMBOL_GPL(mark_tsc_unstable);
346
347void __init init_tsc_clocksource(void)
348{
349 if (!notsc) {
350 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
351 clocksource_tsc.shift);
352 if (check_tsc_unstable())
353 clocksource_tsc.rating = 0;
354
355 clocksource_register(&clocksource_tsc);
356 }
357}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
new file mode 100644
index 000000000000..e94bdb6add1d
--- /dev/null
+++ b/arch/x86/kernel/visws_quirks.c
@@ -0,0 +1,709 @@
1/*
2 * SGI Visual Workstation support and quirks, unmaintained.
3 *
4 * Split out from setup.c by davej@suse.de
5 *
6 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
7 *
8 * SGI Visual Workstation interrupt controller
9 *
10 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
11 * which serves as the main interrupt controller in the system. Non-legacy
12 * hardware in the system uses this controller directly. Legacy devices
13 * are connected to the PIIX4 which in turn has its 8259(s) connected to
14	 * one of the Cobalt APIC entries.
15 *
16 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
17 *
18 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
19 */
20#include <linux/interrupt.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/smp.h>
24
25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h>
28#include <asm/fixmap.h>
29#include <asm/reboot.h>
30#include <asm/setup.h>
31#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h>
34
35#include <mach_ipi.h>
36
37#include "mach_apic.h"
38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h>
49#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53
54#include <linux/sched.h>
55#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h>
58#include <linux/pci_ids.h>
59
60extern int no_broadcast;
61
62#include <asm/io.h>
63#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67
68char visws_board_type = -1;
69char visws_board_rev = -1;
70
71int is_visws_box(void)
72{
73 return visws_board_type >= 0;
74}
75
76static int __init visws_time_init_quirk(void)
77{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79
80 /* Set the countdown value */
81 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
82
83 /* Start the timer */
84 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
85
86 /* Enable (unmask) the timer interrupt */
87 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
88
89 /*
90 * Zero return means the generic timer setup code will set up
91 * the standard vector:
92 */
93 return 0;
94}
95
96static int __init visws_pre_intr_init_quirk(void)
97{
98 init_VISWS_APIC_irqs();
99
100 /*
101	 * We don't want ISA irqs to be set up by the generic code:
102 */
103 return 1;
104}
105
106/* Quirk for machine specific memory setup. */
107
108#define MB (1024 * 1024)
109
110unsigned long sgivwfb_mem_phys;
111unsigned long sgivwfb_mem_size;
112EXPORT_SYMBOL(sgivwfb_mem_phys);
113EXPORT_SYMBOL(sgivwfb_mem_size);
114
115long long mem_size __initdata = 0;
116
117static char * __init visws_memory_setup_quirk(void)
118{
119 long long gfx_mem_size = 8 * MB;
120
121 mem_size = boot_params.alt_mem_k;
122
123 if (!mem_size) {
124 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
125 mem_size = 128 * MB;
126 }
127
128 /*
129 * this hardcodes the graphics memory to 8 MB
130 * it really should be sized dynamically (or at least
131 * set as a boot param)
132 */
133 if (!sgivwfb_mem_size) {
134 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
135 sgivwfb_mem_size = 8 * MB;
136 }
137
138 /*
139 * Trim to nearest MB
140 */
141 sgivwfb_mem_size &= ~((1 << 20) - 1);
142 sgivwfb_mem_phys = mem_size - gfx_mem_size;
143
144 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
145 e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
146 e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
147
148 return "PROM";
149}
150
151static void visws_machine_emergency_restart(void)
152{
153 /*
154 * Visual Workstations restart after this
155 * register is poked on the PIIX4
156 */
157 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
158}
159
160static void visws_machine_power_off(void)
161{
162 unsigned short pm_status;
163/* extern unsigned int pci_bus0; */
164
165 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
166 outw(pm_status, PMSTS_PORT);
167
168 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
169
170 mdelay(10);
171
172#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
173 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
174
175/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
176 outl(PIIX_SPECIAL_STOP, 0xCFC);
177}
178
179static int __init visws_get_smp_config_quirk(unsigned int early)
180{
181 /*
182 * Prevent MP-table parsing by the generic code:
183 */
184 return 1;
185}
186
187extern unsigned int __cpuinitdata maxcpus;
188
189/*
190 * The Visual Workstation is Intel MP compliant in the hardware
191 * sense, but it doesn't have a BIOS(-configuration table).
192 * No problem for Linux.
193 */
194
195static void __init MP_processor_info (struct mpc_config_processor *m)
196{
197 int ver, logical_apicid;
198 physid_mask_t apic_cpus;
199
200 if (!(m->mpc_cpuflag & CPU_ENABLED))
201 return;
202
203 logical_apicid = m->mpc_apicid;
204 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
205 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
206 m->mpc_apicid,
207 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
208 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
209 m->mpc_apicver);
210
211 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
212 boot_cpu_physical_apicid = m->mpc_apicid;
213
214 ver = m->mpc_apicver;
215 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
216 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
217 m->mpc_apicid, MAX_APICS);
218 return;
219 }
220
221 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
222 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
223 /*
224 * Validate version
225 */
226 if (ver == 0x0) {
227 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
228 "fixing up to 0x10. (tell your hw vendor)\n",
229 m->mpc_apicid);
230 ver = 0x10;
231 }
232 apic_version[m->mpc_apicid] = ver;
233}
234
235int __init visws_find_smp_config_quirk(unsigned int reserve)
236{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
239
240 if (ncpus > CO_CPU_MAX) {
241 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
242 ncpus, mp);
243
244 ncpus = CO_CPU_MAX;
245 }
246
247 if (ncpus > maxcpus)
248 ncpus = maxcpus;
249
250#ifdef CONFIG_X86_LOCAL_APIC
251 smp_found_config = 1;
252#endif
253 while (ncpus--)
254 MP_processor_info(mp++);
255
256 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
257
258 return 1;
259}
260
261extern int visws_trap_init_quirk(void);
262
263void __init visws_early_detect(void)
264{
265 int raw;
266
267 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
268 >> PIIX_GPI_BD_SHIFT;
269
270 if (visws_board_type < 0)
271 return;
272
273 /*
274 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps:
282 */
283 arch_intr_init_quirk = NULL;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285
286 /*
287 * Install reboot quirks:
288 */
289 pm_power_off = visws_machine_power_off;
290 machine_ops.emergency_restart = visws_machine_emergency_restart;
291
292 /*
293 * Do not use broadcast IPIs:
294 */
295 no_broadcast = 0;
296
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC
304 /*
305 * Turn off IO-APIC detection and initialization:
306 */
307 skip_ioapic_setup = 1;
308#endif
309
310 /*
311 * Get Board rev.
312 * First, we have to initialize the 307 part to allow us access
313 * to the GPIO registers. Let's map them at 0x0fc0 which is right
314 * after the PIIX4 PM section.
315 */
316 outb_p(SIO_DEV_SEL, SIO_INDEX);
317 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
318
319 outb_p(SIO_DEV_MSB, SIO_INDEX);
320 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
321
322 outb_p(SIO_DEV_LSB, SIO_INDEX);
323 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
324
325 outb_p(SIO_DEV_ENB, SIO_INDEX);
326 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
327
328 /*
329 * Now, we have to map the power management section to write
330 * a bit which enables access to the GPIO registers.
331 * What lunatic came up with this shit?
332 */
333 outb_p(SIO_DEV_SEL, SIO_INDEX);
334 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
335
336 outb_p(SIO_DEV_MSB, SIO_INDEX);
337 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
338
339 outb_p(SIO_DEV_LSB, SIO_INDEX);
340 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
341
342 outb_p(SIO_DEV_ENB, SIO_INDEX);
343 outb_p(1, SIO_DATA); /* Enable PM registers. */
344
345 /*
346 * Now, write the PM register which enables the GPIO registers.
347 */
348 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
349 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
350
351 /*
352 * Now, initialize the GPIO registers.
353 * We want them all to be inputs which is the
354 * power on default, so let's leave them alone.
355 * So, let's just read the board rev!
356 */
357 raw = inb_p(SIO_GP_DATA1);
358 raw &= 0x7f; /* 7 bits of valid board revision ID. */
359
360 if (visws_board_type == VISWS_320) {
361 if (raw < 0x6) {
362 visws_board_rev = 4;
363 } else if (raw < 0xc) {
364 visws_board_rev = 5;
365 } else {
366 visws_board_rev = 6;
367 }
368 } else if (visws_board_type == VISWS_540) {
369 visws_board_rev = 2;
370 } else {
371 visws_board_rev = raw;
372 }
373
374 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
375 (visws_board_type == VISWS_320 ? "320" :
376 (visws_board_type == VISWS_540 ? "540" :
377 "unknown")), visws_board_rev);
378}
379
380#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
381#define BCD (LI_INTB | LI_INTC | LI_INTD)
382#define ALLDEVS (A01234 | BCD)
383
384static __init void lithium_init(void)
385{
386 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
387 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
388
389 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
390 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
391 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
392/* panic("This machine is not SGI Visual Workstation 320/540"); */
393 }
394
395 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
396 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
397 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
398/* panic("This machine is not SGI Visual Workstation 320/540"); */
399 }
400
401 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
402 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
403}
404
405static __init void cobalt_init(void)
406{
407 /*
408 * On normal SMP PC this is used only with SMP, but we have to
409 * use it and set it up here to start the Cobalt clock
410 */
411 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
412 setup_local_APIC();
413 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
414 (unsigned int)apic_read(APIC_LVR),
415 (unsigned int)apic_read(APIC_ID));
416
417 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
418 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
419 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
420 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
421
422 /* Enable Cobalt APIC being careful to NOT change the ID! */
423 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
424
425 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
426 co_apic_read(CO_APIC_ID));
427}
428
429int __init visws_trap_init_quirk(void)
430{
431 lithium_init();
432 cobalt_init();
433
434 return 1;
435}
436
437/*
438 * IRQ controller / APIC support:
439 */
440
441static DEFINE_SPINLOCK(cobalt_lock);
442
443/*
444 * Set the given Cobalt APIC Redirection Table entry to point
445 * to the given IDT vector/index.
446 */
447static inline void co_apic_set(int entry, int irq)
448{
449 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
450 co_apic_write(CO_APIC_HI(entry), 0);
451}
452
453/*
454 * Cobalt (IO)-APIC functions to handle PCI devices.
455 */
456static inline int co_apic_ide0_hack(void)
457{
458 extern char visws_board_type;
459 extern char visws_board_rev;
460
461 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
462 return 5;
463 return CO_APIC_IDE0;
464}
465
466static int is_co_apic(unsigned int irq)
467{
468 if (IS_CO_APIC(irq))
469 return CO_APIC(irq);
470
471 switch (irq) {
472 case 0: return CO_APIC_CPU;
473 case CO_IRQ_IDE0: return co_apic_ide0_hack();
474 case CO_IRQ_IDE1: return CO_APIC_IDE1;
475 default: return -1;
476 }
477}
478
479
480/*
481 * This is the SGI Cobalt (IO-)APIC:
482 */
483
484static void enable_cobalt_irq(unsigned int irq)
485{
486 co_apic_set(is_co_apic(irq), irq);
487}
488
489static void disable_cobalt_irq(unsigned int irq)
490{
491 int entry = is_co_apic(irq);
492
493 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
494 co_apic_read(CO_APIC_LO(entry));
495}
496
497/*
498 * "irq" really just serves to identify the device. Here is where we
499 * map this to the Cobalt APIC entry where it's physically wired.
500 * This is called via request_irq -> setup_irq -> irq_desc->startup()
501 */
502static unsigned int startup_cobalt_irq(unsigned int irq)
503{
504 unsigned long flags;
505
506 spin_lock_irqsave(&cobalt_lock, flags);
507 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
508 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
509 enable_cobalt_irq(irq);
510 spin_unlock_irqrestore(&cobalt_lock, flags);
511 return 0;
512}
513
514static void ack_cobalt_irq(unsigned int irq)
515{
516 unsigned long flags;
517
518 spin_lock_irqsave(&cobalt_lock, flags);
519 disable_cobalt_irq(irq);
520 apic_write(APIC_EOI, APIC_EIO_ACK);
521 spin_unlock_irqrestore(&cobalt_lock, flags);
522}
523
524static void end_cobalt_irq(unsigned int irq)
525{
526 unsigned long flags;
527
528 spin_lock_irqsave(&cobalt_lock, flags);
529 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
530 enable_cobalt_irq(irq);
531 spin_unlock_irqrestore(&cobalt_lock, flags);
532}
533
534static struct irq_chip cobalt_irq_type = {
535 .typename = "Cobalt-APIC",
536 .startup = startup_cobalt_irq,
537 .shutdown = disable_cobalt_irq,
538 .enable = enable_cobalt_irq,
539 .disable = disable_cobalt_irq,
540 .ack = ack_cobalt_irq,
541 .end = end_cobalt_irq,
542};
543
544
545/*
546 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
547 * -- not the manner expected by the code in i8259.c.
548 *
549 * there is a 'master' physical interrupt source that gets sent to
550 * the CPU. But in the chipset there are various 'virtual' interrupts
551 * waiting to be handled. We represent this to Linux through a 'master'
552 * interrupt controller type, and through a special virtual interrupt-
553 * controller. Device drivers only see the virtual interrupt sources.
554 */
555static unsigned int startup_piix4_master_irq(unsigned int irq)
556{
557 init_8259A(0);
558
559 return startup_cobalt_irq(irq);
560}
561
562static void end_piix4_master_irq(unsigned int irq)
563{
564 unsigned long flags;
565
566 spin_lock_irqsave(&cobalt_lock, flags);
567 enable_cobalt_irq(irq);
568 spin_unlock_irqrestore(&cobalt_lock, flags);
569}
570
571static struct irq_chip piix4_master_irq_type = {
572 .typename = "PIIX4-master",
573 .startup = startup_piix4_master_irq,
574 .ack = ack_cobalt_irq,
575 .end = end_piix4_master_irq,
576};
577
578
579static struct irq_chip piix4_virtual_irq_type = {
580 .typename = "PIIX4-virtual",
581 .shutdown = disable_8259A_irq,
582 .enable = enable_8259A_irq,
583 .disable = disable_8259A_irq,
584};
585
586
587/*
588 * PIIX4-8259 master/virtual functions to handle interrupt requests
589 * from legacy devices: floppy, parallel, serial, rtc.
590 *
591 * None of these get Cobalt APIC entries, neither do they have IDT
592 * entries. These interrupts are purely virtual and distributed from
593 * the 'master' interrupt source: CO_IRQ_8259.
594 *
595	 * When the 8259 interrupts, its handler figures out which of these
596 * devices is interrupting and dispatches to its handler.
597 *
598 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
599 * enable_irq gets the right irq. This 'master' irq is never directly
600 * manipulated by any driver.
601 */
602static irqreturn_t piix4_master_intr(int irq, void *dev_id)
603{
604 int realirq;
605 irq_desc_t *desc;
606 unsigned long flags;
607
608 spin_lock_irqsave(&i8259A_lock, flags);
609
610 /* Find out what's interrupting in the PIIX4 master 8259 */
611 outb(0x0c, 0x20); /* OCW3 Poll command */
612 realirq = inb(0x20);
613
614 /*
615 * Bit 7 == 0 means invalid/spurious
616 */
617 if (unlikely(!(realirq & 0x80)))
618 goto out_unlock;
619
620 realirq &= 7;
621
622 if (unlikely(realirq == 2)) {
623 outb(0x0c, 0xa0);
624 realirq = inb(0xa0);
625
626 if (unlikely(!(realirq & 0x80)))
627 goto out_unlock;
628
629 realirq = (realirq & 7) + 8;
630 }
631
632 /* mask and ack interrupt */
633 cached_irq_mask |= 1 << realirq;
634 if (unlikely(realirq > 7)) {
635 inb(0xa1);
636 outb(cached_slave_mask, 0xa1);
637 outb(0x60 + (realirq & 7), 0xa0);
638 outb(0x60 + 2, 0x20);
639 } else {
640 inb(0x21);
641 outb(cached_master_mask, 0x21);
642 outb(0x60 + realirq, 0x20);
643 }
644
645 spin_unlock_irqrestore(&i8259A_lock, flags);
646
647 desc = irq_desc + realirq;
648
649 /*
650 * handle this 'virtual interrupt' as a Cobalt one now.
651 */
652 kstat_cpu(smp_processor_id()).irqs[realirq]++;
653
654 if (likely(desc->action != NULL))
655 handle_IRQ_event(realirq, desc->action);
656
657 if (!(desc->status & IRQ_DISABLED))
658 enable_8259A_irq(realirq);
659
660 return IRQ_HANDLED;
661
662out_unlock:
663 spin_unlock_irqrestore(&i8259A_lock, flags);
664 return IRQ_NONE;
665}
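
The poll handshake above can be summarised by a tiny decode helper: after the OCW3 poll command, bit 7 of the returned byte flags a valid pending IRQ and bits 0-2 carry its number (plus 8 when it came through the cascaded slave). A hedged user-space sketch of just that decoding step:

#include <stdio.h>

static int ex_decode_poll(unsigned char polled, int from_slave)
{
	if (!(polled & 0x80))
		return -1;			/* spurious / nothing pending */
	return (polled & 7) + (from_slave ? 8 : 0);
}

int main(void)
{
	printf("master 0x87 -> IRQ %d\n", ex_decode_poll(0x87, 0));
	printf("slave  0x82 -> IRQ %d\n", ex_decode_poll(0x82, 1));
	printf("spurious 0x07 -> %d\n", ex_decode_poll(0x07, 0));
	return 0;
}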
666
667static struct irqaction master_action = {
668 .handler = piix4_master_intr,
669 .name = "PIIX4-8259",
670};
671
672static struct irqaction cascade_action = {
673 .handler = no_action,
674 .name = "cascade",
675};
676
677
678void init_VISWS_APIC_irqs(void)
679{
680 int i;
681
682 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
683 irq_desc[i].status = IRQ_DISABLED;
684 irq_desc[i].action = 0;
685 irq_desc[i].depth = 1;
686
687 if (i == 0) {
688 irq_desc[i].chip = &cobalt_irq_type;
689 }
690 else if (i == CO_IRQ_IDE0) {
691 irq_desc[i].chip = &cobalt_irq_type;
692 }
693 else if (i == CO_IRQ_IDE1) {
694 irq_desc[i].chip = &cobalt_irq_type;
695 }
696 else if (i == CO_IRQ_8259) {
697 irq_desc[i].chip = &piix4_master_irq_type;
698 }
699 else if (i < CO_IRQ_APIC0) {
700 irq_desc[i].chip = &piix4_virtual_irq_type;
701 }
702 else if (IS_CO_APIC(i)) {
703 irq_desc[i].chip = &cobalt_irq_type;
704 }
705 }
706
707 setup_irq(CO_IRQ_8259, &master_action);
708 setup_irq(2, &cascade_action);
709}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 946bf13b44ab..b15346092b7b 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -932,7 +932,7 @@ static inline int __init activate_vmi(void)
932 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; 932 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
933#endif 933#endif
934 pv_time_ops.sched_clock = vmi_sched_clock; 934 pv_time_ops.sched_clock = vmi_sched_clock;
935 pv_time_ops.get_cpu_khz = vmi_cpu_khz; 935 pv_time_ops.get_tsc_khz = vmi_tsc_khz;
936 936
937 /* We have true wallclock functions; disable CMOS clock sync */ 937 /* We have true wallclock functions; disable CMOS clock sync */
938 no_sync_cmos_clock = 1; 938 no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index ba7d19e102b1..6953859fe289 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -69,8 +69,8 @@ unsigned long long vmi_sched_clock(void)
69 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); 69 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
70} 70}
71 71
72/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ 72/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
73unsigned long vmi_cpu_khz(void) 73unsigned long vmi_tsc_khz(void)
74{ 74{
75 unsigned long long khz; 75 unsigned long long khz;
76 khz = vmi_timer_ops.get_cycle_frequency(); 76 khz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 2674f5796275..cdb2363697d2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -49,16 +49,14 @@ SECTIONS
49 _etext = .; /* End of text section */ 49 _etext = .; /* End of text section */
50 } :text = 0x9090 50 } :text = 0x9090
51 51
52 NOTES :text :note
53
52 . = ALIGN(16); /* Exception table */ 54 . = ALIGN(16); /* Exception table */
53 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { 55 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
54 __start___ex_table = .; 56 __start___ex_table = .;
55 *(__ex_table) 57 *(__ex_table)
56 __stop___ex_table = .; 58 __stop___ex_table = .;
57 } 59 } :text = 0x9090
58
59 NOTES :text :note
60
61 BUG_TABLE :text
62 60
63 RODATA 61 RODATA
64 62
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fd246e22fe6b..63e5c1a22e88 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,7 +19,7 @@ PHDRS {
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(4); /* R__ */ 22 note PT_NOTE FLAGS(0); /* ___ */
23} 23}
24SECTIONS 24SECTIONS
25{ 25{
@@ -40,16 +40,14 @@ SECTIONS
40 _etext = .; /* End of text section */ 40 _etext = .; /* End of text section */
41 } :text = 0x9090 41 } :text = 0x9090
42 42
43 NOTES :text :note
44
43 . = ALIGN(16); /* Exception table */ 45 . = ALIGN(16); /* Exception table */
44 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { 46 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
45 __start___ex_table = .; 47 __start___ex_table = .;
46 *(__ex_table) 48 *(__ex_table)
47 __stop___ex_table = .; 49 __stop___ex_table = .;
48 } 50 } :text = 0x9090
49
50 NOTES :text :note
51
52 BUG_TABLE :text
53 51
54 RODATA 52 RODATA
55 53
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index c87cbd84c3e5..0b8b6690a86d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -42,7 +42,8 @@
42#include <asm/topology.h> 42#include <asm/topology.h>
43#include <asm/vgtod.h> 43#include <asm/vgtod.h>
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) \
46 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
46#define __syscall_clobber "r11","cx","memory" 47#define __syscall_clobber "r11","cx","memory"
47 48
48/* 49/*
@@ -278,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
278{ 279{
279 long cpu = (long)arg; 280 long cpu = (long)arg;
280 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 281 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
281 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); 282 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
282 return NOTIFY_DONE; 283 return NOTIFY_DONE;
283} 284}
284 285
@@ -301,7 +302,7 @@ static int __init vsyscall_init(void)
301#ifdef CONFIG_SYSCTL 302#ifdef CONFIG_SYSCTL
302 register_sysctl_table(kernel_root_table2); 303 register_sysctl_table(kernel_root_table2);
303#endif 304#endif
304 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); 305 on_each_cpu(cpu_vsyscall_init, NULL, 1);
305 hotcpu_notifier(cpu_vsyscall_notifier, 0); 306 hotcpu_notifier(cpu_vsyscall_notifier, 0);
306 return 0; 307 return 0;
307} 308}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 2f306a826897..b545f371b5f5 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -2,13 +2,20 @@
2 All C exports should go in the respective C files. */ 2 All C exports should go in the respective C files. */
3 3
4#include <linux/module.h> 4#include <linux/module.h>
5#include <net/checksum.h>
6#include <linux/smp.h> 5#include <linux/smp.h>
7 6
7#include <net/checksum.h>
8
8#include <asm/processor.h> 9#include <asm/processor.h>
9#include <asm/uaccess.h>
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/uaccess.h>
11#include <asm/desc.h> 12#include <asm/desc.h>
13#include <asm/ftrace.h>
14
15#ifdef CONFIG_FTRACE
16/* mcount is defined in assembly */
17EXPORT_SYMBOL(mcount);
18#endif
12 19
13EXPORT_SYMBOL(kernel_thread); 20EXPORT_SYMBOL(kernel_thread);
14 21
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 540e95179074..10ce6ee4c491 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -335,7 +335,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
335{ 335{
336 if (vmx->vcpu.cpu == -1) 336 if (vmx->vcpu.cpu == -1)
337 return; 337 return;
338 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); 338 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
339 vmx->launched = 0; 339 vmx->launched = 0;
340} 340}
341 341
@@ -2968,7 +2968,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2968 struct vcpu_vmx *vmx = to_vmx(vcpu); 2968 struct vcpu_vmx *vmx = to_vmx(vcpu);
2969 2969
2970 if (vmx->vmcs) { 2970 if (vmx->vmcs) {
2971 on_each_cpu(__vcpu_clear, vmx, 0, 1); 2971 on_each_cpu(__vcpu_clear, vmx, 1);
2972 free_vmcs(vmx->vmcs); 2972 free_vmcs(vmx->vmcs);
2973 vmx->vmcs = NULL; 2973 vmx->vmcs = NULL;
2974 } 2974 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63a77caa59f1..0faa2546b1cd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4044,6 +4044,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4044 * So need not to call smp_call_function_single() in that case. 4044 * So need not to call smp_call_function_single() in that case.
4045 */ 4045 */
4046 if (vcpu->guest_mode && vcpu->cpu != cpu) 4046 if (vcpu->guest_mode && vcpu->cpu != cpu)
4047 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); 4047 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4048 put_cpu(); 4048 put_cpu();
4049} 4049}
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 964dfa36d367..c70e12b1a637 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -3,7 +3,7 @@ config LGUEST_GUEST
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE 5 depends on !X86_PAE
6 depends on !(X86_VISWS || X86_VOYAGER) 6 depends on !X86_VOYAGER
7 select VIRTIO 7 select VIRTIO
8 select VIRTIO_RING 8 select VIRTIO_RING
9 select VIRTIO_CONSOLE 9 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index e72cf0793fbe..50dad44fb542 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -607,7 +607,7 @@ static unsigned long lguest_get_wallclock(void)
607 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 607 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
608 * This matches what we want here: if we return 0 from this function, the x86 608 * This matches what we want here: if we return 0 from this function, the x86
609 * TSC clock will give up and not register itself. */ 609 * TSC clock will give up and not register itself. */
610static unsigned long lguest_cpu_khz(void) 610static unsigned long lguest_tsc_khz(void)
611{ 611{
612 return lguest_data.tsc_khz; 612 return lguest_data.tsc_khz;
613} 613}
@@ -998,7 +998,7 @@ __init void lguest_init(void)
998 /* time operations */ 998 /* time operations */
999 pv_time_ops.get_wallclock = lguest_get_wallclock; 999 pv_time_ops.get_wallclock = lguest_get_wallclock;
1000 pv_time_ops.time_init = lguest_time_init; 1000 pv_time_ops.time_init = lguest_time_init;
1001 pv_time_ops.get_cpu_khz = lguest_cpu_khz; 1001 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1002 1002
1003 /* Now is a good time to look at the implementations of these functions 1003 /* Now is a good time to look at the implementations of these functions
1004 * before returning to the rest of lguest_init(). */ 1004 * before returning to the rest of lguest_init(). */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 76f60f52a885..aa3fa4119424 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -4,8 +4,9 @@
4 4
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr-on-cpu.o
6 6
7lib-y := delay_$(BITS).o 7lib-y := delay.o
8lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o 8lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o
9lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
10 11
11ifeq ($(CONFIG_X86_32),y) 12ifeq ($(CONFIG_X86_32),y)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index ee1c3f635157..dfdf428975c0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -1,8 +1,10 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs. 1/*
2 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
3 * Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2. 4 * Subject to the GNU Public License v2.
3 * 5 *
4 * Functions to copy from and to user space. 6 * Functions to copy from and to user space.
5 */ 7 */
6 8
7#include <linux/linkage.h> 9#include <linux/linkage.h>
8#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
@@ -20,60 +22,88 @@
20 .long \orig-1f /* by default jump to orig */ 22 .long \orig-1f /* by default jump to orig */
211: 231:
22 .section .altinstr_replacement,"ax" 24 .section .altinstr_replacement,"ax"
232: .byte 0xe9 /* near jump with 32bit immediate */ 252: .byte 0xe9 /* near jump with 32bit immediate */
24 .long \alt-1b /* offset */ /* or alternatively to alt */ 26 .long \alt-1b /* offset */ /* or alternatively to alt */
25 .previous 27 .previous
26 .section .altinstructions,"a" 28 .section .altinstructions,"a"
27 .align 8 29 .align 8
28 .quad 0b 30 .quad 0b
29 .quad 2b 31 .quad 2b
30 .byte \feature /* when feature is set */ 32 .byte \feature /* when feature is set */
31 .byte 5 33 .byte 5
32 .byte 5 34 .byte 5
33 .previous 35 .previous
34 .endm 36 .endm
35 37
36/* Standard copy_to_user with segment limit checking */ 38 .macro ALIGN_DESTINATION
39#ifdef FIX_ALIGNMENT
40 /* check for bad alignment of destination */
41 movl %edi,%ecx
42 andl $7,%ecx
43 jz 102f /* already aligned */
44 subl $8,%ecx
45 negl %ecx
46 subl %ecx,%edx
47100: movb (%rsi),%al
48101: movb %al,(%rdi)
49 incq %rsi
50 incq %rdi
51 decl %ecx
52 jnz 100b
53102:
54 .section .fixup,"ax"
55103: addl %r8d,%edx /* ecx is zerorest also */
56 jmp copy_user_handle_tail
57 .previous
58
59 .section __ex_table,"a"
60 .align 8
61 .quad 100b,103b
62 .quad 101b,103b
63 .previous
64#endif
65 .endm
66
67/* Standard copy_to_user with segment limit checking */
37ENTRY(copy_to_user) 68ENTRY(copy_to_user)
38 CFI_STARTPROC 69 CFI_STARTPROC
39 GET_THREAD_INFO(%rax) 70 GET_THREAD_INFO(%rax)
40 movq %rdi,%rcx 71 movq %rdi,%rcx
41 addq %rdx,%rcx 72 addq %rdx,%rcx
42 jc bad_to_user 73 jc bad_to_user
43 cmpq threadinfo_addr_limit(%rax),%rcx 74 cmpq TI_addr_limit(%rax),%rcx
44 jae bad_to_user 75 jae bad_to_user
45 xorl %eax,%eax /* clear zero flag */
46 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
47 CFI_ENDPROC 77 CFI_ENDPROC
48 78
49ENTRY(copy_user_generic) 79/* Standard copy_from_user with segment limit checking */
80ENTRY(copy_from_user)
50 CFI_STARTPROC 81 CFI_STARTPROC
51 movl $1,%ecx /* set zero flag */ 82 GET_THREAD_INFO(%rax)
83 movq %rsi,%rcx
84 addq %rdx,%rcx
85 jc bad_from_user
86 cmpq TI_addr_limit(%rax),%rcx
87 jae bad_from_user
52 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 88 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
53 CFI_ENDPROC 89 CFI_ENDPROC
90ENDPROC(copy_from_user)
54 91
55ENTRY(__copy_from_user_inatomic) 92ENTRY(copy_user_generic)
56 CFI_STARTPROC 93 CFI_STARTPROC
57 xorl %ecx,%ecx /* clear zero flag */
58 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 94 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
59 CFI_ENDPROC 95 CFI_ENDPROC
96ENDPROC(copy_user_generic)
60 97
61/* Standard copy_from_user with segment limit checking */ 98ENTRY(__copy_from_user_inatomic)
62ENTRY(copy_from_user)
63 CFI_STARTPROC 99 CFI_STARTPROC
64 GET_THREAD_INFO(%rax)
65 movq %rsi,%rcx
66 addq %rdx,%rcx
67 jc bad_from_user
68 cmpq threadinfo_addr_limit(%rax),%rcx
69 jae bad_from_user
70 movl $1,%ecx /* set zero flag */
71 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 100 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
72 CFI_ENDPROC 101 CFI_ENDPROC
73ENDPROC(copy_from_user) 102ENDPROC(__copy_from_user_inatomic)
74 103
75 .section .fixup,"ax" 104 .section .fixup,"ax"
76 /* must zero dest */ 105 /* must zero dest */
106ENTRY(bad_from_user)
77bad_from_user: 107bad_from_user:
78 CFI_STARTPROC 108 CFI_STARTPROC
79 movl %edx,%ecx 109 movl %edx,%ecx
@@ -81,271 +111,158 @@ bad_from_user:
81 rep 111 rep
82 stosb 112 stosb
83bad_to_user: 113bad_to_user:
84 movl %edx,%eax 114 movl %edx,%eax
85 ret 115 ret
86 CFI_ENDPROC 116 CFI_ENDPROC
87END(bad_from_user) 117ENDPROC(bad_from_user)
88 .previous 118 .previous
89 119
90
91/* 120/*
92 * copy_user_generic_unrolled - memory copy with exception handling. 121 * copy_user_generic_unrolled - memory copy with exception handling.
93 * This version is for CPUs like P4 that don't have efficient micro code for rep movsq 122 * This version is for CPUs like P4 that don't have efficient micro
94 * 123 * code for rep movsq
95 * Input: 124 *
125 * Input:
96 * rdi destination 126 * rdi destination
97 * rsi source 127 * rsi source
98 * rdx count 128 * rdx count
99 * ecx zero flag -- if true zero destination on error
100 * 129 *
101 * Output: 130 * Output:
102 * eax uncopied bytes or 0 if successful. 131 * eax uncopied bytes or 0 if successful.
103 */ 132 */
104ENTRY(copy_user_generic_unrolled) 133ENTRY(copy_user_generic_unrolled)
105 CFI_STARTPROC 134 CFI_STARTPROC
106 pushq %rbx 135 cmpl $8,%edx
107 CFI_ADJUST_CFA_OFFSET 8 136 jb 20f /* less then 8 bytes, go to byte copy loop */
108 CFI_REL_OFFSET rbx, 0 137 ALIGN_DESTINATION
109 pushq %rcx 138 movl %edx,%ecx
110 CFI_ADJUST_CFA_OFFSET 8 139 andl $63,%edx
111 CFI_REL_OFFSET rcx, 0 140 shrl $6,%ecx
112 xorl %eax,%eax /*zero for the exception handler */ 141 jz 17f
113 1421: movq (%rsi),%r8
114#ifdef FIX_ALIGNMENT 1432: movq 1*8(%rsi),%r9
115 /* check for bad alignment of destination */ 1443: movq 2*8(%rsi),%r10
116 movl %edi,%ecx 1454: movq 3*8(%rsi),%r11
117 andl $7,%ecx 1465: movq %r8,(%rdi)
118 jnz .Lbad_alignment 1476: movq %r9,1*8(%rdi)
119.Lafter_bad_alignment: 1487: movq %r10,2*8(%rdi)
120#endif 1498: movq %r11,3*8(%rdi)
121 1509: movq 4*8(%rsi),%r8
122 movq %rdx,%rcx 15110: movq 5*8(%rsi),%r9
123 15211: movq 6*8(%rsi),%r10
124 movl $64,%ebx 15312: movq 7*8(%rsi),%r11
125 shrq $6,%rdx 15413: movq %r8,4*8(%rdi)
126 decq %rdx 15514: movq %r9,5*8(%rdi)
127 js .Lhandle_tail 15615: movq %r10,6*8(%rdi)
128 15716: movq %r11,7*8(%rdi)
129 .p2align 4
130.Lloop:
131.Ls1: movq (%rsi),%r11
132.Ls2: movq 1*8(%rsi),%r8
133.Ls3: movq 2*8(%rsi),%r9
134.Ls4: movq 3*8(%rsi),%r10
135.Ld1: movq %r11,(%rdi)
136.Ld2: movq %r8,1*8(%rdi)
137.Ld3: movq %r9,2*8(%rdi)
138.Ld4: movq %r10,3*8(%rdi)
139
140.Ls5: movq 4*8(%rsi),%r11
141.Ls6: movq 5*8(%rsi),%r8
142.Ls7: movq 6*8(%rsi),%r9
143.Ls8: movq 7*8(%rsi),%r10
144.Ld5: movq %r11,4*8(%rdi)
145.Ld6: movq %r8,5*8(%rdi)
146.Ld7: movq %r9,6*8(%rdi)
147.Ld8: movq %r10,7*8(%rdi)
148
149 decq %rdx
150
151 leaq 64(%rsi),%rsi 158 leaq 64(%rsi),%rsi
152 leaq 64(%rdi),%rdi 159 leaq 64(%rdi),%rdi
153
154 jns .Lloop
155
156 .p2align 4
157.Lhandle_tail:
158 movl %ecx,%edx
159 andl $63,%ecx
160 shrl $3,%ecx
161 jz .Lhandle_7
162 movl $8,%ebx
163 .p2align 4
164.Lloop_8:
165.Ls9: movq (%rsi),%r8
166.Ld9: movq %r8,(%rdi)
167 decl %ecx 160 decl %ecx
168 leaq 8(%rdi),%rdi 161 jnz 1b
16217: movl %edx,%ecx
163 andl $7,%edx
164 shrl $3,%ecx
165 jz 20f
16618: movq (%rsi),%r8
16719: movq %r8,(%rdi)
169 leaq 8(%rsi),%rsi 168 leaq 8(%rsi),%rsi
170 jnz .Lloop_8 169 leaq 8(%rdi),%rdi
171 170 decl %ecx
172.Lhandle_7: 171 jnz 18b
17220: andl %edx,%edx
173 jz 23f
173 movl %edx,%ecx 174 movl %edx,%ecx
174 andl $7,%ecx 17521: movb (%rsi),%al
175 jz .Lende 17622: movb %al,(%rdi)
176 .p2align 4
177.Lloop_1:
178.Ls10: movb (%rsi),%bl
179.Ld10: movb %bl,(%rdi)
180 incq %rdi
181 incq %rsi 177 incq %rsi
178 incq %rdi
182 decl %ecx 179 decl %ecx
183 jnz .Lloop_1 180 jnz 21b
184 18123: xor %eax,%eax
185 CFI_REMEMBER_STATE
186.Lende:
187 popq %rcx
188 CFI_ADJUST_CFA_OFFSET -8
189 CFI_RESTORE rcx
190 popq %rbx
191 CFI_ADJUST_CFA_OFFSET -8
192 CFI_RESTORE rbx
193 ret 182 ret
194 CFI_RESTORE_STATE
195 183
196#ifdef FIX_ALIGNMENT 184 .section .fixup,"ax"
197 /* align destination */ 18530: shll $6,%ecx
198 .p2align 4 186 addl %ecx,%edx
199.Lbad_alignment: 187 jmp 60f
200 movl $8,%r9d 18840: lea (%rdx,%rcx,8),%rdx
201 subl %ecx,%r9d 189 jmp 60f
202 movl %r9d,%ecx 19050: movl %ecx,%edx
203 cmpq %r9,%rdx 19160: jmp copy_user_handle_tail /* ecx is zerorest also */
204 jz .Lhandle_7 192 .previous
205 js .Lhandle_7
206.Lalign_1:
207.Ls11: movb (%rsi),%bl
208.Ld11: movb %bl,(%rdi)
209 incq %rsi
210 incq %rdi
211 decl %ecx
212 jnz .Lalign_1
213 subq %r9,%rdx
214 jmp .Lafter_bad_alignment
215#endif
216 193
217 /* table sorted by exception address */
218 .section __ex_table,"a" 194 .section __ex_table,"a"
219 .align 8 195 .align 8
220 .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */ 196 .quad 1b,30b
221 .quad .Ls2,.Ls1e 197 .quad 2b,30b
222 .quad .Ls3,.Ls1e 198 .quad 3b,30b
223 .quad .Ls4,.Ls1e 199 .quad 4b,30b
224 .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */ 200 .quad 5b,30b
225 .quad .Ld2,.Ls2e 201 .quad 6b,30b
226 .quad .Ld3,.Ls3e 202 .quad 7b,30b
227 .quad .Ld4,.Ls4e 203 .quad 8b,30b
228 .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */ 204 .quad 9b,30b
229 .quad .Ls6,.Ls5e 205 .quad 10b,30b
230 .quad .Ls7,.Ls5e 206 .quad 11b,30b
231 .quad .Ls8,.Ls5e 207 .quad 12b,30b
232 .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */ 208 .quad 13b,30b
233 .quad .Ld6,.Ls6e 209 .quad 14b,30b
234 .quad .Ld7,.Ls7e 210 .quad 15b,30b
235 .quad .Ld8,.Ls8e 211 .quad 16b,30b
236 .quad .Ls9,.Le_quad 212 .quad 18b,40b
237 .quad .Ld9,.Le_quad 213 .quad 19b,40b
238 .quad .Ls10,.Le_byte 214 .quad 21b,50b
239 .quad .Ld10,.Le_byte 215 .quad 22b,50b
240#ifdef FIX_ALIGNMENT
241 .quad .Ls11,.Lzero_rest
242 .quad .Ld11,.Lzero_rest
243#endif
244 .quad .Le5,.Le_zero
245 .previous 216 .previous
246
247 /* eax: zero, ebx: 64 */
248.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
249.Ls2e: addl $8,%eax
250.Ls3e: addl $8,%eax
251.Ls4e: addl $8,%eax
252.Ls5e: addl $8,%eax
253.Ls6e: addl $8,%eax
254.Ls7e: addl $8,%eax
255.Ls8e: addl $8,%eax
256 addq %rbx,%rdi /* +64 */
257 subq %rax,%rdi /* correct destination with computed offset */
258
259 shlq $6,%rdx /* loop counter * 64 (stride length) */
260 addq %rax,%rdx /* add offset to loopcnt */
261 andl $63,%ecx /* remaining bytes */
262 addq %rcx,%rdx /* add them */
263 jmp .Lzero_rest
264
265 /* exception on quad word loop in tail handling */
266 /* ecx: loopcnt/8, %edx: length, rdi: correct */
267.Le_quad:
268 shll $3,%ecx
269 andl $7,%edx
270 addl %ecx,%edx
271 /* edx: bytes to zero, rdi: dest, eax:zero */
272.Lzero_rest:
273 cmpl $0,(%rsp)
274 jz .Le_zero
275 movq %rdx,%rcx
276.Le_byte:
277 xorl %eax,%eax
278.Le5: rep
279 stosb
280 /* when there is another exception while zeroing the rest just return */
281.Le_zero:
282 movq %rdx,%rax
283 jmp .Lende
284 CFI_ENDPROC 217 CFI_ENDPROC
285ENDPROC(copy_user_generic) 218ENDPROC(copy_user_generic_unrolled)
286 219
287 220/* Some CPUs run faster using the string copy instructions.
288 /* Some CPUs run faster using the string copy instructions. 221 * This is also a lot simpler. Use them when possible.
289 This is also a lot simpler. Use them when possible. 222 *
290 Patch in jmps to this code instead of copying it fully 223 * Only 4GB of copy is supported. This shouldn't be a problem
291 to avoid unwanted aliasing in the exception tables. */ 224 * because the kernel normally only writes from/to page sized chunks
292 225 * even if user space passed a longer buffer.
293 /* rdi destination 226 * And more would be dangerous because both Intel and AMD have
294 * rsi source 227 * errata with rep movsq > 4GB. If someone feels the need to fix
295 * rdx count 228 * this please consider this.
296 * ecx zero flag 229 *
297 * 230 * Input:
298 * Output: 231 * rdi destination
299 * eax uncopied bytes or 0 if successfull. 232 * rsi source
300 * 233 * rdx count
301 * Only 4GB of copy is supported. This shouldn't be a problem 234 *
302 * because the kernel normally only writes from/to page sized chunks 235 * Output:
303 * even if user space passed a longer buffer. 236 * eax uncopied bytes or 0 if successful.
304 * And more would be dangerous because both Intel and AMD have 237 */
305 * errata with rep movsq > 4GB. If someone feels the need to fix
306 * this please consider this.
307 */
308ENTRY(copy_user_generic_string) 238ENTRY(copy_user_generic_string)
309 CFI_STARTPROC 239 CFI_STARTPROC
310 movl %ecx,%r8d /* save zero flag */ 240 andl %edx,%edx
241 jz 4f
242 cmpl $8,%edx
243 jb 2f /* less than 8 bytes, go to byte copy loop */
244 ALIGN_DESTINATION
311 movl %edx,%ecx 245 movl %edx,%ecx
312 shrl $3,%ecx 246 shrl $3,%ecx
313 andl $7,%edx 247 andl $7,%edx
314 jz 10f 2481: rep
3151: rep
316 movsq
317 movl %edx,%ecx
3182: rep
319 movsb
3209: movl %ecx,%eax
321 ret
322
323 /* multiple of 8 byte */
32410: rep
325 movsq 249 movsq
326 xor %eax,%eax 2502: movl %edx,%ecx
2513: rep
252 movsb
2534: xorl %eax,%eax
327 ret 254 ret
328 255
329 /* exception handling */ 256 .section .fixup,"ax"
3303: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ 25711: lea (%rdx,%rcx,8),%rcx
331 jmp 6f 25812: movl %ecx,%edx /* ecx is zerorest also */
3325: movl %ecx,%eax /* exception on byte loop */ 259 jmp copy_user_handle_tail
333 /* eax: left over bytes */ 260 .previous
3346: testl %r8d,%r8d /* zero flag set? */
335 jz 7f
336 movl %eax,%ecx /* initialize x86 loop counter */
337 push %rax
338 xorl %eax,%eax
3398: rep
340 stosb /* zero the rest */
34111: pop %rax
3427: ret
343 CFI_ENDPROC
344END(copy_user_generic_c)
345 261
346 .section __ex_table,"a" 262 .section __ex_table,"a"
347 .quad 1b,3b 263 .align 8
348 .quad 2b,5b 264 .quad 1b,11b
349 .quad 8b,11b 265 .quad 3b,12b
350 .quad 10b,3b
351 .previous 266 .previous
267 CFI_ENDPROC
268ENDPROC(copy_user_generic_string)
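
The rewritten copy_user routines above no longer zero the destination in assembly on a fault; each fixup label only recomputes how many bytes are still outstanding and then defers to copy_user_handle_tail (added later in this patch, in usercopy_64.c). The arithmetic at those labels is terse, so here is a hedged C model of it; the function names and the example values are illustrative, not taken from the patch:

	#include <stdio.h>

	/* Label 30 fixup: fault inside the 64-byte unrolled loop.
	 * %ecx holds the remaining 64-byte blocks, %edx the sub-64 tail:
	 *   shll $6,%ecx ; addl %ecx,%edx
	 */
	static unsigned int fixup_unrolled_loop(unsigned int blocks_left, unsigned int tail)
	{
		return (blocks_left << 6) + tail;
	}

	/* Label 40 fixup: fault inside the quadword loop.
	 * %ecx holds the remaining quadwords: lea (%rdx,%rcx,8),%rdx
	 */
	static unsigned int fixup_qword_loop(unsigned int qwords_left, unsigned int tail)
	{
		return tail + qwords_left * 8;
	}

	int main(void)
	{
		/* e.g. fault with 3 blocks and 5 tail bytes still outstanding */
		printf("%u\n", fixup_unrolled_loop(3, 5));	/* 197 bytes not copied */
		printf("%u\n", fixup_qword_loop(2, 5));		/* 21 bytes not copied */
		return 0;
	}
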
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 9d3d1ab83763..40e0e309d27e 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -1,4 +1,6 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs. 1/*
2 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
3 * Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2. 4 * Subject to the GNU Public License v2.
3 * 5 *
4 * Functions to copy from and to user space. 6 * Functions to copy from and to user space.
@@ -12,204 +14,125 @@
12#include <asm/current.h> 14#include <asm/current.h>
13#include <asm/asm-offsets.h> 15#include <asm/asm-offsets.h>
14#include <asm/thread_info.h> 16#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17/*
18 * copy_user_nocache - Uncached memory copy with exception handling
19 * This will force destination/source out of cache for more performance.
20 *
21 * Input:
22 * rdi destination
23 * rsi source
24 * rdx count
25 * rcx zero flag when 1 zero on exception
26 *
27 * Output:
28 * eax uncopied bytes or 0 if successful.
29 */
30ENTRY(__copy_user_nocache)
31 CFI_STARTPROC
32 pushq %rbx
33 CFI_ADJUST_CFA_OFFSET 8
34 CFI_REL_OFFSET rbx, 0
35 pushq %rcx /* save zero flag */
36 CFI_ADJUST_CFA_OFFSET 8
37 CFI_REL_OFFSET rcx, 0
38
39 xorl %eax,%eax /* zero for the exception handler */
40 17
18 .macro ALIGN_DESTINATION
41#ifdef FIX_ALIGNMENT 19#ifdef FIX_ALIGNMENT
42 /* check for bad alignment of destination */ 20 /* check for bad alignment of destination */
43 movl %edi,%ecx 21 movl %edi,%ecx
44 andl $7,%ecx 22 andl $7,%ecx
45 jnz .Lbad_alignment 23 jz 102f /* already aligned */
46.Lafter_bad_alignment: 24 subl $8,%ecx
47#endif 25 negl %ecx
48 26 subl %ecx,%edx
49 movq %rdx,%rcx 27100: movb (%rsi),%al
50 28101: movb %al,(%rdi)
51 movl $64,%ebx 29 incq %rsi
52 shrq $6,%rdx 30 incq %rdi
53 decq %rdx 31 decl %ecx
54 js .Lhandle_tail 32 jnz 100b
55 33102:
56 .p2align 4 34 .section .fixup,"ax"
57.Lloop: 35103: addl %r8d,%edx /* ecx is zerorest also */
58.Ls1: movq (%rsi),%r11 36 jmp copy_user_handle_tail
59.Ls2: movq 1*8(%rsi),%r8 37 .previous
60.Ls3: movq 2*8(%rsi),%r9
61.Ls4: movq 3*8(%rsi),%r10
62.Ld1: movnti %r11,(%rdi)
63.Ld2: movnti %r8,1*8(%rdi)
64.Ld3: movnti %r9,2*8(%rdi)
65.Ld4: movnti %r10,3*8(%rdi)
66
67.Ls5: movq 4*8(%rsi),%r11
68.Ls6: movq 5*8(%rsi),%r8
69.Ls7: movq 6*8(%rsi),%r9
70.Ls8: movq 7*8(%rsi),%r10
71.Ld5: movnti %r11,4*8(%rdi)
72.Ld6: movnti %r8,5*8(%rdi)
73.Ld7: movnti %r9,6*8(%rdi)
74.Ld8: movnti %r10,7*8(%rdi)
75 38
76 dec %rdx 39 .section __ex_table,"a"
40 .align 8
41 .quad 100b,103b
42 .quad 101b,103b
43 .previous
44#endif
45 .endm
77 46
47/*
48 * copy_user_nocache - Uncached memory copy with exception handling
49 * This will force destination/source out of cache for more performance.
50 */
51ENTRY(__copy_user_nocache)
52 CFI_STARTPROC
53 cmpl $8,%edx
54 jb 20f /* less than 8 bytes, go to byte copy loop */
55 ALIGN_DESTINATION
56 movl %edx,%ecx
57 andl $63,%edx
58 shrl $6,%ecx
59 jz 17f
601: movq (%rsi),%r8
612: movq 1*8(%rsi),%r9
623: movq 2*8(%rsi),%r10
634: movq 3*8(%rsi),%r11
645: movnti %r8,(%rdi)
656: movnti %r9,1*8(%rdi)
667: movnti %r10,2*8(%rdi)
678: movnti %r11,3*8(%rdi)
689: movq 4*8(%rsi),%r8
6910: movq 5*8(%rsi),%r9
7011: movq 6*8(%rsi),%r10
7112: movq 7*8(%rsi),%r11
7213: movnti %r8,4*8(%rdi)
7314: movnti %r9,5*8(%rdi)
7415: movnti %r10,6*8(%rdi)
7516: movnti %r11,7*8(%rdi)
78 leaq 64(%rsi),%rsi 76 leaq 64(%rsi),%rsi
79 leaq 64(%rdi),%rdi 77 leaq 64(%rdi),%rdi
80
81 jns .Lloop
82
83 .p2align 4
84.Lhandle_tail:
85 movl %ecx,%edx
86 andl $63,%ecx
87 shrl $3,%ecx
88 jz .Lhandle_7
89 movl $8,%ebx
90 .p2align 4
91.Lloop_8:
92.Ls9: movq (%rsi),%r8
93.Ld9: movnti %r8,(%rdi)
94 decl %ecx 78 decl %ecx
95 leaq 8(%rdi),%rdi 79 jnz 1b
8017: movl %edx,%ecx
81 andl $7,%edx
82 shrl $3,%ecx
83 jz 20f
8418: movq (%rsi),%r8
8519: movnti %r8,(%rdi)
96 leaq 8(%rsi),%rsi 86 leaq 8(%rsi),%rsi
97 jnz .Lloop_8 87 leaq 8(%rdi),%rdi
98 88 decl %ecx
99.Lhandle_7: 89 jnz 18b
9020: andl %edx,%edx
91 jz 23f
100 movl %edx,%ecx 92 movl %edx,%ecx
101 andl $7,%ecx 9321: movb (%rsi),%al
102 jz .Lende 9422: movb %al,(%rdi)
103 .p2align 4
104.Lloop_1:
105.Ls10: movb (%rsi),%bl
106.Ld10: movb %bl,(%rdi)
107 incq %rdi
108 incq %rsi 95 incq %rsi
96 incq %rdi
109 decl %ecx 97 decl %ecx
110 jnz .Lloop_1 98 jnz 21b
111 9923: xorl %eax,%eax
112 CFI_REMEMBER_STATE
113.Lende:
114 popq %rcx
115 CFI_ADJUST_CFA_OFFSET -8
116 CFI_RESTORE %rcx
117 popq %rbx
118 CFI_ADJUST_CFA_OFFSET -8
119 CFI_RESTORE rbx
120 sfence 100 sfence
121 ret 101 ret
122 CFI_RESTORE_STATE
123 102
124#ifdef FIX_ALIGNMENT 103 .section .fixup,"ax"
125 /* align destination */ 10430: shll $6,%ecx
126 .p2align 4 105 addl %ecx,%edx
127.Lbad_alignment: 106 jmp 60f
128 movl $8,%r9d 10740: lea (%rdx,%rcx,8),%rdx
129 subl %ecx,%r9d 108 jmp 60f
130 movl %r9d,%ecx 10950: movl %ecx,%edx
131 cmpq %r9,%rdx 11060: sfence
132 jz .Lhandle_7 111 movl %r8d,%ecx
133 js .Lhandle_7 112 jmp copy_user_handle_tail
134.Lalign_1: 113 .previous
135.Ls11: movb (%rsi),%bl
136.Ld11: movb %bl,(%rdi)
137 incq %rsi
138 incq %rdi
139 decl %ecx
140 jnz .Lalign_1
141 subq %r9,%rdx
142 jmp .Lafter_bad_alignment
143#endif
144 114
145 /* table sorted by exception address */
146 .section __ex_table,"a" 115 .section __ex_table,"a"
147 .align 8 116 .quad 1b,30b
148 .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */ 117 .quad 2b,30b
149 .quad .Ls2,.Ls1e 118 .quad 3b,30b
150 .quad .Ls3,.Ls1e 119 .quad 4b,30b
151 .quad .Ls4,.Ls1e 120 .quad 5b,30b
152 .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes coped */ 121 .quad 6b,30b
153 .quad .Ld2,.Ls2e 122 .quad 7b,30b
154 .quad .Ld3,.Ls3e 123 .quad 8b,30b
155 .quad .Ld4,.Ls4e 124 .quad 9b,30b
156 .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */ 125 .quad 10b,30b
157 .quad .Ls6,.Ls5e 126 .quad 11b,30b
158 .quad .Ls7,.Ls5e 127 .quad 12b,30b
159 .quad .Ls8,.Ls5e 128 .quad 13b,30b
160 .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */ 129 .quad 14b,30b
161 .quad .Ld6,.Ls6e 130 .quad 15b,30b
162 .quad .Ld7,.Ls7e 131 .quad 16b,30b
163 .quad .Ld8,.Ls8e 132 .quad 18b,40b
164 .quad .Ls9,.Le_quad 133 .quad 19b,40b
165 .quad .Ld9,.Le_quad 134 .quad 21b,50b
166 .quad .Ls10,.Le_byte 135 .quad 22b,50b
167 .quad .Ld10,.Le_byte
168#ifdef FIX_ALIGNMENT
169 .quad .Ls11,.Lzero_rest
170 .quad .Ld11,.Lzero_rest
171#endif
172 .quad .Le5,.Le_zero
173 .previous 136 .previous
174
175 /* eax: zero, ebx: 64 */
176.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
177.Ls2e: addl $8,%eax
178.Ls3e: addl $8,%eax
179.Ls4e: addl $8,%eax
180.Ls5e: addl $8,%eax
181.Ls6e: addl $8,%eax
182.Ls7e: addl $8,%eax
183.Ls8e: addl $8,%eax
184 addq %rbx,%rdi /* +64 */
185 subq %rax,%rdi /* correct destination with computed offset */
186
187 shlq $6,%rdx /* loop counter * 64 (stride length) */
188 addq %rax,%rdx /* add offset to loopcnt */
189 andl $63,%ecx /* remaining bytes */
190 addq %rcx,%rdx /* add them */
191 jmp .Lzero_rest
192
193 /* exception on quad word loop in tail handling */
194 /* ecx: loopcnt/8, %edx: length, rdi: correct */
195.Le_quad:
196 shll $3,%ecx
197 andl $7,%edx
198 addl %ecx,%edx
199 /* edx: bytes to zero, rdi: dest, eax:zero */
200.Lzero_rest:
201 cmpl $0,(%rsp) /* zero flag set? */
202 jz .Le_zero
203 movq %rdx,%rcx
204.Le_byte:
205 xorl %eax,%eax
206.Le5: rep
207 stosb
208 /* when there is another exception while zeroing the rest just return */
209.Le_zero:
210 movq %rdx,%rax
211 jmp .Lende
212 CFI_ENDPROC 137 CFI_ENDPROC
213ENDPROC(__copy_user_nocache) 138ENDPROC(__copy_user_nocache)
214
215
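
The ALIGN_DESTINATION macro introduced above copies single bytes until %rdi is 8-byte aligned before the quadword loops take over; its callers only invoke it after checking that at least 8 bytes remain. A hedged C model of that alignment step (the function name is mine, not from the patch):

	#include <stdint.h>
	#include <stddef.h>

	/* Model of ALIGN_DESTINATION: advance dst/src by (8 - dst % 8) % 8 bytes,
	 * one byte at a time, and shrink len accordingly. The assembly relies on
	 * the caller having already ensured len >= 8, so no clamping is needed. */
	static void align_destination(unsigned char **dst, const unsigned char **src,
				      size_t *len)
	{
		size_t fixup = (8 - ((uintptr_t)*dst & 7)) & 7;	/* subl $8,%ecx ; negl %ecx */

		*len -= fixup;
		while (fixup--) {				/* byte loop at labels 100/101 */
			*(*dst)++ = *(*src)++;
		}
	}
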
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay.c
index ef691316f8b6..f4568605d7d5 100644
--- a/arch/x86/lib/delay_32.c
+++ b/arch/x86/lib/delay.c
@@ -29,7 +29,7 @@
29/* simple loop based delay: */ 29/* simple loop based delay: */
30static void delay_loop(unsigned long loops) 30static void delay_loop(unsigned long loops)
31{ 31{
32 __asm__ __volatile__( 32 asm volatile(
33 " test %0,%0 \n" 33 " test %0,%0 \n"
34 " jz 3f \n" 34 " jz 3f \n"
35 " jmp 1f \n" 35 " jmp 1f \n"
@@ -38,9 +38,9 @@ static void delay_loop(unsigned long loops)
38 "1: jmp 2f \n" 38 "1: jmp 2f \n"
39 39
40 ".align 16 \n" 40 ".align 16 \n"
41 "2: decl %0 \n" 41 "2: dec %0 \n"
42 " jnz 2b \n" 42 " jnz 2b \n"
43 "3: decl %0 \n" 43 "3: dec %0 \n"
44 44
45 : /* we don't need output */ 45 : /* we don't need output */
46 :"a" (loops) 46 :"a" (loops)
@@ -98,7 +98,7 @@ void use_tsc_delay(void)
98int __devinit read_current_timer(unsigned long *timer_val) 98int __devinit read_current_timer(unsigned long *timer_val)
99{ 99{
100 if (delay_fn == delay_tsc) { 100 if (delay_fn == delay_tsc) {
101 rdtscl(*timer_val); 101 rdtscll(*timer_val);
102 return 0; 102 return 0;
103 } 103 }
104 return -1; 104 return -1;
@@ -108,31 +108,30 @@ void __delay(unsigned long loops)
108{ 108{
109 delay_fn(loops); 109 delay_fn(loops);
110} 110}
111EXPORT_SYMBOL(__delay);
111 112
112inline void __const_udelay(unsigned long xloops) 113inline void __const_udelay(unsigned long xloops)
113{ 114{
114 int d0; 115 int d0;
115 116
116 xloops *= 4; 117 xloops *= 4;
117 __asm__("mull %0" 118 asm("mull %%edx"
118 :"=d" (xloops), "=&a" (d0) 119 :"=d" (xloops), "=&a" (d0)
119 :"1" (xloops), "0" 120 :"1" (xloops), "0"
120 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); 121 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
121 122
122 __delay(++xloops); 123 __delay(++xloops);
123} 124}
125EXPORT_SYMBOL(__const_udelay);
124 126
125void __udelay(unsigned long usecs) 127void __udelay(unsigned long usecs)
126{ 128{
127 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ 129 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
128} 130}
131EXPORT_SYMBOL(__udelay);
129 132
130void __ndelay(unsigned long nsecs) 133void __ndelay(unsigned long nsecs)
131{ 134{
132 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ 135 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
133} 136}
134
135EXPORT_SYMBOL(__delay);
136EXPORT_SYMBOL(__const_udelay);
137EXPORT_SYMBOL(__udelay);
138EXPORT_SYMBOL(__ndelay); 137EXPORT_SYMBOL(__ndelay);
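
The scaling constants kept in __udelay()/__ndelay() above are 2^32/10^6 and 2^32/10^9 rounded up, so multiplying by loops_per_jiffy * HZ and keeping the high 32 bits (the mull in __const_udelay) converts microseconds or nanoseconds into delay loops. A quick, hedged check of those constants:

	#include <stdio.h>

	int main(void)
	{
		/* 0x000010c7 ~= 2^32 / 1e6 and 0x00005 ~= 2^32 / 1e9, both rounded up */
		printf("2^32/1e6 = %.6f, patch uses 0x10c7 = %u\n", 4294967296.0 / 1e6, 0x10c7);
		printf("2^32/1e9 = %.6f, patch uses 0x5   = %u\n", 4294967296.0 / 1e9, 0x5);
		return 0;
	}
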
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
deleted file mode 100644
index 4c441be92641..000000000000
--- a/arch/x86/lib/delay_64.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * Precise Delay Loops for x86-64
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors.
9 */
10
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/timex.h>
14#include <linux/preempt.h>
15#include <linux/delay.h>
16#include <linux/init.h>
17
18#include <asm/delay.h>
19#include <asm/msr.h>
20
21#ifdef CONFIG_SMP
22#include <asm/smp.h>
23#endif
24
25int __devinit read_current_timer(unsigned long *timer_value)
26{
27 rdtscll(*timer_value);
28 return 0;
29}
30
31void __delay(unsigned long loops)
32{
33 unsigned bclock, now;
34 int cpu;
35
36 preempt_disable();
37 cpu = smp_processor_id();
38 rdtscl(bclock);
39 for (;;) {
40 rdtscl(now);
41 if ((now - bclock) >= loops)
42 break;
43
44 /* Allow RT tasks to run */
45 preempt_enable();
46 rep_nop();
47 preempt_disable();
48
49 /*
50 * It is possible that we moved to another CPU, and
51 * since TSC's are per-cpu we need to calculate
52 * that. The delay must guarantee that we wait "at
53 * least" the amount of time. Being moved to another
54 * CPU could make the wait longer but we just need to
55 * make sure we waited long enough. Rebalance the
56 * counter for this CPU.
57 */
58 if (unlikely(cpu != smp_processor_id())) {
59 loops -= (now - bclock);
60 cpu = smp_processor_id();
61 rdtscl(bclock);
62 }
63 }
64 preempt_enable();
65}
66EXPORT_SYMBOL(__delay);
67
68inline void __const_udelay(unsigned long xloops)
69{
70 __delay(((xloops * HZ *
71 cpu_data(raw_smp_processor_id()).loops_per_jiffy) >> 32) + 1);
72}
73EXPORT_SYMBOL(__const_udelay);
74
75void __udelay(unsigned long usecs)
76{
77 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
78}
79EXPORT_SYMBOL(__udelay);
80
81void __ndelay(unsigned long nsecs)
82{
83 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
84}
85EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser.S
index 5448876261f8..ad374003742f 100644
--- a/arch/x86/lib/getuser_64.S
+++ b/arch/x86/lib/getuser.S
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Copyright 1998 Linus Torvalds 4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen 5 * (C) Copyright 2005 Andi Kleen
6 * (C) Copyright 2008 Glauber Costa
6 * 7 *
7 * These functions have a non-standard call interface 8 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they 9 * to make them more efficient, especially as they
@@ -13,14 +14,13 @@
13/* 14/*
14 * __get_user_X 15 * __get_user_X
15 * 16 *
16 * Inputs: %rcx contains the address. 17 * Inputs: %[r|e]ax contains the address.
17 * The register is modified, but all changes are undone 18 * The register is modified, but all changes are undone
18 * before returning because the C code doesn't know about it. 19 * before returning because the C code doesn't know about it.
19 * 20 *
20 * Outputs: %rax is error code (0 or -EFAULT) 21 * Outputs: %[r|e]ax is error code (0 or -EFAULT)
21 * %rdx contains zero-extended value 22 * %[r|e]dx contains zero-extended value
22 * 23 *
23 * %r8 is destroyed.
24 * 24 *
25 * These functions should not modify any other registers, 25 * These functions should not modify any other registers,
26 * as they get called from within inline assembly. 26 * as they get called from within inline assembly.
@@ -32,78 +32,73 @@
32#include <asm/errno.h> 32#include <asm/errno.h>
33#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
34#include <asm/thread_info.h> 34#include <asm/thread_info.h>
35#include <asm/asm.h>
35 36
36 .text 37 .text
37ENTRY(__get_user_1) 38ENTRY(__get_user_1)
38 CFI_STARTPROC 39 CFI_STARTPROC
39 GET_THREAD_INFO(%r8) 40 GET_THREAD_INFO(%_ASM_DX)
40 cmpq threadinfo_addr_limit(%r8),%rcx 41 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
41 jae bad_get_user 42 jae bad_get_user
421: movzb (%rcx),%edx 431: movzb (%_ASM_AX),%edx
43 xorl %eax,%eax 44 xor %eax,%eax
44 ret 45 ret
45 CFI_ENDPROC 46 CFI_ENDPROC
46ENDPROC(__get_user_1) 47ENDPROC(__get_user_1)
47 48
48ENTRY(__get_user_2) 49ENTRY(__get_user_2)
49 CFI_STARTPROC 50 CFI_STARTPROC
50 GET_THREAD_INFO(%r8) 51 add $1,%_ASM_AX
51 addq $1,%rcx 52 jc bad_get_user
52 jc 20f 53 GET_THREAD_INFO(%_ASM_DX)
53 cmpq threadinfo_addr_limit(%r8),%rcx 54 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
54 jae 20f 55 jae bad_get_user
55 decq %rcx 562: movzwl -1(%_ASM_AX),%edx
562: movzwl (%rcx),%edx 57 xor %eax,%eax
57 xorl %eax,%eax
58 ret 58 ret
5920: decq %rcx
60 jmp bad_get_user
61 CFI_ENDPROC 59 CFI_ENDPROC
62ENDPROC(__get_user_2) 60ENDPROC(__get_user_2)
63 61
64ENTRY(__get_user_4) 62ENTRY(__get_user_4)
65 CFI_STARTPROC 63 CFI_STARTPROC
66 GET_THREAD_INFO(%r8) 64 add $3,%_ASM_AX
67 addq $3,%rcx 65 jc bad_get_user
68 jc 30f 66 GET_THREAD_INFO(%_ASM_DX)
69 cmpq threadinfo_addr_limit(%r8),%rcx 67 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
70 jae 30f 68 jae bad_get_user
71 subq $3,%rcx 693: mov -3(%_ASM_AX),%edx
723: movl (%rcx),%edx 70 xor %eax,%eax
73 xorl %eax,%eax
74 ret 71 ret
7530: subq $3,%rcx
76 jmp bad_get_user
77 CFI_ENDPROC 72 CFI_ENDPROC
78ENDPROC(__get_user_4) 73ENDPROC(__get_user_4)
79 74
75#ifdef CONFIG_X86_64
80ENTRY(__get_user_8) 76ENTRY(__get_user_8)
81 CFI_STARTPROC 77 CFI_STARTPROC
82 GET_THREAD_INFO(%r8) 78 add $7,%_ASM_AX
83 addq $7,%rcx 79 jc bad_get_user
84 jc 40f 80 GET_THREAD_INFO(%_ASM_DX)
85 cmpq threadinfo_addr_limit(%r8),%rcx 81 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
86 jae 40f 82 jae bad_get_user
87 subq $7,%rcx 834: movq -7(%_ASM_AX),%_ASM_DX
884: movq (%rcx),%rdx 84 xor %eax,%eax
89 xorl %eax,%eax
90 ret 85 ret
9140: subq $7,%rcx
92 jmp bad_get_user
93 CFI_ENDPROC 86 CFI_ENDPROC
94ENDPROC(__get_user_8) 87ENDPROC(__get_user_8)
88#endif
95 89
96bad_get_user: 90bad_get_user:
97 CFI_STARTPROC 91 CFI_STARTPROC
98 xorl %edx,%edx 92 xor %edx,%edx
99 movq $(-EFAULT),%rax 93 mov $(-EFAULT),%_ASM_AX
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102END(bad_get_user) 96END(bad_get_user)
103 97
104.section __ex_table,"a" 98.section __ex_table,"a"
105 .quad 1b,bad_get_user 99 _ASM_PTR 1b,bad_get_user
106 .quad 2b,bad_get_user 100 _ASM_PTR 2b,bad_get_user
107 .quad 3b,bad_get_user 101 _ASM_PTR 3b,bad_get_user
108 .quad 4b,bad_get_user 102#ifdef CONFIG_X86_64
109.previous 103 _ASM_PTR 4b,bad_get_user
104#endif
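
The unified getuser.S above keeps the non-standard calling convention its header comment describes: the user address arrives in %eax/%rax, the error code comes back in %eax, and the zero-extended value in %edx/%rdx. A caller would reach these stubs through inline asm roughly like the sketch below; this is a simplified, hedged illustration, not the exact uaccess.h wording:

	/* Illustrative only: invoking __get_user_4 under the convention
	 * documented above (address in, error code and value out). */
	#define example_get_user_4(x, ptr)				\
	({								\
		unsigned long __ret;					\
		unsigned long __val;					\
		asm volatile("call __get_user_4"			\
			     : "=a" (__ret), "=d" (__val)		\
			     : "0" (ptr)				\
			     : "cc");					\
		(x) = (unsigned int)__val;				\
		(int)__ret;						\
	})
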
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
deleted file mode 100644
index 6d84b53f12a2..000000000000
--- a/arch/x86/lib/getuser_32.S
+++ /dev/null
@@ -1,78 +0,0 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 *
6 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they
8 * return an error value in addition to the "real"
9 * return value.
10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
13#include <asm/thread_info.h>
14
15
16/*
17 * __get_user_X
18 *
19 * Inputs: %eax contains the address
20 *
21 * Outputs: %eax is error code (0 or -EFAULT)
22 * %edx contains zero-extended value
23 *
24 * These functions should not modify any other registers,
25 * as they get called from within inline assembly.
26 */
27
28.text
29ENTRY(__get_user_1)
30 CFI_STARTPROC
31 GET_THREAD_INFO(%edx)
32 cmpl TI_addr_limit(%edx),%eax
33 jae bad_get_user
341: movzbl (%eax),%edx
35 xorl %eax,%eax
36 ret
37 CFI_ENDPROC
38ENDPROC(__get_user_1)
39
40ENTRY(__get_user_2)
41 CFI_STARTPROC
42 addl $1,%eax
43 jc bad_get_user
44 GET_THREAD_INFO(%edx)
45 cmpl TI_addr_limit(%edx),%eax
46 jae bad_get_user
472: movzwl -1(%eax),%edx
48 xorl %eax,%eax
49 ret
50 CFI_ENDPROC
51ENDPROC(__get_user_2)
52
53ENTRY(__get_user_4)
54 CFI_STARTPROC
55 addl $3,%eax
56 jc bad_get_user
57 GET_THREAD_INFO(%edx)
58 cmpl TI_addr_limit(%edx),%eax
59 jae bad_get_user
603: movl -3(%eax),%edx
61 xorl %eax,%eax
62 ret
63 CFI_ENDPROC
64ENDPROC(__get_user_4)
65
66bad_get_user:
67 CFI_STARTPROC
68 xorl %edx,%edx
69 movl $-14,%eax
70 ret
71 CFI_ENDPROC
72END(bad_get_user)
73
74.section __ex_table,"a"
75 .long 1b,bad_get_user
76 .long 2b,bad_get_user
77 .long 3b,bad_get_user
78.previous
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index 57d043fa893e..d5a2b39f882b 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -30,10 +30,10 @@ static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
30 30
31 rv.msr_no = msr_no; 31 rv.msr_no = msr_no;
32 if (safe) { 32 if (safe) {
33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1); 33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
34 err = rv.err; 34 err = rv.err;
35 } else { 35 } else {
36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); 36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
37 } 37 }
38 *l = rv.l; 38 *l = rv.l;
39 *h = rv.h; 39 *h = rv.h;
@@ -64,10 +64,10 @@ static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
64 rv.l = l; 64 rv.l = l;
65 rv.h = h; 65 rv.h = h;
66 if (safe) { 66 if (safe) {
67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1); 67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
68 err = rv.err; 68 err = rv.err;
69 } else { 69 } else {
70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); 70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
71 } 71 }
72 72
73 return err; 73 return err;
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser.S
index f58fba109d18..36b0d15ae6e9 100644
--- a/arch/x86/lib/putuser_32.S
+++ b/arch/x86/lib/putuser.S
@@ -2,6 +2,8 @@
2 * __put_user functions. 2 * __put_user functions.
3 * 3 *
4 * (C) Copyright 2005 Linus Torvalds 4 * (C) Copyright 2005 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 * (C) Copyright 2008 Glauber Costa
5 * 7 *
6 * These functions have a non-standard call interface 8 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they 9 * to make them more efficient, especially as they
@@ -11,6 +13,8 @@
11#include <linux/linkage.h> 13#include <linux/linkage.h>
12#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
13#include <asm/thread_info.h> 15#include <asm/thread_info.h>
16#include <asm/errno.h>
17#include <asm/asm.h>
14 18
15 19
16/* 20/*
@@ -26,73 +30,68 @@
26 */ 30 */
27 31
28#define ENTER CFI_STARTPROC ; \ 32#define ENTER CFI_STARTPROC ; \
29 pushl %ebx ; \ 33 GET_THREAD_INFO(%_ASM_BX)
30 CFI_ADJUST_CFA_OFFSET 4 ; \ 34#define EXIT ret ; \
31 CFI_REL_OFFSET ebx, 0 ; \
32 GET_THREAD_INFO(%ebx)
33#define EXIT popl %ebx ; \
34 CFI_ADJUST_CFA_OFFSET -4 ; \
35 CFI_RESTORE ebx ; \
36 ret ; \
37 CFI_ENDPROC 35 CFI_ENDPROC
38 36
39.text 37.text
40ENTRY(__put_user_1) 38ENTRY(__put_user_1)
41 ENTER 39 ENTER
42 cmpl TI_addr_limit(%ebx),%ecx 40 cmp TI_addr_limit(%_ASM_BX),%_ASM_CX
43 jae bad_put_user 41 jae bad_put_user
441: movb %al,(%ecx) 421: movb %al,(%_ASM_CX)
45 xorl %eax,%eax 43 xor %eax,%eax
46 EXIT 44 EXIT
47ENDPROC(__put_user_1) 45ENDPROC(__put_user_1)
48 46
49ENTRY(__put_user_2) 47ENTRY(__put_user_2)
50 ENTER 48 ENTER
51 movl TI_addr_limit(%ebx),%ebx 49 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
52 subl $1,%ebx 50 sub $1,%_ASM_BX
53 cmpl %ebx,%ecx 51 cmp %_ASM_BX,%_ASM_CX
54 jae bad_put_user 52 jae bad_put_user
552: movw %ax,(%ecx) 532: movw %ax,(%_ASM_CX)
56 xorl %eax,%eax 54 xor %eax,%eax
57 EXIT 55 EXIT
58ENDPROC(__put_user_2) 56ENDPROC(__put_user_2)
59 57
60ENTRY(__put_user_4) 58ENTRY(__put_user_4)
61 ENTER 59 ENTER
62 movl TI_addr_limit(%ebx),%ebx 60 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
63 subl $3,%ebx 61 sub $3,%_ASM_BX
64 cmpl %ebx,%ecx 62 cmp %_ASM_BX,%_ASM_CX
65 jae bad_put_user 63 jae bad_put_user
663: movl %eax,(%ecx) 643: movl %eax,(%_ASM_CX)
67 xorl %eax,%eax 65 xor %eax,%eax
68 EXIT 66 EXIT
69ENDPROC(__put_user_4) 67ENDPROC(__put_user_4)
70 68
71ENTRY(__put_user_8) 69ENTRY(__put_user_8)
72 ENTER 70 ENTER
73 movl TI_addr_limit(%ebx),%ebx 71 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
74 subl $7,%ebx 72 sub $7,%_ASM_BX
75 cmpl %ebx,%ecx 73 cmp %_ASM_BX,%_ASM_CX
76 jae bad_put_user 74 jae bad_put_user
774: movl %eax,(%ecx) 754: mov %_ASM_AX,(%_ASM_CX)
785: movl %edx,4(%ecx) 76#ifdef CONFIG_X86_32
79 xorl %eax,%eax 775: movl %edx,4(%_ASM_CX)
78#endif
79 xor %eax,%eax
80 EXIT 80 EXIT
81ENDPROC(__put_user_8) 81ENDPROC(__put_user_8)
82 82
83bad_put_user: 83bad_put_user:
84 CFI_STARTPROC simple 84 CFI_STARTPROC
85 CFI_DEF_CFA esp, 2*4 85 movl $-EFAULT,%eax
86 CFI_OFFSET eip, -1*4
87 CFI_OFFSET ebx, -2*4
88 movl $-14,%eax
89 EXIT 86 EXIT
90END(bad_put_user) 87END(bad_put_user)
91 88
92.section __ex_table,"a" 89.section __ex_table,"a"
93 .long 1b,bad_put_user 90 _ASM_PTR 1b,bad_put_user
94 .long 2b,bad_put_user 91 _ASM_PTR 2b,bad_put_user
95 .long 3b,bad_put_user 92 _ASM_PTR 3b,bad_put_user
96 .long 4b,bad_put_user 93 _ASM_PTR 4b,bad_put_user
97 .long 5b,bad_put_user 94#ifdef CONFIG_X86_32
95 _ASM_PTR 5b,bad_put_user
96#endif
98.previous 97.previous
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
deleted file mode 100644
index 4989f5a8fa9b..000000000000
--- a/arch/x86/lib/putuser_64.S
+++ /dev/null
@@ -1,106 +0,0 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __put_user_X
15 *
16 * Inputs: %rcx contains the address
17 * %rdx contains new value
18 *
19 * Outputs: %rax is error code (0 or -EFAULT)
20 *
21 * %r8 is destroyed.
22 *
23 * These functions should not modify any other registers,
24 * as they get called from within inline assembly.
25 */
26
27#include <linux/linkage.h>
28#include <asm/dwarf2.h>
29#include <asm/page.h>
30#include <asm/errno.h>
31#include <asm/asm-offsets.h>
32#include <asm/thread_info.h>
33
34 .text
35ENTRY(__put_user_1)
36 CFI_STARTPROC
37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user
401: movb %dl,(%rcx)
41 xorl %eax,%eax
42 ret
43 CFI_ENDPROC
44ENDPROC(__put_user_1)
45
46ENTRY(__put_user_2)
47 CFI_STARTPROC
48 GET_THREAD_INFO(%r8)
49 addq $1,%rcx
50 jc 20f
51 cmpq threadinfo_addr_limit(%r8),%rcx
52 jae 20f
53 decq %rcx
542: movw %dx,(%rcx)
55 xorl %eax,%eax
56 ret
5720: decq %rcx
58 jmp bad_put_user
59 CFI_ENDPROC
60ENDPROC(__put_user_2)
61
62ENTRY(__put_user_4)
63 CFI_STARTPROC
64 GET_THREAD_INFO(%r8)
65 addq $3,%rcx
66 jc 30f
67 cmpq threadinfo_addr_limit(%r8),%rcx
68 jae 30f
69 subq $3,%rcx
703: movl %edx,(%rcx)
71 xorl %eax,%eax
72 ret
7330: subq $3,%rcx
74 jmp bad_put_user
75 CFI_ENDPROC
76ENDPROC(__put_user_4)
77
78ENTRY(__put_user_8)
79 CFI_STARTPROC
80 GET_THREAD_INFO(%r8)
81 addq $7,%rcx
82 jc 40f
83 cmpq threadinfo_addr_limit(%r8),%rcx
84 jae 40f
85 subq $7,%rcx
864: movq %rdx,(%rcx)
87 xorl %eax,%eax
88 ret
8940: subq $7,%rcx
90 jmp bad_put_user
91 CFI_ENDPROC
92ENDPROC(__put_user_8)
93
94bad_put_user:
95 CFI_STARTPROC
96 movq $(-EFAULT),%rax
97 ret
98 CFI_ENDPROC
99END(bad_put_user)
100
101.section __ex_table,"a"
102 .quad 1b,bad_put_user
103 .quad 2b,bad_put_user
104 .quad 3b,bad_put_user
105 .quad 4b,bad_put_user
106.previous
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644
index 000000000000..650b11e00ecc
--- /dev/null
+++ b/arch/x86/lib/thunk_32.S
@@ -0,0 +1,47 @@
1/*
2 * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
3 * Copyright 2008 by Steven Rostedt, Red Hat, Inc
4 * (inspired by Andi Kleen's thunk_64.S)
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */
7
8 #include <linux/linkage.h>
9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func
31 .globl \name
32\name:
33 pushl %eax
34 pushl %ecx
35 pushl %edx
36 /* Place EIP in the arg1 */
37 movl 3*4(%esp), %eax
38 call \func
39 popl %edx
40 popl %ecx
41 popl %eax
42 ret
43 .endm
44
45 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
46 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index e009251d4e9f..bf9a7d5a5428 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -2,6 +2,7 @@
2 * Save registers before calling assembly functions. This avoids 2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs. 3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs. 4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
5 * Subject to the GNU public license, v.2. No warranty of any kind. 6 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */ 7 */
7 8
@@ -42,8 +43,22 @@
42#endif 43#endif
43 44
44#ifdef CONFIG_TRACE_IRQFLAGS 45#ifdef CONFIG_TRACE_IRQFLAGS
45 thunk trace_hardirqs_on_thunk,trace_hardirqs_on 46 /* put return address in rdi (arg1) */
46 thunk trace_hardirqs_off_thunk,trace_hardirqs_off 47 .macro thunk_ra name,func
48 .globl \name
49\name:
50 CFI_STARTPROC
51 SAVE_ARGS
52 /* SAVE_ARGS pushes 9 elements */
53 /* the next element would be the rip */
54 movq 9*8(%rsp), %rdi
55 call \func
56 jmp restore
57 CFI_ENDPROC
58 .endm
59
60 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
61 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif 62#endif
48 63
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
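
Both thunk files now load the trampoline's own return address into the first-argument register before calling into the tracer, so trace_hardirqs_{on,off}_caller can attribute the irqs-on/off event to the real call site rather than to the thunk itself. In C terms the trampoline behaves roughly like the hedged model below; the real code stays in assembly precisely to avoid disturbing register allocation in the callers:

	/* Hedged C-level model of trace_hardirqs_on_thunk: forward the caller's
	 * resume address (what the asm loads from the stack) as arg1.
	 * The prototype is assumed from kernel convention. */
	extern void trace_hardirqs_on_caller(unsigned long caller_addr);

	static void trace_hardirqs_on_thunk_model(void)
	{
		trace_hardirqs_on_caller((unsigned long)__builtin_return_address(0));
	}
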
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0c89d1bb0287..f4df6e7c718b 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -158,3 +158,26 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le
158} 158}
159EXPORT_SYMBOL(copy_in_user); 159EXPORT_SYMBOL(copy_in_user);
160 160
161/*
162 * Try to copy last bytes and clear the rest if needed.
163 * Since protection fault in copy_from/to_user is not a normal situation,
164 * it is not necessary to optimize tail handling.
165 */
166unsigned long
167copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
168{
169 char c;
170 unsigned zero_len;
171
172 for (; len; --len) {
173 if (__get_user_nocheck(c, from++, sizeof(char)))
174 break;
175 if (__put_user_nocheck(c, to++, sizeof(char)))
176 break;
177 }
178
179 for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
180 if (__put_user_nocheck(c, to++, sizeof(char)))
181 break;
182 return len;
183}
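
copy_user_handle_tail() above is the common slow path the new fixup code jumps to: it retries byte by byte until the faulting byte, optionally zeroes whatever is left, and returns the number of bytes that were not copied. A hedged userspace model of that contract (the simulated fault position and names are mine):

	#include <stdio.h>
	#include <string.h>

	/* Model: copy until a (simulated) fault at byte 'fault_at', zero the rest
	 * when zerorest is set, and return how many bytes were left uncopied. */
	static unsigned int tail_model(char *to, const char *from, unsigned int len,
				       unsigned int zerorest, unsigned int fault_at)
	{
		unsigned int done = 0;

		while (done < len && done < fault_at) {	/* stands in for __get/__put_user faulting */
			to[done] = from[done];
			done++;
		}
		if (zerorest)
			memset(to + done, 0, len - done);
		return len - done;
	}

	int main(void)
	{
		char dst[8] = { 0 };

		printf("left=%u\n", tail_model(dst, "ABCDEFG", 7, 1, 4));	/* left=3 */
		return 0;
	}
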
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 2f5e277686b8..48278fa7d3de 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,6 +10,14 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13/*
14 * Any quirks to be performed to initialize timers/irqs/etc?
15 */
16int (*arch_time_init_quirk)(void);
17int (*arch_pre_intr_init_quirk)(void);
18int (*arch_intr_init_quirk)(void);
19int (*arch_trap_init_quirk)(void);
20
13#ifdef CONFIG_HOTPLUG_CPU 21#ifdef CONFIG_HOTPLUG_CPU
14#define DEFAULT_SEND_IPI (1) 22#define DEFAULT_SEND_IPI (1)
15#else 23#else
@@ -29,6 +37,10 @@ int no_broadcast=DEFAULT_SEND_IPI;
29 **/ 37 **/
30void __init pre_intr_init_hook(void) 38void __init pre_intr_init_hook(void)
31{ 39{
40 if (arch_pre_intr_init_quirk) {
41 if (arch_pre_intr_init_quirk())
42 return;
43 }
32 init_ISA_irqs(); 44 init_ISA_irqs();
33} 45}
34 46
@@ -52,6 +64,10 @@ static struct irqaction irq2 = {
52 **/ 64 **/
53void __init intr_init_hook(void) 65void __init intr_init_hook(void)
54{ 66{
67 if (arch_intr_init_quirk) {
68 if (arch_intr_init_quirk())
69 return;
70 }
55#ifdef CONFIG_X86_LOCAL_APIC 71#ifdef CONFIG_X86_LOCAL_APIC
56 apic_intr_init(); 72 apic_intr_init();
57#endif 73#endif
@@ -65,7 +81,7 @@ void __init intr_init_hook(void)
65 * 81 *
66 * Description: 82 * Description:
67 * generally used to activate any machine specific identification 83 * generally used to activate any machine specific identification
68 * routines that may be needed before setup_arch() runs. On VISWS 84 * routines that may be needed before setup_arch() runs. On Voyager
69 * this is used to get the board revision and type. 85 * this is used to get the board revision and type.
70 **/ 86 **/
71void __init pre_setup_arch_hook(void) 87void __init pre_setup_arch_hook(void)
@@ -81,6 +97,10 @@ void __init pre_setup_arch_hook(void)
81 **/ 97 **/
82void __init trap_init_hook(void) 98void __init trap_init_hook(void)
83{ 99{
100 if (arch_trap_init_quirk) {
101 if (arch_trap_init_quirk())
102 return;
103 }
84} 104}
85 105
86static struct irqaction irq0 = { 106static struct irqaction irq0 = {
@@ -99,6 +119,16 @@ static struct irqaction irq0 = {
99 **/ 119 **/
100void __init time_init_hook(void) 120void __init time_init_hook(void)
101{ 121{
122 if (arch_time_init_quirk) {
123 /*
124 * A nonzero return code does not mean failure, it means
125 * that the architecture quirk does not want any
126 * generic (timer) setup to be performed after this:
127 */
128 if (arch_time_init_quirk())
129 return;
130 }
131
102 irq0.mask = cpumask_of_cpu(0); 132 irq0.mask = cpumask_of_cpu(0);
103 setup_irq(0, &irq0); 133 setup_irq(0, &irq0);
104} 134}
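
The four arch_*_quirk function pointers added above let a platform override the default hooks: a board-specific init assigns its own routine, and a nonzero return tells the generic hook to skip its usual setup, as the comment in time_init_hook() spells out. A hedged sketch of how a platform would wire one up (all names below are illustrative, not taken from this patch):

	/* Illustrative platform hook: skip the generic irq0/timer setup and
	 * program a board-specific clock instead. */
	static void example_board_program_timer(void)
	{
		/* platform-specific timer programming would go here */
	}

	static int example_board_time_init(void)
	{
		example_board_program_timer();
		return 1;		/* nonzero: time_init_hook() does nothing further */
	}

	static void example_board_setup(void)
	{
		arch_time_init_quirk = example_board_time_init;
	}
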
diff --git a/arch/x86/mach-visws/Makefile b/arch/x86/mach-visws/Makefile
deleted file mode 100644
index 835fd96ad768..000000000000
--- a/arch/x86/mach-visws/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o traps.o reboot.o
6
7obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o
8obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
deleted file mode 100644
index a2fb78c0d154..000000000000
--- a/arch/x86/mach-visws/mpparse.c
+++ /dev/null
@@ -1,85 +0,0 @@
1
2#include <linux/init.h>
3#include <linux/smp.h>
4
5#include <asm/smp.h>
6#include <asm/io.h>
7
8#include "cobalt.h"
9#include "mach_apic.h"
10
11extern unsigned int __cpuinitdata maxcpus;
12
13/*
14 * The Visual Workstation is Intel MP compliant in the hardware
15 * sense, but it doesn't have a BIOS(-configuration table).
16 * No problem for Linux.
17 */
18
19static void __init MP_processor_info (struct mpc_config_processor *m)
20{
21 int ver, logical_apicid;
22 physid_mask_t apic_cpus;
23
24 if (!(m->mpc_cpuflag & CPU_ENABLED))
25 return;
26
27 logical_apicid = m->mpc_apicid;
28 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
29 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
30 m->mpc_apicid,
31 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
32 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
33 m->mpc_apicver);
34
35 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
36 boot_cpu_physical_apicid = m->mpc_apicid;
37
38 ver = m->mpc_apicver;
39 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
40 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
41 m->mpc_apicid, MAX_APICS);
42 return;
43 }
44
45 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
46 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
47 /*
48 * Validate version
49 */
50 if (ver == 0x0) {
51 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
52 "fixing up to 0x10. (tell your hw vendor)\n",
53 m->mpc_apicid);
54 ver = 0x10;
55 }
56 apic_version[m->mpc_apicid] = ver;
57}
58
59void __init find_smp_config(void)
60{
61 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
62 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
63
64 if (ncpus > CO_CPU_MAX) {
65 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
66 ncpus, mp);
67
68 ncpus = CO_CPU_MAX;
69 }
70
71 if (ncpus > maxcpus)
72 ncpus = maxcpus;
73
74#ifdef CONFIG_X86_LOCAL_APIC
75 smp_found_config = 1;
76#endif
77 while (ncpus--)
78 MP_processor_info(mp++);
79
80 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
81}
82
83void __init get_smp_config (void)
84{
85}
diff --git a/arch/x86/mach-visws/reboot.c b/arch/x86/mach-visws/reboot.c
deleted file mode 100644
index 99332abfad42..000000000000
--- a/arch/x86/mach-visws/reboot.c
+++ /dev/null
@@ -1,55 +0,0 @@
1#include <linux/module.h>
2#include <linux/smp.h>
3#include <linux/delay.h>
4
5#include <asm/io.h>
6#include "piix4.h"
7
8void (*pm_power_off)(void);
9EXPORT_SYMBOL(pm_power_off);
10
11void machine_shutdown(void)
12{
13#ifdef CONFIG_SMP
14 smp_send_stop();
15#endif
16}
17
18void machine_emergency_restart(void)
19{
20 /*
21 * Visual Workstations restart after this
22 * register is poked on the PIIX4
23 */
24 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
25}
26
27void machine_restart(char * __unused)
28{
29 machine_shutdown();
30 machine_emergency_restart();
31}
32
33void machine_power_off(void)
34{
35 unsigned short pm_status;
36 extern unsigned int pci_bus0;
37
38 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
39 outw(pm_status, PMSTS_PORT);
40
41 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
42
43 mdelay(10);
44
45#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
46 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
47
48 outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8);
49 outl(PIIX_SPECIAL_STOP, 0xCFC);
50}
51
52void machine_halt(void)
53{
54}
55
diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c
deleted file mode 100644
index d67868ec9b7f..000000000000
--- a/arch/x86/mach-visws/setup.c
+++ /dev/null
@@ -1,183 +0,0 @@
1/*
2 * Unmaintained SGI Visual Workstation support.
3 * Split out from setup.c by davej@suse.de
4 */
5
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/interrupt.h>
9#include <linux/module.h>
10
11#include <asm/fixmap.h>
12#include <asm/arch_hooks.h>
13#include <asm/io.h>
14#include <asm/e820.h>
15#include <asm/setup.h>
16#include "cobalt.h"
17#include "piix4.h"
18
19int no_broadcast;
20
21char visws_board_type = -1;
22char visws_board_rev = -1;
23
24void __init visws_get_board_type_and_rev(void)
25{
26 int raw;
27
28 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
29 >> PIIX_GPI_BD_SHIFT;
30 /*
31 * Get Board rev.
32 * First, we have to initialize the 307 part to allow us access
33 * to the GPIO registers. Let's map them at 0x0fc0 which is right
34 * after the PIIX4 PM section.
35 */
36 outb_p(SIO_DEV_SEL, SIO_INDEX);
37 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
38
39 outb_p(SIO_DEV_MSB, SIO_INDEX);
40 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
41
42 outb_p(SIO_DEV_LSB, SIO_INDEX);
43 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
44
45 outb_p(SIO_DEV_ENB, SIO_INDEX);
46 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
47
48 /*
49 * Now, we have to map the power management section to write
50 * a bit which enables access to the GPIO registers.
51 * What lunatic came up with this shit?
52 */
53 outb_p(SIO_DEV_SEL, SIO_INDEX);
54 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
55
56 outb_p(SIO_DEV_MSB, SIO_INDEX);
57 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
58
59 outb_p(SIO_DEV_LSB, SIO_INDEX);
60 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
61
62 outb_p(SIO_DEV_ENB, SIO_INDEX);
63 outb_p(1, SIO_DATA); /* Enable PM registers. */
64
65 /*
66 * Now, write the PM register which enables the GPIO registers.
67 */
68 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
69 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
70
71 /*
72 * Now, initialize the GPIO registers.
73 * We want them all to be inputs which is the
74 * power on default, so let's leave them alone.
75 * So, let's just read the board rev!
76 */
77 raw = inb_p(SIO_GP_DATA1);
78 raw &= 0x7f; /* 7 bits of valid board revision ID. */
79
80 if (visws_board_type == VISWS_320) {
81 if (raw < 0x6) {
82 visws_board_rev = 4;
83 } else if (raw < 0xc) {
84 visws_board_rev = 5;
85 } else {
86 visws_board_rev = 6;
87 }
88 } else if (visws_board_type == VISWS_540) {
89 visws_board_rev = 2;
90 } else {
91 visws_board_rev = raw;
92 }
93
94 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
95 (visws_board_type == VISWS_320 ? "320" :
96 (visws_board_type == VISWS_540 ? "540" :
97 "unknown")), visws_board_rev);
98}
99
100void __init pre_intr_init_hook(void)
101{
102 init_VISWS_APIC_irqs();
103}
104
105void __init intr_init_hook(void)
106{
107#ifdef CONFIG_X86_LOCAL_APIC
108 apic_intr_init();
109#endif
110}
111
112void __init pre_setup_arch_hook()
113{
114 visws_get_board_type_and_rev();
115}
116
117static struct irqaction irq0 = {
118 .handler = timer_interrupt,
119 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
120 .name = "timer",
121};
122
123void __init time_init_hook(void)
124{
125 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
126
127 /* Set the countdown value */
128 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
129
130 /* Start the timer */
131 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
132
133 /* Enable (unmask) the timer interrupt */
134 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
135
136 /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */
137 setup_irq(0, &irq0);
138}
139
140/* Hook for machine specific memory setup. */
141
142#define MB (1024 * 1024)
143
144unsigned long sgivwfb_mem_phys;
145unsigned long sgivwfb_mem_size;
146EXPORT_SYMBOL(sgivwfb_mem_phys);
147EXPORT_SYMBOL(sgivwfb_mem_size);
148
149long long mem_size __initdata = 0;
150
151char * __init machine_specific_memory_setup(void)
152{
153 long long gfx_mem_size = 8 * MB;
154
155 mem_size = boot_params.alt_mem_k;
156
157 if (!mem_size) {
158 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
159 mem_size = 128 * MB;
160 }
161
162 /*
163 * this hardcodes the graphics memory to 8 MB
164 * it really should be sized dynamically (or at least
165 * set as a boot param)
166 */
167 if (!sgivwfb_mem_size) {
168 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
169 sgivwfb_mem_size = 8 * MB;
170 }
171
172 /*
173 * Trim to nearest MB
174 */
175 sgivwfb_mem_size &= ~((1 << 20) - 1);
176 sgivwfb_mem_phys = mem_size - gfx_mem_size;
177
178 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
179 e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
180 e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
181
182 return "PROM";
183}
diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c
deleted file mode 100644
index bfac6ba10f8a..000000000000
--- a/arch/x86/mach-visws/traps.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* VISWS traps */
2
3#include <linux/sched.h>
4#include <linux/kernel.h>
5#include <linux/init.h>
6#include <linux/pci.h>
7#include <linux/pci_ids.h>
8
9#include <asm/io.h>
10#include <asm/arch_hooks.h>
11#include <asm/apic.h>
12#include "cobalt.h"
13#include "lithium.h"
14
15
16#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
17#define BCD (LI_INTB | LI_INTC | LI_INTD)
18#define ALLDEVS (A01234 | BCD)
19
20static __init void lithium_init(void)
21{
22 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
23 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
24
25 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
26 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
27 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
28 panic("This machine is not SGI Visual Workstation 320/540");
29 }
30
31 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
32 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
33 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
34 panic("This machine is not SGI Visual Workstation 320/540");
35 }
36
37 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
38 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
39}
40
41static __init void cobalt_init(void)
42{
43 /*
44 * On normal SMP PC this is used only with SMP, but we have to
45 * use it and set it up here to start the Cobalt clock
46 */
47 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
48 setup_local_APIC();
49 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
50 (unsigned int)apic_read(APIC_LVR),
51 (unsigned int)apic_read(APIC_ID));
52
53 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
54 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
55 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
56 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
57
58 /* Enable Cobalt APIC being careful to NOT change the ID! */
59 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
60
61 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
62 co_apic_read(CO_APIC_ID));
63}
64
65void __init trap_init_hook(void)
66{
67 lithium_init();
68 cobalt_init();
69}
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
deleted file mode 100644
index d8b2cfd85d92..000000000000
--- a/arch/x86/mach-visws/visws_apic.c
+++ /dev/null
@@ -1,296 +0,0 @@
1/*
2 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
3 *
4 * SGI Visual Workstation interrupt controller
5 *
6 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
7 * which serves as the main interrupt controller in the system. Non-legacy
8 * hardware in the system uses this controller directly. Legacy devices
9 * are connected to the PIIX4 which in turn has its 8259(s) connected to
10 * a of the Cobalt APIC entry.
11 *
12 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
13 *
14 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
15 */
16
17#include <linux/kernel_stat.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20
21#include <asm/io.h>
22#include <asm/apic.h>
23#include <asm/i8259.h>
24#include <asm/irq_vectors.h>
25
26#include "cobalt.h"
27
28static DEFINE_SPINLOCK(cobalt_lock);
29
30/*
31 * Set the given Cobalt APIC Redirection Table entry to point
32 * to the given IDT vector/index.
33 */
34static inline void co_apic_set(int entry, int irq)
35{
36 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
37 co_apic_write(CO_APIC_HI(entry), 0);
38}
39
40/*
41 * Cobalt (IO)-APIC functions to handle PCI devices.
42 */
43static inline int co_apic_ide0_hack(void)
44{
45 extern char visws_board_type;
46 extern char visws_board_rev;
47
48 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
49 return 5;
50 return CO_APIC_IDE0;
51}
52
53static int is_co_apic(unsigned int irq)
54{
55 if (IS_CO_APIC(irq))
56 return CO_APIC(irq);
57
58 switch (irq) {
59 case 0: return CO_APIC_CPU;
60 case CO_IRQ_IDE0: return co_apic_ide0_hack();
61 case CO_IRQ_IDE1: return CO_APIC_IDE1;
62 default: return -1;
63 }
64}
65
66
67/*
68 * This is the SGI Cobalt (IO-)APIC:
69 */
70
71static void enable_cobalt_irq(unsigned int irq)
72{
73 co_apic_set(is_co_apic(irq), irq);
74}
75
76static void disable_cobalt_irq(unsigned int irq)
77{
78 int entry = is_co_apic(irq);
79
80 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
81 co_apic_read(CO_APIC_LO(entry));
82}
83
84/*
85 * "irq" really just serves to identify the device. Here is where we
86 * map this to the Cobalt APIC entry where it's physically wired.
87 * This is called via request_irq -> setup_irq -> irq_desc->startup()
88 */
89static unsigned int startup_cobalt_irq(unsigned int irq)
90{
91 unsigned long flags;
92
93 spin_lock_irqsave(&cobalt_lock, flags);
94 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
95 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
96 enable_cobalt_irq(irq);
97 spin_unlock_irqrestore(&cobalt_lock, flags);
98 return 0;
99}
100
101static void ack_cobalt_irq(unsigned int irq)
102{
103 unsigned long flags;
104
105 spin_lock_irqsave(&cobalt_lock, flags);
106 disable_cobalt_irq(irq);
107 apic_write(APIC_EOI, APIC_EIO_ACK);
108 spin_unlock_irqrestore(&cobalt_lock, flags);
109}
110
111static void end_cobalt_irq(unsigned int irq)
112{
113 unsigned long flags;
114
115 spin_lock_irqsave(&cobalt_lock, flags);
116 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
117 enable_cobalt_irq(irq);
118 spin_unlock_irqrestore(&cobalt_lock, flags);
119}
120
121static struct irq_chip cobalt_irq_type = {
122 .typename = "Cobalt-APIC",
123 .startup = startup_cobalt_irq,
124 .shutdown = disable_cobalt_irq,
125 .enable = enable_cobalt_irq,
126 .disable = disable_cobalt_irq,
127 .ack = ack_cobalt_irq,
128 .end = end_cobalt_irq,
129};
130
131
132/*
133 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
134 * -- not the manner expected by the code in i8259.c.
135 *
136 * there is a 'master' physical interrupt source that gets sent to
137 * the CPU. But in the chipset there are various 'virtual' interrupts
138 * waiting to be handled. We represent this to Linux through a 'master'
139 * interrupt controller type, and through a special virtual interrupt-
140 * controller. Device drivers only see the virtual interrupt sources.
141 */
142static unsigned int startup_piix4_master_irq(unsigned int irq)
143{
144 init_8259A(0);
145
146 return startup_cobalt_irq(irq);
147}
148
149static void end_piix4_master_irq(unsigned int irq)
150{
151 unsigned long flags;
152
153 spin_lock_irqsave(&cobalt_lock, flags);
154 enable_cobalt_irq(irq);
155 spin_unlock_irqrestore(&cobalt_lock, flags);
156}
157
158static struct irq_chip piix4_master_irq_type = {
159 .typename = "PIIX4-master",
160 .startup = startup_piix4_master_irq,
161 .ack = ack_cobalt_irq,
162 .end = end_piix4_master_irq,
163};
164
165
166static struct irq_chip piix4_virtual_irq_type = {
167 .typename = "PIIX4-virtual",
168 .shutdown = disable_8259A_irq,
169 .enable = enable_8259A_irq,
170 .disable = disable_8259A_irq,
171};
172
173
174/*
175 * PIIX4-8259 master/virtual functions to handle interrupt requests
176 * from legacy devices: floppy, parallel, serial, rtc.
177 *
178 * None of these get Cobalt APIC entries, neither do they have IDT
179 * entries. These interrupts are purely virtual and distributed from
180 * the 'master' interrupt source: CO_IRQ_8259.
181 *
182 * When the 8259 interrupts its handler figures out which of these
183 * devices is interrupting and dispatches to its handler.
184 *
185 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
186 * enable_irq gets the right irq. This 'master' irq is never directly
187 * manipulated by any driver.
188 */
189static irqreturn_t piix4_master_intr(int irq, void *dev_id)
190{
191 int realirq;
192 irq_desc_t *desc;
193 unsigned long flags;
194
195 spin_lock_irqsave(&i8259A_lock, flags);
196
197 /* Find out what's interrupting in the PIIX4 master 8259 */
198 outb(0x0c, 0x20); /* OCW3 Poll command */
199 realirq = inb(0x20);
200
201 /*
202 * Bit 7 == 0 means invalid/spurious
203 */
204 if (unlikely(!(realirq & 0x80)))
205 goto out_unlock;
206
207 realirq &= 7;
208
209 if (unlikely(realirq == 2)) {
210 outb(0x0c, 0xa0);
211 realirq = inb(0xa0);
212
213 if (unlikely(!(realirq & 0x80)))
214 goto out_unlock;
215
216 realirq = (realirq & 7) + 8;
217 }
218
219 /* mask and ack interrupt */
220 cached_irq_mask |= 1 << realirq;
221 if (unlikely(realirq > 7)) {
222 inb(0xa1);
223 outb(cached_slave_mask, 0xa1);
224 outb(0x60 + (realirq & 7), 0xa0);
225 outb(0x60 + 2, 0x20);
226 } else {
227 inb(0x21);
228 outb(cached_master_mask, 0x21);
229 outb(0x60 + realirq, 0x20);
230 }
231
232 spin_unlock_irqrestore(&i8259A_lock, flags);
233
234 desc = irq_desc + realirq;
235
236 /*
237 * handle this 'virtual interrupt' as a Cobalt one now.
238 */
239 kstat_cpu(smp_processor_id()).irqs[realirq]++;
240
241 if (likely(desc->action != NULL))
242 handle_IRQ_event(realirq, desc->action);
243
244 if (!(desc->status & IRQ_DISABLED))
245 enable_8259A_irq(realirq);
246
247 return IRQ_HANDLED;
248
249out_unlock:
250 spin_unlock_irqrestore(&i8259A_lock, flags);
251 return IRQ_NONE;
252}
253
254static struct irqaction master_action = {
255 .handler = piix4_master_intr,
256 .name = "PIIX4-8259",
257};
258
259static struct irqaction cascade_action = {
260 .handler = no_action,
261 .name = "cascade",
262};
263
264
265void init_VISWS_APIC_irqs(void)
266{
267 int i;
268
269 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
270 irq_desc[i].status = IRQ_DISABLED;
271 irq_desc[i].action = 0;
272 irq_desc[i].depth = 1;
273
274 if (i == 0) {
275 irq_desc[i].chip = &cobalt_irq_type;
276 }
277 else if (i == CO_IRQ_IDE0) {
278 irq_desc[i].chip = &cobalt_irq_type;
279 }
280 else if (i == CO_IRQ_IDE1) {
281 irq_desc[i].chip = &cobalt_irq_type;
282 }
283 else if (i == CO_IRQ_8259) {
284 irq_desc[i].chip = &piix4_master_irq_type;
285 }
286 else if (i < CO_IRQ_APIC0) {
287 irq_desc[i].chip = &piix4_virtual_irq_type;
288 }
289 else if (IS_CO_APIC(i)) {
290 irq_desc[i].chip = &cobalt_irq_type;
291 }
292 }
293
294 setup_irq(CO_IRQ_8259, &master_action);
295 setup_irq(2, &cascade_action);
296}
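The piix4_master_intr() handler above relies on the classic i8259 OCW3 poll sequence to identify the pending source behind the single CO_IRQ_8259 line. A minimal standalone sketch of that sequence (illustrative only, not part of the patch; assumes <asm/io.h> for inb/outb):

static int i8259_poll_irq(void)
{
	int irq;

	outb(0x0c, 0x20);		/* OCW3: poll command on the master PIC */
	irq = inb(0x20);		/* bit 7 set => an IRQ is pending */
	if (!(irq & 0x80))
		return -1;		/* spurious */
	irq &= 7;
	if (irq == 2) {			/* IRQ2 is the cascade: poll the slave */
		outb(0x0c, 0xa0);
		irq = inb(0xa0);
		if (!(irq & 0x80))
			return -1;
		irq = (irq & 7) + 8;
	}
	return irq;			/* 0-15, as on a PC/AT */
}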
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 8dedd01e909f..ee0fba092157 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -950,94 +950,24 @@ static void smp_stop_cpu_function(void *dummy)
950 halt(); 950 halt();
951} 951}
952 952
953static DEFINE_SPINLOCK(call_lock);
954
955struct call_data_struct {
956 void (*func) (void *info);
957 void *info;
958 volatile unsigned long started;
959 volatile unsigned long finished;
960 int wait;
961};
962
963static struct call_data_struct *call_data;
964
965/* execute a thread on a new CPU. The function to be called must be 953/* execute a thread on a new CPU. The function to be called must be
966 * previously set up. This is used to schedule a function for 954 * previously set up. This is used to schedule a function for
967 * execution on all CPUs - set up the function then broadcast a 955 * execution on all CPUs - set up the function then broadcast a
968 * function_interrupt CPI to come here on each CPU */ 956 * function_interrupt CPI to come here on each CPU */
969static void smp_call_function_interrupt(void) 957static void smp_call_function_interrupt(void)
970{ 958{
971 void (*func) (void *info) = call_data->func;
972 void *info = call_data->info;
973 /* must take copy of wait because call_data may be replaced
974 * unless the function is waiting for us to finish */
975 int wait = call_data->wait;
976 __u8 cpu = smp_processor_id();
977
978 /*
979 * Notify initiating CPU that I've grabbed the data and am
980 * about to execute the function
981 */
982 mb();
983 if (!test_and_clear_bit(cpu, &call_data->started)) {
984 /* If the bit wasn't set, this could be a replay */
985 printk(KERN_WARNING "VOYAGER SMP: CPU %d received call function"
986 " with no call pending\n", cpu);
987 return;
988 }
989 /*
990 * At this point the info structure may be out of scope unless wait==1
991 */
992 irq_enter(); 959 irq_enter();
993 (*func) (info); 960 generic_smp_call_function_interrupt();
994 __get_cpu_var(irq_stat).irq_call_count++; 961 __get_cpu_var(irq_stat).irq_call_count++;
995 irq_exit(); 962 irq_exit();
996 if (wait) {
997 mb();
998 clear_bit(cpu, &call_data->finished);
999 }
1000} 963}
1001 964
1002static int 965static void smp_call_function_single_interrupt(void)
1003voyager_smp_call_function_mask(cpumask_t cpumask,
1004 void (*func) (void *info), void *info, int wait)
1005{ 966{
1006 struct call_data_struct data; 967 irq_enter();
1007 u32 mask = cpus_addr(cpumask)[0]; 968 generic_smp_call_function_single_interrupt();
1008 969 __get_cpu_var(irq_stat).irq_call_count++;
1009 mask &= ~(1 << smp_processor_id()); 970 irq_exit();
1010
1011 if (!mask)
1012 return 0;
1013
1014 /* Can deadlock when called with interrupts disabled */
1015 WARN_ON(irqs_disabled());
1016
1017 data.func = func;
1018 data.info = info;
1019 data.started = mask;
1020 data.wait = wait;
1021 if (wait)
1022 data.finished = mask;
1023
1024 spin_lock(&call_lock);
1025 call_data = &data;
1026 wmb();
1027 /* Send a message to all other CPUs and wait for them to respond */
1028 send_CPI(mask, VIC_CALL_FUNCTION_CPI);
1029
1030 /* Wait for response */
1031 while (data.started)
1032 barrier();
1033
1034 if (wait)
1035 while (data.finished)
1036 barrier();
1037
1038 spin_unlock(&call_lock);
1039
1040 return 0;
1041} 971}
1042 972
1043/* Sorry about the name. In an APIC based system, the APICs 973/* Sorry about the name. In an APIC based system, the APICs
@@ -1094,6 +1024,12 @@ void smp_qic_call_function_interrupt(struct pt_regs *regs)
1094 smp_call_function_interrupt(); 1024 smp_call_function_interrupt();
1095} 1025}
1096 1026
1027void smp_qic_call_function_single_interrupt(struct pt_regs *regs)
1028{
1029 ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI);
1030 smp_call_function_single_interrupt();
1031}
1032
1097void smp_vic_cpi_interrupt(struct pt_regs *regs) 1033void smp_vic_cpi_interrupt(struct pt_regs *regs)
1098{ 1034{
1099 struct pt_regs *old_regs = set_irq_regs(regs); 1035 struct pt_regs *old_regs = set_irq_regs(regs);
@@ -1114,6 +1050,8 @@ void smp_vic_cpi_interrupt(struct pt_regs *regs)
1114 smp_enable_irq_interrupt(); 1050 smp_enable_irq_interrupt();
1115 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) 1051 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1116 smp_call_function_interrupt(); 1052 smp_call_function_interrupt();
1053 if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu]))
1054 smp_call_function_single_interrupt();
1117 set_irq_regs(old_regs); 1055 set_irq_regs(old_regs);
1118} 1056}
1119 1057
@@ -1129,7 +1067,7 @@ static void do_flush_tlb_all(void *info)
1129/* flush the TLB of every active CPU in the system */ 1067/* flush the TLB of every active CPU in the system */
1130void flush_tlb_all(void) 1068void flush_tlb_all(void)
1131{ 1069{
1132 on_each_cpu(do_flush_tlb_all, 0, 1, 1); 1070 on_each_cpu(do_flush_tlb_all, 0, 1);
1133} 1071}
1134 1072
1135/* send a reschedule CPI to one CPU by physical CPU number*/ 1073/* send a reschedule CPI to one CPU by physical CPU number*/
@@ -1161,7 +1099,7 @@ int safe_smp_processor_id(void)
1161/* broadcast a halt to all other CPUs */ 1099/* broadcast a halt to all other CPUs */
1162static void voyager_smp_send_stop(void) 1100static void voyager_smp_send_stop(void)
1163{ 1101{
1164 smp_call_function(smp_stop_cpu_function, NULL, 1, 1); 1102 smp_call_function(smp_stop_cpu_function, NULL, 1);
1165} 1103}
1166 1104
1167/* this function is triggered in time.c when a clock tick fires 1105/* this function is triggered in time.c when a clock tick fires
@@ -1848,5 +1786,7 @@ struct smp_ops smp_ops = {
1848 1786
1849 .smp_send_stop = voyager_smp_send_stop, 1787 .smp_send_stop = voyager_smp_send_stop,
1850 .smp_send_reschedule = voyager_smp_send_reschedule, 1788 .smp_send_reschedule = voyager_smp_send_reschedule,
1851 .smp_call_function_mask = voyager_smp_call_function_mask, 1789
1790 .send_call_func_ipi = native_send_call_func_ipi,
1791 .send_call_func_single_ipi = native_send_call_func_single_ipi,
1852}; 1792};
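The hunks above retire Voyager's private call_data/call_lock machinery in favour of the generic SMP call-function path, and callers switch to the three-argument smp_call_function()/on_each_cpu() form (the old retry/nonatomic argument is gone, as the flush_tlb_all and send_stop hunks show). A minimal sketch of a caller under the new convention, illustrative only and not part of the patch; remote_hello() is a made-up callback and <linux/smp.h> is assumed:

static void remote_hello(void *info)
{
	/* runs on each targeted CPU, in IPI context */
	pr_info("hello from CPU %d\n", smp_processor_id());
}

static void broadcast_hello(void)
{
	/* func, info, wait: call from process context with IRQs enabled */
	smp_call_function(remote_hello, NULL, 1);

	/* or include the calling CPU as well */
	on_each_cpu(remote_hello, NULL, 1);
}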
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c107641cd39b..9873716e9f76 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,6 +8,11 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15
11ifeq ($(CONFIG_X86_32),y) 16ifeq ($(CONFIG_X86_32),y)
12obj-$(CONFIG_NUMA) += discontig_32.o 17obj-$(CONFIG_NUMA) += discontig_32.o
13else 18else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d0f5fce77d95..455f3fe67b42 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -49,6 +50,16 @@
49#define PF_RSVD (1<<3) 50#define PF_RSVD (1<<3)
50#define PF_INSTR (1<<4) 51#define PF_INSTR (1<<4)
51 52
53static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
54{
55#ifdef CONFIG_MMIOTRACE_HOOKS
56 if (unlikely(is_kmmio_active()))
57 if (kmmio_handler(regs, addr) == 1)
58 return -1;
59#endif
60 return 0;
61}
62
52static inline int notify_page_fault(struct pt_regs *regs) 63static inline int notify_page_fault(struct pt_regs *regs)
53{ 64{
54#ifdef CONFIG_KPROBES 65#ifdef CONFIG_KPROBES
@@ -598,6 +609,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
598 609
599 if (notify_page_fault(regs)) 610 if (notify_page_fault(regs))
600 return; 611 return;
612 if (unlikely(kmmio_fault(regs, address)))
613 return;
601 614
602 /* 615 /*
603 * We fault-in kernel-space virtual memory on-demand. The 616 * We fault-in kernel-space virtual memory on-demand. The
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b5a0fd5f4c5f..9689a5138e64 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
50 50
51unsigned int __VMALLOC_RESERVE = 128 << 20; 51unsigned int __VMALLOC_RESERVE = 128 << 20;
52 52
53unsigned long max_low_pfn_mapped;
53unsigned long max_pfn_mapped; 54unsigned long max_pfn_mapped;
54 55
55DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 56DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -1034,6 +1035,8 @@ void mark_rodata_ro(void)
1034 unsigned long start = PFN_ALIGN(_text); 1035 unsigned long start = PFN_ALIGN(_text);
1035 unsigned long size = PFN_ALIGN(_etext) - start; 1036 unsigned long size = PFN_ALIGN(_etext) - start;
1036 1037
1038#ifndef CONFIG_DYNAMIC_FTRACE
1039 /* Dynamic tracing modifies the kernel text section */
1037 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1040 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1038 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1041 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
1039 size >> 10); 1042 size >> 10);
@@ -1046,6 +1049,8 @@ void mark_rodata_ro(void)
1046 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1049 printk(KERN_INFO "Testing CPA: write protecting again\n");
1047 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1050 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
1048#endif 1051#endif
1052#endif /* CONFIG_DYNAMIC_FTRACE */
1053
1049 start += size; 1054 start += size;
1050 size = (unsigned long)__end_rodata - start; 1055 size = (unsigned long)__end_rodata - start;
1051 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1056 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 77d129d62c97..306049edd553 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@
53 * The direct mapping extends to max_pfn_mapped, so that we can directly access 53 * The direct mapping extends to max_pfn_mapped, so that we can directly access
54 * apertures, ACPI and other tables without having to play with fixmaps. 54 * apertures, ACPI and other tables without having to play with fixmaps.
55 */ 55 */
56unsigned long max_low_pfn_mapped;
56unsigned long max_pfn_mapped; 57unsigned long max_pfn_mapped;
57 58
58static unsigned long dma_reserve __initdata; 59static unsigned long dma_reserve __initdata;
@@ -202,6 +203,46 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
202} 203}
203 204
204/* 205/*
206 * Create large page table mappings for a range of physical addresses.
207 */
208static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
209 pgprot_t prot)
210{
211 pgd_t *pgd;
212 pud_t *pud;
213 pmd_t *pmd;
214
215 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
216 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
217 pgd = pgd_offset_k((unsigned long)__va(phys));
218 if (pgd_none(*pgd)) {
219 pud = (pud_t *) spp_getpage();
220 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
221 _PAGE_USER));
222 }
223 pud = pud_offset(pgd, (unsigned long)__va(phys));
224 if (pud_none(*pud)) {
225 pmd = (pmd_t *) spp_getpage();
226 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
227 _PAGE_USER));
228 }
229 pmd = pmd_offset(pud, phys);
230 BUG_ON(!pmd_none(*pmd));
231 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
232 }
233}
234
235void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
236{
237 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
238}
239
240void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
241{
242 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
243}
244
245/*
205 * The head.S code sets up the kernel high mapping: 246 * The head.S code sets up the kernel high mapping:
206 * 247 *
207 * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) 248 * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
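The new init_extra_mapping_wb()/init_extra_mapping_uc() helpers introduced above build 2MB kernel mappings for a physical range; note the BUG_ON requiring the base and size to be PMD aligned. A hedged sketch of a caller, not part of the patch, with made-up UV_MMIO_BASE/UV_MMIO_SIZE values standing in for a real platform window:

/* illustrative only; both values must be 2MB aligned */
#define UV_MMIO_BASE	0xf8000000UL	/* hypothetical MMIO window */
#define UV_MMIO_SIZE	(2UL << 20)	/* one 2MB large page */

static void __init map_platform_mmio(void)
{
	init_extra_mapping_uc(UV_MMIO_BASE, UV_MMIO_SIZE);
}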
@@ -262,11 +303,13 @@ static __meminit void unmap_low_page(void *adr)
262 early_iounmap(adr, PAGE_SIZE); 303 early_iounmap(adr, PAGE_SIZE);
263} 304}
264 305
265static void __meminit 306static unsigned long __meminit
266phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) 307phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
267{ 308{
268 unsigned pages = 0; 309 unsigned pages = 0;
310 unsigned long last_map_addr = end;
269 int i; 311 int i;
312
270 pte_t *pte = pte_page + pte_index(addr); 313 pte_t *pte = pte_page + pte_index(addr);
271 314
272 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { 315 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
@@ -286,23 +329,28 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
286 printk(" pte=%p addr=%lx pte=%016lx\n", 329 printk(" pte=%p addr=%lx pte=%016lx\n",
287 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 330 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
288 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); 331 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
332 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
289 pages++; 333 pages++;
290 } 334 }
291 update_page_count(PG_LEVEL_4K, pages); 335 update_page_count(PG_LEVEL_4K, pages);
336
337 return last_map_addr;
292} 338}
293 339
294static void __meminit 340static unsigned long __meminit
295phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) 341phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
296{ 342{
297 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); 343 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
298 344
299 phys_pte_init(pte, address, end); 345 return phys_pte_init(pte, address, end);
300} 346}
301 347
302static unsigned long __meminit 348static unsigned long __meminit
303phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 349phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
350 unsigned long page_size_mask)
304{ 351{
305 unsigned long pages = 0; 352 unsigned long pages = 0;
353 unsigned long last_map_addr = end;
306 354
307 int i = pmd_index(address); 355 int i = pmd_index(address);
308 356
@@ -321,42 +369,46 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
321 369
322 if (pmd_val(*pmd)) { 370 if (pmd_val(*pmd)) {
323 if (!pmd_large(*pmd)) 371 if (!pmd_large(*pmd))
324 phys_pte_update(pmd, address, end); 372 last_map_addr = phys_pte_update(pmd, address,
373 end);
325 continue; 374 continue;
326 } 375 }
327 376
328 if (cpu_has_pse) { 377 if (page_size_mask & (1<<PG_LEVEL_2M)) {
329 pages++; 378 pages++;
330 set_pte((pte_t *)pmd, 379 set_pte((pte_t *)pmd,
331 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 380 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
381 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
332 continue; 382 continue;
333 } 383 }
334 384
335 pte = alloc_low_page(&pte_phys); 385 pte = alloc_low_page(&pte_phys);
336 phys_pte_init(pte, address, end); 386 last_map_addr = phys_pte_init(pte, address, end);
337 unmap_low_page(pte); 387 unmap_low_page(pte);
338 388
339 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); 389 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
340 } 390 }
341 update_page_count(PG_LEVEL_2M, pages); 391 update_page_count(PG_LEVEL_2M, pages);
342 return address; 392 return last_map_addr;
343} 393}
344 394
345static unsigned long __meminit 395static unsigned long __meminit
346phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 396phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
397 unsigned long page_size_mask)
347{ 398{
348 pmd_t *pmd = pmd_offset(pud, 0); 399 pmd_t *pmd = pmd_offset(pud, 0);
349 unsigned long last_map_addr; 400 unsigned long last_map_addr;
350 401
351 spin_lock(&init_mm.page_table_lock); 402 spin_lock(&init_mm.page_table_lock);
352 last_map_addr = phys_pmd_init(pmd, address, end); 403 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
353 spin_unlock(&init_mm.page_table_lock); 404 spin_unlock(&init_mm.page_table_lock);
354 __flush_tlb_all(); 405 __flush_tlb_all();
355 return last_map_addr; 406 return last_map_addr;
356} 407}
357 408
358static unsigned long __meminit 409static unsigned long __meminit
359phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 410phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
411 unsigned long page_size_mask)
360{ 412{
361 unsigned long pages = 0; 413 unsigned long pages = 0;
362 unsigned long last_map_addr = end; 414 unsigned long last_map_addr = end;
@@ -378,11 +430,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
378 430
379 if (pud_val(*pud)) { 431 if (pud_val(*pud)) {
380 if (!pud_large(*pud)) 432 if (!pud_large(*pud))
381 last_map_addr = phys_pmd_update(pud, addr, end); 433 last_map_addr = phys_pmd_update(pud, addr, end,
434 page_size_mask);
382 continue; 435 continue;
383 } 436 }
384 437
385 if (direct_gbpages) { 438 if (page_size_mask & (1<<PG_LEVEL_1G)) {
386 pages++; 439 pages++;
387 set_pte((pte_t *)pud, 440 set_pte((pte_t *)pud,
388 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 441 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -393,7 +446,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
393 pmd = alloc_low_page(&pmd_phys); 446 pmd = alloc_low_page(&pmd_phys);
394 447
395 spin_lock(&init_mm.page_table_lock); 448 spin_lock(&init_mm.page_table_lock);
396 last_map_addr = phys_pmd_init(pmd, addr, end); 449 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
397 unmap_low_page(pmd); 450 unmap_low_page(pmd);
398 pud_populate(&init_mm, pud, __va(pmd_phys)); 451 pud_populate(&init_mm, pud, __va(pmd_phys));
399 spin_unlock(&init_mm.page_table_lock); 452 spin_unlock(&init_mm.page_table_lock);
@@ -406,29 +459,37 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
406} 459}
407 460
408static unsigned long __meminit 461static unsigned long __meminit
409phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end) 462phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
463 unsigned long page_size_mask)
410{ 464{
411 pud_t *pud; 465 pud_t *pud;
412 466
413 pud = (pud_t *)pgd_page_vaddr(*pgd); 467 pud = (pud_t *)pgd_page_vaddr(*pgd);
414 468
415 return phys_pud_init(pud, addr, end); 469 return phys_pud_init(pud, addr, end, page_size_mask);
416} 470}
417 471
418static void __init find_early_table_space(unsigned long end) 472static void __init find_early_table_space(unsigned long end)
419{ 473{
420 unsigned long puds, tables, start; 474 unsigned long puds, pmds, ptes, tables, start;
421 475
422 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 476 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
423 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 477 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
424 if (!direct_gbpages) { 478 if (direct_gbpages) {
425 unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 479 unsigned long extra;
426 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 480 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
427 } 481 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
428 if (!cpu_has_pse) { 482 } else
429 unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 483 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
430 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); 484 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
431 } 485
486 if (cpu_has_pse) {
487 unsigned long extra;
488 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
489 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
490 } else
491 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
492 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
432 493
433 /* 494 /*
434 * RED-PEN putting page tables only on node 0 could 495 * RED-PEN putting page tables only on node 0 could
@@ -568,29 +629,12 @@ static void __init early_memtest(unsigned long start, unsigned long end)
568} 629}
569#endif 630#endif
570 631
571/* 632static unsigned long __init kernel_physical_mapping_init(unsigned long start,
572 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 633 unsigned long end,
573 * This runs before bootmem is initialized and gets pages directly from 634 unsigned long page_size_mask)
574 * the physical memory. To access them they are temporarily mapped.
575 */
576unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
577{ 635{
578 unsigned long next, last_map_addr = end;
579 unsigned long start_phys = start, end_phys = end;
580 636
581 printk(KERN_INFO "init_memory_mapping\n"); 637 unsigned long next, last_map_addr = end;
582
583 /*
584 * Find space for the kernel direct mapping tables.
585 *
586 * Later we should allocate these tables in the local node of the
587 * memory mapped. Unfortunately this is done currently before the
588 * nodes are discovered.
589 */
590 if (!after_bootmem) {
591 init_gbpages();
592 find_early_table_space(end);
593 }
594 638
595 start = (unsigned long)__va(start); 639 start = (unsigned long)__va(start);
596 end = (unsigned long)__va(end); 640 end = (unsigned long)__va(end);
@@ -600,12 +644,13 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
600 unsigned long pud_phys; 644 unsigned long pud_phys;
601 pud_t *pud; 645 pud_t *pud;
602 646
603 next = start + PGDIR_SIZE; 647 next = (start + PGDIR_SIZE) & PGDIR_MASK;
604 if (next > end) 648 if (next > end)
605 next = end; 649 next = end;
606 650
607 if (pgd_val(*pgd)) { 651 if (pgd_val(*pgd)) {
608 last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end)); 652 last_map_addr = phys_pud_update(pgd, __pa(start),
653 __pa(end), page_size_mask);
609 continue; 654 continue;
610 } 655 }
611 656
@@ -614,22 +659,151 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
614 else 659 else
615 pud = alloc_low_page(&pud_phys); 660 pud = alloc_low_page(&pud_phys);
616 661
617 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 662 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
663 page_size_mask);
618 unmap_low_page(pud); 664 unmap_low_page(pud);
619 pgd_populate(&init_mm, pgd_offset_k(start), 665 pgd_populate(&init_mm, pgd_offset_k(start),
620 __va(pud_phys)); 666 __va(pud_phys));
621 } 667 }
622 668
669 return last_map_addr;
670}
671
672struct map_range {
673 unsigned long start;
674 unsigned long end;
675 unsigned page_size_mask;
676};
677
678#define NR_RANGE_MR 5
679
680static int save_mr(struct map_range *mr, int nr_range,
681 unsigned long start_pfn, unsigned long end_pfn,
682 unsigned long page_size_mask)
683{
684
685 if (start_pfn < end_pfn) {
686 if (nr_range >= NR_RANGE_MR)
687 panic("run out of range for init_memory_mapping\n");
688 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
689 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
690 mr[nr_range].page_size_mask = page_size_mask;
691 nr_range++;
692 }
693
694 return nr_range;
695}
696
697/*
698 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
699 * This runs before bootmem is initialized and gets pages directly from
700 * the physical memory. To access them they are temporarily mapped.
701 */
702unsigned long __init_refok init_memory_mapping(unsigned long start,
703 unsigned long end)
704{
705 unsigned long last_map_addr = 0;
706 unsigned long page_size_mask = 0;
707 unsigned long start_pfn, end_pfn;
708
709 struct map_range mr[NR_RANGE_MR];
710 int nr_range, i;
711
712 printk(KERN_INFO "init_memory_mapping\n");
713
714 /*
715 * Find space for the kernel direct mapping tables.
716 *
717 * Later we should allocate these tables in the local node of the
718 * memory mapped. Unfortunately this is done currently before the
719 * nodes are discovered.
720 */
721 if (!after_bootmem)
722 init_gbpages();
723
724 if (direct_gbpages)
725 page_size_mask |= 1 << PG_LEVEL_1G;
726 if (cpu_has_pse)
727 page_size_mask |= 1 << PG_LEVEL_2M;
728
729 memset(mr, 0, sizeof(mr));
730 nr_range = 0;
731
732 /* head, if start is not big page (2M) aligned */
733 start_pfn = start >> PAGE_SHIFT;
734 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
735 << (PMD_SHIFT - PAGE_SHIFT);
736 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
737
738 /* big page (2M) range*/
739 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
740 << (PMD_SHIFT - PAGE_SHIFT);
741 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
742 << (PUD_SHIFT - PAGE_SHIFT);
743 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
744 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
745 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
746 page_size_mask & (1<<PG_LEVEL_2M));
747
748 /* big page (1G) range */
749 start_pfn = end_pfn;
750 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
751 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
752 page_size_mask &
753 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
754
755 /* tail that is not big page (1G) aligned */
756 start_pfn = end_pfn;
757 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
758 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
759 page_size_mask & (1<<PG_LEVEL_2M));
760
761 /* tail that is not big page (2M) aligned */
762 start_pfn = end_pfn;
763 end_pfn = end>>PAGE_SHIFT;
764 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
765
766 /* try to merge ranges with the same page size that are contiguous */
767 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
768 unsigned long old_start;
769 if (mr[i].end != mr[i+1].start ||
770 mr[i].page_size_mask != mr[i+1].page_size_mask)
771 continue;
772 /* move it */
773 old_start = mr[i].start;
774 memmove(&mr[i], &mr[i+1],
775 (nr_range - 1 - i) * sizeof (struct map_range));
776 mr[i].start = old_start;
777 nr_range--;
778 }
779
780 for (i = 0; i < nr_range; i++)
781 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
782 mr[i].start, mr[i].end,
783 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
784 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
785
786 if (!after_bootmem)
787 find_early_table_space(end);
788
789 for (i = 0; i < nr_range; i++)
790 last_map_addr = kernel_physical_mapping_init(
791 mr[i].start, mr[i].end,
792 mr[i].page_size_mask);
793
623 if (!after_bootmem) 794 if (!after_bootmem)
624 mmu_cr4_features = read_cr4(); 795 mmu_cr4_features = read_cr4();
625 __flush_tlb_all(); 796 __flush_tlb_all();
626 797
627 if (!after_bootmem) 798 if (!after_bootmem && table_end > table_start)
628 reserve_early(table_start << PAGE_SHIFT, 799 reserve_early(table_start << PAGE_SHIFT,
629 table_end << PAGE_SHIFT, "PGTABLE"); 800 table_end << PAGE_SHIFT, "PGTABLE");
630 801
802 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
803 last_map_addr, end);
804
631 if (!after_bootmem) 805 if (!after_bootmem)
632 early_memtest(start_phys, end_phys); 806 early_memtest(start, end);
633 807
634 return last_map_addr >> PAGE_SHIFT; 808 return last_map_addr >> PAGE_SHIFT;
635} 809}
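As a worked example of the save_mr() splitting above (not part of the patch): with start = 1MB, end = 2GB, and both PSE and gbpages enabled, the code records three ranges: [1MB, 2MB) mapped with 4k pages (the unaligned head), [2MB, 1GB) mapped with 2MB pages, and [1GB, 2GB) mapped with 1GB pages. The two tail entries come out empty and are dropped by save_mr(), and the merge pass leaves the three ranges alone because their page-size masks differ.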
@@ -817,6 +991,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
817void mark_rodata_ro(void) 991void mark_rodata_ro(void)
818{ 992{
819 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 993 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
994 unsigned long rodata_start =
995 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
996
997#ifdef CONFIG_DYNAMIC_FTRACE
998 /* Dynamic tracing modifies the kernel text section */
999 start = rodata_start;
1000#endif
820 1001
821 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 1002 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
822 (end - start) >> 10); 1003 (end - start) >> 10);
@@ -826,8 +1007,7 @@ void mark_rodata_ro(void)
826 * The rodata section (but not the kernel text!) should also be 1007 * The rodata section (but not the kernel text!) should also be
827 * not-executable. 1008 * not-executable.
828 */ 1009 */
829 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 1010 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
830 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
831 1011
832 rodata_test(); 1012 rodata_test();
833 1013
@@ -1036,9 +1216,6 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1036 PAGE_KERNEL_LARGE); 1216 PAGE_KERNEL_LARGE);
1037 set_pmd(pmd, __pmd(pte_val(entry))); 1217 set_pmd(pmd, __pmd(pte_val(entry)));
1038 1218
1039 addr_end = addr + PMD_SIZE;
1040 p_end = p + PMD_SIZE;
1041
1042 /* check to see if we have contiguous blocks */ 1219 /* check to see if we have contiguous blocks */
1043 if (p_end != p || node_start != node) { 1220 if (p_end != p || node_start != node) {
1044 if (p_start) 1221 if (p_start)
@@ -1048,6 +1225,9 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1048 node_start = node; 1225 node_start = node;
1049 p_start = p; 1226 p_start = p;
1050 } 1227 }
1228
1229 addr_end = addr + PMD_SIZE;
1230 p_end = p + PMD_SIZE;
1051 } else 1231 } else
1052 vmemmap_verify((pte_t *)pmd, node, addr, next); 1232 vmemmap_verify((pte_t *)pmd, node, addr, next);
1053 } 1233 }
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 115f13ee40c9..24c1d3c30186 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -122,10 +123,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
122{ 123{
123 unsigned long pfn, offset, vaddr; 124 unsigned long pfn, offset, vaddr;
124 resource_size_t last_addr; 125 resource_size_t last_addr;
126 const resource_size_t unaligned_phys_addr = phys_addr;
127 const unsigned long unaligned_size = size;
125 struct vm_struct *area; 128 struct vm_struct *area;
126 unsigned long new_prot_val; 129 unsigned long new_prot_val;
127 pgprot_t prot; 130 pgprot_t prot;
128 int retval; 131 int retval;
132 void __iomem *ret_addr;
129 133
130 /* Don't allow wraparound or zero size */ 134 /* Don't allow wraparound or zero size */
131 last_addr = phys_addr + size - 1; 135 last_addr = phys_addr + size - 1;
@@ -233,7 +237,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
233 return NULL; 237 return NULL;
234 } 238 }
235 239
236 return (void __iomem *) (vaddr + offset); 240 ret_addr = (void __iomem *) (vaddr + offset);
241 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
242
243 return ret_addr;
237} 244}
238 245
239/** 246/**
@@ -348,6 +355,8 @@ void iounmap(volatile void __iomem *addr)
348 addr = (volatile void __iomem *) 355 addr = (volatile void __iomem *)
349 (PAGE_MASK & (unsigned long __force)addr); 356 (PAGE_MASK & (unsigned long __force)addr);
350 357
358 mmiotrace_iounmap(addr);
359
351 /* Use the vm area unlocked, assuming the caller 360 /* Use the vm area unlocked, assuming the caller
352 ensures there isn't another iounmap for the same address 361 ensures there isn't another iounmap for the same address
353 in parallel. Reuse of the virtual address is prevented by 362 in parallel. Reuse of the virtual address is prevented by
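With the mmiotrace_ioremap()/mmiotrace_iounmap() hooks added above, ordinary driver mappings are reported to mmiotrace without any driver changes. A hedged illustration, not part of the patch; phys_base, region_len and the probe_* names are hypothetical, and <linux/io.h> is assumed:

static void __iomem *probe_regs;

static int probe_map(resource_size_t phys_base, unsigned long region_len)
{
	probe_regs = ioremap_nocache(phys_base, region_len);	/* hook fires in __ioremap_caller() */
	if (!probe_regs)
		return -ENOMEM;
	return 0;
}

static void probe_unmap(void)
{
	iounmap(probe_regs);	/* hook fires in iounmap() */
}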
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
 2 * Benefits from much code borrowed from kprobes
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
 75 * This is basically a dynamic stabbing problem:
 76 * we could use the existing prio tree code, or one of these
 77 * possibly better implementations:
78 * The Interval Skip List: A Data Structure for Finding All Intervals That
79 * Overlap a Point (might be simple)
80 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
 164 * We may be in an interrupt or a critical section. Also prefetching may
 165 * trigger a page fault. We may be in the middle of a process switch.
 166 * We cannot take any locks, because we could already be executing
 167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
173 * Interrupts are disabled on entry as trap3 is an interrupt gate
 174 * and they remain disabled throughout this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
 269 * and they remain disabled throughout this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
298 * if somebody else is singlestepping across a probe point, flags
299 * will have TF set, in which case, continue the remaining processing
300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. The events may result only from programming
360 * mistakes by accessing addresses before the beginning or past the end of a
361 * mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
 443 * Actually free the kmmio_fault_page structs, with RCU.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
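A hedged sketch of how a client registers and tears down a probe with the API above (illustrative only, not part of the patch; my_pre()/my_post() are made-up handlers, and the synchronize_rcu() follows the unregister rule documented in the comment above unregister_kmmio_probe()):

static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
		   unsigned long addr)
{
	/* hypothetical pre-handler: runs before the faulting MMIO access */
}

static void my_post(struct kmmio_probe *p, unsigned long condition,
		    struct pt_regs *regs)
{
	/* hypothetical post-handler: runs after single-stepping the access */
}

static struct kmmio_probe my_probe;

static int watch_window(void __iomem *base, unsigned long len)
{
	my_probe.addr = (unsigned long)base;
	my_probe.len = len;
	my_probe.pre_handler = my_pre;
	my_probe.post_handler = my_post;
	return register_kmmio_probe(&my_probe);
}

static void unwatch_window(void)
{
	unregister_kmmio_probe(&my_probe);
	synchronize_rcu();	/* only now are the callbacks guaranteed finished */
}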
486
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..e7397e108beb
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,515 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
 59#if 0 /* XXX: no way to gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
81
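A hedged sketch of the locking discipline spelled out above (illustrative only, not part of the file): a routine that emits a trace record takes trace_lock and re-checks is_enabled() under it:

static void record_one(struct mmiotrace_rw *rw)
{
	spin_lock_irq(&trace_lock);
	if (is_enabled())
		mmio_trace_rw(rw);	/* recording is allowed only while enabled */
	spin_unlock_irq(&trace_lock);
}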
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
150 pr_emerg(NAME "4MB pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
258 /* this should always return the active_trace count to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433static void leave_uniprocessor(void)
434{
435 int cpu;
436 int err;
437
438 if (cpus_weight(downed_cpus) == 0)
439 return;
440 pr_notice(NAME "Re-enabling CPUs...\n");
441 for_each_cpu_mask(cpu, downed_cpus) {
442 err = cpu_up(cpu);
443 if (!err)
444 pr_info(NAME "enabled CPU%d.\n", cpu);
445 else
446 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
447 }
448}
449
450#else /* !CONFIG_HOTPLUG_CPU */
451static void enter_uniprocessor(void)
452{
453 if (num_online_cpus() > 1)
454 pr_warning(NAME "multiple CPUs are online, may miss events. "
455 "Suggest booting with maxcpus=1 kernel argument.\n");
456}
457
458static void leave_uniprocessor(void)
459{
460}
461#endif
462
463#if 0 /* XXX: out of order */
464static struct file_operations fops_marker = {
465 .owner = THIS_MODULE,
466 .write = write_marker
467};
468#endif
469
470void enable_mmiotrace(void)
471{
472 mutex_lock(&mmiotrace_mutex);
473 if (is_enabled())
474 goto out;
475
476#if 0 /* XXX: tracing does not support text entries */
477 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
478 &fops_marker);
479 if (!marker_file)
480 pr_err(NAME "marker file creation failed.\n");
481#endif
482
483 if (nommiotrace)
484 pr_info(NAME "MMIO tracing disabled.\n");
485 enter_uniprocessor();
486 spin_lock_irq(&trace_lock);
487 atomic_inc(&mmiotrace_enabled);
488 spin_unlock_irq(&trace_lock);
489 pr_info(NAME "enabled.\n");
490out:
491 mutex_unlock(&mmiotrace_mutex);
492}
493
494void disable_mmiotrace(void)
495{
496 mutex_lock(&mmiotrace_mutex);
497 if (!is_enabled())
498 goto out;
499
500 spin_lock_irq(&trace_lock);
501 atomic_dec(&mmiotrace_enabled);
502 BUG_ON(is_enabled());
503 spin_unlock_irq(&trace_lock);
504
505 clear_trace_list(); /* guarantees: no more kmmio callbacks */
506 leave_uniprocessor();
507 if (marker_file) {
508 debugfs_remove(marker_file);
509 marker_file = NULL;
510 }
511
512 pr_info(NAME "disabled.\n");
513out:
514 mutex_unlock(&mmiotrace_mutex);
515}
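The enable/disable paths and the ioremap/iounmap hooks above share one locking idiom: is_enabled() is tested once without trace_lock as a cheap fast path, and tested again after taking the lock before any state is touched, so disable_mmiotrace() can flip the flag under the same lock and know that no new probe registration can slip in afterwards. A minimal userspace sketch of that pattern follows (not kernel code; the names only loosely mirror mmiotrace_enabled and trace_lock):

/*
 * Minimal userspace sketch of the "check, then recheck under the lock"
 * pattern used by mmiotrace_ioremap()/ioremap_trace_core() above: the
 * unlocked test only avoids work, the locked recheck is authoritative.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int enabled;                              /* ~ mmiotrace_enabled */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* ~ trace_lock */

static void register_mapping(unsigned long addr)
{
	if (!atomic_load(&enabled))        /* fast path, may race */
		return;

	pthread_mutex_lock(&lock);
	if (atomic_load(&enabled))         /* authoritative recheck */
		printf("tracing mapping at 0x%lx\n", addr);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	atomic_store(&enabled, 1);
	register_mapping(0xfe000000UL);
	atomic_store(&enabled, 0);
	register_mapping(0xfd000000UL);    /* silently skipped */
	return 0;
}

The early return in mmiotrace_ioremap() is therefore only an optimisation; correctness comes from the recheck done under trace_lock in ioremap_trace_core().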
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index afd40054d157..65c6e46bf059 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -141,7 +141,7 @@ static void cpa_flush_all(unsigned long cache)
141{ 141{
142 BUG_ON(irqs_disabled()); 142 BUG_ON(irqs_disabled());
143 143
144 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); 144 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
145} 145}
146 146
147static void __cpa_flush_range(void *arg) 147static void __cpa_flush_range(void *arg)
@@ -162,7 +162,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
162 BUG_ON(irqs_disabled()); 162 BUG_ON(irqs_disabled());
163 WARN_ON(PAGE_ALIGN(start) != start); 163 WARN_ON(PAGE_ALIGN(start) != start);
164 164
165 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 165 on_each_cpu(__cpa_flush_range, NULL, 1);
166 166
167 if (!cache) 167 if (!cache)
168 return; 168 return;
@@ -262,6 +262,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
262 262
263 return pte_offset_kernel(pmd, address); 263 return pte_offset_kernel(pmd, address);
264} 264}
265EXPORT_SYMBOL_GPL(lookup_address);
265 266
266/* 267/*
267 * Set the new pmd in all the pgds we know about: 268 * Set the new pmd in all the pgds we know about:
@@ -536,8 +537,14 @@ static int split_large_page(pte_t *kpte, unsigned long address)
536 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 537 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
537 538
538 if (address >= (unsigned long)__va(0) && 539 if (address >= (unsigned long)__va(0) &&
540 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
541 split_page_count(level);
542
543#ifdef CONFIG_X86_64
544 if (address >= (unsigned long)__va(1UL<<32) &&
539 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) 545 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
540 split_page_count(level); 546 split_page_count(level);
547#endif
541 548
542 /* 549 /*
543 * Install the new, split up pagetable. Important details here: 550 * Install the new, split up pagetable. Important details here:
@@ -652,15 +659,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
652 struct cpa_data alias_cpa; 659 struct cpa_data alias_cpa;
653 int ret = 0; 660 int ret = 0;
654 661
655 if (cpa->pfn > max_pfn_mapped) 662 if (cpa->pfn >= max_pfn_mapped)
656 return 0; 663 return 0;
657 664
665#ifdef CONFIG_X86_64
666 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
667 return 0;
668#endif
658 /* 669 /*
659 * No need to redo, when the primary call touched the direct 670 * No need to redo, when the primary call touched the direct
660 * mapping already: 671 * mapping already:
661 */ 672 */
662 if (!within(cpa->vaddr, PAGE_OFFSET, 673 if (!(within(cpa->vaddr, PAGE_OFFSET,
663 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 674 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
675#ifdef CONFIG_X86_64
676 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
677 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
678#endif
679 )) {
664 680
665 alias_cpa = *cpa; 681 alias_cpa = *cpa;
666 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 682 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
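The split_large_page() and cpa_process_alias() hunks above encode the fact that the 64-bit direct mapping now covers two windows: pfns below max_low_pfn_mapped, and pfns from 4 GB up to max_pfn_mapped, with the PCI hole in between left unmapped. A standalone sketch of that range test, shaped after the within() helper pageattr.c already defines (the limit values in main() are invented for illustration):

/* Illustrative userspace sketch of the two-window pfn test used above. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static bool within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

static bool pfn_in_direct_mapping(unsigned long pfn,
				  unsigned long max_low_pfn_mapped,
				  unsigned long max_pfn_mapped)
{
	/* low window: [0, max_low_pfn_mapped) */
	if (within(pfn, 0, max_low_pfn_mapped))
		return true;
	/* high window on 64-bit: [4 GB, max_pfn_mapped) */
	return within(pfn, 1UL << (32 - PAGE_SHIFT), max_pfn_mapped);
}

int main(void)
{
	unsigned long low = 0xc0000, high = 0x440000;	/* example limits */

	printf("%d %d %d\n",
	       pfn_in_direct_mapping(0x1000, low, high),   /* 1: below low limit */
	       pfn_in_direct_mapping(0xd0000, low, high),  /* 0: in the hole    */
	       pfn_in_direct_mapping(0x200000, low, high));/* 1: above 4 GB     */
	return 0;
}

Aliases whose pfn falls in the hole need no change_page_attr fixup because there is no direct-mapping alias to keep coherent there.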
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index a885a1019b8a..d4585077977a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -449,7 +449,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
449 if (retval < 0) 449 if (retval < 0)
450 return 0; 450 return 0;
451 451
452 if (pfn <= max_pfn_mapped && 452 if (((pfn < max_low_pfn_mapped) ||
453 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
453 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 454 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
454 free_memtype(offset, offset + size); 455 free_memtype(offset, offset + size);
455 printk(KERN_INFO 456 printk(KERN_INFO
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
23 * Copyright by Intel Corp., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F introduces a two-byte opcode */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define register identifiers used in the mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32-bit displacement (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
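pf_in.c recovers what the faulting instruction was doing in three steps: skip_prefix() walks over legacy and REX prefixes (noting 0x66 and REX.W for the operand width, REX.R for the register number), get_opcode() reads one byte or a 0x0F-prefixed pair, and the register operand is taken from bits 3-5 of the ModRM byte, extended by rexr. A worked example on one byte sequence, written as a standalone simplified decoder rather than a call into the functions above:

/* Standalone illustration of the decode steps implemented above; this is a
 * simplified re-statement, not a call into pf_in.c. */
#include <stdio.h>

int main(void)
{
	/* mov %cx,(%eax): operand-size prefix, opcode, ModRM */
	unsigned char insn[] = { 0x66, 0x89, 0x08 };
	unsigned char *p = insn;
	int shorted = 0;

	if (*p == 0x66) {		/* skip_prefix(): 0x66 -> 16-bit operand */
		shorted = 1;
		p++;
	}
	unsigned int opcode = *p++;	/* get_opcode(): single-byte opcode 0x89 */
	unsigned char mod_rm = *p;
	int reg = (mod_rm >> 3) & 0x7;	/* 1 == CX (rexr would add bit 3) */

	printf("opcode 0x%02x, reg %d, width %d\n",
	       opcode, reg, shorted ? 2 : 4);	/* register write, CX, 2 bytes */
	return 0;
}

Fed the same bytes, get_ins_type() would report REG_WRITE and get_ins_mem_width() would return 2, since 0x89 is in the write tables and the 0x66 prefix sets shorted.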
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in the traced regions */
27 NOTHING, /* access to some other point in the regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
31 OTHERS /* other instructions that we cannot intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 828907d001e8..b4becbf8c570 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -141,7 +141,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 141 __flush_tlb_one(vaddr);
142} 142}
143 143
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 144unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 145EXPORT_SYMBOL(__FIXADDR_TOP);
147 146
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index f41d67f8f831..1eb2973a301c 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -156,10 +156,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
156 156
157 num_memory_chunks++; 157 num_memory_chunks++;
158 158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" 159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n", 160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn, 161 start_pfn, end_pfn,
162 memory_affinity->memory_type,
163 pxm, 162 pxm,
164 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? 163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
165 "enabled and removable" : "enabled" ) ); 164 "enabled and removable" : "enabled" ) );
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 0fd67b81a8b6..1b4763e26ea9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
100/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
101void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
102{ 102{
103 acpi_slit = slit; 103 unsigned length;
104 unsigned long phys;
105
106 length = slit->header.length;
107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
111 panic(" Can not save slit!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
104} 116}
105 117
106/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
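acpi_numa_slit_init() now copies the SLIT into early-reserved memory instead of keeping a pointer into the firmware mapping, because the table body (an N x N matrix of relative node distances, 10 meaning local) is consulted later for node-distance lookups. A simplified userspace sketch of how such a flattened matrix is read; the 2-node values are invented for illustration:

/* Sketch of how a saved SLIT body is consumed: a flattened N x N matrix
 * of relative distances, 10 == local node. */
#include <stdio.h>

static unsigned char slit_entry[2 * 2] = {	/* 2-node example matrix */
	10, 20,
	20, 10,
};

static int node_distance(int from, int to, int locality_count)
{
	return slit_entry[from * locality_count + to];
}

int main(void)
{
	printf("0->1 distance: %d\n", node_distance(0, 1, 2)); /* prints 20 */
	return 0;
}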
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2b6ad5b9f9d5..7f3329b55d2e 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -218,8 +218,8 @@ static int nmi_setup(void)
218 } 218 }
219 219
220 } 220 }
221 on_each_cpu(nmi_save_registers, NULL, 0, 1); 221 on_each_cpu(nmi_save_registers, NULL, 1);
222 on_each_cpu(nmi_cpu_setup, NULL, 0, 1); 222 on_each_cpu(nmi_cpu_setup, NULL, 1);
223 nmi_enabled = 1; 223 nmi_enabled = 1;
224 return 0; 224 return 0;
225} 225}
@@ -271,7 +271,7 @@ static void nmi_shutdown(void)
271{ 271{
272 struct op_msrs *msrs = &get_cpu_var(cpu_msrs); 272 struct op_msrs *msrs = &get_cpu_var(cpu_msrs);
273 nmi_enabled = 0; 273 nmi_enabled = 0;
274 on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); 274 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
275 unregister_die_notifier(&profile_exceptions_nb); 275 unregister_die_notifier(&profile_exceptions_nb);
276 model->shutdown(msrs); 276 model->shutdown(msrs);
277 free_msrs(); 277 free_msrs();
@@ -286,7 +286,7 @@ static void nmi_cpu_start(void *dummy)
286 286
287static int nmi_start(void) 287static int nmi_start(void)
288{ 288{
289 on_each_cpu(nmi_cpu_start, NULL, 0, 1); 289 on_each_cpu(nmi_cpu_start, NULL, 1);
290 return 0; 290 return 0;
291} 291}
292 292
@@ -298,7 +298,7 @@ static void nmi_cpu_stop(void *dummy)
298 298
299static void nmi_stop(void) 299static void nmi_stop(void)
300{ 300{
301 on_each_cpu(nmi_cpu_stop, NULL, 0, 1); 301 on_each_cpu(nmi_cpu_stop, NULL, 1);
302} 302}
303 303
304struct op_counter_config counter_config[OP_MAX_COUNTER]; 304struct op_counter_config counter_config[OP_MAX_COUNTER];
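These oprofile hunks, like the matching ones in pageattr.c and amd_bus.c above, are mechanical fallout of the on_each_cpu() signature change merged here: the old retry argument is gone, leaving just the function, its argument, and a wait flag. A minimal module sketch of a caller with the new arity (assumed to build against a tree of this vintage; the function and message are illustrative):

/* Minimal module sketch showing the three-argument on_each_cpu(). */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static void say_hello(void *info)
{
	pr_info("hello from CPU %d\n", smp_processor_id());
}

static int __init demo_init(void)
{
	/* func, info, wait -- the old "retry" argument is gone */
	on_each_cpu(say_hello, NULL, 1);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");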
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index c5c8e485fc44..e515e8db842a 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -1,5 +1,17 @@
1ifeq ($(CONFIG_X86_32),y) 1obj-y := i386.o init.o
2include ${srctree}/arch/x86/pci/Makefile_32 2
3else 3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4include ${srctree}/arch/x86/pci/Makefile_64 4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5endif 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o
7
8pci-y := fixup.o
9pci-$(CONFIG_ACPI) += acpi.o
10pci-y += legacy.o irq.o
11
12pci-$(CONFIG_X86_VISWS) += visws.o
13
14pci-$(CONFIG_X86_NUMAQ) += numa.o
15
16obj-y += $(pci-y) common.o early.o
17obj-y += amd_bus.o
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
deleted file mode 100644
index a34fbf557926..000000000000
--- a/arch/x86/pci/Makefile_32
+++ /dev/null
@@ -1,26 +0,0 @@
1obj-y := i386.o init.o
2
3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o
7
8pci-y := fixup.o
9
10# Do not change the ordering here. There is a nasty init function
11# ordering dependency which breaks when you move acpi.o below
12# legacy/irq.o
13pci-$(CONFIG_ACPI) += acpi.o
14pci-y += legacy.o irq.o
15
16# Careful: VISWS overrule the pci-y above. The colons are
17# therefor correct. This needs a proper fix by distangling the code.
18pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
19
20pci-$(CONFIG_X86_NUMAQ) += numa.o
21
22# Necessary for NUMAQ as well
23pci-$(CONFIG_NUMA) += mp_bus_to_node.o
24
25obj-y += $(pci-y) common.o early.o
26obj-y += amd_bus.o
diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64
deleted file mode 100644
index fd47068c95de..000000000000
--- a/arch/x86/pci/Makefile_64
+++ /dev/null
@@ -1,17 +0,0 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6EXTRA_CFLAGS += -Iarch/x86/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o init.o
11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o early.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o
15
16obj-y += amd_bus.o
17
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 4fa52d3dc848..19af06927fbc 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -223,7 +223,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
223 return bus; 223 return bus;
224} 224}
225 225
226static int __init pci_acpi_init(void) 226int __init pci_acpi_init(void)
227{ 227{
228 struct pci_dev *dev = NULL; 228 struct pci_dev *dev = NULL;
229 229
@@ -257,4 +257,3 @@ static int __init pci_acpi_init(void)
257 257
258 return 0; 258 return 0;
259} 259}
260subsys_initcall(pci_acpi_init);
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index d02c598451ec..dbf532369711 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -1,44 +1,25 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h>
3#include "pci.h" 4#include "pci.h"
4 5
5#ifdef CONFIG_X86_64 6#ifdef CONFIG_X86_64
6
7#include <asm/pci-direct.h> 7#include <asm/pci-direct.h>
8#include <asm/mpspec.h> 8#include <asm/mpspec.h>
9#include <linux/cpumask.h> 9#include <linux/cpumask.h>
10#include <linux/topology.h> 10#endif
11 11
12/* 12/*
13 * This discovers the pcibus <-> node mapping on AMD K8. 13 * This discovers the pcibus <-> node mapping on AMD K8.
14 * also get peer root bus resource for io,mmio 14 * also get peer root bus resource for io,mmio
15 */ 15 */
16 16
17
18/*
19 * sub bus (transparent) will use entres from 3 to store extra from root,
20 * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
21 */
22#define RES_NUM 16
23struct pci_root_info {
24 char name[12];
25 unsigned int res_num;
26 struct resource res[RES_NUM];
27 int bus_min;
28 int bus_max;
29 int node;
30 int link;
31};
32
33/* 4 at this time, it may become to 32 */
34#define PCI_ROOT_NR 4
35static int pci_root_num;
36static struct pci_root_info pci_root_info[PCI_ROOT_NR];
37
38#ifdef CONFIG_NUMA 17#ifdef CONFIG_NUMA
39 18
40#define BUS_NR 256 19#define BUS_NR 256
41 20
21#ifdef CONFIG_X86_64
22
42static int mp_bus_to_node[BUS_NR]; 23static int mp_bus_to_node[BUS_NR];
43 24
44void set_mp_bus_to_node(int busnum, int node) 25void set_mp_bus_to_node(int busnum, int node)
@@ -65,7 +46,52 @@ int get_mp_bus_to_node(int busnum)
65 46
66 return node; 47 return node;
67} 48}
68#endif 49
50#else /* CONFIG_X86_32 */
51
52static unsigned char mp_bus_to_node[BUS_NR];
53
54void set_mp_bus_to_node(int busnum, int node)
55{
56 if (busnum >= 0 && busnum < BUS_NR)
57 mp_bus_to_node[busnum] = (unsigned char) node;
58}
59
60int get_mp_bus_to_node(int busnum)
61{
62 int node;
63
64 if (busnum < 0 || busnum > (BUS_NR - 1))
65 return 0;
66 node = mp_bus_to_node[busnum];
67 return node;
68}
69
70#endif /* CONFIG_X86_32 */
71
72#endif /* CONFIG_NUMA */
73
74#ifdef CONFIG_X86_64
75
76/*
 77 * sub bus (transparent) will use entries from 3 to store extra from root,
 78 * so we need to make sure there are enough slots there; increase PCI_BUS_NUM_RESOURCES?
79 */
80#define RES_NUM 16
81struct pci_root_info {
82 char name[12];
83 unsigned int res_num;
84 struct resource res[RES_NUM];
85 int bus_min;
86 int bus_max;
87 int node;
88 int link;
89};
90
 91/* 4 at this time, it may grow to 32 */
92#define PCI_ROOT_NR 4
93static int pci_root_num;
94static struct pci_root_info pci_root_info[PCI_ROOT_NR];
69 95
70void set_pci_bus_resources_arch_default(struct pci_bus *b) 96void set_pci_bus_resources_arch_default(struct pci_bus *b)
71{ 97{
@@ -552,7 +578,7 @@ static int __init enable_pci_io_ecs(void)
552 /* assume all cpus from fam10h have IO ECS */ 578 /* assume all cpus from fam10h have IO ECS */
553 if (boot_cpu_data.x86 < 0x10) 579 if (boot_cpu_data.x86 < 0x10)
554 return 0; 580 return 0;
555 on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1, 1); 581 on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1);
556 pci_probe |= PCI_HAS_IO_ECS; 582 pci_probe |= PCI_HAS_IO_ECS;
557 return 0; 583 return 0;
558} 584}
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 00a319cd5be3..1485a26ddcef 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -20,6 +20,7 @@
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | 20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF; 21 PCI_PROBE_MMCONF;
22 22
23unsigned int pci_early_dump_regs;
23static int pci_bf_sort; 24static int pci_bf_sort;
24int pci_routeirq; 25int pci_routeirq;
25int noioapicquirk; 26int noioapicquirk;
@@ -33,7 +34,7 @@ struct pci_raw_ops *raw_pci_ext_ops;
33int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, 34int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
34 int reg, int len, u32 *val) 35 int reg, int len, u32 *val)
35{ 36{
36 if (reg < 256 && raw_pci_ops) 37 if (domain == 0 && reg < 256 && raw_pci_ops)
37 return raw_pci_ops->read(domain, bus, devfn, reg, len, val); 38 return raw_pci_ops->read(domain, bus, devfn, reg, len, val);
38 if (raw_pci_ext_ops) 39 if (raw_pci_ext_ops)
39 return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val); 40 return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val);
@@ -43,7 +44,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
43int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, 44int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
44 int reg, int len, u32 val) 45 int reg, int len, u32 val)
45{ 46{
46 if (reg < 256 && raw_pci_ops) 47 if (domain == 0 && reg < 256 && raw_pci_ops)
47 return raw_pci_ops->write(domain, bus, devfn, reg, len, val); 48 return raw_pci_ops->write(domain, bus, devfn, reg, len, val);
48 if (raw_pci_ext_ops) 49 if (raw_pci_ext_ops)
49 return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val); 50 return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val);
@@ -123,6 +124,21 @@ void __init dmi_check_skip_isa_align(void)
123 dmi_check_system(can_skip_pciprobe_dmi_table); 124 dmi_check_system(can_skip_pciprobe_dmi_table);
124} 125}
125 126
127static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
128{
129 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
130
131 if (pci_probe & PCI_NOASSIGN_ROMS) {
132 if (rom_r->parent)
133 return;
134 if (rom_r->start) {
135 /* we deal with BIOS assigned ROM later */
136 return;
137 }
138 rom_r->start = rom_r->end = rom_r->flags = 0;
139 }
140}
141
126/* 142/*
127 * Called after each bus is probed, but before its children 143 * Called after each bus is probed, but before its children
128 * are examined. 144 * are examined.
@@ -130,7 +146,11 @@ void __init dmi_check_skip_isa_align(void)
130 146
131void __devinit pcibios_fixup_bus(struct pci_bus *b) 147void __devinit pcibios_fixup_bus(struct pci_bus *b)
132{ 148{
149 struct pci_dev *dev;
150
133 pci_read_bridge_bases(b); 151 pci_read_bridge_bases(b);
152 list_for_each_entry(dev, &b->devices, bus_list)
153 pcibios_fixup_device_resources(dev);
134} 154}
135 155
136/* 156/*
@@ -386,7 +406,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
386 406
387extern u8 pci_cache_line_size; 407extern u8 pci_cache_line_size;
388 408
389static int __init pcibios_init(void) 409int __init pcibios_init(void)
390{ 410{
391 struct cpuinfo_x86 *c = &boot_cpu_data; 411 struct cpuinfo_x86 *c = &boot_cpu_data;
392 412
@@ -413,8 +433,6 @@ static int __init pcibios_init(void)
413 return 0; 433 return 0;
414} 434}
415 435
416subsys_initcall(pcibios_init);
417
418char * __devinit pcibios_setup(char *str) 436char * __devinit pcibios_setup(char *str)
419{ 437{
420 if (!strcmp(str, "off")) { 438 if (!strcmp(str, "off")) {
@@ -485,12 +503,18 @@ char * __devinit pcibios_setup(char *str)
485 else if (!strcmp(str, "rom")) { 503 else if (!strcmp(str, "rom")) {
486 pci_probe |= PCI_ASSIGN_ROMS; 504 pci_probe |= PCI_ASSIGN_ROMS;
487 return NULL; 505 return NULL;
506 } else if (!strcmp(str, "norom")) {
507 pci_probe |= PCI_NOASSIGN_ROMS;
508 return NULL;
488 } else if (!strcmp(str, "assign-busses")) { 509 } else if (!strcmp(str, "assign-busses")) {
489 pci_probe |= PCI_ASSIGN_ALL_BUSSES; 510 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
490 return NULL; 511 return NULL;
491 } else if (!strcmp(str, "use_crs")) { 512 } else if (!strcmp(str, "use_crs")) {
492 pci_probe |= PCI_USE__CRS; 513 pci_probe |= PCI_USE__CRS;
493 return NULL; 514 return NULL;
515 } else if (!strcmp(str, "earlydump")) {
516 pci_early_dump_regs = 1;
517 return NULL;
494 } else if (!strcmp(str, "routeirq")) { 518 } else if (!strcmp(str, "routeirq")) {
495 pci_routeirq = 1; 519 pci_routeirq = 1;
496 return NULL; 520 return NULL;
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 42df4b6606df..858dbe3399f9 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -49,7 +49,14 @@ void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{ 49{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val); 50 PDprintk("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc); 52 outb(val, 0xcfc + (offset&3));
53}
54
55void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
56{
57 PDprintk("%x writing to %x: %x\n", slot, offset, val);
58 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
59 outw(val, 0xcfc + (offset&2));
53} 60}
54 61
55int early_pci_allowed(void) 62int early_pci_allowed(void)
@@ -57,3 +64,54 @@ int early_pci_allowed(void)
57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == 64 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
58 PCI_PROBE_CONF1; 65 PCI_PROBE_CONF1;
59} 66}
67
68void early_dump_pci_device(u8 bus, u8 slot, u8 func)
69{
70 int i;
71 int j;
72 u32 val;
73
74 printk("PCI: %02x:%02x:%02x", bus, slot, func);
75
76 for (i = 0; i < 256; i += 4) {
77 if (!(i & 0x0f))
78 printk("\n%04x:",i);
79
80 val = read_pci_config(bus, slot, func, i);
81 for (j = 0; j < 4; j++) {
82 printk(" %02x", val & 0xff);
83 val >>= 8;
84 }
85 }
86 printk("\n");
87}
88
89void early_dump_pci_devices(void)
90{
91 unsigned bus, slot, func;
92
93 if (!early_pci_allowed())
94 return;
95
96 for (bus = 0; bus < 256; bus++) {
97 for (slot = 0; slot < 32; slot++) {
98 for (func = 0; func < 8; func++) {
99 u32 class;
100 u8 type;
101 class = read_pci_config(bus, slot, func,
102 PCI_CLASS_REVISION);
103 if (class == 0xffffffff)
104 break;
105
106 early_dump_pci_device(bus, slot, func);
107
108 /* No multi-function device? */
109 type = read_pci_config_byte(bus, slot, func,
110 PCI_HEADER_TYPE);
111 if (!(type & 0x80))
112 break;
113 }
114 }
115 }
116}
117
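The early config accessors use the legacy type-1 (CF8/CFC) mechanism: a 32-bit value written to port 0xCF8 selects bus, device, function and a dword-aligned register, and data then moves through the 32-bit window at 0xCFC; byte and word accesses must be steered into that window, which is exactly what the "+ (offset & 3)" and "+ (offset & 2)" corrections above do. A standalone sketch of building the address value (the port I/O itself is omitted):

/* Standalone illustration of CF8/CFC addressing; only the address dword
 * is built here, no I/O is performed. */
#include <stdint.h>
#include <stdio.h>

static uint32_t conf1_address(uint8_t bus, uint8_t slot, uint8_t func,
			      uint8_t offset)
{
	return 0x80000000u		/* enable bit */
	       | ((uint32_t)bus << 16)
	       | ((uint32_t)slot << 11)
	       | ((uint32_t)func << 8)
	       | (offset & 0xfc);	/* dword-aligned register number */
}

int main(void)
{
	uint8_t offset = 0x3d;		/* PCI_INTERRUPT_PIN */

	printf("write 0x%08x to 0xcf8, then read the byte at 0xcfc + %d\n",
	       conf1_address(0, 2, 0, offset), offset & 3);
	return 0;
}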
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 6ccd7a108cd4..2aafb67dc5f1 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -334,7 +334,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
334 flags = new_flags; 334 flags = new_flags;
335 } 335 }
336 336
337 if (vma->vm_pgoff <= max_pfn_mapped && 337 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
338 (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
339 vma->vm_pgoff < max_pfn_mapped)) &&
338 ioremap_change_attr((unsigned long)__va(addr), len, flags)) { 340 ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
339 free_memtype(addr, addr + len); 341 free_memtype(addr, addr + len);
340 return -EINVAL; 342 return -EINVAL;
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index b821f4462d99..d6c950f81858 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -4,7 +4,7 @@
4 4
5/* arch_initcall has too random ordering, so call the initializers 5/* arch_initcall has too random ordering, so call the initializers
6 in the right sequence from here. */ 6 in the right sequence from here. */
7static __init int pci_access_init(void) 7static __init int pci_arch_init(void)
8{ 8{
9#ifdef CONFIG_PCI_DIRECT 9#ifdef CONFIG_PCI_DIRECT
10 int type = 0; 10 int type = 0;
@@ -40,4 +40,4 @@ static __init int pci_access_init(void)
40 40
41 return 0; 41 return 0;
42} 42}
43arch_initcall(pci_access_init); 43arch_initcall(pci_arch_init);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index f0859de23e20..6a06a2eb0597 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -45,7 +45,8 @@ struct irq_router {
45 char *name; 45 char *name;
46 u16 vendor, device; 46 u16 vendor, device;
47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); 47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); 48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
49 int new);
49}; 50};
50 51
51struct irq_router_handler { 52struct irq_router_handler {
@@ -77,7 +78,8 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
77 for (i = 0; i < rt->size; i++) 78 for (i = 0; i < rt->size; i++)
78 sum += addr[i]; 79 sum += addr[i];
79 if (!sum) { 80 if (!sum) {
80 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); 81 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
82 rt);
81 return rt; 83 return rt;
82 } 84 }
83 return NULL; 85 return NULL;
@@ -183,7 +185,8 @@ static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset,
183 return (nr & 1) ? (x >> 4) : (x & 0xf); 185 return (nr & 1) ? (x >> 4) : (x & 0xf);
184} 186}
185 187
186static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) 188static void write_config_nybble(struct pci_dev *router, unsigned offset,
189 unsigned nr, unsigned int val)
187{ 190{
188 u8 x; 191 u8 x;
189 unsigned reg = offset + (nr >> 1); 192 unsigned reg = offset + (nr >> 1);
@@ -467,7 +470,8 @@ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int
467 return inb(0xc01) & 0xf; 470 return inb(0xc01) & 0xf;
468} 471}
469 472
470static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 473static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
474 int pirq, int irq)
471{ 475{
472 outb(pirq, 0xc00); 476 outb(pirq, 0xc00);
473 outb(irq, 0xc01); 477 outb(irq, 0xc01);
@@ -660,7 +664,8 @@ static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router
660} 664}
661 665
662 666
663static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 667static __init int serverworks_router_probe(struct irq_router *r,
668 struct pci_dev *router, u16 device)
664{ 669{
665 switch (device) { 670 switch (device) {
666 case PCI_DEVICE_ID_SERVERWORKS_OSB4: 671 case PCI_DEVICE_ID_SERVERWORKS_OSB4:
@@ -827,10 +832,12 @@ static void __init pirq_find_router(struct irq_router *r)
827 832
828 for (h = pirq_routers; h->vendor; h++) { 833 for (h = pirq_routers; h->vendor; h++) {
829 /* First look for a router match */ 834 /* First look for a router match */
830 if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) 835 if (rt->rtr_vendor == h->vendor &&
836 h->probe(r, pirq_router_dev, rt->rtr_device))
831 break; 837 break;
832 /* Fall back to a device match */ 838 /* Fall back to a device match */
833 if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) 839 if (pirq_router_dev->vendor == h->vendor &&
840 h->probe(r, pirq_router_dev, pirq_router_dev->device))
834 break; 841 break;
835 } 842 }
836 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", 843 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
@@ -845,11 +852,13 @@ static void __init pirq_find_router(struct irq_router *r)
845static struct irq_info *pirq_get_info(struct pci_dev *dev) 852static struct irq_info *pirq_get_info(struct pci_dev *dev)
846{ 853{
847 struct irq_routing_table *rt = pirq_table; 854 struct irq_routing_table *rt = pirq_table;
848 int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); 855 int entries = (rt->size - sizeof(struct irq_routing_table)) /
856 sizeof(struct irq_info);
849 struct irq_info *info; 857 struct irq_info *info;
850 858
851 for (info = rt->slots; entries--; info++) 859 for (info = rt->slots; entries--; info++)
852 if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) 860 if (info->bus == dev->bus->number &&
861 PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
853 return info; 862 return info;
854 return NULL; 863 return NULL;
855} 864}
@@ -890,7 +899,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
890 DBG(" -> not routed\n" KERN_DEBUG); 899 DBG(" -> not routed\n" KERN_DEBUG);
891 return 0; 900 return 0;
892 } 901 }
893 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); 902 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask,
903 pirq_table->exclusive_irqs);
894 mask &= pcibios_irq_mask; 904 mask &= pcibios_irq_mask;
895 905
896 /* Work around broken HP Pavilion Notebooks which assign USB to 906 /* Work around broken HP Pavilion Notebooks which assign USB to
@@ -903,7 +913,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
903 } 913 }
904 914
905 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ 915 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
906 if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { 916 if (acer_tm360_irqrouting && dev->irq == 11 &&
917 dev->vendor == PCI_VENDOR_ID_O2) {
907 pirq = 0x68; 918 pirq = 0x68;
908 mask = 0x400; 919 mask = 0x400;
909 dev->irq = r->get(pirq_router_dev, dev, pirq); 920 dev->irq = r->get(pirq_router_dev, dev, pirq);
@@ -920,15 +931,16 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
920 newirq = 0; 931 newirq = 0;
921 else 932 else
922 printk("\n" KERN_WARNING 933 printk("\n" KERN_WARNING
923 "PCI: IRQ %i for device %s doesn't match PIRQ mask " 934 "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n"
924 "- try pci=usepirqmask\n" KERN_DEBUG, newirq, 935 KERN_DEBUG, newirq,
925 pci_name(dev)); 936 pci_name(dev));
926 } 937 }
927 if (!newirq && assign) { 938 if (!newirq && assign) {
928 for (i = 0; i < 16; i++) { 939 for (i = 0; i < 16; i++) {
929 if (!(mask & (1 << i))) 940 if (!(mask & (1 << i)))
930 continue; 941 continue;
931 if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED)) 942 if (pirq_penalty[i] < pirq_penalty[newirq] &&
943 can_request_irq(i, IRQF_SHARED))
932 newirq = i; 944 newirq = i;
933 } 945 }
934 } 946 }
@@ -944,7 +956,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
944 DBG(" -> got IRQ %d\n", irq); 956 DBG(" -> got IRQ %d\n", irq);
945 msg = "Found"; 957 msg = "Found";
946 eisa_set_level_irq(irq); 958 eisa_set_level_irq(irq);
947 } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { 959 } else if (newirq && r->set &&
960 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
948 DBG(" -> assigning IRQ %d", newirq); 961 DBG(" -> assigning IRQ %d", newirq);
949 if (r->set(pirq_router_dev, dev, pirq, newirq)) { 962 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
950 eisa_set_level_irq(newirq); 963 eisa_set_level_irq(newirq);
@@ -962,7 +975,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
962 } else 975 } else
963 return 0; 976 return 0;
964 } 977 }
965 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); 978 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq,
979 pci_name(dev));
966 980
967 /* Update IRQ for all devices with the same pirq value */ 981 /* Update IRQ for all devices with the same pirq value */
968 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 982 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -974,7 +988,10 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
974 if (!info) 988 if (!info)
975 continue; 989 continue;
976 if (info->irq[pin].link == pirq) { 990 if (info->irq[pin].link == pirq) {
977 /* We refuse to override the dev->irq information. Give a warning! */ 991 /*
992 * We refuse to override the dev->irq
993 * information. Give a warning!
994 */
978 if (dev2->irq && dev2->irq != irq && \ 995 if (dev2->irq && dev2->irq != irq && \
979 (!(pci_probe & PCI_USE_PIRQ_MASK) || \ 996 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
980 ((1 << dev2->irq) & mask))) { 997 ((1 << dev2->irq) & mask))) {
@@ -987,7 +1004,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
987 dev2->irq = irq; 1004 dev2->irq = irq;
988 pirq_penalty[irq]++; 1005 pirq_penalty[irq]++;
989 if (dev != dev2) 1006 if (dev != dev2)
990 printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); 1007 printk(KERN_INFO
1008 "PCI: Sharing IRQ %d with %s\n",
1009 irq, pci_name(dev2));
991 } 1010 }
992 } 1011 }
993 return 1; 1012 return 1;
@@ -1001,15 +1020,21 @@ static void __init pcibios_fixup_irqs(void)
1001 DBG(KERN_DEBUG "PCI: IRQ fixup\n"); 1020 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
1002 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1021 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1003 /* 1022 /*
1004 * If the BIOS has set an out of range IRQ number, just ignore it. 1023 * If the BIOS has set an out of range IRQ number, just
1005 * Also keep track of which IRQ's are already in use. 1024 * ignore it. Also keep track of which IRQ's are
1025 * already in use.
1006 */ 1026 */
1007 if (dev->irq >= 16) { 1027 if (dev->irq >= 16) {
1008 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); 1028 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n",
1029 pci_name(dev), dev->irq);
1009 dev->irq = 0; 1030 dev->irq = 0;
1010 } 1031 }
1011 /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ 1032 /*
1012 if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) 1033 * If the IRQ is already assigned to a PCI device,
1034 * ignore its ISA use penalty
1035 */
1036 if (pirq_penalty[dev->irq] >= 100 &&
1037 pirq_penalty[dev->irq] < 100000)
1013 pirq_penalty[dev->irq] = 0; 1038 pirq_penalty[dev->irq] = 0;
1014 pirq_penalty[dev->irq]++; 1039 pirq_penalty[dev->irq]++;
1015 } 1040 }
@@ -1025,8 +1050,13 @@ static void __init pcibios_fixup_irqs(void)
1025 int irq; 1050 int irq;
1026 1051
1027 if (pin) { 1052 if (pin) {
1028 pin--; /* interrupt pins are numbered starting from 1 */ 1053 /*
1029 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); 1054 * interrupt pins are numbered starting
1055 * from 1
1056 */
1057 pin--;
1058 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1059 PCI_SLOT(dev->devfn), pin);
1030 /* 1060 /*
1031 * Busses behind bridges are typically not listed in the MP-table. 1061 * Busses behind bridges are typically not listed in the MP-table.
1032 * In this case we have to look up the IRQ based on the parent bus, 1062 * In this case we have to look up the IRQ based on the parent bus,
@@ -1067,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
1067{ 1097{
1068 if (!broken_hp_bios_irq9) { 1098 if (!broken_hp_bios_irq9) {
1069 broken_hp_bios_irq9 = 1; 1099 broken_hp_bios_irq9 = 1;
1070 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); 1100 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
1101 d->ident);
1071 } 1102 }
1072 return 0; 1103 return 0;
1073} 1104}
@@ -1080,7 +1111,8 @@ static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
1080{ 1111{
1081 if (!acer_tm360_irqrouting) { 1112 if (!acer_tm360_irqrouting) {
1082 acer_tm360_irqrouting = 1; 1113 acer_tm360_irqrouting = 1;
1083 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); 1114 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
1115 d->ident);
1084 } 1116 }
1085 return 0; 1117 return 0;
1086} 1118}
@@ -1092,7 +1124,8 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1092 .matches = { 1124 .matches = {
1093 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1125 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1094 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), 1126 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
1095 DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), 1127 DMI_MATCH(DMI_PRODUCT_VERSION,
1128 "HP Pavilion Notebook Model GE"),
1096 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), 1129 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
1097 }, 1130 },
1098 }, 1131 },
@@ -1107,7 +1140,7 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1107 { } 1140 { }
1108}; 1141};
1109 1142
1110static int __init pcibios_irq_init(void) 1143int __init pcibios_irq_init(void)
1111{ 1144{
1112 DBG(KERN_DEBUG "PCI: IRQ init\n"); 1145 DBG(KERN_DEBUG "PCI: IRQ init\n");
1113 1146
@@ -1131,7 +1164,10 @@ static int __init pcibios_irq_init(void)
1131 if (!(pirq_table->exclusive_irqs & (1 << i))) 1164 if (!(pirq_table->exclusive_irqs & (1 << i)))
1132 pirq_penalty[i] += 100; 1165 pirq_penalty[i] += 100;
1133 } 1166 }
1134 /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ 1167 /*
1168 * If we're using the I/O APIC, avoid using the PCI IRQ
1169 * routing table
1170 */
1135 if (io_apic_assign_pci_irqs) 1171 if (io_apic_assign_pci_irqs)
1136 pirq_table = NULL; 1172 pirq_table = NULL;
1137 } 1173 }
@@ -1142,9 +1178,6 @@ static int __init pcibios_irq_init(void)
1142 return 0; 1178 return 0;
1143} 1179}
1144 1180
1145subsys_initcall(pcibios_irq_init);
1146
1147
1148static void pirq_penalize_isa_irq(int irq, int active) 1181static void pirq_penalize_isa_irq(int irq, int active)
1149{ 1182{
1150 /* 1183 /*
@@ -1178,7 +1211,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
1178 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { 1211 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
1179 char *msg = ""; 1212 char *msg = "";
1180 1213
1181 pin--; /* interrupt pins are numbered starting from 1 */ 1214 pin--; /* interrupt pins are numbered starting from 1 */
1182 1215
1183 if (io_apic_assign_pci_irqs) { 1216 if (io_apic_assign_pci_irqs) {
1184 int irq; 1217 int irq;
@@ -1198,13 +1231,16 @@ static int pirq_enable_irq(struct pci_dev *dev)
1198 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1231 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1199 PCI_SLOT(bridge->devfn), pin); 1232 PCI_SLOT(bridge->devfn), pin);
1200 if (irq >= 0) 1233 if (irq >= 0)
1201 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1234 printk(KERN_WARNING
1202 pci_name(bridge), 'A' + pin, irq); 1235 "PCI: using PPB %s[%c] to get irq %d\n",
1236 pci_name(bridge),
1237 'A' + pin, irq);
1203 dev = bridge; 1238 dev = bridge;
1204 } 1239 }
1205 dev = temp_dev; 1240 dev = temp_dev;
1206 if (irq >= 0) { 1241 if (irq >= 0) {
1207 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1242 printk(KERN_INFO
1243 "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
1208 pci_name(dev), 'A' + pin, irq); 1244 pci_name(dev), 'A' + pin, irq);
1209 dev->irq = irq; 1245 dev->irq = irq;
1210 return 0; 1246 return 0;
@@ -1215,12 +1251,17 @@ static int pirq_enable_irq(struct pci_dev *dev)
1215 else 1251 else
1216 msg = " Please try using pci=biosirq."; 1252 msg = " Please try using pci=biosirq.";
1217 1253
1218 /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ 1254 /*
1219 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) 1255 * With IDE legacy devices the IRQ lookup failure is not
1256 * a problem..
1257 */
1258 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
1259 !(dev->class & 0x5))
1220 return 0; 1260 return 0;
1221 1261
1222 printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", 1262 printk(KERN_WARNING
1223 'A' + pin, pci_name(dev), msg); 1263 "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
1264 'A' + pin, pci_name(dev), msg);
1224 } 1265 }
1225 return 0; 1266 return 0;
1226} 1267}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index a67921ce60af..132876cc6fca 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -55,4 +55,18 @@ static int __init pci_legacy_init(void)
55 return 0; 55 return 0;
56} 56}
57 57
58subsys_initcall(pci_legacy_init); 58int __init pci_subsys_init(void)
59{
60#ifdef CONFIG_ACPI
61 pci_acpi_init();
62#endif
63 pci_legacy_init();
64 pcibios_irq_init();
65#ifdef CONFIG_X86_NUMAQ
66 pci_numa_init();
67#endif
68 pcibios_init();
69
70 return 0;
71}
72subsys_initcall(pci_subsys_init);
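The hunk above folds what used to be several independent subsys_initcall registrations into a single pci_subsys_init() that calls the ACPI, legacy, NUMAQ and IRQ helpers in an explicit order, instead of relying on link order between initcalls at the same level. A minimal sketch of that pattern with made-up foo_* names (subsys_initcall() and __init are the only kernel interfaces assumed here):

    #include <linux/init.h>

    /* Hypothetical helpers standing in for pci_acpi_init(), pci_legacy_init(),
     * pcibios_irq_init() and pcibios_init(). */
    static int __init foo_bus_scan_init(void)  { return 0; }
    static int __init foo_irq_route_init(void) { return 0; }

    /* One initcall at subsys level; the helpers now run in the order written
     * here rather than in whatever order the linker emitted their sections. */
    static int __init foo_subsys_init(void)
    {
            foo_bus_scan_init();
            foo_irq_route_init();
            return 0;
    }
    subsys_initcall(foo_subsys_init);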
diff --git a/arch/x86/pci/mp_bus_to_node.c b/arch/x86/pci/mp_bus_to_node.c
deleted file mode 100644
index 022943999b84..000000000000
--- a/arch/x86/pci/mp_bus_to_node.c
+++ /dev/null
@@ -1,23 +0,0 @@
1#include <linux/pci.h>
2#include <linux/init.h>
3#include <linux/topology.h>
4
5#define BUS_NR 256
6
7static unsigned char mp_bus_to_node[BUS_NR];
8
9void set_mp_bus_to_node(int busnum, int node)
10{
11 if (busnum >= 0 && busnum < BUS_NR)
12 mp_bus_to_node[busnum] = (unsigned char) node;
13}
14
15int get_mp_bus_to_node(int busnum)
16{
17 int node;
18
19 if (busnum < 0 || busnum > (BUS_NR - 1))
20 return 0;
21 node = mp_bus_to_node[busnum];
22 return node;
23}
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c
index 99f1ecd485b5..8b5ca1966731 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numa.c
@@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
151} 151}
152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
153 153
154static int __init pci_numa_init(void) 154int __init pci_numa_init(void)
155{ 155{
156 int quad; 156 int quad;
157 157
@@ -176,5 +176,3 @@ static int __init pci_numa_init(void)
176 } 176 }
177 return 0; 177 return 0;
178} 178}
179
180subsys_initcall(pci_numa_init);
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index ba263e626a68..3e25deb821ac 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -28,6 +28,7 @@
28#define PCI_USE__CRS 0x10000 28#define PCI_USE__CRS 0x10000
29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000
31 32
32extern unsigned int pci_probe; 33extern unsigned int pci_probe;
33extern unsigned long pirq_table_addr; 34extern unsigned long pirq_table_addr;
@@ -39,9 +40,6 @@ enum pci_bf_sort_state {
39 pci_dmi_bf, 40 pci_dmi_bf,
40}; 41};
41 42
42extern void __init dmi_check_pciprobe(void);
43extern void __init dmi_check_skip_isa_align(void);
44
45/* pci-i386.c */ 43/* pci-i386.c */
46 44
47extern unsigned int pcibios_max_latency; 45extern unsigned int pcibios_max_latency;
@@ -99,10 +97,19 @@ extern struct pci_raw_ops *raw_pci_ext_ops;
99 97
100extern struct pci_raw_ops pci_direct_conf1; 98extern struct pci_raw_ops pci_direct_conf1;
101 99
100/* arch_initcall level */
102extern int pci_direct_probe(void); 101extern int pci_direct_probe(void);
103extern void pci_direct_init(int type); 102extern void pci_direct_init(int type);
104extern void pci_pcbios_init(void); 103extern void pci_pcbios_init(void);
105extern int pci_olpc_init(void); 104extern int pci_olpc_init(void);
105extern void __init dmi_check_pciprobe(void);
106extern void __init dmi_check_skip_isa_align(void);
107
108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void);
111extern int __init pci_numa_init(void);
112extern int __init pcibios_init(void);
106 113
107/* pci-mmconfig.c */ 114/* pci-mmconfig.c */
108 115
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index c2df4e97eed6..1a7bed492bb1 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -8,18 +8,19 @@
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/init.h> 9#include <linux/init.h>
10 10
11#include "cobalt.h" 11#include <asm/setup.h>
12#include "lithium.h" 12#include <asm/visws/cobalt.h>
13#include <asm/visws/lithium.h>
13 14
14#include "pci.h" 15#include "pci.h"
15 16
16static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } 17static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
17static void pci_visws_disable_irq(struct pci_dev *dev) { } 18static void pci_visws_disable_irq(struct pci_dev *dev) { }
18 19
19int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; 20/* int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; */
20void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; 21/* void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; */
21 22
22void __init pcibios_penalize_isa_irq(int irq, int active) {} 23/* void __init pcibios_penalize_isa_irq(int irq, int active) {} */
23 24
24 25
25unsigned int pci_bus0, pci_bus1; 26unsigned int pci_bus0, pci_bus1;
@@ -85,7 +86,7 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
85 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
86} 87}
87 88
88static int __init pcibios_init(void) 89static int __init pci_visws_init(void)
89{ 90{
90 /* The VISWS supports configuration access type 1 only */ 91 /* The VISWS supports configuration access type 1 only */
91 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 92 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
@@ -105,4 +106,17 @@ static int __init pcibios_init(void)
105 return 0; 106 return 0;
106} 107}
107 108
108subsys_initcall(pcibios_init); 109static __init int pci_subsys_init(void)
110{
111 if (!is_visws_box())
112 return -1;
113
114 pcibios_enable_irq = &pci_visws_enable_irq;
115 pcibios_disable_irq = &pci_visws_disable_irq;
116
117 pci_visws_init();
118 pcibios_init();
119
120 return 0;
121}
122subsys_initcall(pci_subsys_init);
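The VisWS flavour keeps its own pci_subsys_init() but now backs off at run time unless is_visws_box() reports the hardware, leaving PCI setup to the generic path on everything else. A sketch of that guarded-initcall shape with hypothetical names (only subsys_initcall(), __init and the errno value are real kernel interfaces):

    #include <linux/init.h>
    #include <linux/errno.h>
    #include <linux/types.h>

    static bool foo_board_present;      /* assumed to be set during early setup */

    /* Hypothetical platform initcall: bail out on other hardware so a
     * generic initcall can take care of PCI instead. */
    static int __init foo_platform_pci_init(void)
    {
            if (!foo_board_present)
                    return -ENODEV;

            /* board-specific PCI probing would go here */
            return 0;
    }
    subsys_initcall(foo_platform_pci_init);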
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index efa2ba7c6005..1ef0f90813d6 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -23,7 +23,7 @@
23 23
24#define gtod vdso_vsyscall_gtod_data 24#define gtod vdso_vsyscall_gtod_data
25 25
26static long vdso_fallback_gettime(long clock, struct timespec *ts) 26notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
27{ 27{
28 long ret; 28 long ret;
29 asm("syscall" : "=a" (ret) : 29 asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts)
31 return ret; 31 return ret;
32} 32}
33 33
34static inline long vgetns(void) 34notrace static inline long vgetns(void)
35{ 35{
36 long v; 36 long v;
37 cycles_t (*vread)(void); 37 cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
40 return (v * gtod->clock.mult) >> gtod->clock.shift; 40 return (v * gtod->clock.mult) >> gtod->clock.shift;
41} 41}
42 42
43static noinline int do_realtime(struct timespec *ts) 43notrace static noinline int do_realtime(struct timespec *ts)
44{ 44{
45 unsigned long seq, ns; 45 unsigned long seq, ns;
46 do { 46 do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts)
54} 54}
55 55
56/* Copy of the version in kernel/time.c which we cannot directly access */ 56/* Copy of the version in kernel/time.c which we cannot directly access */
57static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) 57notrace static void
58vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
58{ 59{
59 while (nsec >= NSEC_PER_SEC) { 60 while (nsec >= NSEC_PER_SEC) {
60 nsec -= NSEC_PER_SEC; 61 nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
68 ts->tv_nsec = nsec; 69 ts->tv_nsec = nsec;
69} 70}
70 71
71static noinline int do_monotonic(struct timespec *ts) 72notrace static noinline int do_monotonic(struct timespec *ts)
72{ 73{
73 unsigned long seq, ns, secs; 74 unsigned long seq, ns, secs;
74 do { 75 do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts)
82 return 0; 83 return 0;
83} 84}
84 85
85int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 86notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
86{ 87{
87 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) 88 if (likely(gtod->sysctl_enabled && gtod->clock.vread))
88 switch (clock) { 89 switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
96int clock_gettime(clockid_t, struct timespec *) 97int clock_gettime(clockid_t, struct timespec *)
97 __attribute__((weak, alias("__vdso_clock_gettime"))); 98 __attribute__((weak, alias("__vdso_clock_gettime")));
98 99
99int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 100notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
100{ 101{
101 long ret; 102 long ret;
102 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { 103 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
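Every function touched above gains a notrace annotation because vDSO code runs in user space; if the function tracer instrumented it, the generated call would target a kernel address the user process cannot reach. A small sketch of the usage pattern, assuming the kernel's notrace annotation is already in scope (the foo_* names are made up):

    /* Both the exported entry point and every helper it calls need the tag;
     * an unannotated callee could still end up instrumented. */
    notrace static inline long foo_read_cycles(void)
    {
            return 0;       /* placeholder for a clocksource vread hook */
    }

    notrace int __foo_vdso_gettime(long *out)
    {
            *out = foo_read_cycles();
            return 0;
    }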
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index cf058fecfcee..0bce5429a515 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -203,20 +203,11 @@ static struct page *vdso32_pages[1];
203 203
204#ifdef CONFIG_X86_64 204#ifdef CONFIG_X86_64
205 205
206static int use_sysenter __read_mostly = -1; 206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
207
208#define vdso32_sysenter() (use_sysenter > 0)
209 207
210/* May not be __init: called during resume */ 208/* May not be __init: called during resume */
211void syscall32_cpu_init(void) 209void syscall32_cpu_init(void)
212{ 210{
213 if (use_sysenter < 0) {
214 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
215 use_sysenter = 1;
216 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
217 use_sysenter = 1;
218 }
219
220 /* Load these always in case some future AMD CPU supports 211 /* Load these always in case some future AMD CPU supports
221 SYSENTER from compat mode too. */ 212 SYSENTER from compat mode too. */
222 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); 213 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
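Rather than caching vendor checks in a writable use_sysenter flag, the 32-bit vDSO setup now asks the CPU feature bits directly, so the answer comes from the feature detection done at boot. A minimal sketch of that kind of test (boot_cpu_has() and X86_FEATURE_SYSENTER32 are real interfaces; the wrapper name is made up):

    #include <asm/cpufeature.h>

    /* Hypothetical wrapper: true when SYSENTER is usable from compat mode. */
    static inline int foo_compat_sysenter_usable(void)
    {
            return boot_cpu_has(X86_FEATURE_SYSENTER32);
    }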
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index c8097f17f8a9..9fbc6b20026b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -13,7 +13,8 @@
13#include <asm/vgtod.h> 13#include <asm/vgtod.h>
14#include "vextern.h" 14#include "vextern.h"
15 15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16notrace long
17__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
17{ 18{
18 unsigned int p; 19 unsigned int p;
19 20
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3b980831602c..bb508456ef52 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1062,7 +1062,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
1062 1062
1063 .set_wallclock = xen_set_wallclock, 1063 .set_wallclock = xen_set_wallclock,
1064 .get_wallclock = xen_get_wallclock, 1064 .get_wallclock = xen_get_wallclock,
1065 .get_cpu_khz = xen_cpu_khz, 1065 .get_tsc_khz = xen_tsc_khz,
1066 .sched_clock = xen_sched_clock, 1066 .sched_clock = xen_sched_clock,
1067}; 1067};
1068 1068
@@ -1214,7 +1214,9 @@ static const struct smp_ops xen_smp_ops __initdata = {
1214 1214
1215 .smp_send_stop = xen_smp_send_stop, 1215 .smp_send_stop = xen_smp_send_stop,
1216 .smp_send_reschedule = xen_smp_send_reschedule, 1216 .smp_send_reschedule = xen_smp_send_reschedule,
1217 .smp_call_function_mask = xen_smp_call_function_mask, 1217
1218 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1219 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1218}; 1220};
1219#endif /* CONFIG_SMP */ 1221#endif /* CONFIG_SMP */
1220 1222
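After the generic smp_call_function rework, a backend such as Xen no longer implements the whole cross-call protocol; it only provides the two IPI-delivery hooks above, and kernel/smp.c owns the request queues. A paraphrased sketch of the x86 glue that forwards the generic requests to those hooks (assumed to match asm/smp.h in this tree; treat the exact spelling as an approximation):

    #include <linux/cpumask.h>
    #include <asm/smp.h>

    /* kernel/smp.c calls the arch_* helpers, which bounce into whichever
     * backend (native APIC, Xen, ...) filled in smp_ops. */
    static inline void foo_send_call_function_ipi(cpumask_t mask)
    {
            smp_ops.send_call_func_ipi(mask);
    }

    static inline void foo_send_call_function_single_ipi(int cpu)
    {
            smp_ops.send_call_func_single_ipi(cpu);
    }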
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 42b3b9ed641d..ff0aa74afaa1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -796,7 +796,7 @@ static void drop_mm_ref(struct mm_struct *mm)
796 } 796 }
797 797
798 if (!cpus_empty(mask)) 798 if (!cpus_empty(mask))
799 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 799 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
800} 800}
801#else 801#else
802static void drop_mm_ref(struct mm_struct *mm) 802static void drop_mm_ref(struct mm_struct *mm)
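The call above also shows the new calling convention: the old nonatomic argument is gone, so smp_call_function_mask() now takes (mask, func, info, wait). A short usage sketch with the post-rework prototypes (the foo_* names are made up; the smp_* calls are real):

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <linux/cpumask.h>

    static void foo_note_cpu(void *info)
    {
            /* runs in interrupt context on every CPU in the mask */
            printk(KERN_INFO "cross call on CPU %d\n", smp_processor_id());
    }

    static void foo_kick(cpumask_t mask)
    {
            /* last argument is just "wait"; there is no nonatomic flag any more */
            smp_call_function_mask(mask, foo_note_cpu, NULL, 1);
    }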
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d2e3c20127d7..233156f39b7f 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -36,27 +36,14 @@
36#include "mmu.h" 36#include "mmu.h"
37 37
38cpumask_t xen_cpu_initialized_map; 38cpumask_t xen_cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq) = -1;
40static DEFINE_PER_CPU(int, callfunc_irq) = -1;
41static DEFINE_PER_CPU(int, debug_irq) = -1;
42
43/*
44 * Structure and data for smp_call_function(). This is designed to minimise
45 * static memory requirements. It also looks cleaner.
46 */
47static DEFINE_SPINLOCK(call_lock);
48 39
49struct call_data_struct { 40static DEFINE_PER_CPU(int, resched_irq);
50 void (*func) (void *info); 41static DEFINE_PER_CPU(int, callfunc_irq);
51 void *info; 42static DEFINE_PER_CPU(int, callfuncsingle_irq);
52 atomic_t started; 43static DEFINE_PER_CPU(int, debug_irq) = -1;
53 atomic_t finished;
54 int wait;
55};
56 44
57static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 45static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
58 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
59static struct call_data_struct *call_data;
60 47
61/* 48/*
62 * Reschedule call back. Nothing to do, 49 * Reschedule call back. Nothing to do,
@@ -128,6 +115,17 @@ static int xen_smp_intr_init(unsigned int cpu)
128 goto fail; 115 goto fail;
129 per_cpu(debug_irq, cpu) = rc; 116 per_cpu(debug_irq, cpu) = rc;
130 117
118 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
119 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
120 cpu,
121 xen_call_function_single_interrupt,
122 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
123 callfunc_name,
124 NULL);
125 if (rc < 0)
126 goto fail;
127 per_cpu(callfuncsingle_irq, cpu) = rc;
128
131 return 0; 129 return 0;
132 130
133 fail: 131 fail:
@@ -137,6 +135,9 @@ static int xen_smp_intr_init(unsigned int cpu)
137 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 135 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
138 if (per_cpu(debug_irq, cpu) >= 0) 136 if (per_cpu(debug_irq, cpu) >= 0)
139 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 137 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
138 if (per_cpu(callfuncsingle_irq, cpu) >= 0)
139 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
140
140 return rc; 141 return rc;
141} 142}
142 143
@@ -336,7 +337,7 @@ static void stop_self(void *v)
336 337
337void xen_smp_send_stop(void) 338void xen_smp_send_stop(void)
338{ 339{
339 smp_call_function(stop_self, NULL, 0, 0); 340 smp_call_function(stop_self, NULL, 0);
340} 341}
341 342
342void xen_smp_send_reschedule(int cpu) 343void xen_smp_send_reschedule(int cpu)
@@ -344,7 +345,6 @@ void xen_smp_send_reschedule(int cpu)
344 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
345} 346}
346 347
347
348static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 348static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
349{ 349{
350 unsigned cpu; 350 unsigned cpu;
@@ -355,83 +355,42 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
355 xen_send_IPI_one(cpu, vector); 355 xen_send_IPI_one(cpu, vector);
356} 356}
357 357
358void xen_smp_send_call_function_ipi(cpumask_t mask)
359{
360 int cpu;
361
362 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
363
364 /* Make sure other vcpus get a chance to run if they need to. */
365 for_each_cpu_mask(cpu, mask) {
366 if (xen_vcpu_stolen(cpu)) {
367 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
368 break;
369 }
370 }
371}
372
373void xen_smp_send_call_function_single_ipi(int cpu)
374{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376}
377
358static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 378static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
359{ 379{
360 void (*func) (void *info) = call_data->func;
361 void *info = call_data->info;
362 int wait = call_data->wait;
363
364 /*
365 * Notify initiating CPU that I've grabbed the data and am
366 * about to execute the function
367 */
368 mb();
369 atomic_inc(&call_data->started);
370 /*
371 * At this point the info structure may be out of scope unless wait==1
372 */
373 irq_enter(); 380 irq_enter();
374 (*func)(info); 381 generic_smp_call_function_interrupt();
375 __get_cpu_var(irq_stat).irq_call_count++; 382 __get_cpu_var(irq_stat).irq_call_count++;
376 irq_exit(); 383 irq_exit();
377 384
378 if (wait) {
379 mb(); /* commit everything before setting finished */
380 atomic_inc(&call_data->finished);
381 }
382
383 return IRQ_HANDLED; 385 return IRQ_HANDLED;
384} 386}
385 387
386int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 388static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
387 void *info, int wait)
388{ 389{
389 struct call_data_struct data; 390 irq_enter();
390 int cpus, cpu; 391 generic_smp_call_function_single_interrupt();
391 bool yield; 392 __get_cpu_var(irq_stat).irq_call_count++;
392 393 irq_exit();
393 /* Holding any lock stops cpus from going down. */
394 spin_lock(&call_lock);
395
396 cpu_clear(smp_processor_id(), mask);
397
398 cpus = cpus_weight(mask);
399 if (!cpus) {
400 spin_unlock(&call_lock);
401 return 0;
402 }
403
404 /* Can deadlock when called with interrupts disabled */
405 WARN_ON(irqs_disabled());
406
407 data.func = func;
408 data.info = info;
409 atomic_set(&data.started, 0);
410 data.wait = wait;
411 if (wait)
412 atomic_set(&data.finished, 0);
413
414 call_data = &data;
415 mb(); /* write everything before IPI */
416
417 /* Send a message to other CPUs and wait for them to respond */
418 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
419
420 /* Make sure other vcpus get a chance to run if they need to. */
421 yield = false;
422 for_each_cpu_mask(cpu, mask)
423 if (xen_vcpu_stolen(cpu))
424 yield = true;
425
426 if (yield)
427 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
428
429 /* Wait for response */
430 while (atomic_read(&data.started) != cpus ||
431 (wait && atomic_read(&data.finished) != cpus))
432 cpu_relax();
433
434 spin_unlock(&call_lock);
435 394
436 return 0; 395 return IRQ_HANDLED;
437} 396}
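The new callfuncsingle IRQ gives single-CPU cross calls their own path: the generic code queues one request, the backend raises XEN_CALL_FUNCTION_SINGLE_VECTOR, and the handler above just hands off to generic_smp_call_function_single_interrupt(). A sketch of the caller side with the post-rework prototype (only smp_call_function_single() is a real interface here):

    #include <linux/smp.h>

    static void foo_poke(void *info)
    {
            *(int *)info = 1;       /* executed on the target CPU */
    }

    static int foo_poke_cpu(int cpu)
    {
            int done = 0;

            /* (cpu, func, info, wait) - the old nonatomic argument is gone */
            return smp_call_function_single(cpu, foo_poke, &done, 1);
    }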
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 64f0038b9558..685b77470fc3 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void)
197} 197}
198 198
199 199
200/* Get the CPU speed from Xen */ 200/* Get the TSC speed from Xen */
201unsigned long xen_cpu_khz(void) 201unsigned long xen_tsc_khz(void)
202{ 202{
203 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
204 const struct pvclock_vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a055592a307..6f4b1045c1c2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -32,7 +32,7 @@ void __init xen_build_dynamic_phys_to_machine(void);
32 32
33void xen_setup_timer(int cpu); 33void xen_setup_timer(int cpu);
34void xen_setup_cpu_clockevents(void); 34void xen_setup_cpu_clockevents(void);
35unsigned long xen_cpu_khz(void); 35unsigned long xen_tsc_khz(void);
36void __init xen_time_init(void); 36void __init xen_time_init(void);
37unsigned long xen_get_wallclock(void); 37unsigned long xen_get_wallclock(void);
38int xen_set_wallclock(unsigned long time); 38int xen_set_wallclock(unsigned long time);
@@ -55,13 +55,8 @@ void xen_smp_cpus_done(unsigned int max_cpus);
55 55
56void xen_smp_send_stop(void); 56void xen_smp_send_stop(void);
57void xen_smp_send_reschedule(int cpu); 57void xen_smp_send_reschedule(int cpu);
58int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, 58void xen_smp_send_call_function_ipi(cpumask_t mask);
59 int wait); 59void xen_smp_send_call_function_single_ipi(int cpu);
60int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
61 int nonatomic, int wait);
62
63int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
64 void *info, int wait);
65 60
66extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_t xen_cpu_initialized_map;
67 62