author    Frederic Weisbecker <fweisbec@gmail.com>    2009-10-17 19:09:09 -0400
committer Frederic Weisbecker <fweisbec@gmail.com>    2009-10-17 19:12:33 -0400
commit    0f8f86c7bdd1c954fbe153af437a0d91a6c5721a
tree      94a8d419a470a4f9852ca397bb9bbe48db92ff5c /arch/x86
parent    dca2d6ac09d9ef59ff46820d4f0c94b08a671202
parent    f39cdf25bf77219676ec5360980ac40b1a7e144a

Merge commit 'perf/core' into perf/hw-breakpoint
Conflicts:
	kernel/Makefile
	kernel/trace/Makefile
	kernel/trace/trace.h
	samples/Makefile

Merge reason: We need to be up to date with the perf events development
branch because we plan to rewrite the breakpoints API on top of perf events.
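As context for that merge reason: once breakpoints are layered on perf events, a debug-register watchpoint can be requested through the same perf_event_open() interface as any other event. The sketch below is purely illustrative and assumes the PERF_TYPE_BREAKPOINT event type and the bp_* fields of struct perf_event_attr from the later perf-events-based API; nothing in this merge defines them yet.

/* Illustrative only: watch writes to a variable via a perf breakpoint event.
 * Assumes PERF_TYPE_BREAKPOINT and the bp_* perf_event_attr fields, which
 * this merge merely prepares for. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	/* glibc provides no wrapper; issue the raw syscall. */
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int watched;

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size	= sizeof(attr);
	attr.type	= PERF_TYPE_BREAKPOINT;		/* hardware breakpoint event */
	attr.bp_type	= HW_BREAKPOINT_W;		/* fire on writes ...        */
	attr.bp_addr	= (unsigned long)&watched;	/* ... to this variable      */
	attr.bp_len	= HW_BREAKPOINT_LEN_4;

	fd = perf_event_open(&attr, 0, -1, -1, 0);	/* current task, any CPU */
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}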
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 126
-rw-r--r--  arch/x86/Kconfig.cpu | 3
-rw-r--r--  arch/x86/Makefile | 4
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 3
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 3
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S | 6
-rw-r--r--  arch/x86/boot/install.sh | 4
-rw-r--r--  arch/x86/ia32/ia32entry.S | 38
-rw-r--r--  arch/x86/include/asm/acpi.h | 1
-rw-r--r--  arch/x86/include/asm/agp.h | 4
-rw-r--r--  arch/x86/include/asm/apic.h | 32
-rw-r--r--  arch/x86/include/asm/bootparam.h | 13
-rw-r--r--  arch/x86/include/asm/cache.h | 4
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 54
-rw-r--r--  arch/x86/include/asm/checksum_32.h | 3
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 30
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 1
-rw-r--r--  arch/x86/include/asm/do_timer.h | 16
-rw-r--r--  arch/x86/include/asm/e820.h | 2
-rw-r--r--  arch/x86/include/asm/elf.h | 2
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 3
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 2
-rw-r--r--  arch/x86/include/asm/io_apic.h | 7
-rw-r--r--  arch/x86/include/asm/iomap.h | 9
-rw-r--r--  arch/x86/include/asm/irq.h | 3
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 1
-rw-r--r--  arch/x86/include/asm/mce.h | 34
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 6
-rw-r--r--  arch/x86/include/asm/mpspec.h | 47
-rw-r--r--  arch/x86/include/asm/msr-index.h | 11
-rw-r--r--  arch/x86/include/asm/mtrr.h | 6
-rw-r--r--  arch/x86/include/asm/nmi.h | 3
-rw-r--r--  arch/x86/include/asm/nops.h | 2
-rw-r--r--  arch/x86/include/asm/paravirt.h | 79
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 38
-rw-r--r--  arch/x86/include/asm/pat.h | 5
-rw-r--r--  arch/x86/include/asm/pci.h | 7
-rw-r--r--  arch/x86/include/asm/percpu.h | 9
-rw-r--r--  arch/x86/include/asm/perf_event.h (renamed from arch/x86/include/asm/perf_counter.h) | 43
-rw-r--r--  arch/x86/include/asm/pgtable.h | 10
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 5
-rw-r--r--  arch/x86/include/asm/processor.h | 32
-rw-r--r--  arch/x86/include/asm/setup.h | 49
-rw-r--r--  arch/x86/include/asm/smp.h | 1
-rw-r--r--  arch/x86/include/asm/string_32.h | 1
-rw-r--r--  arch/x86/include/asm/syscall.h | 14
-rw-r--r--  arch/x86/include/asm/time.h | 53
-rw-r--r--  arch/x86/include/asm/timer.h | 14
-rw-r--r--  arch/x86/include/asm/topology.h | 24
-rw-r--r--  arch/x86/include/asm/tsc.h | 3
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 2
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 2
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 4
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 19
-rw-r--r--  arch/x86/include/asm/vgtod.h | 1
-rw-r--r--  arch/x86/include/asm/vmware.h | 2
-rw-r--r--  arch/x86/include/asm/x86_init.h | 133
-rw-r--r--  arch/x86/kernel/Makefile | 7
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 2
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 40
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 63
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 6
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 57
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 15
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 11
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 12
-rw-r--r--  arch/x86/kernel/cpu/common.c | 5
-rw-r--r--  arch/x86/kernel/cpu/cpu_debug.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 116
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 44
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 14
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 116
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 159
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 396
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 94
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 163
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 127
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 80
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 29
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 46
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c (renamed from arch/x86/kernel/cpu/perf_counter.c) | 781
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r--  arch/x86/kernel/cpu/sched.c | 55
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 27
-rw-r--r--  arch/x86/kernel/cpuid.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 1
-rw-r--r--  arch/x86/kernel/e820.c | 25
-rw-r--r--  arch/x86/kernel/early_printk.c | 788
-rw-r--r--  arch/x86/kernel/efi.c | 4
-rw-r--r--  arch/x86/kernel/entry_32.S | 7
-rw-r--r--  arch/x86/kernel/entry_64.S | 32
-rw-r--r--  arch/x86/kernel/ftrace.c | 84
-rw-r--r--  arch/x86/kernel/head32.c | 26
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/head_32.S | 7
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c | 10
-rw-r--r--  arch/x86/kernel/i8253.c | 19
-rw-r--r--  arch/x86/kernel/init_task.c | 5
-rw-r--r--  arch/x86/kernel/irq.c | 4
-rw-r--r--  arch/x86/kernel/irqinit.c | 40
-rw-r--r--  arch/x86/kernel/kvmclock.c | 11
-rw-r--r--  arch/x86/kernel/ldt.c | 4
-rw-r--r--  arch/x86/kernel/microcode_core.c | 6
-rw-r--r--  arch/x86/kernel/mpparse.c | 75
-rw-r--r--  arch/x86/kernel/mrst.c | 24
-rw-r--r--  arch/x86/kernel/msr.c | 4
-rw-r--r--  arch/x86/kernel/paravirt.c | 36
-rw-r--r--  arch/x86/kernel/pci-dma.c | 8
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 1
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 5
-rw-r--r--  arch/x86/kernel/process.c | 31
-rw-r--r--  arch/x86/kernel/ptrace.c | 21
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 8
-rw-r--r--  arch/x86/kernel/rtc.c | 17
-rw-r--r--  arch/x86/kernel/setup.c | 131
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 364
-rw-r--r--  arch/x86/kernel/sfi.c | 122
-rw-r--r--  arch/x86/kernel/signal.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 29
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 2
-rw-r--r--  arch/x86/kernel/tboot.c | 447
-rw-r--r--  arch/x86/kernel/time.c | 121
-rw-r--r--  arch/x86/kernel/time_32.c | 137
-rw-r--r--  arch/x86/kernel/time_64.c | 135
-rw-r--r--  arch/x86/kernel/trampoline.c | 12
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 8
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 7
-rw-r--r--  arch/x86/kernel/traps.c | 13
-rw-r--r--  arch/x86/kernel/tsc.c | 88
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 2
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 54
-rw-r--r--  arch/x86/kernel/vmi_32.c | 14
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 111
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 11
-rw-r--r--  arch/x86/kernel/x86_init.c | 75
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 84
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 18
-rw-r--r--  arch/x86/kvm/svm.c | 25
-rw-r--r--  arch/x86/kvm/vmx.c | 2
-rw-r--r--  arch/x86/kvm/x86.c | 2
-rw-r--r--  arch/x86/lguest/boot.c | 21
-rw-r--r--  arch/x86/lib/Makefile | 4
-rw-r--r--  arch/x86/lib/cmpxchg8b_emu.S | 57
-rw-r--r--  arch/x86/mm/Makefile | 3
-rw-r--r--  arch/x86/mm/fault.c | 27
-rw-r--r--  arch/x86/mm/init.c | 63
-rw-r--r--  arch/x86/mm/init_32.c | 12
-rw-r--r--  arch/x86/mm/init_64.c | 12
-rw-r--r--  arch/x86/mm/iomap_32.c | 27
-rw-r--r--  arch/x86/mm/ioremap.c | 18
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c | 3
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c | 1
-rw-r--r--  arch/x86/mm/mmap.c | 17
-rw-r--r--  arch/x86/mm/pageattr.c | 30
-rw-r--r--  arch/x86/mm/pat.c | 360
-rw-r--r--  arch/x86/mm/setup_nx.c | 69
-rw-r--r--  arch/x86/mm/testmmiotrace.c | 29
-rw-r--r--  arch/x86/mm/tlb.c | 15
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 4
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 2
-rw-r--r--  arch/x86/pci/amd_bus.c | 64
-rw-r--r--  arch/x86/pci/common.c | 69
-rw-r--r--  arch/x86/pci/i386.c | 2
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 8
-rw-r--r--  arch/x86/pci/mmconfig_32.c | 2
-rw-r--r--  arch/x86/power/cpu.c | 6
-rw-r--r--  arch/x86/vdso/Makefile | 2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 39
-rw-r--r--  arch/x86/xen/debugfs.c | 2
-rw-r--r--  arch/x86/xen/enlighten.c | 36
-rw-r--r--  arch/x86/xen/irq.c | 5
-rw-r--r--  arch/x86/xen/mmu.c | 20
-rw-r--r--  arch/x86/xen/mmu.h | 2
-rw-r--r--  arch/x86/xen/xen-ops.h | 2
190 files changed, 3811 insertions, 4128 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c71a8f8bdba8..1b7c74350a04 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,7 +24,7 @@ config X86
24 select HAVE_UNSTABLE_SCHED_CLOCK 24 select HAVE_UNSTABLE_SCHED_CLOCK
25 select HAVE_IDE 25 select HAVE_IDE
26 select HAVE_OPROFILE 26 select HAVE_OPROFILE
27 select HAVE_PERF_COUNTERS if (!M386 && !M486) 27 select HAVE_PERF_EVENTS if (!M386 && !M486)
28 select HAVE_IOREMAP_PROT 28 select HAVE_IOREMAP_PROT
29 select HAVE_KPROBES 29 select HAVE_KPROBES
30 select ARCH_WANT_OPTIONAL_GPIOLIB 30 select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -87,10 +87,6 @@ config STACKTRACE_SUPPORT
87config HAVE_LATENCYTOP_SUPPORT 87config HAVE_LATENCYTOP_SUPPORT
88 def_bool y 88 def_bool y
89 89
90config FAST_CMPXCHG_LOCAL
91 bool
92 default y
93
94config MMU 90config MMU
95 def_bool y 91 def_bool y
96 92
@@ -151,7 +147,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
151config HAVE_SETUP_PER_CPU_AREA 147config HAVE_SETUP_PER_CPU_AREA
152 def_bool y 148 def_bool y
153 149
154config HAVE_DYNAMIC_PER_CPU_AREA 150config NEED_PER_CPU_EMBED_FIRST_CHUNK
151 def_bool y
152
153config NEED_PER_CPU_PAGE_FIRST_CHUNK
155 def_bool y 154 def_bool y
156 155
157config HAVE_CPUMASK_OF_CPU_MAP 156config HAVE_CPUMASK_OF_CPU_MAP
@@ -180,6 +179,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
180config ARCH_SUPPORTS_DEBUG_PAGEALLOC 179config ARCH_SUPPORTS_DEBUG_PAGEALLOC
181 def_bool y 180 def_bool y
182 181
182config HAVE_INTEL_TXT
183 def_bool y
184 depends on EXPERIMENTAL && DMAR && ACPI
185
183# Use the generic interrupt handling code in kernel/irq/: 186# Use the generic interrupt handling code in kernel/irq/:
184config GENERIC_HARDIRQS 187config GENERIC_HARDIRQS
185 bool 188 bool
@@ -319,6 +322,7 @@ config X86_EXTENDED_PLATFORM
319 SGI 320/540 (Visual Workstation) 322 SGI 320/540 (Visual Workstation)
320 Summit/EXA (IBM x440) 323 Summit/EXA (IBM x440)
321 Unisys ES7000 IA32 series 324 Unisys ES7000 IA32 series
325 Moorestown MID devices
322 326
323 If you have one of these systems, or if you want to build a 327 If you have one of these systems, or if you want to build a
324 generic distribution kernel, say Y here - otherwise say N. 328 generic distribution kernel, say Y here - otherwise say N.
@@ -378,6 +382,18 @@ config X86_ELAN
378 382
379 If unsure, choose "PC-compatible" instead. 383 If unsure, choose "PC-compatible" instead.
380 384
385config X86_MRST
386 bool "Moorestown MID platform"
387 depends on X86_32
388 depends on X86_EXTENDED_PLATFORM
389 ---help---
390 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
391 Internet Device(MID) platform. Moorestown consists of two chips:
392 Lincroft (CPU core, graphics, and memory controller) and Langwell IOH.
393 Unlike standard x86 PCs, Moorestown does not have many legacy devices
394 nor standard legacy replacement devices/features. e.g. Moorestown does
395 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
396
381config X86_RDC321X 397config X86_RDC321X
382 bool "RDC R-321x SoC" 398 bool "RDC R-321x SoC"
383 depends on X86_32 399 depends on X86_32
@@ -413,6 +429,17 @@ config X86_NUMAQ
413 of Flat Logical. You will need a new lynxer.elf file to flash your 429 of Flat Logical. You will need a new lynxer.elf file to flash your
414 firmware with - send email to <Martin.Bligh@us.ibm.com>. 430 firmware with - send email to <Martin.Bligh@us.ibm.com>.
415 431
432config X86_SUPPORTS_MEMORY_FAILURE
433 bool
434 # MCE code calls memory_failure():
435 depends on X86_MCE
436 # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
437 depends on !X86_NUMAQ
438 # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
439 depends on X86_64 || !SPARSEMEM
440 select ARCH_SUPPORTS_MEMORY_FAILURE
441 default y
442
416config X86_VISWS 443config X86_VISWS
417 bool "SGI 320/540 (Visual Workstation)" 444 bool "SGI 320/540 (Visual Workstation)"
418 depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT 445 depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
@@ -465,7 +492,7 @@ if PARAVIRT_GUEST
465source "arch/x86/xen/Kconfig" 492source "arch/x86/xen/Kconfig"
466 493
467config VMI 494config VMI
468 bool "VMI Guest support" 495 bool "VMI Guest support (DEPRECATED)"
469 select PARAVIRT 496 select PARAVIRT
470 depends on X86_32 497 depends on X86_32
471 ---help--- 498 ---help---
@@ -474,6 +501,15 @@ config VMI
474 at the moment), by linking the kernel to a GPL-ed ROM module 501 at the moment), by linking the kernel to a GPL-ed ROM module
475 provided by the hypervisor. 502 provided by the hypervisor.
476 503
504 As of September 2009, VMware has started a phased retirement
505 of this feature from VMware's products. Please see
506 feature-removal-schedule.txt for details. If you are
507 planning to enable this option, please note that you cannot
508 live migrate a VMI enabled VM to a future VMware product,
509 which doesn't support VMI. So if you expect your kernel to
510 seamlessly migrate to newer VMware products, keep this
511 disabled.
512
477config KVM_CLOCK 513config KVM_CLOCK
478 bool "KVM paravirtualized clock" 514 bool "KVM paravirtualized clock"
479 select PARAVIRT 515 select PARAVIRT
@@ -777,41 +813,17 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
777 increased on these systems. 813 increased on these systems.
778 814
779config X86_MCE 815config X86_MCE
780 bool "Machine Check Exception" 816 bool "Machine Check / overheating reporting"
781 ---help--- 817 ---help---
782 Machine Check Exception support allows the processor to notify the 818 Machine Check support allows the processor to notify the
783 kernel if it detects a problem (e.g. overheating, component failure). 819 kernel if it detects a problem (e.g. overheating, data corruption).
784 The action the kernel takes depends on the severity of the problem, 820 The action the kernel takes depends on the severity of the problem,
785 ranging from a warning message on the console, to halting the machine. 821 ranging from warning messages to halting the machine.
786 Your processor must be a Pentium or newer to support this - check the
787 flags in /proc/cpuinfo for mce. Note that some older Pentium systems
788 have a design flaw which leads to false MCE events - hence MCE is
789 disabled on all P5 processors, unless explicitly enabled with "mce"
790 as a boot argument. Similarly, if MCE is built in and creates a
791 problem on some new non-standard machine, you can boot with "nomce"
792 to disable it. MCE support simply ignores non-MCE processors like
793 the 386 and 486, so nearly everyone can say Y here.
794
795config X86_OLD_MCE
796 depends on X86_32 && X86_MCE
797 bool "Use legacy machine check code (will go away)"
798 default n
799 select X86_ANCIENT_MCE
800 ---help---
801 Use the old i386 machine check code. This is merely intended for
802 testing in a transition period. Try this if you run into any machine
803 check related software problems, but report the problem to
804 linux-kernel. When in doubt say no.
805
806config X86_NEW_MCE
807 depends on X86_MCE
808 bool
809 default y if (!X86_OLD_MCE && X86_32) || X86_64
810 822
811config X86_MCE_INTEL 823config X86_MCE_INTEL
812 def_bool y 824 def_bool y
813 prompt "Intel MCE features" 825 prompt "Intel MCE features"
814 depends on X86_NEW_MCE && X86_LOCAL_APIC 826 depends on X86_MCE && X86_LOCAL_APIC
815 ---help--- 827 ---help---
816 Additional support for intel specific MCE features such as 828 Additional support for intel specific MCE features such as
817 the thermal monitor. 829 the thermal monitor.
@@ -819,14 +831,14 @@ config X86_MCE_INTEL
819config X86_MCE_AMD 831config X86_MCE_AMD
820 def_bool y 832 def_bool y
821 prompt "AMD MCE features" 833 prompt "AMD MCE features"
822 depends on X86_NEW_MCE && X86_LOCAL_APIC 834 depends on X86_MCE && X86_LOCAL_APIC
823 ---help--- 835 ---help---
824 Additional support for AMD specific MCE features such as 836 Additional support for AMD specific MCE features such as
825 the DRAM Error Threshold. 837 the DRAM Error Threshold.
826 838
827config X86_ANCIENT_MCE 839config X86_ANCIENT_MCE
828 def_bool n 840 def_bool n
829 depends on X86_32 841 depends on X86_32 && X86_MCE
830 prompt "Support for old Pentium 5 / WinChip machine checks" 842 prompt "Support for old Pentium 5 / WinChip machine checks"
831 ---help--- 843 ---help---
832 Include support for machine check handling on old Pentium 5 or WinChip 844 Include support for machine check handling on old Pentium 5 or WinChip
@@ -839,36 +851,16 @@ config X86_MCE_THRESHOLD
839 default y 851 default y
840 852
841config X86_MCE_INJECT 853config X86_MCE_INJECT
842 depends on X86_NEW_MCE 854 depends on X86_MCE
843 tristate "Machine check injector support" 855 tristate "Machine check injector support"
844 ---help--- 856 ---help---
845 Provide support for injecting machine checks for testing purposes. 857 Provide support for injecting machine checks for testing purposes.
846 If you don't know what a machine check is and you don't do kernel 858 If you don't know what a machine check is and you don't do kernel
847 QA it is safe to say n. 859 QA it is safe to say n.
848 860
849config X86_MCE_NONFATAL
850 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
851 depends on X86_OLD_MCE
852 ---help---
853 Enabling this feature starts a timer that triggers every 5 seconds which
854 will look at the machine check registers to see if anything happened.
855 Non-fatal problems automatically get corrected (but still logged).
856 Disable this if you don't want to see these messages.
857 Seeing the messages this option prints out may be indicative of dying
858 or out-of-spec (ie, overclocked) hardware.
859 This option only does something on certain CPUs.
860 (AMD Athlon/Duron and Intel Pentium 4)
861
862config X86_MCE_P4THERMAL
863 bool "check for P4 thermal throttling interrupt."
864 depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
865 ---help---
866 Enabling this feature will cause a message to be printed when the P4
867 enters thermal throttling.
868
869config X86_THERMAL_VECTOR 861config X86_THERMAL_VECTOR
870 def_bool y 862 def_bool y
871 depends on X86_MCE_P4THERMAL || X86_MCE_INTEL 863 depends on X86_MCE_INTEL
872 864
873config VM86 865config VM86
874 bool "Enable VM86 support" if EMBEDDED 866 bool "Enable VM86 support" if EMBEDDED
@@ -1229,6 +1221,10 @@ config ARCH_DISCONTIGMEM_DEFAULT
1229 def_bool y 1221 def_bool y
1230 depends on NUMA && X86_32 1222 depends on NUMA && X86_32
1231 1223
1224config ARCH_PROC_KCORE_TEXT
1225 def_bool y
1226 depends on X86_64 && PROC_KCORE
1227
1232config ARCH_SPARSEMEM_DEFAULT 1228config ARCH_SPARSEMEM_DEFAULT
1233 def_bool y 1229 def_bool y
1234 depends on X86_64 1230 depends on X86_64
@@ -1414,6 +1410,10 @@ config X86_PAT
1414 1410
1415 If unsure, say Y. 1411 If unsure, say Y.
1416 1412
1413config ARCH_USES_PG_UNCACHED
1414 def_bool y
1415 depends on X86_PAT
1416
1417config EFI 1417config EFI
1418 bool "EFI runtime service support" 1418 bool "EFI runtime service support"
1419 depends on ACPI 1419 depends on ACPI
@@ -1683,6 +1683,8 @@ source "kernel/power/Kconfig"
1683 1683
1684source "drivers/acpi/Kconfig" 1684source "drivers/acpi/Kconfig"
1685 1685
1686source "drivers/sfi/Kconfig"
1687
1686config X86_APM_BOOT 1688config X86_APM_BOOT
1687 bool 1689 bool
1688 default y 1690 default y
@@ -1878,7 +1880,7 @@ config PCI_DIRECT
1878 1880
1879config PCI_MMCONFIG 1881config PCI_MMCONFIG
1880 def_bool y 1882 def_bool y
1881 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) 1883 depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY)
1882 1884
1883config PCI_OLPC 1885config PCI_OLPC
1884 def_bool y 1886 def_bool y
@@ -1916,7 +1918,7 @@ config DMAR_DEFAULT_ON
1916config DMAR_BROKEN_GFX_WA 1918config DMAR_BROKEN_GFX_WA
1917 def_bool n 1919 def_bool n
1918 prompt "Workaround broken graphics drivers (going away soon)" 1920 prompt "Workaround broken graphics drivers (going away soon)"
1919 depends on DMAR 1921 depends on DMAR && BROKEN
1920 ---help--- 1922 ---help---
1921 Current Graphics drivers tend to use physical address 1923 Current Graphics drivers tend to use physical address
1922 for DMA and avoid using DMA APIs. Setting this config 1924 for DMA and avoid using DMA APIs. Setting this config
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 527519b8a9f9..f2824fb8c79c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -400,7 +400,7 @@ config X86_TSC
400 400
401config X86_CMPXCHG64 401config X86_CMPXCHG64
402 def_bool y 402 def_bool y
403 depends on X86_PAE || X86_64 403 depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
404 404
405# this should be set for all -march=.. options where the compiler 405# this should be set for all -march=.. options where the compiler
406# generates cmov. 406# generates cmov.
@@ -412,6 +412,7 @@ config X86_MINIMUM_CPU_FAMILY
412 int 412 int
413 default "64" if X86_64 413 default "64" if X86_64
414 default "6" if X86_32 && X86_P6_NOP 414 default "6" if X86_32 && X86_P6_NOP
415 default "5" if X86_32 && X86_CMPXCHG64
415 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) 416 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
416 default "3" 417 default "3"
417 418
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 7983c420eaf2..a012ee8ef803 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -179,8 +179,8 @@ archclean:
179define archhelp 179define archhelp
180 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' 180 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
181 echo ' install - Install kernel using' 181 echo ' install - Install kernel using'
182 echo ' (your) ~/bin/installkernel or' 182 echo ' (your) ~/bin/$(INSTALLKERNEL) or'
183 echo ' (distribution) /sbin/installkernel or' 183 echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
184 echo ' install to $$(INSTALL_PATH) and run lilo' 184 echo ' install to $$(INSTALL_PATH) and run lilo'
185 echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' 185 echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
186 echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' 186 echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 75e4f001e706..f543b70ffae2 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -23,13 +23,14 @@
23 */ 23 */
24 .text 24 .text
25 25
26#include <linux/init.h>
26#include <linux/linkage.h> 27#include <linux/linkage.h>
27#include <asm/segment.h> 28#include <asm/segment.h>
28#include <asm/page_types.h> 29#include <asm/page_types.h>
29#include <asm/boot.h> 30#include <asm/boot.h>
30#include <asm/asm-offsets.h> 31#include <asm/asm-offsets.h>
31 32
32 .section ".text.head","ax",@progbits 33 __HEAD
33ENTRY(startup_32) 34ENTRY(startup_32)
34 cld 35 cld
35 /* 36 /*
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f62c284db9eb..077e1b69198e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -24,6 +24,7 @@
24 .code32 24 .code32
25 .text 25 .text
26 26
27#include <linux/init.h>
27#include <linux/linkage.h> 28#include <linux/linkage.h>
28#include <asm/segment.h> 29#include <asm/segment.h>
29#include <asm/pgtable_types.h> 30#include <asm/pgtable_types.h>
@@ -33,7 +34,7 @@
33#include <asm/processor-flags.h> 34#include <asm/processor-flags.h>
34#include <asm/asm-offsets.h> 35#include <asm/asm-offsets.h>
35 36
36 .section ".text.head" 37 __HEAD
37 .code32 38 .code32
38ENTRY(startup_32) 39ENTRY(startup_32)
39 cld 40 cld
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index cc353e1b3ffd..f4193bb48782 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,3 +1,5 @@
1#include <asm-generic/vmlinux.lds.h>
2
1OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) 3OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
2 4
3#undef i386 5#undef i386
@@ -18,9 +20,9 @@ SECTIONS
18 * address 0. 20 * address 0.
19 */ 21 */
20 . = 0; 22 . = 0;
21 .text.head : { 23 .head.text : {
22 _head = . ; 24 _head = . ;
23 *(.text.head) 25 HEAD_TEXT
24 _ehead = . ; 26 _ehead = . ;
25 } 27 }
26 .rodata.compressed : { 28 .rodata.compressed : {
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index 8d60ee15dfd9..d13ec1c38640 100644
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -33,8 +33,8 @@ verify "$3"
33 33
34# User may have a custom install script 34# User may have a custom install script
35 35
36if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi 36if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
37if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi 37if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
38 38
39# Default install - same as make zlilo 39# Default install - same as make zlilo
40 40
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ba331bfd1112..1733f9f65e82 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -21,8 +21,8 @@
21#define __AUDIT_ARCH_LE 0x40000000 21#define __AUDIT_ARCH_LE 0x40000000
22 22
23#ifndef CONFIG_AUDITSYSCALL 23#ifndef CONFIG_AUDITSYSCALL
24#define sysexit_audit int_ret_from_sys_call 24#define sysexit_audit ia32_ret_from_sys_call
25#define sysretl_audit int_ret_from_sys_call 25#define sysretl_audit ia32_ret_from_sys_call
26#endif 26#endif
27 27
28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
@@ -39,12 +39,12 @@
39 .endm 39 .endm
40 40
41 /* clobbers %eax */ 41 /* clobbers %eax */
42 .macro CLEAR_RREGS _r9=rax 42 .macro CLEAR_RREGS offset=0, _r9=rax
43 xorl %eax,%eax 43 xorl %eax,%eax
44 movq %rax,R11(%rsp) 44 movq %rax,\offset+R11(%rsp)
45 movq %rax,R10(%rsp) 45 movq %rax,\offset+R10(%rsp)
46 movq %\_r9,R9(%rsp) 46 movq %\_r9,\offset+R9(%rsp)
47 movq %rax,R8(%rsp) 47 movq %rax,\offset+R8(%rsp)
48 .endm 48 .endm
49 49
50 /* 50 /*
@@ -172,6 +172,10 @@ sysexit_from_sys_call:
172 movl RIP-R11(%rsp),%edx /* User %eip */ 172 movl RIP-R11(%rsp),%edx /* User %eip */
173 CFI_REGISTER rip,rdx 173 CFI_REGISTER rip,rdx
174 RESTORE_ARGS 1,24,1,1,1,1 174 RESTORE_ARGS 1,24,1,1,1,1
175 xorq %r8,%r8
176 xorq %r9,%r9
177 xorq %r10,%r10
178 xorq %r11,%r11
175 popfq 179 popfq
176 CFI_ADJUST_CFA_OFFSET -8 180 CFI_ADJUST_CFA_OFFSET -8
177 /*CFI_RESTORE rflags*/ 181 /*CFI_RESTORE rflags*/
@@ -202,7 +206,7 @@ sysexit_from_sys_call:
202 206
203 .macro auditsys_exit exit,ebpsave=RBP 207 .macro auditsys_exit exit,ebpsave=RBP
204 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 208 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
205 jnz int_ret_from_sys_call 209 jnz ia32_ret_from_sys_call
206 TRACE_IRQS_ON 210 TRACE_IRQS_ON
207 sti 211 sti
208 movl %eax,%esi /* second arg, syscall return value */ 212 movl %eax,%esi /* second arg, syscall return value */
@@ -218,8 +222,9 @@ sysexit_from_sys_call:
218 cli 222 cli
219 TRACE_IRQS_OFF 223 TRACE_IRQS_OFF
220 testl %edi,TI_flags(%r10) 224 testl %edi,TI_flags(%r10)
221 jnz int_with_check 225 jz \exit
222 jmp \exit 226 CLEAR_RREGS -ARGOFFSET
227 jmp int_with_check
223 .endm 228 .endm
224 229
225sysenter_auditsys: 230sysenter_auditsys:
@@ -329,6 +334,9 @@ sysretl_from_sys_call:
329 CFI_REGISTER rip,rcx 334 CFI_REGISTER rip,rcx
330 movl EFLAGS-ARGOFFSET(%rsp),%r11d 335 movl EFLAGS-ARGOFFSET(%rsp),%r11d
331 /*CFI_REGISTER rflags,r11*/ 336 /*CFI_REGISTER rflags,r11*/
337 xorq %r10,%r10
338 xorq %r9,%r9
339 xorq %r8,%r8
332 TRACE_IRQS_ON 340 TRACE_IRQS_ON
333 movl RSP-ARGOFFSET(%rsp),%esp 341 movl RSP-ARGOFFSET(%rsp),%esp
334 CFI_RESTORE rsp 342 CFI_RESTORE rsp
@@ -353,7 +361,7 @@ cstar_tracesys:
353#endif 361#endif
354 xchgl %r9d,%ebp 362 xchgl %r9d,%ebp
355 SAVE_REST 363 SAVE_REST
356 CLEAR_RREGS r9 364 CLEAR_RREGS 0, r9
357 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 365 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
358 movq %rsp,%rdi /* &pt_regs -> arg1 */ 366 movq %rsp,%rdi /* &pt_regs -> arg1 */
359 call syscall_trace_enter 367 call syscall_trace_enter
@@ -425,6 +433,8 @@ ia32_do_call:
425 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative 433 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
426ia32_sysret: 434ia32_sysret:
427 movq %rax,RAX-ARGOFFSET(%rsp) 435 movq %rax,RAX-ARGOFFSET(%rsp)
436ia32_ret_from_sys_call:
437 CLEAR_RREGS -ARGOFFSET
428 jmp int_ret_from_sys_call 438 jmp int_ret_from_sys_call
429 439
430ia32_tracesys: 440ia32_tracesys:
@@ -442,8 +452,8 @@ END(ia32_syscall)
442 452
443ia32_badsys: 453ia32_badsys:
444 movq $0,ORIG_RAX-ARGOFFSET(%rsp) 454 movq $0,ORIG_RAX-ARGOFFSET(%rsp)
445 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 455 movq $-ENOSYS,%rax
446 jmp int_ret_from_sys_call 456 jmp ia32_sysret
447 457
448quiet_ni_syscall: 458quiet_ni_syscall:
449 movq $-ENOSYS,%rax 459 movq $-ENOSYS,%rax
@@ -831,5 +841,5 @@ ia32_sys_call_table:
831 .quad compat_sys_preadv 841 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 842 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 843 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open 844 .quad sys_perf_event_open
835ia32_syscall_end: 845ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 20d1465a2ab0..4518dc500903 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
144 144
145#else /* !CONFIG_ACPI */ 145#else /* !CONFIG_ACPI */
146 146
147#define acpi_disabled 1
148#define acpi_lapic 0 147#define acpi_lapic 0
149#define acpi_ioapic 0 148#define acpi_ioapic 0
150static inline void acpi_noirq_set(void) { } 149static inline void acpi_noirq_set(void) { }
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 9825cd64c9b6..eec2a70d4376 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -22,10 +22,6 @@
22 */ 22 */
23#define flush_agp_cache() wbinvd() 23#define flush_agp_cache() wbinvd()
24 24
25/* Convert a physical address to an address suitable for the GART. */
26#define phys_to_gart(x) (x)
27#define gart_to_phys(x) (x)
28
29/* GATT allocation. Returns/accepts GATT kernel virtual address. */ 25/* GATT allocation. Returns/accepts GATT kernel virtual address. */
30#define alloc_gatt_pages(order) \ 26#define alloc_gatt_pages(order) \
31 ((char *)__get_free_pages(GFP_KERNEL, (order))) 27 ((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 586b7adb8e53..474d80d3e6cc 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -66,13 +66,23 @@ static inline void default_inquire_remote_apic(int apicid)
66} 66}
67 67
68/* 68/*
69 * With 82489DX we can't rely on apic feature bit
70 * retrieved via cpuid but still have to deal with
71 * such an apic chip so we assume that SMP configuration
72 * is found from MP table (64bit case uses ACPI mostly
73 * which set smp presence flag as well so we are safe
74 * to use this helper too).
75 */
76static inline bool apic_from_smp_config(void)
77{
78 return smp_found_config && !disable_apic;
79}
80
81/*
69 * Basic functions accessing APICs. 82 * Basic functions accessing APICs.
70 */ 83 */
71#ifdef CONFIG_PARAVIRT 84#ifdef CONFIG_PARAVIRT
72#include <asm/paravirt.h> 85#include <asm/paravirt.h>
73#else
74#define setup_boot_clock setup_boot_APIC_clock
75#define setup_secondary_clock setup_secondary_APIC_clock
76#endif 86#endif
77 87
78#ifdef CONFIG_X86_64 88#ifdef CONFIG_X86_64
@@ -252,6 +262,8 @@ static inline void lapic_shutdown(void) { }
252static inline void init_apic_mappings(void) { } 262static inline void init_apic_mappings(void) { }
253static inline void disable_local_APIC(void) { } 263static inline void disable_local_APIC(void) { }
254static inline void apic_disable(void) { } 264static inline void apic_disable(void) { }
265# define setup_boot_APIC_clock x86_init_noop
266# define setup_secondary_APIC_clock x86_init_noop
255#endif /* !CONFIG_X86_LOCAL_APIC */ 267#endif /* !CONFIG_X86_LOCAL_APIC */
256 268
257#ifdef CONFIG_X86_64 269#ifdef CONFIG_X86_64
@@ -300,7 +312,7 @@ struct apic {
300 int (*cpu_present_to_apicid)(int mps_cpu); 312 int (*cpu_present_to_apicid)(int mps_cpu);
301 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); 313 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
302 void (*setup_portio_remap)(void); 314 void (*setup_portio_remap)(void);
303 int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); 315 int (*check_phys_apicid_present)(int phys_apicid);
304 void (*enable_apic_mode)(void); 316 void (*enable_apic_mode)(void);
305 int (*phys_pkg_id)(int cpuid_apic, int index_msb); 317 int (*phys_pkg_id)(int cpuid_apic, int index_msb);
306 318
@@ -434,7 +446,7 @@ extern struct apic apic_x2apic_uv_x;
434DECLARE_PER_CPU(int, x2apic_extra_bits); 446DECLARE_PER_CPU(int, x2apic_extra_bits);
435 447
436extern int default_cpu_present_to_apicid(int mps_cpu); 448extern int default_cpu_present_to_apicid(int mps_cpu);
437extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); 449extern int default_check_phys_apicid_present(int phys_apicid);
438#endif 450#endif
439 451
440static inline void default_wait_for_init_deassert(atomic_t *deassert) 452static inline void default_wait_for_init_deassert(atomic_t *deassert)
@@ -550,9 +562,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu)
550} 562}
551 563
552static inline int 564static inline int
553__default_check_phys_apicid_present(int boot_cpu_physical_apicid) 565__default_check_phys_apicid_present(int phys_apicid)
554{ 566{
555 return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); 567 return physid_isset(phys_apicid, phys_cpu_present_map);
556} 568}
557 569
558#ifdef CONFIG_X86_32 570#ifdef CONFIG_X86_32
@@ -562,13 +574,13 @@ static inline int default_cpu_present_to_apicid(int mps_cpu)
562} 574}
563 575
564static inline int 576static inline int
565default_check_phys_apicid_present(int boot_cpu_physical_apicid) 577default_check_phys_apicid_present(int phys_apicid)
566{ 578{
567 return __default_check_phys_apicid_present(boot_cpu_physical_apicid); 579 return __default_check_phys_apicid_present(phys_apicid);
568} 580}
569#else 581#else
570extern int default_cpu_present_to_apicid(int mps_cpu); 582extern int default_cpu_present_to_apicid(int mps_cpu);
571extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); 583extern int default_check_phys_apicid_present(int phys_apicid);
572#endif 584#endif
573 585
574static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) 586static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 1724e8de317c..6be33d83c716 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -85,7 +85,8 @@ struct efi_info {
85struct boot_params { 85struct boot_params {
86 struct screen_info screen_info; /* 0x000 */ 86 struct screen_info screen_info; /* 0x000 */
87 struct apm_bios_info apm_bios_info; /* 0x040 */ 87 struct apm_bios_info apm_bios_info; /* 0x040 */
88 __u8 _pad2[12]; /* 0x054 */ 88 __u8 _pad2[4]; /* 0x054 */
89 __u64 tboot_addr; /* 0x058 */
89 struct ist_info ist_info; /* 0x060 */ 90 struct ist_info ist_info; /* 0x060 */
90 __u8 _pad3[16]; /* 0x070 */ 91 __u8 _pad3[16]; /* 0x070 */
91 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 92 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
@@ -109,4 +110,14 @@ struct boot_params {
109 __u8 _pad9[276]; /* 0xeec */ 110 __u8 _pad9[276]; /* 0xeec */
110} __attribute__((packed)); 111} __attribute__((packed));
111 112
113enum {
114 X86_SUBARCH_PC = 0,
115 X86_SUBARCH_LGUEST,
116 X86_SUBARCH_XEN,
117 X86_SUBARCH_MRST,
118 X86_NR_SUBARCHS,
119};
120
121
122
112#endif /* _ASM_X86_BOOTPARAM_H */ 123#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 5d367caa0e36..549860d3be8f 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_CACHE_H 1#ifndef _ASM_X86_CACHE_H
2#define _ASM_X86_CACHE_H 2#define _ASM_X86_CACHE_H
3 3
4#include <linux/linkage.h>
5
4/* L1 cache line size */ 6/* L1 cache line size */
5#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) 7#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
6#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) 8#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
@@ -13,7 +15,7 @@
13#ifdef CONFIG_SMP 15#ifdef CONFIG_SMP
14#define __cacheline_aligned_in_smp \ 16#define __cacheline_aligned_in_smp \
15 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ 17 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
16 __attribute__((__section__(".data.page_aligned"))) 18 __page_aligned_data
17#endif 19#endif
18#endif 20#endif
19 21
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index e55dfc1ad453..b54f6afe7ec4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
43 memcpy(dst, src, len); 43 memcpy(dst, src, len);
44} 44}
45 45
46#define PG_non_WB PG_arch_1 46#define PG_WC PG_arch_1
47PAGEFLAG(NonWB, non_WB) 47PAGEFLAG(WC, WC)
48
49#ifdef CONFIG_X86_PAT
50/*
51 * X86 PAT uses page flags WC and Uncached together to keep track of
52 * memory type of pages that have backing page struct. X86 PAT supports 3
53 * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and
54 * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
55 * been changed from its default (value of -1 used to denote this).
56 * Note we do not support _PAGE_CACHE_UC here.
57 *
58 * Caller must hold memtype_lock for atomicity.
59 */
60static inline unsigned long get_page_memtype(struct page *pg)
61{
62 if (!PageUncached(pg) && !PageWC(pg))
63 return -1;
64 else if (!PageUncached(pg) && PageWC(pg))
65 return _PAGE_CACHE_WC;
66 else if (PageUncached(pg) && !PageWC(pg))
67 return _PAGE_CACHE_UC_MINUS;
68 else
69 return _PAGE_CACHE_WB;
70}
71
72static inline void set_page_memtype(struct page *pg, unsigned long memtype)
73{
74 switch (memtype) {
75 case _PAGE_CACHE_WC:
76 ClearPageUncached(pg);
77 SetPageWC(pg);
78 break;
79 case _PAGE_CACHE_UC_MINUS:
80 SetPageUncached(pg);
81 ClearPageWC(pg);
82 break;
83 case _PAGE_CACHE_WB:
84 SetPageUncached(pg);
85 SetPageWC(pg);
86 break;
87 default:
88 case -1:
89 ClearPageUncached(pg);
90 ClearPageWC(pg);
91 break;
92 }
93}
94#else
95static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
96static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
97#endif
48 98
49/* 99/*
50 * The set_memory_* API can be used to change various attributes of a virtual 100 * The set_memory_* API can be used to change various attributes of a virtual
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 7c5ef8b14d92..46fc474fd819 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -161,7 +161,8 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
161 "adcl $0, %0 ;\n" 161 "adcl $0, %0 ;\n"
162 : "=&r" (sum) 162 : "=&r" (sum)
163 : "r" (saddr), "r" (daddr), 163 : "r" (saddr), "r" (daddr),
164 "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)); 164 "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)
165 : "memory");
165 166
166 return csum_fold(sum); 167 return csum_fold(sum);
167} 168}
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 82ceb788a981..ee1931be6593 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -312,19 +312,23 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
312 312
313extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); 313extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
314 314
315#define cmpxchg64(ptr, o, n) \ 315#define cmpxchg64(ptr, o, n) \
316({ \ 316({ \
317 __typeof__(*(ptr)) __ret; \ 317 __typeof__(*(ptr)) __ret; \
318 if (likely(boot_cpu_data.x86 > 4)) \ 318 __typeof__(*(ptr)) __old = (o); \
319 __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \ 319 __typeof__(*(ptr)) __new = (n); \
320 (unsigned long long)(o), \ 320 alternative_io("call cmpxchg8b_emu", \
321 (unsigned long long)(n)); \ 321 "lock; cmpxchg8b (%%esi)" , \
322 else \ 322 X86_FEATURE_CX8, \
323 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ 323 "=A" (__ret), \
324 (unsigned long long)(o), \ 324 "S" ((ptr)), "0" (__old), \
325 (unsigned long long)(n)); \ 325 "b" ((unsigned int)__new), \
326 __ret; \ 326 "c" ((unsigned int)(__new>>32)) \
327}) 327 : "memory"); \
328 __ret; })
329
330
331
328#define cmpxchg64_local(ptr, o, n) \ 332#define cmpxchg64_local(ptr, o, n) \
329({ \ 333({ \
330 __typeof__(*(ptr)) __ret; \ 334 __typeof__(*(ptr)) __ret; \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 847fee6493a2..9cfc88b97742 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -96,6 +96,7 @@
96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ 96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ 97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
98#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ 98#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */
99#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
99 100
100/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 101/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
101#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 102#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h
deleted file mode 100644
index 23ecda0b28a0..000000000000
--- a/arch/x86/include/asm/do_timer.h
+++ /dev/null
@@ -1,16 +0,0 @@
1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
4#include <asm/i8259.h>
5#include <asm/i8253.h>
6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 *
10 * Call the pit clock event handler. see asm/i8253.h
11 **/
12
13static inline void do_timer_interrupt_hook(void)
14{
15 global_clock_event->event_handler(global_clock_event);
16}
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 7ecba4d85089..40b4e614fe71 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -126,8 +126,6 @@ extern void e820_reserve_resources(void);
126extern void e820_reserve_resources_late(void); 126extern void e820_reserve_resources_late(void);
127extern void setup_memory_map(void); 127extern void setup_memory_map(void);
128extern char *default_machine_specific_memory_setup(void); 128extern char *default_machine_specific_memory_setup(void);
129extern char *machine_specific_memory_setup(void);
130extern char *memory_setup(void);
131#endif /* __KERNEL__ */ 129#endif /* __KERNEL__ */
132#endif /* __ASSEMBLY__ */ 130#endif /* __ASSEMBLY__ */
133 131
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 83c1bc8d2e8a..456a304b8172 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -299,6 +299,8 @@ do { \
299 299
300#ifdef CONFIG_X86_32 300#ifdef CONFIG_X86_32
301 301
302#define STACK_RND_MASK (0x7ff)
303
302#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) 304#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
303 305
304#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) 306#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index ff8cbfa07851..f5693c81a1db 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
49BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 49BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
51 51
52#ifdef CONFIG_PERF_COUNTERS 52#ifdef CONFIG_PERF_EVENTS
53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) 53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
54#endif 54#endif
55 55
@@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
61BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) 61BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
62#endif 62#endif
63 63
64#ifdef CONFIG_X86_NEW_MCE 64#ifdef CONFIG_X86_MCE
65BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) 65BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
66#endif 66#endif
67 67
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 7b2d71df39a6..14f9890eb495 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -132,6 +132,9 @@ enum fixed_addresses {
132#ifdef CONFIG_X86_32 132#ifdef CONFIG_X86_32
133 FIX_WP_TEST, 133 FIX_WP_TEST,
134#endif 134#endif
135#ifdef CONFIG_INTEL_TXT
136 FIX_TBOOT_BASE,
137#endif
135 __end_of_fixed_addresses 138 __end_of_fixed_addresses
136}; 139};
137 140
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 369f5c5d09a1..b78c0941e422 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,7 +20,7 @@
20#ifndef ASM_X86__HYPERVISOR_H 20#ifndef ASM_X86__HYPERVISOR_H
21#define ASM_X86__HYPERVISOR_H 21#define ASM_X86__HYPERVISOR_H
22 22
23extern unsigned long get_hypervisor_tsc_freq(void);
24extern void init_hypervisor(struct cpuinfo_x86 *c); 23extern void init_hypervisor(struct cpuinfo_x86 *c);
24extern void init_hypervisor_platform(void);
25 25
26#endif 26#endif
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 85232d32fcb8..7c7c16cde1f8 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,6 +143,8 @@ extern int noioapicreroute;
143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
144extern int timer_through_8259; 144extern int timer_through_8259;
145 145
146extern void io_apic_disable_legacy(void);
147
146/* 148/*
147 * If we use the IO-APIC for IRQ routing, disable automatic 149 * If we use the IO-APIC for IRQ routing, disable automatic
148 * assignment of PCI IRQ's. 150 * assignment of PCI IRQ's.
@@ -176,6 +178,7 @@ extern int setup_ioapic_entry(int apic, int irq,
176 int polarity, int vector, int pin); 178 int polarity, int vector, int pin);
177extern void ioapic_write_entry(int apic, int pin, 179extern void ioapic_write_entry(int apic, int pin,
178 struct IO_APIC_route_entry e); 180 struct IO_APIC_route_entry e);
181extern void setup_ioapic_ids_from_mpc(void);
179 182
180struct mp_ioapic_gsi{ 183struct mp_ioapic_gsi{
181 int gsi_base; 184 int gsi_base;
@@ -187,12 +190,14 @@ int mp_find_ioapic_pin(int ioapic, int gsi);
187void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 190void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
188 191
189#else /* !CONFIG_X86_IO_APIC */ 192#else /* !CONFIG_X86_IO_APIC */
193
190#define io_apic_assign_pci_irqs 0 194#define io_apic_assign_pci_irqs 0
195#define setup_ioapic_ids_from_mpc x86_init_noop
191static const int timer_through_8259 = 0; 196static const int timer_through_8259 = 0;
192static inline void ioapic_init_mappings(void) { } 197static inline void ioapic_init_mappings(void) { }
193static inline void ioapic_insert_resources(void) { } 198static inline void ioapic_insert_resources(void) { }
194
195static inline void probe_nr_irqs_gsi(void) { } 199static inline void probe_nr_irqs_gsi(void) { }
200
196#endif 201#endif
197 202
198#endif /* _ASM_X86_IO_APIC_H */ 203#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 0e9fe1d9d971..f35eb45d6576 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -26,13 +26,16 @@
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
29int
30is_io_mapping_possible(resource_size_t base, unsigned long size);
31
32void * 29void *
33iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 30iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
34 31
35void 32void
36iounmap_atomic(void *kvaddr, enum km_type type); 33iounmap_atomic(void *kvaddr, enum km_type type);
37 34
35int
36iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
37
38void
39iomap_free(resource_size_t base, unsigned long size);
40
38#endif /* _ASM_X86_IOMAP_H */ 41#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index f38481bcd455..ddda6cbed6f4 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -37,7 +37,6 @@ extern void fixup_irqs(void);
37#endif 37#endif
38 38
39extern void (*generic_interrupt_extension)(void); 39extern void (*generic_interrupt_extension)(void);
40extern void init_IRQ(void);
41extern void native_init_IRQ(void); 40extern void native_init_IRQ(void);
42extern bool handle_irq(unsigned irq, struct pt_regs *regs); 41extern bool handle_irq(unsigned irq, struct pt_regs *regs);
43 42
@@ -47,4 +46,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs);
47extern DECLARE_BITMAP(used_vectors, NR_VECTORS); 46extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
48extern int vector_used_by_percpu_irq(unsigned int vector); 47extern int vector_used_by_percpu_irq(unsigned int vector);
49 48
49extern void init_ISA_irqs(void);
50
50#endif /* _ASM_X86_IRQ_H */ 51#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3be000435fad..d83892226f73 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
796#define KVM_ARCH_WANT_MMU_NOTIFIER 796#define KVM_ARCH_WANT_MMU_NOTIFIER
797int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 797int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
798int kvm_age_hva(struct kvm *kvm, unsigned long hva); 798int kvm_age_hva(struct kvm *kvm, unsigned long hva);
799void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
799int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 800int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
800int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 801int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
801int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 802int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 5cdd8d100ec9..f1363b72364f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -9,7 +9,7 @@
9 */ 9 */
10 10
11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */ 11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
12#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ 12#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ 13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
14#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ 14#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
15#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ 15#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
@@ -38,6 +38,14 @@
38#define MCM_ADDR_MEM 3 /* memory address */ 38#define MCM_ADDR_MEM 3 /* memory address */
39#define MCM_ADDR_GENERIC 7 /* generic */ 39#define MCM_ADDR_GENERIC 7 /* generic */
40 40
41#define MCJ_CTX_MASK 3
42#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
43#define MCJ_CTX_RANDOM 0 /* inject context: random */
44#define MCJ_CTX_PROCESS 1 /* inject context: process */
45#define MCJ_CTX_IRQ 2 /* inject context: IRQ */
46#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */
47#define MCJ_EXCEPTION 8 /* raise as exception */
48
41/* Fields are zero when not available */ 49/* Fields are zero when not available */
42struct mce { 50struct mce {
43 __u64 status; 51 __u64 status;
@@ -48,8 +56,8 @@ struct mce {
48 __u64 tsc; /* cpu time stamp counter */ 56 __u64 tsc; /* cpu time stamp counter */
49 __u64 time; /* wall time_t when error was detected */ 57 __u64 time; /* wall time_t when error was detected */
50 __u8 cpuvendor; /* cpu vendor as encoded in system.h */ 58 __u8 cpuvendor; /* cpu vendor as encoded in system.h */
51 __u8 pad1; 59 __u8 inject_flags; /* software inject flags */
52 __u16 pad2; 60 __u16 pad;
53 __u32 cpuid; /* CPUID 1 EAX */ 61 __u32 cpuid; /* CPUID 1 EAX */
54 __u8 cs; /* code segment */ 62 __u8 cs; /* code segment */
55 __u8 bank; /* machine check bank */ 63 __u8 bank; /* machine check bank */
@@ -115,13 +123,6 @@ void mcheck_init(struct cpuinfo_x86 *c);
115static inline void mcheck_init(struct cpuinfo_x86 *c) {} 123static inline void mcheck_init(struct cpuinfo_x86 *c) {}
116#endif 124#endif
117 125
118#ifdef CONFIG_X86_OLD_MCE
119extern int nr_mce_banks;
120void amd_mcheck_init(struct cpuinfo_x86 *c);
121void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
122void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
123#endif
124
125#ifdef CONFIG_X86_ANCIENT_MCE 126#ifdef CONFIG_X86_ANCIENT_MCE
126void intel_p5_mcheck_init(struct cpuinfo_x86 *c); 127void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
127void winchip_mcheck_init(struct cpuinfo_x86 *c); 128void winchip_mcheck_init(struct cpuinfo_x86 *c);
@@ -132,15 +133,18 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
132static inline void enable_p5_mce(void) {} 133static inline void enable_p5_mce(void) {}
133#endif 134#endif
134 135
136extern void (*x86_mce_decode_callback)(struct mce *m);
137
135void mce_setup(struct mce *m); 138void mce_setup(struct mce *m);
136void mce_log(struct mce *m); 139void mce_log(struct mce *m);
137DECLARE_PER_CPU(struct sys_device, mce_dev); 140DECLARE_PER_CPU(struct sys_device, mce_dev);
138 141
139/* 142/*
140 * To support more than 128 would need to escape the predefined 143 * Maximum banks number.
141 * Linux defined extended banks first. 144 * This is the limit of the current register layout on
145 * Intel CPUs.
142 */ 146 */
143#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) 147#define MAX_NR_BANKS 32
144 148
145#ifdef CONFIG_X86_MCE_INTEL 149#ifdef CONFIG_X86_MCE_INTEL
146extern int mce_cmci_disabled; 150extern int mce_cmci_disabled;
@@ -208,11 +212,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
208 212
209void intel_init_thermal(struct cpuinfo_x86 *c); 213void intel_init_thermal(struct cpuinfo_x86 *c);
210 214
211#ifdef CONFIG_X86_NEW_MCE
212void mce_log_therm_throt_event(__u64 status); 215void mce_log_therm_throt_event(__u64 status);
213#else
214static inline void mce_log_therm_throt_event(__u64 status) {}
215#endif
216 216
217#endif /* __KERNEL__ */ 217#endif /* __KERNEL__ */
218#endif /* _ASM_X86_MCE_H */ 218#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index f923203dc39a..4a2d4e0c18d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
37 37
38 if (likely(prev != next)) { 38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */ 39 /* stop flush ipis for the previous mm */
40 cpu_clear(cpu, prev->cpu_vm_mask); 40 cpumask_clear_cpu(cpu, mm_cpumask(prev));
41#ifdef CONFIG_SMP 41#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next); 43 percpu_write(cpu_tlbstate.active_mm, next);
44#endif 44#endif
45 cpu_set(cpu, next->cpu_vm_mask); 45 cpumask_set_cpu(cpu, mm_cpumask(next));
46 46
47 /* Re-load page tables */ 47 /* Re-load page tables */
48 load_cr3(next->pgd); 48 load_cr3(next->pgd);
@@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
58 percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 58 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
59 BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); 59 BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
60 60
61 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { 61 if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
62 /* We were in lazy tlb mode and leave_mm disabled 62 /* We were in lazy tlb mode and leave_mm disabled
63 * tlb flush IPI delivery. We must reload CR3 63 * tlb flush IPI delivery. We must reload CR3
64 * to make sure to use no freed page tables. 64 * to make sure to use no freed page tables.
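
The switch_mm() change above moves from cpu_set()/cpu_clear() on mm->cpu_vm_mask to the cpumask accessors operating on mm_cpumask(). The same accessor pattern in isolation, with a hypothetical helper name:

    #include <linux/cpumask.h>
    #include <linux/mm_types.h>

    /* Hypothetical helper: is this mm currently active on the given CPU? */
    static inline bool mm_active_on_cpu(struct mm_struct *mm, int cpu)
    {
            return cpumask_test_cpu(cpu, mm_cpumask(mm));
    }
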
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index e2a1bb6d71ea..79c94500c0bb 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -4,6 +4,7 @@
4#include <linux/init.h> 4#include <linux/init.h>
5 5
6#include <asm/mpspec_def.h> 6#include <asm/mpspec_def.h>
7#include <asm/x86_init.h>
7 8
8extern int apic_version[MAX_APICS]; 9extern int apic_version[MAX_APICS];
9extern int pic_mode; 10extern int pic_mode;
@@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
41 42
42#endif /* CONFIG_X86_64 */ 43#endif /* CONFIG_X86_64 */
43 44
44extern void early_find_smp_config(void);
45extern void early_get_smp_config(void);
46
47#if defined(CONFIG_MCA) || defined(CONFIG_EISA) 45#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
48extern int mp_bus_id_to_type[MAX_MP_BUSSES]; 46extern int mp_bus_id_to_type[MAX_MP_BUSSES];
49#endif 47#endif
@@ -52,20 +50,55 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
52 50
53extern unsigned int boot_cpu_physical_apicid; 51extern unsigned int boot_cpu_physical_apicid;
54extern unsigned int max_physical_apicid; 52extern unsigned int max_physical_apicid;
55extern int smp_found_config;
56extern int mpc_default_type; 53extern int mpc_default_type;
57extern unsigned long mp_lapic_addr; 54extern unsigned long mp_lapic_addr;
58 55
59extern void get_smp_config(void); 56#ifdef CONFIG_X86_LOCAL_APIC
57extern int smp_found_config;
58#else
59# define smp_found_config 0
60#endif
61
62static inline void get_smp_config(void)
63{
64 x86_init.mpparse.get_smp_config(0);
65}
66
67static inline void early_get_smp_config(void)
68{
69 x86_init.mpparse.get_smp_config(1);
70}
71
72static inline void find_smp_config(void)
73{
74 x86_init.mpparse.find_smp_config(1);
75}
76
77static inline void early_find_smp_config(void)
78{
79 x86_init.mpparse.find_smp_config(0);
80}
60 81
61#ifdef CONFIG_X86_MPPARSE 82#ifdef CONFIG_X86_MPPARSE
62extern void find_smp_config(void);
63extern void early_reserve_e820_mpc_new(void); 83extern void early_reserve_e820_mpc_new(void);
64extern int enable_update_mptable; 84extern int enable_update_mptable;
85extern int default_mpc_apic_id(struct mpc_cpu *m);
86extern void default_smp_read_mpc_oem(struct mpc_table *mpc);
87# ifdef CONFIG_X86_IO_APIC
88extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
89# else
90# define default_mpc_oem_bus_info NULL
91# endif
92extern void default_find_smp_config(unsigned int reserve);
93extern void default_get_smp_config(unsigned int early);
65#else 94#else
66static inline void find_smp_config(void) { }
67static inline void early_reserve_e820_mpc_new(void) { } 95static inline void early_reserve_e820_mpc_new(void) { }
68#define enable_update_mptable 0 96#define enable_update_mptable 0
97#define default_mpc_apic_id NULL
98#define default_smp_read_mpc_oem NULL
99#define default_mpc_oem_bus_info NULL
100#define default_find_smp_config x86_init_uint_noop
101#define default_get_smp_config x86_init_uint_noop
69#endif 102#endif
70 103
71void __cpuinit generic_processor_info(int apicid, int version); 104void __cpuinit generic_processor_info(int apicid, int version);
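
With the mpspec.h change above, get_smp_config(), early_get_smp_config(), find_smp_config() and early_find_smp_config() become inline wrappers that dispatch through x86_init.mpparse instead of per-arch extern functions. A hedged sketch of how a platform might redirect them (the myplat_* names are hypothetical):

    #include <linux/init.h>
    #include <asm/x86_init.h>

    /* Hypothetical platform parser; 'reserve' asks it to reserve the
     * memory occupied by the configuration tables it finds. */
    static void myplat_find_smp_config(unsigned int reserve)
    {
            /* ... locate platform-specific SMP configuration ... */
    }

    static void __init myplat_setup_mpparse(void)
    {
            x86_init.mpparse.find_smp_config = myplat_find_smp_config;
            x86_init.mpparse.get_smp_config  = x86_init_uint_noop;
    }
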
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bd5549034a95..4ffe09b2ad75 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,8 +81,15 @@
81#define MSR_IA32_MC0_ADDR 0x00000402 81#define MSR_IA32_MC0_ADDR 0x00000402
82#define MSR_IA32_MC0_MISC 0x00000403 82#define MSR_IA32_MC0_MISC 0x00000403
83 83
84#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
85#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
86#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
87#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
88
84/* These are consecutive and not in the normal 4er MCE bank block */ 89/* These are consecutive and not in the normal 4er MCE bank block */
85#define MSR_IA32_MC0_CTL2 0x00000280 90#define MSR_IA32_MC0_CTL2 0x00000280
91#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
92
86#define CMCI_EN (1ULL << 30) 93#define CMCI_EN (1ULL << 30)
87#define CMCI_THRESHOLD_MASK 0xffffULL 94#define CMCI_THRESHOLD_MASK 0xffffULL
88 95
@@ -215,6 +222,10 @@
215 222
216#define THERM_STATUS_PROCHOT (1 << 0) 223#define THERM_STATUS_PROCHOT (1 << 0)
217 224
225#define MSR_THERM2_CTL 0x0000019d
226
227#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16)
228
218#define MSR_IA32_MISC_ENABLE 0x000001a0 229#define MSR_IA32_MISC_ENABLE 0x000001a0
219 230
220/* MISC_ENABLE bits: architectural */ 231/* MISC_ENABLE bits: architectural */
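
The MSR_IA32_MCx_*() helpers added above compute per-bank machine-check MSR addresses: CTL/STATUS/ADDR/MISC are laid out four MSRs per bank from the MC0 base, while the CTL2 registers are consecutive. A small sketch of reading one bank's status with them (the helper name is hypothetical):

    #include <linux/types.h>
    #include <asm/msr.h>
    #include <asm/msr-index.h>

    /* Hypothetical helper: read the STATUS MSR of machine-check bank 'bank'. */
    static u64 mce_bank_status(unsigned int bank)
    {
            u64 status;

            /* MSR_IA32_MCx_STATUS(bank) == MSR_IA32_MC0_STATUS + 4 * bank */
            rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
            return status;
    }
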
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index a51ada8467de..4365ffdb461f 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -121,6 +121,9 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
121extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); 121extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
122extern void mtrr_ap_init(void); 122extern void mtrr_ap_init(void);
123extern void mtrr_bp_init(void); 123extern void mtrr_bp_init(void);
124extern void set_mtrr_aps_delayed_init(void);
125extern void mtrr_aps_init(void);
126extern void mtrr_bp_restore(void);
124extern int mtrr_trim_uncached_memory(unsigned long end_pfn); 127extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
125extern int amd_special_default_mtrr(void); 128extern int amd_special_default_mtrr(void);
126# else 129# else
@@ -161,6 +164,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
161 164
162#define mtrr_ap_init() do {} while (0) 165#define mtrr_ap_init() do {} while (0)
163#define mtrr_bp_init() do {} while (0) 166#define mtrr_bp_init() do {} while (0)
167#define set_mtrr_aps_delayed_init() do {} while (0)
168#define mtrr_aps_init() do {} while (0)
169#define mtrr_bp_restore() do {} while (0)
164# endif 170# endif
165 171
166#ifdef CONFIG_COMPAT 172#ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index e63cf7d441e1..139d4c1a33a7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog;
40#define NMI_INVALID 3 40#define NMI_INVALID 3
41 41
42struct ctl_table; 42struct ctl_table;
43struct file; 43extern int proc_nmi_enabled(struct ctl_table *, int ,
44extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
45 void __user *, size_t *, loff_t *); 44 void __user *, size_t *, loff_t *);
46extern int unknown_nmi_panic; 45extern int unknown_nmi_panic;
47 46
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index ad2668ee1aa7..6d8723a766cc 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -65,6 +65,8 @@
65 6: osp nopl 0x00(%eax,%eax,1) 65 6: osp nopl 0x00(%eax,%eax,1)
66 7: nopl 0x00000000(%eax) 66 7: nopl 0x00000000(%eax)
67 8: nopl 0x00000000(%eax,%eax,1) 67 8: nopl 0x00000000(%eax,%eax,1)
68 Note: All the above are assumed to be a single instruction.
69 There is kernel code that depends on this.
68*/ 70*/
69#define P6_NOP1 GENERIC_NOP1 71#define P6_NOP1 GENERIC_NOP1
70#define P6_NOP2 ".byte 0x66,0x90\n" 72#define P6_NOP2 ".byte 0x66,0x90\n"
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 40d6586af25b..efb38994859c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -24,22 +24,6 @@ static inline void load_sp0(struct tss_struct *tss,
24 PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); 24 PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
25} 25}
26 26
27#define ARCH_SETUP pv_init_ops.arch_setup();
28static inline unsigned long get_wallclock(void)
29{
30 return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
31}
32
33static inline int set_wallclock(unsigned long nowtime)
34{
35 return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
36}
37
38static inline void (*choose_time_init(void))(void)
39{
40 return pv_time_ops.time_init;
41}
42
43/* The paravirtualized CPUID instruction. */ 27/* The paravirtualized CPUID instruction. */
44static inline void __cpuid(unsigned int *eax, unsigned int *ebx, 28static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
45 unsigned int *ecx, unsigned int *edx) 29 unsigned int *ecx, unsigned int *edx)
@@ -245,7 +229,6 @@ static inline unsigned long long paravirt_sched_clock(void)
245{ 229{
246 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); 230 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
247} 231}
248#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
249 232
250static inline unsigned long long paravirt_read_pmc(int counter) 233static inline unsigned long long paravirt_read_pmc(int counter)
251{ 234{
@@ -363,34 +346,6 @@ static inline void slow_down_io(void)
363#endif 346#endif
364} 347}
365 348
366#ifdef CONFIG_X86_LOCAL_APIC
367static inline void setup_boot_clock(void)
368{
369 PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
370}
371
372static inline void setup_secondary_clock(void)
373{
374 PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
375}
376#endif
377
378static inline void paravirt_post_allocator_init(void)
379{
380 if (pv_init_ops.post_allocator_init)
381 (*pv_init_ops.post_allocator_init)();
382}
383
384static inline void paravirt_pagetable_setup_start(pgd_t *base)
385{
386 (*pv_mmu_ops.pagetable_setup_start)(base);
387}
388
389static inline void paravirt_pagetable_setup_done(pgd_t *base)
390{
391 (*pv_mmu_ops.pagetable_setup_done)(base);
392}
393
394#ifdef CONFIG_SMP 349#ifdef CONFIG_SMP
395static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, 350static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
396 unsigned long start_esp) 351 unsigned long start_esp)
@@ -885,42 +840,22 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
885 840
886static inline unsigned long __raw_local_save_flags(void) 841static inline unsigned long __raw_local_save_flags(void)
887{ 842{
888 unsigned long f; 843 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
889
890 asm volatile(paravirt_alt(PARAVIRT_CALL)
891 : "=a"(f)
892 : paravirt_type(pv_irq_ops.save_fl),
893 paravirt_clobber(CLBR_EAX)
894 : "memory", "cc");
895 return f;
896} 844}
897 845
898static inline void raw_local_irq_restore(unsigned long f) 846static inline void raw_local_irq_restore(unsigned long f)
899{ 847{
900 asm volatile(paravirt_alt(PARAVIRT_CALL) 848 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
901 : "=a"(f)
902 : PV_FLAGS_ARG(f),
903 paravirt_type(pv_irq_ops.restore_fl),
904 paravirt_clobber(CLBR_EAX)
905 : "memory", "cc");
906} 849}
907 850
908static inline void raw_local_irq_disable(void) 851static inline void raw_local_irq_disable(void)
909{ 852{
910 asm volatile(paravirt_alt(PARAVIRT_CALL) 853 PVOP_VCALLEE0(pv_irq_ops.irq_disable);
911 :
912 : paravirt_type(pv_irq_ops.irq_disable),
913 paravirt_clobber(CLBR_EAX)
914 : "memory", "eax", "cc");
915} 854}
916 855
917static inline void raw_local_irq_enable(void) 856static inline void raw_local_irq_enable(void)
918{ 857{
919 asm volatile(paravirt_alt(PARAVIRT_CALL) 858 PVOP_VCALLEE0(pv_irq_ops.irq_enable);
920 :
921 : paravirt_type(pv_irq_ops.irq_enable),
922 paravirt_clobber(CLBR_EAX)
923 : "memory", "eax", "cc");
924} 859}
925 860
926static inline unsigned long __raw_local_irq_save(void) 861static inline unsigned long __raw_local_irq_save(void)
@@ -948,6 +883,8 @@ static inline unsigned long __raw_local_irq_save(void)
948#undef PVOP_VCALL4 883#undef PVOP_VCALL4
949#undef PVOP_CALL4 884#undef PVOP_CALL4
950 885
886extern void default_banner(void);
887
951#else /* __ASSEMBLY__ */ 888#else /* __ASSEMBLY__ */
952 889
953#define _PVSITE(ptype, clobbers, ops, word, algn) \ 890#define _PVSITE(ptype, clobbers, ops, word, algn) \
@@ -1088,5 +1025,7 @@ static inline unsigned long __raw_local_irq_save(void)
1088#endif /* CONFIG_X86_32 */ 1025#endif /* CONFIG_X86_32 */
1089 1026
1090#endif /* __ASSEMBLY__ */ 1027#endif /* __ASSEMBLY__ */
1091#endif /* CONFIG_PARAVIRT */ 1028#else /* CONFIG_PARAVIRT */
1029# define default_banner x86_init_noop
1030#endif /* !CONFIG_PARAVIRT */
1092#endif /* _ASM_X86_PARAVIRT_H */ 1031#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 25402d0006e7..9357473c8da0 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -78,14 +78,6 @@ struct pv_init_ops {
78 */ 78 */
79 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, 79 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
80 unsigned long addr, unsigned len); 80 unsigned long addr, unsigned len);
81
82 /* Basic arch-specific setup */
83 void (*arch_setup)(void);
84 char *(*memory_setup)(void);
85 void (*post_allocator_init)(void);
86
87 /* Print a banner to identify the environment */
88 void (*banner)(void);
89}; 81};
90 82
91 83
@@ -96,12 +88,6 @@ struct pv_lazy_ops {
96}; 88};
97 89
98struct pv_time_ops { 90struct pv_time_ops {
99 void (*time_init)(void);
100
101 /* Set and set time of day */
102 unsigned long (*get_wallclock)(void);
103 int (*set_wallclock)(unsigned long);
104
105 unsigned long long (*sched_clock)(void); 91 unsigned long long (*sched_clock)(void);
106 unsigned long (*get_tsc_khz)(void); 92 unsigned long (*get_tsc_khz)(void);
107}; 93};
@@ -203,8 +189,6 @@ struct pv_cpu_ops {
203}; 189};
204 190
205struct pv_irq_ops { 191struct pv_irq_ops {
206 void (*init_IRQ)(void);
207
208 /* 192 /*
209 * Get/set interrupt state. save_fl and restore_fl are only 193 * Get/set interrupt state. save_fl and restore_fl are only
210 * expected to use X86_EFLAGS_IF; all other bits 194 * expected to use X86_EFLAGS_IF; all other bits
@@ -229,9 +213,6 @@ struct pv_irq_ops {
229 213
230struct pv_apic_ops { 214struct pv_apic_ops {
231#ifdef CONFIG_X86_LOCAL_APIC 215#ifdef CONFIG_X86_LOCAL_APIC
232 void (*setup_boot_clock)(void);
233 void (*setup_secondary_clock)(void);
234
235 void (*startup_ipi_hook)(int phys_apicid, 216 void (*startup_ipi_hook)(int phys_apicid,
236 unsigned long start_eip, 217 unsigned long start_eip,
237 unsigned long start_esp); 218 unsigned long start_esp);
@@ -239,15 +220,6 @@ struct pv_apic_ops {
239}; 220};
240 221
241struct pv_mmu_ops { 222struct pv_mmu_ops {
242 /*
243 * Called before/after init_mm pagetable setup. setup_start
244 * may reset %cr3, and may pre-install parts of the pagetable;
245 * pagetable setup is expected to preserve any existing
246 * mapping.
247 */
248 void (*pagetable_setup_start)(pgd_t *pgd_base);
249 void (*pagetable_setup_done)(pgd_t *pgd_base);
250
251 unsigned long (*read_cr2)(void); 223 unsigned long (*read_cr2)(void);
252 void (*write_cr2)(unsigned long); 224 void (*write_cr2)(unsigned long);
253 225
@@ -522,10 +494,11 @@ int paravirt_disable_iospace(void);
522#define EXTRA_CLOBBERS 494#define EXTRA_CLOBBERS
523#define VEXTRA_CLOBBERS 495#define VEXTRA_CLOBBERS
524#else /* CONFIG_X86_64 */ 496#else /* CONFIG_X86_64 */
497/* [re]ax isn't an arg, but the return val */
525#define PVOP_VCALL_ARGS \ 498#define PVOP_VCALL_ARGS \
526 unsigned long __edi = __edi, __esi = __esi, \ 499 unsigned long __edi = __edi, __esi = __esi, \
527 __edx = __edx, __ecx = __ecx 500 __edx = __edx, __ecx = __ecx, __eax = __eax
528#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax 501#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
529 502
530#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) 503#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
531#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) 504#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
@@ -537,6 +510,7 @@ int paravirt_disable_iospace(void);
537 "=c" (__ecx) 510 "=c" (__ecx)
538#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) 511#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
539 512
513/* void functions are still allowed [re]ax for scratch */
540#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) 514#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
541#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS 515#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
542 516
@@ -611,8 +585,8 @@ int paravirt_disable_iospace(void);
611 VEXTRA_CLOBBERS, \ 585 VEXTRA_CLOBBERS, \
612 pre, post, ##__VA_ARGS__) 586 pre, post, ##__VA_ARGS__)
613 587
614#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ 588#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
615 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ 589 ____PVOP_VCALL(op.func, CLBR_RET_REG, \
616 PVOP_VCALLEE_CLOBBERS, , \ 590 PVOP_VCALLEE_CLOBBERS, , \
617 pre, post, ##__VA_ARGS__) 591 pre, post, ##__VA_ARGS__)
618 592
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 7af14e512f97..e2c1668dde7a 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end);
19extern int kernel_map_sync_memtype(u64 base, unsigned long size, 19extern int kernel_map_sync_memtype(u64 base, unsigned long size,
20 unsigned long flag); 20 unsigned long flag);
21 21
22int io_reserve_memtype(resource_size_t start, resource_size_t end,
23 unsigned long *type);
24
25void io_free_memtype(resource_size_t start, resource_size_t end);
26
22#endif /* _ASM_X86_PAT_H */ 27#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 1ff685ca221c..ada8c201d513 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void);
48#else 48#else
49#define pcibios_assign_all_busses() 0 49#define pcibios_assign_all_busses() 0
50#endif 50#endif
51#define pcibios_scan_all_fns(a, b) 0
52 51
53extern unsigned long pci_mem_start; 52extern unsigned long pci_mem_start;
54#define PCIBIOS_MIN_IO 0x1000 53#define PCIBIOS_MIN_IO 0x1000
@@ -144,7 +143,11 @@ static inline int __pcibus_to_node(const struct pci_bus *bus)
144static inline const struct cpumask * 143static inline const struct cpumask *
145cpumask_of_pcibus(const struct pci_bus *bus) 144cpumask_of_pcibus(const struct pci_bus *bus)
146{ 145{
147 return cpumask_of_node(__pcibus_to_node(bus)); 146 int node;
147
148 node = __pcibus_to_node(bus);
149 return (node == -1) ? cpu_online_mask :
150 cpumask_of_node(node);
148} 151}
149#endif 152#endif
150 153
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 04eacefcfd26..b65a36defeb7 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -168,15 +168,6 @@ do { \
168/* We can use this directly for local CPU (faster). */ 168/* We can use this directly for local CPU (faster). */
169DECLARE_PER_CPU(unsigned long, this_cpu_off); 169DECLARE_PER_CPU(unsigned long, this_cpu_off);
170 170
171#ifdef CONFIG_NEED_MULTIPLE_NODES
172void *pcpu_lpage_remapped(void *kaddr);
173#else
174static inline void *pcpu_lpage_remapped(void *kaddr)
175{
176 return NULL;
177}
178#endif
179
180#endif /* !__ASSEMBLY__ */ 171#endif /* !__ASSEMBLY__ */
181 172
182#ifdef CONFIG_SMP 173#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_event.h
index e7b7c938ae27..8d9f8548a870 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -1,8 +1,8 @@
1#ifndef _ASM_X86_PERF_COUNTER_H 1#ifndef _ASM_X86_PERF_EVENT_H
2#define _ASM_X86_PERF_COUNTER_H 2#define _ASM_X86_PERF_EVENT_H
3 3
4/* 4/*
5 * Performance counter hw details: 5 * Performance event hw details:
6 */ 6 */
7 7
8#define X86_PMC_MAX_GENERIC 8 8#define X86_PMC_MAX_GENERIC 8
@@ -28,9 +28,20 @@
28 */ 28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff 29#define ARCH_PERFMON_EVENT_MASK 0xffff
30 30
31/*
32 * filter mask to validate fixed counter events.
33 * the following filters disqualify for fixed counters:
34 * - inv
35 * - edge
36 * - cnt-mask
37 * The other filters are supported by fixed counters.
38 * The any-thread option is supported starting with v3.
39 */
40#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000
41
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 42#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 43#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 44#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ 45#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) 46 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36 47
@@ -43,7 +54,7 @@
43union cpuid10_eax { 54union cpuid10_eax {
44 struct { 55 struct {
45 unsigned int version_id:8; 56 unsigned int version_id:8;
46 unsigned int num_counters:8; 57 unsigned int num_events:8;
47 unsigned int bit_width:8; 58 unsigned int bit_width:8;
48 unsigned int mask_length:8; 59 unsigned int mask_length:8;
49 } split; 60 } split;
@@ -52,7 +63,7 @@ union cpuid10_eax {
52 63
53union cpuid10_edx { 64union cpuid10_edx {
54 struct { 65 struct {
55 unsigned int num_counters_fixed:4; 66 unsigned int num_events_fixed:4;
56 unsigned int reserved:28; 67 unsigned int reserved:28;
57 } split; 68 } split;
58 unsigned int full; 69 unsigned int full;
@@ -60,7 +71,7 @@ union cpuid10_edx {
60 71
61 72
62/* 73/*
63 * Fixed-purpose performance counters: 74 * Fixed-purpose performance events:
64 */ 75 */
65 76
66/* 77/*
@@ -87,22 +98,22 @@ union cpuid10_edx {
87/* 98/*
88 * We model BTS tracing as another fixed-mode PMC. 99 * We model BTS tracing as another fixed-mode PMC.
89 * 100 *
90 * We choose a value in the middle of the fixed counter range, since lower 101 * We choose a value in the middle of the fixed event range, since lower
91 * values are used by actual fixed counters and higher values are used 102 * values are used by actual fixed events and higher values are used
92 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 103 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
93 */ 104 */
94#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 105#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
95 106
96 107
97#ifdef CONFIG_PERF_COUNTERS 108#ifdef CONFIG_PERF_EVENTS
98extern void init_hw_perf_counters(void); 109extern void init_hw_perf_events(void);
99extern void perf_counters_lapic_init(void); 110extern void perf_events_lapic_init(void);
100 111
101#define PERF_COUNTER_INDEX_OFFSET 0 112#define PERF_EVENT_INDEX_OFFSET 0
102 113
103#else 114#else
104static inline void init_hw_perf_counters(void) { } 115static inline void init_hw_perf_events(void) { }
105static inline void perf_counters_lapic_init(void) { } 116static inline void perf_events_lapic_init(void) { }
106#endif 117#endif
107 118
108#endif /* _ASM_X86_PERF_COUNTER_H */ 119#endif /* _ASM_X86_PERF_EVENT_H */
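
The perf_counter -> perf_event rename above keeps the syscall slots (336 on i386, 298 on x86_64) but points them at sys_perf_event_open. Purely as an illustration of the renamed user-space entry point, a hedged sketch of opening a hardware cycle counter for the calling thread:

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Illustrative only: count CPU cycles for the current thread. */
    static int open_cycle_counter(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size   = sizeof(attr);
            attr.type   = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;

            /* pid = 0 (self), cpu = -1 (any), group_fd = -1, flags = 0 */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }
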
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 4c5b51fdc788..af6fd360ab35 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -56,16 +56,6 @@ extern struct list_head pgd_list;
56#define pte_update(mm, addr, ptep) do { } while (0) 56#define pte_update(mm, addr, ptep) do { } while (0)
57#define pte_update_defer(mm, addr, ptep) do { } while (0) 57#define pte_update_defer(mm, addr, ptep) do { } while (0)
58 58
59static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
60{
61 native_pagetable_setup_start(base);
62}
63
64static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
65{
66 native_pagetable_setup_done(base);
67}
68
69#define pgd_val(x) native_pgd_val(x) 59#define pgd_val(x) native_pgd_val(x)
70#define __pgd(x) native_make_pgd(x) 60#define __pgd(x) native_make_pgd(x)
71 61
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 54cb697f4900..d1f4a760be23 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -277,6 +277,7 @@ static inline pteval_t pte_flags(pte_t pte)
277typedef struct page *pgtable_t; 277typedef struct page *pgtable_t;
278 278
279extern pteval_t __supported_pte_mask; 279extern pteval_t __supported_pte_mask;
280extern void set_nx(void);
280extern int nx_enabled; 281extern int nx_enabled;
281 282
282#define pgprot_writecombine pgprot_writecombine 283#define pgprot_writecombine pgprot_writecombine
@@ -299,8 +300,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
299extern void native_pagetable_setup_start(pgd_t *base); 300extern void native_pagetable_setup_start(pgd_t *base);
300extern void native_pagetable_setup_done(pgd_t *base); 301extern void native_pagetable_setup_done(pgd_t *base);
301#else 302#else
302static inline void native_pagetable_setup_start(pgd_t *base) {} 303#define native_pagetable_setup_start x86_init_pgd_noop
303static inline void native_pagetable_setup_done(pgd_t *base) {} 304#define native_pagetable_setup_done x86_init_pgd_noop
304#endif 305#endif
305 306
306struct seq_file; 307struct seq_file;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1153037ae9ff..61aafb71c7ef 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@ struct mm_struct;
27#include <linux/cpumask.h> 27#include <linux/cpumask.h>
28#include <linux/cache.h> 28#include <linux/cache.h>
29#include <linux/threads.h> 29#include <linux/threads.h>
30#include <linux/math64.h>
30#include <linux/init.h> 31#include <linux/init.h>
31 32
32#define HBP_NUM 4 33#define HBP_NUM 4
@@ -1020,4 +1021,35 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
1020extern int get_tsc_mode(unsigned long adr); 1021extern int get_tsc_mode(unsigned long adr);
1021extern int set_tsc_mode(unsigned int val); 1022extern int set_tsc_mode(unsigned int val);
1022 1023
1024extern int amd_get_nb_id(int cpu);
1025
1026struct aperfmperf {
1027 u64 aperf, mperf;
1028};
1029
1030static inline void get_aperfmperf(struct aperfmperf *am)
1031{
1032 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
1033
1034 rdmsrl(MSR_IA32_APERF, am->aperf);
1035 rdmsrl(MSR_IA32_MPERF, am->mperf);
1036}
1037
1038#define APERFMPERF_SHIFT 10
1039
1040static inline
1041unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
1042 struct aperfmperf *new)
1043{
1044 u64 aperf = new->aperf - old->aperf;
1045 u64 mperf = new->mperf - old->mperf;
1046 unsigned long ratio = aperf;
1047
1048 mperf >>= APERFMPERF_SHIFT;
1049 if (mperf)
1050 ratio = div64_u64(aperf, mperf);
1051
1052 return ratio;
1053}
1054
1023#endif /* _ASM_X86_PROCESSOR_H */ 1055#endif /* _ASM_X86_PROCESSOR_H */
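
The aperfmperf helpers added above snapshot IA32_APERF/IA32_MPERF; calc_aperfmperf_ratio() pre-shifts mperf by APERFMPERF_SHIFT, so the result is a fixed-point aperf/mperf ratio with 10 fractional bits. A hedged sketch of using it to estimate average effective frequency over an interval (the helper name and max_khz parameter are assumptions):

    #include <asm/processor.h>

    /* Hypothetical helper: average effective frequency (kHz) while fn() runs. */
    static unsigned long effective_khz(void (*fn)(void), unsigned long max_khz)
    {
            struct aperfmperf start, end;
            unsigned long ratio;

            get_aperfmperf(&start);
            fn();                           /* workload being measured */
            get_aperfmperf(&end);

            ratio = calc_aperfmperf_ratio(&start, &end);
            return (max_khz * ratio) >> APERFMPERF_SHIFT;
    }
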
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 4093d1ed6db2..18e496c98ff0 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -5,43 +5,6 @@
5 5
6#define COMMAND_LINE_SIZE 2048 6#define COMMAND_LINE_SIZE 2048
7 7
8#ifndef __ASSEMBLY__
9
10/*
11 * Any setup quirks to be performed?
12 */
13struct mpc_cpu;
14struct mpc_bus;
15struct mpc_oemtable;
16
17struct x86_quirks {
18 int (*arch_pre_time_init)(void);
19 int (*arch_time_init)(void);
20 int (*arch_pre_intr_init)(void);
21 int (*arch_intr_init)(void);
22 int (*arch_trap_init)(void);
23 char * (*arch_memory_setup)(void);
24 int (*mach_get_smp_config)(unsigned int early);
25 int (*mach_find_smp_config)(unsigned int reserve);
26
27 int *mpc_record;
28 int (*mpc_apic_id)(struct mpc_cpu *m);
29 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
30 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
31 void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
32 unsigned short oemsize);
33 int (*setup_ioapic_ids)(void);
34};
35
36extern void x86_quirk_intr_init(void);
37
38extern void x86_quirk_trap_init(void);
39
40extern void x86_quirk_pre_time_init(void);
41extern void x86_quirk_time_init(void);
42
43#endif /* __ASSEMBLY__ */
44
45#ifdef __i386__ 8#ifdef __i386__
46 9
47#include <linux/pfn.h> 10#include <linux/pfn.h>
@@ -61,6 +24,7 @@ extern void x86_quirk_time_init(void);
61 24
62#ifndef __ASSEMBLY__ 25#ifndef __ASSEMBLY__
63#include <asm/bootparam.h> 26#include <asm/bootparam.h>
27#include <asm/x86_init.h>
64 28
65/* Interrupt control for vSMPowered x86_64 systems */ 29/* Interrupt control for vSMPowered x86_64 systems */
66#ifdef CONFIG_X86_64 30#ifdef CONFIG_X86_64
@@ -79,11 +43,16 @@ static inline void visws_early_detect(void) { }
79static inline int is_visws_box(void) { return 0; } 43static inline int is_visws_box(void) { return 0; }
80#endif 44#endif
81 45
82extern struct x86_quirks *x86_quirks;
83extern unsigned long saved_video_mode; 46extern unsigned long saved_video_mode;
84 47
85#ifndef CONFIG_PARAVIRT 48extern void reserve_standard_io_resources(void);
86#define paravirt_post_allocator_init() do {} while (0) 49extern void i386_reserve_resources(void);
50extern void setup_default_timer_irq(void);
51
52#ifdef CONFIG_X86_MRST
53extern void x86_mrst_early_setup(void);
54#else
55static inline void x86_mrst_early_setup(void) { }
87#endif 56#endif
88 57
89#ifndef _SETUP 58#ifndef _SETUP
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 6a84ed166aec..1e796782cd7b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu)
121 smp_ops.send_call_func_single_ipi(cpu); 121 smp_ops.send_call_func_single_ipi(cpu);
122} 122}
123 123
124#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
125static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) 124static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
126{ 125{
127 smp_ops.send_call_func_ipi(mask); 126 smp_ops.send_call_func_ipi(mask);
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index c86f452256de..ae907e617181 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -65,7 +65,6 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
65 case 4: 65 case 4:
66 *(int *)to = *(int *)from; 66 *(int *)to = *(int *)from;
67 return to; 67 return to;
68
69 case 3: 68 case 3:
70 *(short *)to = *(short *)from; 69 *(short *)to = *(short *)from;
71 *((char *)to + 2) = *((char *)from + 2); 70 *((char *)to + 2) = *((char *)from + 2);
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d82f39bb7905..8d33bc5462d1 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Access to user system call parameters and results 2 * Access to user system call parameters and results
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
5 * 5 *
6 * This copyrighted material is made available to anyone wishing to use, 6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions 7 * modify, copy, or redistribute it subject to the terms and conditions
@@ -16,13 +16,13 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18 18
19static inline long syscall_get_nr(struct task_struct *task, 19/*
20 struct pt_regs *regs) 20 * Only the low 32 bits of orig_ax are meaningful, so we return int.
21 * This importantly ignores the high bits on 64-bit, so comparisons
22 * sign-extend the low 32 bits.
23 */
24static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
21{ 25{
22 /*
23 * We always sign-extend a -1 value being set here,
24 * so this is always either -1L or a syscall number.
25 */
26 return regs->orig_ax; 26 return regs->orig_ax;
27} 27}
28 28
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 50c733aac421..7bdec4e9b739 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -4,60 +4,7 @@
4extern void hpet_time_init(void); 4extern void hpet_time_init(void);
5 5
6#include <asm/mc146818rtc.h> 6#include <asm/mc146818rtc.h>
7#ifdef CONFIG_X86_32
8#include <linux/efi.h>
9
10static inline unsigned long native_get_wallclock(void)
11{
12 unsigned long retval;
13
14 if (efi_enabled)
15 retval = efi_get_time();
16 else
17 retval = mach_get_cmos_time();
18
19 return retval;
20}
21
22static inline int native_set_wallclock(unsigned long nowtime)
23{
24 int retval;
25
26 if (efi_enabled)
27 retval = efi_set_rtc_mmss(nowtime);
28 else
29 retval = mach_set_rtc_mmss(nowtime);
30
31 return retval;
32}
33
34#else
35extern void native_time_init_hook(void);
36
37static inline unsigned long native_get_wallclock(void)
38{
39 return mach_get_cmos_time();
40}
41
42static inline int native_set_wallclock(unsigned long nowtime)
43{
44 return mach_set_rtc_mmss(nowtime);
45}
46
47#endif
48 7
49extern void time_init(void); 8extern void time_init(void);
50 9
51#ifdef CONFIG_PARAVIRT
52#include <asm/paravirt.h>
53#else /* !CONFIG_PARAVIRT */
54
55#define get_wallclock() native_get_wallclock()
56#define set_wallclock(x) native_set_wallclock(x)
57#define choose_time_init() hpet_time_init
58
59#endif /* CONFIG_PARAVIRT */
60
61extern unsigned long __init calibrate_cpu(void);
62
63#endif /* _ASM_X86_TIME_H */ 10#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 20ca9c4d4686..5469630b27f5 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -8,20 +8,16 @@
8#define TICK_SIZE (tick_nsec / 1000) 8#define TICK_SIZE (tick_nsec / 1000)
9 9
10unsigned long long native_sched_clock(void); 10unsigned long long native_sched_clock(void);
11unsigned long native_calibrate_tsc(void); 11extern int recalibrate_cpu_khz(void);
12 12
13#ifdef CONFIG_X86_32 13#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
14extern int timer_ack; 14extern int timer_ack;
15extern irqreturn_t timer_interrupt(int irq, void *dev_id); 15#else
16#endif /* CONFIG_X86_32 */ 16# define timer_ack (0)
17extern int recalibrate_cpu_khz(void); 17#endif
18 18
19extern int no_timer_check; 19extern int no_timer_check;
20 20
21#ifndef CONFIG_PARAVIRT
22#define calibrate_tsc() native_calibrate_tsc()
23#endif
24
25/* Accelerators for sched_clock() 21/* Accelerators for sched_clock()
26 * convert from cycles(64bits) => nanoseconds (64bits) 22 * convert from cycles(64bits) => nanoseconds (64bits)
27 * basic equation: 23 * basic equation:
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 26d06e052a18..25a92842dd99 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,15 +116,11 @@ extern unsigned long node_remap_size[];
116 116
117# define SD_CACHE_NICE_TRIES 1 117# define SD_CACHE_NICE_TRIES 1
118# define SD_IDLE_IDX 1 118# define SD_IDLE_IDX 1
119# define SD_NEWIDLE_IDX 2
120# define SD_FORKEXEC_IDX 0
121 119
122#else 120#else
123 121
124# define SD_CACHE_NICE_TRIES 2 122# define SD_CACHE_NICE_TRIES 2
125# define SD_IDLE_IDX 2 123# define SD_IDLE_IDX 2
126# define SD_NEWIDLE_IDX 2
127# define SD_FORKEXEC_IDX 1
128 124
129#endif 125#endif
130 126
@@ -137,22 +133,20 @@ extern unsigned long node_remap_size[];
137 .cache_nice_tries = SD_CACHE_NICE_TRIES, \ 133 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
138 .busy_idx = 3, \ 134 .busy_idx = 3, \
139 .idle_idx = SD_IDLE_IDX, \ 135 .idle_idx = SD_IDLE_IDX, \
140 .newidle_idx = SD_NEWIDLE_IDX, \ 136 .newidle_idx = 0, \
141 .wake_idx = 1, \ 137 .wake_idx = 0, \
142 .forkexec_idx = SD_FORKEXEC_IDX, \ 138 .forkexec_idx = 0, \
143 \ 139 \
144 .flags = 1*SD_LOAD_BALANCE \ 140 .flags = 1*SD_LOAD_BALANCE \
145 | 1*SD_BALANCE_NEWIDLE \ 141 | 1*SD_BALANCE_NEWIDLE \
146 | 1*SD_BALANCE_EXEC \ 142 | 1*SD_BALANCE_EXEC \
147 | 1*SD_BALANCE_FORK \ 143 | 1*SD_BALANCE_FORK \
148 | 0*SD_WAKE_IDLE \ 144 | 0*SD_BALANCE_WAKE \
149 | 1*SD_WAKE_AFFINE \ 145 | 1*SD_WAKE_AFFINE \
150 | 1*SD_WAKE_BALANCE \
151 | 0*SD_SHARE_CPUPOWER \ 146 | 0*SD_SHARE_CPUPOWER \
152 | 0*SD_POWERSAVINGS_BALANCE \ 147 | 0*SD_POWERSAVINGS_BALANCE \
153 | 0*SD_SHARE_PKG_RESOURCES \ 148 | 0*SD_SHARE_PKG_RESOURCES \
154 | 1*SD_SERIALIZE \ 149 | 1*SD_SERIALIZE \
155 | 1*SD_WAKE_IDLE_FAR \
156 | 0*SD_PREFER_SIBLING \ 150 | 0*SD_PREFER_SIBLING \
157 , \ 151 , \
158 .last_balance = jiffies, \ 152 .last_balance = jiffies, \
@@ -171,21 +165,11 @@ static inline int numa_node_id(void)
171 return 0; 165 return 0;
172} 166}
173 167
174static inline int cpu_to_node(int cpu)
175{
176 return 0;
177}
178
179static inline int early_cpu_to_node(int cpu) 168static inline int early_cpu_to_node(int cpu)
180{ 169{
181 return 0; 170 return 0;
182} 171}
183 172
184static inline const struct cpumask *cpumask_of_node(int node)
185{
186 return cpu_online_mask;
187}
188
189static inline void setup_node_to_cpumask_map(void) { } 173static inline void setup_node_to_cpumask_map(void) { }
190 174
191#endif 175#endif
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 38ae163cc91b..c0427295e8f5 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void)
48extern void tsc_init(void); 48extern void tsc_init(void);
49extern void mark_tsc_unstable(char *reason); 49extern void mark_tsc_unstable(char *reason);
50extern int unsynchronized_tsc(void); 50extern int unsynchronized_tsc(void);
51int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void);
52 53
53/* 54/*
54 * Boot-time check whether the TSCs are synchronized across 55 * Boot-time check whether the TSCs are synchronized across
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 5e06259e90e5..632fb44b4cb5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -33,7 +33,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
33 * Copy data from kernel space to user space. Caller must check 33 * Copy data from kernel space to user space. Caller must check
34 * the specified block with access_ok() before calling this function. 34 * the specified block with access_ok() before calling this function.
35 * The caller should also make sure he pins the user space address 35 * The caller should also make sure he pins the user space address
36 * so that the we don't result in page fault and sleep. 36 * so that we don't result in page fault and sleep.
37 * 37 *
38 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault 38 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
39 * we return the initial request size (1, 2 or 4), as copy_*_user should do. 39 * we return the initial request size (1, 2 or 4), as copy_*_user should do.
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 8deaada61bc8..6fb3c209a7e3 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -341,7 +341,7 @@
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336 344#define __NR_perf_event_open 336
345 345
346#ifdef __KERNEL__ 346#ifdef __KERNEL__
347 347
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index b9f3c60de5f7..8d3ad0adbc68 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660#define __NR_rt_tgsigqueueinfo 297 660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_counter_open 298 662#define __NR_perf_event_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664 664
665#ifndef __NO_STUBS 665#ifndef __NO_STUBS
666#define __ARCH_WANT_OLD_READDIR 666#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 77a68505419a..04eb6c958b9d 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -15,6 +15,7 @@
15#include <linux/numa.h> 15#include <linux/numa.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/timer.h> 17#include <linux/timer.h>
18#include <linux/io.h>
18#include <asm/types.h> 19#include <asm/types.h>
19#include <asm/percpu.h> 20#include <asm/percpu.h>
20#include <asm/uv/uv_mmrs.h> 21#include <asm/uv/uv_mmrs.h>
@@ -258,13 +259,13 @@ static inline unsigned long *uv_global_mmr32_address(int pnode,
258static inline void uv_write_global_mmr32(int pnode, unsigned long offset, 259static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
259 unsigned long val) 260 unsigned long val)
260{ 261{
261 *uv_global_mmr32_address(pnode, offset) = val; 262 writeq(val, uv_global_mmr32_address(pnode, offset));
262} 263}
263 264
264static inline unsigned long uv_read_global_mmr32(int pnode, 265static inline unsigned long uv_read_global_mmr32(int pnode,
265 unsigned long offset) 266 unsigned long offset)
266{ 267{
267 return *uv_global_mmr32_address(pnode, offset); 268 return readq(uv_global_mmr32_address(pnode, offset));
268} 269}
269 270
270/* 271/*
@@ -281,13 +282,13 @@ static inline unsigned long *uv_global_mmr64_address(int pnode,
281static inline void uv_write_global_mmr64(int pnode, unsigned long offset, 282static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
282 unsigned long val) 283 unsigned long val)
283{ 284{
284 *uv_global_mmr64_address(pnode, offset) = val; 285 writeq(val, uv_global_mmr64_address(pnode, offset));
285} 286}
286 287
287static inline unsigned long uv_read_global_mmr64(int pnode, 288static inline unsigned long uv_read_global_mmr64(int pnode,
288 unsigned long offset) 289 unsigned long offset)
289{ 290{
290 return *uv_global_mmr64_address(pnode, offset); 291 return readq(uv_global_mmr64_address(pnode, offset));
291} 292}
292 293
293/* 294/*
@@ -301,22 +302,22 @@ static inline unsigned long *uv_local_mmr_address(unsigned long offset)
301 302
302static inline unsigned long uv_read_local_mmr(unsigned long offset) 303static inline unsigned long uv_read_local_mmr(unsigned long offset)
303{ 304{
304 return *uv_local_mmr_address(offset); 305 return readq(uv_local_mmr_address(offset));
305} 306}
306 307
307static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) 308static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
308{ 309{
309 *uv_local_mmr_address(offset) = val; 310 writeq(val, uv_local_mmr_address(offset));
310} 311}
311 312
312static inline unsigned char uv_read_local_mmr8(unsigned long offset) 313static inline unsigned char uv_read_local_mmr8(unsigned long offset)
313{ 314{
314 return *((unsigned char *)uv_local_mmr_address(offset)); 315 return readb(uv_local_mmr_address(offset));
315} 316}
316 317
317static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) 318static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
318{ 319{
319 *((unsigned char *)uv_local_mmr_address(offset)) = val; 320 writeb(val, uv_local_mmr_address(offset));
320} 321}
321 322
322/* 323/*
@@ -422,7 +423,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
422 unsigned long val; 423 unsigned long val;
423 424
424 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 425 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
425 ((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) | 426 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
426 (vector << UVH_IPI_INT_VECTOR_SHFT); 427 (vector << UVH_IPI_INT_VECTOR_SHFT);
427 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 428 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
428} 429}
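
The uv_hub.h hunks above replace direct dereferences of ioremapped MMR addresses with readb/readq/writeb/writeq, so the accesses go through the proper MMIO accessors. The same pattern in isolation, against a hypothetical register:

    #include <linux/io.h>
    #include <linux/types.h>

    /* Hypothetical: set bit 0 of a memory-mapped 64-bit control register. */
    static void mmio_set_enable_bit(void __iomem *reg)
    {
            u64 val = readq(reg);           /* MMIO read, not *(u64 *)reg */

            writeq(val | 1, reg);           /* MMIO write */
    }
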
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index dc27a69e5d2a..3d61e204826f 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -21,6 +21,7 @@ struct vsyscall_gtod_data {
21 u32 shift; 21 u32 shift;
22 } clock; 22 } clock;
23 struct timespec wall_to_monotonic; 23 struct timespec wall_to_monotonic;
24 struct timespec wall_time_coarse;
24}; 25};
25extern struct vsyscall_gtod_data __vsyscall_gtod_data 26extern struct vsyscall_gtod_data __vsyscall_gtod_data
26__section_vsyscall_gtod_data; 27__section_vsyscall_gtod_data;
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
index c11b7e100d83..e49ed6d2fd4e 100644
--- a/arch/x86/include/asm/vmware.h
+++ b/arch/x86/include/asm/vmware.h
@@ -20,7 +20,7 @@
20#ifndef ASM_X86__VMWARE_H 20#ifndef ASM_X86__VMWARE_H
21#define ASM_X86__VMWARE_H 21#define ASM_X86__VMWARE_H
22 22
23extern unsigned long vmware_get_tsc_khz(void); 23extern void vmware_platform_setup(void);
24extern int vmware_platform(void); 24extern int vmware_platform(void);
25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); 25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
26 26
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
new file mode 100644
index 000000000000..2c756fd4ab0e
--- /dev/null
+++ b/arch/x86/include/asm/x86_init.h
@@ -0,0 +1,133 @@
1#ifndef _ASM_X86_PLATFORM_H
2#define _ASM_X86_PLATFORM_H
3
4#include <asm/pgtable_types.h>
5#include <asm/bootparam.h>
6
7struct mpc_bus;
8struct mpc_cpu;
9struct mpc_table;
10
11/**
12 * struct x86_init_mpparse - platform specific mpparse ops
13 * @mpc_record: platform specific mpc record accounting
14 * @setup_ioapic_ids: platform specific ioapic id override
15 * @mpc_apic_id: platform specific mpc apic id assignment
16 * @smp_read_mpc_oem: platform specific oem mpc table setup
17 * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL)
18 * @mpc_oem_bus_info: platform specific mpc bus info
19 * @find_smp_config: find the smp configuration
20 * @get_smp_config: get the smp configuration
21 */
22struct x86_init_mpparse {
23 void (*mpc_record)(unsigned int mode);
24 void (*setup_ioapic_ids)(void);
25 int (*mpc_apic_id)(struct mpc_cpu *m);
26 void (*smp_read_mpc_oem)(struct mpc_table *mpc);
27 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
29 void (*find_smp_config)(unsigned int reserve);
30 void (*get_smp_config)(unsigned int early);
31};
32
33/**
34 * struct x86_init_resources - platform specific resource related ops
35 * @probe_roms: probe BIOS roms
36 * @reserve_resources: reserve the standard resources for the
37 * platform
38 * @memory_setup: platform specific memory setup
39 *
40 */
41struct x86_init_resources {
42 void (*probe_roms)(void);
43 void (*reserve_resources)(void);
44 char *(*memory_setup)(void);
45};
46
47/**
48 * struct x86_init_irqs - platform specific interrupt setup
49 * @pre_vector_init: init code to run before interrupt vectors
50 * are set up.
51 * @intr_init: interrupt init code
52 * @trap_init: platform specific trap setup
53 */
54struct x86_init_irqs {
55 void (*pre_vector_init)(void);
56 void (*intr_init)(void);
57 void (*trap_init)(void);
58};
59
60/**
61 * struct x86_init_oem - oem platform specific customizing functions
62 * @arch_setup: platform specific architecture setup
63 * @banner: print a platform specific banner
64 */
65struct x86_init_oem {
66 void (*arch_setup)(void);
67 void (*banner)(void);
68};
69
70/**
71 * struct x86_init_paging - platform specific paging functions
72 * @pagetable_setup_start: platform specific pre paging_init() call
73 * @pagetable_setup_done: platform specific post paging_init() call
74 */
75struct x86_init_paging {
76 void (*pagetable_setup_start)(pgd_t *base);
77 void (*pagetable_setup_done)(pgd_t *base);
78};
79
80/**
81 * struct x86_init_timers - platform specific timer setup
82 * @setup_percpu_clockev: set up the per cpu clock event device for the
83 * boot cpu
84 * @tsc_pre_init: platform function called before TSC init
85 * @timer_init: initialize the platform timer (default PIT/HPET)
86 */
87struct x86_init_timers {
88 void (*setup_percpu_clockev)(void);
89 void (*tsc_pre_init)(void);
90 void (*timer_init)(void);
91};
92
93/**
94 * struct x86_init_ops - functions for platform specific setup
95 *
96 */
97struct x86_init_ops {
98 struct x86_init_resources resources;
99 struct x86_init_mpparse mpparse;
100 struct x86_init_irqs irqs;
101 struct x86_init_oem oem;
102 struct x86_init_paging paging;
103 struct x86_init_timers timers;
104};
105
106/**
107 * struct x86_cpuinit_ops - platform specific cpu hotplug setups
108 * @setup_percpu_clockev: set up the per cpu clock event device
109 */
110struct x86_cpuinit_ops {
111 void (*setup_percpu_clockev)(void);
112};
113
114/**
115 * struct x86_platform_ops - platform specific runtime functions
116 * @calibrate_tsc: calibrate TSC
117 * @get_wallclock: get time from HW clock like RTC etc.
118 * @set_wallclock: set time back to HW clock
119 */
120struct x86_platform_ops {
121 unsigned long (*calibrate_tsc)(void);
122 unsigned long (*get_wallclock)(void);
123 int (*set_wallclock)(unsigned long nowtime);
124};
125
126extern struct x86_init_ops x86_init;
127extern struct x86_cpuinit_ops x86_cpuinit;
128extern struct x86_platform_ops x86_platform;
129
130extern void x86_init_noop(void);
131extern void x86_init_uint_noop(unsigned int unused);
132
133#endif
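
The new x86_init.h above collects the formerly scattered subarch/paravirt/quirk hooks into x86_init, x86_cpuinit and x86_platform, which hold native defaults and can be overridden during early platform setup. A minimal, hypothetical sketch of a platform overriding a few of these ops (all myplat_* names are assumptions, not part of the patch):

    #include <linux/init.h>
    #include <asm/x86_init.h>

    static unsigned long myplat_calibrate_tsc(void)
    {
            return 1000000;         /* hypothetical fixed 1 GHz TSC, in kHz */
    }

    static void __init myplat_timer_init(void)
    {
            /* hypothetical: program the platform timer instead of PIT/HPET */
    }

    void __init myplat_early_setup(void)
    {
            x86_platform.calibrate_tsc    = myplat_calibrate_tsc;
            x86_init.timers.timer_init    = myplat_timer_init;
            x86_init.resources.probe_roms = x86_init_noop;
    }
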
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bf04201b6575..4f2e66e29ecc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n
31 31
32obj-y := process_$(BITS).o signal.o entry_$(BITS).o 32obj-y := process_$(BITS).o signal.o entry_$(BITS).o
33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
34obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 34obj-y += time.o ioport.o ldt.o dumpstack.o
35obj-y += setup.o i8259.o irqinit.o 35obj-y += setup.o x86_init.o i8259.o irqinit.o
36obj-$(CONFIG_X86_VISWS) += visws_quirks.o 36obj-$(CONFIG_X86_VISWS) += visws_quirks.o
37obj-$(CONFIG_X86_32) += probe_roms_32.o 37obj-$(CONFIG_X86_32) += probe_roms_32.o
38obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 38obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 52obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 53obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 54obj-y += step.o
55obj-$(CONFIG_INTEL_TXT) += tboot.o
55obj-$(CONFIG_STACKTRACE) += stacktrace.o 56obj-$(CONFIG_STACKTRACE) += stacktrace.o
56obj-y += cpu/ 57obj-y += cpu/
57obj-y += acpi/ 58obj-y += acpi/
59obj-$(CONFIG_SFI) += sfi.o
58obj-y += reboot.o 60obj-y += reboot.o
59obj-$(CONFIG_MCA) += mca_32.o 61obj-$(CONFIG_MCA) += mca_32.o
60obj-$(CONFIG_X86_MSR) += msr.o 62obj-$(CONFIG_X86_MSR) += msr.o
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o
104scx200-y += scx200_32.o 106scx200-y += scx200_32.o
105 107
106obj-$(CONFIG_OLPC) += olpc.o 108obj-$(CONFIG_OLPC) += olpc.o
109obj-$(CONFIG_X86_MRST) += mrst.o
107 110
108microcode-y := microcode_core.o 111microcode-y := microcode_core.o
109microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o 112microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8c44c232efcb..59cdfa4686b2 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
48 * P4, Core and beyond CPUs 48 * P4, Core and beyond CPUs
49 */ 49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL && 50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) 51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14)))
52 flags->bm_control = 0; 52 flags->bm_control = 0;
53} 53}
54EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cda..0e50e1e5c573 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -56,6 +56,6 @@ SECTIONS
56 /DISCARD/ : { 56 /DISCARD/ : {
57 *(.note*) 57 *(.note*)
58 } 58 }
59
60 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
61} 59}
60
61ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 159740decc41..894aa97f0717 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,7 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
19#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
20#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -35,7 +35,8 @@
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37 37
38#include <asm/perf_counter.h> 38#include <asm/perf_event.h>
39#include <asm/x86_init.h>
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/atomic.h> 41#include <asm/atomic.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
@@ -61,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U;
61/* 62/*
62 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
63 * 64 *
64 * This determines the messaging protocol we can use: if all APIC IDs 65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
65 * are in the 0 ... 7 range, then we can use logical addressing which 66 * are in the 0 ... 7 range, then we can use logical addressing which
66 * has some performance advantages (better broadcasting). 67 * has some performance advantages (better broadcasting).
67 * 68 *
@@ -978,7 +979,7 @@ void lapic_shutdown(void)
978{ 979{
979 unsigned long flags; 980 unsigned long flags;
980 981
981 if (!cpu_has_apic) 982 if (!cpu_has_apic && !apic_from_smp_config())
982 return; 983 return;
983 984
984 local_irq_save(flags); 985 local_irq_save(flags);
@@ -1188,7 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1188 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1189 } 1190 }
1190#endif 1191#endif
1191 perf_counters_lapic_init(); 1192 perf_events_lapic_init();
1192 1193
1193 preempt_disable(); 1194 preempt_disable();
1194 1195
@@ -1196,8 +1197,7 @@ void __cpuinit setup_local_APIC(void)
1196 * Double-check whether this APIC is really registered. 1197 * Double-check whether this APIC is really registered.
1197 * This is meaningless in clustered apic mode, so we skip it. 1198 * This is meaningless in clustered apic mode, so we skip it.
1198 */ 1199 */
1199 if (!apic->apic_id_registered()) 1200 BUG_ON(!apic->apic_id_registered());
1200 BUG();
1201 1201
1202 /* 1202 /*
1203 * Intel recommends to set DFR, LDR and TPR before enabling 1203 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -1709,7 +1709,7 @@ int __init APIC_init_uniprocessor(void)
1709 localise_nmi_watchdog(); 1709 localise_nmi_watchdog();
1710#endif 1710#endif
1711 1711
1712 setup_boot_clock(); 1712 x86_init.timers.setup_percpu_clockev();
1713#ifdef CONFIG_X86_64 1713#ifdef CONFIG_X86_64
1714 check_nmi_watchdog(); 1714 check_nmi_watchdog();
1715#endif 1715#endif
@@ -1916,24 +1916,14 @@ void __cpuinit generic_processor_info(int apicid, int version)
1916 max_physical_apicid = apicid; 1916 max_physical_apicid = apicid;
1917 1917
1918#ifdef CONFIG_X86_32 1918#ifdef CONFIG_X86_32
1919 /* 1919 switch (boot_cpu_data.x86_vendor) {
1920 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1920 case X86_VENDOR_INTEL:
1921 * but we need to work other dependencies like SMP_SUSPEND etc 1921 if (num_processors > 8)
1922 * before this can be done without some confusion. 1922 def_to_bigsmp = 1;
1923 * if (CPU_HOTPLUG_ENABLED || num_processors > 8) 1923 break;
1924 * - Ashok Raj <ashok.raj@intel.com> 1924 case X86_VENDOR_AMD:
1925 */ 1925 if (max_physical_apicid >= 8)
1926 if (max_physical_apicid >= 8) {
1927 switch (boot_cpu_data.x86_vendor) {
1928 case X86_VENDOR_INTEL:
1929 if (!APIC_XAPIC(version)) {
1930 def_to_bigsmp = 0;
1931 break;
1932 }
1933 /* If P4 and above fall through */
1934 case X86_VENDOR_AMD:
1935 def_to_bigsmp = 1; 1926 def_to_bigsmp = 1;
1936 }
1937 } 1927 }
1938#endif 1928#endif
1939 1929
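The generic_processor_info() hunk above encodes the 32-bit rule for falling back to the bigsmp APIC driver: Intel parts switch once more than 8 logical processors have been enumerated, while AMD parts switch as soon as any physical APIC ID of 8 or higher is seen; the probe_64.c hunk further down applies the same vendor split when choosing between apic_flat and apic_physflat. A minimal standalone sketch of that decision, with the vendor enum and parameters simplified for illustration (the real code reads boot_cpu_data and sets the def_to_bigsmp flag):

/* Sketch only: condensed form of the vendor-specific bigsmp decision. */
enum vendor { VENDOR_INTEL, VENDOR_AMD };

static int need_bigsmp(enum vendor vendor, int num_processors,
		       int max_physical_apicid)
{
	switch (vendor) {
	case VENDOR_INTEL:
		/* Flat logical addressing only covers 8 CPUs. */
		return num_processors > 8;
	case VENDOR_AMD:
		/* AMD keys off the highest APIC ID seen instead. */
		return max_physical_apicid >= 8;
	}
	return 0;
}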
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 676cdac385c0..77a06413b6b2 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
112 return physids_promote(0xFFL); 112 return physids_promote(0xFFL);
113} 113}
114 114
115static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) 115static int bigsmp_check_phys_apicid_present(int phys_apicid)
116{ 116{
117 return 1; 117 return 1;
118} 118}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3c8f9e75d038..dc69f28489f5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -96,6 +96,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
96/* # of MP IRQ source entries */ 96/* # of MP IRQ source entries */
97int mp_irq_entries; 97int mp_irq_entries;
98 98
99/* Number of legacy interrupts */
100static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
101/* GSI interrupts */
102static int nr_irqs_gsi = NR_IRQS_LEGACY;
103
99#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 104#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
100int mp_bus_id_to_type[MAX_MP_BUSSES]; 105int mp_bus_id_to_type[MAX_MP_BUSSES];
101#endif 106#endif
@@ -173,6 +178,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
173 [15] = { .vector = IRQ15_VECTOR, }, 178 [15] = { .vector = IRQ15_VECTOR, },
174}; 179};
175 180
181void __init io_apic_disable_legacy(void)
182{
183 nr_legacy_irqs = 0;
184 nr_irqs_gsi = 0;
185}
186
176int __init arch_early_irq_init(void) 187int __init arch_early_irq_init(void)
177{ 188{
178 struct irq_cfg *cfg; 189 struct irq_cfg *cfg;
@@ -190,7 +201,7 @@ int __init arch_early_irq_init(void)
190 desc->chip_data = &cfg[i]; 201 desc->chip_data = &cfg[i];
191 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 202 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
192 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 203 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
193 if (i < NR_IRQS_LEGACY) 204 if (i < nr_legacy_irqs)
194 cpumask_setall(cfg[i].domain); 205 cpumask_setall(cfg[i].domain);
195 } 206 }
196 207
@@ -216,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node)
216 227
217 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 228 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
218 if (cfg) { 229 if (cfg) {
219 if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { 230 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
220 kfree(cfg); 231 kfree(cfg);
221 cfg = NULL; 232 cfg = NULL;
222 } else if (!alloc_cpumask_var_node(&cfg->old_domain, 233 } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
223 GFP_ATOMIC, node)) { 234 GFP_ATOMIC, node)) {
224 free_cpumask_var(cfg->domain); 235 free_cpumask_var(cfg->domain);
225 kfree(cfg); 236 kfree(cfg);
226 cfg = NULL; 237 cfg = NULL;
227 } else {
228 cpumask_clear(cfg->domain);
229 cpumask_clear(cfg->old_domain);
230 } 238 }
231 } 239 }
232 240
@@ -867,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type)
867 */ 875 */
868static int EISA_ELCR(unsigned int irq) 876static int EISA_ELCR(unsigned int irq)
869{ 877{
870 if (irq < NR_IRQS_LEGACY) { 878 if (irq < nr_legacy_irqs) {
871 unsigned int port = 0x4d0 + (irq >> 3); 879 unsigned int port = 0x4d0 + (irq >> 3);
872 return (inb(port) >> (irq & 7)) & 1; 880 return (inb(port) >> (irq & 7)) & 1;
873 } 881 }
@@ -1464,7 +1472,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1464 } 1472 }
1465 1473
1466 ioapic_register_intr(irq, desc, trigger); 1474 ioapic_register_intr(irq, desc, trigger);
1467 if (irq < NR_IRQS_LEGACY) 1475 if (irq < nr_legacy_irqs)
1468 disable_8259A_irq(irq); 1476 disable_8259A_irq(irq);
1469 1477
1470 ioapic_write_entry(apic_id, pin, entry); 1478 ioapic_write_entry(apic_id, pin, entry);
@@ -1831,7 +1839,7 @@ __apicdebuginit(void) print_PIC(void)
1831 unsigned int v; 1839 unsigned int v;
1832 unsigned long flags; 1840 unsigned long flags;
1833 1841
1834 if (apic_verbosity == APIC_QUIET) 1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)
1835 return; 1843 return;
1836 1844
1837 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1845 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1863,7 +1871,7 @@ __apicdebuginit(int) print_all_ICs(void)
1863 print_PIC(); 1871 print_PIC();
1864 1872
1865 /* don't print out if apic is not there */ 1873 /* don't print out if apic is not there */
1866 if (!cpu_has_apic || disable_apic) 1874 if (!cpu_has_apic && !apic_from_smp_config())
1867 return 0; 1875 return 0;
1868 1876
1869 print_all_local_APICs(); 1877 print_all_local_APICs();
@@ -1894,6 +1902,10 @@ void __init enable_IO_APIC(void)
1894 spin_unlock_irqrestore(&ioapic_lock, flags); 1902 spin_unlock_irqrestore(&ioapic_lock, flags);
1895 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1903 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1896 } 1904 }
1905
1906 if (!nr_legacy_irqs)
1907 return;
1908
1897 for(apic = 0; apic < nr_ioapics; apic++) { 1909 for(apic = 0; apic < nr_ioapics; apic++) {
1898 int pin; 1910 int pin;
1899 /* See if any of the pins is in ExtINT mode */ 1911 /* See if any of the pins is in ExtINT mode */
@@ -1948,6 +1960,9 @@ void disable_IO_APIC(void)
1948 */ 1960 */
1949 clear_IO_APIC(); 1961 clear_IO_APIC();
1950 1962
1963 if (!nr_legacy_irqs)
1964 return;
1965
1951 /* 1966 /*
1952 * If the i8259 is routed through an IOAPIC 1967 * If the i8259 is routed through an IOAPIC
1953 * Put that IOAPIC in virtual wire mode 1968 * Put that IOAPIC in virtual wire mode
@@ -1981,7 +1996,7 @@ void disable_IO_APIC(void)
1981 /* 1996 /*
1982 * Use virtual wire A mode when interrupt remapping is enabled. 1997 * Use virtual wire A mode when interrupt remapping is enabled.
1983 */ 1998 */
1984 if (cpu_has_apic) 1999 if (cpu_has_apic || apic_from_smp_config())
1985 disconnect_bsp_APIC(!intr_remapping_enabled && 2000 disconnect_bsp_APIC(!intr_remapping_enabled &&
1986 ioapic_i8259.pin != -1); 2001 ioapic_i8259.pin != -1);
1987} 2002}
@@ -1994,7 +2009,7 @@ void disable_IO_APIC(void)
1994 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 2009 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1995 */ 2010 */
1996 2011
1997static void __init setup_ioapic_ids_from_mpc(void) 2012void __init setup_ioapic_ids_from_mpc(void)
1998{ 2013{
1999 union IO_APIC_reg_00 reg_00; 2014 union IO_APIC_reg_00 reg_00;
2000 physid_mask_t phys_id_present_map; 2015 physid_mask_t phys_id_present_map;
@@ -2003,9 +2018,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
2003 unsigned char old_id; 2018 unsigned char old_id;
2004 unsigned long flags; 2019 unsigned long flags;
2005 2020
2006 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) 2021 if (acpi_ioapic)
2007 return; 2022 return;
2008
2009 /* 2023 /*
2010 * Don't check I/O APIC IDs for xAPIC systems. They have 2024 * Don't check I/O APIC IDs for xAPIC systems. They have
2011 * no meaning without the serial APIC bus. 2025 * no meaning without the serial APIC bus.
@@ -2179,7 +2193,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2179 struct irq_cfg *cfg; 2193 struct irq_cfg *cfg;
2180 2194
2181 spin_lock_irqsave(&ioapic_lock, flags); 2195 spin_lock_irqsave(&ioapic_lock, flags);
2182 if (irq < NR_IRQS_LEGACY) { 2196 if (irq < nr_legacy_irqs) {
2183 disable_8259A_irq(irq); 2197 disable_8259A_irq(irq);
2184 if (i8259A_irq_pending(irq)) 2198 if (i8259A_irq_pending(irq))
2185 was_pending = 1; 2199 was_pending = 1;
@@ -2657,7 +2671,7 @@ static inline void init_IO_APIC_traps(void)
2657 * so default to an old-fashioned 8259 2671 * so default to an old-fashioned 8259
2658 * interrupt if we can.. 2672 * interrupt if we can..
2659 */ 2673 */
2660 if (irq < NR_IRQS_LEGACY) 2674 if (irq < nr_legacy_irqs)
2661 make_8259A_irq(irq); 2675 make_8259A_irq(irq);
2662 else 2676 else
2663 /* Strange. Oh, well.. */ 2677 /* Strange. Oh, well.. */
@@ -2993,7 +3007,7 @@ out:
2993 * the I/O APIC in all cases now. No actual device should request 3007 * the I/O APIC in all cases now. No actual device should request
2994 * it anyway. --macro 3008 * it anyway. --macro
2995 */ 3009 */
2996#define PIC_IRQS (1 << PIC_CASCADE_IR) 3010#define PIC_IRQS (1UL << PIC_CASCADE_IR)
2997 3011
2998void __init setup_IO_APIC(void) 3012void __init setup_IO_APIC(void)
2999{ 3013{
@@ -3001,21 +3015,19 @@ void __init setup_IO_APIC(void)
3001 /* 3015 /*
3002 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3016 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3003 */ 3017 */
3004 3018 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3005 io_apic_irqs = ~PIC_IRQS;
3006 3019
3007 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3020 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3008 /* 3021 /*
3009 * Set up IO-APIC IRQ routing. 3022 * Set up IO-APIC IRQ routing.
3010 */ 3023 */
3011#ifdef CONFIG_X86_32 3024 x86_init.mpparse.setup_ioapic_ids();
3012 if (!acpi_ioapic) 3025
3013 setup_ioapic_ids_from_mpc();
3014#endif
3015 sync_Arb_IDs(); 3026 sync_Arb_IDs();
3016 setup_IO_APIC_irqs(); 3027 setup_IO_APIC_irqs();
3017 init_IO_APIC_traps(); 3028 init_IO_APIC_traps();
3018 check_timer(); 3029 if (nr_legacy_irqs)
3030 check_timer();
3019} 3031}
3020 3032
3021/* 3033/*
@@ -3116,7 +3128,6 @@ static int __init ioapic_init_sysfs(void)
3116 3128
3117device_initcall(ioapic_init_sysfs); 3129device_initcall(ioapic_init_sysfs);
3118 3130
3119static int nr_irqs_gsi = NR_IRQS_LEGACY;
3120/* 3131/*
3121 * Dynamic irq allocate and deallocation 3132 * Dynamic irq allocate and deallocation
3122 */ 3133 */
@@ -3856,7 +3867,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3856 /* 3867 /*
3857 * IRQs < 16 are already in the irq_2_pin[] map 3868 * IRQs < 16 are already in the irq_2_pin[] map
3858 */ 3869 */
3859 if (irq >= NR_IRQS_LEGACY) { 3870 if (irq >= nr_legacy_irqs) {
3860 cfg = desc->chip_data; 3871 cfg = desc->chip_data;
3861 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { 3872 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3862 printk(KERN_INFO "can not add pin %d for irq %d\n", 3873 printk(KERN_INFO "can not add pin %d for irq %d\n",
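The io_apic.c changes above replace the compile-time NR_IRQS_LEGACY constant with the runtime nr_legacy_irqs counter, so a platform without an i8259 can call io_apic_disable_legacy() early and have the legacy paths (8259 masking, ExtINT probing, check_timer()) fall away at run time. A rough sketch of how a legacy-free platform's early setup hook might use it; platform_setup_arch() is a hypothetical placeholder, not a function from this diff:

/* Sketch: a legacy-free platform disabling the 8259/legacy-IRQ paths.
 * io_apic_disable_legacy() is the hook added in the hunk above;
 * platform_setup_arch() is a made-up name for the platform's early
 * setup callback. */
void platform_setup_arch(void)
{
	/*
	 * No i8259 and no legacy timer: zero nr_legacy_irqs and
	 * nr_irqs_gsi before arch_early_irq_init() and setup_IO_APIC()
	 * consult them.
	 */
	io_apic_disable_legacy();

	/* ... remaining platform bring-up ... */
}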
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index db7220220d09..7ff61d6a188a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_NEW_MCE) 69#if defined(CONFIG_X86_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
@@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
508/* 508/*
509 * proc handler for /proc/sys/kernel/nmi 509 * proc handler for /proc/sys/kernel/nmi
510 */ 510 */
511int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, 511int proc_nmi_enabled(struct ctl_table *table, int write,
512 void __user *buffer, size_t *length, loff_t *ppos) 512 void __user *buffer, size_t *length, loff_t *ppos)
513{ 513{
514 int old_state; 514 int old_state;
515 515
516 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; 516 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
517 old_state = nmi_watchdog_enabled; 517 old_state = nmi_watchdog_enabled;
518 proc_dointvec(table, write, file, buffer, length, ppos); 518 proc_dointvec(table, write, buffer, length, ppos);
519 if (!!old_state == !!nmi_watchdog_enabled) 519 if (!!old_state == !!nmi_watchdog_enabled)
520 return 0; 520 return 0;
521 521
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index ca96e68f0d23..efa00e2b8505 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -66,7 +66,6 @@ struct mpc_trans {
66 unsigned short trans_reserved; 66 unsigned short trans_reserved;
67}; 67};
68 68
69/* x86_quirks member */
70static int mpc_record; 69static int mpc_record;
71 70
72static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; 71static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
@@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void)
130 } 129 }
131} 130}
132 131
133static int __init numaq_pre_time_init(void) 132static void __init numaq_tsc_init(void)
134{ 133{
135 numaq_tsc_disable(); 134 numaq_tsc_disable();
136 return 0;
137} 135}
138 136
139static inline int generate_logical_apicid(int quad, int phys_apicid) 137static inline int generate_logical_apicid(int quad, int phys_apicid)
@@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m)
177 quad_local_to_mp_bus_id[quad][local] = m->busid; 175 quad_local_to_mp_bus_id[quad][local] = m->busid;
178} 176}
179 177
178/*
179 * Called from mpparse code.
180 * mode = 0: prescan
181 * mode = 1: one mpc entry scanned
182 */
183static void numaq_mpc_record(unsigned int mode)
184{
185 if (!mode)
186 mpc_record = 0;
187 else
188 mpc_record++;
189}
190
180static void __init MP_translation_info(struct mpc_trans *m) 191static void __init MP_translation_info(struct mpc_trans *m)
181{ 192{
182 printk(KERN_INFO 193 printk(KERN_INFO
@@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len)
206/* 217/*
207 * Read/parse the MPC oem tables 218 * Read/parse the MPC oem tables
208 */ 219 */
209static void __init 220static void __init smp_read_mpc_oem(struct mpc_table *mpc)
210 smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
211{ 221{
222 struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
212 int count = sizeof(*oemtable); /* the header size */ 223 int count = sizeof(*oemtable); /* the header size */
213 unsigned char *oemptr = ((unsigned char *)oemtable) + count; 224 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
214 225
@@ -250,29 +261,6 @@ static void __init
250 } 261 }
251} 262}
252 263
253static int __init numaq_setup_ioapic_ids(void)
254{
255 /* so can skip it */
256 return 1;
257}
258
259static struct x86_quirks numaq_x86_quirks __initdata = {
260 .arch_pre_time_init = numaq_pre_time_init,
261 .arch_time_init = NULL,
262 .arch_pre_intr_init = NULL,
263 .arch_memory_setup = NULL,
264 .arch_intr_init = NULL,
265 .arch_trap_init = NULL,
266 .mach_get_smp_config = NULL,
267 .mach_find_smp_config = NULL,
268 .mpc_record = &mpc_record,
269 .mpc_apic_id = mpc_apic_id,
270 .mpc_oem_bus_info = mpc_oem_bus_info,
271 .mpc_oem_pci_bus = mpc_oem_pci_bus,
272 .smp_read_mpc_oem = smp_read_mpc_oem,
273 .setup_ioapic_ids = numaq_setup_ioapic_ids,
274};
275
276static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
277{ 265{
278 /* 266 /*
@@ -286,8 +274,15 @@ static __init void early_check_numaq(void)
286 if (smp_found_config) 274 if (smp_found_config)
287 early_get_smp_config(); 275 early_get_smp_config();
288 276
289 if (found_numaq) 277 if (found_numaq) {
290 x86_quirks = &numaq_x86_quirks; 278 x86_init.mpparse.mpc_record = numaq_mpc_record;
279 x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
280 x86_init.mpparse.mpc_apic_id = mpc_apic_id;
281 x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
282 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
283 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
284 x86_init.timers.tsc_pre_init = numaq_tsc_init;
285 }
291} 286}
292 287
293int __init get_memcfg_numaq(void) 288int __init get_memcfg_numaq(void)
@@ -418,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
418/* Where the IO area was mapped on multiquad, always 0 otherwise */ 413/* Where the IO area was mapped on multiquad, always 0 otherwise */
419void *xquad_portio; 414void *xquad_portio;
420 415
421static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) 416static inline int numaq_check_phys_apicid_present(int phys_apicid)
422{ 417{
423 return 1; 418 return 1;
424} 419}
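The NUMAQ conversion above illustrates the broader theme of this merge: the monolithic x86_quirks structure is gone, and a platform now overrides individual function pointers in the x86_init ops table (x86_init.mpparse.* for MP-table parsing, x86_init.timers.* for clock bring-up), using x86_init_noop to stub out steps it wants skipped. A condensed sketch of that pattern; the structure is reduced to the fields visible in this diff and the platform hook is a made-up example:

/* Sketch only: simplified shape of the x86_init ops table used above.
 * The real structure carries more members (mpc_apic_id, the OEM bus
 * hooks, setup_percpu_clockev, ...). */
struct mpc_table;

struct x86_init_ops {
	struct {
		void (*mpc_record)(unsigned int mode);
		void (*setup_ioapic_ids)(void);
		void (*smp_read_mpc_oem)(struct mpc_table *mpc);
	} mpparse;
	struct {
		void (*tsc_pre_init)(void);
	} timers;
};

extern struct x86_init_ops x86_init;	/* defaults: generic code or noops */
void x86_init_noop(void);		/* shared empty stub */

static void my_tsc_pre_init(void) { /* e.g. disable the TSC, as NUMAQ does */ }

/* A platform quirk then repoints only the hooks it cares about: */
static void example_platform_quirks(void)	/* made-up name */
{
	x86_init.mpparse.setup_ioapic_ids = x86_init_noop;    /* skip this step */
	x86_init.timers.tsc_pre_init	  = my_tsc_pre_init;  /* run before TSC setup */
}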
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 65edc180fc82..c4cbd3080c1c 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -64,16 +64,23 @@ void __init default_setup_apic_routing(void)
64 apic = &apic_x2apic_phys; 64 apic = &apic_x2apic_phys;
65 else 65 else
66 apic = &apic_x2apic_cluster; 66 apic = &apic_x2apic_cluster;
67 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
68 } 67 }
69#endif 68#endif
70 69
71 if (apic == &apic_flat) { 70 if (apic == &apic_flat) {
72 if (max_physical_apicid >= 8) 71 switch (boot_cpu_data.x86_vendor) {
73 apic = &apic_physflat; 72 case X86_VENDOR_INTEL:
74 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
75 } 80 }
76 81
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83
77 if (is_vsmp_box()) { 84 if (is_vsmp_box()) {
78 /* need to update phys_pkg_id */ 85 /* need to update phys_pkg_id */
79 apic->phys_pkg_id = apicid_phys_pkg_id; 86 apic->phys_pkg_id = apicid_phys_pkg_id;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index eafdfbd1ea95..645ecc4ff0be 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid)
272 return physid_mask_of_physid(0); 272 return physid_mask_of_physid(0);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
276{ 276{
277 return 1; 277 return 1;
278} 278}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 601159374e87..f5f5886a6b53 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode)
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
390} 390}
391 391
392static __init void map_mmr_high(int max_pnode)
393{
394 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
395 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
396
397 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
398 if (mmr.s.enable)
399 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
400}
401
392static __init void map_mmioh_high(int max_pnode) 402static __init void map_mmioh_high(int max_pnode)
393{ 403{
394 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 404 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -643,6 +653,7 @@ void __init uv_system_init(void)
643 } 653 }
644 654
645 map_gru_high(max_pnode); 655 map_gru_high(max_pnode);
656 map_mmr_high(max_pnode);
646 map_mmioh_high(max_pnode); 657 map_mmioh_high(max_pnode);
647 658
648 uv_cpu_init(); 659 uv_cpu_init();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac155..68537e957a9b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
13 13
14obj-y := intel_cacheinfo.o addon_cpuid_features.o 14obj-y := intel_cacheinfo.o addon_cpuid_features.o
15obj-y += proc.o capflags.o powerflags.o common.o 15obj-y += proc.o capflags.o powerflags.o common.o
16obj-y += vmware.o hypervisor.o 16obj-y += vmware.o hypervisor.o sched.o
17 17
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 19obj-$(CONFIG_X86_64) += bugs_64.o
@@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
27obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 27obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
28obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 28obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
29 29
30obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 30obj-$(CONFIG_PERF_EVENTS) += perf_event.o
31 31
32obj-$(CONFIG_X86_MCE) += mcheck/ 32obj-$(CONFIG_X86_MCE) += mcheck/
33obj-$(CONFIG_MTRR) += mtrr/ 33obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 22a47c82f3c0..c910a716a71c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
184 * approved Athlon 184 * approved Athlon
185 */ 185 */
186 WARN_ONCE(1, "WARNING: This combination of AMD" 186 WARN_ONCE(1, "WARNING: This combination of AMD"
187 "processors is not suitable for SMP.\n"); 187 " processors is not suitable for SMP.\n");
188 if (!test_taint(TAINT_UNSAFE_SMP)) 188 if (!test_taint(TAINT_UNSAFE_SMP))
189 add_taint(TAINT_UNSAFE_SMP); 189 add_taint(TAINT_UNSAFE_SMP);
190 190
@@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
333#endif 333#endif
334} 334}
335 335
336int amd_get_nb_id(int cpu)
337{
338 int id = 0;
339#ifdef CONFIG_SMP
340 id = per_cpu(cpu_llc_id, cpu);
341#endif
342 return id;
343}
344EXPORT_SYMBOL_GPL(amd_get_nb_id);
345
336static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 346static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
337{ 347{
338#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 348#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2055fc2b2e6b..cc25c2b4a567 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,7 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h> 16#include <asm/perf_event.h>
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
19#include <asm/processor.h> 19#include <asm/processor.h>
@@ -34,7 +34,6 @@
34#include <asm/mce.h> 34#include <asm/mce.h>
35#include <asm/msr.h> 35#include <asm/msr.h>
36#include <asm/pat.h> 36#include <asm/pat.h>
37#include <linux/smp.h>
38 37
39#ifdef CONFIG_X86_LOCAL_APIC 38#ifdef CONFIG_X86_LOCAL_APIC
40#include <asm/uv/uv.h> 39#include <asm/uv/uv.h>
@@ -870,7 +869,7 @@ void __init identify_boot_cpu(void)
870#else 869#else
871 vgetcpu_set_mode(); 870 vgetcpu_set_mode();
872#endif 871#endif
873 init_hw_perf_counters(); 872 init_hw_perf_events();
874} 873}
875 874
876void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 875void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd0403..dca325c03999 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
36 36
37static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220ca..7d5c3b0ea8da 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <trace/power.h> 36#include <trace/events/power.h>
37 37
38#include <linux/acpi.h> 38#include <linux/acpi.h>
39#include <linux/io.h> 39#include <linux/io.h>
@@ -60,7 +60,6 @@ enum {
60}; 60};
61 61
62#define INTEL_MSR_RANGE (0xffff) 62#define INTEL_MSR_RANGE (0xffff)
63#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
64 63
65struct acpi_cpufreq_data { 64struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data; 65 struct acpi_processor_performance *acpi_data;
@@ -71,13 +70,7 @@ struct acpi_cpufreq_data {
71 70
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
73 72
74struct acpi_msr_data { 73static DEFINE_PER_CPU(struct aperfmperf, old_perf);
75 u64 saved_aperf, saved_mperf;
76};
77
78static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
79
80DEFINE_TRACE(power_mark);
81 74
82/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
83static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
@@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask)
244 return cmd.val; 237 return cmd.val;
245} 238}
246 239
247struct perf_pair {
248 union {
249 struct {
250 u32 lo;
251 u32 hi;
252 } split;
253 u64 whole;
254 } aperf, mperf;
255};
256
257/* Called via smp_call_function_single(), on the target CPU */ 240/* Called via smp_call_function_single(), on the target CPU */
258static void read_measured_perf_ctrs(void *_cur) 241static void read_measured_perf_ctrs(void *_cur)
259{ 242{
260 struct perf_pair *cur = _cur; 243 struct aperfmperf *am = _cur;
261 244
262 rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); 245 get_aperfmperf(am);
263 rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
264} 246}
265 247
266/* 248/*
@@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur)
279static unsigned int get_measured_perf(struct cpufreq_policy *policy, 261static unsigned int get_measured_perf(struct cpufreq_policy *policy,
280 unsigned int cpu) 262 unsigned int cpu)
281{ 263{
282 struct perf_pair readin, cur; 264 struct aperfmperf perf;
283 unsigned int perf_percent; 265 unsigned long ratio;
284 unsigned int retval; 266 unsigned int retval;
285 267
286 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) 268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
287 return 0; 269 return 0;
288 270
289 cur.aperf.whole = readin.aperf.whole - 271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
290 per_cpu(msr_data, cpu).saved_aperf; 272 per_cpu(old_perf, cpu) = perf;
291 cur.mperf.whole = readin.mperf.whole -
292 per_cpu(msr_data, cpu).saved_mperf;
293 per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
294 per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
295
296#ifdef __i386__
297 /*
298 * We dont want to do 64 bit divide with 32 bit kernel
299 * Get an approximate value. Return failure in case we cannot get
300 * an approximate value.
301 */
302 if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
303 int shift_count;
304 u32 h;
305
306 h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
307 shift_count = fls(h);
308
309 cur.aperf.whole >>= shift_count;
310 cur.mperf.whole >>= shift_count;
311 }
312
313 if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
314 int shift_count = 7;
315 cur.aperf.split.lo >>= shift_count;
316 cur.mperf.split.lo >>= shift_count;
317 }
318
319 if (cur.aperf.split.lo && cur.mperf.split.lo)
320 perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
321 else
322 perf_percent = 0;
323
324#else
325 if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
326 int shift_count = 7;
327 cur.aperf.whole >>= shift_count;
328 cur.mperf.whole >>= shift_count;
329 }
330
331 if (cur.aperf.whole && cur.mperf.whole)
332 perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
333 else
334 perf_percent = 0;
335
336#endif
337 273
338 retval = (policy->cpuinfo.max_freq * perf_percent) / 100; 274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
339 275
340 return retval; 276 return retval;
341} 277}
@@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
394 unsigned int next_perf_state = 0; /* Index into perf table */ 330 unsigned int next_perf_state = 0; /* Index into perf table */
395 unsigned int i; 331 unsigned int i;
396 int result = 0; 332 int result = 0;
397 struct power_trace it;
398 333
399 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); 334 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
400 335
@@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
426 } 361 }
427 } 362 }
428 363
429 trace_power_mark(&it, POWER_PSTATE, next_perf_state); 364 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
430 365
431 switch (data->cpu_feature) { 366 switch (data->cpu_feature) {
432 case SYSTEM_INTEL_MSR_CAPABLE: 367 case SYSTEM_INTEL_MSR_CAPABLE:
@@ -588,6 +523,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
588 }, 523 },
589 { } 524 { }
590}; 525};
526
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/
533 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) &&
535 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable())
537 return -ENODEV;
538 }
539 return 0;
540}
591#endif 541#endif
592 542
593static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) 543static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
@@ -602,6 +552,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
602 552
603 dprintk("acpi_cpufreq_cpu_init\n"); 553 dprintk("acpi_cpufreq_cpu_init\n");
604 554
555#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c);
557 if (result)
558 return result;
559#endif
560
605 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
606 if (!data) 562 if (!data)
607 return -ENOMEM; 563 return -ENOMEM;
@@ -731,12 +687,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
731 acpi_processor_notify_smm(THIS_MODULE); 687 acpi_processor_notify_smm(THIS_MODULE);
732 688
733 /* Check for APERF/MPERF support in hardware */ 689 /* Check for APERF/MPERF support in hardware */
734 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { 690 if (cpu_has(c, X86_FEATURE_APERFMPERF))
735 unsigned int ecx; 691 acpi_cpufreq_driver.getavg = get_measured_perf;
736 ecx = cpuid_ecx(6);
737 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
738 acpi_cpufreq_driver.getavg = get_measured_perf;
739 }
740 692
741 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 693 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
742 for (i = 0; i < perf->state_count; i++) 694 for (i = 0; i < perf->state_count; i++)
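The acpi-cpufreq rewrite above drops the open-coded, 32/64-bit-specific APERF/MPERF arithmetic and reuses the shared helpers: get_aperfmperf() snapshots both MSRs on the target CPU, and calc_aperfmperf_ratio() turns two snapshots into a fixed-point ratio scaled by APERFMPERF_SHIFT, so the reported average frequency is max_freq * delta_aperf / delta_mperf. A minimal sketch of that arithmetic; the shift value of 10 is an assumption here, and the real helper also pre-shifts large deltas to avoid 64-bit overflow:

/* Sketch of the fixed-point ratio consumed by get_measured_perf() above.
 * APERFMPERF_SHIFT = 10 is an assumption; the real helper additionally
 * scales the deltas down when they would overflow the multiplication. */
#define APERFMPERF_SHIFT 10

struct aperfmperf_sample { unsigned long long aperf, mperf; };

static unsigned long calc_ratio(const struct aperfmperf_sample *old,
				const struct aperfmperf_sample *cur)
{
	unsigned long long aperf = cur->aperf - old->aperf;
	unsigned long long mperf = cur->mperf - old->mperf;

	if (!mperf)
		return 0;
	return (unsigned long)((aperf << APERFMPERF_SHIFT) / mperf);
}

/* ... and the caller scales it back down, as in the hunk above:
 *	avg_freq = (max_freq * ratio) >> APERFMPERF_SHIFT;
 */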
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 2a50ef891000..6394aa5c7985 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
605 return 0; 605 return 0;
606} 606}
607 607
608static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) 608static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
609 unsigned int entry)
609{ 610{
610 data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; 611 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
611} 612}
612 613
613static void print_basics(struct powernow_k8_data *data) 614static void print_basics(struct powernow_k8_data *data)
@@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
854 goto err_out; 855 goto err_out;
855 } 856 }
856 857
858 /* fill in data */
859 data->numps = data->acpi_data.state_count;
860 powernow_k8_acpi_pst_values(data, 0);
861
857 if (cpu_family == CPU_HW_PSTATE) 862 if (cpu_family == CPU_HW_PSTATE)
858 ret_val = fill_powernow_table_pstate(data, powernow_table); 863 ret_val = fill_powernow_table_pstate(data, powernow_table);
859 else 864 else
@@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
866 powernow_table[data->acpi_data.state_count].index = 0; 871 powernow_table[data->acpi_data.state_count].index = 0;
867 data->powernow_table = powernow_table; 872 data->powernow_table = powernow_table;
868 873
869 /* fill in data */
870 data->numps = data->acpi_data.state_count;
871 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 874 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
872 print_basics(data); 875 print_basics(data);
873 powernow_k8_acpi_pst_values(data, 0);
874 876
875 /* notify BIOS that we exist */ 877 /* notify BIOS that we exist */
876 acpi_processor_notify_smm(THIS_MODULE); 878 acpi_processor_notify_smm(THIS_MODULE);
@@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
914 "bad value %d.\n", i, index); 916 "bad value %d.\n", i, index);
915 printk(KERN_ERR PFX "Please report to BIOS " 917 printk(KERN_ERR PFX "Please report to BIOS "
916 "manufacturer\n"); 918 "manufacturer\n");
917 invalidate_entry(data, i); 919 invalidate_entry(powernow_table, i);
918 continue; 920 continue;
919 } 921 }
920 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); 922 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
921 if (!(hi & HW_PSTATE_VALID_MASK)) { 923 if (!(hi & HW_PSTATE_VALID_MASK)) {
922 dprintk("invalid pstate %d, ignoring\n", index); 924 dprintk("invalid pstate %d, ignoring\n", index);
923 invalidate_entry(data, i); 925 invalidate_entry(powernow_table, i);
924 continue; 926 continue;
925 } 927 }
926 928
@@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
941 struct cpufreq_frequency_table *powernow_table) 943 struct cpufreq_frequency_table *powernow_table)
942{ 944{
943 int i; 945 int i;
944 int cntlofreq = 0;
945 946
946 for (i = 0; i < data->acpi_data.state_count; i++) { 947 for (i = 0; i < data->acpi_data.state_count; i++) {
947 u32 fid; 948 u32 fid;
@@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
970 /* verify frequency is OK */ 971 /* verify frequency is OK */
971 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { 972 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
972 dprintk("invalid freq %u kHz, ignoring\n", freq); 973 dprintk("invalid freq %u kHz, ignoring\n", freq);
973 invalidate_entry(data, i); 974 invalidate_entry(powernow_table, i);
974 continue; 975 continue;
975 } 976 }
976 977
@@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
978 * BIOSs are using "off" to indicate invalid */ 979 * BIOSs are using "off" to indicate invalid */
979 if (vid == VID_OFF) { 980 if (vid == VID_OFF) {
980 dprintk("invalid vid %u, ignoring\n", vid); 981 dprintk("invalid vid %u, ignoring\n", vid);
981 invalidate_entry(data, i); 982 invalidate_entry(powernow_table, i);
982 continue; 983 continue;
983 } 984 }
984 985
985 /* verify only 1 entry from the lo frequency table */
986 if (fid < HI_FID_TABLE_BOTTOM) {
987 if (cntlofreq) {
988 /* if both entries are the same,
989 * ignore this one ... */
990 if ((freq != powernow_table[cntlofreq].frequency) ||
991 (index != powernow_table[cntlofreq].index)) {
992 printk(KERN_ERR PFX
993 "Too many lo freq table "
994 "entries\n");
995 return 1;
996 }
997
998 dprintk("double low frequency table entry, "
999 "ignoring it.\n");
1000 invalidate_entry(data, i);
1001 continue;
1002 } else
1003 cntlofreq = i;
1004 }
1005
1006 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { 986 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
1007 printk(KERN_INFO PFX "invalid freq entries " 987 printk(KERN_INFO PFX "invalid freq entries "
1008 "%u kHz vs. %u kHz\n", freq, 988 "%u kHz vs. %u kHz\n", freq,
1009 (unsigned int) 989 (unsigned int)
1010 (data->acpi_data.states[i].core_frequency 990 (data->acpi_data.states[i].core_frequency
1011 * 1000)); 991 * 1000));
1012 invalidate_entry(data, i); 992 invalidate_entry(powernow_table, i);
1013 continue; 993 continue;
1014 } 994 }
1015 } 995 }
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 93ba8eeb100a..08be922de33a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; 34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35} 35}
36 36
37unsigned long get_hypervisor_tsc_freq(void)
38{
39 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
40 return vmware_get_tsc_khz();
41 return 0;
42}
43
44static inline void __cpuinit 37static inline void __cpuinit
45hypervisor_set_feature_bits(struct cpuinfo_x86 *c) 38hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
46{ 39{
@@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
55 detect_hypervisor_vendor(c); 48 detect_hypervisor_vendor(c);
56 hypervisor_set_feature_bits(c); 49 hypervisor_set_feature_bits(c);
57} 50}
51
52void __init init_hypervisor_platform(void)
53{
54 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
56 vmware_platform_setup();
57}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 80a722a071b5..40e1835b35e8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
351 } 351 }
352 352
353 if (c->cpuid_level > 6) {
354 unsigned ecx = cpuid_ecx(6);
355 if (ecx & 0x01)
356 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
357 }
358
353 if (cpu_has_xmm2) 359 if (cpu_has_xmm2)
354 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 360 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
355 if (cpu_has_ds) { 361 if (cpu_has_ds) {
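The init_intel() hunk above derives the new X86_FEATURE_APERFMPERF flag from CPUID leaf 6: ECX bit 0 advertises the IA32_APERF/IA32_MPERF MSRs, which is the same check acpi-cpufreq used to open-code. For reference, the same capability bit can be read from user space with the compiler's cpuid.h helper; this is a standalone illustration, not kernel code:

/* Standalone sketch: report whether CPUID.06H:ECX[0] (APERF/MPERF) is set. */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(6, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 6 not supported");
		return 1;
	}
	printf("APERF/MPERF %ssupported\n", (ecx & 1) ? "" : "not ");
	return 0;
}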
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 188a1ca5ad2b..4ac6d48fe11b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,8 @@
1obj-y = mce.o 1obj-y = mce.o mce-severity.o
2 2
3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o 3obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
8obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
9obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 6obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
10obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
11 8
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
deleted file mode 100644
index b945d5dbc609..000000000000
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ /dev/null
@@ -1,116 +0,0 @@
1/*
2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For AMD Athlon/Duron: */
17static void k7_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 1; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57
58 /* Clear it: */
59 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
60 /* Serialize: */
61 wmb();
62 add_taint(TAINT_MACHINE_CHECK);
63 }
64 }
65
66 if (recover & 2)
67 panic("CPU context corrupt");
68 if (recover & 1)
69 panic("Unable to continue");
70
71 printk(KERN_EMERG "Attempting to continue.\n");
72
73 mcgstl &= ~(1<<2);
74 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
75}
76
77
78/* AMD K7 machine check is Intel like: */
79void amd_mcheck_init(struct cpuinfo_x86 *c)
80{
81 u32 l, h;
82 int i;
83
84 if (!cpu_has(c, X86_FEATURE_MCE))
85 return;
86
87 machine_check_vector = k7_machine_check;
88 /* Make sure the vector pointer is visible before we enable MCEs: */
89 wmb();
90
91 printk(KERN_INFO "Intel machine check architecture supported.\n");
92
93 rdmsr(MSR_IA32_MCG_CAP, l, h);
94 if (l & (1<<8)) /* Control register present ? */
95 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
96 nr_mce_banks = l & 0xff;
97
98 /*
99 * Clear status for MC index 0 separately, we don't touch CTL,
100 * as some K7 Athlons cause spurious MCEs when its enabled:
101 */
102 if (boot_cpu_data.x86 == 6) {
103 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
104 i = 1;
105 } else
106 i = 0;
107
108 for (; i < nr_mce_banks; i++) {
109 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
110 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
111 }
112
113 set_in_cr4(X86_CR4_MCE);
114 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
115 smp_processor_id());
116}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a3a235a53f09..472763d92098 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -18,7 +18,12 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/notifier.h>
22#include <linux/kdebug.h>
23#include <linux/cpu.h>
24#include <linux/sched.h>
21#include <asm/mce.h> 25#include <asm/mce.h>
26#include <asm/apic.h>
22 27
23/* Update fake mce registers on current CPU. */ 28/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m) 29static void inject_mce(struct mce *m)
@@ -39,44 +44,142 @@ static void inject_mce(struct mce *m)
39 i->finished = 1; 44 i->finished = 1;
40} 45}
41 46
42struct delayed_mce { 47static void raise_poll(struct mce *m)
43 struct timer_list timer; 48{
44 struct mce m; 49 unsigned long flags;
45}; 50 mce_banks_t b;
46 51
47/* Inject mce on current CPU */ 52 memset(&b, 0xff, sizeof(mce_banks_t));
48static void raise_mce(unsigned long data) 53 local_irq_save(flags);
54 machine_check_poll(0, &b);
55 local_irq_restore(flags);
56 m->finished = 0;
57}
58
59static void raise_exception(struct mce *m, struct pt_regs *pregs)
49{ 60{
50 struct delayed_mce *dm = (struct delayed_mce *)data; 61 struct pt_regs regs;
51 struct mce *m = &dm->m; 62 unsigned long flags;
52 int cpu = m->extcpu;
53 63
54 inject_mce(m); 64 if (!pregs) {
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs)); 65 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip; 66 regs.ip = m->ip;
59 regs.cs = m->cs; 67 regs.cs = m->cs;
68 pregs = &regs;
69 }
70 /* in mcheck exeception handler, irq will be disabled */
71 local_irq_save(flags);
72 do_machine_check(pregs, 0);
73 local_irq_restore(flags);
74 m->finished = 0;
75}
76
77static cpumask_t mce_inject_cpumask;
78
79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data)
81{
82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs);
90 else if (m->status)
91 raise_poll(m);
92 return NOTIFY_STOP;
93}
94
95static struct notifier_block mce_raise_nb = {
96 .notifier_call = mce_raise_notify,
97 .priority = 1000,
98};
99
100/* Inject mce on current CPU */
101static int raise_local(void)
102{
103 struct mce *m = &__get_cpu_var(injectm);
104 int context = MCJ_CTX(m->inject_flags);
105 int ret = 0;
106 int cpu = m->extcpu;
107
108 if (m->inject_flags & MCJ_EXCEPTION) {
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); 109 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0); 110 switch (context) {
111 case MCJ_CTX_IRQ:
112 /*
113 * Could do more to fake interrupts like
114 * calling irq_enter, but the necessary
115 * machinery isn't exported currently.
116 */
117 /*FALL THROUGH*/
118 case MCJ_CTX_PROCESS:
119 raise_exception(m, NULL);
120 break;
121 default:
122 printk(KERN_INFO "Invalid MCE context\n");
123 ret = -EINVAL;
124 }
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); 125 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else { 126 } else if (m->status) {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); 127 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b); 128 raise_poll(m);
68 mce_notify_irq(); 129 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n", 130 printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
70 cpu); 131 } else
71 } 132 m->finished = 0;
72 kfree(dm); 133
134 return ret;
135}
136
137static void raise_mce(struct mce *m)
138{
139 int context = MCJ_CTX(m->inject_flags);
140
141 inject_mce(m);
142
143 if (context == MCJ_CTX_RANDOM)
144 return;
145
146#ifdef CONFIG_X86_LOCAL_APIC
147 if (m->inject_flags & MCJ_NMI_BROADCAST) {
148 unsigned long start;
149 int cpu;
150 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map;
152 cpu_clear(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask);
158 }
159 if (!cpus_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask));
167 break;
168 }
169 cpu_relax();
170 }
171 raise_local();
172 put_cpu();
173 put_online_cpus();
174 } else
175#endif
176 raise_local();
73} 177}
74 178
75/* Error injection interface */ 179/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf, 180static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off) 181 size_t usize, loff_t *off)
78{ 182{
79 struct delayed_mce *dm;
80 struct mce m; 183 struct mce m;
81 184
82 if (!capable(CAP_SYS_ADMIN)) 185 if (!capable(CAP_SYS_ADMIN))
@@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) 199 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL; 200 return -EINVAL;
98 201
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /* 202 /*
104 * Need to give user space some time to set everything up, 203 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere. 204 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */ 205 */
108 memcpy(&dm->m, &m, sizeof(struct mce)); 206 schedule_timeout(2);
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm); 207 raise_mce(&m);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize; 208 return usize;
113} 209}
114 210
@@ -116,6 +212,7 @@ static int inject_init(void)
116{ 212{
117 printk(KERN_INFO "Machine check injector initialized\n"); 213 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write; 214 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb);
119 return 0; 216 return 0;
120} 217}
121 218
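The reworked injector above no longer arms a timer per target CPU; raise_mce() marks the targets in mce_inject_cpumask, sends them an NMI IPI, and each target's DIE_NMI_IPI die-notifier raises the fake exception or poll locally and clears itself from the mask so the sender's wait loop can finish. The notifier half of that flow, reduced to a skeleton (the MCJ_* dispatch and the per-CPU injectm handling from the hunk above are elided):

/* Sketch: the receiving side of the NMI-broadcast injection above. */
static cpumask_t mce_inject_cpumask;

static int mce_raise_notify(struct notifier_block *self,
			    unsigned long val, void *data)
{
	int cpu = smp_processor_id();

	/* Only react to the NMI IPI we sent, and only on targeted CPUs. */
	if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
		return NOTIFY_DONE;

	cpu_clear(cpu, mce_inject_cpumask);	/* tell the sender we ran */
	/* ... raise_exception()/raise_poll() on this CPU's injectm ... */
	return NOTIFY_STOP;
}

static struct notifier_block mce_raise_nb = {
	.notifier_call	= mce_raise_notify,
	.priority	= 1000,		/* run ahead of other NMI users */
};
/* registered once at init time via register_die_notifier(&mce_raise_nb) */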
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 54dcb8ff12e5..32996f9fab67 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,4 @@
1#include <linux/sysdev.h>
1#include <asm/mce.h> 2#include <asm/mce.h>
2 3
3enum severity_level { 4enum severity_level {
@@ -10,6 +11,20 @@ enum severity_level {
10 MCE_PANIC_SEVERITY, 11 MCE_PANIC_SEVERITY,
11}; 12};
12 13
14#define ATTR_LEN 16
15
16/* One object for each MCE bank, shared by all CPUs */
17struct mce_bank {
18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */
21 char attrname[ATTR_LEN]; /* attribute name */
22};
23
13int mce_severity(struct mce *a, int tolerant, char **msg); 24int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void);
14 26
15extern int mce_ser; 27extern int mce_ser;
28
29extern struct mce_bank *mce_banks;
30
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index ff0807f97056..8a85dd1b1aa1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
139 } 139 }
140} 140}
141 141
142#ifdef CONFIG_DEBUG_FS
142static void *s_start(struct seq_file *f, loff_t *pos) 143static void *s_start(struct seq_file *f, loff_t *pos)
143{ 144{
144 if (*pos >= ARRAY_SIZE(severities)) 145 if (*pos >= ARRAY_SIZE(severities))
@@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void)
197{ 198{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 199 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199 200
200 dmce = debugfs_create_dir("mce", NULL); 201 dmce = mce_get_debugfs_dir();
201 if (dmce == NULL) 202 if (dmce == NULL)
202 goto err_out; 203 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage", 204 fseverities_coverage = debugfs_create_file("severities-coverage",
@@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void)
209 return 0; 210 return 0;
210 211
211err_out: 212err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM; 213 return -ENOMEM;
217} 214}
218late_initcall(severities_debugfs_init); 215late_initcall(severities_debugfs_init);
216#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9bfe9d2ea615..b1598a9436d0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/debugfs.h>
37 38
38#include <asm/processor.h> 39#include <asm/processor.h>
39#include <asm/hw_irq.h> 40#include <asm/hw_irq.h>
@@ -45,21 +46,8 @@
45 46
46#include "mce-internal.h" 47#include "mce-internal.h"
47 48
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52 smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57 unexpected_machine_check;
58
59int mce_disabled __read_mostly; 49int mce_disabled __read_mostly;
60 50
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR 227 51#define MISC_MCELOG_MINOR 227
64 52
65#define SPINUNIT 100 /* 100ns */ 53#define SPINUNIT 100 /* 100ns */
@@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
77 */ 65 */
78static int tolerant __read_mostly = 1; 66static int tolerant __read_mostly = 1;
79static int banks __read_mostly; 67static int banks __read_mostly;
80static u64 *bank __read_mostly;
81static int rip_msr __read_mostly; 68static int rip_msr __read_mostly;
82static int mce_bootlog __read_mostly = -1; 69static int mce_bootlog __read_mostly = -1;
83static int monarch_timeout __read_mostly = -1; 70static int monarch_timeout __read_mostly = -1;
@@ -87,28 +74,35 @@ int mce_cmci_disabled __read_mostly;
87int mce_ignore_ce __read_mostly; 74int mce_ignore_ce __read_mostly;
88int mce_ser __read_mostly; 75int mce_ser __read_mostly;
89 76
77struct mce_bank *mce_banks __read_mostly;
78
90/* User mode helper program triggered by machine check event */ 79/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify; 80static unsigned long mce_need_notify;
92static char mce_helper[128]; 81static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL }; 82static char *mce_helper_argv[2] = { mce_helper, NULL };
94 83
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 84static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen); 85static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 86static int cpu_missing;
100 87
88static void default_decode_mce(struct mce *m)
89{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
92}
93
94/*
95 * CPU/chipset specific EDAC code can register a callback here to print
96 * MCE errors in a human-readable form:
97 */
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
101 100
102/* MCA banks polled by the period polling timer for corrected events */ 101/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 103 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105}; 104};
106 105
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work); 106static DEFINE_PER_CPU(struct work_struct, mce_work);
113 107
114/* Do initial initialization of a struct mce */ 108/* Do initial initialization of a struct mce */
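The hunk above replaces the old __weak decode_mce() with an exported x86_mce_decode_callback function pointer that defaults to default_decode_mce(), so EDAC or chipset-specific code can install a human-readable decoder at run time. A hedged sketch of how such a consumer might hook and later restore it; the module init/exit boilerplate is illustrative and not taken from this diff:

/* Sketch: installing a decoder behind the callback exported above. */
static void (*orig_decode)(struct mce *m);

static void my_decode_mce(struct mce *m)
{
	/* ... print a human-readable interpretation of m->status ... */
}

static int __init my_decoder_init(void)
{
	orig_decode = x86_mce_decode_callback;
	x86_mce_decode_callback = my_decode_mce;
	return 0;
}

static void __exit my_decoder_exit(void)
{
	x86_mce_decode_callback = orig_decode;
}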
@@ -183,59 +177,60 @@ void mce_log(struct mce *mce)
183 set_bit(0, &mce_need_notify); 177 set_bit(0, &mce_need_notify);
184} 178}
185 179
186void __weak decode_mce(struct mce *m)
187{
188 return;
189}
190
191static void print_mce(struct mce *m) 180static void print_mce(struct mce *m)
192{ 181{
193 printk(KERN_EMERG 182 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
194 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
195 m->extcpu, m->mcgstatus, m->bank, m->status); 183 m->extcpu, m->mcgstatus, m->bank, m->status);
184
196 if (m->ip) { 185 if (m->ip) {
197 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 186 pr_emerg("RIP%s %02x:<%016Lx> ",
198 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 187 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
199 m->cs, m->ip); 188 m->cs, m->ip);
189
200 if (m->cs == __KERNEL_CS) 190 if (m->cs == __KERNEL_CS)
201 print_symbol("{%s}", m->ip); 191 print_symbol("{%s}", m->ip);
202 printk(KERN_CONT "\n"); 192 pr_cont("\n");
203 } 193 }
204 printk(KERN_EMERG "TSC %llx ", m->tsc); 194
195 pr_emerg("TSC %llx ", m->tsc);
205 if (m->addr) 196 if (m->addr)
206 printk(KERN_CONT "ADDR %llx ", m->addr); 197 pr_cont("ADDR %llx ", m->addr);
207 if (m->misc) 198 if (m->misc)
208 printk(KERN_CONT "MISC %llx ", m->misc); 199 pr_cont("MISC %llx ", m->misc);
209 printk(KERN_CONT "\n");
210 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
211 m->cpuvendor, m->cpuid, m->time, m->socketid,
212 m->apicid);
213 200
214 decode_mce(m); 201 pr_cont("\n");
202 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
203 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
204
205 /*
206 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that):
208 */
209 x86_mce_decode_callback(m);
215} 210}
216 211
217static void print_mce_head(void) 212static void print_mce_head(void)
218{ 213{
219 printk(KERN_EMERG "\nHARDWARE ERROR\n"); 214 pr_emerg("\nHARDWARE ERROR\n");
220} 215}
221 216
222static void print_mce_tail(void) 217static void print_mce_tail(void)
223{ 218{
224 printk(KERN_EMERG "This is not a software problem!\n" 219 pr_emerg("This is not a software problem!\n");
225#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
226 "Run through mcelog --ascii to decode and contact your hardware vendor\n"
227#endif
228 );
229} 220}
230 221
231#define PANIC_TIMEOUT 5 /* 5 seconds */ 222#define PANIC_TIMEOUT 5 /* 5 seconds */
232 223
233static atomic_t mce_paniced; 224static atomic_t mce_paniced;
234 225
226static int fake_panic;
227static atomic_t mce_fake_paniced;
228
235/* Panic in progress. Enable interrupts and wait for final IPI */ 229/* Panic in progress. Enable interrupts and wait for final IPI */
236static void wait_for_panic(void) 230static void wait_for_panic(void)
237{ 231{
238 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 232 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
233
239 preempt_disable(); 234 preempt_disable();
240 local_irq_enable(); 235 local_irq_enable();
241 while (timeout-- > 0) 236 while (timeout-- > 0)
@@ -249,15 +244,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
249{ 244{
250 int i; 245 int i;
251 246
252 /* 247 if (!fake_panic) {
253 * Make sure only one CPU runs in machine check panic 248 /*
254 */ 249 * Make sure only one CPU runs in machine check panic
255 if (atomic_add_return(1, &mce_paniced) > 1) 250 */
256 wait_for_panic(); 251 if (atomic_inc_return(&mce_paniced) > 1)
257 barrier(); 252 wait_for_panic();
253 barrier();
258 254
259 bust_spinlocks(1); 255 bust_spinlocks(1);
260 console_verbose(); 256 console_verbose();
257 } else {
258 /* Don't log too much for fake panic */
259 if (atomic_inc_return(&mce_fake_paniced) > 1)
260 return;
261 }
261 print_mce_head(); 262 print_mce_head();
262 /* First print corrected ones that are still unlogged */ 263 /* First print corrected ones that are still unlogged */
263 for (i = 0; i < MCE_LOG_LEN; i++) { 264 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -284,9 +285,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
284 print_mce_tail(); 285 print_mce_tail();
285 if (exp) 286 if (exp)
286 printk(KERN_EMERG "Machine check: %s\n", exp); 287 printk(KERN_EMERG "Machine check: %s\n", exp);
287 if (panic_timeout == 0) 288 if (!fake_panic) {
288 panic_timeout = mce_panic_timeout; 289 if (panic_timeout == 0)
289 panic(msg); 290 panic_timeout = mce_panic_timeout;
291 panic(msg);
292 } else
293 printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
290} 294}
291 295
292/* Support code for software error injection */ 296/* Support code for software error injection */
@@ -294,13 +298,14 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
294static int msr_to_offset(u32 msr) 298static int msr_to_offset(u32 msr)
295{ 299{
296 unsigned bank = __get_cpu_var(injectm.bank); 300 unsigned bank = __get_cpu_var(injectm.bank);
301
297 if (msr == rip_msr) 302 if (msr == rip_msr)
298 return offsetof(struct mce, ip); 303 return offsetof(struct mce, ip);
299 if (msr == MSR_IA32_MC0_STATUS + bank*4) 304 if (msr == MSR_IA32_MCx_STATUS(bank))
300 return offsetof(struct mce, status); 305 return offsetof(struct mce, status);
301 if (msr == MSR_IA32_MC0_ADDR + bank*4) 306 if (msr == MSR_IA32_MCx_ADDR(bank))
302 return offsetof(struct mce, addr); 307 return offsetof(struct mce, addr);
303 if (msr == MSR_IA32_MC0_MISC + bank*4) 308 if (msr == MSR_IA32_MCx_MISC(bank))
304 return offsetof(struct mce, misc); 309 return offsetof(struct mce, misc);
305 if (msr == MSR_IA32_MCG_STATUS) 310 if (msr == MSR_IA32_MCG_STATUS)
306 return offsetof(struct mce, mcgstatus); 311 return offsetof(struct mce, mcgstatus);
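From this hunk onward the patch replaces open-coded bank MSR arithmetic (MSR_IA32_MC0_STATUS + bank*4 and friends) with MSR_IA32_MCx_*() accessors. Their definitions live in <asm/msr-index.h> and are not part of this excerpt; judging from the one-for-one replacements they presumably expand to the same arithmetic, roughly as sketched here.

/*
 * Presumed expansion of the per-bank MSR helpers: each MCA bank owns a
 * block of four architectural MSRs, so bank x starts at MC0_CTL + 4*x.
 * The authoritative definitions are in <asm/msr-index.h>, outside this diff.
 */
#define MSR_IA32_MCx_CTL(x)	(MSR_IA32_MC0_CTL + 4*(x))
#define MSR_IA32_MCx_STATUS(x)	(MSR_IA32_MC0_STATUS + 4*(x))
#define MSR_IA32_MCx_ADDR(x)	(MSR_IA32_MC0_ADDR + 4*(x))
#define MSR_IA32_MCx_MISC(x)	(MSR_IA32_MC0_MISC + 4*(x))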
@@ -311,13 +316,25 @@ static int msr_to_offset(u32 msr)
311static u64 mce_rdmsrl(u32 msr) 316static u64 mce_rdmsrl(u32 msr)
312{ 317{
313 u64 v; 318 u64 v;
319
314 if (__get_cpu_var(injectm).finished) { 320 if (__get_cpu_var(injectm).finished) {
315 int offset = msr_to_offset(msr); 321 int offset = msr_to_offset(msr);
322
316 if (offset < 0) 323 if (offset < 0)
317 return 0; 324 return 0;
318 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 325 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
319 } 326 }
320 rdmsrl(msr, v); 327
328 if (rdmsrl_safe(msr, &v)) {
329 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
330 /*
331 * Return zero in case the access faulted. This should
332 * not happen normally but can happen if the CPU does
333 * something weird, or if the code is buggy.
334 */
335 v = 0;
336 }
337
321 return v; 338 return v;
322} 339}
323 340
@@ -325,6 +342,7 @@ static void mce_wrmsrl(u32 msr, u64 v)
325{ 342{
326 if (__get_cpu_var(injectm).finished) { 343 if (__get_cpu_var(injectm).finished) {
327 int offset = msr_to_offset(msr); 344 int offset = msr_to_offset(msr);
345
328 if (offset >= 0) 346 if (offset >= 0)
329 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 347 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
330 return; 348 return;
@@ -421,7 +439,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
421 m->ip = mce_rdmsrl(rip_msr); 439 m->ip = mce_rdmsrl(rip_msr);
422} 440}
423 441
424#ifdef CONFIG_X86_LOCAL_APIC 442#ifdef CONFIG_X86_LOCAL_APIC
425/* 443/*
426 * Called after interrupts have been reenabled again 444 * Called after interrupts have been reenabled again
427 * when a MCE happened during an interrupts off region 445 * when a MCE happened during an interrupts off region
@@ -505,7 +523,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
505 523
506 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 524 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
507 for (i = 0; i < banks; i++) { 525 for (i = 0; i < banks; i++) {
508 if (!bank[i] || !test_bit(i, *b)) 526 if (!mce_banks[i].ctl || !test_bit(i, *b))
509 continue; 527 continue;
510 528
511 m.misc = 0; 529 m.misc = 0;
@@ -514,7 +532,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
514 m.tsc = 0; 532 m.tsc = 0;
515 533
516 barrier(); 534 barrier();
517 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 535 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
518 if (!(m.status & MCI_STATUS_VAL)) 536 if (!(m.status & MCI_STATUS_VAL))
519 continue; 537 continue;
520 538
@@ -529,9 +547,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
529 continue; 547 continue;
530 548
531 if (m.status & MCI_STATUS_MISCV) 549 if (m.status & MCI_STATUS_MISCV)
532 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 550 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
533 if (m.status & MCI_STATUS_ADDRV) 551 if (m.status & MCI_STATUS_ADDRV)
534 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 552 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
535 553
536 if (!(flags & MCP_TIMESTAMP)) 554 if (!(flags & MCP_TIMESTAMP))
537 m.tsc = 0; 555 m.tsc = 0;
@@ -547,7 +565,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
547 /* 565 /*
548 * Clear state for this bank. 566 * Clear state for this bank.
549 */ 567 */
550 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 568 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
551 } 569 }
552 570
553 /* 571 /*
@@ -568,7 +586,7 @@ static int mce_no_way_out(struct mce *m, char **msg)
568 int i; 586 int i;
569 587
570 for (i = 0; i < banks; i++) { 588 for (i = 0; i < banks; i++) {
571 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 589 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
572 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 590 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
573 return 1; 591 return 1;
574 } 592 }
@@ -628,7 +646,7 @@ out:
628 * This way we prevent any potential data corruption in a unrecoverable case 646 * This way we prevent any potential data corruption in a unrecoverable case
629 * and also makes sure always all CPU's errors are examined. 647 * and also makes sure always all CPU's errors are examined.
630 * 648 *
631 * Also this detects the case of an machine check event coming from outer 649 * Also this detects the case of a machine check event coming from outer
632 * space (not detected by any CPUs) In this case some external agent wants 650 * space (not detected by any CPUs) In this case some external agent wants
633 * us to shut down, so panic too. 651 * us to shut down, so panic too.
634 * 652 *
@@ -681,7 +699,7 @@ static void mce_reign(void)
681 * No machine check event found. Must be some external 699 * No machine check event found. Must be some external
682 * source or one CPU is hung. Panic. 700 * source or one CPU is hung. Panic.
683 */ 701 */
684 if (!m && tolerant < 3) 702 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
685 mce_panic("Machine check from unknown source", NULL, NULL); 703 mce_panic("Machine check from unknown source", NULL, NULL);
686 704
687 /* 705 /*
@@ -715,7 +733,7 @@ static int mce_start(int *no_way_out)
715 * global_nwo should be updated before mce_callin 733 * global_nwo should be updated before mce_callin
716 */ 734 */
717 smp_wmb(); 735 smp_wmb();
718 order = atomic_add_return(1, &mce_callin); 736 order = atomic_inc_return(&mce_callin);
719 737
720 /* 738 /*
721 * Wait for everyone. 739 * Wait for everyone.
@@ -852,7 +870,7 @@ static void mce_clear_state(unsigned long *toclear)
852 870
853 for (i = 0; i < banks; i++) { 871 for (i = 0; i < banks; i++) {
854 if (test_bit(i, toclear)) 872 if (test_bit(i, toclear))
855 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 873 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
856 } 874 }
857} 875}
858 876
@@ -905,11 +923,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
905 mce_setup(&m); 923 mce_setup(&m);
906 924
907 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 925 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
908 no_way_out = mce_no_way_out(&m, &msg);
909
910 final = &__get_cpu_var(mces_seen); 926 final = &__get_cpu_var(mces_seen);
911 *final = m; 927 *final = m;
912 928
929 no_way_out = mce_no_way_out(&m, &msg);
930
913 barrier(); 931 barrier();
914 932
915 /* 933 /*
@@ -926,14 +944,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
926 order = mce_start(&no_way_out); 944 order = mce_start(&no_way_out);
927 for (i = 0; i < banks; i++) { 945 for (i = 0; i < banks; i++) {
928 __clear_bit(i, toclear); 946 __clear_bit(i, toclear);
929 if (!bank[i]) 947 if (!mce_banks[i].ctl)
930 continue; 948 continue;
931 949
932 m.misc = 0; 950 m.misc = 0;
933 m.addr = 0; 951 m.addr = 0;
934 m.bank = i; 952 m.bank = i;
935 953
936 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 954 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
937 if ((m.status & MCI_STATUS_VAL) == 0) 955 if ((m.status & MCI_STATUS_VAL) == 0)
938 continue; 956 continue;
939 957
@@ -974,9 +992,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
974 kill_it = 1; 992 kill_it = 1;
975 993
976 if (m.status & MCI_STATUS_MISCV) 994 if (m.status & MCI_STATUS_MISCV)
977 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 995 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
978 if (m.status & MCI_STATUS_ADDRV) 996 if (m.status & MCI_STATUS_ADDRV)
979 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 997 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
980 998
981 /* 999 /*
982 * Action optional error. Queue address for later processing. 1000 * Action optional error. Queue address for later processing.
@@ -1101,7 +1119,7 @@ void mce_log_therm_throt_event(__u64 status)
1101 */ 1119 */
1102static int check_interval = 5 * 60; /* 5 minutes */ 1120static int check_interval = 5 * 60; /* 5 minutes */
1103 1121
1104static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1105static DEFINE_PER_CPU(struct timer_list, mce_timer); 1123static DEFINE_PER_CPU(struct timer_list, mce_timer);
1106 1124
1107static void mcheck_timer(unsigned long data) 1125static void mcheck_timer(unsigned long data)
@@ -1120,7 +1138,7 @@ static void mcheck_timer(unsigned long data)
1120 * Alert userspace if needed. If we logged an MCE, reduce the 1138 * Alert userspace if needed. If we logged an MCE, reduce the
1121 * polling interval, otherwise increase the polling interval. 1139 * polling interval, otherwise increase the polling interval.
1122 */ 1140 */
1123 n = &__get_cpu_var(next_interval); 1141 n = &__get_cpu_var(mce_next_interval);
1124 if (mce_notify_irq()) 1142 if (mce_notify_irq())
1125 *n = max(*n/2, HZ/100); 1143 *n = max(*n/2, HZ/100);
1126 else 1144 else
@@ -1169,10 +1187,26 @@ int mce_notify_irq(void)
1169} 1187}
1170EXPORT_SYMBOL_GPL(mce_notify_irq); 1188EXPORT_SYMBOL_GPL(mce_notify_irq);
1171 1189
1190static int mce_banks_init(void)
1191{
1192 int i;
1193
1194 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1195 if (!mce_banks)
1196 return -ENOMEM;
1197 for (i = 0; i < banks; i++) {
1198 struct mce_bank *b = &mce_banks[i];
1199
1200 b->ctl = -1ULL;
1201 b->init = 1;
1202 }
1203 return 0;
1204}
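mce_banks_init() above allocates one struct mce_bank per hardware bank, replacing the old bare u64 *bank array and the dont_init_banks bitmap. The structure itself is defined outside this diff (likely in the mcheck internal header); based on the fields the patch uses (ctl, init, attr, attrname) it presumably looks roughly like this sketch, where ATTR_LEN is likewise an assumption.

#define ATTR_LEN	16	/* assumed */

struct mce_bank {
	u64			ctl;			/* subevents to enable */
	unsigned char		init;			/* initialise this bank? */
	struct sysdev_attribute	attr;			/* sysfs "bankN" attribute */
	char			attrname[ATTR_LEN];	/* attribute name storage */
};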
1205
1172/* 1206/*
1173 * Initialize Machine Checks for a CPU. 1207 * Initialize Machine Checks for a CPU.
1174 */ 1208 */
1175static int mce_cap_init(void) 1209static int __cpuinit mce_cap_init(void)
1176{ 1210{
1177 unsigned b; 1211 unsigned b;
1178 u64 cap; 1212 u64 cap;
@@ -1192,11 +1226,11 @@ static int mce_cap_init(void)
1192 /* Don't support asymmetric configurations today */ 1226 /* Don't support asymmetric configurations today */
1193 WARN_ON(banks != 0 && b != banks); 1227 WARN_ON(banks != 0 && b != banks);
1194 banks = b; 1228 banks = b;
1195 if (!bank) { 1229 if (!mce_banks) {
1196 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1230 int err = mce_banks_init();
1197 if (!bank) 1231
1198 return -ENOMEM; 1232 if (err)
1199 memset(bank, 0xff, banks * sizeof(u64)); 1233 return err;
1200 } 1234 }
1201 1235
1202 /* Use accurate RIP reporting if available. */ 1236 /* Use accurate RIP reporting if available. */
@@ -1228,15 +1262,17 @@ static void mce_init(void)
1228 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1262 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1229 1263
1230 for (i = 0; i < banks; i++) { 1264 for (i = 0; i < banks; i++) {
1231 if (skip_bank_init(i)) 1265 struct mce_bank *b = &mce_banks[i];
1266
1267 if (!b->init)
1232 continue; 1268 continue;
1233 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1269 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1234 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1270 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1235 } 1271 }
1236} 1272}
1237 1273
1238/* Add per CPU specific workarounds here */ 1274/* Add per CPU specific workarounds here */
1239static int mce_cpu_quirks(struct cpuinfo_x86 *c) 1275static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1240{ 1276{
1241 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1277 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1242 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1278 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1251,7 +1287,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1251 * trips off incorrectly with the IOMMU & 3ware 1287 * trips off incorrectly with the IOMMU & 3ware
1252 * & Cerberus: 1288 * & Cerberus:
1253 */ 1289 */
1254 clear_bit(10, (unsigned long *)&bank[4]); 1290 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1255 } 1291 }
1256 if (c->x86 <= 17 && mce_bootlog < 0) { 1292 if (c->x86 <= 17 && mce_bootlog < 0) {
1257 /* 1293 /*
@@ -1265,7 +1301,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1265 * by default. 1301 * by default.
1266 */ 1302 */
1267 if (c->x86 == 6 && banks > 0) 1303 if (c->x86 == 6 && banks > 0)
1268 bank[0] = 0; 1304 mce_banks[0].ctl = 0;
1269 } 1305 }
1270 1306
1271 if (c->x86_vendor == X86_VENDOR_INTEL) { 1307 if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1278,8 +1314,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1278 * valid event later, merely don't write CTL0. 1314 * valid event later, merely don't write CTL0.
1279 */ 1315 */
1280 1316
1281 if (c->x86 == 6 && c->x86_model < 0x1A) 1317 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1282 __set_bit(0, &dont_init_banks); 1318 mce_banks[0].init = 0;
1283 1319
1284 /* 1320 /*
1285 * All newer Intel systems support MCE broadcasting. Enable 1321 * All newer Intel systems support MCE broadcasting. Enable
@@ -1335,7 +1371,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1335static void mce_init_timer(void) 1371static void mce_init_timer(void)
1336{ 1372{
1337 struct timer_list *t = &__get_cpu_var(mce_timer); 1373 struct timer_list *t = &__get_cpu_var(mce_timer);
1338 int *n = &__get_cpu_var(next_interval); 1374 int *n = &__get_cpu_var(mce_next_interval);
1339 1375
1340 if (mce_ignore_ce) 1376 if (mce_ignore_ce)
1341 return; 1377 return;
@@ -1348,6 +1384,17 @@ static void mce_init_timer(void)
1348 add_timer_on(t, smp_processor_id()); 1384 add_timer_on(t, smp_processor_id());
1349} 1385}
1350 1386
1387/* Handle unconfigured int18 (should never happen) */
1388static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1389{
1390 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1391 smp_processor_id());
1392}
1393
1394/* Call the installed machine check handler for this CPU setup. */
1395void (*machine_check_vector)(struct pt_regs *, long error_code) =
1396 unexpected_machine_check;
1397
1351/* 1398/*
1352 * Called for each booted CPU to set up machine checks. 1399 * Called for each booted CPU to set up machine checks.
1353 * Must be called with preempt off: 1400 * Must be called with preempt off:
@@ -1561,8 +1608,10 @@ static struct miscdevice mce_log_device = {
1561 */ 1608 */
1562static int __init mcheck_enable(char *str) 1609static int __init mcheck_enable(char *str)
1563{ 1610{
1564 if (*str == 0) 1611 if (*str == 0) {
1565 enable_p5_mce(); 1612 enable_p5_mce();
1613 return 1;
1614 }
1566 if (*str == '=') 1615 if (*str == '=')
1567 str++; 1616 str++;
1568 if (!strcmp(str, "off")) 1617 if (!strcmp(str, "off"))
@@ -1603,8 +1652,10 @@ static int mce_disable(void)
1603 int i; 1652 int i;
1604 1653
1605 for (i = 0; i < banks; i++) { 1654 for (i = 0; i < banks; i++) {
1606 if (!skip_bank_init(i)) 1655 struct mce_bank *b = &mce_banks[i];
1607 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1656
1657 if (b->init)
1658 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1608 } 1659 }
1609 return 0; 1660 return 0;
1610} 1661}
@@ -1679,14 +1730,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);
1679__cpuinitdata 1730__cpuinitdata
1680void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1731void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1681 1732
1682static struct sysdev_attribute *bank_attrs; 1733static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1734{
1735 return container_of(attr, struct mce_bank, attr);
1736}
1683 1737
1684static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1738static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1685 char *buf) 1739 char *buf)
1686{ 1740{
1687 u64 b = bank[attr - bank_attrs]; 1741 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1688
1689 return sprintf(buf, "%llx\n", b);
1690} 1742}
1691 1743
1692static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1744static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
@@ -1697,7 +1749,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1697 if (strict_strtoull(buf, 0, &new) < 0) 1749 if (strict_strtoull(buf, 0, &new) < 0)
1698 return -EINVAL; 1750 return -EINVAL;
1699 1751
1700 bank[attr - bank_attrs] = new; 1752 attr_to_bank(attr)->ctl = new;
1701 mce_restart(); 1753 mce_restart();
1702 1754
1703 return size; 1755 return size;
@@ -1839,7 +1891,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1839 } 1891 }
1840 for (j = 0; j < banks; j++) { 1892 for (j = 0; j < banks; j++) {
1841 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1893 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1842 &bank_attrs[j]); 1894 &mce_banks[j].attr);
1843 if (err) 1895 if (err)
1844 goto error2; 1896 goto error2;
1845 } 1897 }
@@ -1848,10 +1900,10 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1848 return 0; 1900 return 0;
1849error2: 1901error2:
1850 while (--j >= 0) 1902 while (--j >= 0)
1851 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); 1903 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1852error: 1904error:
1853 while (--i >= 0) 1905 while (--i >= 0)
1854 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1906 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1855 1907
1856 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1908 sysdev_unregister(&per_cpu(mce_dev, cpu));
1857 1909
@@ -1869,7 +1921,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1869 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1921 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1870 1922
1871 for (i = 0; i < banks; i++) 1923 for (i = 0; i < banks; i++)
1872 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1924 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1873 1925
1874 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1926 sysdev_unregister(&per_cpu(mce_dev, cpu));
1875 cpumask_clear_cpu(cpu, mce_dev_initialized); 1927 cpumask_clear_cpu(cpu, mce_dev_initialized);
@@ -1886,8 +1938,10 @@ static void mce_disable_cpu(void *h)
1886 if (!(action & CPU_TASKS_FROZEN)) 1938 if (!(action & CPU_TASKS_FROZEN))
1887 cmci_clear(); 1939 cmci_clear();
1888 for (i = 0; i < banks; i++) { 1940 for (i = 0; i < banks; i++) {
1889 if (!skip_bank_init(i)) 1941 struct mce_bank *b = &mce_banks[i];
1890 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1942
1943 if (b->init)
1944 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1891 } 1945 }
1892} 1946}
1893 1947
@@ -1902,8 +1956,10 @@ static void mce_reenable_cpu(void *h)
1902 if (!(action & CPU_TASKS_FROZEN)) 1956 if (!(action & CPU_TASKS_FROZEN))
1903 cmci_reenable(); 1957 cmci_reenable();
1904 for (i = 0; i < banks; i++) { 1958 for (i = 0; i < banks; i++) {
1905 if (!skip_bank_init(i)) 1959 struct mce_bank *b = &mce_banks[i];
1906 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1960
1961 if (b->init)
1962 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1907 } 1963 }
1908} 1964}
1909 1965
@@ -1935,7 +1991,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1935 case CPU_DOWN_FAILED: 1991 case CPU_DOWN_FAILED:
1936 case CPU_DOWN_FAILED_FROZEN: 1992 case CPU_DOWN_FAILED_FROZEN:
1937 t->expires = round_jiffies(jiffies + 1993 t->expires = round_jiffies(jiffies +
1938 __get_cpu_var(next_interval)); 1994 __get_cpu_var(mce_next_interval));
1939 add_timer_on(t, cpu); 1995 add_timer_on(t, cpu);
1940 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1996 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1941 break; 1997 break;
@@ -1951,35 +2007,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1951 .notifier_call = mce_cpu_callback, 2007 .notifier_call = mce_cpu_callback,
1952}; 2008};
1953 2009
1954static __init int mce_init_banks(void) 2010static __init void mce_init_banks(void)
1955{ 2011{
1956 int i; 2012 int i;
1957 2013
1958 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1959 GFP_KERNEL);
1960 if (!bank_attrs)
1961 return -ENOMEM;
1962
1963 for (i = 0; i < banks; i++) { 2014 for (i = 0; i < banks; i++) {
1964 struct sysdev_attribute *a = &bank_attrs[i]; 2015 struct mce_bank *b = &mce_banks[i];
2016 struct sysdev_attribute *a = &b->attr;
1965 2017
1966 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 2018 a->attr.name = b->attrname;
1967 if (!a->attr.name) 2019 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
1968 goto nomem;
1969 2020
1970 a->attr.mode = 0644; 2021 a->attr.mode = 0644;
1971 a->show = show_bank; 2022 a->show = show_bank;
1972 a->store = set_bank; 2023 a->store = set_bank;
1973 } 2024 }
1974 return 0;
1975
1976nomem:
1977 while (--i >= 0)
1978 kfree(bank_attrs[i].attr.name);
1979 kfree(bank_attrs);
1980 bank_attrs = NULL;
1981
1982 return -ENOMEM;
1983} 2025}
1984 2026
1985static __init int mce_init_device(void) 2027static __init int mce_init_device(void)
@@ -1992,9 +2034,7 @@ static __init int mce_init_device(void)
1992 2034
1993 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2035 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1994 2036
1995 err = mce_init_banks(); 2037 mce_init_banks();
1996 if (err)
1997 return err;
1998 2038
1999 err = sysdev_class_register(&mce_sysclass); 2039 err = sysdev_class_register(&mce_sysclass);
2000 if (err) 2040 if (err)
@@ -2014,57 +2054,65 @@ static __init int mce_init_device(void)
2014 2054
2015device_initcall(mce_init_device); 2055device_initcall(mce_init_device);
2016 2056
2017#else /* CONFIG_X86_OLD_MCE: */ 2057/*
2018 2058 * Old style boot options parsing. Only for compatibility.
2019int nr_mce_banks; 2059 */
2020EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 2060static int __init mcheck_disable(char *str)
2061{
2062 mce_disabled = 1;
2063 return 1;
2064}
2065__setup("nomce", mcheck_disable);
2021 2066
2022/* This has to be run for each processor */ 2067#ifdef CONFIG_DEBUG_FS
2023void mcheck_init(struct cpuinfo_x86 *c) 2068struct dentry *mce_get_debugfs_dir(void)
2024{ 2069{
2025 if (mce_disabled) 2070 static struct dentry *dmce;
2026 return;
2027 2071
2028 switch (c->x86_vendor) { 2072 if (!dmce)
2029 case X86_VENDOR_AMD: 2073 dmce = debugfs_create_dir("mce", NULL);
2030 amd_mcheck_init(c);
2031 break;
2032 2074
2033 case X86_VENDOR_INTEL: 2075 return dmce;
2034 if (c->x86 == 5) 2076}
2035 intel_p5_mcheck_init(c);
2036 if (c->x86 == 6)
2037 intel_p6_mcheck_init(c);
2038 if (c->x86 == 15)
2039 intel_p4_mcheck_init(c);
2040 break;
2041 2077
2042 case X86_VENDOR_CENTAUR: 2078static void mce_reset(void)
2043 if (c->x86 == 5) 2079{
2044 winchip_mcheck_init(c); 2080 cpu_missing = 0;
2045 break; 2081 atomic_set(&mce_fake_paniced, 0);
2082 atomic_set(&mce_executing, 0);
2083 atomic_set(&mce_callin, 0);
2084 atomic_set(&global_nwo, 0);
2085}
2046 2086
2047 default: 2087static int fake_panic_get(void *data, u64 *val)
2048 break; 2088{
2049 } 2089 *val = fake_panic;
2050 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 2090 return 0;
2051} 2091}
2052 2092
2053static int __init mcheck_enable(char *str) 2093static int fake_panic_set(void *data, u64 val)
2054{ 2094{
2055 mce_p5_enabled = 1; 2095 mce_reset();
2056 return 1; 2096 fake_panic = val;
2097 return 0;
2057} 2098}
2058__setup("mce", mcheck_enable);
2059 2099
2060#endif /* CONFIG_X86_OLD_MCE */ 2100DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2101 fake_panic_set, "%llu\n");
2061 2102
2062/* 2103static int __init mce_debugfs_init(void)
2063 * Old style boot options parsing. Only for compatibility.
2064 */
2065static int __init mcheck_disable(char *str)
2066{ 2104{
2067 mce_disabled = 1; 2105 struct dentry *dmce, *ffake_panic;
2068 return 1; 2106
2107 dmce = mce_get_debugfs_dir();
2108 if (!dmce)
2109 return -ENOMEM;
2110 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2111 &fake_panic_fops);
2112 if (!ffake_panic)
2113 return -ENOMEM;
2114
2115 return 0;
2069} 2116}
2070__setup("nomce", mcheck_disable); 2117late_initcall(mce_debugfs_init);
2118#endif
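The new CONFIG_DEBUG_FS block above creates a shared "mce" debugfs directory and a fake_panic attribute that exercises the mce_panic() logging path without actually halting the machine. mce_get_debugfs_dir() is deliberately non-static, so other mcheck code can presumably hang further entries off the same directory; a hypothetical example follows (the knob name and variable are invented).

#include <linux/debugfs.h>
#include <linux/init.h>

static u32 example_mce_knob;	/* invented example knob */

static int __init example_mce_debugfs_init(void)
{
	struct dentry *dmce = mce_get_debugfs_dir();

	if (!dmce)
		return -ENOMEM;
	/* Publish a simple u32 next to fake_panic in the shared "mce" dir. */
	if (!debugfs_create_u32("example_knob", 0644, dmce, &example_mce_knob))
		return -ENOMEM;
	return 0;
}
late_initcall(example_mce_debugfs_init);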
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1fecba404fd8..83a3d1f4efca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
73 73
74#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = { 75static unsigned char shared_bank[NR_BANKS] = {
@@ -489,8 +489,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
489 int i, err = 0; 489 int i, err = 0;
490 struct threshold_bank *b = NULL; 490 struct threshold_bank *b = NULL;
491 char name[32]; 491 char name[32];
492#ifdef CONFIG_SMP
492 struct cpuinfo_x86 *c = &cpu_data(cpu); 493 struct cpuinfo_x86 *c = &cpu_data(cpu);
493 494#endif
494 495
495 sprintf(name, "threshold_bank%i", bank); 496 sprintf(name, "threshold_bank%i", bank);
496 497
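The first mce_amd.c hunk above switches threshold_banks from an array of per-CPU pointers to a single per-CPU variable of array type, which is the form the per-CPU infrastructure expects. A minimal sketch of the same idiom with invented names:

#include <linux/percpu.h>

#define NR_SLOTS 8	/* invented, stands in for NR_BANKS */

/* One NR_SLOTS-entry pointer array per CPU, declared as a single variable: */
static DEFINE_PER_CPU(void * [NR_SLOTS], example_slots);

static void example_set_slot(int i, void *p)
{
	/* Caller is expected to have preemption disabled. */
	__get_cpu_var(example_slots)[i] = p;
}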
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0f7a32..7c785634af2b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/sched.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/processor.h> 13#include <asm/processor.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
@@ -90,7 +91,7 @@ static void cmci_discover(int banks, int boot)
90 if (test_bit(i, owned)) 91 if (test_bit(i, owned))
91 continue; 92 continue;
92 93
93 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 94 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
94 95
95 /* Already owned by someone else? */ 96 /* Already owned by someone else? */
96 if (val & CMCI_EN) { 97 if (val & CMCI_EN) {
@@ -101,8 +102,8 @@ static void cmci_discover(int banks, int boot)
101 } 102 }
102 103
103 val |= CMCI_EN | CMCI_THRESHOLD; 104 val |= CMCI_EN | CMCI_THRESHOLD;
104 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 105 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
105 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 106 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
106 107
107 /* Did the enable bit stick? -- the bank supports CMCI */ 108 /* Did the enable bit stick? -- the bank supports CMCI */
108 if (val & CMCI_EN) { 109 if (val & CMCI_EN) {
@@ -152,9 +153,9 @@ void cmci_clear(void)
152 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 153 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
153 continue; 154 continue;
154 /* Disable CMCI */ 155 /* Disable CMCI */
155 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 156 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 157 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
157 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 158 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
158 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 159 __clear_bit(i, __get_cpu_var(mce_banks_owned));
159 } 160 }
160 spin_unlock_irqrestore(&cmci_discover_lock, flags); 161 spin_unlock_irqrestore(&cmci_discover_lock, flags);
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
deleted file mode 100644
index f5f2d6f71fb6..000000000000
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ /dev/null
@@ -1,94 +0,0 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9#include <linux/interrupt.h>
10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17
18#include <asm/processor.h>
19#include <asm/system.h>
20#include <asm/mce.h>
21#include <asm/msr.h>
22
23static int firstbank;
24
25#define MCE_RATE (15*HZ) /* timer rate is 15s */
26
27static void mce_checkregs(void *info)
28{
29 u32 low, high;
30 int i;
31
32 for (i = firstbank; i < nr_mce_banks; i++) {
33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
34
35 if (!(high & (1<<31)))
36 continue;
37
38 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
39 "correctable incident occurred on CPU %d.\n",
40 smp_processor_id());
41
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
43
44 /*
45 * Scrub the error so we don't pick it up in MCE_RATE
46 * seconds time:
47 */
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
49
50 /* Serialize: */
51 wmb();
52 add_taint(TAINT_MACHINE_CHECK);
53 }
54}
55
56static void mce_work_fn(struct work_struct *work);
57static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
58
59static void mce_work_fn(struct work_struct *work)
60{
61 on_each_cpu(mce_checkregs, NULL, 1);
62 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
63}
64
65static int __init init_nonfatal_mce_checker(void)
66{
67 struct cpuinfo_x86 *c = &boot_cpu_data;
68
69 /* Check for MCE support */
70 if (!cpu_has(c, X86_FEATURE_MCE))
71 return -ENODEV;
72
73 /* Check for PPro style MCA */
74 if (!cpu_has(c, X86_FEATURE_MCA))
75 return -ENODEV;
76
77 /* Some Athlons misbehave when we frob bank 0 */
78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
79 boot_cpu_data.x86 == 6)
80 firstbank = 1;
81 else
82 firstbank = 0;
83
84 /*
85 * Check for non-fatal errors every MCE_RATE s
86 */
87 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89
90 return 0;
91}
92module_init(init_nonfatal_mce_checker);
93
94MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
deleted file mode 100644
index 4482aea9aa2e..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ /dev/null
@@ -1,163 +0,0 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/smp.h>
8
9#include <asm/processor.h>
10#include <asm/mce.h>
11#include <asm/msr.h>
12
13/* as supported by the P4/Xeon family */
14struct intel_mce_extended_msrs {
15 u32 eax;
16 u32 ebx;
17 u32 ecx;
18 u32 edx;
19 u32 esi;
20 u32 edi;
21 u32 ebp;
22 u32 esp;
23 u32 eflags;
24 u32 eip;
25 /* u32 *reserved[]; */
26};
27
28static int mce_num_extended_msrs;
29
30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
32{
33 u32 h;
34
35 rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
36 rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
37 rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
38 rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
39 rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
40 rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
41 rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
42 rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
43 rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
44 rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
45}
46
47static void intel_machine_check(struct pt_regs *regs, long error_code)
48{
49 u32 alow, ahigh, high, low;
50 u32 mcgstl, mcgsth;
51 int recover = 1;
52 int i;
53
54 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
55 if (mcgstl & (1<<0)) /* Recoverable ? */
56 recover = 0;
57
58 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
59 smp_processor_id(), mcgsth, mcgstl);
60
61 if (mce_num_extended_msrs > 0) {
62 struct intel_mce_extended_msrs dbg;
63
64 intel_get_extended_msrs(&dbg);
65
66 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
67 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
68 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
69 smp_processor_id(), dbg.eip, dbg.eflags,
70 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
71 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
72 }
73
74 for (i = 0; i < nr_mce_banks; i++) {
75 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
76 if (high & (1<<31)) {
77 char misc[20];
78 char addr[24];
79
80 misc[0] = addr[0] = '\0';
81 if (high & (1<<29))
82 recover |= 1;
83 if (high & (1<<25))
84 recover |= 2;
85 high &= ~(1<<31);
86 if (high & (1<<27)) {
87 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
88 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
89 }
90 if (high & (1<<26)) {
91 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
92 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
93 }
94 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
95 smp_processor_id(), i, high, low, misc, addr);
96 }
97 }
98
99 if (recover & 2)
100 panic("CPU context corrupt");
101 if (recover & 1)
102 panic("Unable to continue");
103
104 printk(KERN_EMERG "Attempting to continue.\n");
105
106 /*
107 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
108 * recoverable/continuable.This will allow BIOS to look at the MSRs
109 * for errors if the OS could not log the error.
110 */
111 for (i = 0; i < nr_mce_banks; i++) {
112 u32 msr;
113 msr = MSR_IA32_MC0_STATUS+i*4;
114 rdmsr(msr, low, high);
115 if (high&(1<<31)) {
116 /* Clear it */
117 wrmsr(msr, 0UL, 0UL);
118 /* Serialize */
119 wmb();
120 add_taint(TAINT_MACHINE_CHECK);
121 }
122 }
123 mcgstl &= ~(1<<2);
124 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
125}
126
127void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
128{
129 u32 l, h;
130 int i;
131
132 machine_check_vector = intel_machine_check;
133 wmb();
134
135 printk(KERN_INFO "Intel machine check architecture supported.\n");
136 rdmsr(MSR_IA32_MCG_CAP, l, h);
137 if (l & (1<<8)) /* Control register present ? */
138 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
139 nr_mce_banks = l & 0xff;
140
141 for (i = 0; i < nr_mce_banks; i++) {
142 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
143 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
144 }
145
146 set_in_cr4(X86_CR4_MCE);
147 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
148 smp_processor_id());
149
150 /* Check for P4/Xeon extended MCE MSRs */
151 rdmsr(MSR_IA32_MCG_CAP, l, h);
152 if (l & (1<<9)) {/* MCG_EXT_P */
153 mce_num_extended_msrs = (l >> 16) & 0xff;
154 printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
155 " available\n",
156 smp_processor_id(), mce_num_extended_msrs);
157
158#ifdef CONFIG_X86_MCE_P4THERMAL
159 /* Check for P4/Xeon Thermal monitor */
160 intel_init_thermal(c);
161#endif
162 }
163}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
deleted file mode 100644
index 01e4f8178183..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For PII/PIII */
17static void intel_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 0; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57 }
58 }
59
60 if (recover & 2)
61 panic("CPU context corrupt");
62 if (recover & 1)
63 panic("Unable to continue");
64
65 printk(KERN_EMERG "Attempting to continue.\n");
66 /*
67 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
68 * recoverable/continuable.This will allow BIOS to look at the MSRs
69 * for errors if the OS could not log the error:
70 */
71 for (i = 0; i < nr_mce_banks; i++) {
72 unsigned int msr;
73
74 msr = MSR_IA32_MC0_STATUS+i*4;
75 rdmsr(msr, low, high);
76 if (high & (1<<31)) {
77 /* Clear it: */
78 wrmsr(msr, 0UL, 0UL);
79 /* Serialize: */
80 wmb();
81 add_taint(TAINT_MACHINE_CHECK);
82 }
83 }
84 mcgstl &= ~(1<<2);
85 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
86}
87
88/* Set up machine check reporting for processors with Intel style MCE: */
89void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
90{
91 u32 l, h;
92 int i;
93
94 /* Check for MCE support */
95 if (!cpu_has(c, X86_FEATURE_MCE))
96 return;
97
98 /* Check for PPro style MCA */
99 if (!cpu_has(c, X86_FEATURE_MCA))
100 return;
101
102 /* Ok machine check is available */
103 machine_check_vector = intel_machine_check;
104 /* Make sure the vector pointer is visible before we enable MCEs: */
105 wmb();
106
107 printk(KERN_INFO "Intel machine check architecture supported.\n");
108 rdmsr(MSR_IA32_MCG_CAP, l, h);
109 if (l & (1<<8)) /* Control register present ? */
110 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
111 nr_mce_banks = l & 0xff;
112
113 /*
114 * Following the example in IA-32 SDM Vol 3:
115 * - MC0_CTL should not be written
116 * - Status registers on all banks should be cleared on reset
117 */
118 for (i = 1; i < nr_mce_banks; i++)
119 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
120
121 for (i = 0; i < nr_mce_banks; i++)
122 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
123
124 set_in_cr4(X86_CR4_MCE);
125 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
126 smp_processor_id());
127}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 5957a93e5173..b3a1dba75330 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,20 +34,31 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37/*
38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38 * Current thermal throttling state:
39static DEFINE_PER_CPU(bool, thermal_throttle_active); 39 */
40struct thermal_state {
41 bool is_throttled;
42
43 u64 next_check;
44 unsigned long throttle_count;
45 unsigned long last_throttle_count;
46};
47
48static DEFINE_PER_CPU(struct thermal_state, thermal_state);
40 49
41static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
42 51
43#ifdef CONFIG_SYSFS 52#ifdef CONFIG_SYSFS
44#define define_therm_throt_sysdev_one_ro(_name) \ 53#define define_therm_throt_sysdev_one_ro(_name) \
45 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
46 55
47#define define_therm_throt_sysdev_show_func(name) \ 56#define define_therm_throt_sysdev_show_func(name) \
48static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 57 \
49 struct sysdev_attribute *attr, \ 58static ssize_t therm_throt_sysdev_show_##name( \
50 char *buf) \ 59 struct sys_device *dev, \
60 struct sysdev_attribute *attr, \
61 char *buf) \
51{ \ 62{ \
52 unsigned int cpu = dev->id; \ 63 unsigned int cpu = dev->id; \
53 ssize_t ret; \ 64 ssize_t ret; \
@@ -55,7 +66,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
55 preempt_disable(); /* CPU hotplug */ \ 66 preempt_disable(); /* CPU hotplug */ \
56 if (cpu_online(cpu)) \ 67 if (cpu_online(cpu)) \
57 ret = sprintf(buf, "%lu\n", \ 68 ret = sprintf(buf, "%lu\n", \
58 per_cpu(thermal_throttle_##name, cpu)); \ 69 per_cpu(thermal_state, cpu).name); \
59 else \ 70 else \
60 ret = 0; \ 71 ret = 0; \
61 preempt_enable(); \ 72 preempt_enable(); \
@@ -63,11 +74,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
63 return ret; \ 74 return ret; \
64} 75}
65 76
66define_therm_throt_sysdev_show_func(count); 77define_therm_throt_sysdev_show_func(throttle_count);
67define_therm_throt_sysdev_one_ro(count); 78define_therm_throt_sysdev_one_ro(throttle_count);
68 79
69static struct attribute *thermal_throttle_attrs[] = { 80static struct attribute *thermal_throttle_attrs[] = {
70 &attr_count.attr, 81 &attr_throttle_count.attr,
71 NULL 82 NULL
72}; 83};
73 84
@@ -93,33 +104,39 @@ static struct attribute_group thermal_throttle_attr_group = {
93 * 1 : Event should be logged further, and a message has been 104 * 1 : Event should be logged further, and a message has been
94 * printed to the syslog. 105 * printed to the syslog.
95 */ 106 */
96static int therm_throt_process(int curr) 107static int therm_throt_process(bool is_throttled)
97{ 108{
98 unsigned int cpu = smp_processor_id(); 109 struct thermal_state *state;
99 __u64 tmp_jiffs = get_jiffies_64(); 110 unsigned int this_cpu;
100 bool was_throttled = __get_cpu_var(thermal_throttle_active); 111 bool was_throttled;
101 bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; 112 u64 now;
113
114 this_cpu = smp_processor_id();
115 now = get_jiffies_64();
116 state = &per_cpu(thermal_state, this_cpu);
117
118 was_throttled = state->is_throttled;
119 state->is_throttled = is_throttled;
102 120
103 if (is_throttled) 121 if (is_throttled)
104 __get_cpu_var(thermal_throttle_count)++; 122 state->throttle_count++;
105 123
106 if (!(was_throttled ^ is_throttled) && 124 if (time_before64(now, state->next_check) &&
107 time_before64(tmp_jiffs, __get_cpu_var(next_check))) 125 state->throttle_count != state->last_throttle_count)
108 return 0; 126 return 0;
109 127
110 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; 128 state->next_check = now + CHECK_INTERVAL;
129 state->last_throttle_count = state->throttle_count;
111 130
112 /* if we just entered the thermal event */ 131 /* if we just entered the thermal event */
113 if (is_throttled) { 132 if (is_throttled) {
114 printk(KERN_CRIT "CPU%d: Temperature above threshold, " 133 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
115 "cpu clock throttled (total events = %lu)\n",
116 cpu, __get_cpu_var(thermal_throttle_count));
117 134
118 add_taint(TAINT_MACHINE_CHECK); 135 add_taint(TAINT_MACHINE_CHECK);
119 return 1; 136 return 1;
120 } 137 }
121 if (was_throttled) { 138 if (was_throttled) {
122 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); 139 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
123 return 1; 140 return 1;
124 } 141 }
125 142
@@ -213,7 +230,7 @@ static void intel_thermal_interrupt(void)
213 __u64 msr_val; 230 __u64 msr_val;
214 231
215 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 232 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
216 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) 233 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
217 mce_log_therm_throt_event(msr_val); 234 mce_log_therm_throt_event(msr_val);
218} 235}
219 236
@@ -260,9 +277,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
260 return; 277 return;
261 } 278 }
262 279
263 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
264 tm2 = 1;
265
266 /* Check whether a vector already exists */ 280 /* Check whether a vector already exists */
267 if (h & APIC_VECTOR_MASK) { 281 if (h & APIC_VECTOR_MASK) {
268 printk(KERN_DEBUG 282 printk(KERN_DEBUG
@@ -271,6 +285,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
271 return; 285 return;
272 } 286 }
273 287
 288 /* early Pentium M models use a different method for enabling TM2 */

289 if (cpu_has(c, X86_FEATURE_TM2)) {
290 if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
291 rdmsr(MSR_THERM2_CTL, l, h);
292 if (l & MSR_THERM2_CTL_TM_SELECT)
293 tm2 = 1;
294 } else if (l & MSR_IA32_MISC_ENABLE_TM2)
295 tm2 = 1;
296 }
297
274 /* We'll mask the thermal vector in the lapic till we're ready: */ 298 /* We'll mask the thermal vector in the lapic till we're ready: */
275 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; 299 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
276 apic_write(APIC_LVTTHMR, h); 300 apic_write(APIC_LVTTHMR, h);
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 08b6ea4c62b4..3c1b12d461d1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -96,17 +96,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
96 unsigned long long base, size; 96 unsigned long long base, size;
97 char *ptr; 97 char *ptr;
98 char line[LINE_SIZE]; 98 char line[LINE_SIZE];
99 int length;
99 size_t linelen; 100 size_t linelen;
100 101
101 if (!capable(CAP_SYS_ADMIN)) 102 if (!capable(CAP_SYS_ADMIN))
102 return -EPERM; 103 return -EPERM;
103 if (!len)
104 return -EINVAL;
105 104
106 memset(line, 0, LINE_SIZE); 105 memset(line, 0, LINE_SIZE);
107 if (len > LINE_SIZE) 106
108 len = LINE_SIZE; 107 length = len;
109 if (copy_from_user(line, buf, len - 1)) 108 length--;
109
110 if (length > LINE_SIZE - 1)
111 length = LINE_SIZE - 1;
112
113 if (length < 0)
114 return -EINVAL;
115
116 if (copy_from_user(line, buf, length))
110 return -EFAULT; 117 return -EFAULT;
111 118
112 linelen = strlen(line); 119 linelen = strlen(line);
@@ -126,8 +133,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
126 return -EINVAL; 133 return -EINVAL;
127 134
128 base = simple_strtoull(line + 5, &ptr, 0); 135 base = simple_strtoull(line + 5, &ptr, 0);
129 for (; isspace(*ptr); ++ptr) 136 while (isspace(*ptr))
130 ; 137 ptr++;
131 138
132 if (strncmp(ptr, "size=", 5)) 139 if (strncmp(ptr, "size=", 5))
133 return -EINVAL; 140 return -EINVAL;
@@ -135,14 +142,14 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
135 size = simple_strtoull(ptr + 5, &ptr, 0); 142 size = simple_strtoull(ptr + 5, &ptr, 0);
136 if ((base & 0xfff) || (size & 0xfff)) 143 if ((base & 0xfff) || (size & 0xfff))
137 return -EINVAL; 144 return -EINVAL;
138 for (; isspace(*ptr); ++ptr) 145 while (isspace(*ptr))
139 ; 146 ptr++;
140 147
141 if (strncmp(ptr, "type=", 5)) 148 if (strncmp(ptr, "type=", 5))
142 return -EINVAL; 149 return -EINVAL;
143 ptr += 5; 150 ptr += 5;
144 for (; isspace(*ptr); ++ptr) 151 while (isspace(*ptr))
145 ; 152 ptr++;
146 153
147 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 154 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
148 if (strcmp(ptr, mtrr_strings[i])) 155 if (strcmp(ptr, mtrr_strings[i]))
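The mtrr/if.c changes above tighten how mtrr_write() clamps the user-supplied length and skips whitespace while parsing lines of the form "base=<hex> size=<hex> type=<name>" written to /proc/mtrr. For reference, an illustrative userspace caller of that interface; the address, size and caching type are placeholders, and per the checks above both base and size must be 4 KiB aligned while the type must match one of the kernel's MTRR type names.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Placeholder region; adjust base/size/type for the real mapping. */
	const char *req = "base=0xd0000000 size=0x1000000 type=write-combining\n";
	int fd = open("/proc/mtrr", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/mtrr");
		return 1;
	}
	if (write(fd, req, strlen(req)) != (ssize_t)strlen(req))
		perror("write /proc/mtrr");
	close(fd);
	return 0;
}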
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 7af0f88a4163..84e83de54575 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
58static DEFINE_MUTEX(mtrr_mutex); 58static DEFINE_MUTEX(mtrr_mutex);
59 59
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init;
61 62
62static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
63 64
@@ -163,7 +164,10 @@ static void ipi_handler(void *info)
163 if (data->smp_reg != ~0U) { 164 if (data->smp_reg != ~0U) {
164 mtrr_if->set(data->smp_reg, data->smp_base, 165 mtrr_if->set(data->smp_reg, data->smp_base,
165 data->smp_size, data->smp_type); 166 data->smp_size, data->smp_type);
166 } else { 167 } else if (mtrr_aps_delayed_init) {
168 /*
 169 * Initialize the MTRRs in addition to the synchronisation.
170 */
167 mtrr_if->set_all(); 171 mtrr_if->set_all();
168 } 172 }
169 173
@@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
265 */ 269 */
266 if (reg != ~0U) 270 if (reg != ~0U)
267 mtrr_if->set(reg, base, size, type); 271 mtrr_if->set(reg, base, size, type);
272 else if (!mtrr_aps_delayed_init)
273 mtrr_if->set_all();
268 274
269 /* Wait for the others */ 275 /* Wait for the others */
270 while (atomic_read(&data.count)) 276 while (atomic_read(&data.count))
@@ -721,9 +727,7 @@ void __init mtrr_bp_init(void)
721 727
722void mtrr_ap_init(void) 728void mtrr_ap_init(void)
723{ 729{
724 unsigned long flags; 730 if (!use_intel() || mtrr_aps_delayed_init)
725
726 if (!mtrr_if || !use_intel())
727 return; 731 return;
728 /* 732 /*
729 * Ideally we should hold mtrr_mutex here to avoid mtrr entries 733 * Ideally we should hold mtrr_mutex here to avoid mtrr entries
@@ -738,11 +742,7 @@ void mtrr_ap_init(void)
738 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 742 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
739 * lock to prevent mtrr entry changes 743 * lock to prevent mtrr entry changes
740 */ 744 */
741 local_irq_save(flags); 745 set_mtrr(~0U, 0, 0, 0);
742
743 mtrr_if->set_all();
744
745 local_irq_restore(flags);
746} 746}
747 747
748/** 748/**
@@ -753,6 +753,34 @@ void mtrr_save_state(void)
753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); 753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
754} 754}
755 755
756void set_mtrr_aps_delayed_init(void)
757{
758 if (!use_intel())
759 return;
760
761 mtrr_aps_delayed_init = true;
762}
763
764/*
765 * MTRR initialization for all APs
766 */
767void mtrr_aps_init(void)
768{
769 if (!use_intel())
770 return;
771
772 set_mtrr(~0U, 0, 0, 0);
773 mtrr_aps_delayed_init = false;
774}
775
776void mtrr_bp_restore(void)
777{
778 if (!use_intel())
779 return;
780
781 mtrr_if->set_all();
782}
783
756static int __init mtrr_init_finialize(void) 784static int __init mtrr_init_finialize(void)
757{ 785{
758 if (!mtrr_if) 786 if (!mtrr_if)
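The mtrr/main.c changes gate per-AP MTRR programming behind mtrr_aps_delayed_init and add three hooks: set_mtrr_aps_delayed_init(), mtrr_aps_init() and mtrr_bp_restore(). Their call sites are outside this excerpt; from the names and the use_intel() checks, the intended ordering is presumably to mark the delay before onlining APs, run one set_mtrr() rendezvous once they are all up, and reprogram the boot CPU on resume. A sketch under those assumptions follows; the surrounding function names are invented, only the three mtrr_* calls come from this patch.

/* Boot path, before the application processors are started: */
void __init example_smp_prepare_cpus(unsigned int max_cpus)
{
	set_mtrr_aps_delayed_init();	/* mtrr_ap_init() becomes a no-op */
	/* ... bring the APs online ... */
}

/* Once every AP is online: */
void __init example_smp_cpus_done(unsigned int max_cpus)
{
	mtrr_aps_init();		/* one rendezvous programs all CPUs */
}

/* Resume path on the boot CPU: */
static void example_resume_bsp(void)
{
	mtrr_bp_restore();		/* reprogram the BP's MTRRs directly */
}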
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c
index f9cd0849bd42..2e20bca3cca1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Performance counter x86 architecture code 2 * Performance events x86 architecture code
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
@@ -11,7 +11,7 @@
11 * For licencing details see kernel-base/COPYING 11 * For licencing details see kernel-base/COPYING
12 */ 12 */
13 13
14#include <linux/perf_counter.h> 14#include <linux/perf_event.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/notifier.h> 16#include <linux/notifier.h>
17#include <linux/hardirq.h> 17#include <linux/hardirq.h>
@@ -27,19 +27,19 @@
27#include <asm/stacktrace.h> 27#include <asm/stacktrace.h>
28#include <asm/nmi.h> 28#include <asm/nmi.h>
29 29
30static u64 perf_counter_mask __read_mostly; 30static u64 perf_event_mask __read_mostly;
31 31
32/* The maximal number of PEBS counters: */ 32/* The maximal number of PEBS events: */
33#define MAX_PEBS_COUNTERS 4 33#define MAX_PEBS_EVENTS 4
34 34
35/* The size of a BTS record in bytes: */ 35/* The size of a BTS record in bytes: */
36#define BTS_RECORD_SIZE 24 36#define BTS_RECORD_SIZE 24
37 37
38/* The size of a per-cpu BTS buffer in bytes: */ 38/* The size of a per-cpu BTS buffer in bytes: */
39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) 39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
40 40
41/* The BTS overflow threshold in bytes from the end of the buffer: */ 41/* The BTS overflow threshold in bytes from the end of the buffer: */
42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) 42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
43 43
44 44
45/* 45/*
@@ -65,11 +65,11 @@ struct debug_store {
65 u64 pebs_index; 65 u64 pebs_index;
66 u64 pebs_absolute_maximum; 66 u64 pebs_absolute_maximum;
67 u64 pebs_interrupt_threshold; 67 u64 pebs_interrupt_threshold;
68 u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; 68 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 69};
70 70
71struct cpu_hw_counters { 71struct cpu_hw_events {
72 struct perf_counter *counters[X86_PMC_IDX_MAX]; 72 struct perf_event *events[X86_PMC_IDX_MAX];
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 75 unsigned long interrupts;
@@ -77,6 +77,18 @@ struct cpu_hw_counters {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
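A self-contained sketch of how a table built from these new macros is declared and walked (the event codes below are made up; the real tables follow later in this file):

    #include <stdio.h>

    #define X86_PMC_IDX_MAX  64
    #define BITS_TO_LONGS(n) (((n) + 8 * sizeof(long) - 1) / (8 * sizeof(long)))

    struct event_constraint {
            unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
            int             code;
    };

    #define EVENT_CONSTRAINT(c, m)  { .code = (c), .idxmsk[0] = (m) }
    #define EVENT_CONSTRAINT_END    { .code = 0, .idxmsk[0] = 0 }

    #define for_each_event_constraint(e, c) \
            for ((e) = (c); (e)->idxmsk[0]; (e)++)

    /* Example table only -- the codes and masks are arbitrary. */
    static const struct event_constraint example_constraints[] = {
            EVENT_CONSTRAINT(0x12, 0x2),    /* this event may only use counter 1 */
            EVENT_CONSTRAINT(0x40, 0x3),    /* this event may use counter 0 or 1 */
            EVENT_CONSTRAINT_END            /* empty idxmsk[0] ends the walk     */
    };

    int main(void)
    {
            const struct event_constraint *c;

            for_each_event_constraint(c, example_constraints)
                    printf("event 0x%x -> allowed counter mask 0x%lx\n",
                           c->code, c->idxmsk[0]);
            return 0;
    }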
@@ -86,30 +98,34 @@ struct x86_pmu {
86 int (*handle_irq)(struct pt_regs *); 98 int (*handle_irq)(struct pt_regs *);
87 void (*disable_all)(void); 99 void (*disable_all)(void);
88 void (*enable_all)(void); 100 void (*enable_all)(void);
89 void (*enable)(struct hw_perf_counter *, int); 101 void (*enable)(struct hw_perf_event *, int);
90 void (*disable)(struct hw_perf_counter *, int); 102 void (*disable)(struct hw_perf_event *, int);
91 unsigned eventsel; 103 unsigned eventsel;
92 unsigned perfctr; 104 unsigned perfctr;
93 u64 (*event_map)(int); 105 u64 (*event_map)(int);
94 u64 (*raw_event)(u64); 106 u64 (*raw_event)(u64);
95 int max_events; 107 int max_events;
96 int num_counters; 108 int num_events;
97 int num_counters_fixed; 109 int num_events_fixed;
98 int counter_bits; 110 int event_bits;
99 u64 counter_mask; 111 u64 event_mask;
100 int apic; 112 int apic;
101 u64 max_period; 113 u64 max_period;
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
108 122
109static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { 123static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -124,37 +140,47 @@ static const u64 p6_perfmon_event_map[] =
124 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, 140 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
125}; 141};
126 142
127static u64 p6_pmu_event_map(int event) 143static u64 p6_pmu_event_map(int hw_event)
128{ 144{
129 return p6_perfmon_event_map[event]; 145 return p6_perfmon_event_map[hw_event];
130} 146}
131 147
132/* 148/*
133 * Counter setting that is specified not to count anything. 149 * Event setting that is specified not to count anything.
134 * We use this to effectively disable a counter. 150 * We use this to effectively disable a counter.
135 * 151 *
136 * L2_RQSTS with 0 MESI unit mask. 152 * L2_RQSTS with 0 MESI unit mask.
137 */ 153 */
138#define P6_NOP_COUNTER 0x0000002EULL 154#define P6_NOP_EVENT 0x0000002EULL
139 155
140static u64 p6_pmu_raw_event(u64 event) 156static u64 p6_pmu_raw_event(u64 hw_event)
141{ 157{
142#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL 158#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
143#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL 159#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
144#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL 160#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
145#define P6_EVNTSEL_INV_MASK 0x00800000ULL 161#define P6_EVNTSEL_INV_MASK 0x00800000ULL
146#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL 162#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
147 163
148#define P6_EVNTSEL_MASK \ 164#define P6_EVNTSEL_MASK \
149 (P6_EVNTSEL_EVENT_MASK | \ 165 (P6_EVNTSEL_EVENT_MASK | \
150 P6_EVNTSEL_UNIT_MASK | \ 166 P6_EVNTSEL_UNIT_MASK | \
151 P6_EVNTSEL_EDGE_MASK | \ 167 P6_EVNTSEL_EDGE_MASK | \
152 P6_EVNTSEL_INV_MASK | \ 168 P6_EVNTSEL_INV_MASK | \
153 P6_EVNTSEL_COUNTER_MASK) 169 P6_EVNTSEL_REG_MASK)
154 170
155 return event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,16 +196,45 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
173static u64 intel_pmu_event_map(int event) 199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[event]; 230 return intel_perfmon_event_map[hw_event];
176} 231}
177 232
178/* 233/*
179 * Generalized hw caching related event table, filled 234 * Generalized hw caching related hw_event table, filled
180 * in on a per model basis. A value of 0 means 235 * in on a per model basis. A value of 0 means
181 * 'not supported', -1 means 'event makes no sense on 236 * 'not supported', -1 means 'hw_event makes no sense on
182 * this CPU', any other value means the raw event 237 * this CPU', any other value means the raw hw_event
183 * ID. 238 * ID.
184 */ 239 */
185 240
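Reading the masks in the constraint tables above as bitmaps of allowed generic counter indices (bit 0 = PMC0, bit 1 = PMC1), placement of a constrained event reduces to a first-free-allowed-bit search, which is what intel_get_event_idx() further down does with test_and_set_bit(); a standalone illustration:

    #include <stdio.h>

    /* Pick the lowest allowed-and-free generic counter, -1 if none.
     * 'allowed' plays the role of idxmsk[0], 'used' of cpuc->used_mask. */
    static int pick_counter(unsigned long allowed, unsigned long used)
    {
            int i;

            for (i = 0; i < 2; i++)         /* assume two generic counters */
                    if ((allowed & (1UL << i)) && !(used & (1UL << i)))
                            return i;
            return -1;
    }

    int main(void)
    {
            printf("%d\n", pick_counter(0x2, 0x2)); /* counter-1-only event, PMC1 busy -> -1 */
            printf("%d\n", pick_counter(0x3, 0x2)); /* either counter allowed, PMC1 busy -> 0 */
            return 0;
    }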
@@ -463,22 +518,22 @@ static const u64 atom_hw_cache_event_ids
463 }, 518 },
464}; 519};
465 520
466static u64 intel_pmu_raw_event(u64 event) 521static u64 intel_pmu_raw_event(u64 hw_event)
467{ 522{
468#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL 523#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
476 CORE_EVNTSEL_UNIT_MASK | \ 531 CORE_EVNTSEL_UNIT_MASK | \
477 CORE_EVNTSEL_EDGE_MASK | \ 532 CORE_EVNTSEL_EDGE_MASK | \
478 CORE_EVNTSEL_INV_MASK | \ 533 CORE_EVNTSEL_INV_MASK | \
479 CORE_EVNTSEL_COUNTER_MASK) 534 CORE_EVNTSEL_REG_MASK)
480 535
481 return event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static const u64 amd_hw_cache_event_ids
@@ -585,39 +640,39 @@ static const u64 amd_perfmon_event_map[] =
585 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 640 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
586}; 641};
587 642
588static u64 amd_pmu_event_map(int event) 643static u64 amd_pmu_event_map(int hw_event)
589{ 644{
590 return amd_perfmon_event_map[event]; 645 return amd_perfmon_event_map[hw_event];
591} 646}
592 647
593static u64 amd_pmu_raw_event(u64 event) 648static u64 amd_pmu_raw_event(u64 hw_event)
594{ 649{
595#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL 650#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
596#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 651#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
597#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 652#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
598#define K7_EVNTSEL_INV_MASK 0x000800000ULL 653#define K7_EVNTSEL_INV_MASK 0x000800000ULL
599#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL 654#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
600 655
601#define K7_EVNTSEL_MASK \ 656#define K7_EVNTSEL_MASK \
602 (K7_EVNTSEL_EVENT_MASK | \ 657 (K7_EVNTSEL_EVENT_MASK | \
603 K7_EVNTSEL_UNIT_MASK | \ 658 K7_EVNTSEL_UNIT_MASK | \
604 K7_EVNTSEL_EDGE_MASK | \ 659 K7_EVNTSEL_EDGE_MASK | \
605 K7_EVNTSEL_INV_MASK | \ 660 K7_EVNTSEL_INV_MASK | \
606 K7_EVNTSEL_COUNTER_MASK) 661 K7_EVNTSEL_REG_MASK)
607 662
608 return event & K7_EVNTSEL_MASK; 663 return hw_event & K7_EVNTSEL_MASK;
609} 664}
610 665
611/* 666/*
612 * Propagate counter elapsed time into the generic counter. 667 * Propagate event elapsed time into the generic event.
613 * Can only be executed on the CPU where the counter is active. 668 * Can only be executed on the CPU where the event is active.
614 * Returns the delta events processed. 669 * Returns the delta events processed.
615 */ 670 */
616static u64 671static u64
617x86_perf_counter_update(struct perf_counter *counter, 672x86_perf_event_update(struct perf_event *event,
618 struct hw_perf_counter *hwc, int idx) 673 struct hw_perf_event *hwc, int idx)
619{ 674{
620 int shift = 64 - x86_pmu.counter_bits; 675 int shift = 64 - x86_pmu.event_bits;
621 u64 prev_raw_count, new_raw_count; 676 u64 prev_raw_count, new_raw_count;
622 s64 delta; 677 s64 delta;
623 678
@@ -625,15 +680,15 @@ x86_perf_counter_update(struct perf_counter *counter,
625 return 0; 680 return 0;
626 681
627 /* 682 /*
628 * Careful: an NMI might modify the previous counter value. 683 * Careful: an NMI might modify the previous event value.
629 * 684 *
630 * Our tactic to handle this is to first atomically read and 685 * Our tactic to handle this is to first atomically read and
631 * exchange a new raw count - then add that new-prev delta 686 * exchange a new raw count - then add that new-prev delta
632 * count to the generic counter atomically: 687 * count to the generic event atomically:
633 */ 688 */
634again: 689again:
635 prev_raw_count = atomic64_read(&hwc->prev_count); 690 prev_raw_count = atomic64_read(&hwc->prev_count);
636 rdmsrl(hwc->counter_base + idx, new_raw_count); 691 rdmsrl(hwc->event_base + idx, new_raw_count);
637 692
638 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 693 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
639 new_raw_count) != prev_raw_count) 694 new_raw_count) != prev_raw_count)
@@ -642,7 +697,7 @@ again:
642 /* 697 /*
643 * Now we have the new raw value and have updated the prev 698 * Now we have the new raw value and have updated the prev
644 * timestamp already. We can now calculate the elapsed delta 699 * timestamp already. We can now calculate the elapsed delta
645 * (counter-)time and add that to the generic counter. 700 * (event-)time and add that to the generic event.
646 * 701 *
647 * Careful, not all hw sign-extends above the physical width 702 * Careful, not all hw sign-extends above the physical width
648 * of the count. 703 * of the count.
@@ -650,13 +705,13 @@ again:
650 delta = (new_raw_count << shift) - (prev_raw_count << shift); 705 delta = (new_raw_count << shift) - (prev_raw_count << shift);
651 delta >>= shift; 706 delta >>= shift;
652 707
653 atomic64_add(delta, &counter->count); 708 atomic64_add(delta, &event->count);
654 atomic64_sub(delta, &hwc->period_left); 709 atomic64_sub(delta, &hwc->period_left);
655 710
656 return new_raw_count; 711 return new_raw_count;
657} 712}
658 713
659static atomic_t active_counters; 714static atomic_t active_events;
660static DEFINE_MUTEX(pmc_reserve_mutex); 715static DEFINE_MUTEX(pmc_reserve_mutex);
661 716
662static bool reserve_pmc_hardware(void) 717static bool reserve_pmc_hardware(void)
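The shift pair in x86_perf_event_update() is what sign-extends deltas from a counter narrower than 64 bits; a standalone example using the 32-bit effective width the P6 PMU declares later in this file (values invented):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int      event_bits = 32;               /* p6_pmu.event_bits    */
            int      shift      = 64 - event_bits;
            uint64_t prev       = 0xfffffff0;       /* close to wrap-around */
            uint64_t new        = 0x00000010;       /* counter has wrapped  */

            /* Same idiom as x86_perf_event_update(): shift both raw values
             * to the top of the 64-bit word, subtract, then arithmetic-shift
             * back down so the delta comes out correctly sign-extended. */
            int64_t delta = (new << shift) - (prev << shift);
            delta >>= shift;

            printf("delta = %lld\n", (long long)delta);     /* 32, not a bogus huge value */
            return 0;
    }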
@@ -667,12 +722,12 @@ static bool reserve_pmc_hardware(void)
667 if (nmi_watchdog == NMI_LOCAL_APIC) 722 if (nmi_watchdog == NMI_LOCAL_APIC)
668 disable_lapic_nmi_watchdog(); 723 disable_lapic_nmi_watchdog();
669 724
670 for (i = 0; i < x86_pmu.num_counters; i++) { 725 for (i = 0; i < x86_pmu.num_events; i++) {
671 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 726 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
672 goto perfctr_fail; 727 goto perfctr_fail;
673 } 728 }
674 729
675 for (i = 0; i < x86_pmu.num_counters; i++) { 730 for (i = 0; i < x86_pmu.num_events; i++) {
676 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 731 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
677 goto eventsel_fail; 732 goto eventsel_fail;
678 } 733 }
@@ -685,7 +740,7 @@ eventsel_fail:
685 for (i--; i >= 0; i--) 740 for (i--; i >= 0; i--)
686 release_evntsel_nmi(x86_pmu.eventsel + i); 741 release_evntsel_nmi(x86_pmu.eventsel + i);
687 742
688 i = x86_pmu.num_counters; 743 i = x86_pmu.num_events;
689 744
690perfctr_fail: 745perfctr_fail:
691 for (i--; i >= 0; i--) 746 for (i--; i >= 0; i--)
@@ -703,7 +758,7 @@ static void release_pmc_hardware(void)
703#ifdef CONFIG_X86_LOCAL_APIC 758#ifdef CONFIG_X86_LOCAL_APIC
704 int i; 759 int i;
705 760
706 for (i = 0; i < x86_pmu.num_counters; i++) { 761 for (i = 0; i < x86_pmu.num_events; i++) {
707 release_perfctr_nmi(x86_pmu.perfctr + i); 762 release_perfctr_nmi(x86_pmu.perfctr + i);
708 release_evntsel_nmi(x86_pmu.eventsel + i); 763 release_evntsel_nmi(x86_pmu.eventsel + i);
709 } 764 }
@@ -720,7 +775,7 @@ static inline bool bts_available(void)
720 775
721static inline void init_debug_store_on_cpu(int cpu) 776static inline void init_debug_store_on_cpu(int cpu)
722{ 777{
723 struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; 778 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
724 779
725 if (!ds) 780 if (!ds)
726 return; 781 return;
@@ -732,7 +787,7 @@ static inline void init_debug_store_on_cpu(int cpu)
732 787
733static inline void fini_debug_store_on_cpu(int cpu) 788static inline void fini_debug_store_on_cpu(int cpu)
734{ 789{
735 if (!per_cpu(cpu_hw_counters, cpu).ds) 790 if (!per_cpu(cpu_hw_events, cpu).ds)
736 return; 791 return;
737 792
738 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); 793 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
@@ -751,12 +806,12 @@ static void release_bts_hardware(void)
751 fini_debug_store_on_cpu(cpu); 806 fini_debug_store_on_cpu(cpu);
752 807
753 for_each_possible_cpu(cpu) { 808 for_each_possible_cpu(cpu) {
754 struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; 809 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
755 810
756 if (!ds) 811 if (!ds)
757 continue; 812 continue;
758 813
759 per_cpu(cpu_hw_counters, cpu).ds = NULL; 814 per_cpu(cpu_hw_events, cpu).ds = NULL;
760 815
761 kfree((void *)(unsigned long)ds->bts_buffer_base); 816 kfree((void *)(unsigned long)ds->bts_buffer_base);
762 kfree(ds); 817 kfree(ds);
@@ -796,7 +851,7 @@ static int reserve_bts_hardware(void)
796 ds->bts_interrupt_threshold = 851 ds->bts_interrupt_threshold =
797 ds->bts_absolute_maximum - BTS_OVFL_TH; 852 ds->bts_absolute_maximum - BTS_OVFL_TH;
798 853
799 per_cpu(cpu_hw_counters, cpu).ds = ds; 854 per_cpu(cpu_hw_events, cpu).ds = ds;
800 err = 0; 855 err = 0;
801 } 856 }
802 857
@@ -812,9 +867,9 @@ static int reserve_bts_hardware(void)
812 return err; 867 return err;
813} 868}
814 869
815static void hw_perf_counter_destroy(struct perf_counter *counter) 870static void hw_perf_event_destroy(struct perf_event *event)
816{ 871{
817 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { 872 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
818 release_pmc_hardware(); 873 release_pmc_hardware();
819 release_bts_hardware(); 874 release_bts_hardware();
820 mutex_unlock(&pmc_reserve_mutex); 875 mutex_unlock(&pmc_reserve_mutex);
@@ -827,7 +882,7 @@ static inline int x86_pmu_initialized(void)
827} 882}
828 883
829static inline int 884static inline int
830set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) 885set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
831{ 886{
832 unsigned int cache_type, cache_op, cache_result; 887 unsigned int cache_type, cache_op, cache_result;
833 u64 config, val; 888 u64 config, val;
@@ -880,7 +935,7 @@ static void intel_pmu_enable_bts(u64 config)
880 935
881static void intel_pmu_disable_bts(void) 936static void intel_pmu_disable_bts(void)
882{ 937{
883 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 938 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 unsigned long debugctlmsr; 939 unsigned long debugctlmsr;
885 940
886 if (!cpuc->ds) 941 if (!cpuc->ds)
@@ -898,10 +953,10 @@ static void intel_pmu_disable_bts(void)
898/* 953/*
899 * Setup the hardware configuration for a given attr_type 954 * Setup the hardware configuration for a given attr_type
900 */ 955 */
901static int __hw_perf_counter_init(struct perf_counter *counter) 956static int __hw_perf_event_init(struct perf_event *event)
902{ 957{
903 struct perf_counter_attr *attr = &counter->attr; 958 struct perf_event_attr *attr = &event->attr;
904 struct hw_perf_counter *hwc = &counter->hw; 959 struct hw_perf_event *hwc = &event->hw;
905 u64 config; 960 u64 config;
906 int err; 961 int err;
907 962
@@ -909,27 +964,31 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
909 return -ENODEV; 964 return -ENODEV;
910 965
911 err = 0; 966 err = 0;
912 if (!atomic_inc_not_zero(&active_counters)) { 967 if (!atomic_inc_not_zero(&active_events)) {
913 mutex_lock(&pmc_reserve_mutex); 968 mutex_lock(&pmc_reserve_mutex);
914 if (atomic_read(&active_counters) == 0) { 969 if (atomic_read(&active_events) == 0) {
915 if (!reserve_pmc_hardware()) 970 if (!reserve_pmc_hardware())
916 err = -EBUSY; 971 err = -EBUSY;
917 else 972 else
918 err = reserve_bts_hardware(); 973 err = reserve_bts_hardware();
919 } 974 }
920 if (!err) 975 if (!err)
921 atomic_inc(&active_counters); 976 atomic_inc(&active_events);
922 mutex_unlock(&pmc_reserve_mutex); 977 mutex_unlock(&pmc_reserve_mutex);
923 } 978 }
924 if (err) 979 if (err)
925 return err; 980 return err;
926 981
982 event->destroy = hw_perf_event_destroy;
983
927 /* 984 /*
928 * Generate PMC IRQs: 985 * Generate PMC IRQs:
929 * (keep 'enabled' bit clear for now) 986 * (keep 'enabled' bit clear for now)
930 */ 987 */
931 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
932 989
990 hwc->idx = -1;
991
933 /* 992 /*
934 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
935 */ 994 */
@@ -946,17 +1005,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
946 /* 1005 /*
947 * If we have a PMU initialized but no APIC 1006 * If we have a PMU initialized but no APIC
948 * interrupts, we cannot sample hardware 1007 * interrupts, we cannot sample hardware
949 * counters (user-space has to fall back and 1008 * events (user-space has to fall back and
950 * sample via a hrtimer based software counter): 1009 * sample via a hrtimer based software event):
951 */ 1010 */
952 if (!x86_pmu.apic) 1011 if (!x86_pmu.apic)
953 return -EOPNOTSUPP; 1012 return -EOPNOTSUPP;
954 } 1013 }
955 1014
956 counter->destroy = hw_perf_counter_destroy;
957
958 /* 1015 /*
959 * Raw event type provide the config in the event structure 1016 * Raw hw_event type provide the config in the hw_event structure
960 */ 1017 */
961 if (attr->type == PERF_TYPE_RAW) { 1018 if (attr->type == PERF_TYPE_RAW) {
962 hwc->config |= x86_pmu.raw_event(attr->config); 1019 hwc->config |= x86_pmu.raw_event(attr->config);
@@ -1001,7 +1058,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
1001 1058
1002static void p6_pmu_disable_all(void) 1059static void p6_pmu_disable_all(void)
1003{ 1060{
1004 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1061 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1005 u64 val; 1062 u64 val;
1006 1063
1007 if (!cpuc->enabled) 1064 if (!cpuc->enabled)
@@ -1018,7 +1075,7 @@ static void p6_pmu_disable_all(void)
1018 1075
1019static void intel_pmu_disable_all(void) 1076static void intel_pmu_disable_all(void)
1020{ 1077{
1021 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1078 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1022 1079
1023 if (!cpuc->enabled) 1080 if (!cpuc->enabled)
1024 return; 1081 return;
@@ -1034,7 +1091,7 @@ static void intel_pmu_disable_all(void)
1034 1091
1035static void amd_pmu_disable_all(void) 1092static void amd_pmu_disable_all(void)
1036{ 1093{
1037 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1094 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1038 int idx; 1095 int idx;
1039 1096
1040 if (!cpuc->enabled) 1097 if (!cpuc->enabled)
@@ -1043,12 +1100,12 @@ static void amd_pmu_disable_all(void)
1043 cpuc->enabled = 0; 1100 cpuc->enabled = 0;
1044 /* 1101 /*
1045 * ensure we write the disable before we start disabling the 1102 * ensure we write the disable before we start disabling the
1046 * counters proper, so that amd_pmu_enable_counter() does the 1103 * events proper, so that amd_pmu_enable_event() does the
1047 * right thing. 1104 * right thing.
1048 */ 1105 */
1049 barrier(); 1106 barrier();
1050 1107
1051 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1108 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1052 u64 val; 1109 u64 val;
1053 1110
1054 if (!test_bit(idx, cpuc->active_mask)) 1111 if (!test_bit(idx, cpuc->active_mask))
@@ -1070,7 +1127,7 @@ void hw_perf_disable(void)
1070 1127
1071static void p6_pmu_enable_all(void) 1128static void p6_pmu_enable_all(void)
1072{ 1129{
1073 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1074 unsigned long val; 1131 unsigned long val;
1075 1132
1076 if (cpuc->enabled) 1133 if (cpuc->enabled)
@@ -1087,7 +1144,7 @@ static void p6_pmu_enable_all(void)
1087 1144
1088static void intel_pmu_enable_all(void) 1145static void intel_pmu_enable_all(void)
1089{ 1146{
1090 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1147 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1091 1148
1092 if (cpuc->enabled) 1149 if (cpuc->enabled)
1093 return; 1150 return;
@@ -1098,19 +1155,19 @@ static void intel_pmu_enable_all(void)
1098 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 1155 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1099 1156
1100 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 1157 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
1101 struct perf_counter *counter = 1158 struct perf_event *event =
1102 cpuc->counters[X86_PMC_IDX_FIXED_BTS]; 1159 cpuc->events[X86_PMC_IDX_FIXED_BTS];
1103 1160
1104 if (WARN_ON_ONCE(!counter)) 1161 if (WARN_ON_ONCE(!event))
1105 return; 1162 return;
1106 1163
1107 intel_pmu_enable_bts(counter->hw.config); 1164 intel_pmu_enable_bts(event->hw.config);
1108 } 1165 }
1109} 1166}
1110 1167
1111static void amd_pmu_enable_all(void) 1168static void amd_pmu_enable_all(void)
1112{ 1169{
1113 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1170 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1114 int idx; 1171 int idx;
1115 1172
1116 if (cpuc->enabled) 1173 if (cpuc->enabled)
@@ -1119,14 +1176,14 @@ static void amd_pmu_enable_all(void)
1119 cpuc->enabled = 1; 1176 cpuc->enabled = 1;
1120 barrier(); 1177 barrier();
1121 1178
1122 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1179 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1123 struct perf_counter *counter = cpuc->counters[idx]; 1180 struct perf_event *event = cpuc->events[idx];
1124 u64 val; 1181 u64 val;
1125 1182
1126 if (!test_bit(idx, cpuc->active_mask)) 1183 if (!test_bit(idx, cpuc->active_mask))
1127 continue; 1184 continue;
1128 1185
1129 val = counter->hw.config; 1186 val = event->hw.config;
1130 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1131 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 1188 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1132 } 1189 }
@@ -1153,19 +1210,19 @@ static inline void intel_pmu_ack_status(u64 ack)
1153 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 1210 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
1154} 1211}
1155 1212
1156static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1213static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1157{ 1214{
1158 (void)checking_wrmsrl(hwc->config_base + idx, 1215 (void)checking_wrmsrl(hwc->config_base + idx,
1159 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 1216 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1160} 1217}
1161 1218
1162static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1219static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1163{ 1220{
1164 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 1221 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
1165} 1222}
1166 1223
1167static inline void 1224static inline void
1168intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) 1225intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
1169{ 1226{
1170 int idx = __idx - X86_PMC_IDX_FIXED; 1227 int idx = __idx - X86_PMC_IDX_FIXED;
1171 u64 ctrl_val, mask; 1228 u64 ctrl_val, mask;
@@ -1178,10 +1235,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
1178} 1235}
1179 1236
1180static inline void 1237static inline void
1181p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1238p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1182{ 1239{
1183 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1240 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1184 u64 val = P6_NOP_COUNTER; 1241 u64 val = P6_NOP_EVENT;
1185 1242
1186 if (cpuc->enabled) 1243 if (cpuc->enabled)
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1244 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -1190,7 +1247,7 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
1190} 1247}
1191 1248
1192static inline void 1249static inline void
1193intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1250intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1194{ 1251{
1195 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 1252 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1196 intel_pmu_disable_bts(); 1253 intel_pmu_disable_bts();
@@ -1202,24 +1259,24 @@ intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
1202 return; 1259 return;
1203 } 1260 }
1204 1261
1205 x86_pmu_disable_counter(hwc, idx); 1262 x86_pmu_disable_event(hwc, idx);
1206} 1263}
1207 1264
1208static inline void 1265static inline void
1209amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1266amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1210{ 1267{
1211 x86_pmu_disable_counter(hwc, idx); 1268 x86_pmu_disable_event(hwc, idx);
1212} 1269}
1213 1270
1214static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); 1271static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1215 1272
1216/* 1273/*
1217 * Set the next IRQ period, based on the hwc->period_left value. 1274 * Set the next IRQ period, based on the hwc->period_left value.
1218 * To be called with the counter disabled in hw: 1275 * To be called with the event disabled in hw:
1219 */ 1276 */
1220static int 1277static int
1221x86_perf_counter_set_period(struct perf_counter *counter, 1278x86_perf_event_set_period(struct perf_event *event,
1222 struct hw_perf_counter *hwc, int idx) 1279 struct hw_perf_event *hwc, int idx)
1223{ 1280{
1224 s64 left = atomic64_read(&hwc->period_left); 1281 s64 left = atomic64_read(&hwc->period_left);
1225 s64 period = hwc->sample_period; 1282 s64 period = hwc->sample_period;
@@ -1245,7 +1302,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1245 ret = 1; 1302 ret = 1;
1246 } 1303 }
1247 /* 1304 /*
1248 * Quirk: certain CPUs dont like it if just 1 event is left: 1305 * Quirk: certain CPUs dont like it if just 1 hw_event is left:
1249 */ 1306 */
1250 if (unlikely(left < 2)) 1307 if (unlikely(left < 2))
1251 left = 2; 1308 left = 2;
@@ -1253,24 +1310,24 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1253 if (left > x86_pmu.max_period) 1310 if (left > x86_pmu.max_period)
1254 left = x86_pmu.max_period; 1311 left = x86_pmu.max_period;
1255 1312
1256 per_cpu(prev_left[idx], smp_processor_id()) = left; 1313 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1257 1314
1258 /* 1315 /*
1259 * The hw counter starts counting from this counter offset, 1316 * The hw event starts counting from this event offset,
1260 * mark it to be able to extra future deltas: 1317 * mark it to be able to extra future deltas:
1261 */ 1318 */
1262 atomic64_set(&hwc->prev_count, (u64)-left); 1319 atomic64_set(&hwc->prev_count, (u64)-left);
1263 1320
1264 err = checking_wrmsrl(hwc->counter_base + idx, 1321 err = checking_wrmsrl(hwc->event_base + idx,
1265 (u64)(-left) & x86_pmu.counter_mask); 1322 (u64)(-left) & x86_pmu.event_mask);
1266 1323
1267 perf_counter_update_userpage(counter); 1324 perf_event_update_userpage(event);
1268 1325
1269 return ret; 1326 return ret;
1270} 1327}
1271 1328
1272static inline void 1329static inline void
1273intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) 1330intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1274{ 1331{
1275 int idx = __idx - X86_PMC_IDX_FIXED; 1332 int idx = __idx - X86_PMC_IDX_FIXED;
1276 u64 ctrl_val, bits, mask; 1333 u64 ctrl_val, bits, mask;
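x86_perf_event_set_period() arms the counter by writing the negated remaining period, masked to the implemented width, so the PMI fires after exactly 'left' increments; a small worked example of that arithmetic (sample values invented):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int64_t  left       = 100000;                   /* hwc->period_left    */
            uint64_t event_mask = (1ULL << 32) - 1;         /* 32-bit wide counter */

            /* The counter counts up and interrupts on overflow, so it is
             * programmed to (-left) within the implemented width: after
             * 'left' increments it wraps through zero and raises the PMI. */
            uint64_t programmed = (uint64_t)(-left) & event_mask;

            printf("write 0x%llx to the counter MSR\n",
                   (unsigned long long)programmed);         /* 0xfffe7960 */
            return 0;
    }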
@@ -1295,9 +1352,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
1295 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1352 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1296} 1353}
1297 1354
1298static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1355static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1299{ 1356{
1300 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1357 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1301 u64 val; 1358 u64 val;
1302 1359
1303 val = hwc->config; 1360 val = hwc->config;
@@ -1308,10 +1365,10 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1308} 1365}
1309 1366
1310 1367
1311static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1368static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1312{ 1369{
1313 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 1370 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1314 if (!__get_cpu_var(cpu_hw_counters).enabled) 1371 if (!__get_cpu_var(cpu_hw_events).enabled)
1315 return; 1372 return;
1316 1373
1317 intel_pmu_enable_bts(hwc->config); 1374 intel_pmu_enable_bts(hwc->config);
@@ -1323,134 +1380,189 @@ static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1323 return; 1380 return;
1324 } 1381 }
1325 1382
1326 x86_pmu_enable_counter(hwc, idx); 1383 x86_pmu_enable_event(hwc, idx);
1327} 1384}
1328 1385
1329static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1386static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1330{ 1387{
1331 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1388 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1332 1389
1333 if (cpuc->enabled) 1390 if (cpuc->enabled)
1334 x86_pmu_enable_counter(hwc, idx); 1391 x86_pmu_enable_event(hwc, idx);
1335} 1392}
1336 1393
1337static int 1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
1339{ 1395{
1340 unsigned int event; 1396 unsigned int hw_event;
1341 1397
1342 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 1398 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1343 1399
1344 if (unlikely((event == 1400 if (unlikely((hw_event ==
1345 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 1401 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1346 (hwc->sample_period == 1))) 1402 (hwc->sample_period == 1)))
1347 return X86_PMC_IDX_FIXED_BTS; 1403 return X86_PMC_IDX_FIXED_BTS;
1348 1404
1349 if (!x86_pmu.num_counters_fixed) 1405 if (!x86_pmu.num_events_fixed)
1350 return -1; 1406 return -1;
1351 1407
1352 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1408 /*
1409 * fixed counters do not take all possible filters
1410 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1412 return -1;
1413
1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1355 return X86_PMC_IDX_FIXED_CPU_CYCLES; 1417 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1356 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) 1418 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1357 return X86_PMC_IDX_FIXED_BUS_CYCLES; 1419 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1358 1420
1359 return -1; 1421 return -1;
1360} 1422}
1361 1423
1362/* 1424/*
1363 * Find a PMC slot for the freshly enabled / scheduled in counter: 1425 * generic counter allocator: get next free counter
1364 */ 1426 */
1365static int x86_pmu_enable(struct perf_counter *counter) 1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1429{
1367 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1368 struct hw_perf_counter *hwc = &counter->hw;
1369 int idx; 1430 int idx;
1370 1431
1371 idx = fixed_mode_idx(counter, hwc); 1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435
1436/*
1437 * intel-specific counter allocator: check event constraints
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444
1445 if (!event_constraints)
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1465{
1466 int idx;
1467
1468 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1470 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1471 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1472 return -EAGAIN;
1376 1473
1377 hwc->config_base = 0; 1474 hwc->config_base = 0;
1378 hwc->counter_base = 0; 1475 hwc->event_base = 0;
1379 hwc->idx = idx; 1476 hwc->idx = idx;
1380 } else if (idx >= 0) { 1477 } else if (idx >= 0) {
1381 /* 1478 /*
1382 * Try to get the fixed counter, if that is already taken 1479 * Try to get the fixed event, if that is already taken
1383 * then try to get a generic counter: 1480 * then try to get a generic event:
1384 */ 1481 */
1385 if (test_and_set_bit(idx, cpuc->used_mask)) 1482 if (test_and_set_bit(idx, cpuc->used_mask))
1386 goto try_generic; 1483 goto try_generic;
1387 1484
1388 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1485 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1389 /* 1486 /*
1390 * We set it so that counter_base + idx in wrmsr/rdmsr maps to 1487 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1391 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: 1488 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1392 */ 1489 */
1393 hwc->counter_base = 1490 hwc->event_base =
1394 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; 1491 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1395 hwc->idx = idx; 1492 hwc->idx = idx;
1396 } else { 1493 } else {
1397 idx = hwc->idx; 1494 idx = hwc->idx;
1398 /* Try to get the previous generic counter again */ 1495 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1497try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_counters); 1499 if (idx == -1)
1403 if (idx == x86_pmu.num_counters)
1404 return -EAGAIN; 1500 return -EAGAIN;
1405 1501
1406 set_bit(idx, cpuc->used_mask); 1502 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1503 hwc->idx = idx;
1408 } 1504 }
1409 hwc->config_base = x86_pmu.eventsel; 1505 hwc->config_base = x86_pmu.eventsel;
1410 hwc->counter_base = x86_pmu.perfctr; 1506 hwc->event_base = x86_pmu.perfctr;
1411 } 1507 }
1412 1508
1413 perf_counters_lapic_init(); 1509 return idx;
1510}
1511
1512/*
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1525 perf_events_lapic_init();
1414 1526
1415 x86_pmu.disable(hwc, idx); 1527 x86_pmu.disable(hwc, idx);
1416 1528
1417 cpuc->counters[idx] = counter; 1529 cpuc->events[idx] = event;
1418 set_bit(idx, cpuc->active_mask); 1530 set_bit(idx, cpuc->active_mask);
1419 1531
1420 x86_perf_counter_set_period(counter, hwc, idx); 1532 x86_perf_event_set_period(event, hwc, idx);
1421 x86_pmu.enable(hwc, idx); 1533 x86_pmu.enable(hwc, idx);
1422 1534
1423 perf_counter_update_userpage(counter); 1535 perf_event_update_userpage(event);
1424 1536
1425 return 0; 1537 return 0;
1426} 1538}
1427 1539
1428static void x86_pmu_unthrottle(struct perf_counter *counter) 1540static void x86_pmu_unthrottle(struct perf_event *event)
1429{ 1541{
1430 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1542 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1431 struct hw_perf_counter *hwc = &counter->hw; 1543 struct hw_perf_event *hwc = &event->hw;
1432 1544
1433 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || 1545 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1434 cpuc->counters[hwc->idx] != counter)) 1546 cpuc->events[hwc->idx] != event))
1435 return; 1547 return;
1436 1548
1437 x86_pmu.enable(hwc, hwc->idx); 1549 x86_pmu.enable(hwc, hwc->idx);
1438} 1550}
1439 1551
1440void perf_counter_print_debug(void) 1552void perf_event_print_debug(void)
1441{ 1553{
1442 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1554 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1443 struct cpu_hw_counters *cpuc; 1555 struct cpu_hw_events *cpuc;
1444 unsigned long flags; 1556 unsigned long flags;
1445 int cpu, idx; 1557 int cpu, idx;
1446 1558
1447 if (!x86_pmu.num_counters) 1559 if (!x86_pmu.num_events)
1448 return; 1560 return;
1449 1561
1450 local_irq_save(flags); 1562 local_irq_save(flags);
1451 1563
1452 cpu = smp_processor_id(); 1564 cpu = smp_processor_id();
1453 cpuc = &per_cpu(cpu_hw_counters, cpu); 1565 cpuc = &per_cpu(cpu_hw_events, cpu);
1454 1566
1455 if (x86_pmu.version >= 2) { 1567 if (x86_pmu.version >= 2) {
1456 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); 1568 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
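For orientation, the placement order the new x86_schedule_event() path above ends up with (a condensed reading of the hunk, not new behaviour):

    /*
     * 1. fixed_mode_idx(): the BTS pseudo-counter for period-1 branch
     *    sampling, else one of the architectural fixed counters -- unless
     *    the event carries filter bits the fixed counters cannot honour;
     * 2. failing that, re-use the event's previous generic counter if
     *    hwc->idx is still valid and free in cpuc->used_mask;
     * 3. otherwise x86_pmu.get_event_idx(): the constraint-aware
     *    intel_get_event_idx() on Intel, plain gen_get_event_idx() elsewhere.
     */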
@@ -1466,11 +1578,11 @@ void perf_counter_print_debug(void)
1466 } 1578 }
1467 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1579 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1468 1580
1469 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1581 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1582 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1471 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1583 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1472 1584
1473 prev_left = per_cpu(prev_left[idx], cpu); 1585 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1474 1586
1475 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1587 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1476 cpu, idx, pmc_ctrl); 1588 cpu, idx, pmc_ctrl);
@@ -1479,7 +1591,7 @@ void perf_counter_print_debug(void)
1479 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1591 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1480 cpu, idx, prev_left); 1592 cpu, idx, prev_left);
1481 } 1593 }
1482 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1594 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1483 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1595 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1484 1596
1485 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1597 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1488,8 +1600,7 @@ void perf_counter_print_debug(void)
1488 local_irq_restore(flags); 1600 local_irq_restore(flags);
1489} 1601}
1490 1602
1491static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, 1603static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1492 struct perf_sample_data *data)
1493{ 1604{
1494 struct debug_store *ds = cpuc->ds; 1605 struct debug_store *ds = cpuc->ds;
1495 struct bts_record { 1606 struct bts_record {
@@ -1497,11 +1608,14 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
1497 u64 to; 1608 u64 to;
1498 u64 flags; 1609 u64 flags;
1499 }; 1610 };
1500 struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; 1611 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1501 unsigned long orig_ip = data->regs->ip;
1502 struct bts_record *at, *top; 1612 struct bts_record *at, *top;
1613 struct perf_output_handle handle;
1614 struct perf_event_header header;
1615 struct perf_sample_data data;
1616 struct pt_regs regs;
1503 1617
1504 if (!counter) 1618 if (!event)
1505 return; 1619 return;
1506 1620
1507 if (!ds) 1621 if (!ds)
@@ -1510,26 +1624,45 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
1510 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; 1624 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1511 top = (struct bts_record *)(unsigned long)ds->bts_index; 1625 top = (struct bts_record *)(unsigned long)ds->bts_index;
1512 1626
1627 if (top <= at)
1628 return;
1629
1513 ds->bts_index = ds->bts_buffer_base; 1630 ds->bts_index = ds->bts_buffer_base;
1514 1631
1632
1633 data.period = event->hw.last_period;
1634 data.addr = 0;
1635 regs.ip = 0;
1636
1637 /*
1638 * Prepare a generic sample, i.e. fill in the invariant fields.
1639 * We will overwrite the from and to address before we output
1640 * the sample.
1641 */
1642 perf_prepare_sample(&header, &data, event, &regs);
1643
1644 if (perf_output_begin(&handle, event,
1645 header.size * (top - at), 1, 1))
1646 return;
1647
1515 for (; at < top; at++) { 1648 for (; at < top; at++) {
1516 data->regs->ip = at->from; 1649 data.ip = at->from;
1517 data->addr = at->to; 1650 data.addr = at->to;
1518 1651
1519 perf_counter_output(counter, 1, data); 1652 perf_output_sample(&handle, &header, &data, event);
1520 } 1653 }
1521 1654
1522 data->regs->ip = orig_ip; 1655 perf_output_end(&handle);
1523 data->addr = 0;
1524 1656
1525 /* There's new data available. */ 1657 /* There's new data available. */
1526 counter->pending_kill = POLL_IN; 1658 event->hw.interrupts++;
1659 event->pending_kill = POLL_IN;
1527} 1660}
1528 1661
1529static void x86_pmu_disable(struct perf_counter *counter) 1662static void x86_pmu_disable(struct perf_event *event)
1530{ 1663{
1531 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1664 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1532 struct hw_perf_counter *hwc = &counter->hw; 1665 struct hw_perf_event *hwc = &event->hw;
1533 int idx = hwc->idx; 1666 int idx = hwc->idx;
1534 1667
1535 /* 1668 /*
@@ -1541,67 +1674,63 @@ static void x86_pmu_disable(struct perf_counter *counter)
1541 1674
1542 /* 1675 /*
1543 * Make sure the cleared pointer becomes visible before we 1676 * Make sure the cleared pointer becomes visible before we
1544 * (potentially) free the counter: 1677 * (potentially) free the event:
1545 */ 1678 */
1546 barrier(); 1679 barrier();
1547 1680
1548 /* 1681 /*
1549 * Drain the remaining delta count out of a counter 1682 * Drain the remaining delta count out of a event
1550 * that we are disabling: 1683 * that we are disabling:
1551 */ 1684 */
1552 x86_perf_counter_update(counter, hwc, idx); 1685 x86_perf_event_update(event, hwc, idx);
1553 1686
1554 /* Drain the remaining BTS records. */ 1687 /* Drain the remaining BTS records. */
1555 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 1688 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1556 struct perf_sample_data data; 1689 intel_pmu_drain_bts_buffer(cpuc);
1557 struct pt_regs regs;
1558 1690
1559 data.regs = &regs; 1691 cpuc->events[idx] = NULL;
1560 intel_pmu_drain_bts_buffer(cpuc, &data);
1561 }
1562 cpuc->counters[idx] = NULL;
1563 clear_bit(idx, cpuc->used_mask); 1692 clear_bit(idx, cpuc->used_mask);
1564 1693
1565 perf_counter_update_userpage(counter); 1694 perf_event_update_userpage(event);
1566} 1695}
1567 1696
1568/* 1697/*
1569 * Save and restart an expired counter. Called by NMI contexts, 1698 * Save and restart an expired event. Called by NMI contexts,
1570 * so it has to be careful about preempting normal counter ops: 1699 * so it has to be careful about preempting normal event ops:
1571 */ 1700 */
1572static int intel_pmu_save_and_restart(struct perf_counter *counter) 1701static int intel_pmu_save_and_restart(struct perf_event *event)
1573{ 1702{
1574 struct hw_perf_counter *hwc = &counter->hw; 1703 struct hw_perf_event *hwc = &event->hw;
1575 int idx = hwc->idx; 1704 int idx = hwc->idx;
1576 int ret; 1705 int ret;
1577 1706
1578 x86_perf_counter_update(counter, hwc, idx); 1707 x86_perf_event_update(event, hwc, idx);
1579 ret = x86_perf_counter_set_period(counter, hwc, idx); 1708 ret = x86_perf_event_set_period(event, hwc, idx);
1580 1709
1581 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 1710 if (event->state == PERF_EVENT_STATE_ACTIVE)
1582 intel_pmu_enable_counter(hwc, idx); 1711 intel_pmu_enable_event(hwc, idx);
1583 1712
1584 return ret; 1713 return ret;
1585} 1714}
1586 1715
1587static void intel_pmu_reset(void) 1716static void intel_pmu_reset(void)
1588{ 1717{
1589 struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; 1718 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1590 unsigned long flags; 1719 unsigned long flags;
1591 int idx; 1720 int idx;
1592 1721
1593 if (!x86_pmu.num_counters) 1722 if (!x86_pmu.num_events)
1594 return; 1723 return;
1595 1724
1596 local_irq_save(flags); 1725 local_irq_save(flags);
1597 1726
1598 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 1727 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1599 1728
1600 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1729 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1601 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 1730 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1602 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 1731 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1603 } 1732 }
1604 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1733 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1605 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1734 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1606 } 1735 }
1607 if (ds) 1736 if (ds)
@@ -1613,39 +1742,38 @@ static void intel_pmu_reset(void)
1613static int p6_pmu_handle_irq(struct pt_regs *regs) 1742static int p6_pmu_handle_irq(struct pt_regs *regs)
1614{ 1743{
1615 struct perf_sample_data data; 1744 struct perf_sample_data data;
1616 struct cpu_hw_counters *cpuc; 1745 struct cpu_hw_events *cpuc;
1617 struct perf_counter *counter; 1746 struct perf_event *event;
1618 struct hw_perf_counter *hwc; 1747 struct hw_perf_event *hwc;
1619 int idx, handled = 0; 1748 int idx, handled = 0;
1620 u64 val; 1749 u64 val;
1621 1750
1622 data.regs = regs;
1623 data.addr = 0; 1751 data.addr = 0;
1624 1752
1625 cpuc = &__get_cpu_var(cpu_hw_counters); 1753 cpuc = &__get_cpu_var(cpu_hw_events);
1626 1754
1627 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1755 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1628 if (!test_bit(idx, cpuc->active_mask)) 1756 if (!test_bit(idx, cpuc->active_mask))
1629 continue; 1757 continue;
1630 1758
1631 counter = cpuc->counters[idx]; 1759 event = cpuc->events[idx];
1632 hwc = &counter->hw; 1760 hwc = &event->hw;
1633 1761
1634 val = x86_perf_counter_update(counter, hwc, idx); 1762 val = x86_perf_event_update(event, hwc, idx);
1635 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1763 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1636 continue; 1764 continue;
1637 1765
1638 /* 1766 /*
1639 * counter overflow 1767 * event overflow
1640 */ 1768 */
1641 handled = 1; 1769 handled = 1;
1642 data.period = counter->hw.last_period; 1770 data.period = event->hw.last_period;
1643 1771
1644 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1772 if (!x86_perf_event_set_period(event, hwc, idx))
1645 continue; 1773 continue;
1646 1774
1647 if (perf_counter_overflow(counter, 1, &data)) 1775 if (perf_event_overflow(event, 1, &data, regs))
1648 p6_pmu_disable_counter(hwc, idx); 1776 p6_pmu_disable_event(hwc, idx);
1649 } 1777 }
1650 1778
1651 if (handled) 1779 if (handled)
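The handlers' overflow test relies on the counter having been armed with a negative value (see the period example earlier): as long as the top implemented bit is still set, the counter has not yet wrapped; a minimal standalone check:

    #include <stdio.h>
    #include <stdint.h>

    static int overflowed(uint64_t val, int event_bits)
    {
            /* Same test as the handlers above: until the counter wraps
             * through zero its top implemented bit stays set and the
             * interrupt was not caused by this counter. */
            return !(val & (1ULL << (event_bits - 1)));
    }

    int main(void)
    {
            printf("%d\n", overflowed(0xfffe7960, 32));     /* 0: still counting */
            printf("%d\n", overflowed(0x00000042, 32));     /* 1: wrapped -> PMI */
            return 0;
    }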
@@ -1661,17 +1789,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1661static int intel_pmu_handle_irq(struct pt_regs *regs) 1789static int intel_pmu_handle_irq(struct pt_regs *regs)
1662{ 1790{
1663 struct perf_sample_data data; 1791 struct perf_sample_data data;
1664 struct cpu_hw_counters *cpuc; 1792 struct cpu_hw_events *cpuc;
1665 int bit, loops; 1793 int bit, loops;
1666 u64 ack, status; 1794 u64 ack, status;
1667 1795
1668 data.regs = regs;
1669 data.addr = 0; 1796 data.addr = 0;
1670 1797
1671 cpuc = &__get_cpu_var(cpu_hw_counters); 1798 cpuc = &__get_cpu_var(cpu_hw_events);
1672 1799
1673 perf_disable(); 1800 perf_disable();
1674 intel_pmu_drain_bts_buffer(cpuc, &data); 1801 intel_pmu_drain_bts_buffer(cpuc);
1675 status = intel_pmu_get_status(); 1802 status = intel_pmu_get_status();
1676 if (!status) { 1803 if (!status) {
1677 perf_enable(); 1804 perf_enable();
@@ -1681,8 +1808,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1681 loops = 0; 1808 loops = 0;
1682again: 1809again:
1683 if (++loops > 100) { 1810 if (++loops > 100) {
1684 WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); 1811 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1685 perf_counter_print_debug(); 1812 perf_event_print_debug();
1686 intel_pmu_reset(); 1813 intel_pmu_reset();
1687 perf_enable(); 1814 perf_enable();
1688 return 1; 1815 return 1;
@@ -1691,19 +1818,19 @@ again:
1691 inc_irq_stat(apic_perf_irqs); 1818 inc_irq_stat(apic_perf_irqs);
1692 ack = status; 1819 ack = status;
1693 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 1820 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1694 struct perf_counter *counter = cpuc->counters[bit]; 1821 struct perf_event *event = cpuc->events[bit];
1695 1822
1696 clear_bit(bit, (unsigned long *) &status); 1823 clear_bit(bit, (unsigned long *) &status);
1697 if (!test_bit(bit, cpuc->active_mask)) 1824 if (!test_bit(bit, cpuc->active_mask))
1698 continue; 1825 continue;
1699 1826
1700 if (!intel_pmu_save_and_restart(counter)) 1827 if (!intel_pmu_save_and_restart(event))
1701 continue; 1828 continue;
1702 1829
1703 data.period = counter->hw.last_period; 1830 data.period = event->hw.last_period;
1704 1831
1705 if (perf_counter_overflow(counter, 1, &data)) 1832 if (perf_event_overflow(event, 1, &data, regs))
1706 intel_pmu_disable_counter(&counter->hw, bit); 1833 intel_pmu_disable_event(&event->hw, bit);
1707 } 1834 }
1708 1835
1709 intel_pmu_ack_status(ack); 1836 intel_pmu_ack_status(ack);
@@ -1723,39 +1850,38 @@ again:
1723static int amd_pmu_handle_irq(struct pt_regs *regs) 1850static int amd_pmu_handle_irq(struct pt_regs *regs)
1724{ 1851{
1725 struct perf_sample_data data; 1852 struct perf_sample_data data;
1726 struct cpu_hw_counters *cpuc; 1853 struct cpu_hw_events *cpuc;
1727 struct perf_counter *counter; 1854 struct perf_event *event;
1728 struct hw_perf_counter *hwc; 1855 struct hw_perf_event *hwc;
1729 int idx, handled = 0; 1856 int idx, handled = 0;
1730 u64 val; 1857 u64 val;
1731 1858
1732 data.regs = regs;
1733 data.addr = 0; 1859 data.addr = 0;
1734 1860
1735 cpuc = &__get_cpu_var(cpu_hw_counters); 1861 cpuc = &__get_cpu_var(cpu_hw_events);
1736 1862
1737 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1863 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1738 if (!test_bit(idx, cpuc->active_mask)) 1864 if (!test_bit(idx, cpuc->active_mask))
1739 continue; 1865 continue;
1740 1866
1741 counter = cpuc->counters[idx]; 1867 event = cpuc->events[idx];
1742 hwc = &counter->hw; 1868 hwc = &event->hw;
1743 1869
1744 val = x86_perf_counter_update(counter, hwc, idx); 1870 val = x86_perf_event_update(event, hwc, idx);
1745 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1871 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1746 continue; 1872 continue;
1747 1873
1748 /* 1874 /*
1749 * counter overflow 1875 * event overflow
1750 */ 1876 */
1751 handled = 1; 1877 handled = 1;
1752 data.period = counter->hw.last_period; 1878 data.period = event->hw.last_period;
1753 1879
1754 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1880 if (!x86_perf_event_set_period(event, hwc, idx))
1755 continue; 1881 continue;
1756 1882
1757 if (perf_counter_overflow(counter, 1, &data)) 1883 if (perf_event_overflow(event, 1, &data, regs))
1758 amd_pmu_disable_counter(hwc, idx); 1884 amd_pmu_disable_event(hwc, idx);
1759 } 1885 }
1760 1886
1761 if (handled) 1887 if (handled)
@@ -1769,18 +1895,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
1769 irq_enter(); 1895 irq_enter();
1770 ack_APIC_irq(); 1896 ack_APIC_irq();
1771 inc_irq_stat(apic_pending_irqs); 1897 inc_irq_stat(apic_pending_irqs);
1772 perf_counter_do_pending(); 1898 perf_event_do_pending();
1773 irq_exit(); 1899 irq_exit();
1774} 1900}
1775 1901
1776void set_perf_counter_pending(void) 1902void set_perf_event_pending(void)
1777{ 1903{
1778#ifdef CONFIG_X86_LOCAL_APIC 1904#ifdef CONFIG_X86_LOCAL_APIC
1905 if (!x86_pmu.apic || !x86_pmu_initialized())
1906 return;
1907
1779 apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1908 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1780#endif 1909#endif
1781} 1910}
1782 1911
1783void perf_counters_lapic_init(void) 1912void perf_events_lapic_init(void)
1784{ 1913{
1785#ifdef CONFIG_X86_LOCAL_APIC 1914#ifdef CONFIG_X86_LOCAL_APIC
1786 if (!x86_pmu.apic || !x86_pmu_initialized()) 1915 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1794,13 +1923,13 @@ void perf_counters_lapic_init(void)
1794} 1923}
1795 1924
1796static int __kprobes 1925static int __kprobes
1797perf_counter_nmi_handler(struct notifier_block *self, 1926perf_event_nmi_handler(struct notifier_block *self,
1798 unsigned long cmd, void *__args) 1927 unsigned long cmd, void *__args)
1799{ 1928{
1800 struct die_args *args = __args; 1929 struct die_args *args = __args;
1801 struct pt_regs *regs; 1930 struct pt_regs *regs;
1802 1931
1803 if (!atomic_read(&active_counters)) 1932 if (!atomic_read(&active_events))
1804 return NOTIFY_DONE; 1933 return NOTIFY_DONE;
1805 1934
1806 switch (cmd) { 1935 switch (cmd) {
@@ -1819,7 +1948,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
1819#endif 1948#endif
1820 /* 1949 /*
1821 * Can't rely on the handled return value to say it was our NMI, two 1950 * Can't rely on the handled return value to say it was our NMI, two
1822 * counters could trigger 'simultaneously' raising two back-to-back NMIs. 1951 * events could trigger 'simultaneously' raising two back-to-back NMIs.
1823 * 1952 *
1824 * If the first NMI handles both, the latter will be empty and daze 1953 * If the first NMI handles both, the latter will be empty and daze
1825 * the CPU. 1954 * the CPU.
@@ -1829,8 +1958,8 @@ perf_counter_nmi_handler(struct notifier_block *self,
1829 return NOTIFY_STOP; 1958 return NOTIFY_STOP;
1830} 1959}
1831 1960
1832static __read_mostly struct notifier_block perf_counter_nmi_notifier = { 1961static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1833 .notifier_call = perf_counter_nmi_handler, 1962 .notifier_call = perf_event_nmi_handler,
1834 .next = NULL, 1963 .next = NULL,
1835 .priority = 1 1964 .priority = 1
1836}; 1965};
@@ -1840,8 +1969,8 @@ static struct x86_pmu p6_pmu = {
1840 .handle_irq = p6_pmu_handle_irq, 1969 .handle_irq = p6_pmu_handle_irq,
1841 .disable_all = p6_pmu_disable_all, 1970 .disable_all = p6_pmu_disable_all,
1842 .enable_all = p6_pmu_enable_all, 1971 .enable_all = p6_pmu_enable_all,
1843 .enable = p6_pmu_enable_counter, 1972 .enable = p6_pmu_enable_event,
1844 .disable = p6_pmu_disable_counter, 1973 .disable = p6_pmu_disable_event,
1845 .eventsel = MSR_P6_EVNTSEL0, 1974 .eventsel = MSR_P6_EVNTSEL0,
1846 .perfctr = MSR_P6_PERFCTR0, 1975 .perfctr = MSR_P6_PERFCTR0,
1847 .event_map = p6_pmu_event_map, 1976 .event_map = p6_pmu_event_map,
@@ -1850,16 +1979,17 @@ static struct x86_pmu p6_pmu = {
1850 .apic = 1, 1979 .apic = 1,
1851 .max_period = (1ULL << 31) - 1, 1980 .max_period = (1ULL << 31) - 1,
1852 .version = 0, 1981 .version = 0,
1853 .num_counters = 2, 1982 .num_events = 2,
1854 /* 1983 /*
1855 * Counters have 40 bits implemented. However they are designed such 1984 * Events have 40 bits implemented. However they are designed such
1856 * that bits [32-39] are sign extensions of bit 31. As such the 1985 * that bits [32-39] are sign extensions of bit 31. As such the
1857	 * effective width of a counter for P6-like PMU is 32 bits only.	1986	 * effective width of an event for P6-like PMU is 32 bits only.
1858 * 1987 *
1859 * See IA-32 Intel Architecture Software developer manual Vol 3B 1988 * See IA-32 Intel Architecture Software developer manual Vol 3B
1860 */ 1989 */
1861 .counter_bits = 32, 1990 .event_bits = 32,
1862 .counter_mask = (1ULL << 32) - 1, 1991 .event_mask = (1ULL << 32) - 1,
1992 .get_event_idx = intel_get_event_idx,
1863}; 1993};
1864 1994
1865static struct x86_pmu intel_pmu = { 1995static struct x86_pmu intel_pmu = {
@@ -1867,8 +1997,8 @@ static struct x86_pmu intel_pmu = {
1867 .handle_irq = intel_pmu_handle_irq, 1997 .handle_irq = intel_pmu_handle_irq,
1868 .disable_all = intel_pmu_disable_all, 1998 .disable_all = intel_pmu_disable_all,
1869 .enable_all = intel_pmu_enable_all, 1999 .enable_all = intel_pmu_enable_all,
1870 .enable = intel_pmu_enable_counter, 2000 .enable = intel_pmu_enable_event,
1871 .disable = intel_pmu_disable_counter, 2001 .disable = intel_pmu_disable_event,
1872 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 2002 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1873 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 2003 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1874 .event_map = intel_pmu_event_map, 2004 .event_map = intel_pmu_event_map,
@@ -1878,11 +2008,12 @@ static struct x86_pmu intel_pmu = {
1878 /* 2008 /*
1879 * Intel PMCs cannot be accessed sanely above 32 bit width, 2009 * Intel PMCs cannot be accessed sanely above 32 bit width,
1880 * so we install an artificial 1<<31 period regardless of 2010 * so we install an artificial 1<<31 period regardless of
1881 * the generic counter period: 2011 * the generic event period:
1882 */ 2012 */
1883 .max_period = (1ULL << 31) - 1, 2013 .max_period = (1ULL << 31) - 1,
1884 .enable_bts = intel_pmu_enable_bts, 2014 .enable_bts = intel_pmu_enable_bts,
1885 .disable_bts = intel_pmu_disable_bts, 2015 .disable_bts = intel_pmu_disable_bts,
2016 .get_event_idx = intel_get_event_idx,
1886}; 2017};
1887 2018
1888static struct x86_pmu amd_pmu = { 2019static struct x86_pmu amd_pmu = {
@@ -1890,19 +2021,20 @@ static struct x86_pmu amd_pmu = {
1890 .handle_irq = amd_pmu_handle_irq, 2021 .handle_irq = amd_pmu_handle_irq,
1891 .disable_all = amd_pmu_disable_all, 2022 .disable_all = amd_pmu_disable_all,
1892 .enable_all = amd_pmu_enable_all, 2023 .enable_all = amd_pmu_enable_all,
1893 .enable = amd_pmu_enable_counter, 2024 .enable = amd_pmu_enable_event,
1894 .disable = amd_pmu_disable_counter, 2025 .disable = amd_pmu_disable_event,
1895 .eventsel = MSR_K7_EVNTSEL0, 2026 .eventsel = MSR_K7_EVNTSEL0,
1896 .perfctr = MSR_K7_PERFCTR0, 2027 .perfctr = MSR_K7_PERFCTR0,
1897 .event_map = amd_pmu_event_map, 2028 .event_map = amd_pmu_event_map,
1898 .raw_event = amd_pmu_raw_event, 2029 .raw_event = amd_pmu_raw_event,
1899 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 2030 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1900 .num_counters = 4, 2031 .num_events = 4,
1901 .counter_bits = 48, 2032 .event_bits = 48,
1902 .counter_mask = (1ULL << 48) - 1, 2033 .event_mask = (1ULL << 48) - 1,
1903 .apic = 1, 2034 .apic = 1,
1904 /* use highest bit to detect overflow */ 2035 /* use highest bit to detect overflow */
1905 .max_period = (1ULL << 47) - 1, 2036 .max_period = (1ULL << 47) - 1,
2037 .get_event_idx = gen_get_event_idx,
1906}; 2038};
1907 2039
1908static int p6_pmu_init(void) 2040static int p6_pmu_init(void)
@@ -1915,10 +2047,12 @@ static int p6_pmu_init(void)
1915 case 7: 2047 case 7:
1916 case 8: 2048 case 8:
1917 case 11: /* Pentium III */ 2049 case 11: /* Pentium III */
2050 event_constraints = intel_p6_event_constraints;
1918 break; 2051 break;
1919 case 9: 2052 case 9:
1920 case 13: 2053 case 13:
1921 /* Pentium M */ 2054 /* Pentium M */
2055 event_constraints = intel_p6_event_constraints;
1922 break; 2056 break;
1923 default: 2057 default:
1924 pr_cont("unsupported p6 CPU model %d ", 2058 pr_cont("unsupported p6 CPU model %d ",
@@ -1956,7 +2090,7 @@ static int intel_pmu_init(void)
1956 2090
1957 /* 2091 /*
1958 * Check whether the Architectural PerfMon supports 2092 * Check whether the Architectural PerfMon supports
1959 * Branch Misses Retired Event or not. 2093 * Branch Misses Retired hw_event or not.
1960 */ 2094 */
1961 cpuid(10, &eax.full, &ebx, &unused, &edx.full); 2095 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1962 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) 2096 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
@@ -1968,15 +2102,15 @@ static int intel_pmu_init(void)
1968 2102
1969 x86_pmu = intel_pmu; 2103 x86_pmu = intel_pmu;
1970 x86_pmu.version = version; 2104 x86_pmu.version = version;
1971 x86_pmu.num_counters = eax.split.num_counters; 2105 x86_pmu.num_events = eax.split.num_events;
1972 x86_pmu.counter_bits = eax.split.bit_width; 2106 x86_pmu.event_bits = eax.split.bit_width;
1973 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; 2107 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
1974 2108
1975 /* 2109 /*
1976 * Quirk: v2 perfmon does not report fixed-purpose counters, so 2110 * Quirk: v2 perfmon does not report fixed-purpose events, so
1977 * assume at least 3 counters: 2111 * assume at least 3 events:
1978 */ 2112 */
1979 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 2113 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
1980 2114
1981 /* 2115 /*
1982 * Install the hw-cache-events table: 2116 * Install the hw-cache-events table:
@@ -1990,12 +2124,14 @@ static int intel_pmu_init(void)
1990 sizeof(hw_cache_event_ids)); 2124 sizeof(hw_cache_event_ids));
1991 2125
1992 pr_cont("Core2 events, "); 2126 pr_cont("Core2 events, ");
2127 event_constraints = intel_core_event_constraints;
1993 break; 2128 break;
1994 default: 2129 default:
1995 case 26: 2130 case 26:
1996 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2131 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1997 sizeof(hw_cache_event_ids)); 2132 sizeof(hw_cache_event_ids));
1998 2133
2134 event_constraints = intel_nehalem_event_constraints;
1999 pr_cont("Nehalem/Corei7 events, "); 2135 pr_cont("Nehalem/Corei7 events, ");
2000 break; 2136 break;
2001 case 28: 2137 case 28:
@@ -2023,11 +2159,11 @@ static int amd_pmu_init(void)
2023 return 0; 2159 return 0;
2024} 2160}
2025 2161
2026void __init init_hw_perf_counters(void) 2162void __init init_hw_perf_events(void)
2027{ 2163{
2028 int err; 2164 int err;
2029 2165
2030 pr_info("Performance Counters: "); 2166 pr_info("Performance Events: ");
2031 2167
2032 switch (boot_cpu_data.x86_vendor) { 2168 switch (boot_cpu_data.x86_vendor) {
2033 case X86_VENDOR_INTEL: 2169 case X86_VENDOR_INTEL:
@@ -2040,45 +2176,45 @@ void __init init_hw_perf_counters(void)
2040 return; 2176 return;
2041 } 2177 }
2042 if (err != 0) { 2178 if (err != 0) {
2043 pr_cont("no PMU driver, software counters only.\n"); 2179 pr_cont("no PMU driver, software events only.\n");
2044 return; 2180 return;
2045 } 2181 }
2046 2182
2047 pr_cont("%s PMU driver.\n", x86_pmu.name); 2183 pr_cont("%s PMU driver.\n", x86_pmu.name);
2048 2184
2049 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 2185 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
2050 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", 2186 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
2051 x86_pmu.num_counters, X86_PMC_MAX_GENERIC); 2187 x86_pmu.num_events, X86_PMC_MAX_GENERIC);
2052 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 2188 x86_pmu.num_events = X86_PMC_MAX_GENERIC;
2053 } 2189 }
2054 perf_counter_mask = (1 << x86_pmu.num_counters) - 1; 2190 perf_event_mask = (1 << x86_pmu.num_events) - 1;
2055 perf_max_counters = x86_pmu.num_counters; 2191 perf_max_events = x86_pmu.num_events;
2056 2192
2057 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 2193 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
2058 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", 2194 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
2059 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); 2195 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
2060 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; 2196 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
2061 } 2197 }
2062 2198
2063 perf_counter_mask |= 2199 perf_event_mask |=
2064 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; 2200 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
2065 x86_pmu.intel_ctrl = perf_counter_mask; 2201 x86_pmu.intel_ctrl = perf_event_mask;
2066 2202
2067 perf_counters_lapic_init(); 2203 perf_events_lapic_init();
2068 register_die_notifier(&perf_counter_nmi_notifier); 2204 register_die_notifier(&perf_event_nmi_notifier);
2069 2205
2070 pr_info("... version: %d\n", x86_pmu.version); 2206 pr_info("... version: %d\n", x86_pmu.version);
2071 pr_info("... bit width: %d\n", x86_pmu.counter_bits); 2207 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2072 pr_info("... generic counters: %d\n", x86_pmu.num_counters); 2208 pr_info("... generic registers: %d\n", x86_pmu.num_events);
2073 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); 2209 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask);
2074 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 2210 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2075 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); 2211 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
2076 pr_info("... counter mask: %016Lx\n", perf_counter_mask); 2212 pr_info("... event mask: %016Lx\n", perf_event_mask);
2077} 2213}
2078 2214
2079static inline void x86_pmu_read(struct perf_counter *counter) 2215static inline void x86_pmu_read(struct perf_event *event)
2080{ 2216{
2081 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); 2217 x86_perf_event_update(event, &event->hw, event->hw.idx);
2082} 2218}
2083 2219
2084static const struct pmu pmu = { 2220static const struct pmu pmu = {
@@ -2088,13 +2224,52 @@ static const struct pmu pmu = {
2088 .unthrottle = x86_pmu_unthrottle, 2224 .unthrottle = x86_pmu_unthrottle,
2089}; 2225};
2090 2226
2091const struct pmu *hw_perf_counter_init(struct perf_counter *counter) 2227static int
2228validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2229{
2230 struct hw_perf_event fake_event = event->hw;
2231
2232 if (event->pmu != &pmu)
2233 return 0;
2234
2235 return x86_schedule_event(cpuc, &fake_event);
2236}
2237
2238static int validate_group(struct perf_event *event)
2239{
2240 struct perf_event *sibling, *leader = event->group_leader;
2241 struct cpu_hw_events fake_pmu;
2242
2243 memset(&fake_pmu, 0, sizeof(fake_pmu));
2244
2245 if (!validate_event(&fake_pmu, leader))
2246 return -ENOSPC;
2247
2248 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2249 if (!validate_event(&fake_pmu, sibling))
2250 return -ENOSPC;
2251 }
2252
2253 if (!validate_event(&fake_pmu, event))
2254 return -ENOSPC;
2255
2256 return 0;
2257}
2258
2259const struct pmu *hw_perf_event_init(struct perf_event *event)
2092{ 2260{
2093 int err; 2261 int err;
2094 2262
2095 err = __hw_perf_counter_init(counter); 2263 err = __hw_perf_event_init(event);
2096 if (err) 2264 if (!err) {
2265 if (event->group_leader != event)
2266 err = validate_group(event);
2267 }
2268 if (err) {
2269 if (event->destroy)
2270 event->destroy(event);
2097 return ERR_PTR(err); 2271 return ERR_PTR(err);
2272 }
2098 2273
2099 return &pmu; 2274 return &pmu;
2100} 2275}
@@ -2110,8 +2285,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2110 entry->ip[entry->nr++] = ip; 2285 entry->ip[entry->nr++] = ip;
2111} 2286}
2112 2287
2113static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 2288static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2114static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 2289static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2115static DEFINE_PER_CPU(int, in_nmi_frame); 2290static DEFINE_PER_CPU(int, in_nmi_frame);
2116 2291
2117 2292
@@ -2264,9 +2439,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2264 struct perf_callchain_entry *entry; 2439 struct perf_callchain_entry *entry;
2265 2440
2266 if (in_nmi()) 2441 if (in_nmi())
2267 entry = &__get_cpu_var(nmi_entry); 2442 entry = &__get_cpu_var(pmc_nmi_entry);
2268 else 2443 else
2269 entry = &__get_cpu_var(irq_entry); 2444 entry = &__get_cpu_var(pmc_irq_entry);
2270 2445
2271 entry->nr = 0; 2446 entry->nr = 0;
2272 2447
@@ -2275,7 +2450,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2275 return entry; 2450 return entry;
2276} 2451}
2277 2452
2278void hw_perf_counter_setup_online(int cpu) 2453void hw_perf_event_setup_online(int cpu)
2279{ 2454{
2280 init_debug_store_on_cpu(cpu); 2455 init_debug_store_on_cpu(cpu);
2281} 2456}
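
Aside, not part of the patch: init_hw_perf_events() above combines the generic and fixed-purpose counter counts into perf_event_mask. A standalone sketch of that arithmetic, assuming X86_PMC_IDX_FIXED is 32 as in the perf headers and using made-up counts of 4 generic and 3 fixed counters:

/* Sketch only -- the counter counts are hypothetical examples. */
#include <stdio.h>
#include <stdint.h>

#define X86_PMC_IDX_FIXED 32	/* fixed-purpose counters start at bit 32 */

int main(void)
{
	int num_events = 4;		/* generic counters (example) */
	int num_events_fixed = 3;	/* fixed counters (example) */
	uint64_t mask;

	mask  = (1ULL << num_events) - 1;				/* bits 0-3 */
	mask |= ((1ULL << num_events_fixed) - 1) << X86_PMC_IDX_FIXED;	/* bits 32-34 */

	printf("perf_event_mask = %016llx\n", (unsigned long long)mask);
	/* prints: perf_event_mask = 000000070000000f */
	return 0;
}

The low bits of such a mask select the programmable counters and bits 32 and up select the fixed-purpose ones; the v2 quirk in the hunk above forces num_events_fixed to at least 3 because v2 CPUID may not report the fixed-purpose counters at all.
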
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 392bea43b890..fab786f60ed6 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_counter.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 000000000000..a640ae5ad201
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
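
For illustration only (not in the patch): scale_aperfmperf() above leans on get_aperfmperf() and calc_aperfmperf_ratio() from the processor headers. A rough, self-contained sketch of the ratio they compute, with illustrative names and an assumed load scale of 1024:

/*
 * Sketch only: the names and the 1024 scale are assumptions, not the
 * kernel's definitions. APERF counts actual core cycles, MPERF counts
 * reference (non-turbo) cycles, so the delta ratio tracks how fast the
 * core really ran over the sampling interval.
 */
struct aperfmperf_sample {
	unsigned long long aperf;
	unsigned long long mperf;
};

#define LOAD_SCALE 1024UL	/* assumed SCHED_LOAD_SCALE-like unit */

unsigned long aperfmperf_ratio(const struct aperfmperf_sample *old,
			       const struct aperfmperf_sample *cur)
{
	unsigned long long da = cur->aperf - old->aperf;
	unsigned long long dm = cur->mperf - old->mperf;

	if (!dm)
		return LOAD_SCALE;	/* no reference cycles elapsed */

	return (unsigned long)((da * LOAD_SCALE) / dm);
}

The result sits above the scale when turbo runs the core faster than the reference clock and below it when the core is idling or throttled, which is what arch_scale_freq_power() hands back to the scheduler.
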
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index bc24f514ec93..1cbed97b59cf 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,6 +24,7 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <asm/div64.h> 25#include <asm/div64.h>
26#include <asm/vmware.h> 26#include <asm/vmware.h>
27#include <asm/x86_init.h>
27 28
28#define CPUID_VMWARE_INFO_LEAF 0x40000000 29#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 30#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -47,21 +48,35 @@ static inline int __vmware_platform(void)
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; 48 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48} 49}
49 50
50static unsigned long __vmware_get_tsc_khz(void) 51static unsigned long vmware_get_tsc_khz(void)
51{ 52{
52 uint64_t tsc_hz; 53 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx; 54 uint32_t eax, ebx, ecx, edx;
54 55
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 56 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56 57
57 if (ebx == UINT_MAX)
58 return 0;
59 tsc_hz = eax | (((uint64_t)ebx) << 32); 58 tsc_hz = eax | (((uint64_t)ebx) << 32);
60 do_div(tsc_hz, 1000); 59 do_div(tsc_hz, 1000);
61 BUG_ON(tsc_hz >> 32); 60 BUG_ON(tsc_hz >> 32);
61 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
62 (unsigned long) tsc_hz / 1000,
63 (unsigned long) tsc_hz % 1000);
62 return tsc_hz; 64 return tsc_hz;
63} 65}
64 66
67void __init vmware_platform_setup(void)
68{
69 uint32_t eax, ebx, ecx, edx;
70
71 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
72
73 if (ebx != UINT_MAX)
74 x86_platform.calibrate_tsc = vmware_get_tsc_khz;
75 else
76 printk(KERN_WARNING
77 "Failed to get TSC freq from the hypervisor\n");
78}
79
65/* 80/*
66 * While checking the dmi string infomation, just checking the product 81 * While checking the dmi string infomation, just checking the product
67 * serial key should be enough, as this will always have a VMware 82 * serial key should be enough, as this will always have a VMware
@@ -87,12 +102,6 @@ int vmware_platform(void)
87 return 0; 102 return 0;
88} 103}
89 104
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/* 105/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 106 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can 107 * Still, due to timing difference when running on virtual cpus, the TSC can
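
Aside, not part of the patch: the GETHZ backdoor call above returns the 64-bit TSC frequency in Hz split across eax (low half) and ebx (high half), with ebx == UINT_MAX signalling failure. A standalone sketch of the reassembly and kHz/MHz conversion done by vmware_get_tsc_khz(), using made-up register values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t eax = 2800000000u;	/* low 32 bits of Hz: 2.8 GHz example */
	uint32_t ebx = 0;		/* high 32 bits of Hz */
	uint64_t tsc_hz, tsc_khz;

	tsc_hz  = (uint64_t)eax | ((uint64_t)ebx << 32);
	tsc_khz = tsc_hz / 1000;	/* do_div(tsc_hz, 1000) in the patch */

	/* mirrors the KERN_INFO message; prints "2800.000 MHz" here */
	printf("TSC freq read from hypervisor : %llu.%03llu MHz\n",
	       (unsigned long long)(tsc_khz / 1000),
	       (unsigned long long)(tsc_khz % 1000));
	return 0;
}
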
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index b07af8861244..6a52d4b36a30 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev) 185static char *cpuid_devnode(struct device *dev, mode_t *mode)
186{ 186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188} 188}
@@ -203,7 +203,7 @@ static int __init cpuid_init(void)
203 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
204 goto out_chrdev; 204 goto out_chrdev;
205 } 205 }
206 cpuid_class->nodename = cpuid_nodename; 206 cpuid_class->devnode = cpuid_devnode;
207 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
208 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
209 if (err != 0) 209 if (err != 0)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9e..f7dd2a7c3bf4 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a3276766..a071e6be177e 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 147005a1cc3c..d17d482a04f4 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void)
1331 struct resource *res; 1331 struct resource *res;
1332 u64 end; 1332 u64 end;
1333 1333
1334 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1334 res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
1335 e820_res = res; 1335 e820_res = res;
1336 for (i = 0; i < e820.nr_map; i++) { 1336 for (i = 0; i < e820.nr_map; i++) {
1337 end = e820.map[i].addr + e820.map[i].size - 1; 1337 end = e820.map[i].addr + e820.map[i].size - 1;
@@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos)
1378 if (mb < 16) 1378 if (mb < 16)
1379 return 1024*1024; 1379 return 1024*1024;
1380 1380
1381 /* To 32MB for anything above that */ 1381 /* To 64MB for anything above that */
1382 return 32*1024*1024; 1382 return 64*1024*1024;
1383} 1383}
1384 1384
1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1) 1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
@@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void)
1455 return who; 1455 return who;
1456} 1456}
1457 1457
1458char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1459{
1460 if (x86_quirks->arch_memory_setup) {
1461 char *who = x86_quirks->arch_memory_setup();
1462
1463 if (who)
1464 return who;
1465 }
1466 return default_machine_specific_memory_setup();
1467}
1468
1469/* Overridden in paravirt.c if CONFIG_PARAVIRT */
1470char * __init __attribute__((weak)) memory_setup(void)
1471{
1472 return machine_specific_memory_setup();
1473}
1474
1475void __init setup_memory_map(void) 1458void __init setup_memory_map(void)
1476{ 1459{
1477 char *who; 1460 char *who;
1478 1461
1479 who = memory_setup(); 1462 who = x86_init.resources.memory_setup();
1480 memcpy(&e820_saved, &e820, sizeof(struct e820map)); 1463 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1481 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1464 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1482 e820_print_map(who); 1465 e820_print_map(who);
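
For illustration only (not in the patch): the weak machine_specific_memory_setup()/memory_setup() overrides removed above give way to an ops table, and setup_memory_map() now calls x86_init.resources.memory_setup() instead. A simplified sketch of that pattern; the structure layout here is illustrative, not the actual x86_init.h definition:

/* Illustrative layout only -- see arch/x86/include/asm/x86_init.h for the real one. */
char *default_machine_specific_memory_setup(void)
{
	return "BIOS-e820";	/* stub standing in for the real default */
}

struct x86_init_resources_sketch {
	char *(*memory_setup)(void);	/* returns the name of the e820 source */
};

struct x86_init_ops_sketch {
	struct x86_init_resources_sketch resources;
};

/* The default lives in the table... */
struct x86_init_ops_sketch x86_init_sketch = {
	.resources.memory_setup = default_machine_specific_memory_setup,
};

/* ...and a platform (paravirt, Moorestown, ...) overrides the function
 * pointer during early setup instead of overriding a weak symbol. */
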
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 335f049d110f..b9c830c12b4a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -160,721 +160,6 @@ static struct console early_serial_console = {
160 .index = -1, 160 .index = -1,
161}; 161};
162 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void __init dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void __init dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int request, int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int __init ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happend */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int __init ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void __init default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565
566static void __init nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
878/* Direct interface for emergencies */ 163/* Direct interface for emergencies */
879static struct console *early_console = &early_vga_console; 164static struct console *early_console = &early_vga_console;
880static int __initdata early_console_initialized; 165static int __initdata early_console_initialized;
@@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...)
891 va_end(ap); 176 va_end(ap);
892} 177}
893 178
179static inline void early_console_register(struct console *con, int keep_early)
180{
181 if (early_console->index != -1) {
182 printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
183 con->name);
184 return;
185 }
186 early_console = con;
187 if (keep_early)
188 early_console->flags &= ~CON_BOOT;
189 else
190 early_console->flags |= CON_BOOT;
191 register_console(early_console);
192}
894 193
895static int __init setup_early_printk(char *buf) 194static int __init setup_early_printk(char *buf)
896{ 195{
897 int keep_early; 196 int keep;
898 197
899 if (!buf) 198 if (!buf)
900 return 0; 199 return 0;
@@ -903,42 +202,37 @@ static int __init setup_early_printk(char *buf)
903 return 0; 202 return 0;
904 early_console_initialized = 1; 203 early_console_initialized = 1;
905 204
906 keep_early = (strstr(buf, "keep") != NULL); 205 keep = (strstr(buf, "keep") != NULL);
907 206
908 if (!strncmp(buf, "serial", 6)) { 207 while (*buf != '\0') {
909 early_serial_init(buf + 6); 208 if (!strncmp(buf, "serial", 6)) {
910 early_console = &early_serial_console; 209 buf += 6;
911 } else if (!strncmp(buf, "ttyS", 4)) { 210 early_serial_init(buf);
912 early_serial_init(buf); 211 early_console_register(&early_serial_console, keep);
913 early_console = &early_serial_console; 212 if (!strncmp(buf, ",ttyS", 5))
914 } else if (!strncmp(buf, "vga", 3) 213 buf += 5;
915 && boot_params.screen_info.orig_video_isVGA == 1) { 214 }
916 max_xpos = boot_params.screen_info.orig_video_cols; 215 if (!strncmp(buf, "ttyS", 4)) {
917 max_ypos = boot_params.screen_info.orig_video_lines; 216 early_serial_init(buf + 4);
918 current_ypos = boot_params.screen_info.orig_y; 217 early_console_register(&early_serial_console, keep);
919 early_console = &early_vga_console; 218 }
219 if (!strncmp(buf, "vga", 3) &&
220 boot_params.screen_info.orig_video_isVGA == 1) {
221 max_xpos = boot_params.screen_info.orig_video_cols;
222 max_ypos = boot_params.screen_info.orig_video_lines;
223 current_ypos = boot_params.screen_info.orig_y;
224 early_console_register(&early_vga_console, keep);
225 }
920#ifdef CONFIG_EARLY_PRINTK_DBGP 226#ifdef CONFIG_EARLY_PRINTK_DBGP
921 } else if (!strncmp(buf, "dbgp", 4)) { 227 if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
922 if (early_dbgp_init(buf+4) < 0) 228 early_console_register(&early_dbgp_console, keep);
923 return 0;
924 early_console = &early_dbgp_console;
925 /*
926 * usb subsys will reset ehci controller, so don't keep
927 * that early console
928 */
929 keep_early = 0;
930#endif 229#endif
931#ifdef CONFIG_HVC_XEN 230#ifdef CONFIG_HVC_XEN
932 } else if (!strncmp(buf, "xen", 3)) { 231 if (!strncmp(buf, "xen", 3))
933 early_console = &xenboot_console; 232 early_console_register(&xenboot_console, keep);
934#endif 233#endif
234 buf++;
935 } 235 }
936
937 if (keep_early)
938 early_console->flags &= ~CON_BOOT;
939 else
940 early_console->flags |= CON_BOOT;
941 register_console(early_console);
942 return 0; 236 return 0;
943} 237}
944 238
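
Aside, not part of the patch: the rewritten setup_early_printk() scans the whole option string rather than matching only its first token, and "keep" anywhere in the string keeps the chosen early console past the real console handover. Typical boot-line forms the new parser accepts; the port and baud values below are only examples:

	earlyprintk=serial,ttyS0,115200
	earlyprintk=dbgp,keep
	earlyprintk=vga
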
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index fe26ba3e3451..ad5bd988fb79 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -42,6 +42,7 @@
42#include <asm/time.h> 42#include <asm/time.h>
43#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
45#include <asm/x86_init.h>
45 46
46#define EFI_DEBUG 1 47#define EFI_DEBUG 1
47#define PFX "EFI: " 48#define PFX "EFI: "
@@ -453,6 +454,9 @@ void __init efi_init(void)
453 if (add_efi_memmap) 454 if (add_efi_memmap)
454 do_add_efi_memmap(); 455 do_add_efi_memmap();
455 456
457 x86_platform.get_wallclock = efi_get_time;
458 x86_platform.set_wallclock = efi_set_rtc_mmss;
459
456 /* Setup for EFI runtime service */ 460 /* Setup for EFI runtime service */
457 reboot_type = BOOT_EFI; 461 reboot_type = BOOT_EFI;
458 462
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..7d52e9da5e0c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1185,17 +1185,14 @@ END(ftrace_graph_caller)
1185 1185
1186.globl return_to_handler 1186.globl return_to_handler
1187return_to_handler: 1187return_to_handler:
1188 pushl $0
1189 pushl %eax 1188 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1189 pushl %edx
1192 movl %ebp, %eax 1190 movl %ebp, %eax
1193 call ftrace_return_to_handler 1191 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1192 movl %eax, %ecx
1195 popl %edx 1193 popl %edx
1196 popl %ecx
1197 popl %eax 1194 popl %eax
1198 ret 1195 jmp *%ecx
1199#endif 1196#endif
1200 1197
1201.section .rodata,"a" 1198.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be745107..bd5bbddddf91 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)
146END(ftrace_graph_caller) 146END(ftrace_graph_caller)
147 147
148GLOBAL(return_to_handler) 148GLOBAL(return_to_handler)
149 subq $80, %rsp 149 subq $24, %rsp
150 150
151 /* Save the return values */ 151 /* Save the return values */
152 movq %rax, (%rsp) 152 movq %rax, (%rsp)
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 72(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $72, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -536,20 +536,13 @@ sysret_signal:
536 bt $TIF_SYSCALL_AUDIT,%edx 536 bt $TIF_SYSCALL_AUDIT,%edx
537 jc sysret_audit 537 jc sysret_audit
538#endif 538#endif
539 /* edx: work flags (arg3) */ 539 /*
540 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 540 * We have a signal, or exit tracing or single-step.
541 xorl %esi,%esi # oldset -> arg2 541 * These all wind up with the iret return path anyway,
542 SAVE_REST 542 * so just join that path right now.
543 FIXUP_TOP_OF_STACK %r11 543 */
544 call do_notify_resume 544 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
545 RESTORE_TOP_OF_STACK %r11 545 jmp int_check_syscall_exit_work
546 RESTORE_REST
547 movl $_TIF_WORK_MASK,%edi
548 /* Use IRET because user could have changed frame. This
549 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
550 DISABLE_INTERRUPTS(CLBR_NONE)
551 TRACE_IRQS_OFF
552 jmp int_with_check
553 546
554badsys: 547badsys:
555 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 548 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -654,6 +647,7 @@ int_careful:
654int_very_careful: 647int_very_careful:
655 TRACE_IRQS_ON 648 TRACE_IRQS_ON
656 ENABLE_INTERRUPTS(CLBR_NONE) 649 ENABLE_INTERRUPTS(CLBR_NONE)
650int_check_syscall_exit_work:
657 SAVE_REST 651 SAVE_REST
658 /* Check for syscall exit trace */ 652 /* Check for syscall exit trace */
659 testl $_TIF_WORK_SYSCALL_EXIT,%edx 653 testl $_TIF_WORK_SYSCALL_EXIT,%edx
@@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \
1021apicinterrupt SPURIOUS_APIC_VECTOR \ 1015apicinterrupt SPURIOUS_APIC_VECTOR \
1022 spurious_interrupt smp_spurious_interrupt 1016 spurious_interrupt smp_spurious_interrupt
1023 1017
1024#ifdef CONFIG_PERF_COUNTERS 1018#ifdef CONFIG_PERF_EVENTS
1025apicinterrupt LOCAL_PENDING_VECTOR \ 1019apicinterrupt LOCAL_PENDING_VECTOR \
1026 perf_pending_interrupt smp_perf_pending_interrupt 1020 perf_pending_interrupt smp_perf_pending_interrupt
1027#endif 1021#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..5a1b9758fd62 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data)
336 338
337 switch (faulted) { 339 switch (faulted) {
338 case 0: 340 case 0:
339 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 341 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
340 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 342 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
341 break; 343 break;
342 case 1: 344 case 1:
343 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 345 pr_info("converting mcount calls to 66 66 66 66 90\n");
344 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 346 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
345 break; 347 break;
346 case 2: 348 case 2:
347 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 349 pr_info("converting mcount calls to jmp . + 5\n");
348 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 350 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
349 break; 351 break;
350 } 352 }
@@ -468,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
468 470
469#ifdef CONFIG_FTRACE_SYSCALLS 471#ifdef CONFIG_FTRACE_SYSCALLS
470 472
471extern unsigned long __start_syscalls_metadata[];
472extern unsigned long __stop_syscalls_metadata[];
473extern unsigned long *sys_call_table; 473extern unsigned long *sys_call_table;
474 474
475static struct syscall_metadata **syscalls_metadata; 475unsigned long __init arch_syscall_addr(int nr)
476
477static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
478{
479 struct syscall_metadata *start;
480 struct syscall_metadata *stop;
481 char str[KSYM_SYMBOL_LEN];
482
483
484 start = (struct syscall_metadata *)__start_syscalls_metadata;
485 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
486 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
487
488 for ( ; start < stop; start++) {
489 if (start->name && !strcmp(start->name, str))
490 return start;
491 }
492 return NULL;
493}
494
495struct syscall_metadata *syscall_nr_to_meta(int nr)
496{
497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
498 return NULL;
499
500 return syscalls_metadata[nr];
501}
502
503int syscall_name_to_nr(char *name)
504{ 476{
505 int i; 477 return (unsigned long)(&sys_call_table)[nr];
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
530{
531 int i;
532 struct syscall_metadata *meta;
533 unsigned long **psys_syscall_table = &sys_call_table;
534
535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
536 NR_syscalls, GFP_KERNEL);
537 if (!syscalls_metadata) {
538 WARN_ON(1);
539 return -ENOMEM;
540 }
541
542 for (i = 0; i < NR_syscalls; i++) {
543 meta = find_syscall_meta(psys_syscall_table[i]);
544 syscalls_metadata[i] = meta;
545 }
546 return 0;
547} 478}
548arch_initcall(arch_init_ftrace_syscalls);
549#endif 479#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3f8579f8d42c..4f8e2507e8f3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -11,8 +11,21 @@
11#include <asm/setup.h> 11#include <asm/setup.h>
12#include <asm/sections.h> 12#include <asm/sections.h>
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/bios_ebda.h> 14#include <asm/page.h>
15#include <asm/trampoline.h> 15#include <asm/trampoline.h>
16#include <asm/apic.h>
17#include <asm/io_apic.h>
18#include <asm/bios_ebda.h>
19
20static void __init i386_default_early_setup(void)
21{
 22	/* Initialize 32bit specific setup functions */
23 x86_init.resources.probe_roms = probe_roms;
24 x86_init.resources.reserve_resources = i386_reserve_resources;
25 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
26
27 reserve_ebda_region();
28}
16 29
17void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
18{ 31{
@@ -29,7 +42,16 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 42 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 43 }
31#endif 44#endif
32 reserve_ebda_region(); 45
46 /* Call the subarch specific early setup function */
47 switch (boot_params.hdr.hardware_subarch) {
48 case X86_SUBARCH_MRST:
49 x86_mrst_early_setup();
50 break;
51 default:
52 i386_default_early_setup();
53 break;
54 }
33 55
34 /* 56 /*
35 * At this point everything still needed from the boot loader 57 * At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 70eaa852c732..0b06cd778fd9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -23,8 +23,8 @@
23#include <asm/sections.h> 23#include <asm/sections.h>
24#include <asm/kdebug.h> 24#include <asm/kdebug.h>
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 26#include <asm/trampoline.h>
27#include <asm/bios_ebda.h>
28 28
29static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
30{ 30{
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7ffec6b3b331..050c278481b1 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
79 * any particular GDT layout, because we load our own as soon as we 79 * any particular GDT layout, because we load our own as soon as we
80 * can. 80 * can.
81 */ 81 */
82.section .text.head,"ax",@progbits 82__HEAD
83ENTRY(startup_32) 83ENTRY(startup_32)
84 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 84 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
85 us to not reload segments */ 85 us to not reload segments */
@@ -157,6 +157,7 @@ subarch_entries:
157 .long default_entry /* normal x86/PC */ 157 .long default_entry /* normal x86/PC */
158 .long lguest_entry /* lguest hypervisor */ 158 .long lguest_entry /* lguest hypervisor */
159 .long xen_entry /* Xen hypervisor */ 159 .long xen_entry /* Xen hypervisor */
160 .long default_entry /* Moorestown MID */
160num_subarch_entries = (. - subarch_entries) / 4 161num_subarch_entries = (. - subarch_entries) / 4
161.previous 162.previous
162#endif /* CONFIG_PARAVIRT */ 163#endif /* CONFIG_PARAVIRT */
@@ -607,7 +608,7 @@ ENTRY(initial_code)
607/* 608/*
608 * BSS section 609 * BSS section
609 */ 610 */
610.section ".bss.page_aligned","wa" 611__PAGE_ALIGNED_BSS
611 .align PAGE_SIZE_asm 612 .align PAGE_SIZE_asm
612#ifdef CONFIG_X86_PAE 613#ifdef CONFIG_X86_PAE
613swapper_pg_pmd: 614swapper_pg_pmd:
@@ -625,7 +626,7 @@ ENTRY(empty_zero_page)
625 * This starts the data section. 626 * This starts the data section.
626 */ 627 */
627#ifdef CONFIG_X86_PAE 628#ifdef CONFIG_X86_PAE
628.section ".data.page_aligned","wa" 629__PAGE_ALIGNED_DATA
629 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
630 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
631ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fa54f78e2a05..780cd928fcd5 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map)
40L3_START_KERNEL = pud_index(__START_KERNEL_map) 40L3_START_KERNEL = pud_index(__START_KERNEL_map)
41 41
42 .text 42 .text
43 .section .text.head 43 __HEAD
44 .code64 44 .code64
45 .globl startup_64 45 .globl startup_64
46startup_64: 46startup_64:
@@ -418,7 +418,7 @@ ENTRY(phys_base)
418ENTRY(idt_table) 418ENTRY(idt_table)
419 .skip IDT_ENTRIES * 16 419 .skip IDT_ENTRIES * 16
420 420
421 .section .bss.page_aligned, "aw", @nobits 421 __PAGE_ALIGNED_BSS
422 .align PAGE_SIZE 422 .align PAGE_SIZE
423ENTRY(empty_zero_page) 423ENTRY(empty_zero_page)
424 .skip PAGE_SIZE 424 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 43cec6bdda63..9c3bd4a2050e 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -10,6 +10,16 @@
10EXPORT_SYMBOL(mcount); 10EXPORT_SYMBOL(mcount);
11#endif 11#endif
12 12
13/*
14 * Note, this is a prototype to get at the symbol for
 15 * the export, but don't use it from C code; it is used
 16 * by assembly code and does not follow the C calling convention!
17 */
18#ifndef CONFIG_X86_CMPXCHG64
19extern void cmpxchg8b_emu(void);
20EXPORT_SYMBOL(cmpxchg8b_emu);
21#endif
22
13/* Networking helper routines. */ 23/* Networking helper routines. */
14EXPORT_SYMBOL(csum_partial_copy_generic); 24EXPORT_SYMBOL(csum_partial_copy_generic);
15 25
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 5cf36c053ac4..23c167925a5c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -19,12 +19,6 @@
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22#ifdef CONFIG_X86_32
23static void pit_disable_clocksource(void);
24#else
25static inline void pit_disable_clocksource(void) { }
26#endif
27
28/* 22/*
29 * HPET replaces the PIT, when enabled. So we need to know, which of 23 * HPET replaces the PIT, when enabled. So we need to know, which of
30 * the two timers is used 24 * the two timers is used
@@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode,
57 outb_pit(0, PIT_CH0); 51 outb_pit(0, PIT_CH0);
58 outb_pit(0, PIT_CH0); 52 outb_pit(0, PIT_CH0);
59 } 53 }
60 pit_disable_clocksource();
61 break; 54 break;
62 55
63 case CLOCK_EVT_MODE_ONESHOT: 56 case CLOCK_EVT_MODE_ONESHOT:
64 /* One shot setup */ 57 /* One shot setup */
65 pit_disable_clocksource();
66 outb_pit(0x38, PIT_MODE); 58 outb_pit(0x38, PIT_MODE);
67 break; 59 break;
68 60
@@ -200,17 +192,6 @@ static struct clocksource pit_cs = {
200 .shift = 20, 192 .shift = 20,
201}; 193};
202 194
203static void pit_disable_clocksource(void)
204{
205 /*
206 * Use mult to check whether it is registered or not
207 */
208 if (pit_cs.mult) {
209 clocksource_unregister(&pit_cs);
210 pit_cs.mult = 0;
211 }
212}
213
214static int __init init_pit_clocksource(void) 195static int __init init_pit_clocksource(void)
215{ 196{
216 /* 197 /*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 270ff83efc11..3a54dcb9cd0e 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
20 * way process stacks are handled. This is done by having a special 20 * way process stacks are handled. This is done by having a special
21 * "init_task" linker map entry.. 21 * "init_task" linker map entry..
22 */ 22 */
23union thread_union init_thread_union 23union thread_union init_thread_union __init_task_data =
24 __attribute__((__section__(".data.init_task"))) = 24 { INIT_THREAD_INFO(init_task) };
25 { INIT_THREAD_INFO(init_task) };
26 25
27/* 26/*
28 * Initial task structure. 27 * Initial task structure.
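
__init_task_data hides the open-coded section attribute that the old declaration spelled out. A tiny, self-contained illustration of the same idea with invented names (__demo_init_data and ".data.demo_init" are not kernel symbols):

#include <stdio.h>

#define __demo_init_data __attribute__((__section__(".data.demo_init")))

static int demo_value __demo_init_data = 42;

int main(void)
{
	/* demo_value lives in a dedicated data section named by the macro */
	printf("demo_value = %d\n", demo_value);
	return 0;
}
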
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b0cdde6932f5..74656d1d4e30 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
104 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
105# endif 105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j) 109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); 110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
@@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
200 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
201# endif 201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu); 205 sum += per_cpu(mce_poll_count, cpu);
206#endif 206#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 92b7703d3d58..40f30773fb29 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
116 return 0; 116 return 0;
117} 117}
118 118
119static void __init init_ISA_irqs(void) 119void __init init_ISA_irqs(void)
120{ 120{
121 int i; 121 int i;
122 122
@@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void)
140 } 140 }
141} 141}
142 142
143/* Overridden in paravirt.c */ 143void __init init_IRQ(void)
144void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 144{
145 x86_init.irqs.intr_init();
146}
145 147
146static void __init smp_intr_init(void) 148static void __init smp_intr_init(void)
147{ 149{
@@ -190,7 +192,7 @@ static void __init apic_intr_init(void)
190#ifdef CONFIG_X86_MCE_THRESHOLD 192#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 193 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 194#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 195#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); 196 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
195#endif 197#endif
196 198
@@ -206,39 +208,19 @@ static void __init apic_intr_init(void)
206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 208 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207 209
208 /* Performance monitoring interrupts: */ 210 /* Performance monitoring interrupts: */
209# ifdef CONFIG_PERF_COUNTERS 211# ifdef CONFIG_PERF_EVENTS
210 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 212 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
211# endif 213# endif
212 214
213#endif 215#endif
214} 216}
215 217
216/**
217 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
218 *
219 * Description:
220 * Perform any necessary interrupt initialisation prior to setting up
221 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
222 * interrupts should be initialised here if the machine emulates a PC
223 * in any way.
224 **/
225static void __init x86_quirk_pre_intr_init(void)
226{
227#ifdef CONFIG_X86_32
228 if (x86_quirks->arch_pre_intr_init) {
229 if (x86_quirks->arch_pre_intr_init())
230 return;
231 }
232#endif
233 init_ISA_irqs();
234}
235
236void __init native_init_IRQ(void) 218void __init native_init_IRQ(void)
237{ 219{
238 int i; 220 int i;
239 221
240 /* Execute any quirks before the call gates are initialised: */ 222 /* Execute any quirks before the call gates are initialised: */
241 x86_quirk_pre_intr_init(); 223 x86_init.irqs.pre_vector_init();
242 224
243 apic_intr_init(); 225 apic_intr_init();
244 226
@@ -258,12 +240,6 @@ void __init native_init_IRQ(void)
258 240
259#ifdef CONFIG_X86_32 241#ifdef CONFIG_X86_32
260 /* 242 /*
261 * Call quirks after call gates are initialised (usually add in
262 * the architecture specific gates):
263 */
264 x86_quirk_intr_init();
265
266 /*
267 * External FPU? Set up irq13 if so, for 243 * External FPU? Set up irq13 if so, for
268 * original braindamaged IBM FERR coupling. 244 * original braindamaged IBM FERR coupling.
269 */ 245 */
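
init_IRQ() now goes through a function-pointer table instead of a weak alias that paravirt had to re-implement. A user-space sketch of that dispatch, with illustrative names only:

#include <stdio.h>

struct irq_init_ops {
	void (*pre_vector_init)(void);
	void (*intr_init)(void);
};

static void native_pre_vector_init(void) { puts("init_ISA_irqs()"); }
static void native_intr_init(void)       { puts("native_init_IRQ()"); }

static struct irq_init_ops irq_init = {
	.pre_vector_init = native_pre_vector_init,
	.intr_init       = native_intr_init,
};

static void init_IRQ_demo(void)
{
	/* Unconditional indirect call; no weak alias, no NULL check. */
	irq_init.intr_init();
}

static void demo_hypervisor_intr_init(void) { puts("hypervisor init_IRQ()"); }

int main(void)
{
	init_IRQ_demo();                                 /* native default */
	irq_init.intr_init = demo_hypervisor_intr_init;  /* paravirt override */
	init_IRQ_demo();
	return 0;
}
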
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e5efcdcca31b..feaeb0d3aa4f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,8 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25
26#include <asm/x86_init.h>
25#include <asm/reboot.h> 27#include <asm/reboot.h>
26 28
27#define KVM_SCALE 22 29#define KVM_SCALE 22
@@ -182,12 +184,13 @@ void __init kvmclock_init(void)
182 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 184 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
183 if (kvm_register_clock("boot clock")) 185 if (kvm_register_clock("boot clock"))
184 return; 186 return;
185 pv_time_ops.get_wallclock = kvm_get_wallclock;
186 pv_time_ops.set_wallclock = kvm_set_wallclock;
187 pv_time_ops.sched_clock = kvm_clock_read; 187 pv_time_ops.sched_clock = kvm_clock_read;
188 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; 188 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
189 x86_platform.get_wallclock = kvm_get_wallclock;
190 x86_platform.set_wallclock = kvm_set_wallclock;
189#ifdef CONFIG_X86_LOCAL_APIC 191#ifdef CONFIG_X86_LOCAL_APIC
190 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 192 x86_cpuinit.setup_percpu_clockev =
193 kvm_setup_secondary_clock;
191#endif 194#endif
192#ifdef CONFIG_SMP 195#ifdef CONFIG_SMP
193 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 196 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 71f1d99a635d..ec6ef60cbd17 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
67#ifdef CONFIG_SMP 67#ifdef CONFIG_SMP
68 preempt_disable(); 68 preempt_disable();
69 load_LDT(pc); 69 load_LDT(pc);
70 if (!cpus_equal(current->mm->cpu_vm_mask, 70 if (!cpumask_equal(mm_cpumask(current->mm),
71 cpumask_of_cpu(smp_processor_id()))) 71 cpumask_of(smp_processor_id())))
72 smp_call_function(flush_ldt, current->mm, 1); 72 smp_call_function(flush_ldt, current->mm, 1);
73 preempt_enable(); 73 preempt_enable();
74#else 74#else
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9371448290ac..378e9a8f1bf8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
210{ 210{
211 ssize_t ret = -EINVAL; 211 ssize_t ret = -EINVAL;
212 212
213 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages); 214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
215 return ret; 215 return ret;
216 } 216 }
217 217
@@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = {
236static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
237 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
238 .name = "microcode", 238 .name = "microcode",
239 .devnode = "cpu/microcode", 239 .nodename = "cpu/microcode",
240 .fops = &microcode_fops, 240 .fops = &microcode_fops,
241}; 241};
242 242
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index fcd513bf2846..5be95ef4ffec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len)
45 return sum & 0xFF; 45 return sum & 0xFF;
46} 46}
47 47
48int __init default_mpc_apic_id(struct mpc_cpu *m)
49{
50 return m->apicid;
51}
52
48static void __init MP_processor_info(struct mpc_cpu *m) 53static void __init MP_processor_info(struct mpc_cpu *m)
49{ 54{
50 int apicid; 55 int apicid;
@@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
55 return; 60 return;
56 } 61 }
57 62
58 if (x86_quirks->mpc_apic_id) 63 apicid = x86_init.mpparse.mpc_apic_id(m);
59 apicid = x86_quirks->mpc_apic_id(m);
60 else
61 apicid = m->apicid;
62 64
63 if (m->cpuflag & CPU_BOOTPROCESSOR) { 65 if (m->cpuflag & CPU_BOOTPROCESSOR) {
64 bootup_cpu = " (Bootup-CPU)"; 66 bootup_cpu = " (Bootup-CPU)";
@@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m)
70} 72}
71 73
72#ifdef CONFIG_X86_IO_APIC 74#ifdef CONFIG_X86_IO_APIC
73static void __init MP_bus_info(struct mpc_bus *m) 75void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
74{ 76{
75 char str[7];
76 memcpy(str, m->bustype, 6); 77 memcpy(str, m->bustype, 6);
77 str[6] = 0; 78 str[6] = 0;
79 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
80}
78 81
79 if (x86_quirks->mpc_oem_bus_info) 82static void __init MP_bus_info(struct mpc_bus *m)
80 x86_quirks->mpc_oem_bus_info(m, str); 83{
81 else 84 char str[7];
82 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); 85
86 x86_init.mpparse.mpc_oem_bus_info(m, str);
83 87
84#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
85 if (m->busid >= MAX_MP_BUSSES) { 89 if (m->busid >= MAX_MP_BUSSES) {
@@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
96 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
97#endif 101#endif
98 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
99 if (x86_quirks->mpc_oem_pci_bus) 103 if (x86_init.mpparse.mpc_oem_pci_bus)
100 x86_quirks->mpc_oem_pci_bus(m); 104 x86_init.mpparse.mpc_oem_pci_bus(m);
101 105
102 clear_bit(m->busid, mp_bus_not_pci); 106 clear_bit(m->busid, mp_bus_not_pci);
103#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
@@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
291 1, mpc, mpc->length, 1); 295 1, mpc, mpc->length, 1);
292} 296}
293 297
298void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
299
294static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 300static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
295{ 301{
296 char str[16]; 302 char str[16];
@@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
312 if (early) 318 if (early)
313 return 1; 319 return 1;
314 320
315 if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { 321 if (mpc->oemptr)
316 struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; 322 x86_init.mpparse.smp_read_mpc_oem(mpc);
317 x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
318 }
319 323
320 /* 324 /*
321 * Now process the configuration blocks. 325 * Now process the configuration blocks.
322 */ 326 */
323 if (x86_quirks->mpc_record) 327 x86_init.mpparse.mpc_record(0);
324 *x86_quirks->mpc_record = 0;
325 328
326 while (count < mpc->length) { 329 while (count < mpc->length) {
327 switch (*mpt) { 330 switch (*mpt) {
@@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
353 count = mpc->length; 356 count = mpc->length;
354 break; 357 break;
355 } 358 }
356 if (x86_quirks->mpc_record) 359 x86_init.mpparse.mpc_record(1);
357 (*x86_quirks->mpc_record)++;
358 } 360 }
359 361
360#ifdef CONFIG_X86_BIGSMP 362#ifdef CONFIG_X86_BIGSMP
@@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
608/* 610/*
609 * Scan the memory blocks for an SMP configuration block. 611 * Scan the memory blocks for an SMP configuration block.
610 */ 612 */
611static void __init __get_smp_config(unsigned int early) 613void __init default_get_smp_config(unsigned int early)
612{ 614{
613 struct mpf_intel *mpf = mpf_found; 615 struct mpf_intel *mpf = mpf_found;
614 616
@@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early)
625 if (acpi_lapic && acpi_ioapic) 627 if (acpi_lapic && acpi_ioapic)
626 return; 628 return;
627 629
628 if (x86_quirks->mach_get_smp_config) {
629 if (x86_quirks->mach_get_smp_config(early))
630 return;
631 }
632
633 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 630 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
634 mpf->specification); 631 mpf->specification);
635#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 632#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early)
670 */ 667 */
671} 668}
672 669
673void __init early_get_smp_config(void)
674{
675 __get_smp_config(1);
676}
677
678void __init get_smp_config(void)
679{
680 __get_smp_config(0);
681}
682
683static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
684{ 671{
685 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
@@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
745 return 0; 732 return 0;
746} 733}
747 734
748static void __init __find_smp_config(unsigned int reserve) 735void __init default_find_smp_config(unsigned int reserve)
749{ 736{
750 unsigned int address; 737 unsigned int address;
751 738
752 if (x86_quirks->mach_find_smp_config) {
753 if (x86_quirks->mach_find_smp_config(reserve))
754 return;
755 }
756 /* 739 /*
757 * FIXME: Linux assumes you have 640K of base ram.. 740 * FIXME: Linux assumes you have 640K of base ram..
758 * this continues the error... 741 * this continues the error...
@@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve)
787 smp_scan_config(address, 0x400, reserve); 770 smp_scan_config(address, 0x400, reserve);
788} 771}
789 772
790void __init early_find_smp_config(void)
791{
792 __find_smp_config(0);
793}
794
795void __init find_smp_config(void)
796{
797 __find_smp_config(1);
798}
799
800#ifdef CONFIG_X86_IO_APIC 773#ifdef CONFIG_X86_IO_APIC
801static u8 __initdata irq_used[MAX_IRQ_SOURCES]; 774static u8 __initdata irq_used[MAX_IRQ_SOURCES];
802 775
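
Throughout this file the `if (x86_quirks->hook) hook()` tests are replaced by hooks that default to a no-op (or a default_* implementation), so callers can invoke them unconditionally. A small sketch of the idiom, using made-up names:

#include <stdio.h>

static void mpc_record_noop(int count) { (void)count; }

struct mpparse_hooks {
	void (*mpc_record)(int count);
};

static struct mpparse_hooks mpparse = { .mpc_record = mpc_record_noop };

static void demo_subarch_mpc_record(int count)
{
	printf("subarch records translation entry %d\n", count);
}

int main(void)
{
	mpparse.mpc_record(0);                        /* default: does nothing */
	mpparse.mpc_record = demo_subarch_mpc_record; /* a subarch installs its hook */
	mpparse.mpc_record(1);
	return 0;
}
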
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
new file mode 100644
index 000000000000..3b7078abc871
--- /dev/null
+++ b/arch/x86/kernel/mrst.c
@@ -0,0 +1,24 @@
1/*
2 * mrst.c: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/init.h>
13
14#include <asm/setup.h>
15
16/*
17 * Moorestown specific x86_init function overrides and early setup
18 * calls.
19 */
20void __init x86_mrst_early_setup(void)
21{
22 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop;
24}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7dd950094178..6a3cefc7dda1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -241,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
241 .notifier_call = msr_class_cpu_callback, 241 .notifier_call = msr_class_cpu_callback,
242}; 242};
243 243
244static char *msr_nodename(struct device *dev) 244static char *msr_devnode(struct device *dev, mode_t *mode)
245{ 245{
246 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); 246 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
247} 247}
@@ -262,7 +262,7 @@ static int __init msr_init(void)
262 err = PTR_ERR(msr_class); 262 err = PTR_ERR(msr_class);
263 goto out_chrdev; 263 goto out_chrdev;
264 } 264 }
265 msr_class->nodename = msr_nodename; 265 msr_class->devnode = msr_devnode;
266 for_each_online_cpu(i) { 266 for_each_online_cpu(i) {
267 err = msr_device_create(i); 267 err = msr_device_create(i);
268 if (err != 0) 268 if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index f5b0b4a01fb2..1b1739d16310 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x)
54 return x; 54 return x;
55} 55}
56 56
57static void __init default_banner(void) 57void __init default_banner(void)
58{ 58{
59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
60 pv_info.name); 60 pv_info.name);
61} 61}
62 62
63char *memory_setup(void)
64{
65 return pv_init_ops.memory_setup();
66}
67
68/* Simple instruction patching code. */ 63/* Simple instruction patching code. */
69#define DEF_NATIVE(ops, name, code) \ 64#define DEF_NATIVE(ops, name, code) \
70 extern const char start_##ops##_##name[], end_##ops##_##name[]; \ 65 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
@@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
188 return insn_len; 183 return insn_len;
189} 184}
190 185
191void init_IRQ(void)
192{
193 pv_irq_ops.init_IRQ();
194}
195
196static void native_flush_tlb(void) 186static void native_flush_tlb(void)
197{ 187{
198 __native_flush_tlb(); 188 __native_flush_tlb();
@@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void);
218extern void native_usergs_sysret32(void); 208extern void native_usergs_sysret32(void);
219extern void native_usergs_sysret64(void); 209extern void native_usergs_sysret64(void);
220 210
221static int __init print_banner(void)
222{
223 pv_init_ops.banner();
224 return 0;
225}
226core_initcall(print_banner);
227
228static struct resource reserve_ioports = { 211static struct resource reserve_ioports = {
229 .start = 0, 212 .start = 0,
230 .end = IO_SPACE_LIMIT, 213 .end = IO_SPACE_LIMIT,
@@ -320,21 +303,13 @@ struct pv_info pv_info = {
320 303
321struct pv_init_ops pv_init_ops = { 304struct pv_init_ops pv_init_ops = {
322 .patch = native_patch, 305 .patch = native_patch,
323 .banner = default_banner,
324 .arch_setup = paravirt_nop,
325 .memory_setup = machine_specific_memory_setup,
326}; 306};
327 307
328struct pv_time_ops pv_time_ops = { 308struct pv_time_ops pv_time_ops = {
329 .time_init = hpet_time_init,
330 .get_wallclock = native_get_wallclock,
331 .set_wallclock = native_set_wallclock,
332 .sched_clock = native_sched_clock, 309 .sched_clock = native_sched_clock,
333 .get_tsc_khz = native_calibrate_tsc,
334}; 310};
335 311
336struct pv_irq_ops pv_irq_ops = { 312struct pv_irq_ops pv_irq_ops = {
337 .init_IRQ = native_init_IRQ,
338 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), 313 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
339 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), 314 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
340 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), 315 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
@@ -409,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = {
409 384
410struct pv_apic_ops pv_apic_ops = { 385struct pv_apic_ops pv_apic_ops = {
411#ifdef CONFIG_X86_LOCAL_APIC 386#ifdef CONFIG_X86_LOCAL_APIC
412 .setup_boot_clock = setup_boot_APIC_clock,
413 .setup_secondary_clock = setup_secondary_APIC_clock,
414 .startup_ipi_hook = paravirt_nop, 387 .startup_ipi_hook = paravirt_nop,
415#endif 388#endif
416}; 389};
@@ -424,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = {
424#endif 397#endif
425 398
426struct pv_mmu_ops pv_mmu_ops = { 399struct pv_mmu_ops pv_mmu_ops = {
427#ifndef CONFIG_X86_64
428 .pagetable_setup_start = native_pagetable_setup_start,
429 .pagetable_setup_done = native_pagetable_setup_done,
430#else
431 .pagetable_setup_start = paravirt_nop,
432 .pagetable_setup_done = paravirt_nop,
433#endif
434 400
435 .read_cr2 = native_read_cr2, 401 .read_cr2 = native_read_cr2,
436 .write_cr2 = native_write_cr2, 402 .write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d71c8655905b..b2a71dca5642 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -35,7 +35,7 @@ int iommu_detected __read_mostly = 0;
35 35
36/* 36/*
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 37 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
38 * If this variable is 1, IOMMU implementations do no DMA ranslation for 38 * If this variable is 1, IOMMU implementations do no DMA translation for
39 * devices and allow every device to access to whole physical memory. This is 39 * devices and allow every device to access to whole physical memory. This is
40 * useful if a user want to use an IOMMU only for KVM device assignment to 40 * useful if a user want to use an IOMMU only for KVM device assignment to
41 * guests and not for driver dma translation. 41 * guests and not for driver dma translation.
@@ -225,10 +225,8 @@ static __init int iommu_setup(char *p)
225 if (!strncmp(p, "soft", 4)) 225 if (!strncmp(p, "soft", 4))
226 swiotlb = 1; 226 swiotlb = 1;
227#endif 227#endif
228 if (!strncmp(p, "pt", 2)) { 228 if (!strncmp(p, "pt", 2))
229 iommu_pass_through = 1; 229 iommu_pass_through = 1;
230 return 1;
231 }
232 230
233 gart_parse_options(p); 231 gart_parse_options(p);
234 232
@@ -313,7 +311,7 @@ void pci_iommu_shutdown(void)
313 amd_iommu_shutdown(); 311 amd_iommu_shutdown();
314} 312}
315/* Must execute after PCI subsystem */ 313/* Must execute after PCI subsystem */
316fs_initcall(pci_iommu_init); 314rootfs_initcall(pci_iommu_init);
317 315
318#ifdef CONFIG_PCI 316#ifdef CONFIG_PCI
319/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ 317/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 98a827ee9ed7..a7f1b64f86e0 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -16,6 +16,7 @@
16#include <linux/agp_backend.h> 16#include <linux/agp_backend.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/sched.h>
19#include <linux/string.h> 20#include <linux/string.h>
20#include <linux/spinlock.h> 21#include <linux/spinlock.h>
21#include <linux/pci.h> 22#include <linux/pci.h>
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index e8a35016115f..aaa6b7839f1e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -46,9 +46,8 @@ void __init pci_swiotlb_init(void)
46{ 46{
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 48#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || 49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
50 iommu_pass_through) 50 swiotlb = 1;
51 swiotlb = 1;
52#endif 51#endif
53 if (swiotlb_force) 52 if (swiotlb_force)
54 swiotlb = 1; 53 swiotlb = 1;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1092a1a2fbe6..2275ce5776de 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <trace/power.h> 12#include <trace/events/power.h>
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h> 15#include <asm/syscalls.h>
@@ -27,9 +27,6 @@ EXPORT_SYMBOL(idle_nomwait);
27 27
28struct kmem_cache *task_xstate_cachep; 28struct kmem_cache *task_xstate_cachep;
29 29
30DEFINE_TRACE(power_start);
31DEFINE_TRACE(power_end);
32
33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 30int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
34{ 31{
35 *dst = *src; 32 *dst = *src;
@@ -289,9 +286,7 @@ static inline int hlt_use_halt(void)
289void default_idle(void) 286void default_idle(void)
290{ 287{
291 if (hlt_use_halt()) { 288 if (hlt_use_halt()) {
292 struct power_trace it; 289 trace_power_start(POWER_CSTATE, 1);
293
294 trace_power_start(&it, POWER_CSTATE, 1);
295 current_thread_info()->status &= ~TS_POLLING; 290 current_thread_info()->status &= ~TS_POLLING;
296 /* 291 /*
297 * TS_POLLING-cleared state must be visible before we 292 * TS_POLLING-cleared state must be visible before we
@@ -304,7 +299,6 @@ void default_idle(void)
304 else 299 else
305 local_irq_enable(); 300 local_irq_enable();
306 current_thread_info()->status |= TS_POLLING; 301 current_thread_info()->status |= TS_POLLING;
307 trace_power_end(&it);
308 } else { 302 } else {
309 local_irq_enable(); 303 local_irq_enable();
310 /* loop is done by the caller */ 304 /* loop is done by the caller */
@@ -362,9 +356,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
362 */ 356 */
363void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 357void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
364{ 358{
365 struct power_trace it; 359 trace_power_start(POWER_CSTATE, (ax>>4)+1);
366
367 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
368 if (!need_resched()) { 360 if (!need_resched()) {
369 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 361 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
370 clflush((void *)&current_thread_info()->flags); 362 clflush((void *)&current_thread_info()->flags);
@@ -374,15 +366,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
374 if (!need_resched()) 366 if (!need_resched())
375 __mwait(ax, cx); 367 __mwait(ax, cx);
376 } 368 }
377 trace_power_end(&it);
378} 369}
379 370
380/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 371/* Default MONITOR/MWAIT with no hints, used for default C1 state */
381static void mwait_idle(void) 372static void mwait_idle(void)
382{ 373{
383 struct power_trace it;
384 if (!need_resched()) { 374 if (!need_resched()) {
385 trace_power_start(&it, POWER_CSTATE, 1); 375 trace_power_start(POWER_CSTATE, 1);
386 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 376 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
387 clflush((void *)&current_thread_info()->flags); 377 clflush((void *)&current_thread_info()->flags);
388 378
@@ -392,7 +382,6 @@ static void mwait_idle(void)
392 __sti_mwait(0, 0); 382 __sti_mwait(0, 0);
393 else 383 else
394 local_irq_enable(); 384 local_irq_enable();
395 trace_power_end(&it);
396 } else 385 } else
397 local_irq_enable(); 386 local_irq_enable();
398} 387}
@@ -404,13 +393,11 @@ static void mwait_idle(void)
404 */ 393 */
405static void poll_idle(void) 394static void poll_idle(void)
406{ 395{
407 struct power_trace it; 396 trace_power_start(POWER_CSTATE, 0);
408
409 trace_power_start(&it, POWER_CSTATE, 0);
410 local_irq_enable(); 397 local_irq_enable();
411 while (!need_resched()) 398 while (!need_resched())
412 cpu_relax(); 399 cpu_relax();
413 trace_power_end(&it); 400 trace_power_end(0);
414} 401}
415 402
416/* 403/*
@@ -558,10 +545,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
558void __init init_c1e_mask(void) 545void __init init_c1e_mask(void)
559{ 546{
560 /* If we're using c1e_idle, we need to allocate c1e_mask. */ 547 /* If we're using c1e_idle, we need to allocate c1e_mask. */
561 if (pm_idle == c1e_idle) { 548 if (pm_idle == c1e_idle)
562 alloc_cpumask_var(&c1e_mask, GFP_KERNEL); 549 zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
563 cpumask_clear(c1e_mask);
564 }
565} 550}
566 551
567static int __init idle_setup(char *str) 552static int __init idle_setup(char *str)
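
The init_c1e_mask() hunk above swaps an allocate-then-clear pair for a single zeroed allocation (zalloc_cpumask_var). The same simplification in plain C, as an analogy only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t nbytes = 128 / 8;	/* a 128-bit mask, for the demo */

	/* old shape: allocate, then clear in a second step */
	unsigned char *mask_old = malloc(nbytes);
	if (mask_old)
		memset(mask_old, 0, nbytes);

	/* new shape: one call that returns zeroed memory */
	unsigned char *mask_new = calloc(1, nbytes);

	printf("old[0]=%u new[0]=%u\n",
	       mask_old ? (unsigned)mask_old[0] : 0u,
	       mask_new ? (unsigned)mask_new[0] : 0u);
	free(mask_old);
	free(mask_new);
	return 0;
}
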
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 113b8927c822..267cb85b479c 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -312,16 +312,6 @@ static int putreg(struct task_struct *child,
312 return set_flags(child, value); 312 return set_flags(child, value);
313 313
314#ifdef CONFIG_X86_64 314#ifdef CONFIG_X86_64
315 /*
316 * Orig_ax is really just a flag with small positive and
317 * negative values, so make sure to always sign-extend it
318 * from 32 bits so that it works correctly regardless of
319 * whether we come from a 32-bit environment or not.
320 */
321 case offsetof(struct user_regs_struct, orig_ax):
322 value = (long) (s32) value;
323 break;
324
325 case offsetof(struct user_regs_struct,fs_base): 315 case offsetof(struct user_regs_struct,fs_base):
326 if (value >= TASK_SIZE_OF(child)) 316 if (value >= TASK_SIZE_OF(child))
327 return -EIO; 317 return -EIO;
@@ -1177,10 +1167,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1177 1167
1178 case offsetof(struct user32, regs.orig_eax): 1168 case offsetof(struct user32, regs.orig_eax):
1179 /* 1169 /*
1180 * Sign-extend the value so that orig_eax = -1 1170 * A 32-bit debugger setting orig_eax means to restore
1181 * causes (long)orig_ax < 0 tests to fire correctly. 1171 * the state of the task restarting a 32-bit syscall.
1172 * Make sure we interpret the -ERESTART* codes correctly
1173 * in case the task is not actually still sitting at the
1174 * exit from a 32-bit syscall with TS_COMPAT still set.
1182 */ 1175 */
1183 regs->orig_ax = (long) (s32) value; 1176 regs->orig_ax = value;
1177 if (syscall_get_nr(child, regs) >= 0)
1178 task_thread_info(child)->status |= TS_COMPAT;
1184 break; 1179 break;
1185 1180
1186 case offsetof(struct user32, regs.eflags): 1181 case offsetof(struct user32, regs.eflags):
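
The removed putreg() case and the rewritten putreg32() case both revolve around how a 32-bit 0xffffffff written into orig_eax should be seen by a 64-bit kernel. A worked example of the two widenings:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t value = 0xffffffffu;	/* what a 32-bit debugger writes */

	long widened_zero_ext = (long)value;           /* 4294967295 on 64-bit */
	long widened_sign_ext = (long)(int32_t)value;  /* -1 */

	printf("zero-extended: %ld (negative? %d)\n",
	       widened_zero_ext, widened_zero_ext < 0);
	printf("sign-extended: %ld (negative? %d)\n",
	       widened_sign_ext, widened_sign_ext < 0);
	return 0;
}
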
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624bf..6c3b2c6fd772 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
508 508
509 pci_read_config_dword(nb_ht, 0x60, &val); 509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev); 511 pci_dev_put(nb_ht);
512} 512}
513 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
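
The quirk_amd_nb_node() fix is a reference-count pairing bug: the reference obtained on the northbridge HT device must be dropped on that same device, not on the device the quirk ran for. An illustrative (non-PCI) sketch of the pairing rule:

#include <stdio.h>

struct dev { const char *name; int refcount; };

static struct dev *dev_get(struct dev *d) { d->refcount++; return d; }
static void dev_put(struct dev *d)        { d->refcount--; }

int main(void)
{
	struct dev quirk_dev = { "device under quirk", 1 };
	struct dev nb_ht     = { "AMD NB HT bridge",   1 };

	struct dev *held = dev_get(&nb_ht);	/* like pci_get_device() */
	/* ... read config space of 'held' ... */
	dev_put(held);				/* correct: balances dev_get(&nb_ht) */

	printf("%s refcount=%d, %s refcount=%d\n",
	       quirk_dev.name, quirk_dev.refcount, nb_ht.name, nb_ht.refcount);
	return 0;
}
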
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d101844..a1a3cdda06e1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,8 @@
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h>
8#include <linux/tboot.h>
7#include <acpi/reboot.h> 9#include <acpi/reboot.h>
8#include <asm/io.h> 10#include <asm/io.h>
9#include <asm/apic.h> 11#include <asm/apic.h>
@@ -508,6 +510,8 @@ static void native_machine_emergency_restart(void)
508 if (reboot_emergency) 510 if (reboot_emergency)
509 emergency_vmx_disable_all(); 511 emergency_vmx_disable_all();
510 512
513 tboot_shutdown(TB_SHUTDOWN_REBOOT);
514
511 /* Tell the BIOS if we want cold or warm reboot */ 515 /* Tell the BIOS if we want cold or warm reboot */
512 *((unsigned short *)__va(0x472)) = reboot_mode; 516 *((unsigned short *)__va(0x472)) = reboot_mode;
513 517
@@ -634,6 +638,8 @@ static void native_machine_halt(void)
634 /* stop other cpus and apics */ 638 /* stop other cpus and apics */
635 machine_shutdown(); 639 machine_shutdown();
636 640
641 tboot_shutdown(TB_SHUTDOWN_HALT);
642
637 /* stop this cpu */ 643 /* stop this cpu */
638 stop_this_cpu(NULL); 644 stop_this_cpu(NULL);
639} 645}
@@ -645,6 +651,8 @@ static void native_machine_power_off(void)
645 machine_shutdown(); 651 machine_shutdown();
646 pm_power_off(); 652 pm_power_off();
647 } 653 }
654 /* a fallback in case there is no PM info available */
655 tboot_shutdown(TB_SHUTDOWN_HALT);
648} 656}
649 657
650struct machine_ops machine_ops = { 658struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 5d465b207e72..1cfbbfc3ae26 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -8,6 +8,7 @@
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9 9
10#include <asm/vsyscall.h> 10#include <asm/vsyscall.h>
11#include <asm/x86_init.h>
11#include <asm/time.h> 12#include <asm/time.h>
12 13
13#ifdef CONFIG_X86_32 14#ifdef CONFIG_X86_32
@@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
165} 166}
166EXPORT_SYMBOL(rtc_cmos_write); 167EXPORT_SYMBOL(rtc_cmos_write);
167 168
168static int set_rtc_mmss(unsigned long nowtime) 169int update_persistent_clock(struct timespec now)
169{ 170{
170 unsigned long flags; 171 unsigned long flags;
171 int retval; 172 int retval;
172 173
173 spin_lock_irqsave(&rtc_lock, flags); 174 spin_lock_irqsave(&rtc_lock, flags);
174 retval = set_wallclock(nowtime); 175 retval = x86_platform.set_wallclock(now.tv_sec);
175 spin_unlock_irqrestore(&rtc_lock, flags); 176 spin_unlock_irqrestore(&rtc_lock, flags);
176 177
177 return retval; 178 return retval;
178} 179}
179 180
180/* not static: needed by APM */ 181/* not static: needed by APM */
181unsigned long read_persistent_clock(void) 182void read_persistent_clock(struct timespec *ts)
182{ 183{
183 unsigned long retval, flags; 184 unsigned long retval, flags;
184 185
185 spin_lock_irqsave(&rtc_lock, flags); 186 spin_lock_irqsave(&rtc_lock, flags);
186 retval = get_wallclock(); 187 retval = x86_platform.get_wallclock();
187 spin_unlock_irqrestore(&rtc_lock, flags); 188 spin_unlock_irqrestore(&rtc_lock, flags);
188 189
189 return retval; 190 ts->tv_sec = retval;
190} 191 ts->tv_nsec = 0;
191
192int update_persistent_clock(struct timespec now)
193{
194 return set_rtc_mmss(now.tv_sec);
195} 192}
196 193
197unsigned long long native_read_tsc(void) 194unsigned long long native_read_tsc(void)
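
read_persistent_clock() now fills a struct timespec rather than returning seconds, which leaves room for sub-second resolution even though the CMOS RTC only provides seconds. A self-contained sketch of the new shape (demo_read_wallclock stands in for the x86_platform hook):

#include <stdio.h>
#include <time.h>

static unsigned long demo_read_wallclock(void)
{
	return (unsigned long)time(NULL);	/* an RTC read in the real code */
}

static void read_persistent_clock_demo(struct timespec *ts)
{
	ts->tv_sec  = demo_read_wallclock();
	ts->tv_nsec = 0;			/* the CMOS RTC has 1s resolution */
}

int main(void)
{
	struct timespec ts;

	read_persistent_clock_demo(&ts);
	printf("persistent clock: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
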
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef2..e09f0e2c14b5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -27,6 +27,7 @@
27#include <linux/screen_info.h> 27#include <linux/screen_info.h>
28#include <linux/ioport.h> 28#include <linux/ioport.h>
29#include <linux/acpi.h> 29#include <linux/acpi.h>
30#include <linux/sfi.h>
30#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
31#include <linux/initrd.h> 32#include <linux/initrd.h>
32#include <linux/bootmem.h> 33#include <linux/bootmem.h>
@@ -66,6 +67,7 @@
66 67
67#include <linux/percpu.h> 68#include <linux/percpu.h>
68#include <linux/crash_dump.h> 69#include <linux/crash_dump.h>
70#include <linux/tboot.h>
69 71
70#include <video/edid.h> 72#include <video/edid.h>
71 73
@@ -108,10 +110,6 @@
108#include <asm/numa_64.h> 110#include <asm/numa_64.h>
109#endif 111#endif
110 112
111#ifndef ARCH_SETUP
112#define ARCH_SETUP
113#endif
114
115/* 113/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access 115 * The direct mapping extends to max_pfn_mapped, so that we can directly access
@@ -133,9 +131,9 @@ int default_cpu_present_to_apicid(int mps_cpu)
133 return __default_cpu_present_to_apicid(mps_cpu); 131 return __default_cpu_present_to_apicid(mps_cpu);
134} 132}
135 133
136int default_check_phys_apicid_present(int boot_cpu_physical_apicid) 134int default_check_phys_apicid_present(int phys_apicid)
137{ 135{
138 return __default_check_phys_apicid_present(boot_cpu_physical_apicid); 136 return __default_check_phys_apicid_present(phys_apicid);
139} 137}
140#endif 138#endif
141 139
@@ -171,13 +169,6 @@ static struct resource bss_resource = {
171 169
172 170
173#ifdef CONFIG_X86_32 171#ifdef CONFIG_X86_32
174static struct resource video_ram_resource = {
175 .name = "Video RAM area",
176 .start = 0xa0000,
177 .end = 0xbffff,
178 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
179};
180
181/* cpu data as detected by the assembly code in head.S */ 172/* cpu data as detected by the assembly code in head.S */
182struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; 173struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
183/* common cpu data for all cpus */ 174/* common cpu data for all cpus */
@@ -605,7 +596,7 @@ static struct resource standard_io_resources[] = {
605 .flags = IORESOURCE_BUSY | IORESOURCE_IO } 596 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
606}; 597};
607 598
608static void __init reserve_standard_io_resources(void) 599void __init reserve_standard_io_resources(void)
609{ 600{
610 int i; 601 int i;
611 602
@@ -637,10 +628,6 @@ static int __init setup_elfcorehdr(char *arg)
637early_param("elfcorehdr", setup_elfcorehdr); 628early_param("elfcorehdr", setup_elfcorehdr);
638#endif 629#endif
639 630
640static struct x86_quirks default_x86_quirks __initdata;
641
642struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
643
644#ifdef CONFIG_X86_RESERVE_LOW_64K 631#ifdef CONFIG_X86_RESERVE_LOW_64K
645static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 632static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
646{ 633{
@@ -757,7 +744,7 @@ void __init setup_arch(char **cmdline_p)
757 } 744 }
758#endif 745#endif
759 746
760 ARCH_SETUP 747 x86_init.oem.arch_setup();
761 748
762 setup_memory_map(); 749 setup_memory_map();
763 parse_setup_data(); 750 parse_setup_data();
@@ -796,6 +783,16 @@ void __init setup_arch(char **cmdline_p)
796 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 783 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
797 *cmdline_p = command_line; 784 *cmdline_p = command_line;
798 785
786#ifdef CONFIG_X86_64
787 /*
 788 * Must call this twice: once just to detect whether hardware doesn't
 789 * support NX (so that the early EHCI debug console setup can safely
 790 * call set_fixmap()), and then again after parsing early parameters to
791 * honor the respective command line option.
792 */
793 check_efer();
794#endif
795
799 parse_early_param(); 796 parse_early_param();
800 797
801#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -833,11 +830,9 @@ void __init setup_arch(char **cmdline_p)
833 * VMware detection requires dmi to be available, so this 830 * VMware detection requires dmi to be available, so this
834 * needs to be done after dmi_scan_machine, for the BP. 831 * needs to be done after dmi_scan_machine, for the BP.
835 */ 832 */
836 init_hypervisor(&boot_cpu_data); 833 init_hypervisor_platform();
837 834
838#ifdef CONFIG_X86_32 835 x86_init.resources.probe_roms();
839 probe_roms();
840#endif
841 836
842 /* after parse_early_param, so could debug it */ 837 /* after parse_early_param, so could debug it */
843 insert_resource(&iomem_resource, &code_resource); 838 insert_resource(&iomem_resource, &code_resource);
@@ -972,10 +967,11 @@ void __init setup_arch(char **cmdline_p)
972 kvmclock_init(); 967 kvmclock_init();
973#endif 968#endif
974 969
975 paravirt_pagetable_setup_start(swapper_pg_dir); 970 x86_init.paging.pagetable_setup_start(swapper_pg_dir);
976 paging_init(); 971 paging_init();
977 paravirt_pagetable_setup_done(swapper_pg_dir); 972 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
978 paravirt_post_allocator_init(); 973
974 tboot_probe();
979 975
980#ifdef CONFIG_X86_64 976#ifdef CONFIG_X86_64
981 map_vsyscall(); 977 map_vsyscall();
@@ -990,13 +986,13 @@ void __init setup_arch(char **cmdline_p)
990 */ 986 */
991 acpi_boot_init(); 987 acpi_boot_init();
992 988
993#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) 989 sfi_init();
990
994 /* 991 /*
995 * get boot-time SMP configuration: 992 * get boot-time SMP configuration:
996 */ 993 */
997 if (smp_found_config) 994 if (smp_found_config)
998 get_smp_config(); 995 get_smp_config();
999#endif
1000 996
1001 prefill_possible_map(); 997 prefill_possible_map();
1002 998
@@ -1015,10 +1011,7 @@ void __init setup_arch(char **cmdline_p)
1015 e820_reserve_resources(); 1011 e820_reserve_resources();
1016 e820_mark_nosave_regions(max_low_pfn); 1012 e820_mark_nosave_regions(max_low_pfn);
1017 1013
1018#ifdef CONFIG_X86_32 1014 x86_init.resources.reserve_resources();
1019 request_resource(&iomem_resource, &video_ram_resource);
1020#endif
1021 reserve_standard_io_resources();
1022 1015
1023 e820_setup_gap(); 1016 e820_setup_gap();
1024 1017
@@ -1030,78 +1023,22 @@ void __init setup_arch(char **cmdline_p)
1030 conswitchp = &dummy_con; 1023 conswitchp = &dummy_con;
1031#endif 1024#endif
1032#endif 1025#endif
1026 x86_init.oem.banner();
1033} 1027}
1034 1028
1035#ifdef CONFIG_X86_32 1029#ifdef CONFIG_X86_32
1036 1030
1037/** 1031static struct resource video_ram_resource = {
1038 * x86_quirk_intr_init - post gate setup interrupt initialisation 1032 .name = "Video RAM area",
1039 * 1033 .start = 0xa0000,
1040 * Description: 1034 .end = 0xbffff,
1041 * Fill in any interrupts that may have been left out by the general 1035 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1042 * init_IRQ() routine. interrupts having to do with the machine rather
1043 * than the devices on the I/O bus (like APIC interrupts in intel MP
1044 * systems) are started here.
1045 **/
1046void __init x86_quirk_intr_init(void)
1047{
1048 if (x86_quirks->arch_intr_init) {
1049 if (x86_quirks->arch_intr_init())
1050 return;
1051 }
1052}
1053
1054/**
1055 * x86_quirk_trap_init - initialise system specific traps
1056 *
1057 * Description:
1058 * Called as the final act of trap_init(). Used in VISWS to initialise
1059 * the various board specific APIC traps.
1060 **/
1061void __init x86_quirk_trap_init(void)
1062{
1063 if (x86_quirks->arch_trap_init) {
1064 if (x86_quirks->arch_trap_init())
1065 return;
1066 }
1067}
1068
1069static struct irqaction irq0 = {
1070 .handler = timer_interrupt,
1071 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
1072 .name = "timer"
1073}; 1036};
1074 1037
1075/** 1038void __init i386_reserve_resources(void)
1076 * x86_quirk_pre_time_init - do any specific initialisations before.
1077 *
1078 **/
1079void __init x86_quirk_pre_time_init(void)
1080{ 1039{
1081 if (x86_quirks->arch_pre_time_init) 1040 request_resource(&iomem_resource, &video_ram_resource);
1082 x86_quirks->arch_pre_time_init(); 1041 reserve_standard_io_resources();
1083} 1042}
1084 1043
1085/**
1086 * x86_quirk_time_init - do any specific initialisations for the system timer.
1087 *
1088 * Description:
1089 * Must plug the system timer interrupt source at HZ into the IRQ listed
1090 * in irq_vectors.h:TIMER_IRQ
1091 **/
1092void __init x86_quirk_time_init(void)
1093{
1094 if (x86_quirks->arch_time_init) {
1095 /*
1096 * A nonzero return code does not mean failure, it means
1097 * that the architecture quirk does not want any
1098 * generic (timer) setup to be performed after this:
1099 */
1100 if (x86_quirks->arch_time_init())
1101 return;
1102 }
1103
1104 irq0.mask = cpumask_of_cpu(0);
1105 setup_irq(0, &irq0);
1106}
1107#endif /* CONFIG_X86_32 */ 1044#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f212..d559af913e1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
55#define PERCPU_FIRST_CHUNK_RESERVE 0 55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif 56#endif
57 57
58#ifdef CONFIG_X86_32
58/** 59/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA 60 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 * 61 *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
83#endif 84#endif
84 return false; 85 return false;
85} 86}
87#endif
86 88
87/** 89/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu 90 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 126}
125 127
126/* 128/*
127 * Large page remap allocator 129 * Helpers for first chunk memory allocation
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 130 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 131static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
140struct pcpul_ent {
141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
148
149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
150{ 132{
151 size_t off = (size_t)pageno << PAGE_SHIFT; 133 return pcpu_alloc_bootmem(cpu, size, align);
152
153 if (off >= pcpul_size)
154 return NULL;
155
156 return virt_to_page(pcpul_map[cpu].ptr + off);
157} 134}
158 135
159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 136static void __init pcpu_fc_free(void *ptr, size_t size)
160{ 137{
161 size_t map_size, dyn_size; 138 free_bootmem(__pa(ptr), size);
162 unsigned int cpu;
163 int i, j;
164 ssize_t ret;
165
166 if (!chosen) {
167 size_t vm_size = VMALLOC_END - VMALLOC_START;
168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
169
170 /* on non-NUMA, embedding is better */
171 if (!pcpu_need_numa())
172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
185 return -EINVAL;
186 }
187
188 /*
189 * Currently supports only single page. Supporting multiple
190 * pages won't be too difficult if it ever becomes necessary.
191 */
192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
193 PERCPU_DYNAMIC_RESERVE);
194 if (pcpul_size > PMD_SIZE) {
195 pr_warning("PERCPU: static data is larger than large page, "
196 "can't use large page\n");
197 return -EINVAL;
198 }
199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
200
201 /* allocate pointer array and alloc large pages */
202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
203 pcpul_map = alloc_bootmem(map_size);
204
205 for_each_possible_cpu(cpu) {
206 pcpul_map[cpu].cpu = cpu;
207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
212 goto enomem;
213 }
214
215 /*
216 * Only use pcpul_size bytes and give back the rest.
217 *
218 * Ingo: The 2MB up-rounding bootmem is needed to make
219 * sure the partial 2MB page is still fully RAM - it's
220 * not well-specified to have a PAT-incompatible area
221 * (unmapped RAM, device memory, etc.) in that hole.
222 */
223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
224 PMD_SIZE - pcpul_size);
225
226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
227 }
228
229 /* allocate address and map */
230 pcpul_vm.flags = VM_ALLOC;
231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
233
234 for_each_possible_cpu(cpu) {
235 pmd_t *pmd, pmd_v;
236
237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
238 cpu * PMD_SIZE);
239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
242 }
243
244 /* we're ready, commit */
245 pr_info("PERCPU: Remapped at %p with large pages, static data "
246 "%zu bytes\n", pcpul_vm.addr, static_size);
247
248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
250 PMD_SIZE, pcpul_vm.addr, NULL);
251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
262
263enomem:
264 for_each_possible_cpu(cpu)
265 if (pcpul_map[cpu].ptr)
266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
267 free_bootmem(__pa(pcpul_map), map_size);
268 return -ENOMEM;
269} 139}
270 140
271/** 141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{ 142{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); 143#ifdef CONFIG_NEED_MULTIPLE_NODES
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; 144 if (early_cpu_to_node(from) == early_cpu_to_node(to))
291 int left = 0, right = nr_cpu_ids - 1; 145 return LOCAL_DISTANCE;
292 int pos; 146 else
293 147 return REMOTE_DISTANCE;
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
316}
317#else 148#else
318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 149 return LOCAL_DISTANCE;
319{
320 return -EINVAL;
321}
322#endif 150#endif
323
324/*
325 * Embedding allocator
326 *
327 * The first chunk is sized to just contain the static area plus
328 * module and dynamic reserves and embedded into linear physical
329 * mapping so that it can use PMD mapping without additional TLB
330 * pressure.
331 */
332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
333{
334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
335
336 /*
337 * If large page isn't supported, there's no benefit in doing
338 * this. Also, embedding allocation doesn't play well with
339 * NUMA.
340 */
341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
342 return -EINVAL;
343
344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
345 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
346} 151}
347 152
348/* 153static void __init pcpup_populate_pte(unsigned long addr)
349 * 4k page allocator
350 *
351 * This is the basic allocator. Static percpu area is allocated
352 * page-by-page and most of initialization is done by the generic
353 * setup function.
354 */
355static struct page **pcpu4k_pages __initdata;
356static int pcpu4k_nr_static_pages __initdata;
357
358static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
359{
360 if (pageno < pcpu4k_nr_static_pages)
361 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
362 return NULL;
363}
364
365static void __init pcpu4k_populate_pte(unsigned long addr)
366{ 154{
367 populate_extra_pte(addr); 155 populate_extra_pte(addr);
368} 156}
369 157
370static ssize_t __init setup_pcpu_4k(size_t static_size)
371{
372 size_t pages_size;
373 unsigned int cpu;
374 int i, j;
375 ssize_t ret;
376
377 pcpu4k_nr_static_pages = PFN_UP(static_size);
378
379 /* unaligned allocations can't be freed, round up to page size */
380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
381 * sizeof(pcpu4k_pages[0]));
382 pcpu4k_pages = alloc_bootmem(pages_size);
383
384 /* allocate and copy */
385 j = 0;
386 for_each_possible_cpu(cpu)
387 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
388 void *ptr;
389
390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
394 goto enomem;
395 }
396
397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
398 pcpu4k_pages[j++] = virt_to_page(ptr);
399 }
400
401 /* we're ready, commit */
402 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
403 pcpu4k_nr_static_pages, static_size);
404
405 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
406 PERCPU_FIRST_CHUNK_RESERVE, -1,
407 -1, NULL, pcpu4k_populate_pte);
408 goto out_free_ar;
409
410enomem:
411 while (--j >= 0)
412 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
413 ret = -ENOMEM;
414out_free_ar:
415 free_bootmem(__pa(pcpu4k_pages), pages_size);
416 return ret;
417}
418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
429static inline void setup_percpu_segment(int cpu) 158static inline void setup_percpu_segment(int cpu)
430{ 159{
431#ifdef CONFIG_X86_32 160#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
441 170
442void __init setup_per_cpu_areas(void) 171void __init setup_per_cpu_areas(void)
443{ 172{
444 size_t static_size = __per_cpu_end - __per_cpu_start;
445 unsigned int cpu; 173 unsigned int cpu;
446 unsigned long delta; 174 unsigned long delta;
447 size_t pcpu_unit_size; 175 int rc;
448 ssize_t ret;
449 176
450 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 177 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
451 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 178 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
452 179
453 /* 180 /*
454 * Allocate percpu area. If PSE is supported, try to make use 181 * Allocate percpu area. Embedding allocator is our favorite;
455 * of large page mappings. Please read comments on top of 182 * however, on NUMA configurations, it can result in very
456 * each allocator for details. 183 * sparse unit mapping and vmalloc area isn't spacious enough
184 * on 32bit. Use page in that case.
457 */ 185 */
458 ret = -EINVAL; 186#ifdef CONFIG_X86_32
459 if (strlen(pcpu_chosen_alloc)) { 187 if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
460 if (strcmp(pcpu_chosen_alloc, "4k")) { 188 pcpu_chosen_fc = PCPU_FC_PAGE;
461 if (!strcmp(pcpu_chosen_alloc, "lpage")) 189#endif
462 ret = setup_pcpu_lpage(static_size, true); 190 rc = -EINVAL;
463 else if (!strcmp(pcpu_chosen_alloc, "embed")) 191 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
464 ret = setup_pcpu_embed(static_size, true); 192 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
465 else 193 const size_t dyn_size = PERCPU_MODULE_RESERVE +
466 pr_warning("PERCPU: unknown allocator %s " 194 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
467 "specified\n", pcpu_chosen_alloc); 195
468 if (ret < 0) 196 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
469 pr_warning("PERCPU: %s allocator failed (%zd), " 197 dyn_size, atom_size,
470 "falling back to 4k\n", 198 pcpu_cpu_distance,
471 pcpu_chosen_alloc, ret); 199 pcpu_fc_alloc, pcpu_fc_free);
472 } 200 if (rc < 0)
473 } else { 201 pr_warning("PERCPU: %s allocator failed (%d), "
474 ret = setup_pcpu_lpage(static_size, false); 202 "falling back to page size\n",
475 if (ret < 0) 203 pcpu_fc_names[pcpu_chosen_fc], rc);
476 ret = setup_pcpu_embed(static_size, false);
477 } 204 }
478 if (ret < 0) 205 if (rc < 0)
479 ret = setup_pcpu_4k(static_size); 206 rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
480 if (ret < 0) 207 pcpu_fc_alloc, pcpu_fc_free,
481 panic("cannot allocate static percpu area (%zu bytes, err=%zd)", 208 pcpup_populate_pte);
482 static_size, ret); 209 if (rc < 0)
483 210 panic("cannot initialize percpu area (err=%d)", rc);
484 pcpu_unit_size = ret;
485 211
486 /* alrighty, percpu areas up and running */ 212 /* alrighty, percpu areas up and running */
487 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 213 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
488 for_each_possible_cpu(cpu) { 214 for_each_possible_cpu(cpu) {
489 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 215 per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
490 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 216 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
491 per_cpu(cpu_number, cpu) = cpu; 217 per_cpu(cpu_number, cpu) = cpu;
492 setup_percpu_segment(cpu); 218 setup_percpu_segment(cpu);
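
In the new setup_per_cpu_areas() above, each CPU's offset is delta (pcpu_base_addr - __per_cpu_start) plus pcpu_unit_offsets[cpu]. The following is a rough user-space sketch of that addressing idea only; NCPUS, UNIT_SIZE and the helper names are invented for illustration and this is not the kernel's allocator:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define NCPUS     4
	#define UNIT_SIZE 64                    /* assumed size of one per-cpu unit */

	static int  template_area[UNIT_SIZE / sizeof(int)]; /* static per-cpu template */
	static char *chunk;                     /* stands in for pcpu_base_addr      */
	static long cpu_offset[NCPUS];          /* stands in for per_cpu_offset(cpu) */

	/* reach CPU "cpu"'s copy of a variable whose template copy is in template_area */
	#define per_cpu_ptr(p, cpu)  ((void *)((char *)(p) + cpu_offset[cpu]))

	int main(void)
	{
		int *counter = &template_area[0];       /* a "per-cpu" int */
		int cpu;

		chunk = malloc(NCPUS * UNIT_SIZE);
		for (cpu = 0; cpu < NCPUS; cpu++) {
			long unit_off = cpu * UNIT_SIZE;  /* like pcpu_unit_offsets[cpu] */

			/* copy the static template into this CPU's unit */
			memcpy(chunk + unit_off, template_area, UNIT_SIZE);
			/* delta from template to chunk, plus this CPU's unit offset */
			cpu_offset[cpu] = (chunk - (char *)template_area) + unit_off;
		}

		for (cpu = 0; cpu < NCPUS; cpu++)
			*(int *)per_cpu_ptr(counter, cpu) = cpu * 10;

		for (cpu = 0; cpu < NCPUS; cpu++)
			printf("cpu%d counter = %d\n", cpu,
			       *(int *)per_cpu_ptr(counter, cpu));
		return 0;
	}

(The subtraction of unrelated pointers is formally undefined in ISO C; it is shown only to mirror how the delta between __per_cpu_start and pcpu_base_addr is used.)
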
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
new file mode 100644
index 000000000000..34e099382651
--- /dev/null
+++ b/arch/x86/kernel/sfi.c
@@ -0,0 +1,122 @@
1/*
2 * sfi.c - x86 architecture SFI support.
3 *
4 * Copyright (c) 2009, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#define KMSG_COMPONENT "SFI"
22#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24#include <linux/acpi.h>
25#include <linux/init.h>
26#include <linux/sfi.h>
27#include <linux/io.h>
28
29#include <asm/io_apic.h>
30#include <asm/mpspec.h>
31#include <asm/setup.h>
32#include <asm/apic.h>
33
34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36
37void __init mp_sfi_register_lapic_address(unsigned long address)
38{
39 mp_lapic_addr = address;
40
41 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
42 if (boot_cpu_physical_apicid == -1U)
43 boot_cpu_physical_apicid = read_apic_id();
44
45 pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
46}
47
48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id)
50{
51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS);
54 return;
55 }
56
57 pr_info("registering lapic[%d]\n", id);
58
59 generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
60}
61
62static int __init sfi_parse_cpus(struct sfi_table_header *table)
63{
64 struct sfi_table_simple *sb;
65 struct sfi_cpu_table_entry *pentry;
66 int i;
67 int cpu_num;
68
69 sb = (struct sfi_table_simple *)table;
70 cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
71 pentry = (struct sfi_cpu_table_entry *)sb->pentry;
72
73 for (i = 0; i < cpu_num; i++) {
74 mp_sfi_register_lapic(pentry->apic_id);
75 pentry++;
76 }
77
78 smp_found_config = 1;
79 return 0;
80}
81#endif /* CONFIG_X86_LOCAL_APIC */
82
83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85
86static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{
88 struct sfi_table_simple *sb;
89 struct sfi_apic_table_entry *pentry;
90 int i, num;
91
92 sb = (struct sfi_table_simple *)table;
93 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
94 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95
96 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++;
100 }
101
102 WARN(pic_mode, KERN_WARNING
 103	"SFI: pic_mode shouldn't be 1 when IOAPIC table is present\n");
104 pic_mode = 0;
105 return 0;
106}
107#endif /* CONFIG_X86_IO_APIC */
108
109/*
110 * sfi_platform_init(): register lapics & io-apics
111 */
112int __init sfi_platform_init(void)
113{
114#ifdef CONFIG_X86_LOCAL_APIC
115 mp_sfi_register_lapic_address(sfi_lapic_addr);
116 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
117#endif
118#ifdef CONFIG_X86_IO_APIC
119 sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
120#endif
121 return 0;
122}
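
The CPU and IO-APIC parsers above share one pattern: an SFI "simple table" is a header followed by fixed-size records, and SFI_GET_NUM_ENTRIES() derives the record count from the table length. A minimal stand-alone sketch of that pattern (hypothetical structures and values, not the real SFI layout):

	#include <stdio.h>
	#include <stdint.h>

	/* hypothetical header: only the total table length matters here */
	struct tbl_header {
		uint32_t length;            /* header + all entries, in bytes */
	};

	struct cpu_entry {
		uint32_t apic_id;
	};

	/* same idea as SFI_GET_NUM_ENTRIES(): entries = payload size / entry size */
	#define NUM_ENTRIES(hdr, type) \
		(((hdr)->length - sizeof(struct tbl_header)) / sizeof(type))

	static void parse_cpus(struct tbl_header *hdr)
	{
		struct cpu_entry *e = (struct cpu_entry *)(hdr + 1);
		unsigned int i, n = NUM_ENTRIES(hdr, struct cpu_entry);

		for (i = 0; i < n; i++, e++)
			printf("registering lapic[%u]\n", (unsigned int)e->apic_id);
	}

	int main(void)
	{
		static struct {
			struct tbl_header hdr;
			struct cpu_entry cpus[3];
		} tbl = {
			{ sizeof(tbl) },
			{ { 0 }, { 2 }, { 4 } }
		};

		parse_cpus(&tbl.hdr);
		return 0;
	}
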
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index baaf8052f355..fbf3b07c8567 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -847,7 +847,7 @@ static void do_signal(struct pt_regs *regs)
847void 847void
848do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 848do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
849{ 849{
850#ifdef CONFIG_X86_NEW_MCE 850#ifdef CONFIG_X86_MCE
851 /* notify userspace of pending MCEs */ 851 /* notify userspace of pending MCEs */
852 if (thread_info_flags & _TIF_MCE_NOTIFY) 852 if (thread_info_flags & _TIF_MCE_NOTIFY)
853 mce_notify_process(); 853 mce_notify_process();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ec7b64c2df82..213a7a3e4562 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
47#include <linux/bootmem.h> 47#include <linux/bootmem.h>
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h>
50 51
51#include <asm/acpi.h> 52#include <asm/acpi.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
@@ -324,7 +325,7 @@ notrace static void __cpuinit start_secondary(void *unused)
324 /* enable local interrupts */ 325 /* enable local interrupts */
325 local_irq_enable(); 326 local_irq_enable();
326 327
327 setup_secondary_clock(); 328 x86_cpuinit.setup_percpu_clockev();
328 329
329 wmb(); 330 wmb();
330 load_debug_registers(); 331 load_debug_registers();
@@ -1060,12 +1061,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1060#endif 1061#endif
1061 current_thread_info()->cpu = 0; /* needed? */ 1062 current_thread_info()->cpu = 0; /* needed? */
1062 for_each_possible_cpu(i) { 1063 for_each_possible_cpu(i) {
1063 alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1064 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1064 alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1065 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1065 alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1066 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
1066 cpumask_clear(per_cpu(cpu_core_map, i));
1067 cpumask_clear(per_cpu(cpu_sibling_map, i));
1068 cpumask_clear(cpu_data(i).llc_shared_map);
1069 } 1067 }
1070 set_cpu_sibling_map(0); 1068 set_cpu_sibling_map(0);
1071 1069
@@ -1115,13 +1113,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1115 1113
1116 printk(KERN_INFO "CPU%d: ", 0); 1114 printk(KERN_INFO "CPU%d: ", 0);
1117 print_cpu_info(&cpu_data(0)); 1115 print_cpu_info(&cpu_data(0));
1118 setup_boot_clock(); 1116 x86_init.timers.setup_percpu_clockev();
1119 1117
1120 if (is_uv_system()) 1118 if (is_uv_system())
1121 uv_system_init(); 1119 uv_system_init();
1120
1121 set_mtrr_aps_delayed_init();
1122out: 1122out:
1123 preempt_enable(); 1123 preempt_enable();
1124} 1124}
1125
1126void arch_enable_nonboot_cpus_begin(void)
1127{
1128 set_mtrr_aps_delayed_init();
1129}
1130
1131void arch_enable_nonboot_cpus_end(void)
1132{
1133 mtrr_aps_init();
1134}
1135
1125/* 1136/*
1126 * Early setup to make printk work. 1137 * Early setup to make printk work.
1127 */ 1138 */
@@ -1143,6 +1154,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1143 setup_ioapic_dest(); 1154 setup_ioapic_dest();
1144#endif 1155#endif
1145 check_nmi_watchdog(); 1156 check_nmi_watchdog();
1157 mtrr_aps_init();
1146} 1158}
1147 1159
1148static int __initdata setup_possible_cpus = -1; 1160static int __initdata setup_possible_cpus = -1;
@@ -1321,6 +1333,7 @@ void play_dead_common(void)
1321void native_play_dead(void) 1333void native_play_dead(void)
1322{ 1334{
1323 play_dead_common(); 1335 play_dead_common();
1336 tboot_shutdown(TB_SHUTDOWN_WFS);
1324 wbinvd_halt(); 1337 wbinvd_halt();
1325} 1338}
1326 1339
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321ddafda..0157cd26d7cc 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,4 +335,4 @@ ENTRY(sys_call_table)
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open 338 .long sys_perf_event_open
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000000..86c9f91b48ae
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
1/*
2 * tboot.c: main implementation of helper functions used by kernel for
3 * runtime support of Intel(R) Trusted Execution Technology
4 *
5 * Copyright (c) 2006-2009, Intel Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 */
21
22#include <linux/dma_remapping.h>
23#include <linux/init_task.h>
24#include <linux/spinlock.h>
25#include <linux/delay.h>
26#include <linux/sched.h>
27#include <linux/init.h>
28#include <linux/dmar.h>
29#include <linux/cpu.h>
30#include <linux/pfn.h>
31#include <linux/mm.h>
32#include <linux/tboot.h>
33
34#include <asm/trampoline.h>
35#include <asm/processor.h>
36#include <asm/bootparam.h>
37#include <asm/pgtable.h>
38#include <asm/pgalloc.h>
39#include <asm/fixmap.h>
40#include <asm/proto.h>
41#include <asm/setup.h>
42#include <asm/e820.h>
43#include <asm/io.h>
44
45#include "acpi/realmode/wakeup.h"
46
47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly;
49
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1
52
53#undef pr_fmt
54#define pr_fmt(fmt) "tboot: " fmt
55
56static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
57
58void __init tboot_probe(void)
59{
60 /* Look for valid page-aligned address for shared page. */
61 if (!boot_params.tboot_addr)
62 return;
63 /*
64 * also verify that it is mapped as we expect it before calling
65 * set_fixmap(), to reduce chance of garbage value causing crash
66 */
67 if (!e820_any_mapped(boot_params.tboot_addr,
68 boot_params.tboot_addr, E820_RESERVED)) {
69 pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
70 return;
71 }
72
73 /* only a natively booted kernel should be using TXT */
74 if (paravirt_enabled()) {
75 pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
76 return;
77 }
78
79 /* Map and check for tboot UUID. */
80 set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
81 tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
82 if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
83 pr_warning("tboot at 0x%llx is invalid\n",
84 boot_params.tboot_addr);
85 tboot = NULL;
86 return;
87 }
88 if (tboot->version < 5) {
89 pr_warning("tboot version is invalid: %u\n", tboot->version);
90 tboot = NULL;
91 return;
92 }
93
94 pr_info("found shared page at phys addr 0x%llx:\n",
95 boot_params.tboot_addr);
96 pr_debug("version: %d\n", tboot->version);
97 pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
98 pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
99 pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
100 pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
101}
102
103static pgd_t *tboot_pg_dir;
104static struct mm_struct tboot_mm = {
105 .mm_rb = RB_ROOT,
106 .pgd = swapper_pg_dir,
107 .mm_users = ATOMIC_INIT(2),
108 .mm_count = ATOMIC_INIT(1),
109 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
110 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
111 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
112 .cpu_vm_mask = CPU_MASK_ALL,
113};
114
115static inline void switch_to_tboot_pt(void)
116{
117 write_cr3(virt_to_phys(tboot_pg_dir));
118}
119
120static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
121 pgprot_t prot)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = pgd_offset(&tboot_mm, vaddr);
129 pud = pud_alloc(&tboot_mm, pgd, vaddr);
130 if (!pud)
131 return -1;
132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
133 if (!pmd)
134 return -1;
135 pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
136 if (!pte)
137 return -1;
138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
139 pte_unmap(pte);
140 return 0;
141}
142
143static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
144 unsigned long nr)
145{
146 /* Reuse the original kernel mapping */
147 tboot_pg_dir = pgd_alloc(&tboot_mm);
148 if (!tboot_pg_dir)
149 return -1;
150
151 for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
152 if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
153 return -1;
154 }
155
156 return 0;
157}
158
159static void tboot_create_trampoline(void)
160{
161 u32 map_base, map_size;
162
163 /* Create identity map for tboot shutdown code. */
164 map_base = PFN_DOWN(tboot->tboot_base);
165 map_size = PFN_UP(tboot->tboot_size);
166 if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
167 panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
168 map_base, map_size);
169}
170
171#ifdef CONFIG_ACPI_SLEEP
172
173static void add_mac_region(phys_addr_t start, unsigned long size)
174{
175 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size;
177
178 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE);
181 mr->size = round_up(end, PAGE_SIZE) - mr->start;
182 }
183}
184
185static int tboot_setup_sleep(void)
186{
187 tboot->num_mac_regions = 0;
188
189 /* S3 resume code */
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
191
192#ifdef CONFIG_X86_TRAMPOLINE
193 /* AP trampoline code */
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201
202 return 0;
203}
204
205#else /* no CONFIG_ACPI_SLEEP */
206
207static int tboot_setup_sleep(void)
208{
209 /* S3 shutdown requested, but S3 not supported by the kernel... */
210 BUG();
211 return -1;
212}
213
214#endif
215
216void tboot_shutdown(u32 shutdown_type)
217{
218 void (*shutdown)(void);
219
220 if (!tboot_enabled())
221 return;
222
223 /*
224 * if we're being called before the 1:1 mapping is set up then just
225 * return and let the normal shutdown happen; this should only be
226 * due to very early panic()
227 */
228 if (!tboot_pg_dir)
229 return;
230
231 /* if this is S3 then set regions to MAC */
232 if (shutdown_type == TB_SHUTDOWN_S3)
233 if (tboot_setup_sleep())
234 return;
235
236 tboot->shutdown_type = shutdown_type;
237
238 switch_to_tboot_pt();
239
240 shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
241 shutdown();
242
243 /* should not reach here */
244 while (1)
245 halt();
246}
247
248static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
249{
250#define TB_COPY_GAS(tbg, g) \
251 tbg.space_id = g.space_id; \
252 tbg.bit_width = g.bit_width; \
253 tbg.bit_offset = g.bit_offset; \
254 tbg.access_width = g.access_width; \
255 tbg.address = g.address;
256
257 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
258 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
259 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
260 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
261
262 /*
263 * We need phys addr of waking vector, but can't use virt_to_phys() on
264 * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
265 * addr.
266 */
267 tboot->acpi_sinfo.wakeup_vector = fadt->facs +
268 offsetof(struct acpi_table_facs, firmware_waking_vector);
269}
270
271void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
272{
273 static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
274 /* S0,1,2: */ -1, -1, -1,
275 /* S3: */ TB_SHUTDOWN_S3,
276 /* S4: */ TB_SHUTDOWN_S4,
277 /* S5: */ TB_SHUTDOWN_S5 };
278
279 if (!tboot_enabled())
280 return;
281
282 tboot_copy_fadt(&acpi_gbl_FADT);
283 tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
284 tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
285 /* we always use the 32b wakeup vector */
286 tboot->acpi_sinfo.vector_width = 32;
287
288 if (sleep_state >= ACPI_S_STATE_COUNT ||
289 acpi_shutdown_map[sleep_state] == -1) {
290 pr_warning("unsupported sleep state 0x%x\n", sleep_state);
291 return;
292 }
293
294 tboot_shutdown(acpi_shutdown_map[sleep_state]);
295}
296
297static atomic_t ap_wfs_count;
298
299static int tboot_wait_for_aps(int num_aps)
300{
301 unsigned long timeout;
302
303 timeout = AP_WAIT_TIMEOUT*HZ;
304 while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
305 timeout) {
306 mdelay(1);
307 timeout--;
308 }
309
 310	if (!timeout)
311 pr_warning("tboot wait for APs timeout\n");
312
313 return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
314}
315
316static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
317 unsigned long action, void *hcpu)
318{
319 switch (action) {
320 case CPU_DYING:
321 atomic_inc(&ap_wfs_count);
322 if (num_online_cpus() == 1)
323 if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
324 return NOTIFY_BAD;
325 break;
326 }
327 return NOTIFY_OK;
328}
329
330static struct notifier_block tboot_cpu_notifier __cpuinitdata =
331{
332 .notifier_call = tboot_cpu_callback,
333};
334
335static __init int tboot_late_init(void)
336{
337 if (!tboot_enabled())
338 return 0;
339
340 tboot_create_trampoline();
341
342 atomic_set(&ap_wfs_count, 0);
343 register_hotcpu_notifier(&tboot_cpu_notifier);
344 return 0;
345}
346
347late_initcall(tboot_late_init);
348
349/*
350 * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
351 */
352
353#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
354#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
355
356/* # pages for each config regs space - used by fixmap */
357#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
358 TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
359
360/* offsets from pub/priv config space */
361#define TXTCR_HEAP_BASE 0x0300
362#define TXTCR_HEAP_SIZE 0x0308
363
364#define SHA1_SIZE 20
365
366struct sha1_hash {
367 u8 hash[SHA1_SIZE];
368};
369
370struct sinit_mle_data {
371 u32 version; /* currently 6 */
372 struct sha1_hash bios_acm_id;
373 u32 edx_senter_flags;
374 u64 mseg_valid;
375 struct sha1_hash sinit_hash;
376 struct sha1_hash mle_hash;
377 struct sha1_hash stm_hash;
378 struct sha1_hash lcp_policy_hash;
379 u32 lcp_policy_control;
380 u32 rlp_wakeup_addr;
381 u32 reserved;
382 u32 num_mdrs;
383 u32 mdrs_off;
384 u32 num_vtd_dmars;
385 u32 vtd_dmars_off;
386} __packed;
387
388struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
389{
390 void *heap_base, *heap_ptr, *config;
391
392 if (!tboot_enabled())
393 return dmar_tbl;
394
395 /*
396 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
397 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
398 */
399
400 /* map config space in order to get heap addr */
401 config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
402 PAGE_SIZE);
403 if (!config)
404 return NULL;
405
406 /* now map TXT heap */
407 heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
408 *(u64 *)(config + TXTCR_HEAP_SIZE));
409 iounmap(config);
410 if (!heap_base)
411 return NULL;
412
413 /* walk heap to SinitMleData */
414 /* skip BiosData */
415 heap_ptr = heap_base + *(u64 *)heap_base;
416 /* skip OsMleData */
417 heap_ptr += *(u64 *)heap_ptr;
418 /* skip OsSinitData */
419 heap_ptr += *(u64 *)heap_ptr;
420 /* now points to SinitMleDataSize; set to SinitMleData */
421 heap_ptr += sizeof(u64);
422 /* get addr of DMAR table */
423 dmar_tbl = (struct acpi_table_header *)(heap_ptr +
424 ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
425 sizeof(u64));
426
427 /* don't unmap heap because dmar.c needs access to this */
428
429 return dmar_tbl;
430}
431
432int tboot_force_iommu(void)
433{
434 if (!tboot_enabled())
435 return 0;
436
437 if (no_iommu || swiotlb || dmar_disabled)
438 pr_warning("Forcing Intel-IOMMU to enabled\n");
439
440 dmar_disabled = 0;
441#ifdef CONFIG_SWIOTLB
442 swiotlb = 0;
443#endif
444 no_iommu = 0;
445
446 return 1;
447}
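
tboot_get_dmar_table() above walks the TXT heap as a sequence of regions that each begin with a 64-bit size field (the size includes the field itself), so advancing the pointer by that value skips exactly one region. Here is a tiny, self-contained sketch of the same walk over an invented heap; the names and sizes are made up, the real layout is defined by the TXT specification:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	/* toy heap: three size-prefixed regions, as in the walk above */
	int main(void)
	{
		unsigned char heap[64] = { 0 };
		unsigned char *p = heap;
		const char *name[] = { "BiosData", "OsMleData", "OsSinitData" };
		uint64_t sz;
		int i;

		sz = 16; memcpy(heap +  0, &sz, 8);   /* "BiosData"    */
		sz = 24; memcpy(heap + 16, &sz, 8);   /* "OsMleData"   */
		sz = 24; memcpy(heap + 40, &sz, 8);   /* "OsSinitData" */

		for (i = 0; i < 3; i++) {
			memcpy(&sz, p, 8);            /* region size, including this field */
			printf("%-12s at offset %2td, size %llu\n",
			       name[i], p - heap, (unsigned long long)sz);
			p += sz;                      /* skip this region */
		}
		/* p now points at the next region's size field (SinitMleDataSize) */
		return 0;
	}
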
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 000000000000..be2573448ed9
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,121 @@
1/*
2 * Copyright (c) 1991,1992,1995 Linus Torvalds
3 * Copyright (c) 1994 Alan Modra
4 * Copyright (c) 1995 Markus Kuhn
5 * Copyright (c) 1996 Ingo Molnar
6 * Copyright (c) 1998 Andrea Arcangeli
7 * Copyright (c) 2002,2006 Vojtech Pavlik
8 * Copyright (c) 2003 Andi Kleen
9 *
10 */
11
12#include <linux/clockchips.h>
13#include <linux/interrupt.h>
14#include <linux/time.h>
15#include <linux/mca.h>
16
17#include <asm/vsyscall.h>
18#include <asm/x86_init.h>
19#include <asm/i8259.h>
20#include <asm/i8253.h>
21#include <asm/timer.h>
22#include <asm/hpet.h>
23#include <asm/time.h>
24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
31#endif
32
33unsigned long profile_pc(struct pt_regs *regs)
34{
35 unsigned long pc = instruction_pointer(regs);
36
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp =
42 (unsigned long *)kernel_stack_pointer(regs);
43 /*
44 * Return address is either directly at stack pointer
45 * or above a saved flags. Eflags has bits 22-31 zero,
46 * kernel addresses don't.
47 */
48 if (sp[0] >> 22)
49 return sp[0];
50 if (sp[1] >> 22)
51 return sp[1];
52#endif
53 }
54 return pc;
55}
56EXPORT_SYMBOL(profile_pc);
57
58/*
59 * Default timer interrupt handler for PIT/HPET
60 */
61static irqreturn_t timer_interrupt(int irq, void *dev_id)
62{
63 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs);
65
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event);
81
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
83 if (MCA_bus)
 84		outb_p(inb_p(0x61) | 0x80, 0x61);
85
86 return IRQ_HANDLED;
87}
88
89static struct irqaction irq0 = {
90 .handler = timer_interrupt,
91 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
92 .name = "timer"
93};
94
95void __init setup_default_timer_irq(void)
96{
97 setup_irq(0, &irq0);
98}
99
100/* Default timer init function */
101void __init hpet_time_init(void)
102{
103 if (!hpet_enable())
104 setup_pit_timer();
105 setup_default_timer_irq();
106}
107
108static __init void x86_late_time_init(void)
109{
110 x86_init.timers.timer_init();
111 tsc_init();
112}
113
114/*
115 * Initialize TSC and delay the periodic timer init to
116 * late x86_late_time_init() so ioremap works.
117 */
118void __init time_init(void)
119{
120 late_time_init = x86_late_time_init;
121}
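
The heuristic in profile_pc() above relies on a simple invariant: EFLAGS defines nothing above bit 21 (the ID flag), so a saved flags word shifted right by 22 is zero, while any kernel text address has those high bits set. A two-line illustration with made-up values:

	#include <stdio.h>

	int main(void)
	{
		unsigned long saved_eflags = 0x00200246;   /* typical flags image   */
		unsigned long kernel_addr  = 0xc05d1234;   /* 32-bit kernel address */

		printf("flags   >> 22 = %lu (not taken as a return address)\n",
		       saved_eflags >> 22);
		printf("address >> 22 = %lu (taken as a return address)\n",
		       kernel_addr >> 22);
		return 0;
	}
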
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
deleted file mode 100644
index 5c5d87f0b2e1..000000000000
--- a/arch/x86/kernel/time_32.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
3 *
4 * This file contains the PC-specific time handling details:
5 * reading the RTC at bootup, etc..
6 * 1994-07-02 Alan Modra
7 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
8 * 1995-03-26 Markus Kuhn
9 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
10 * precision CMOS clock update
11 * 1996-05-03 Ingo Molnar
12 * fixed time warps in do_[slow|fast]_gettimeoffset()
13 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
14 * "A Kernel Model for Precision Timekeeping" by Dave Mills
15 * 1998-09-05 (Various)
16 * More robust do_fast_gettimeoffset() algorithm implemented
17 * (works with APM, Cyrix 6x86MX and Centaur C6),
18 * monotonic gettimeofday() with fast_get_timeoffset(),
19 * drift-proof precision TSC calibration on boot
20 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
21 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
22 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
23 * 1998-12-16 Andrea Arcangeli
24 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
25 * because was not accounting lost_ticks.
26 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
27 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
28 * serialize accesses to xtime/lost_ticks).
29 */
30
31#include <linux/init.h>
32#include <linux/interrupt.h>
33#include <linux/time.h>
34#include <linux/mca.h>
35
36#include <asm/setup.h>
37#include <asm/hpet.h>
38#include <asm/time.h>
39#include <asm/timer.h>
40
41#include <asm/do_timer.h>
42
43int timer_ack;
44
45unsigned long profile_pc(struct pt_regs *regs)
46{
47 unsigned long pc = instruction_pointer(regs);
48
49#ifdef CONFIG_SMP
50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
51#ifdef CONFIG_FRAME_POINTER
52 return *(unsigned long *)(regs->bp + sizeof(long));
53#else
54 unsigned long *sp = (unsigned long *)&regs->sp;
55
56 /* Return address is either directly at stack pointer
57 or above a saved flags. Eflags has bits 22-31 zero,
58 kernel addresses don't. */
59 if (sp[0] >> 22)
60 return sp[0];
61 if (sp[1] >> 22)
62 return sp[1];
63#endif
64 }
65#endif
66 return pc;
67}
68EXPORT_SYMBOL(profile_pc);
69
70/*
71 * This is the same as the above, except we _also_ save the current
72 * Time Stamp Counter value at the time of the timer interrupt, so that
73 * we later on can estimate the time of day more exactly.
74 */
75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{
77 /* Keep nmi watchdog up to date */
78 inc_irq_stat(irq0_irqs);
79
80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) {
82 /*
83 * Subtle, when I/O APICs are used we have to ack timer IRQ
84 * manually to deassert NMI lines for the watchdog if run
85 * on an 82489DX-based system.
86 */
87 spin_lock(&i8259A_lock);
88 outb(0x0c, PIC_MASTER_OCW3);
89 /* Ack the IRQ; AEOI will end it automatically. */
90 inb(PIC_MASTER_POLL);
91 spin_unlock(&i8259A_lock);
92 }
93#endif
94
95 do_timer_interrupt_hook();
96
97#ifdef CONFIG_MCA
98 if (MCA_bus) {
99 /* The PS/2 uses level-triggered interrupts. You can't
100 turn them off, nor would you want to (any attempt to
101 enable edge-triggered interrupts usually gets intercepted by a
102 special hardware circuit). Hence we have to acknowledge
103 the timer interrupt. Through some incredibly stupid
104 design idea, the reset for IRQ 0 is done by setting the
105 high bit of the PPI port B (0x61). Note that some PS/2s,
106 notably the 55SX, work fine if this is removed. */
107
108 u8 irq_v = inb_p(0x61); /* read the current state */
109 outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
110 }
111#endif
112
113 return IRQ_HANDLED;
114}
115
116/* Duplicate of time_init() below, with hpet_enable part added */
117void __init hpet_time_init(void)
118{
119 if (!hpet_enable())
120 setup_pit_timer();
121 x86_quirk_time_init();
122}
123
124/*
125 * This is called directly from init code; we must delay timer setup in the
126 * HPET case as we can't make the decision to turn on HPET this early in the
127 * boot process.
128 *
129 * The chosen time_init function will usually be hpet_time_init, above, but
130 * in the case of virtual hardware, an alternative function may be substituted.
131 */
132void __init time_init(void)
133{
134 x86_quirk_pre_time_init();
135 tsc_init();
136 late_time_init = choose_time_init();
137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
deleted file mode 100644
index 5ba343e61844..000000000000
--- a/arch/x86/kernel/time_64.c
+++ /dev/null
@@ -1,135 +0,0 @@
1/*
2 * "High Precision Event Timer" based timekeeping.
3 *
4 * Copyright (c) 1991,1992,1995 Linus Torvalds
5 * Copyright (c) 1994 Alan Modra
6 * Copyright (c) 1995 Markus Kuhn
7 * Copyright (c) 1996 Ingo Molnar
8 * Copyright (c) 1998 Andrea Arcangeli
9 * Copyright (c) 2002,2006 Vojtech Pavlik
10 * Copyright (c) 2003 Andi Kleen
11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
12 */
13
14#include <linux/clockchips.h>
15#include <linux/init.h>
16#include <linux/interrupt.h>
17#include <linux/module.h>
18#include <linux/time.h>
19#include <linux/mca.h>
20#include <linux/nmi.h>
21
22#include <asm/i8253.h>
23#include <asm/hpet.h>
24#include <asm/vgtod.h>
25#include <asm/time.h>
26#include <asm/timer.h>
27
28volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
29
30unsigned long profile_pc(struct pt_regs *regs)
31{
32 unsigned long pc = instruction_pointer(regs);
33
34 /* Assume the lock function has either no stack frame or a copy
35 of flags from PUSHF
36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp = (unsigned long *)regs->sp;
42 if (sp[0] >> 22)
43 return sp[0];
44 if (sp[1] >> 22)
45 return sp[1];
46#endif
47 }
48 return pc;
49}
50EXPORT_SYMBOL(profile_pc);
51
52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{
54 inc_irq_stat(irq0_irqs);
55
56 global_clock_event->event_handler(global_clock_event);
57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
65 return IRQ_HANDLED;
66}
67
68/* calibrate_cpu is used on systems with fixed rate TSCs to determine
69 * processor frequency */
70#define TICK_COUNT 100000000
71unsigned long __init calibrate_cpu(void)
72{
73 int tsc_start, tsc_now;
74 int i, no_ctr_free;
75 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
76 unsigned long flags;
77
78 for (i = 0; i < 4; i++)
79 if (avail_to_resrv_perfctr_nmi_bit(i))
80 break;
81 no_ctr_free = (i == 4);
82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
85 i = 3;
86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
87 wrmsrl(MSR_K7_EVNTSEL3, 0);
88 rdmsrl(MSR_K7_PERFCTR3, pmc3);
89 } else {
90 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
91 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
92 }
93 local_irq_save(flags);
94 /* start measuring cycles, incrementing from 0 */
95 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
96 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
97 rdtscl(tsc_start);
98 do {
99 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
100 tsc_now = get_cycles();
101 } while ((tsc_now - tsc_start) < TICK_COUNT);
102
103 local_irq_restore(flags);
104 if (no_ctr_free) {
105 wrmsrl(MSR_K7_EVNTSEL3, 0);
106 wrmsrl(MSR_K7_PERFCTR3, pmc3);
107 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
108 } else {
109 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
110 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
111 }
112
113 return pmc_now * tsc_khz / (tsc_now - tsc_start);
114}
115
116static struct irqaction irq0 = {
117 .handler = timer_interrupt,
118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
119 .name = "timer"
120};
121
122void __init hpet_time_init(void)
123{
124 if (!hpet_enable())
125 setup_pit_timer();
126
127 setup_irq(0, &irq0);
128}
129
130void __init time_init(void)
131{
132 tsc_init();
133
134 late_time_init = choose_time_init();
135}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 808031a5ba19..cd022121cab6 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,8 +3,16 @@
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
7#define __trampinit
8#define __trampinitdata
9#else
10#define __trampinit __cpuinit
11#define __trampinitdata __cpuinitdata
12#endif
13
6/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE);
8 16
9void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
10{ 18{
@@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 34 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 35 * has made sure it's suitably aligned.
28 */ 36 */
29unsigned long setup_trampoline(void) 37unsigned long __trampinit setup_trampoline(void)
30{ 38{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 39 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 40 return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 66d874e5404c..8508237e8e43 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -28,16 +28,12 @@
28 */ 28 */
29 29
30#include <linux/linkage.h> 30#include <linux/linkage.h>
31#include <linux/init.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/page_types.h> 33#include <asm/page_types.h>
33 34
34/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35/* We can free up trampoline after bootup if cpu hotplug is not supported. */
35#ifndef CONFIG_HOTPLUG_CPU 36__CPUINITRODATA
36.section ".cpuinit.data","aw",@progbits
37#else
38.section .rodata,"a",@progbits
39#endif
40
41.code16 37.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index cddfb8d386b9..3af2dff58b21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,14 +25,19 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/init.h>
28#include <asm/pgtable_types.h> 29#include <asm/pgtable_types.h>
29#include <asm/page_types.h> 30#include <asm/page_types.h>
30#include <asm/msr.h> 31#include <asm/msr.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
33 34
35#ifdef CONFIG_ACPI_SLEEP
34.section .rodata, "a", @progbits 36.section .rodata, "a", @progbits
35 37#else
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
36.code16 41.code16
37 42
38ENTRY(trampoline_data) 43ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 49a401b1d4d7..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -14,7 +14,6 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/kprobes.h> 15#include <linux/kprobes.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/utsname.h>
18#include <linux/kdebug.h> 17#include <linux/kdebug.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/module.h> 19#include <linux/module.h>
@@ -59,12 +58,12 @@
59#include <asm/mach_traps.h> 58#include <asm/mach_traps.h>
60 59
61#ifdef CONFIG_X86_64 60#ifdef CONFIG_X86_64
61#include <asm/x86_init.h>
62#include <asm/pgalloc.h> 62#include <asm/pgalloc.h>
63#include <asm/proto.h> 63#include <asm/proto.h>
64#else 64#else
65#include <asm/processor-flags.h> 65#include <asm/processor-flags.h>
66#include <asm/setup.h> 66#include <asm/setup.h>
67#include <asm/traps.h>
68 67
69asmlinkage int system_call(void); 68asmlinkage int system_call(void);
70 69
@@ -73,11 +72,9 @@ char ignore_fpu_irq;
73 72
74/* 73/*
75 * The IDT has to be page-aligned to simplify the Pentium 74 * The IDT has to be page-aligned to simplify the Pentium
76 * F0 0F bug workaround.. We have a special link segment 75 * F0 0F bug workaround.
77 * for this.
78 */ 76 */
79gate_desc idt_table[NR_VECTORS] 77gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
81#endif 78#endif
82 79
83DECLARE_BITMAP(used_vectors, NR_VECTORS); 80DECLARE_BITMAP(used_vectors, NR_VECTORS);
@@ -951,7 +948,5 @@ void __init trap_init(void)
951 */ 948 */
952 cpu_init(); 949 cpu_init();
953 950
954#ifdef CONFIG_X86_32 951 x86_init.irqs.trap_init();
955 x86_quirk_trap_init();
956#endif
957} 952}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 71f4368b357e..cd982f48e23e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,6 +17,8 @@
17#include <asm/time.h> 17#include <asm/time.h>
18#include <asm/delay.h> 18#include <asm/delay.h>
19#include <asm/hypervisor.h> 19#include <asm/hypervisor.h>
20#include <asm/nmi.h>
21#include <asm/x86_init.h>
20 22
21unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ 23unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
22EXPORT_SYMBOL(cpu_khz); 24EXPORT_SYMBOL(cpu_khz);
@@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void)
400{ 402{
401 u64 tsc1, tsc2, delta, ref1, ref2; 403 u64 tsc1, tsc2, delta, ref1, ref2;
402 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 404 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
403 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; 405 unsigned long flags, latch, ms, fast_calibrate;
404 int hpet = is_hpet_enabled(), i, loopmin; 406 int hpet = is_hpet_enabled(), i, loopmin;
405 407
406 hv_tsc_khz = get_hypervisor_tsc_freq();
407 if (hv_tsc_khz) {
408 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
409 return hv_tsc_khz;
410 }
411
412 local_irq_save(flags); 408 local_irq_save(flags);
413 fast_calibrate = quick_pit_calibrate(); 409 fast_calibrate = quick_pit_calibrate();
414 local_irq_restore(flags); 410 local_irq_restore(flags);
@@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void)
566 unsigned long cpu_khz_old = cpu_khz; 562 unsigned long cpu_khz_old = cpu_khz;
567 563
568 if (cpu_has_tsc) { 564 if (cpu_has_tsc) {
569 tsc_khz = calibrate_tsc(); 565 tsc_khz = x86_platform.calibrate_tsc();
570 cpu_khz = tsc_khz; 566 cpu_khz = tsc_khz;
571 cpu_data(0).loops_per_jiffy = 567 cpu_data(0).loops_per_jiffy =
572 cpufreq_scale(cpu_data(0).loops_per_jiffy, 568 cpufreq_scale(cpu_data(0).loops_per_jiffy,
@@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
670 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || 666 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
671 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || 667 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
672 (val == CPUFREQ_RESUMECHANGE)) { 668 (val == CPUFREQ_RESUMECHANGE)) {
673 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); 669 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
674 670
675 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 671 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
676 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) 672 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
@@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void)
744} 740}
745#endif 741#endif
746 742
743static void resume_tsc(void)
744{
745 clocksource_tsc.cycle_last = 0;
746}
747
747static struct clocksource clocksource_tsc = { 748static struct clocksource clocksource_tsc = {
748 .name = "tsc", 749 .name = "tsc",
749 .rating = 300, 750 .rating = 300,
750 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc,
751 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
752 .shift = 22, 754 .shift = 22,
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 755 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason)
761{ 763{
762 if (!tsc_unstable) { 764 if (!tsc_unstable) {
763 tsc_unstable = 1; 765 tsc_unstable = 1;
764 printk("Marking TSC unstable due to %s\n", reason); 766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
765 /* Change only the rating, when not registered */ 767 /* Change only the rating, when not registered */
766 if (clocksource_tsc.mult) 768 if (clocksource_tsc.mult)
767 clocksource_change_rating(&clocksource_tsc, 0); 769 clocksource_mark_unstable(&clocksource_tsc);
768 else 770 else {
771 clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
769 clocksource_tsc.rating = 0; 772 clocksource_tsc.rating = 0;
773 }
770 } 774 }
771} 775}
772 776
@@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void)
852 clocksource_register(&clocksource_tsc); 856 clocksource_register(&clocksource_tsc);
853} 857}
854 858
859#ifdef CONFIG_X86_64
860/*
861 * calibrate_cpu is used on systems with fixed rate TSCs to determine
862 * processor frequency
863 */
864#define TICK_COUNT 100000000
865static unsigned long __init calibrate_cpu(void)
866{
867 int tsc_start, tsc_now;
868 int i, no_ctr_free;
869 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
870 unsigned long flags;
871
872 for (i = 0; i < 4; i++)
873 if (avail_to_resrv_perfctr_nmi_bit(i))
874 break;
875 no_ctr_free = (i == 4);
876 if (no_ctr_free) {
877 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
878 "cpu_khz value may be incorrect.\n");
879 i = 3;
880 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
881 wrmsrl(MSR_K7_EVNTSEL3, 0);
882 rdmsrl(MSR_K7_PERFCTR3, pmc3);
883 } else {
884 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
885 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
886 }
887 local_irq_save(flags);
888 /* start measuring cycles, incrementing from 0 */
889 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
890 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
891 rdtscl(tsc_start);
892 do {
893 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
894 tsc_now = get_cycles();
895 } while ((tsc_now - tsc_start) < TICK_COUNT);
896
897 local_irq_restore(flags);
898 if (no_ctr_free) {
899 wrmsrl(MSR_K7_EVNTSEL3, 0);
900 wrmsrl(MSR_K7_PERFCTR3, pmc3);
901 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
902 } else {
903 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
904 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
905 }
906
907 return pmc_now * tsc_khz / (tsc_now - tsc_start);
908}
909#else
910static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
911#endif
912
855void __init tsc_init(void) 913void __init tsc_init(void)
856{ 914{
857 u64 lpj; 915 u64 lpj;
858 int cpu; 916 int cpu;
859 917
918 x86_init.timers.tsc_pre_init();
919
860 if (!cpu_has_tsc) 920 if (!cpu_has_tsc)
861 return; 921 return;
862 922
863 tsc_khz = calibrate_tsc(); 923 tsc_khz = x86_platform.calibrate_tsc();
864 cpu_khz = tsc_khz; 924 cpu_khz = tsc_khz;
865 925
866 if (!tsc_khz) { 926 if (!tsc_khz) {
@@ -868,11 +928,9 @@ void __init tsc_init(void)
868 return; 928 return;
869 } 929 }
870 930
871#ifdef CONFIG_X86_64
872 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && 931 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
873 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) 932 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
874 cpu_khz = calibrate_cpu(); 933 cpu_khz = calibrate_cpu();
875#endif
876 934
877 printk("Detected %lu.%03lu MHz processor.\n", 935 printk("Detected %lu.%03lu MHz processor.\n",
878 (unsigned long)cpu_khz / 1000, 936 (unsigned long)cpu_khz / 1000,
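
The moved calibrate_cpu() counts core cycles with an AMD performance counter (event 0x76, unhalted core clocks) over a fixed number of TSC ticks, then scales tsc_khz by the cycles/ticks ratio. With hypothetical numbers, the arithmetic works out as in this small sketch:

	#include <stdio.h>

	int main(void)
	{
		unsigned long tsc_khz   = 2000000;      /* 2.0 GHz reference TSC  */
		unsigned long pmc_delta = 110000000;    /* core cycles counted    */
		unsigned long tsc_delta = 100000000;    /* TSC ticks in same time */

		/* same formula as calibrate_cpu(): pmc_now * tsc_khz / (tsc_now - tsc_start) */
		unsigned long cpu_khz = (unsigned long)((unsigned long long)pmc_delta
							* tsc_khz / tsc_delta);

		printf("cpu_khz = %lu (core runs at %.1f GHz)\n",
		       cpu_khz, cpu_khz / 1e6);
		return 0;
	}
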
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 027b5b498993..f37930954d15 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 pr_info("Skipping synchronization checks as TSC is reliable.\n"); 117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 31ffc24eec4d..f068553a1b17 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -30,6 +30,7 @@
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/e820.h> 32#include <asm/e820.h>
33#include <asm/time.h>
33#include <asm/io.h> 34#include <asm/io.h>
34 35
35#include <linux/kernel_stat.h> 36#include <linux/kernel_stat.h>
@@ -53,7 +54,7 @@ int is_visws_box(void)
53 return visws_board_type >= 0; 54 return visws_board_type >= 0;
54} 55}
55 56
56static int __init visws_time_init(void) 57static void __init visws_time_init(void)
57{ 58{
58 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 59 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
59 60
@@ -66,21 +67,13 @@ static int __init visws_time_init(void)
66 /* Enable (unmask) the timer interrupt */ 67 /* Enable (unmask) the timer interrupt */
67 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); 68 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
68 69
69 /* 70 setup_default_timer_irq();
70 * Zero return means the generic timer setup code will set up
71 * the standard vector:
72 */
73 return 0;
74} 71}
75 72
76static int __init visws_pre_intr_init(void) 73/* Replaces the default init_ISA_irqs in the generic setup */
74static void __init visws_pre_intr_init(void)
77{ 75{
78 init_VISWS_APIC_irqs(); 76 init_VISWS_APIC_irqs();
79
80 /*
81 * We dont want ISA irqs to be set up by the generic code:
82 */
83 return 1;
84} 77}
85 78
86/* Quirk for machine specific memory setup. */ 79/* Quirk for machine specific memory setup. */
@@ -156,12 +149,8 @@ static void visws_machine_power_off(void)
156 outl(PIIX_SPECIAL_STOP, 0xCFC); 149 outl(PIIX_SPECIAL_STOP, 0xCFC);
157} 150}
158 151
159static int __init visws_get_smp_config(unsigned int early) 152static void __init visws_get_smp_config(unsigned int early)
160{ 153{
161 /*
162 * Prevent MP-table parsing by the generic code:
163 */
164 return 1;
165} 154}
166 155
167/* 156/*
@@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
208 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
209} 198}
210 199
211static int __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(unsigned int reserve)
212{ 201{
213 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
214 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve)
230 MP_processor_info(mp++); 219 MP_processor_info(mp++);
231 220
232 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 221 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
233
234 return 1;
235} 222}
236 223
237static int visws_trap_init(void); 224static void visws_trap_init(void);
238
239static struct x86_quirks visws_x86_quirks __initdata = {
240 .arch_time_init = visws_time_init,
241 .arch_pre_intr_init = visws_pre_intr_init,
242 .arch_memory_setup = visws_memory_setup,
243 .arch_intr_init = NULL,
244 .arch_trap_init = visws_trap_init,
245 .mach_get_smp_config = visws_get_smp_config,
246 .mach_find_smp_config = visws_find_smp_config,
247};
248 225
249void __init visws_early_detect(void) 226void __init visws_early_detect(void)
250{ 227{
@@ -257,11 +234,14 @@ void __init visws_early_detect(void)
257 return; 234 return;
258 235
259 /* 236 /*
260 * Install special quirks for timer, interrupt and memory setup: 237 * Override the default platform setup functions
261 * Fall back to generic behavior for traps:
262 * Override generic MP-table parsing:
263 */ 238 */
264 x86_quirks = &visws_x86_quirks; 239 x86_init.resources.memory_setup = visws_memory_setup;
240 x86_init.mpparse.get_smp_config = visws_get_smp_config;
241 x86_init.mpparse.find_smp_config = visws_find_smp_config;
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init;
265 245
266 /* 246 /*
267 * Install reboot quirks: 247 * Install reboot quirks:
@@ -400,12 +380,10 @@ static __init void cobalt_init(void)
400 co_apic_read(CO_APIC_ID)); 380 co_apic_read(CO_APIC_ID));
401} 381}
402 382
403static int __init visws_trap_init(void) 383static void __init visws_trap_init(void)
404{ 384{
405 lithium_init(); 385 lithium_init();
406 cobalt_init(); 386 cobalt_init();
407
408 return 1;
409} 387}
410 388
411/* 389/*
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95a7289e4b0c..d430e4c30193 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -648,7 +648,7 @@ static inline int __init activate_vmi(void)
648 648
649 pv_info.paravirt_enabled = 1; 649 pv_info.paravirt_enabled = 1;
650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
651 pv_info.name = "vmi"; 651 pv_info.name = "vmi [deprecated]";
652 652
653 pv_init_ops.patch = vmi_patch; 653 pv_init_ops.patch = vmi_patch;
654 654
@@ -817,15 +817,15 @@ static inline int __init activate_vmi(void)
817 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); 817 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
818 vmi_timer_ops.cancel_alarm = 818 vmi_timer_ops.cancel_alarm =
819 vmi_get_function(VMI_CALL_CancelAlarm); 819 vmi_get_function(VMI_CALL_CancelAlarm);
820 pv_time_ops.time_init = vmi_time_init; 820 x86_init.timers.timer_init = vmi_time_init;
821 pv_time_ops.get_wallclock = vmi_get_wallclock;
822 pv_time_ops.set_wallclock = vmi_set_wallclock;
823#ifdef CONFIG_X86_LOCAL_APIC 821#ifdef CONFIG_X86_LOCAL_APIC
824 pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; 822 x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
825 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; 823 x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
826#endif 824#endif
827 pv_time_ops.sched_clock = vmi_sched_clock; 825 pv_time_ops.sched_clock = vmi_sched_clock;
828 pv_time_ops.get_tsc_khz = vmi_tsc_khz; 826 x86_platform.calibrate_tsc = vmi_tsc_khz;
827 x86_platform.get_wallclock = vmi_get_wallclock;
828 x86_platform.set_wallclock = vmi_set_wallclock;
829 829
830 /* We have true wallclock functions; disable CMOS clock sync */ 830 /* We have true wallclock functions; disable CMOS clock sync */
831 no_sync_cmos_clock = 1; 831 no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 2b3eb82efeeb..611b9e2360d3 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void)
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); 68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69} 69}
70 70
71/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ 71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void) 72unsigned long vmi_tsc_khz(void)
73{ 73{
74 unsigned long long khz; 74 unsigned long long khz;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9fc178255c04..8d6001ad8d8d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -45,9 +45,9 @@ PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64 47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */ 48 user PT_LOAD FLAGS(5); /* R_E */
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 percpu PT_LOAD FLAGS(7); /* RWE */ 50 percpu PT_LOAD FLAGS(6); /* RW_ */
51#endif 51#endif
52 init PT_LOAD FLAGS(7); /* RWE */ 52 init PT_LOAD FLAGS(7); /* RWE */
53#endif 53#endif
@@ -65,17 +65,11 @@ SECTIONS
65#endif 65#endif
66 66
67 /* Text and read-only data */ 67 /* Text and read-only data */
68
69 /* bootstrapping code */
70 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
71 _text = .;
72 *(.text.head)
73 } :text = 0x9090
74
75 /* The rest of the text */
76 .text : AT(ADDR(.text) - LOAD_OFFSET) { 68 .text : AT(ADDR(.text) - LOAD_OFFSET) {
69 _text = .;
70 /* bootstrapping code */
71 HEAD_TEXT
77#ifdef CONFIG_X86_32 72#ifdef CONFIG_X86_32
78 /* not really needed, already page aligned */
79 . = ALIGN(PAGE_SIZE); 73 . = ALIGN(PAGE_SIZE);
80 *(.text.page_aligned) 74 *(.text.page_aligned)
81#endif 75#endif
@@ -94,13 +88,7 @@ SECTIONS
94 88
95 NOTES :text :note 89 NOTES :text :note
96 90
97 /* Exception table */ 91 EXCEPTION_TABLE(16) :text = 0x9090
98 . = ALIGN(16);
99 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
100 __start___ex_table = .;
101 *(__ex_table)
102 __stop___ex_table = .;
103 } :text = 0x9090
104 92
105 RO_DATA(PAGE_SIZE) 93 RO_DATA(PAGE_SIZE)
106 94
@@ -118,7 +106,6 @@ SECTIONS
118#endif 106#endif
119 107
120 PAGE_ALIGNED_DATA(PAGE_SIZE) 108 PAGE_ALIGNED_DATA(PAGE_SIZE)
121 *(.data.idt)
122 109
123 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
124 111
@@ -135,24 +122,21 @@ SECTIONS
135#ifdef CONFIG_X86_64 122#ifdef CONFIG_X86_64
136 123
137#define VSYSCALL_ADDR (-10*1024*1024) 124#define VSYSCALL_ADDR (-10*1024*1024)
138#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
139 PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
140#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
141 PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
142 125
143#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) 126#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET)
144#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) 127#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
145 128
146#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) 129#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
147#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 130#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
148 131
132 . = ALIGN(4096);
133 __vsyscall_0 = .;
134
149 . = VSYSCALL_ADDR; 135 . = VSYSCALL_ADDR;
150 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { 136 .vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
151 *(.vsyscall_0) 137 *(.vsyscall_0)
152 } :user 138 } :user
153 139
154 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
155
156 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
157 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
158 *(.vsyscall_fn) 142 *(.vsyscall_fn)
@@ -192,11 +176,9 @@ SECTIONS
192 *(.vsyscall_3) 176 *(.vsyscall_3)
193 } 177 }
194 178
195 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; 179 . = __vsyscall_0 + PAGE_SIZE;
196 180
197#undef VSYSCALL_ADDR 181#undef VSYSCALL_ADDR
198#undef VSYSCALL_PHYS_ADDR
199#undef VSYSCALL_VIRT_ADDR
200#undef VLOAD_OFFSET 182#undef VLOAD_OFFSET
201#undef VLOAD 183#undef VLOAD
202#undef VVIRT_OFFSET 184#undef VVIRT_OFFSET
@@ -219,36 +201,12 @@ SECTIONS
219 PERCPU_VADDR(0, :percpu) 201 PERCPU_VADDR(0, :percpu)
220#endif 202#endif
221 203
222 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 204 INIT_TEXT_SECTION(PAGE_SIZE)
223 _sinittext = .;
224 INIT_TEXT
225 _einittext = .;
226 }
227#ifdef CONFIG_X86_64 205#ifdef CONFIG_X86_64
228 :init 206 :init
229#endif 207#endif
230 208
231 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { 209 INIT_DATA_SECTION(16)
232 INIT_DATA
233 }
234
235 . = ALIGN(16);
236 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
237 __setup_start = .;
238 *(.init.setup)
239 __setup_end = .;
240 }
241 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
242 __initcall_start = .;
243 INITCALLS
244 __initcall_end = .;
245 }
246
247 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
248 __con_initcall_start = .;
249 *(.con_initcall.init)
250 __con_initcall_end = .;
251 }
252 210
253 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 211 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
254 __x86_cpu_dev_start = .; 212 __x86_cpu_dev_start = .;
@@ -256,8 +214,6 @@ SECTIONS
256 __x86_cpu_dev_end = .; 214 __x86_cpu_dev_end = .;
257 } 215 }
258 216
259 SECURITY_INIT
260
261 . = ALIGN(8); 217 . = ALIGN(8);
262 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 218 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
263 __parainstructions = .; 219 __parainstructions = .;
@@ -288,15 +244,6 @@ SECTIONS
288 EXIT_DATA 244 EXIT_DATA
289 } 245 }
290 246
291#ifdef CONFIG_BLK_DEV_INITRD
292 . = ALIGN(PAGE_SIZE);
293 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
294 __initramfs_start = .;
295 *(.init.ramfs)
296 __initramfs_end = .;
297 }
298#endif
299
300#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 247#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
301 PERCPU(PAGE_SIZE) 248 PERCPU(PAGE_SIZE)
302#endif 249#endif
@@ -348,21 +295,18 @@ SECTIONS
348 _end = .; 295 _end = .;
349 } 296 }
350 297
351 /* Sections to be discarded */
352 /DISCARD/ : {
353 *(.exitcall.exit)
354 *(.eh_frame)
355 *(.discard)
356 }
357
358 STABS_DEBUG 298 STABS_DEBUG
359 DWARF_DEBUG 299 DWARF_DEBUG
300
301 /* Sections to be discarded */
302 DISCARDS
303 /DISCARD/ : { *(.eh_frame) }
360} 304}
361 305
362 306
363#ifdef CONFIG_X86_32 307#ifdef CONFIG_X86_32
364. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 308ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
365 "kernel image bigger than KERNEL_IMAGE_SIZE"); 309 "kernel image bigger than KERNEL_IMAGE_SIZE");
366#else 310#else
367/* 311/*
368 * Per-cpu symbols which need to be offset from __per_cpu_load 312 * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -375,12 +319,12 @@ INIT_PER_CPU(irq_stack_union);
375/* 319/*
376 * Build-time check on the image size: 320 * Build-time check on the image size:
377 */ 321 */
378. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 322ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
379 "kernel image bigger than KERNEL_IMAGE_SIZE"); 323 "kernel image bigger than KERNEL_IMAGE_SIZE");
380 324
381#ifdef CONFIG_SMP 325#ifdef CONFIG_SMP
382. = ASSERT((per_cpu__irq_stack_union == 0), 326ASSERT((per_cpu__irq_stack_union == 0),
383 "irq_stack_union is not at start of per-cpu area"); 327 "irq_stack_union is not at start of per-cpu area");
384#endif 328#endif
385 329
386#endif /* CONFIG_X86_32 */ 330#endif /* CONFIG_X86_32 */
@@ -388,7 +332,6 @@ INIT_PER_CPU(irq_stack_union);
388#ifdef CONFIG_KEXEC 332#ifdef CONFIG_KEXEC
389#include <asm/kexec.h> 333#include <asm/kexec.h>
390 334
391. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, 335ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
392 "kexec control code size is too big"); 336 "kexec control code size is too big");
393#endif 337#endif
394
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 25ee06a80aad..8cb4974ff599 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
89 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 89 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
90 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
90 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 91 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
91} 92}
92 93
@@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
227} 228}
228 229
229#ifdef CONFIG_SYSCTL 230#ifdef CONFIG_SYSCTL
230
231static int
232vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
233 void __user *buffer, size_t *lenp, loff_t *ppos)
234{
235 return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
236}
237
238static ctl_table kernel_table2[] = { 231static ctl_table kernel_table2[] = {
239 { .procname = "vsyscall64", 232 { .procname = "vsyscall64",
240 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), 233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
241 .mode = 0644, 234 .mode = 0644,
242 .proc_handler = vsyscall_sysctl_change }, 235 .proc_handler = proc_dointvec },
243 {} 236 {}
244}; 237};
245 238
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 000000000000..4449a4a2c2ed
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,75 @@
1/*
2 * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
3 *
4 * For licencing details see kernel-base/COPYING
5 */
6#include <linux/init.h>
7
8#include <asm/bios_ebda.h>
9#include <asm/paravirt.h>
10#include <asm/mpspec.h>
11#include <asm/setup.h>
12#include <asm/apic.h>
13#include <asm/e820.h>
14#include <asm/time.h>
15#include <asm/irq.h>
16#include <asm/tsc.h>
17
18void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { }
21
22/*
23 * The platform setup functions are preset with the default functions
24 * for standard PC hardware.
25 */
26struct x86_init_ops x86_init __initdata = {
27
28 .resources = {
29 .probe_roms = x86_init_noop,
30 .reserve_resources = reserve_standard_io_resources,
31 .memory_setup = default_machine_specific_memory_setup,
32 },
33
34 .mpparse = {
35 .mpc_record = x86_init_uint_noop,
36 .setup_ioapic_ids = x86_init_noop,
37 .mpc_apic_id = default_mpc_apic_id,
38 .smp_read_mpc_oem = default_smp_read_mpc_oem,
39 .mpc_oem_bus_info = default_mpc_oem_bus_info,
40 .find_smp_config = default_find_smp_config,
41 .get_smp_config = default_get_smp_config,
42 },
43
44 .irqs = {
45 .pre_vector_init = init_ISA_irqs,
46 .intr_init = native_init_IRQ,
47 .trap_init = x86_init_noop,
48 },
49
50 .oem = {
51 .arch_setup = x86_init_noop,
52 .banner = default_banner,
53 },
54
55 .paging = {
56 .pagetable_setup_start = native_pagetable_setup_start,
57 .pagetable_setup_done = native_pagetable_setup_done,
58 },
59
60 .timers = {
61 .setup_percpu_clockev = setup_boot_APIC_clock,
62 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init,
64 },
65};
66
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
68 .setup_percpu_clockev = setup_secondary_APIC_clock,
69};
70
71struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss,
75};
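
The x86_init table above carries the PC defaults; a subarchitecture or paravirt guest now customizes early boot by overwriting individual function pointers, as the visws, lguest and vmi hunks in this diff do, instead of registering an x86_quirks structure. A minimal sketch of that pattern, assuming a made-up platform (only the x86_init/x86_platform member names come from this patch; the foo_* hooks are illustrative):

	/* Illustrative only: a hypothetical platform hooking the new x86_init ops. */
	static void __init foo_timer_init(void)
	{
		/* program the board's timer instead of the hpet_time_init default */
	}

	static unsigned long foo_calibrate_tsc(void)
	{
		return 1000000;	/* kHz; a real platform would read this from firmware */
	}

	static void __init foo_platform_setup(void)
	{
		x86_init.timers.timer_init = foo_timer_init;
		x86_platform.calibrate_tsc = foo_calibrate_tsc;
	}
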
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1ae5ceba7eb2..7024224f0fc8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -664,7 +664,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
664{ 664{
665 ktime_t now = apic->lapic_timer.timer.base->get_time(); 665 ktime_t now = apic->lapic_timer.timer.base->get_time();
666 666
667 apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * 667 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
668 APIC_BUS_CYCLE_NS * apic->divide_count; 668 APIC_BUS_CYCLE_NS * apic->divide_count;
669 atomic_set(&apic->lapic_timer.pending, 0); 669 atomic_set(&apic->lapic_timer.pending, 0);
670 670
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eca41ae9f453..685a4ffac8e6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -156,6 +156,8 @@ module_param(oos_shadow, bool, 0644);
156#define CREATE_TRACE_POINTS 156#define CREATE_TRACE_POINTS
157#include "mmutrace.h" 157#include "mmutrace.h"
158 158
159#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
160
159#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 161#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
160 162
161struct kvm_rmap_desc { 163struct kvm_rmap_desc {
@@ -634,9 +636,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
634 if (*spte & shadow_accessed_mask) 636 if (*spte & shadow_accessed_mask)
635 kvm_set_pfn_accessed(pfn); 637 kvm_set_pfn_accessed(pfn);
636 if (is_writeble_pte(*spte)) 638 if (is_writeble_pte(*spte))
637 kvm_release_pfn_dirty(pfn); 639 kvm_set_pfn_dirty(pfn);
638 else
639 kvm_release_pfn_clean(pfn);
640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
641 if (!*rmapp) { 641 if (!*rmapp) {
642 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 642 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
748 return write_protected; 748 return write_protected;
749} 749}
750 750
751static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 751static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
752{ 752{
753 u64 *spte; 753 u64 *spte;
754 int need_tlb_flush = 0; 754 int need_tlb_flush = 0;
@@ -763,8 +763,45 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
763 return need_tlb_flush; 763 return need_tlb_flush;
764} 764}
765 765
766static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 766static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
767 int (*handler)(struct kvm *kvm, unsigned long *rmapp)) 767{
768 int need_flush = 0;
769 u64 *spte, new_spte;
770 pte_t *ptep = (pte_t *)data;
771 pfn_t new_pfn;
772
773 WARN_ON(pte_huge(*ptep));
774 new_pfn = pte_pfn(*ptep);
775 spte = rmap_next(kvm, rmapp, NULL);
776 while (spte) {
777 BUG_ON(!is_shadow_present_pte(*spte));
778 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
779 need_flush = 1;
780 if (pte_write(*ptep)) {
781 rmap_remove(kvm, spte);
782 __set_spte(spte, shadow_trap_nonpresent_pte);
783 spte = rmap_next(kvm, rmapp, NULL);
784 } else {
785 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
786 new_spte |= (u64)new_pfn << PAGE_SHIFT;
787
788 new_spte &= ~PT_WRITABLE_MASK;
789 new_spte &= ~SPTE_HOST_WRITEABLE;
790 if (is_writeble_pte(*spte))
791 kvm_set_pfn_dirty(spte_to_pfn(*spte));
792 __set_spte(spte, new_spte);
793 spte = rmap_next(kvm, rmapp, spte);
794 }
795 }
796 if (need_flush)
797 kvm_flush_remote_tlbs(kvm);
798
799 return 0;
800}
801
802static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, u64 data,
803 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
804 u64 data))
768{ 805{
769 int i, j; 806 int i, j;
770 int retval = 0; 807 int retval = 0;
@@ -786,13 +823,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
786 if (hva >= start && hva < end) { 823 if (hva >= start && hva < end) {
787 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 824 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
788 825
789 retval |= handler(kvm, &memslot->rmap[gfn_offset]); 826 retval |= handler(kvm, &memslot->rmap[gfn_offset],
827 data);
790 828
791 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 829 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
792 int idx = gfn_offset; 830 int idx = gfn_offset;
793 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 831 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
794 retval |= handler(kvm, 832 retval |= handler(kvm,
795 &memslot->lpage_info[j][idx].rmap_pde); 833 &memslot->lpage_info[j][idx].rmap_pde,
834 data);
796 } 835 }
797 } 836 }
798 } 837 }
@@ -802,10 +841,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
802 841
803int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 842int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
804{ 843{
805 return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); 844 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
806} 845}
807 846
808static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) 847void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
848{
849 kvm_handle_hva(kvm, hva, (u64)&pte, kvm_set_pte_rmapp);
850}
851
852static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
809{ 853{
810 u64 *spte; 854 u64 *spte;
811 int young = 0; 855 int young = 0;
@@ -841,13 +885,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
841 gfn = unalias_gfn(vcpu->kvm, gfn); 885 gfn = unalias_gfn(vcpu->kvm, gfn);
842 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 886 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
843 887
844 kvm_unmap_rmapp(vcpu->kvm, rmapp); 888 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
845 kvm_flush_remote_tlbs(vcpu->kvm); 889 kvm_flush_remote_tlbs(vcpu->kvm);
846} 890}
847 891
848int kvm_age_hva(struct kvm *kvm, unsigned long hva) 892int kvm_age_hva(struct kvm *kvm, unsigned long hva)
849{ 893{
850 return kvm_handle_hva(kvm, hva, kvm_age_rmapp); 894 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
851} 895}
852 896
853#ifdef MMU_DEBUG 897#ifdef MMU_DEBUG
@@ -1756,7 +1800,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1756 unsigned pte_access, int user_fault, 1800 unsigned pte_access, int user_fault,
1757 int write_fault, int dirty, int level, 1801 int write_fault, int dirty, int level,
1758 gfn_t gfn, pfn_t pfn, bool speculative, 1802 gfn_t gfn, pfn_t pfn, bool speculative,
1759 bool can_unsync) 1803 bool can_unsync, bool reset_host_protection)
1760{ 1804{
1761 u64 spte; 1805 u64 spte;
1762 int ret = 0; 1806 int ret = 0;
@@ -1783,6 +1827,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1783 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1827 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1784 kvm_is_mmio_pfn(pfn)); 1828 kvm_is_mmio_pfn(pfn));
1785 1829
1830 if (reset_host_protection)
1831 spte |= SPTE_HOST_WRITEABLE;
1832
1786 spte |= (u64)pfn << PAGE_SHIFT; 1833 spte |= (u64)pfn << PAGE_SHIFT;
1787 1834
1788 if ((pte_access & ACC_WRITE_MASK) 1835 if ((pte_access & ACC_WRITE_MASK)
@@ -1828,7 +1875,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1828 unsigned pt_access, unsigned pte_access, 1875 unsigned pt_access, unsigned pte_access,
1829 int user_fault, int write_fault, int dirty, 1876 int user_fault, int write_fault, int dirty,
1830 int *ptwrite, int level, gfn_t gfn, 1877 int *ptwrite, int level, gfn_t gfn,
1831 pfn_t pfn, bool speculative) 1878 pfn_t pfn, bool speculative,
1879 bool reset_host_protection)
1832{ 1880{
1833 int was_rmapped = 0; 1881 int was_rmapped = 0;
1834 int was_writeble = is_writeble_pte(*sptep); 1882 int was_writeble = is_writeble_pte(*sptep);
@@ -1860,7 +1908,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1860 } 1908 }
1861 1909
1862 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 1910 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1863 dirty, level, gfn, pfn, speculative, true)) { 1911 dirty, level, gfn, pfn, speculative, true,
1912 reset_host_protection)) {
1864 if (write_fault) 1913 if (write_fault)
1865 *ptwrite = 1; 1914 *ptwrite = 1;
1866 kvm_x86_ops->tlb_flush(vcpu); 1915 kvm_x86_ops->tlb_flush(vcpu);
@@ -1877,8 +1926,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1877 page_header_update_slot(vcpu->kvm, sptep, gfn); 1926 page_header_update_slot(vcpu->kvm, sptep, gfn);
1878 if (!was_rmapped) { 1927 if (!was_rmapped) {
1879 rmap_count = rmap_add(vcpu, sptep, gfn); 1928 rmap_count = rmap_add(vcpu, sptep, gfn);
1880 if (!is_rmap_spte(*sptep)) 1929 kvm_release_pfn_clean(pfn);
1881 kvm_release_pfn_clean(pfn);
1882 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1930 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1883 rmap_recycle(vcpu, sptep, gfn); 1931 rmap_recycle(vcpu, sptep, gfn);
1884 } else { 1932 } else {
@@ -1909,7 +1957,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1909 if (iterator.level == level) { 1957 if (iterator.level == level) {
1910 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1958 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1911 0, write, 1, &pt_write, 1959 0, write, 1, &pt_write,
1912 level, gfn, pfn, false); 1960 level, gfn, pfn, false, true);
1913 ++vcpu->stat.pf_fixed; 1961 ++vcpu->stat.pf_fixed;
1914 break; 1962 break;
1915 } 1963 }
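
kvm_handle_hva() now threads an opaque u64 through to each rmap handler, which is how kvm_set_pte_rmapp() receives the host pte it mirrors into the shadow entries while kvm_unmap_rmapp() and kvm_age_rmapp() simply pass 0. The callback-plus-cookie shape, reduced to a standalone sketch (the names below are illustrative, not the KVM symbols):

	#include <stdint.h>

	typedef int (*rmap_handler_t)(unsigned long *rmapp, uint64_t data);

	/* Walk every rmap slot, forwarding the caller-supplied cookie untouched. */
	static int walk_rmaps(unsigned long *rmaps, int n, uint64_t data,
			      rmap_handler_t handler)
	{
		int i, retval = 0;

		for (i = 0; i < n; i++)
			retval |= handler(&rmaps[i], data);

		return retval;
	}

	/* One handler ignores the cookie entirely... */
	static int clear_rmapp(unsigned long *rmapp, uint64_t data)
	{
		(void)data;
		*rmapp = 0;
		return 1;
	}

	/* ...another treats it as a pointer, the way kvm_set_pte_rmapp casts it to a pte_t *. */
	static int set_rmapp(unsigned long *rmapp, uint64_t data)
	{
		*rmapp = *(unsigned long *)(uintptr_t)data;
		return 0;
	}
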
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d2fec9c12d22..72558f8ff3f5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -273,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
273 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) 273 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
274 return; 274 return;
275 kvm_get_pfn(pfn); 275 kvm_get_pfn(pfn);
276 /*
 277 * We call mmu_set_spte() with reset_host_protection = true because
278 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
279 */
276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 280 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
277 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 281 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
278 gpte_to_gfn(gpte), pfn, true); 282 gpte_to_gfn(gpte), pfn, true, true);
279} 283}
280 284
281/* 285/*
@@ -308,7 +312,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
308 user_fault, write_fault, 312 user_fault, write_fault,
309 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 313 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
310 ptwrite, level, 314 ptwrite, level,
311 gw->gfn, pfn, false); 315 gw->gfn, pfn, false, true);
312 break; 316 break;
313 } 317 }
314 318
@@ -558,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
558static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 562static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
559{ 563{
560 int i, offset, nr_present; 564 int i, offset, nr_present;
565 bool reset_host_protection;
561 566
562 offset = nr_present = 0; 567 offset = nr_present = 0;
563 568
@@ -595,9 +600,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
595 600
596 nr_present++; 601 nr_present++;
597 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 602 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
603 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
604 pte_access &= ~ACC_WRITE_MASK;
605 reset_host_protection = 0;
606 } else {
607 reset_host_protection = 1;
608 }
598 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 609 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
599 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 610 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
600 spte_to_pfn(sp->spt[i]), true, false); 611 spte_to_pfn(sp->spt[i]), true, false,
612 reset_host_protection);
601 } 613 }
602 614
603 return !nr_present; 615 return !nr_present;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 944cc9c04b3c..c17404add91f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -767,6 +767,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
767 rdtscll(tsc_this); 767 rdtscll(tsc_this);
768 delta = vcpu->arch.host_tsc - tsc_this; 768 delta = vcpu->arch.host_tsc - tsc_this;
769 svm->vmcb->control.tsc_offset += delta; 769 svm->vmcb->control.tsc_offset += delta;
770 if (is_nested(svm))
771 svm->nested.hsave->control.tsc_offset += delta;
770 vcpu->cpu = cpu; 772 vcpu->cpu = cpu;
771 kvm_migrate_timers(vcpu); 773 kvm_migrate_timers(vcpu);
772 svm->asid_generation = 0; 774 svm->asid_generation = 0;
@@ -2057,10 +2059,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2057 2059
2058 switch (ecx) { 2060 switch (ecx) {
2059 case MSR_IA32_TSC: { 2061 case MSR_IA32_TSC: {
2060 u64 tsc; 2062 u64 tsc_offset;
2063
2064 if (is_nested(svm))
2065 tsc_offset = svm->nested.hsave->control.tsc_offset;
2066 else
2067 tsc_offset = svm->vmcb->control.tsc_offset;
2061 2068
2062 rdtscll(tsc); 2069 *data = tsc_offset + native_read_tsc();
2063 *data = svm->vmcb->control.tsc_offset + tsc;
2064 break; 2070 break;
2065 } 2071 }
2066 case MSR_K6_STAR: 2072 case MSR_K6_STAR:
@@ -2146,10 +2152,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2146 2152
2147 switch (ecx) { 2153 switch (ecx) {
2148 case MSR_IA32_TSC: { 2154 case MSR_IA32_TSC: {
2149 u64 tsc; 2155 u64 tsc_offset = data - native_read_tsc();
2156 u64 g_tsc_offset = 0;
2157
2158 if (is_nested(svm)) {
2159 g_tsc_offset = svm->vmcb->control.tsc_offset -
2160 svm->nested.hsave->control.tsc_offset;
2161 svm->nested.hsave->control.tsc_offset = tsc_offset;
2162 }
2163
2164 svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
2150 2165
2151 rdtscll(tsc);
2152 svm->vmcb->control.tsc_offset = data - tsc;
2153 break; 2166 break;
2154 } 2167 }
2155 case MSR_K6_STAR: 2168 case MSR_K6_STAR:
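
For nested SVM the L1 hypervisor's hsave area keeps its own tsc_offset, so the change applies the vcpu-migration delta to both copies and makes MSR_IA32_TSC reads and writes use the offset that belongs to the level currently being virtualized. The underlying arithmetic, as a hedged standalone sketch rather than the KVM code:

	#include <stdint.h>

	/* The guest always sees host TSC plus whichever offset is active. */
	static uint64_t guest_visible_tsc(uint64_t host_tsc, uint64_t tsc_offset)
	{
		return host_tsc + tsc_offset;
	}

	/*
	 * When the vcpu lands on a CPU whose TSC differs by 'delta', adding the
	 * same delta to both the active (L2) and the saved L1 offset keeps the
	 * TSC monotonic for either guest level.
	 */
	static void migrate_tsc_offsets(uint64_t delta, uint64_t *active_offset,
					uint64_t *l1_saved_offset, int nested)
	{
		*active_offset += delta;
		if (nested)
			*l1_saved_offset += delta;
	}
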
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f3812014bd0b..ed53b42caba1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -709,7 +709,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
709 if (vcpu->cpu != cpu) { 709 if (vcpu->cpu != cpu) {
710 vcpu_clear(vmx); 710 vcpu_clear(vmx);
711 kvm_migrate_timers(vcpu); 711 kvm_migrate_timers(vcpu);
712 vpid_sync_vcpu_all(vmx); 712 set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
713 local_irq_disable(); 713 local_irq_disable();
714 list_add(&vmx->local_vcpus_link, 714 list_add(&vmx->local_vcpus_link,
715 &per_cpu(vcpus_on_cpu, cpu)); 715 &per_cpu(vcpus_on_cpu, cpu));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 74029f50b26a..fc2974adf9b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1591,6 +1591,8 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1591 1591
1592 if (cpuid->nent < 1) 1592 if (cpuid->nent < 1)
1593 goto out; 1593 goto out;
1594 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1595 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1594 r = -ENOMEM; 1596 r = -ENOMEM;
1595 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1597 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1596 if (!cpuid_entries) 1598 if (!cpuid_entries)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d677fa9ca650..7e59dc1d3fc2 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1135,11 +1135,6 @@ static struct notifier_block paniced = {
1135/* Setting up memory is fairly easy. */ 1135/* Setting up memory is fairly easy. */
1136static __init char *lguest_memory_setup(void) 1136static __init char *lguest_memory_setup(void)
1137{ 1137{
1138 /* We do this here and not earlier because lockcheck used to barf if we
1139 * did it before start_kernel(). I think we fixed that, so it'd be
1140 * nice to move it back to lguest_init. Patch welcome... */
1141 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
1142
1143 /* 1138 /*
1144 *The Linux bootloader header contains an "e820" memory map: the 1139 *The Linux bootloader header contains an "e820" memory map: the
1145 * Launcher populated the first entry with our memory limit. 1140 * Launcher populated the first entry with our memory limit.
@@ -1262,7 +1257,6 @@ __init void lguest_init(void)
1262 */ 1257 */
1263 1258
1264 /* Interrupt-related operations */ 1259 /* Interrupt-related operations */
1265 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1266 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1260 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1267 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1261 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
1268 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); 1262 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
@@ -1270,7 +1264,6 @@ __init void lguest_init(void)
1270 pv_irq_ops.safe_halt = lguest_safe_halt; 1264 pv_irq_ops.safe_halt = lguest_safe_halt;
1271 1265
1272 /* Setup operations */ 1266 /* Setup operations */
1273 pv_init_ops.memory_setup = lguest_memory_setup;
1274 pv_init_ops.patch = lguest_patch; 1267 pv_init_ops.patch = lguest_patch;
1275 1268
1276 /* Intercepts of various CPU instructions */ 1269 /* Intercepts of various CPU instructions */
@@ -1320,10 +1313,11 @@ __init void lguest_init(void)
1320 set_lguest_basic_apic_ops(); 1313 set_lguest_basic_apic_ops();
1321#endif 1314#endif
1322 1315
1323 /* Time operations */ 1316 x86_init.resources.memory_setup = lguest_memory_setup;
1324 pv_time_ops.get_wallclock = lguest_get_wallclock; 1317 x86_init.irqs.intr_init = lguest_init_IRQ;
1325 pv_time_ops.time_init = lguest_time_init; 1318 x86_init.timers.timer_init = lguest_time_init;
1326 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1319 x86_platform.calibrate_tsc = lguest_tsc_khz;
1320 x86_platform.get_wallclock = lguest_get_wallclock;
1327 1321
1328 /* 1322 /*
1329 * Now is a good time to look at the implementations of these functions 1323 * Now is a good time to look at the implementations of these functions
@@ -1365,10 +1359,13 @@ __init void lguest_init(void)
1365 1359
1366 /* 1360 /*
1367 * If we don't initialize the lock dependency checker now, it crashes 1361 * If we don't initialize the lock dependency checker now, it crashes
1368 * paravirt_disable_iospace. 1362 * atomic_notifier_chain_register, then paravirt_disable_iospace.
1369 */ 1363 */
1370 lockdep_init(); 1364 lockdep_init();
1371 1365
1366 /* Hook in our special panic hypercall code. */
1367 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
1368
1372 /* 1369 /*
1373 * The IDE code spends about 3 seconds probing for disks: if we reserve 1370 * The IDE code spends about 3 seconds probing for disks: if we reserve
1374 * all the I/O ports up front it can't get them and so doesn't probe. 1371 * all the I/O ports up front it can't get them and so doesn't probe.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 9e609206fac9..85f5db95c60f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -16,7 +16,9 @@ ifeq ($(CONFIG_X86_32),y)
16 lib-y += checksum_32.o 16 lib-y += checksum_32.o
17 lib-y += strstr_32.o 17 lib-y += strstr_32.o
18 lib-y += semaphore_32.o string_32.o 18 lib-y += semaphore_32.o string_32.o
19 19ifneq ($(CONFIG_X86_CMPXCHG64),y)
20 lib-y += cmpxchg8b_emu.o
21endif
20 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 22 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
21else 23else
22 obj-y += io_64.o iomap_copy_64.o 24 obj-y += io_64.o iomap_copy_64.o
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
new file mode 100644
index 000000000000..828cb710dec2
--- /dev/null
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -0,0 +1,57 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; version 2
5 * of the License.
6 *
7 */
8
9#include <linux/linkage.h>
10#include <asm/alternative-asm.h>
11#include <asm/frame.h>
12#include <asm/dwarf2.h>
13
14
15.text
16
17/*
18 * Inputs:
19 * %esi : memory location to compare
20 * %eax : low 32 bits of old value
21 * %edx : high 32 bits of old value
22 * %ebx : low 32 bits of new value
23 * %ecx : high 32 bits of new value
24 */
25ENTRY(cmpxchg8b_emu)
26CFI_STARTPROC
27
28#
29# Emulate 'cmpxchg8b (%esi)' on UP except we don't
30# set the whole ZF thing (caller will just compare
31# eax:edx with the expected value)
32#
33cmpxchg8b_emu:
34 pushfl
35 cli
36
37 cmpl (%esi), %eax
38 jne not_same
39 cmpl 4(%esi), %edx
40 jne half_same
41
42 movl %ebx, (%esi)
43 movl %ecx, 4(%esi)
44
45 popfl
46 ret
47
48 not_same:
49 movl (%esi), %eax
50 half_same:
51 movl 4(%esi), %edx
52
53 popfl
54 ret
55
56CFI_ENDPROC
57ENDPROC(cmpxchg8b_emu)
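
This helper exists for CPUs that lack the CX8 feature (no hardware CMPXCHG8B); the Makefile hunk below builds it only when CONFIG_X86_CMPXCHG64 is unset, and disabling interrupts around the compare and the two 32-bit stores is what makes it atomic on UP. Its contract, restated as plain C (a sketch of the semantics, not the kernel's cmpxchg64() wrapper):

	#include <stdint.h>

	/*
	 * If *ptr equals old, store new; in every case return the value that was
	 * in *ptr before the operation, which is what the caller compares
	 * against the expected value.
	 */
	static uint64_t cmpxchg8b_sw(uint64_t *ptr, uint64_t old, uint64_t new_val)
	{
		uint64_t prev = *ptr;	/* the assembly does this with IRQs off */

		if (prev == old)
			*ptr = new_val;

		return prev;
	}
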
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9b5a9f59a478..06630d26e56d 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,9 +1,10 @@
1obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o physaddr.o gup.o 2 pat.o pgtable.o physaddr.o gup.o setup_nx.o
3 3
4# Make sure __phys_addr has no stackprotector 4# Make sure __phys_addr has no stackprotector
5nostackp := $(call cc-option, -fno-stack-protector) 5nostackp := $(call cc-option, -fno-stack-protector)
6CFLAGS_physaddr.o := $(nostackp) 6CFLAGS_physaddr.o := $(nostackp)
7CFLAGS_setup_nx.o := $(nostackp)
7 8
8obj-$(CONFIG_SMP) += tlb.o 9obj-$(CONFIG_SMP) += tlb.o
9 10
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 775a020990a5..f4cee9028cf0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,7 +10,7 @@
10#include <linux/bootmem.h> /* max_low_pfn */ 10#include <linux/bootmem.h> /* max_low_pfn */
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_counter.h> /* perf_swcounter_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14 14
15#include <asm/traps.h> /* dotraplinkage, ... */ 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 16#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
167 info.si_errno = 0; 167 info.si_errno = 0;
168 info.si_code = si_code; 168 info.si_code = si_code;
169 info.si_addr = (void __user *)address; 169 info.si_addr = (void __user *)address;
170 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
170 171
171 force_sig_info(si_signo, &info, tsk); 172 force_sig_info(si_signo, &info, tsk);
172} 173}
@@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
790} 791}
791 792
792static void 793static void
793do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) 794do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
795 unsigned int fault)
794{ 796{
795 struct task_struct *tsk = current; 797 struct task_struct *tsk = current;
796 struct mm_struct *mm = tsk->mm; 798 struct mm_struct *mm = tsk->mm;
799 int code = BUS_ADRERR;
797 800
798 up_read(&mm->mmap_sem); 801 up_read(&mm->mmap_sem);
799 802
@@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
809 tsk->thread.error_code = error_code; 812 tsk->thread.error_code = error_code;
810 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
811 814
812 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 815#ifdef CONFIG_MEMORY_FAILURE
816 if (fault & VM_FAULT_HWPOISON) {
817 printk(KERN_ERR
818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
819 tsk->comm, tsk->pid, address);
820 code = BUS_MCEERR_AR;
821 }
822#endif
823 force_sig_info_fault(SIGBUS, code, address, tsk);
813} 824}
814 825
815static noinline void 826static noinline void
@@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
819 if (fault & VM_FAULT_OOM) { 830 if (fault & VM_FAULT_OOM) {
820 out_of_memory(regs, error_code, address); 831 out_of_memory(regs, error_code, address);
821 } else { 832 } else {
822 if (fault & VM_FAULT_SIGBUS) 833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
823 do_sigbus(regs, error_code, address); 834 do_sigbus(regs, error_code, address, fault);
824 else 835 else
825 BUG(); 836 BUG();
826 } 837 }
@@ -1017,7 +1028,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1017 if (unlikely(error_code & PF_RSVD)) 1028 if (unlikely(error_code & PF_RSVD))
1018 pgtable_bad(regs, error_code, address); 1029 pgtable_bad(regs, error_code, address);
1019 1030
1020 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); 1031 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
1021 1032
1022 /* 1033 /*
1023 * If we're in an interrupt, have no user context or are running 1034 * If we're in an interrupt, have no user context or are running
@@ -1114,11 +1125,11 @@ good_area:
1114 1125
1115 if (fault & VM_FAULT_MAJOR) { 1126 if (fault & VM_FAULT_MAJOR) {
1116 tsk->maj_flt++; 1127 tsk->maj_flt++;
1117 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1128 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1118 regs, address); 1129 regs, address);
1119 } else { 1130 } else {
1120 tsk->min_flt++; 1131 tsk->min_flt++;
1121 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1132 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1122 regs, address); 1133 regs, address);
1123 } 1134 }
1124 1135
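
With CONFIG_MEMORY_FAILURE a fault on a poisoned page now raises SIGBUS with si_code BUS_MCEERR_AR and fills si_addr_lsb with PAGE_SHIFT, telling the task how much of the address is affected. A hedged sketch of the receiving side in userspace (whether the libc headers expose BUS_MCEERR_AR and si_addr_lsb depends on their version; the handler below is purely illustrative and not async-signal-safe):

	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>

	static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
	{
		(void)sig; (void)ctx;

		if (si->si_code == BUS_MCEERR_AR)
			fprintf(stderr, "hardware memory failure at %p, lsb %d\n",
				si->si_addr, si->si_addr_lsb);
		else
			fprintf(stderr, "bus error at %p\n", si->si_addr);
		_exit(1);
	}

	int main(void)
	{
		struct sigaction sa = {
			.sa_sigaction	= sigbus_handler,
			.sa_flags	= SA_SIGINFO,
		};

		sigaction(SIGBUS, &sa, NULL);
		/* ... touch memory that may have been poisoned ... */
		return 0;
	}
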
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 0607119cef94..73ffd5536f62 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -28,69 +28,6 @@ int direct_gbpages
28#endif 28#endif
29; 29;
30 30
31int nx_enabled;
32
33#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
34static int disable_nx __cpuinitdata;
35
36/*
37 * noexec = on|off
38 *
39 * Control non-executable mappings for processes.
40 *
41 * on Enable
42 * off Disable
43 */
44static int __init noexec_setup(char *str)
45{
46 if (!str)
47 return -EINVAL;
48 if (!strncmp(str, "on", 2)) {
49 __supported_pte_mask |= _PAGE_NX;
50 disable_nx = 0;
51 } else if (!strncmp(str, "off", 3)) {
52 disable_nx = 1;
53 __supported_pte_mask &= ~_PAGE_NX;
54 }
55 return 0;
56}
57early_param("noexec", noexec_setup);
58#endif
59
60#ifdef CONFIG_X86_PAE
61static void __init set_nx(void)
62{
63 unsigned int v[4], l, h;
64
65 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
66 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
67
68 if ((v[3] & (1 << 20)) && !disable_nx) {
69 rdmsr(MSR_EFER, l, h);
70 l |= EFER_NX;
71 wrmsr(MSR_EFER, l, h);
72 nx_enabled = 1;
73 __supported_pte_mask |= _PAGE_NX;
74 }
75 }
76}
77#else
78static inline void set_nx(void)
79{
80}
81#endif
82
83#ifdef CONFIG_X86_64
84void __cpuinit check_efer(void)
85{
86 unsigned long efer;
87
88 rdmsrl(MSR_EFER, efer);
89 if (!(efer & EFER_NX) || disable_nx)
90 __supported_pte_mask &= ~_PAGE_NX;
91}
92#endif
93
94static void __init find_early_table_space(unsigned long end, int use_pse, 31static void __init find_early_table_space(unsigned long end, int use_pse,
95 int use_gbpages) 32 int use_gbpages)
96{ 33{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3cd7711bb949..30938c1d8d5d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
84#ifdef CONFIG_X86_PAE 84#ifdef CONFIG_X86_PAE
85 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 85 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
86 if (after_bootmem) 86 if (after_bootmem)
87 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 87 pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
88 else 88 else
89 pmd_table = (pmd_t *)alloc_low_page(); 89 pmd_table = (pmd_t *)alloc_low_page();
90 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 90 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
@@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
116#endif 116#endif
117 if (!page_table) 117 if (!page_table)
118 page_table = 118 page_table =
119 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 119 (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
120 } else 120 } else
121 page_table = (pte_t *)alloc_low_page(); 121 page_table = (pte_t *)alloc_low_page();
122 122
@@ -857,8 +857,6 @@ static void __init test_wp_bit(void)
857 } 857 }
858} 858}
859 859
860static struct kcore_list kcore_mem, kcore_vmalloc;
861
862void __init mem_init(void) 860void __init mem_init(void)
863{ 861{
864 int codesize, reservedpages, datasize, initsize; 862 int codesize, reservedpages, datasize, initsize;
@@ -886,13 +884,9 @@ void __init mem_init(void)
886 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 884 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
887 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 885 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
888 886
889 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
890 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
891 VMALLOC_END-VMALLOC_START);
892
893 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " 887 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
894 "%dk reserved, %dk data, %dk init, %ldk highmem)\n", 888 "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
895 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 889 nr_free_pages() << (PAGE_SHIFT-10),
896 num_physpages << (PAGE_SHIFT-10), 890 num_physpages << (PAGE_SHIFT-10),
897 codesize >> 10, 891 codesize >> 10,
898 reservedpages << (PAGE_SHIFT-10), 892 reservedpages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ea56b8cbb6a6..5a4398a6006b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -647,8 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
647 647
648#endif /* CONFIG_MEMORY_HOTPLUG */ 648#endif /* CONFIG_MEMORY_HOTPLUG */
649 649
650static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 650static struct kcore_list kcore_vsyscall;
651 kcore_modules, kcore_vsyscall;
652 651
653void __init mem_init(void) 652void __init mem_init(void)
654{ 653{
@@ -677,17 +676,12 @@ void __init mem_init(void)
677 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 676 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
678 677
679 /* Register memory areas for /proc/kcore */ 678 /* Register memory areas for /proc/kcore */
680 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
681 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
682 VMALLOC_END-VMALLOC_START);
683 kclist_add(&kcore_kernel, &_stext, _end - _stext);
684 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
685 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 679 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
686 VSYSCALL_END - VSYSCALL_START); 680 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
687 681
688 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 682 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
689 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", 683 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
690 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 684 nr_free_pages() << (PAGE_SHIFT-10),
691 max_pfn << (PAGE_SHIFT-10), 685 max_pfn << (PAGE_SHIFT-10),
692 codesize >> 10, 686 codesize >> 10,
693 absent_pages << (PAGE_SHIFT-10), 687 absent_pages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index fe6f84ca121e..84e236ce76ba 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -21,7 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23 23
24int is_io_mapping_possible(resource_size_t base, unsigned long size) 24static int is_io_mapping_possible(resource_size_t base, unsigned long size)
25{ 25{
26#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) 26#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
27 /* There is no way to map greater than 1 << 32 address without PAE */ 27 /* There is no way to map greater than 1 << 32 address without PAE */
@@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
30#endif 30#endif
31 return 1; 31 return 1;
32} 32}
33EXPORT_SYMBOL_GPL(is_io_mapping_possible); 33
34int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
35{
36 unsigned long flag = _PAGE_CACHE_WC;
37 int ret;
38
39 if (!is_io_mapping_possible(base, size))
40 return -EINVAL;
41
42 ret = io_reserve_memtype(base, base + size, &flag);
43 if (ret)
44 return ret;
45
46 *prot = __pgprot(__PAGE_KERNEL | flag);
47 return 0;
48}
49EXPORT_SYMBOL_GPL(iomap_create_wc);
50
51void
52iomap_free(resource_size_t base, unsigned long size)
53{
54 io_free_memtype(base, base + size);
55}
56EXPORT_SYMBOL_GPL(iomap_free);
34 57
35void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 58void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
36{ 59{
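
iomap_create_wc()/iomap_free() replace the bare is_io_mapping_possible() export: a driver reserves a write-combining memtype for its aperture once and then reuses the returned pgprot for short-lived atomic mappings. A sketch of that usage with kmap_atomic_prot_pfn() from the hunk above (the fb_* wrappers and the aperture itself are made up; kunmap_atomic() taking a km_type slot matches the kernel of this era):

	/* Illustrative driver fragment, not an in-tree user. */
	static pgprot_t fb_prot;

	static int fb_map_aperture(resource_size_t base, unsigned long size)
	{
		/* fails if the range already carries an incompatible memtype */
		return iomap_create_wc(base, size, &fb_prot);
	}

	static void fb_write_pixel(unsigned long pfn, unsigned int offset, u32 val)
	{
		void *vaddr = kmap_atomic_prot_pfn(pfn, KM_USER0, fb_prot);

		*(u32 *)(vaddr + offset) = val;
		kunmap_atomic(vaddr, KM_USER0);
	}

	static void fb_unmap_aperture(resource_size_t base, unsigned long size)
	{
		iomap_free(base, size);
	}
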
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 04e1ad60c63a..334e63ca7b2b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -158,24 +158,14 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
158 retval = reserve_memtype(phys_addr, (u64)phys_addr + size, 158 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
159 prot_val, &new_prot_val); 159 prot_val, &new_prot_val);
160 if (retval) { 160 if (retval) {
161 pr_debug("Warning: reserve_memtype returned %d\n", retval); 161 printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
162 return NULL; 162 return NULL;
163 } 163 }
164 164
165 if (prot_val != new_prot_val) { 165 if (prot_val != new_prot_val) {
166 /* 166 if (!is_new_memtype_allowed(phys_addr, size,
167 * Do not fallback to certain memory types with certain 167 prot_val, new_prot_val)) {
168 * requested type: 168 printk(KERN_ERR
169 * - request is uc-, return cannot be write-back
170 * - request is uc-, return cannot be write-combine
171 * - request is write-combine, return cannot be write-back
172 */
173 if ((prot_val == _PAGE_CACHE_UC_MINUS &&
174 (new_prot_val == _PAGE_CACHE_WB ||
175 new_prot_val == _PAGE_CACHE_WC)) ||
176 (prot_val == _PAGE_CACHE_WC &&
177 new_prot_val == _PAGE_CACHE_WB)) {
178 pr_debug(
179 "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", 169 "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
180 (unsigned long long)phys_addr, 170 (unsigned long long)phys_addr,
181 (unsigned long long)(phys_addr + size), 171 (unsigned long long)(phys_addr + size),
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 528bf954eb74..8cc183344140 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs)
225 225
226 BUG_ON(!irqs_disabled()); 226 BUG_ON(!irqs_disabled());
227 227
228 if (data->balance == 0)
229 return;
230
231 if (unlikely(data->balance != 1)) { 228 if (unlikely(data->balance != 1)) {
232 kmemcheck_show_all(); 229 kmemcheck_show_all();
233 kmemcheck_error_save_bug(regs); 230 kmemcheck_error_save_bug(regs);
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index e773b6bd0079..3f66b82076a3 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -1,7 +1,6 @@
1#include <linux/kmemcheck.h> 1#include <linux/kmemcheck.h>
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/module.h>
5 4
6#include <asm/page.h> 5#include <asm/page.h>
7#include <asm/pgtable.h> 6#include <asm/pgtable.h>
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 165829600566..c8191defc38a 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -29,13 +29,26 @@
29#include <linux/random.h> 29#include <linux/random.h>
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <asm/elf.h>
33
34static unsigned int stack_maxrandom_size(void)
35{
36 unsigned int max = 0;
37 if ((current->flags & PF_RANDOMIZE) &&
38 !(current->personality & ADDR_NO_RANDOMIZE)) {
39 max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
40 }
41
42 return max;
43}
44
32 45
33/* 46/*
34 * Top of mmap area (just below the process stack). 47 * Top of mmap area (just below the process stack).
35 * 48 *
36 * Leave an at least ~128 MB hole. 49 * Leave an at least ~128 MB hole with possible stack randomization.
37 */ 50 */
38#define MIN_GAP (128*1024*1024) 51#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
39#define MAX_GAP (TASK_SIZE/6*5) 52#define MAX_GAP (TASK_SIZE/6*5)
40 53
41/* 54/*
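
MIN_GAP now includes the worst-case stack randomization, so a randomized stack top can no longer collide with the top-down mmap area. Assuming 32-bit numbers (STACK_RND_MASK of 0x7ff and 4 KiB pages), stack_maxrandom_size() contributes up to 0x7ff << 12, roughly 8 MiB, turning the guaranteed hole into about 136 MiB. A simplified sketch of the arithmetic (not the kernel's actual mmap_base() code, and the 3 GiB TASK_SIZE is an assumption):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define MIN_GAP_BASE	(128UL * 1024 * 1024)

	static unsigned long stack_maxrandom(unsigned long stack_rnd_mask)
	{
		return stack_rnd_mask << PAGE_SHIFT;	/* largest randomization in bytes */
	}

	int main(void)
	{
		unsigned long task_size = 0xC0000000UL;		/* 3 GiB of user space */
		unsigned long gap = MIN_GAP_BASE + stack_maxrandom(0x7ff);

		printf("mmap area tops out at %#lx (gap %lu MiB)\n",
		       task_size - gap, gap >> 20);
		return 0;
	}
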
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e600c1962db..dd38bfbefd1f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/pfn.h> 14#include <linux/pfn.h>
15#include <linux/percpu.h>
15 16
16#include <asm/e820.h> 17#include <asm/e820.h>
17#include <asm/processor.h> 18#include <asm/processor.h>
@@ -143,6 +144,7 @@ void clflush_cache_range(void *vaddr, unsigned int size)
143 144
144 mb(); 145 mb();
145} 146}
147EXPORT_SYMBOL_GPL(clflush_cache_range);
146 148
147static void __cpa_flush_all(void *arg) 149static void __cpa_flush_all(void *arg)
148{ 150{
@@ -686,7 +688,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
686{ 688{
687 struct cpa_data alias_cpa; 689 struct cpa_data alias_cpa;
688 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); 690 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
689 unsigned long vaddr, remapped; 691 unsigned long vaddr;
690 int ret; 692 int ret;
691 693
692 if (cpa->pfn >= max_pfn_mapped) 694 if (cpa->pfn >= max_pfn_mapped)
@@ -744,24 +746,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
744 } 746 }
745#endif 747#endif
746 748
747 /*
748 * If the PMD page was partially used for per-cpu remapping,
749 * the recycled area needs to be split and modified. Because
750 * the area is always proper subset of a PMD page
751 * cpa->numpages is guaranteed to be 1 for these areas, so
752 * there's no need to loop over and check for further remaps.
753 */
754 remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
755 if (remapped) {
756 WARN_ON(cpa->numpages > 1);
757 alias_cpa = *cpa;
758 alias_cpa.vaddr = &remapped;
759 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
760 ret = __change_page_attr_set_clr(&alias_cpa, 0);
761 if (ret)
762 return ret;
763 }
764
765 return 0; 749 return 0;
766} 750}
767 751
@@ -822,6 +806,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
822{ 806{
823 struct cpa_data cpa; 807 struct cpa_data cpa;
824 int ret, cache, checkalias; 808 int ret, cache, checkalias;
809 unsigned long baddr = 0;
825 810
826 /* 811 /*
827 * Check, if we are requested to change a not supported 812 * Check, if we are requested to change a not supported
@@ -853,6 +838,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
853 */ 838 */
854 WARN_ON_ONCE(1); 839 WARN_ON_ONCE(1);
855 } 840 }
841 /*
842 * Save address for cache flush. *addr is modified in the call
843 * to __change_page_attr_set_clr() below.
844 */
845 baddr = *addr;
856 } 846 }
857 847
858 /* Must avoid aliasing mappings in the highmem code */ 848 /* Must avoid aliasing mappings in the highmem code */
@@ -900,7 +890,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
900 cpa_flush_array(addr, numpages, cache, 890 cpa_flush_array(addr, numpages, cache,
901 cpa.flags, pages); 891 cpa.flags, pages);
902 } else 892 } else
903 cpa_flush_range(*addr, numpages, cache); 893 cpa_flush_range(baddr, numpages, cache);
904 } else 894 } else
905 cpa_flush_all(cache); 895 cpa_flush_all(cache);
906 896
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index b2f7d3e59b86..e78cd0ec2bcf 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -15,6 +15,7 @@
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/rbtree.h>
18 19
19#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
20#include <asm/processor.h> 21#include <asm/processor.h>
@@ -80,6 +81,7 @@ enum {
80void pat_init(void) 81void pat_init(void)
81{ 82{
82 u64 pat; 83 u64 pat;
84 bool boot_cpu = !boot_pat_state;
83 85
84 if (!pat_enabled) 86 if (!pat_enabled)
85 return; 87 return;
@@ -121,8 +123,10 @@ void pat_init(void)
121 rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); 123 rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
122 124
123 wrmsrl(MSR_IA32_CR_PAT, pat); 125 wrmsrl(MSR_IA32_CR_PAT, pat);
124 printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", 126
125 smp_processor_id(), boot_pat_state, pat); 127 if (boot_cpu)
128 printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
129 smp_processor_id(), boot_pat_state, pat);
126} 130}
127 131
128#undef PAT 132#undef PAT
@@ -148,11 +152,10 @@ static char *cattr_name(unsigned long flags)
148 * areas). All the aliases have the same cache attributes of course. 152 * areas). All the aliases have the same cache attributes of course.
149 * Zero attributes are represented as holes. 153 * Zero attributes are represented as holes.
150 * 154 *
151 * Currently the data structure is a list because the number of mappings 155 * The data structure is a list that is also organized as an rbtree
152 * are expected to be relatively small. If this should be a problem 156 * sorted on the start address of memtype range.
153 * it could be changed to a rbtree or similar.
154 * 157 *
155 * memtype_lock protects the whole list. 158 * memtype_lock protects both the linear list and rbtree.
156 */ 159 */
157 160
158struct memtype { 161struct memtype {
@@ -160,11 +163,53 @@ struct memtype {
160 u64 end; 163 u64 end;
161 unsigned long type; 164 unsigned long type;
162 struct list_head nd; 165 struct list_head nd;
166 struct rb_node rb;
163}; 167};
164 168
169static struct rb_root memtype_rbroot = RB_ROOT;
165static LIST_HEAD(memtype_list); 170static LIST_HEAD(memtype_list);
166static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 171static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
167 172
173static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
174{
175 struct rb_node *node = root->rb_node;
176 struct memtype *last_lower = NULL;
177
178 while (node) {
179 struct memtype *data = container_of(node, struct memtype, rb);
180
181 if (data->start < start) {
182 last_lower = data;
183 node = node->rb_right;
184 } else if (data->start > start) {
185 node = node->rb_left;
186 } else
187 return data;
188 }
189
190 /* Will return NULL if there is no entry with its start <= start */
191 return last_lower;
192}
193
194static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
195{
196 struct rb_node **new = &(root->rb_node);
197 struct rb_node *parent = NULL;
198
199 while (*new) {
200 struct memtype *this = container_of(*new, struct memtype, rb);
201
202 parent = *new;
203 if (data->start <= this->start)
204 new = &((*new)->rb_left);
205 else if (data->start > this->start)
206 new = &((*new)->rb_right);
207 }
208
209 rb_link_node(&data->rb, parent, new);
210 rb_insert_color(&data->rb, root);
211}
212
168/* 213/*
169 * Does intersection of PAT memory type and MTRR memory type and returns 214 * Does intersection of PAT memory type and MTRR memory type and returns
170 * the resulting memory type as PAT understands it. 215 * the resulting memory type as PAT understands it.
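
Since the tree is keyed on the start address, memtype_rb_search() hands back either the exact entry or the closest one that begins below the query, which is the natural candidate when checking whether an existing reservation already covers a range. A short illustration of using that result (the lookup helper below is made up; only memtype_rb_search(), memtype_rbroot and struct memtype come from the patch):

	/* Illustrative, assumes the caller holds memtype_lock. */
	static struct memtype *memtype_covering(u64 start, u64 end)
	{
		struct memtype *entry = memtype_rb_search(&memtype_rbroot, start);

		/*
		 * The search can return an entry that begins before 'start'; it
		 * only matters if it still extends into the queried range.
		 */
		if (entry && entry->end > start && entry->start < end)
			return entry;

		return NULL;
	}
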
@@ -218,9 +263,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
218 return -EBUSY; 263 return -EBUSY;
219} 264}
220 265
221static struct memtype *cached_entry;
222static u64 cached_start;
223
224static int pat_pagerange_is_ram(unsigned long start, unsigned long end) 266static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
225{ 267{
226 int ram_page = 0, not_rampage = 0; 268 int ram_page = 0, not_rampage = 0;
@@ -249,63 +291,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
249} 291}
250 292
251/* 293/*
252 * For RAM pages, mark the pages as non WB memory type using 294 * For RAM pages, we use page flags to mark the pages with appropriate type.
 253 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or 295 * Here we do two passes:
254 * set_memory_wc() on a RAM page at a time before marking it as WB again. 296 * - Find the memtype of all the pages in the range, look for any conflicts
255 * This is ok, because only one driver will be owning the page and 297 * - In case of no conflicts, set the new memtype for pages in the range
256 * doing set_memory_*() calls.
257 * 298 *
258 * For now, we use PageNonWB to track that the RAM page is being mapped 299 * Caller must hold memtype_lock for atomicity.
259 * as non WB. In future, we will have to use one more flag
260 * (or some other mechanism in page_struct) to distinguish between
261 * UC and WC mapping.
262 */ 300 */
263static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, 301static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
264 unsigned long *new_type) 302 unsigned long *new_type)
265{ 303{
266 struct page *page; 304 struct page *page;
267 u64 pfn, end_pfn; 305 u64 pfn;
306
307 if (req_type == _PAGE_CACHE_UC) {
308 /* We do not support strong UC */
309 WARN_ON_ONCE(1);
310 req_type = _PAGE_CACHE_UC_MINUS;
311 }
268 312
269 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 313 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
270 page = pfn_to_page(pfn); 314 unsigned long type;
271 if (page_mapped(page) || PageNonWB(page))
272 goto out;
273 315
274 SetPageNonWB(page); 316 page = pfn_to_page(pfn);
317 type = get_page_memtype(page);
318 if (type != -1) {
319 printk(KERN_INFO "reserve_ram_pages_type failed "
320 "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
321 start, end, type, req_type);
322 if (new_type)
323 *new_type = type;
324
325 return -EBUSY;
326 }
275 } 327 }
276 return 0;
277 328
278out: 329 if (new_type)
279 end_pfn = pfn; 330 *new_type = req_type;
280 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { 331
332 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
281 page = pfn_to_page(pfn); 333 page = pfn_to_page(pfn);
282 ClearPageNonWB(page); 334 set_page_memtype(page, req_type);
283 } 335 }
284 336 return 0;
285 return -EINVAL;
286} 337}
287 338
288static int free_ram_pages_type(u64 start, u64 end) 339static int free_ram_pages_type(u64 start, u64 end)
289{ 340{
290 struct page *page; 341 struct page *page;
291 u64 pfn, end_pfn; 342 u64 pfn;
292 343
293 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 344 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
294 page = pfn_to_page(pfn); 345 page = pfn_to_page(pfn);
295 if (page_mapped(page) || !PageNonWB(page)) 346 set_page_memtype(page, -1);
296 goto out;
297
298 ClearPageNonWB(page);
299 } 347 }
300 return 0; 348 return 0;
301
302out:
303 end_pfn = pfn;
304 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
305 page = pfn_to_page(pfn);
306 SetPageNonWB(page);
307 }
308 return -EINVAL;
309} 349}
310 350
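
Editor's note: the two-pass scheme above (scan the whole range for conflicts first, commit the new type only if every page is free) is easy to model in isolation. A hedged userspace sketch, using a plain array where the kernel uses per-page flags via get_page_memtype()/set_page_memtype() -- all names below are stand-ins, not kernel API:

#include <stdio.h>

#define NPAGES 8
#define TYPE_FREE (-1L)

static long page_type[NPAGES] = { [0 ... NPAGES - 1] = TYPE_FREE };

/* Pass 1: reject the whole request if any page is already tracked.
 * Pass 2: only then stamp the requested type on every page. */
static int reserve_range(int first, int last, long req_type)
{
        int pfn;

        for (pfn = first; pfn < last; pfn++)
                if (page_type[pfn] != TYPE_FREE)
                        return -1;              /* conflict, nothing changed */

        for (pfn = first; pfn < last; pfn++)
                page_type[pfn] = req_type;
        return 0;
}

static void free_range(int first, int last)
{
        int pfn;

        for (pfn = first; pfn < last; pfn++)
                page_type[pfn] = TYPE_FREE;     /* back to the default (WB) state */
}

int main(void)
{
        printf("first reserve: %d\n", reserve_range(2, 5, 1));        /* 0 */
        printf("conflicting reserve: %d\n", reserve_range(4, 6, 2));  /* -1 */
        free_range(2, 5);
        printf("after free: %d\n", reserve_range(4, 6, 2));           /* 0 */
        return 0;
}

Because pass 1 makes no changes, a failed reservation leaves every page exactly as it was, which is what lets the kernel version drop the old rollback loop.
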
311/* 351/*
@@ -339,6 +379,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
339 if (new_type) { 379 if (new_type) {
340 if (req_type == -1) 380 if (req_type == -1)
341 *new_type = _PAGE_CACHE_WB; 381 *new_type = _PAGE_CACHE_WB;
382 else if (req_type == _PAGE_CACHE_WC)
383 *new_type = _PAGE_CACHE_UC_MINUS;
342 else 384 else
343 *new_type = req_type & _PAGE_CACHE_MASK; 385 *new_type = req_type & _PAGE_CACHE_MASK;
344 } 386 }
@@ -364,11 +406,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
364 *new_type = actual_type; 406 *new_type = actual_type;
365 407
366 is_range_ram = pat_pagerange_is_ram(start, end); 408 is_range_ram = pat_pagerange_is_ram(start, end);
367 if (is_range_ram == 1) 409 if (is_range_ram == 1) {
368 return reserve_ram_pages_type(start, end, req_type, 410
369 new_type); 411 spin_lock(&memtype_lock);
370 else if (is_range_ram < 0) 412 err = reserve_ram_pages_type(start, end, req_type, new_type);
413 spin_unlock(&memtype_lock);
414
415 return err;
416 } else if (is_range_ram < 0) {
371 return -EINVAL; 417 return -EINVAL;
418 }
372 419
373 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 420 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
374 if (!new) 421 if (!new)
@@ -380,17 +427,11 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
380 427
381 spin_lock(&memtype_lock); 428 spin_lock(&memtype_lock);
382 429
383 if (cached_entry && start >= cached_start)
384 entry = cached_entry;
385 else
386 entry = list_entry(&memtype_list, struct memtype, nd);
387
388 /* Search for existing mapping that overlaps the current range */ 430 /* Search for existing mapping that overlaps the current range */
389 where = NULL; 431 where = NULL;
390 list_for_each_entry_continue(entry, &memtype_list, nd) { 432 list_for_each_entry(entry, &memtype_list, nd) {
391 if (end <= entry->start) { 433 if (end <= entry->start) {
392 where = entry->nd.prev; 434 where = entry->nd.prev;
393 cached_entry = list_entry(where, struct memtype, nd);
394 break; 435 break;
395 } else if (start <= entry->start) { /* end > entry->start */ 436 } else if (start <= entry->start) { /* end > entry->start */
396 err = chk_conflict(new, entry, new_type); 437 err = chk_conflict(new, entry, new_type);
@@ -398,8 +439,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
398 dprintk("Overlap at 0x%Lx-0x%Lx\n", 439 dprintk("Overlap at 0x%Lx-0x%Lx\n",
399 entry->start, entry->end); 440 entry->start, entry->end);
400 where = entry->nd.prev; 441 where = entry->nd.prev;
401 cached_entry = list_entry(where,
402 struct memtype, nd);
403 } 442 }
404 break; 443 break;
405 } else if (start < entry->end) { /* start > entry->start */ 444 } else if (start < entry->end) { /* start > entry->start */
@@ -407,8 +446,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
407 if (!err) { 446 if (!err) {
408 dprintk("Overlap at 0x%Lx-0x%Lx\n", 447 dprintk("Overlap at 0x%Lx-0x%Lx\n",
409 entry->start, entry->end); 448 entry->start, entry->end);
410 cached_entry = list_entry(entry->nd.prev,
411 struct memtype, nd);
412 449
413 /* 450 /*
414 * Move to right position in the linked 451 * Move to right position in the linked
@@ -436,13 +473,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
436 return err; 473 return err;
437 } 474 }
438 475
439 cached_start = start;
440
441 if (where) 476 if (where)
442 list_add(&new->nd, where); 477 list_add(&new->nd, where);
443 else 478 else
444 list_add_tail(&new->nd, &memtype_list); 479 list_add_tail(&new->nd, &memtype_list);
445 480
481 memtype_rb_insert(&memtype_rbroot, new);
482
446 spin_unlock(&memtype_lock); 483 spin_unlock(&memtype_lock);
447 484
448 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 485 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -454,7 +491,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
454 491
455int free_memtype(u64 start, u64 end) 492int free_memtype(u64 start, u64 end)
456{ 493{
457 struct memtype *entry; 494 struct memtype *entry, *saved_entry;
458 int err = -EINVAL; 495 int err = -EINVAL;
459 int is_range_ram; 496 int is_range_ram;
460 497
@@ -466,23 +503,58 @@ int free_memtype(u64 start, u64 end)
466 return 0; 503 return 0;
467 504
468 is_range_ram = pat_pagerange_is_ram(start, end); 505 is_range_ram = pat_pagerange_is_ram(start, end);
469 if (is_range_ram == 1) 506 if (is_range_ram == 1) {
470 return free_ram_pages_type(start, end); 507
471 else if (is_range_ram < 0) 508 spin_lock(&memtype_lock);
509 err = free_ram_pages_type(start, end);
510 spin_unlock(&memtype_lock);
511
512 return err;
513 } else if (is_range_ram < 0) {
472 return -EINVAL; 514 return -EINVAL;
515 }
473 516
474 spin_lock(&memtype_lock); 517 spin_lock(&memtype_lock);
475 list_for_each_entry(entry, &memtype_list, nd) { 518
519 entry = memtype_rb_search(&memtype_rbroot, start);
520 if (unlikely(entry == NULL))
521 goto unlock_ret;
522
523 /*
 524	 * Saved entry points to an entry whose start is the same as, or less
 525	 * than, the start we searched for. Now walk the list in both directions
 526	 * to find the entry that matches both start and end, relying on the
 527	 * list being sorted by start address.
528 */
529 saved_entry = entry;
530 list_for_each_entry_from(entry, &memtype_list, nd) {
476 if (entry->start == start && entry->end == end) { 531 if (entry->start == start && entry->end == end) {
477 if (cached_entry == entry || cached_start == start) 532 rb_erase(&entry->rb, &memtype_rbroot);
478 cached_entry = NULL; 533 list_del(&entry->nd);
534 kfree(entry);
535 err = 0;
536 break;
537 } else if (entry->start > start) {
538 break;
539 }
540 }
479 541
542 if (!err)
543 goto unlock_ret;
544
545 entry = saved_entry;
546 list_for_each_entry_reverse(entry, &memtype_list, nd) {
547 if (entry->start == start && entry->end == end) {
548 rb_erase(&entry->rb, &memtype_rbroot);
480 list_del(&entry->nd); 549 list_del(&entry->nd);
481 kfree(entry); 550 kfree(entry);
482 err = 0; 551 err = 0;
483 break; 552 break;
553 } else if (entry->start < start) {
554 break;
484 } 555 }
485 } 556 }
557unlock_ret:
486 spin_unlock(&memtype_lock); 558 spin_unlock(&memtype_lock);
487 559
488 if (err) { 560 if (err) {
@@ -496,6 +568,101 @@ int free_memtype(u64 start, u64 end)
496} 568}
497 569
498 570
571/**
 572 * lookup_memtype - Looks up the memory type for a physical address
573 * @paddr: physical address of which memory type needs to be looked up
574 *
575 * Only to be called when PAT is enabled
576 *
577 * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
578 * _PAGE_CACHE_UC
579 */
580static unsigned long lookup_memtype(u64 paddr)
581{
582 int rettype = _PAGE_CACHE_WB;
583 struct memtype *entry;
584
585 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
586 return rettype;
587
588 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
589 struct page *page;
590 spin_lock(&memtype_lock);
591 page = pfn_to_page(paddr >> PAGE_SHIFT);
592 rettype = get_page_memtype(page);
593 spin_unlock(&memtype_lock);
594 /*
595 * -1 from get_page_memtype() implies RAM page is in its
596 * default state and not reserved, and hence of type WB
597 */
598 if (rettype == -1)
599 rettype = _PAGE_CACHE_WB;
600
601 return rettype;
602 }
603
604 spin_lock(&memtype_lock);
605
606 entry = memtype_rb_search(&memtype_rbroot, paddr);
607 if (entry != NULL)
608 rettype = entry->type;
609 else
610 rettype = _PAGE_CACHE_UC_MINUS;
611
612 spin_unlock(&memtype_lock);
613 return rettype;
614}
615
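
Editor's note: lookup_memtype() hands back one of the _PAGE_CACHE_* values, and callers later in this patch splice it into a protection value by clearing _PAGE_CACHE_MASK and OR-ing the looked-up attribute in. A tiny standalone model of that bit surgery -- the constants below are illustrative, not the real x86 PTE encodings:

#include <stdio.h>

#define CACHE_MASK      0x18UL          /* illustrative mask, not the x86 one */
#define CACHE_WB        0x00UL
#define CACHE_UC_MINUS  0x10UL

/* Replace only the caching-attribute bits of a prot word, as
 * reserve_pfn_range()/track_pfn_vma_new() do with *vma_prot. */
static unsigned long apply_cache_attr(unsigned long prot, unsigned long attr)
{
        return (prot & ~CACHE_MASK) | attr;
}

int main(void)
{
        unsigned long prot = 0x67UL | CACHE_WB;       /* some base bits plus WB */

        prot = apply_cache_attr(prot, CACHE_UC_MINUS);
        printf("prot = %#lx\n", prot);                /* base bits kept, UC- set */
        return 0;
}
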
616/**
617 * io_reserve_memtype - Request a memory type mapping for a region of memory
618 * @start: start (physical address) of the region
619 * @end: end (physical address) of the region
 620 * @type: A pointer to the requested memory type. On success it is updated
 621 * to the type actually granted, which may be a compatible alternative
622 *
623 * On success, returns 0
624 * On failure, returns non-zero
625 */
626int io_reserve_memtype(resource_size_t start, resource_size_t end,
627 unsigned long *type)
628{
629 resource_size_t size = end - start;
630 unsigned long req_type = *type;
631 unsigned long new_type;
632 int ret;
633
634 WARN_ON_ONCE(iomem_map_sanity_check(start, size));
635
636 ret = reserve_memtype(start, end, req_type, &new_type);
637 if (ret)
638 goto out_err;
639
640 if (!is_new_memtype_allowed(start, size, req_type, new_type))
641 goto out_free;
642
643 if (kernel_map_sync_memtype(start, size, new_type) < 0)
644 goto out_free;
645
646 *type = new_type;
647 return 0;
648
649out_free:
650 free_memtype(start, end);
651 ret = -EBUSY;
652out_err:
653 return ret;
654}
655
656/**
657 * io_free_memtype - Release a memory type mapping for a region of memory
658 * @start: start (physical address) of the region
659 * @end: end (physical address) of the region
660 */
661void io_free_memtype(resource_size_t start, resource_size_t end)
662{
663 free_memtype(start, end);
664}
665
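
Editor's note: io_reserve_memtype() above is also a compact illustration of the kernel's goto-based unwinding style: each later failure releases exactly what earlier steps acquired. A self-contained sketch of that control flow with stand-in step functions (all names hypothetical, not kernel code):

#include <stdio.h>

static int step_reserve(void)  { puts("reserve");  return 0; }
static int step_check(void)    { puts("check");    return -1; }  /* force a failure */
static void undo_reserve(void) { puts("undo reserve"); }

/* Acquire in order; on failure release only what was acquired,
 * mirroring the out_free/out_err labels in io_reserve_memtype(). */
static int do_setup(void)
{
        int ret;

        ret = step_reserve();
        if (ret)
                goto out_err;

        ret = step_check();
        if (ret)
                goto out_free;

        return 0;

out_free:
        undo_reserve();
        ret = -1;
out_err:
        return ret;
}

int main(void)
{
        printf("do_setup() = %d\n", do_setup());
        return 0;
}
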
499pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 666pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
500 unsigned long size, pgprot_t vma_prot) 667 unsigned long size, pgprot_t vma_prot)
501{ 668{
@@ -577,7 +744,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
577{ 744{
578 unsigned long id_sz; 745 unsigned long id_sz;
579 746
580 if (!pat_enabled || base >= __pa(high_memory)) 747 if (base >= __pa(high_memory))
581 return 0; 748 return 0;
582 749
583 id_sz = (__pa(high_memory) < base + size) ? 750 id_sz = (__pa(high_memory) < base + size) ?
@@ -612,11 +779,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
612 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 779 is_ram = pat_pagerange_is_ram(paddr, paddr + size);
613 780
614 /* 781 /*
615 * reserve_pfn_range() doesn't support RAM pages. Maintain the current 782 * reserve_pfn_range() for RAM pages. We do not refcount to keep
616 * behavior with RAM pages by returning success. 783 * track of number of mappings of RAM pages. We can assert that
784 * the type requested matches the type of first page in the range.
617 */ 785 */
618 if (is_ram != 0) 786 if (is_ram) {
787 if (!pat_enabled)
788 return 0;
789
790 flags = lookup_memtype(paddr);
791 if (want_flags != flags) {
792 printk(KERN_WARNING
793 "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
794 current->comm, current->pid,
795 cattr_name(want_flags),
796 (unsigned long long)paddr,
797 (unsigned long long)(paddr + size),
798 cattr_name(flags));
799 *vma_prot = __pgprot((pgprot_val(*vma_prot) &
800 (~_PAGE_CACHE_MASK)) |
801 flags);
802 }
619 return 0; 803 return 0;
804 }
620 805
621 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); 806 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
622 if (ret) 807 if (ret)
@@ -678,14 +863,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
678 unsigned long vma_size = vma->vm_end - vma->vm_start; 863 unsigned long vma_size = vma->vm_end - vma->vm_start;
679 pgprot_t pgprot; 864 pgprot_t pgprot;
680 865
681 if (!pat_enabled)
682 return 0;
683
684 /*
685 * For now, only handle remap_pfn_range() vmas where
686 * is_linear_pfn_mapping() == TRUE. Handling of
687 * vm_insert_pfn() is TBD.
688 */
689 if (is_linear_pfn_mapping(vma)) { 866 if (is_linear_pfn_mapping(vma)) {
690 /* 867 /*
691 * reserve the whole chunk covered by vma. We need the 868 * reserve the whole chunk covered by vma. We need the
@@ -713,23 +890,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
713int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, 890int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
714 unsigned long pfn, unsigned long size) 891 unsigned long pfn, unsigned long size)
715{ 892{
893 unsigned long flags;
716 resource_size_t paddr; 894 resource_size_t paddr;
717 unsigned long vma_size = vma->vm_end - vma->vm_start; 895 unsigned long vma_size = vma->vm_end - vma->vm_start;
718 896
719 if (!pat_enabled)
720 return 0;
721
722 /*
723 * For now, only handle remap_pfn_range() vmas where
724 * is_linear_pfn_mapping() == TRUE. Handling of
725 * vm_insert_pfn() is TBD.
726 */
727 if (is_linear_pfn_mapping(vma)) { 897 if (is_linear_pfn_mapping(vma)) {
728 /* reserve the whole chunk starting from vm_pgoff */ 898 /* reserve the whole chunk starting from vm_pgoff */
729 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; 899 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
730 return reserve_pfn_range(paddr, vma_size, prot, 0); 900 return reserve_pfn_range(paddr, vma_size, prot, 0);
731 } 901 }
732 902
903 if (!pat_enabled)
904 return 0;
905
906 /* for vm_insert_pfn and friends, we set prot based on lookup */
907 flags = lookup_memtype(pfn << PAGE_SHIFT);
908 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
909 flags);
910
733 return 0; 911 return 0;
734} 912}
735 913
@@ -744,14 +922,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
744 resource_size_t paddr; 922 resource_size_t paddr;
745 unsigned long vma_size = vma->vm_end - vma->vm_start; 923 unsigned long vma_size = vma->vm_end - vma->vm_start;
746 924
747 if (!pat_enabled)
748 return;
749
750 /*
751 * For now, only handle remap_pfn_range() vmas where
752 * is_linear_pfn_mapping() == TRUE. Handling of
753 * vm_insert_pfn() is TBD.
754 */
755 if (is_linear_pfn_mapping(vma)) { 925 if (is_linear_pfn_mapping(vma)) {
756 /* free the whole chunk starting from vm_pgoff */ 926 /* free the whole chunk starting from vm_pgoff */
757 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; 927 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
new file mode 100644
index 000000000000..513d8ed5d2ec
--- /dev/null
+++ b/arch/x86/mm/setup_nx.c
@@ -0,0 +1,69 @@
1#include <linux/spinlock.h>
2#include <linux/errno.h>
3#include <linux/init.h>
4
5#include <asm/pgtable.h>
6
7int nx_enabled;
8
9#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
10static int disable_nx __cpuinitdata;
11
12/*
13 * noexec = on|off
14 *
15 * Control non-executable mappings for processes.
16 *
17 * on Enable
18 * off Disable
19 */
20static int __init noexec_setup(char *str)
21{
22 if (!str)
23 return -EINVAL;
24 if (!strncmp(str, "on", 2)) {
25 __supported_pte_mask |= _PAGE_NX;
26 disable_nx = 0;
27 } else if (!strncmp(str, "off", 3)) {
28 disable_nx = 1;
29 __supported_pte_mask &= ~_PAGE_NX;
30 }
31 return 0;
32}
33early_param("noexec", noexec_setup);
34#endif
35
36#ifdef CONFIG_X86_PAE
37void __init set_nx(void)
38{
39 unsigned int v[4], l, h;
40
41 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
42 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
43
44 if ((v[3] & (1 << 20)) && !disable_nx) {
45 rdmsr(MSR_EFER, l, h);
46 l |= EFER_NX;
47 wrmsr(MSR_EFER, l, h);
48 nx_enabled = 1;
49 __supported_pte_mask |= _PAGE_NX;
50 }
51 }
52}
53#else
54void set_nx(void)
55{
56}
57#endif
58
59#ifdef CONFIG_X86_64
60void __cpuinit check_efer(void)
61{
62 unsigned long efer;
63
64 rdmsrl(MSR_EFER, efer);
65 if (!(efer & EFER_NX) || disable_nx)
66 __supported_pte_mask &= ~_PAGE_NX;
67}
68#endif
69
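
Editor's note: noexec_setup() in the new setup_nx.c follows the usual early_param() shape -- a NULL check, then strncmp against the accepted keywords. A userspace model of the same parser (the flag below is a stand-in for the kernel's disable_nx/__supported_pte_mask state, not the real thing):

#include <stdio.h>
#include <string.h>

static int nx_disabled;

/* Accept "on"/"off" exactly as noexec_setup() does; anything else is
 * silently ignored, and a NULL argument is an error. */
static int parse_noexec(const char *str)
{
        if (!str)
                return -1;
        if (!strncmp(str, "on", 2))
                nx_disabled = 0;
        else if (!strncmp(str, "off", 3))
                nx_disabled = 1;
        return 0;
}

int main(void)
{
        parse_noexec("off");
        printf("nx_disabled = %d\n", nx_disabled);   /* 1 */
        parse_noexec("on");
        printf("nx_disabled = %d\n", nx_disabled);   /* 0 */
        return 0;
}
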
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df5..8565d944f7cf 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
1/* 1/*
2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> 2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
3 */ 3 */
4
5#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
6
4#include <linux/module.h> 7#include <linux/module.h>
5#include <linux/io.h> 8#include <linux/io.h>
6#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
7 10
8#define MODULE_NAME "testmmiotrace"
9
10static unsigned long mmio_address; 11static unsigned long mmio_address;
11module_param(mmio_address, ulong, 0); 12module_param(mmio_address, ulong, 0);
12MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " 13MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
30static void do_write_test(void __iomem *p) 31static void do_write_test(void __iomem *p)
31{ 32{
32 unsigned int i; 33 unsigned int i;
33 pr_info(MODULE_NAME ": write test.\n"); 34 pr_info("write test.\n");
34 mmiotrace_printk("Write test.\n"); 35 mmiotrace_printk("Write test.\n");
35 36
36 for (i = 0; i < 256; i++) 37 for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
47{ 48{
48 unsigned int i; 49 unsigned int i;
49 unsigned errs[3] = { 0 }; 50 unsigned errs[3] = { 0 };
50 pr_info(MODULE_NAME ": read test.\n"); 51 pr_info("read test.\n");
51 mmiotrace_printk("Read test.\n"); 52 mmiotrace_printk("Read test.\n");
52 53
53 for (i = 0; i < 256; i++) 54 for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
68 69
69static void do_read_far_test(void __iomem *p) 70static void do_read_far_test(void __iomem *p)
70{ 71{
71 pr_info(MODULE_NAME ": read far test.\n"); 72 pr_info("read far test.\n");
72 mmiotrace_printk("Read far test.\n"); 73 mmiotrace_printk("Read far test.\n");
73 74
74 ioread32(p + read_far); 75 ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
78{ 79{
79 void __iomem *p = ioremap_nocache(mmio_address, size); 80 void __iomem *p = ioremap_nocache(mmio_address, size);
80 if (!p) { 81 if (!p) {
81 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 82 pr_err("could not ioremap, aborting.\n");
82 return; 83 return;
83 } 84 }
84 mmiotrace_printk("ioremap returned %p.\n", p); 85 mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
94 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
95 96
96 if (mmio_address == 0) { 97 if (mmio_address == 0) {
97 pr_err(MODULE_NAME ": you have to use the module argument " 98 pr_err("you have to use the module argument mmio_address.\n");
98 "mmio_address.\n"); 99 pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
99 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
100 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
101 return -ENXIO; 100 return -ENXIO;
102 } 101 }
103 102
104 pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " 103 pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
105 "address space, and writing 16 kB of rubbish in there.\n", 104 "and writing 16 kB of rubbish in there.\n",
106 size >> 10, mmio_address); 105 size >> 10, mmio_address);
107 do_test(size); 106 do_test(size);
108 pr_info(MODULE_NAME ": All done.\n"); 107 pr_info("All done.\n");
109 return 0; 108 return 0;
110} 109}
111 110
112static void __exit cleanup(void) 111static void __exit cleanup(void)
113{ 112{
114 pr_debug(MODULE_NAME ": unloaded.\n"); 113 pr_debug("unloaded.\n");
115} 114}
116 115
117module_init(init); 116module_init(init);
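
Editor's note: the testmmiotrace cleanup relies on pr_fmt(): defining it before the includes makes every pr_*() call prepend the module name, so the hand-rolled MODULE_NAME string pasting can go. A minimal userspace imitation of that trick, using plain printf rather than the kernel's printk machinery:

#include <stdio.h>

/* Must be defined before the pr_* helpers below, like pr_fmt in the patch. */
#define pr_fmt(fmt) "testmmiotrace: " fmt

#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        pr_info("write test.\n");                /* prints "testmmiotrace: write test." */
        pr_info("All done after %d runs.\n", 1);
        return 0;
}

Because pr_fmt() is applied at every call site by the macro expansion, the prefix stays consistent even if the module is later renamed: only the one define changes.
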
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c814e144a3f0..36fe08eeb5c3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -59,7 +59,8 @@ void leave_mm(int cpu)
59{ 59{
60 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 60 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
61 BUG(); 61 BUG();
62 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); 62 cpumask_clear_cpu(cpu,
63 mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
63 load_cr3(swapper_pg_dir); 64 load_cr3(swapper_pg_dir);
64} 65}
65EXPORT_SYMBOL_GPL(leave_mm); 66EXPORT_SYMBOL_GPL(leave_mm);
@@ -234,8 +235,8 @@ void flush_tlb_current_task(void)
234 preempt_disable(); 235 preempt_disable();
235 236
236 local_flush_tlb(); 237 local_flush_tlb();
237 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) 238 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
238 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); 239 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
239 preempt_enable(); 240 preempt_enable();
240} 241}
241 242
@@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm)
249 else 250 else
250 leave_mm(smp_processor_id()); 251 leave_mm(smp_processor_id());
251 } 252 }
252 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) 253 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
253 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); 254 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
254 255
255 preempt_enable(); 256 preempt_enable();
256} 257}
@@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
268 leave_mm(smp_processor_id()); 269 leave_mm(smp_processor_id());
269 } 270 }
270 271
271 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) 272 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
272 flush_tlb_others(&mm->cpu_vm_mask, mm, va); 273 flush_tlb_others(mm_cpumask(mm), mm, va);
273 274
274 preempt_enable(); 275 preempt_enable();
275} 276}
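
Editor's note: the tlb.c hunks above (and the xen/mmu.c hunk further down) swap direct &mm->cpu_vm_mask references for the mm_cpumask() accessor, so callers no longer depend on how the mask is embedded in mm_struct. A toy model of why such an accessor helps -- the struct layout and names here are invented for illustration:

#include <stdio.h>

struct toy_mask { unsigned long bits; };

struct toy_mm {
        int users;
        struct toy_mask cpu_mask;       /* could later move or become trailing storage */
};

/* All callers go through the accessor, so only this one line changes if the
 * field is relocated -- the point of the mm_cpumask() conversion. */
static inline struct toy_mask *toy_mm_cpumask(struct toy_mm *mm)
{
        return &mm->cpu_mask;
}

static void mask_set_cpu(int cpu, struct toy_mask *m)  { m->bits |= 1UL << cpu; }
static int  mask_test_cpu(int cpu, struct toy_mask *m) { return !!(m->bits & (1UL << cpu)); }

int main(void)
{
        struct toy_mm mm = { .users = 1 };

        mask_set_cpu(3, toy_mm_cpumask(&mm));
        printf("cpu3 set: %d\n", mask_test_cpu(3, toy_mm_cpumask(&mm)));
        return 0;
}
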
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 4899215999de..8eb05878554c 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void)
234 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && 234 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
235 current_cpu_data.x86_model == 15) { 235 current_cpu_data.x86_model == 15) {
236 eax.split.version_id = 2; 236 eax.split.version_id = 2;
237 eax.split.num_counters = 2; 237 eax.split.num_events = 2;
238 eax.split.bit_width = 40; 238 eax.split.bit_width = 40;
239 } 239 }
240 240
241 num_counters = eax.split.num_counters; 241 num_counters = eax.split.num_events;
242 242
243 op_arch_perfmon_spec.num_counters = num_counters; 243 op_arch_perfmon_spec.num_counters = num_counters;
244 op_arch_perfmon_spec.num_controls = num_counters; 244 op_arch_perfmon_spec.num_controls = num_counters;
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index b83776180c7f..7b8e75d16081 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -13,7 +13,7 @@
13#define OP_X86_MODEL_H 13#define OP_X86_MODEL_H
14 14
15#include <asm/types.h> 15#include <asm/types.h>
16#include <asm/perf_counter.h> 16#include <asm/perf_event.h>
17 17
18struct op_msr { 18struct op_msr {
19 unsigned long addr; 19 unsigned long addr;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 3ffa10df20b9..572ee9782f2a 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -15,63 +15,6 @@
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
16 */ 16 */
17 17
18#ifdef CONFIG_NUMA
19
20#define BUS_NR 256
21
22#ifdef CONFIG_X86_64
23
24static int mp_bus_to_node[BUS_NR];
25
26void set_mp_bus_to_node(int busnum, int node)
27{
28 if (busnum >= 0 && busnum < BUS_NR)
29 mp_bus_to_node[busnum] = node;
30}
31
32int get_mp_bus_to_node(int busnum)
33{
34 int node = -1;
35
36 if (busnum < 0 || busnum > (BUS_NR - 1))
37 return node;
38
39 node = mp_bus_to_node[busnum];
40
41 /*
42 * let numa_node_id to decide it later in dma_alloc_pages
43 * if there is no ram on that node
44 */
45 if (node != -1 && !node_online(node))
46 node = -1;
47
48 return node;
49}
50
51#else /* CONFIG_X86_32 */
52
53static unsigned char mp_bus_to_node[BUS_NR];
54
55void set_mp_bus_to_node(int busnum, int node)
56{
57 if (busnum >= 0 && busnum < BUS_NR)
58 mp_bus_to_node[busnum] = (unsigned char) node;
59}
60
61int get_mp_bus_to_node(int busnum)
62{
63 int node;
64
65 if (busnum < 0 || busnum > (BUS_NR - 1))
66 return 0;
67 node = mp_bus_to_node[busnum];
68 return node;
69}
70
71#endif /* CONFIG_X86_32 */
72
73#endif /* CONFIG_NUMA */
74
75#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
76 19
77/* 20/*
@@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void)
301 u64 val; 244 u64 val;
302 u32 address; 245 u32 address;
303 246
304#ifdef CONFIG_NUMA
305 for (i = 0; i < BUS_NR; i++)
306 mp_bus_to_node[i] = -1;
307#endif
308
309 if (!early_pci_allowed()) 247 if (!early_pci_allowed())
310 return -1; 248 return -1;
311 249
@@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void)
346 node = (reg >> 4) & 0x07; 284 node = (reg >> 4) & 0x07;
347#ifdef CONFIG_NUMA 285#ifdef CONFIG_NUMA
348 for (j = min_bus; j <= max_bus; j++) 286 for (j = min_bus; j <= max_bus; j++)
349 mp_bus_to_node[j] = (unsigned char) node; 287 set_mp_bus_to_node(j, node);
350#endif 288#endif
351 link = (reg >> 8) & 0x03; 289 link = (reg >> 8) & 0x03;
352 290
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2202b6257b82..1331fcf26143 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
600{ 600{
601 return pci_scan_bus_on_node(busno, &pci_root_ops, -1); 601 return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
602} 602}
603
604/*
605 * NUMA info for PCI busses
606 *
607 * Early arch code is responsible for filling in reasonable values here.
608 * A node id of "-1" means "use current node". In other words, if a bus
609 * has a -1 node id, it's not tightly coupled to any particular chunk
610 * of memory (as is the case on some Nehalem systems).
611 */
612#ifdef CONFIG_NUMA
613
614#define BUS_NR 256
615
616#ifdef CONFIG_X86_64
617
618static int mp_bus_to_node[BUS_NR] = {
619 [0 ... BUS_NR - 1] = -1
620};
621
622void set_mp_bus_to_node(int busnum, int node)
623{
624 if (busnum >= 0 && busnum < BUS_NR)
625 mp_bus_to_node[busnum] = node;
626}
627
628int get_mp_bus_to_node(int busnum)
629{
630 int node = -1;
631
632 if (busnum < 0 || busnum > (BUS_NR - 1))
633 return node;
634
635 node = mp_bus_to_node[busnum];
636
637 /*
638 * let numa_node_id to decide it later in dma_alloc_pages
639 * if there is no ram on that node
640 */
641 if (node != -1 && !node_online(node))
642 node = -1;
643
644 return node;
645}
646
647#else /* CONFIG_X86_32 */
648
649static int mp_bus_to_node[BUS_NR] = {
650 [0 ... BUS_NR - 1] = -1
651};
652
653void set_mp_bus_to_node(int busnum, int node)
654{
655 if (busnum >= 0 && busnum < BUS_NR)
656 mp_bus_to_node[busnum] = (unsigned char) node;
657}
658
659int get_mp_bus_to_node(int busnum)
660{
661 int node;
662
663 if (busnum < 0 || busnum > (BUS_NR - 1))
664 return 0;
665 node = mp_bus_to_node[busnum];
666 return node;
667}
668
669#endif /* CONFIG_X86_32 */
670
671#endif /* CONFIG_NUMA */
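
Editor's note: the bus-to-node table that moved from amd_bus.c into common.c is a bounded array in which -1 means "no affinity; let the allocator pick a node". A standalone sketch of the same lookup contract -- the helpers below mirror the patch's set_mp_bus_to_node()/get_mp_bus_to_node(), but this is a model, not the kernel code:

#include <stdio.h>

#define BUS_NR 256

static int bus_to_node[BUS_NR] = { [0 ... BUS_NR - 1] = -1 };

static void set_bus_to_node(int busnum, int node)
{
        if (busnum >= 0 && busnum < BUS_NR)
                bus_to_node[busnum] = node;
}

/* -1 means "use the current node", as the comment in common.c explains. */
static int get_bus_to_node(int busnum)
{
        if (busnum < 0 || busnum >= BUS_NR)
                return -1;
        return bus_to_node[busnum];
}

int main(void)
{
        set_bus_to_node(5, 1);
        printf("bus 5 -> node %d\n", get_bus_to_node(5));    /* 1 */
        printf("bus 9 -> node %d\n", get_bus_to_node(9));    /* -1: no affinity */
        return 0;
}
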
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 52e62e57fedd..b22d13b0c71d 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -266,7 +266,7 @@ void pcibios_set_master(struct pci_dev *dev)
266 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 266 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
267} 267}
268 268
269static struct vm_operations_struct pci_mmap_ops = { 269static const struct vm_operations_struct pci_mmap_ops = {
270 .access = generic_access_phys, 270 .access = generic_access_phys,
271}; 271};
272 272
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 712443ec6d43..602c172d3bd5 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -13,10 +13,14 @@
13#include <linux/pci.h> 13#include <linux/pci.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/sfi_acpi.h>
16#include <linux/bitmap.h> 17#include <linux/bitmap.h>
17#include <linux/sort.h> 18#include <linux/sort.h>
18#include <asm/e820.h> 19#include <asm/e820.h>
19#include <asm/pci_x86.h> 20#include <asm/pci_x86.h>
21#include <asm/acpi.h>
22
23#define PREFIX "PCI: "
20 24
21/* aperture is up to 256MB but BIOS may reserve less */ 25/* aperture is up to 256MB but BIOS may reserve less */
22#define MMCONFIG_APER_MIN (2 * 1024*1024) 26#define MMCONFIG_APER_MIN (2 * 1024*1024)
@@ -491,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early)
491 (unsigned int)cfg->start_bus_number, 495 (unsigned int)cfg->start_bus_number,
492 (unsigned int)cfg->end_bus_number); 496 (unsigned int)cfg->end_bus_number);
493 497
494 if (!early) 498 if (!early && !acpi_disabled)
495 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); 499 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
496 500
497 if (valid) 501 if (valid)
@@ -606,7 +610,7 @@ static void __init __pci_mmcfg_init(int early)
606 } 610 }
607 611
608 if (!known_bridge) 612 if (!known_bridge)
609 acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); 613 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
610 614
611 pci_mmcfg_reject_broken(early); 615 pci_mmcfg_reject_broken(early);
612 616
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 8b2d561046a3..f10a7e94a84c 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -11,9 +11,9 @@
11 11
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/acpi.h>
15#include <asm/e820.h> 14#include <asm/e820.h>
16#include <asm/pci_x86.h> 15#include <asm/pci_x86.h>
16#include <acpi/acpi.h>
17 17
18/* Assume systems with more busses have correct MCFG */ 18/* Assume systems with more busses have correct MCFG */
19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) 19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 9e63db8cdee4..e09a44fc4664 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -224,11 +224,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
224 fix_processor_context(); 224 fix_processor_context();
225 225
226 do_fpu_end(); 226 do_fpu_end();
227 mtrr_ap_init(); 227 mtrr_bp_restore();
228
229#ifdef CONFIG_X86_OLD_MCE
230 mcheck_init(&boot_cpu_data);
231#endif
232} 228}
233 229
234/* Needed by apm.c */ 230/* Needed by apm.c */
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 88112b49f02c..6b4ffedb93c9 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@
122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ 122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) 123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
124 124
125VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) 125VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
126GCOV_PROFILE := n 126GCOV_PROFILE := n
127 127
128# 128#
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6a40b78b46aa..ee55754cc3c5 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts)
86 return 0; 86 return 0;
87} 87}
88 88
89notrace static noinline int do_realtime_coarse(struct timespec *ts)
90{
91 unsigned long seq;
92 do {
93 seq = read_seqbegin(&gtod->lock);
94 ts->tv_sec = gtod->wall_time_coarse.tv_sec;
95 ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
96 } while (unlikely(read_seqretry(&gtod->lock, seq)));
97 return 0;
98}
99
100notrace static noinline int do_monotonic_coarse(struct timespec *ts)
101{
102 unsigned long seq, ns, secs;
103 do {
104 seq = read_seqbegin(&gtod->lock);
105 secs = gtod->wall_time_coarse.tv_sec;
106 ns = gtod->wall_time_coarse.tv_nsec;
107 secs += gtod->wall_to_monotonic.tv_sec;
108 ns += gtod->wall_to_monotonic.tv_nsec;
109 } while (unlikely(read_seqretry(&gtod->lock, seq)));
110 vset_normalized_timespec(ts, secs, ns);
111 return 0;
112}
113
89notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 114notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
90{ 115{
91 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) 116 if (likely(gtod->sysctl_enabled))
92 switch (clock) { 117 switch (clock) {
93 case CLOCK_REALTIME: 118 case CLOCK_REALTIME:
94 return do_realtime(ts); 119 if (likely(gtod->clock.vread))
120 return do_realtime(ts);
121 break;
95 case CLOCK_MONOTONIC: 122 case CLOCK_MONOTONIC:
96 return do_monotonic(ts); 123 if (likely(gtod->clock.vread))
124 return do_monotonic(ts);
125 break;
126 case CLOCK_REALTIME_COARSE:
127 return do_realtime_coarse(ts);
128 case CLOCK_MONOTONIC_COARSE:
129 return do_monotonic_coarse(ts);
97 } 130 }
98 return vdso_fallback_gettime(clock, ts); 131 return vdso_fallback_gettime(clock, ts);
99} 132}
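
Editor's note: from userspace, the new vDSO paths above are reached through an ordinary clock_gettime() call with the *_COARSE clock ids, trading tick-granularity results for never touching the clocksource. A quick comparison program, assuming a Linux toolchain new enough to expose the coarse clock ids (older glibc may also need -lrt):

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec fine, coarse;

        clock_gettime(CLOCK_MONOTONIC, &fine);
        clock_gettime(CLOCK_MONOTONIC_COARSE, &coarse);

        /* The coarse value only advances once per tick, so it can lag
         * the fine value by up to one jiffy. */
        printf("fine:   %ld.%09ld\n", (long)fine.tv_sec, fine.tv_nsec);
        printf("coarse: %ld.%09ld\n", (long)coarse.tv_sec, coarse.tv_nsec);
        return 0;
}
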
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index b53225d2cac3..e133ce25e290 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -100,7 +100,7 @@ static int xen_array_release(struct inode *inode, struct file *file)
100 return 0; 100 return 0;
101} 101}
102 102
103static struct file_operations u32_array_fops = { 103static const struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE, 104 .owner = THIS_MODULE,
105 .open = u32_array_open, 105 .open = u32_array_open,
106 .release= xen_array_release, 106 .release= xen_array_release,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0dd0c2c6cae0..3439616d69f1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,19 +912,9 @@ static const struct pv_info xen_info __initdata = {
912 912
913static const struct pv_init_ops xen_init_ops __initdata = { 913static const struct pv_init_ops xen_init_ops __initdata = {
914 .patch = xen_patch, 914 .patch = xen_patch,
915
916 .banner = xen_banner,
917 .memory_setup = xen_memory_setup,
918 .arch_setup = xen_arch_setup,
919 .post_allocator_init = xen_post_allocator_init,
920}; 915};
921 916
922static const struct pv_time_ops xen_time_ops __initdata = { 917static const struct pv_time_ops xen_time_ops __initdata = {
923 .time_init = xen_time_init,
924
925 .set_wallclock = xen_set_wallclock,
926 .get_wallclock = xen_get_wallclock,
927 .get_tsc_khz = xen_tsc_khz,
928 .sched_clock = xen_sched_clock, 918 .sched_clock = xen_sched_clock,
929}; 919};
930 920
@@ -990,8 +980,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
990 980
991static const struct pv_apic_ops xen_apic_ops __initdata = { 981static const struct pv_apic_ops xen_apic_ops __initdata = {
992#ifdef CONFIG_X86_LOCAL_APIC 982#ifdef CONFIG_X86_LOCAL_APIC
993 .setup_boot_clock = paravirt_nop,
994 .setup_secondary_clock = paravirt_nop,
995 .startup_ipi_hook = paravirt_nop, 983 .startup_ipi_hook = paravirt_nop,
996#endif 984#endif
997}; 985};
@@ -1070,7 +1058,18 @@ asmlinkage void __init xen_start_kernel(void)
1070 pv_time_ops = xen_time_ops; 1058 pv_time_ops = xen_time_ops;
1071 pv_cpu_ops = xen_cpu_ops; 1059 pv_cpu_ops = xen_cpu_ops;
1072 pv_apic_ops = xen_apic_ops; 1060 pv_apic_ops = xen_apic_ops;
1073 pv_mmu_ops = xen_mmu_ops; 1061
1062 x86_init.resources.memory_setup = xen_memory_setup;
1063 x86_init.oem.arch_setup = xen_arch_setup;
1064 x86_init.oem.banner = xen_banner;
1065
1066 x86_init.timers.timer_init = xen_time_init;
1067 x86_init.timers.setup_percpu_clockev = x86_init_noop;
1068 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
1069
1070 x86_platform.calibrate_tsc = xen_tsc_khz;
1071 x86_platform.get_wallclock = xen_get_wallclock;
1072 x86_platform.set_wallclock = xen_set_wallclock;
1074 1073
1075 /* 1074 /*
1076 * Set up some pagetable state before starting to set any ptes. 1075 * Set up some pagetable state before starting to set any ptes.
@@ -1083,6 +1082,11 @@ asmlinkage void __init xen_start_kernel(void)
1083 1082
1084 __supported_pte_mask |= _PAGE_IOMAP; 1083 __supported_pte_mask |= _PAGE_IOMAP;
1085 1084
1085#ifdef CONFIG_X86_64
1086 /* Work out if we support NX */
1087 check_efer();
1088#endif
1089
1086 xen_setup_features(); 1090 xen_setup_features();
1087 1091
1088 /* Get mfn list */ 1092 /* Get mfn list */
@@ -1095,6 +1099,7 @@ asmlinkage void __init xen_start_kernel(void)
1095 */ 1099 */
1096 xen_setup_stackprotector(); 1100 xen_setup_stackprotector();
1097 1101
1102 xen_init_mmu_ops();
1098 xen_init_irq_ops(); 1103 xen_init_irq_ops();
1099 xen_init_cpuid_mask(); 1104 xen_init_cpuid_mask();
1100 1105
@@ -1123,11 +1128,6 @@ asmlinkage void __init xen_start_kernel(void)
1123 1128
1124 pgd = (pgd_t *)xen_start_info->pt_base; 1129 pgd = (pgd_t *)xen_start_info->pt_base;
1125 1130
1126#ifdef CONFIG_X86_64
1127 /* Work out if we support NX */
1128 check_efer();
1129#endif
1130
1131 /* Don't do the full vcpu_info placement stuff until we have a 1131 /* Don't do the full vcpu_info placement stuff until we have a
1132 possible map and a non-dummy shared_info. */ 1132 possible map and a non-dummy shared_info. */
1133 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1133 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index cfd17799bd6d..9d30105a0c4a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -1,5 +1,7 @@
1#include <linux/hardirq.h> 1#include <linux/hardirq.h>
2 2
3#include <asm/x86_init.h>
4
3#include <xen/interface/xen.h> 5#include <xen/interface/xen.h>
4#include <xen/interface/sched.h> 6#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h> 7#include <xen/interface/vcpu.h>
@@ -112,8 +114,6 @@ static void xen_halt(void)
112} 114}
113 115
114static const struct pv_irq_ops xen_irq_ops __initdata = { 116static const struct pv_irq_ops xen_irq_ops __initdata = {
115 .init_IRQ = xen_init_IRQ,
116
117 .save_fl = PV_CALLEE_SAVE(xen_save_fl), 117 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), 118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), 119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
@@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
129void __init xen_init_irq_ops() 129void __init xen_init_irq_ops()
130{ 130{
131 pv_irq_ops = xen_irq_ops; 131 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ;
132} 133}
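
Editor's note: the enlighten.c and irq.c hunks replace per-pv_ops members with assignments into the new x86_init/x86_platform hook tables -- a struct of default function pointers that each platform overrides piecemeal during early boot. A toy model of that pattern, with names invented purely for illustration:

#include <stdio.h>

static void default_timer_init(void) { puts("default timer init"); }
static void noop(void)               { }

struct platform_hooks {
        void (*timer_init)(void);
        void (*setup_percpu_clockev)(void);
};

/* Defaults that bare metal keeps; a guest rewires them before boot continues,
 * much like xen_start_kernel() does with x86_init.timers.* in this patch. */
static struct platform_hooks hooks = {
        .timer_init           = default_timer_init,
        .setup_percpu_clockev = default_timer_init,
};

static void guest_timer_init(void) { puts("guest timer init"); }

int main(void)
{
        /* "Guest" override step. */
        hooks.timer_init = guest_timer_init;
        hooks.setup_percpu_clockev = noop;

        hooks.timer_init();
        hooks.setup_percpu_clockev();
        return 0;
}
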
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 4ceb28581652..3bf7b1d250ce 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1165 /* Get the "official" set of cpus referring to our pagetable. */ 1165 /* Get the "official" set of cpus referring to our pagetable. */
1166 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1166 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1167 for_each_online_cpu(cpu) { 1167 for_each_online_cpu(cpu) {
1168 if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) 1168 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1169 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1169 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1170 continue; 1170 continue;
1171 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 1171 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1172 } 1172 }
1173 return; 1173 return;
1174 } 1174 }
1175 cpumask_copy(mask, &mm->cpu_vm_mask); 1175 cpumask_copy(mask, mm_cpumask(mm));
1176 1176
1177 /* It's possible that a vcpu may have a stale reference to our 1177 /* It's possible that a vcpu may have a stale reference to our
1178 cr3, because its in lazy mode, and it hasn't yet flushed 1178 cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
1229{ 1229{
1230} 1230}
1231 1231
1232static void xen_post_allocator_init(void);
1233
1232static __init void xen_pagetable_setup_done(pgd_t *base) 1234static __init void xen_pagetable_setup_done(pgd_t *base)
1233{ 1235{
1234 xen_setup_shared_info(); 1236 xen_setup_shared_info();
1237 xen_post_allocator_init();
1235} 1238}
1236 1239
1237static void xen_write_cr2(unsigned long cr2) 1240static void xen_write_cr2(unsigned long cr2)
@@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1841#endif 1844#endif
1842} 1845}
1843 1846
1844__init void xen_post_allocator_init(void) 1847static __init void xen_post_allocator_init(void)
1845{ 1848{
1846 pv_mmu_ops.set_pte = xen_set_pte; 1849 pv_mmu_ops.set_pte = xen_set_pte;
1847 pv_mmu_ops.set_pmd = xen_set_pmd; 1850 pv_mmu_ops.set_pmd = xen_set_pmd;
@@ -1875,10 +1878,7 @@ static void xen_leave_lazy_mmu(void)
1875 preempt_enable(); 1878 preempt_enable();
1876} 1879}
1877 1880
1878const struct pv_mmu_ops xen_mmu_ops __initdata = { 1881static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1879 .pagetable_setup_start = xen_pagetable_setup_start,
1880 .pagetable_setup_done = xen_pagetable_setup_done,
1881
1882 .read_cr2 = xen_read_cr2, 1882 .read_cr2 = xen_read_cr2,
1883 .write_cr2 = xen_write_cr2, 1883 .write_cr2 = xen_write_cr2,
1884 1884
@@ -1954,6 +1954,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1954 .set_fixmap = xen_set_fixmap, 1954 .set_fixmap = xen_set_fixmap,
1955}; 1955};
1956 1956
1957void __init xen_init_mmu_ops(void)
1958{
1959 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
1960 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
1961 pv_mmu_ops = xen_mmu_ops;
1962}
1957 1963
1958#ifdef CONFIG_XEN_DEBUG_FS 1964#ifdef CONFIG_XEN_DEBUG_FS
1959 1965
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index da7302624897..5fe6bc7f5ecf 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
59 59
60unsigned long xen_read_cr2_direct(void); 60unsigned long xen_read_cr2_direct(void);
61 61
62extern const struct pv_mmu_ops xen_mmu_ops; 62extern void xen_init_mmu_ops(void);
63#endif /* _XEN_MMU_H */ 63#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 22494fd4c9b5..355fa6b99c9c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 30void xen_ident_map_ISA(void);
31void xen_reserve_top(void); 31void xen_reserve_top(void);
32 32
33void xen_post_allocator_init(void);
34
35char * __init xen_memory_setup(void); 33char * __init xen_memory_setup(void);
36void __init xen_arch_setup(void); 34void __init xen_arch_setup(void);
37void __init xen_init_IRQ(void); 35void __init xen_init_IRQ(void);