Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 58
-rw-r--r--  arch/x86/Kconfig.cpu | 24
-rw-r--r--  arch/x86/Kconfig.debug | 11
-rw-r--r--  arch/x86/Makefile | 5
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 4
-rw-r--r--  arch/x86/include/asm/alternative.h | 20
-rw-r--r--  arch/x86/include/asm/amd_iommu_types.h | 34
-rw-r--r--  arch/x86/include/asm/apic.h | 13
-rw-r--r--  arch/x86/include/asm/arch_hweight.h | 61
-rw-r--r--  arch/x86/include/asm/atomic.h | 25
-rw-r--r--  arch/x86/include/asm/atomic64_32.h | 278
-rw-r--r--  arch/x86/include/asm/atomic64_64.h | 25
-rw-r--r--  arch/x86/include/asm/bitops.h | 4
-rw-r--r--  arch/x86/include/asm/boot.h | 2
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 3
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 58
-rw-r--r--  arch/x86/include/asm/ds.h | 302
-rw-r--r--  arch/x86/include/asm/dwarf2.h | 12
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/hw_breakpoint.h | 10
-rw-r--r--  arch/x86/include/asm/hyperv.h | 11
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 27
-rw-r--r--  arch/x86/include/asm/i387.h | 129
-rw-r--r--  arch/x86/include/asm/i8253.h | 2
-rw-r--r--  arch/x86/include/asm/insn.h | 2
-rw-r--r--  arch/x86/include/asm/io_apic.h | 13
-rw-r--r--  arch/x86/include/asm/k8.h | 5
-rw-r--r--  arch/x86/include/asm/kprobes.h | 2
-rw-r--r--  arch/x86/include/asm/mpspec.h | 10
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 14
-rw-r--r--  arch/x86/include/asm/msr-index.h | 15
-rw-r--r--  arch/x86/include/asm/percpu.h | 24
-rw-r--r--  arch/x86/include/asm/perf_event.h | 76
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 794
-rw-r--r--  arch/x86/include/asm/processor.h | 45
-rw-r--r--  arch/x86/include/asm/ptrace-abi.h | 57
-rw-r--r--  arch/x86/include/asm/ptrace.h | 6
-rw-r--r--  arch/x86/include/asm/thread_info.h | 9
-rw-r--r--  arch/x86/include/asm/traps.h | 2
-rw-r--r--  arch/x86/include/asm/vmware.h | 27
-rw-r--r--  arch/x86/include/asm/xsave.h | 7
-rw-r--r--  arch/x86/kernel/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 133
-rw-r--r--  arch/x86/kernel/alternative.c | 47
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 197
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 6
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 19
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 99
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 14
-rw-r--r--  arch/x86/kernel/cpu/common.c | 5
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 44
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.c | 51
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.h | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 169
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 2
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 52
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 8
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 181
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 55
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 815
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 46
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 357
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 641
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 218
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 857
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 31
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 38
-rw-r--r--  arch/x86/kernel/ds.c | 1437
-rw-r--r--  arch/x86/kernel/ds_selftest.c | 408
-rw-r--r--  arch/x86/kernel/ds_selftest.h | 15
-rw-r--r--  arch/x86/kernel/dumpstack.c | 5
-rw-r--r--  arch/x86/kernel/entry_32.S | 19
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 41
-rw-r--r--  arch/x86/kernel/i387.c | 107
-rw-r--r--  arch/x86/kernel/i8253.c | 14
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/kprobes.c | 43
-rw-r--r--  arch/x86/kernel/microcode_core.c | 4
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 22
-rw-r--r--  arch/x86/kernel/mpparse.c | 25
-rw-r--r--  arch/x86/kernel/process.c | 50
-rw-r--r--  arch/x86/kernel/process_32.c | 10
-rw-r--r--  arch/x86/kernel/process_64.c | 10
-rw-r--r--  arch/x86/kernel/ptrace.c | 384
-rw-r--r--  arch/x86/kernel/sfi.c | 4
-rw-r--r--  arch/x86/kernel/step.c | 46
-rw-r--r--  arch/x86/kernel/traps.c | 177
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 1
-rw-r--r--  arch/x86/kernel/xsave.c | 8
-rw-r--r--  arch/x86/kvm/svm.c | 8
-rw-r--r--  arch/x86/kvm/vmx.c | 8
-rw-r--r--  arch/x86/kvm/x86.c | 54
-rw-r--r--  arch/x86/kvm/x86.h | 3
-rw-r--r--  arch/x86/lib/Makefile | 5
-rw-r--r--  arch/x86/lib/atomic64_32.c | 273
-rw-r--r--  arch/x86/lib/atomic64_386_32.S | 174
-rw-r--r--  arch/x86/lib/atomic64_cx8_32.S | 224
-rw-r--r--  arch/x86/math-emu/fpu_aux.c | 6
-rw-r--r--  arch/x86/math-emu/fpu_entry.c | 4
-rw-r--r--  arch/x86/math-emu/fpu_system.h | 2
-rw-r--r--  arch/x86/mm/srat_64.c | 51
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 199
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 280
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 52
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 81
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 4
-rw-r--r--  arch/x86/pci/mrst.c | 4
111 files changed, 5780 insertions, 4822 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685902bd..a2d3a5fbeeda 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,11 +53,15 @@ config X86
53 select HAVE_KERNEL_LZMA 53 select HAVE_KERNEL_LZMA
54 select HAVE_KERNEL_LZO 54 select HAVE_KERNEL_LZO
55 select HAVE_HW_BREAKPOINT 55 select HAVE_HW_BREAKPOINT
56 select HAVE_MIXED_BREAKPOINTS_REGS
56 select PERF_EVENTS 57 select PERF_EVENTS
57 select ANON_INODES 58 select ANON_INODES
58 select HAVE_ARCH_KMEMCHECK 59 select HAVE_ARCH_KMEMCHECK
59 select HAVE_USER_RETURN_NOTIFIER 60 select HAVE_USER_RETURN_NOTIFIER
60 61
62config INSTRUCTION_DECODER
63 def_bool (KPROBES || PERF_EVENTS)
64
61config OUTPUT_FORMAT 65config OUTPUT_FORMAT
62 string 66 string
63 default "elf32-i386" if X86_32 67 default "elf32-i386" if X86_32
@@ -197,20 +201,17 @@ config HAVE_INTEL_TXT
197 201
198# Use the generic interrupt handling code in kernel/irq/: 202# Use the generic interrupt handling code in kernel/irq/:
199config GENERIC_HARDIRQS 203config GENERIC_HARDIRQS
200 bool 204 def_bool y
201 default y
202 205
203config GENERIC_HARDIRQS_NO__DO_IRQ 206config GENERIC_HARDIRQS_NO__DO_IRQ
204 def_bool y 207 def_bool y
205 208
206config GENERIC_IRQ_PROBE 209config GENERIC_IRQ_PROBE
207 bool 210 def_bool y
208 default y
209 211
210config GENERIC_PENDING_IRQ 212config GENERIC_PENDING_IRQ
211 bool 213 def_bool y
212 depends on GENERIC_HARDIRQS && SMP 214 depends on GENERIC_HARDIRQS && SMP
213 default y
214 215
215config USE_GENERIC_SMP_HELPERS 216config USE_GENERIC_SMP_HELPERS
216 def_bool y 217 def_bool y
@@ -225,19 +226,22 @@ config X86_64_SMP
225 depends on X86_64 && SMP 226 depends on X86_64 && SMP
226 227
227config X86_HT 228config X86_HT
228 bool 229 def_bool y
229 depends on SMP 230 depends on SMP
230 default y
231 231
232config X86_TRAMPOLINE 232config X86_TRAMPOLINE
233 bool 233 def_bool y
234 depends on SMP || (64BIT && ACPI_SLEEP) 234 depends on SMP || (64BIT && ACPI_SLEEP)
235 default y
236 235
237config X86_32_LAZY_GS 236config X86_32_LAZY_GS
238 def_bool y 237 def_bool y
239 depends on X86_32 && !CC_STACKPROTECTOR 238 depends on X86_32 && !CC_STACKPROTECTOR
240 239
240config ARCH_HWEIGHT_CFLAGS
241 string
242 default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
243 default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
244
241config KTIME_SCALAR 245config KTIME_SCALAR
242 def_bool X86_32 246 def_bool X86_32
243source "init/Kconfig" 247source "init/Kconfig"
@@ -447,7 +451,7 @@ config X86_NUMAQ
447 firmware with - send email to <Martin.Bligh@us.ibm.com>. 451 firmware with - send email to <Martin.Bligh@us.ibm.com>.
448 452
449config X86_SUPPORTS_MEMORY_FAILURE 453config X86_SUPPORTS_MEMORY_FAILURE
450 bool 454 def_bool y
451 # MCE code calls memory_failure(): 455 # MCE code calls memory_failure():
452 depends on X86_MCE 456 depends on X86_MCE
453 # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags: 457 # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
@@ -455,7 +459,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
455 # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH: 459 # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
456 depends on X86_64 || !SPARSEMEM 460 depends on X86_64 || !SPARSEMEM
457 select ARCH_SUPPORTS_MEMORY_FAILURE 461 select ARCH_SUPPORTS_MEMORY_FAILURE
458 default y
459 462
460config X86_VISWS 463config X86_VISWS
461 bool "SGI 320/540 (Visual Workstation)" 464 bool "SGI 320/540 (Visual Workstation)"
@@ -570,7 +573,6 @@ config PARAVIRT_SPINLOCKS
570 573
571config PARAVIRT_CLOCK 574config PARAVIRT_CLOCK
572 bool 575 bool
573 default n
574 576
575endif 577endif
576 578
@@ -749,7 +751,6 @@ config MAXSMP
749 bool "Configure Maximum number of SMP Processors and NUMA Nodes" 751 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
750 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL 752 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
751 select CPUMASK_OFFSTACK 753 select CPUMASK_OFFSTACK
752 default n
753 ---help--- 754 ---help---
754 Configure maximum number of CPUS and NUMA Nodes for this architecture. 755 Configure maximum number of CPUS and NUMA Nodes for this architecture.
755 If unsure, say N. 756 If unsure, say N.
@@ -829,7 +830,6 @@ config X86_VISWS_APIC
829 830
830config X86_REROUTE_FOR_BROKEN_BOOT_IRQS 831config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
831 bool "Reroute for broken boot IRQs" 832 bool "Reroute for broken boot IRQs"
832 default n
833 depends on X86_IO_APIC 833 depends on X86_IO_APIC
834 ---help--- 834 ---help---
835 This option enables a workaround that fixes a source of 835 This option enables a workaround that fixes a source of
@@ -876,9 +876,8 @@ config X86_MCE_AMD
876 the DRAM Error Threshold. 876 the DRAM Error Threshold.
877 877
878config X86_ANCIENT_MCE 878config X86_ANCIENT_MCE
879 def_bool n 879 bool "Support for old Pentium 5 / WinChip machine checks"
880 depends on X86_32 && X86_MCE 880 depends on X86_32 && X86_MCE
881 prompt "Support for old Pentium 5 / WinChip machine checks"
882 ---help--- 881 ---help---
883 Include support for machine check handling on old Pentium 5 or WinChip 882 Include support for machine check handling on old Pentium 5 or WinChip
884 systems. These typically need to be enabled explicitely on the command 883 systems. These typically need to be enabled explicitely on the command
@@ -886,8 +885,7 @@ config X86_ANCIENT_MCE
886 885
887config X86_MCE_THRESHOLD 886config X86_MCE_THRESHOLD
888 depends on X86_MCE_AMD || X86_MCE_INTEL 887 depends on X86_MCE_AMD || X86_MCE_INTEL
889 bool 888 def_bool y
890 default y
891 889
892config X86_MCE_INJECT 890config X86_MCE_INJECT
893 depends on X86_MCE 891 depends on X86_MCE
@@ -1026,8 +1024,8 @@ config X86_CPUID
1026 1024
1027choice 1025choice
1028 prompt "High Memory Support" 1026 prompt "High Memory Support"
1029 default HIGHMEM4G if !X86_NUMAQ
1030 default HIGHMEM64G if X86_NUMAQ 1027 default HIGHMEM64G if X86_NUMAQ
1028 default HIGHMEM4G
1031 depends on X86_32 1029 depends on X86_32
1032 1030
1033config NOHIGHMEM 1031config NOHIGHMEM
@@ -1285,7 +1283,7 @@ source "mm/Kconfig"
1285 1283
1286config HIGHPTE 1284config HIGHPTE
1287 bool "Allocate 3rd-level pagetables from highmem" 1285 bool "Allocate 3rd-level pagetables from highmem"
1288 depends on X86_32 && (HIGHMEM4G || HIGHMEM64G) 1286 depends on HIGHMEM
1289 ---help--- 1287 ---help---
1290 The VM uses one page table entry for each page of physical memory. 1288 The VM uses one page table entry for each page of physical memory.
1291 For systems with a lot of RAM, this can be wasteful of precious 1289 For systems with a lot of RAM, this can be wasteful of precious
@@ -1369,8 +1367,7 @@ config MATH_EMULATION
1369 kernel, it won't hurt. 1367 kernel, it won't hurt.
1370 1368
1371config MTRR 1369config MTRR
1372 bool 1370 def_bool y
1373 default y
1374 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED 1371 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
1375 ---help--- 1372 ---help---
1376 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1373 On Intel P6 family processors (Pentium Pro, Pentium II and later)
@@ -1436,8 +1433,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1436 mtrr_spare_reg_nr=N on the kernel command line. 1433 mtrr_spare_reg_nr=N on the kernel command line.
1437 1434
1438config X86_PAT 1435config X86_PAT
1439 bool 1436 def_bool y
1440 default y
1441 prompt "x86 PAT support" if EMBEDDED 1437 prompt "x86 PAT support" if EMBEDDED
1442 depends on MTRR 1438 depends on MTRR
1443 ---help--- 1439 ---help---
@@ -1605,8 +1601,7 @@ config X86_NEED_RELOCS
1605 depends on X86_32 && RELOCATABLE 1601 depends on X86_32 && RELOCATABLE
1606 1602
1607config PHYSICAL_ALIGN 1603config PHYSICAL_ALIGN
1608 hex 1604 hex "Alignment value to which kernel should be aligned" if X86_32
1609 prompt "Alignment value to which kernel should be aligned" if X86_32
1610 default "0x1000000" 1605 default "0x1000000"
1611 range 0x2000 0x1000000 1606 range 0x2000 0x1000000
1612 ---help--- 1607 ---help---
@@ -1653,7 +1648,6 @@ config COMPAT_VDSO
1653 1648
1654config CMDLINE_BOOL 1649config CMDLINE_BOOL
1655 bool "Built-in kernel command line" 1650 bool "Built-in kernel command line"
1656 default n
1657 ---help--- 1651 ---help---
1658 Allow for specifying boot arguments to the kernel at 1652 Allow for specifying boot arguments to the kernel at
1659 build time. On some systems (e.g. embedded ones), it is 1653 build time. On some systems (e.g. embedded ones), it is
@@ -1687,7 +1681,6 @@ config CMDLINE
1687 1681
1688config CMDLINE_OVERRIDE 1682config CMDLINE_OVERRIDE
1689 bool "Built-in command line overrides boot loader arguments" 1683 bool "Built-in command line overrides boot loader arguments"
1690 default n
1691 depends on CMDLINE_BOOL 1684 depends on CMDLINE_BOOL
1692 ---help--- 1685 ---help---
1693 Set this option to 'Y' to have the kernel ignore the boot loader 1686 Set this option to 'Y' to have the kernel ignore the boot loader
@@ -1723,8 +1716,7 @@ source "drivers/acpi/Kconfig"
1723source "drivers/sfi/Kconfig" 1716source "drivers/sfi/Kconfig"
1724 1717
1725config X86_APM_BOOT 1718config X86_APM_BOOT
1726 bool 1719 def_bool y
1727 default y
1728 depends on APM || APM_MODULE 1720 depends on APM || APM_MODULE
1729 1721
1730menuconfig APM 1722menuconfig APM
@@ -1953,8 +1945,7 @@ config DMAR_DEFAULT_ON
1953 experimental. 1945 experimental.
1954 1946
1955config DMAR_BROKEN_GFX_WA 1947config DMAR_BROKEN_GFX_WA
1956 def_bool n 1948 bool "Workaround broken graphics drivers (going away soon)"
1957 prompt "Workaround broken graphics drivers (going away soon)"
1958 depends on DMAR && BROKEN 1949 depends on DMAR && BROKEN
1959 ---help--- 1950 ---help---
1960 Current Graphics drivers tend to use physical address 1951 Current Graphics drivers tend to use physical address
@@ -2052,7 +2043,6 @@ config SCx200HR_TIMER
2052config OLPC 2043config OLPC
2053 bool "One Laptop Per Child support" 2044 bool "One Laptop Per Child support"
2054 select GPIOLIB 2045 select GPIOLIB
2055 default n
2056 ---help--- 2046 ---help---
2057 Add support for detecting the unique features of the OLPC 2047 Add support for detecting the unique features of the OLPC
2058 XO hardware. 2048 XO hardware.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6a..2ac9069890cd 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -338,6 +338,10 @@ config X86_F00F_BUG
338 def_bool y 338 def_bool y
339 depends on M586MMX || M586TSC || M586 || M486 || M386 339 depends on M586MMX || M586TSC || M586 || M486 || M386
340 340
341config X86_INVD_BUG
342 def_bool y
343 depends on M486 || M386
344
341config X86_WP_WORKS_OK 345config X86_WP_WORKS_OK
342 def_bool y 346 def_bool y
343 depends on !M386 347 depends on !M386
@@ -502,23 +506,3 @@ config CPU_SUP_UMC_32
502 CPU might render the kernel unbootable. 506 CPU might render the kernel unbootable.
503 507
504 If unsure, say N. 508 If unsure, say N.
505
506config X86_DS
507 def_bool X86_PTRACE_BTS
508 depends on X86_DEBUGCTLMSR
509 select HAVE_HW_BRANCH_TRACER
510
511config X86_PTRACE_BTS
512 bool "Branch Trace Store"
513 default y
514 depends on X86_DEBUGCTLMSR
515 depends on BROKEN
516 ---help---
517 This adds a ptrace interface to the hardware's branch trace store.
518
519 Debuggers may use it to collect an execution trace of the debugged
520 application in order to answer the question 'how did I get here?'.
521 Debuggers may trace user mode as well as kernel mode.
522
523 Say Y unless there is no application development on this machine
524 and you want to save a small amount of code size.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb2..75085080b63e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -45,7 +45,6 @@ config EARLY_PRINTK
45 45
46config EARLY_PRINTK_DBGP 46config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port" 47 bool "Early printk via EHCI debug port"
48 default n
49 depends on EARLY_PRINTK && PCI 48 depends on EARLY_PRINTK && PCI
50 ---help--- 49 ---help---
51 Write kernel log output directly into the EHCI debug port. 50 Write kernel log output directly into the EHCI debug port.
@@ -76,7 +75,6 @@ config DEBUG_PER_CPU_MAPS
76 bool "Debug access to per_cpu maps" 75 bool "Debug access to per_cpu maps"
77 depends on DEBUG_KERNEL 76 depends on DEBUG_KERNEL
78 depends on SMP 77 depends on SMP
79 default n
80 ---help--- 78 ---help---
81 Say Y to verify that the per_cpu map being accessed has 79 Say Y to verify that the per_cpu map being accessed has
82 been setup. Adds a fair amount of code to kernel memory 80 been setup. Adds a fair amount of code to kernel memory
@@ -174,15 +172,6 @@ config IOMMU_LEAK
174 Add a simple leak tracer to the IOMMU code. This is useful when you 172 Add a simple leak tracer to the IOMMU code. This is useful when you
175 are debugging a buggy device driver that leaks IOMMU mappings. 173 are debugging a buggy device driver that leaks IOMMU mappings.
176 174
177config X86_DS_SELFTEST
178 bool "DS selftest"
179 default y
180 depends on DEBUG_KERNEL
181 depends on X86_DS
182 ---help---
183 Perform Debug Store selftests at boot time.
184 If in doubt, say "N".
185
186config HAVE_MMIOTRACE_SUPPORT 175config HAVE_MMIOTRACE_SUPPORT
187 def_bool y 176 def_bool y
188 177
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0a43dc515e4c..8aa1b59b9074 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -95,8 +95,9 @@ sp-$(CONFIG_X86_64) := rsp
95cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) 95cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
96# is .cfi_signal_frame supported too? 96# is .cfi_signal_frame supported too?
97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) 97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
98KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) 98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
99KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) 99KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
100KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
100 101
101LDFLAGS := -m elf_$(UTS_MACHINE) 102LDFLAGS := -m elf_$(UTS_MACHINE)
102 103
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index b97f786a48d5..a63a68be1cce 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -6,8 +6,8 @@
6 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
71: lock 71: lock
8 .section .smp_locks,"a" 8 .section .smp_locks,"a"
9 _ASM_ALIGN 9 .balign 4
10 _ASM_PTR 1b 10 .long 1b - .
11 .previous 11 .previous
12 .endm 12 .endm
13#else 13#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b3..03b6bb5394a0 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -28,20 +28,20 @@
28 */ 28 */
29 29
30#ifdef CONFIG_SMP 30#ifdef CONFIG_SMP
31#define LOCK_PREFIX \ 31#define LOCK_PREFIX_HERE \
32 ".section .smp_locks,\"a\"\n" \ 32 ".section .smp_locks,\"a\"\n" \
33 _ASM_ALIGN "\n" \ 33 ".balign 4\n" \
34 _ASM_PTR "661f\n" /* address */ \ 34 ".long 671f - .\n" /* offset */ \
35 ".previous\n" \ 35 ".previous\n" \
36 "661:\n\tlock; " 36 "671:"
37
38#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
37 39
38#else /* ! CONFIG_SMP */ 40#else /* ! CONFIG_SMP */
41#define LOCK_PREFIX_HERE ""
39#define LOCK_PREFIX "" 42#define LOCK_PREFIX ""
40#endif 43#endif
41 44
42/* This must be included *after* the definition of LOCK_PREFIX */
43#include <asm/cpufeature.h>
44
45struct alt_instr { 45struct alt_instr {
46 u8 *instr; /* original instruction */ 46 u8 *instr; /* original instruction */
47 u8 *replacement; 47 u8 *replacement;
@@ -96,6 +96,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
96 ".previous" 96 ".previous"
97 97
98/* 98/*
99 * This must be included *after* the definition of ALTERNATIVE due to
100 * <asm/arch_hweight.h>
101 */
102#include <asm/cpufeature.h>
103
104/*
99 * Alternative instructions for different CPU types or capabilities. 105 * Alternative instructions for different CPU types or capabilities.
100 * 106 *
101 * This allows to use optimized instructions even on generic binary 107 * This allows to use optimized instructions even on generic binary
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 86a0ff0aeac7..7014e88bc779 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -174,6 +174,40 @@
174 (~((1ULL << (12 + ((lvl) * 9))) - 1))) 174 (~((1ULL << (12 + ((lvl) * 9))) - 1)))
175#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) 175#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
176 176
177/*
178 * Returns the page table level to use for a given page size
179 * Pagesize is expected to be a power-of-two
180 */
181#define PAGE_SIZE_LEVEL(pagesize) \
182 ((__ffs(pagesize) - 12) / 9)
183/*
184 * Returns the number of ptes to use for a given page size
185 * Pagesize is expected to be a power-of-two
186 */
187#define PAGE_SIZE_PTE_COUNT(pagesize) \
188 (1ULL << ((__ffs(pagesize) - 12) % 9))
189
190/*
191 * Aligns a given io-virtual address to a given page size
192 * Pagesize is expected to be a power-of-two
193 */
194#define PAGE_SIZE_ALIGN(address, pagesize) \
195 ((address) & ~((pagesize) - 1))
196/*
197 * Creates an IOMMU PTE for an address an a given pagesize
198 * The PTE has no permission bits set
199 * Pagesize is expected to be a power-of-two larger than 4096
200 */
201#define PAGE_SIZE_PTE(address, pagesize) \
202 (((address) | ((pagesize) - 1)) & \
203 (~(pagesize >> 1)) & PM_ADDR_MASK)
204
205/*
206 * Takes a PTE value with mode=0x07 and returns the page size it maps
207 */
208#define PTE_PAGE_SIZE(pte) \
209 (1ULL << (1 + ffz(((pte) | 0xfffULL))))
210
177#define IOMMU_PTE_P (1ULL << 0) 211#define IOMMU_PTE_P (1ULL << 0)
178#define IOMMU_PTE_TV (1ULL << 1) 212#define IOMMU_PTE_TV (1ULL << 1)
179#define IOMMU_PTE_U (1ULL << 59) 213#define IOMMU_PTE_U (1ULL << 59)
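Illustrative note (not part of the patch): a quick worked example of the page-size helpers added above, assuming __ffs() returns the 0-based index of the lowest set bit as in <linux/bitops.h>:

	PAGE_SIZE_LEVEL(1 << 12 /* 4 KiB */)  = (12 - 12) / 9 = 0,  PAGE_SIZE_PTE_COUNT = 1 << 0 = 1
	PAGE_SIZE_LEVEL(1 << 16 /* 64 KiB */) = (16 - 12) / 9 = 0,  PAGE_SIZE_PTE_COUNT = 1 << 4 = 16
	PAGE_SIZE_LEVEL(1 << 21 /* 2 MiB */)  = (21 - 12) / 9 = 1,  PAGE_SIZE_PTE_COUNT = 1 << 0 = 1
	PAGE_SIZE_LEVEL(1 << 30 /* 1 GiB */)  = (30 - 12) / 9 = 2,  PAGE_SIZE_PTE_COUNT = 1 << 0 = 1

Sizes that fall between the native levels (4 KiB, 2 MiB, 1 GiB, ...) are therefore mapped by replicating one PTE PAGE_SIZE_PTE_COUNT times at the level below.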
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index b4ac2cdcb64f..1fa03e04ae44 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -373,6 +373,7 @@ extern atomic_t init_deasserted;
373extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); 373extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
374#endif 374#endif
375 375
376#ifdef CONFIG_X86_LOCAL_APIC
376static inline u32 apic_read(u32 reg) 377static inline u32 apic_read(u32 reg)
377{ 378{
378 return apic->read(reg); 379 return apic->read(reg);
@@ -403,10 +404,19 @@ static inline u32 safe_apic_wait_icr_idle(void)
403 return apic->safe_wait_icr_idle(); 404 return apic->safe_wait_icr_idle();
404} 405}
405 406
407#else /* CONFIG_X86_LOCAL_APIC */
408
409static inline u32 apic_read(u32 reg) { return 0; }
410static inline void apic_write(u32 reg, u32 val) { }
411static inline u64 apic_icr_read(void) { return 0; }
412static inline void apic_icr_write(u32 low, u32 high) { }
413static inline void apic_wait_icr_idle(void) { }
414static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
415
416#endif /* CONFIG_X86_LOCAL_APIC */
406 417
407static inline void ack_APIC_irq(void) 418static inline void ack_APIC_irq(void)
408{ 419{
409#ifdef CONFIG_X86_LOCAL_APIC
410 /* 420 /*
411 * ack_APIC_irq() actually gets compiled as a single instruction 421 * ack_APIC_irq() actually gets compiled as a single instruction
412 * ... yummie. 422 * ... yummie.
@@ -414,7 +424,6 @@ static inline void ack_APIC_irq(void)
414 424
415 /* Docs say use 0 for future compatibility */ 425 /* Docs say use 0 for future compatibility */
416 apic_write(APIC_EOI, 0); 426 apic_write(APIC_EOI, 0);
417#endif
418} 427}
419 428
420static inline unsigned default_get_apic_id(unsigned long x) 429static inline unsigned default_get_apic_id(unsigned long x)
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
new file mode 100644
index 000000000000..9686c3d9ff73
--- /dev/null
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -0,0 +1,61 @@
1#ifndef _ASM_X86_HWEIGHT_H
2#define _ASM_X86_HWEIGHT_H
3
4#ifdef CONFIG_64BIT
5/* popcnt %edi, %eax -- redundant REX prefix for alignment */
6#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
7/* popcnt %rdi, %rax */
8#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
9#define REG_IN "D"
10#define REG_OUT "a"
11#else
12/* popcnt %eax, %eax */
13#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
14#define REG_IN "a"
15#define REG_OUT "a"
16#endif
17
18/*
19 * __sw_hweightXX are called from within the alternatives below
20 * and callee-clobbered registers need to be taken care of. See
21 * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
22 * compiler switches.
23 */
24static inline unsigned int __arch_hweight32(unsigned int w)
25{
26 unsigned int res = 0;
27
28 asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
29 : "="REG_OUT (res)
30 : REG_IN (w));
31
32 return res;
33}
34
35static inline unsigned int __arch_hweight16(unsigned int w)
36{
37 return __arch_hweight32(w & 0xffff);
38}
39
40static inline unsigned int __arch_hweight8(unsigned int w)
41{
42 return __arch_hweight32(w & 0xff);
43}
44
45static inline unsigned long __arch_hweight64(__u64 w)
46{
47 unsigned long res = 0;
48
49#ifdef CONFIG_X86_32
50 return __arch_hweight32((u32)w) +
51 __arch_hweight32((u32)(w >> 32));
52#else
53 asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
54 : "="REG_OUT (res)
55 : REG_IN (w));
56#endif /* CONFIG_X86_32 */
57
58 return res;
59}
60
61#endif
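Illustrative sketch (not part of the patch): how the POPCNT alternative above is reached in practice, assuming the generic wrappers in <linux/bitops.h> forward non-constant arguments to __arch_hweight32()/__arch_hweight64():

	/* Hypothetical caller, for illustration only. */
	#include <linux/bitops.h>

	static unsigned int count_set_cpus(unsigned long mask)
	{
		/*
		 * hweight_long() ends up in __arch_hweight32/64(), i.e. either a
		 * call to __sw_hweight32/64() or a patched-in POPCNT instruction,
		 * depending on X86_FEATURE_POPCNT at alternatives-patching time.
		 */
		return hweight_long(mask);
	}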
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 8f8217b9bdac..952a826ac4e5 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -22,7 +22,7 @@
22 */ 22 */
23static inline int atomic_read(const atomic_t *v) 23static inline int atomic_read(const atomic_t *v)
24{ 24{
25 return v->counter; 25 return (*(volatile int *)&(v)->counter);
26} 26}
27 27
28/** 28/**
@@ -246,6 +246,29 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
246 246
247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) 247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
248 248
249/*
250 * atomic_dec_if_positive - decrement by 1 if old value positive
251 * @v: pointer of type atomic_t
252 *
253 * The function returns the old value of *v minus 1, even if
254 * the atomic variable, v, was not decremented.
255 */
256static inline int atomic_dec_if_positive(atomic_t *v)
257{
258 int c, old, dec;
259 c = atomic_read(v);
260 for (;;) {
261 dec = c - 1;
262 if (unlikely(dec < 0))
263 break;
264 old = atomic_cmpxchg((v), c, dec);
265 if (likely(old == c))
266 break;
267 c = old;
268 }
269 return dec;
270}
271
249/** 272/**
250 * atomic_inc_short - increment of a short integer 273 * atomic_inc_short - increment of a short integer
251 * @v: pointer to type int 274 * @v: pointer to type int
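Illustrative sketch (not part of the patch): the intended calling pattern for the new atomic_dec_if_positive() helper, using hypothetical names:

	/* Hypothetical consumer, for illustration only. */
	static atomic_t slots_free = ATOMIC_INIT(4);

	static int try_take_slot(void)
	{
		/*
		 * atomic_dec_if_positive() returns the old value minus one even
		 * when no decrement happened, so a negative result means no
		 * slot was taken.
		 */
		return atomic_dec_if_positive(&slots_free) >= 0;
	}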
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 03027bf28de5..2a934aa19a43 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,109 +14,193 @@ typedef struct {
14 14
15#define ATOMIC64_INIT(val) { (val) } 15#define ATOMIC64_INIT(val) { (val) }
16 16
17extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); 17#ifdef CONFIG_X86_CMPXCHG64
18#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8"
19#else
20#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8)
21#endif
22
23#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f)
24
25/**
26 * atomic64_cmpxchg - cmpxchg atomic64 variable
27 * @p: pointer to type atomic64_t
28 * @o: expected value
29 * @n: new value
30 *
31 * Atomically sets @v to @n if it was equal to @o and returns
32 * the old value.
33 */
34
35static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
36{
37 return cmpxchg64(&v->counter, o, n);
38}
18 39
19/** 40/**
20 * atomic64_xchg - xchg atomic64 variable 41 * atomic64_xchg - xchg atomic64 variable
21 * @ptr: pointer to type atomic64_t 42 * @v: pointer to type atomic64_t
22 * @new_val: value to assign 43 * @n: value to assign
23 * 44 *
24 * Atomically xchgs the value of @ptr to @new_val and returns 45 * Atomically xchgs the value of @v to @n and returns
25 * the old value. 46 * the old value.
26 */ 47 */
27extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); 48static inline long long atomic64_xchg(atomic64_t *v, long long n)
49{
50 long long o;
51 unsigned high = (unsigned)(n >> 32);
52 unsigned low = (unsigned)n;
53 asm volatile(ATOMIC64_ALTERNATIVE(xchg)
54 : "=A" (o), "+b" (low), "+c" (high)
55 : "S" (v)
56 : "memory"
57 );
58 return o;
59}
28 60
29/** 61/**
30 * atomic64_set - set atomic64 variable 62 * atomic64_set - set atomic64 variable
31 * @ptr: pointer to type atomic64_t 63 * @v: pointer to type atomic64_t
32 * @new_val: value to assign 64 * @n: value to assign
33 * 65 *
34 * Atomically sets the value of @ptr to @new_val. 66 * Atomically sets the value of @v to @n.
35 */ 67 */
36extern void atomic64_set(atomic64_t *ptr, u64 new_val); 68static inline void atomic64_set(atomic64_t *v, long long i)
69{
70 unsigned high = (unsigned)(i >> 32);
71 unsigned low = (unsigned)i;
72 asm volatile(ATOMIC64_ALTERNATIVE(set)
73 : "+b" (low), "+c" (high)
74 : "S" (v)
75 : "eax", "edx", "memory"
76 );
77}
37 78
38/** 79/**
39 * atomic64_read - read atomic64 variable 80 * atomic64_read - read atomic64 variable
40 * @ptr: pointer to type atomic64_t 81 * @v: pointer to type atomic64_t
41 * 82 *
42 * Atomically reads the value of @ptr and returns it. 83 * Atomically reads the value of @v and returns it.
43 */ 84 */
44static inline u64 atomic64_read(atomic64_t *ptr) 85static inline long long atomic64_read(atomic64_t *v)
45{ 86{
46 u64 res; 87 long long r;
47 88 asm volatile(ATOMIC64_ALTERNATIVE(read)
48 /* 89 : "=A" (r), "+c" (v)
49 * Note, we inline this atomic64_t primitive because 90 : : "memory"
50 * it only clobbers EAX/EDX and leaves the others 91 );
51 * untouched. We also (somewhat subtly) rely on the 92 return r;
52 * fact that cmpxchg8b returns the current 64-bit value 93 }
53 * of the memory location we are touching:
54 */
55 asm volatile(
56 "mov %%ebx, %%eax\n\t"
57 "mov %%ecx, %%edx\n\t"
58 LOCK_PREFIX "cmpxchg8b %1\n"
59 : "=&A" (res)
60 : "m" (*ptr)
61 );
62
63 return res;
64}
65
66extern u64 atomic64_read(atomic64_t *ptr);
67 94
68/** 95/**
69 * atomic64_add_return - add and return 96 * atomic64_add_return - add and return
70 * @delta: integer value to add 97 * @i: integer value to add
71 * @ptr: pointer to type atomic64_t 98 * @v: pointer to type atomic64_t
72 * 99 *
73 * Atomically adds @delta to @ptr and returns @delta + *@ptr 100 * Atomically adds @i to @v and returns @i + *@v
74 */ 101 */
75extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); 102static inline long long atomic64_add_return(long long i, atomic64_t *v)
103{
104 asm volatile(ATOMIC64_ALTERNATIVE(add_return)
105 : "+A" (i), "+c" (v)
106 : : "memory"
107 );
108 return i;
109}
76 110
77/* 111/*
78 * Other variants with different arithmetic operators: 112 * Other variants with different arithmetic operators:
79 */ 113 */
80extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); 114static inline long long atomic64_sub_return(long long i, atomic64_t *v)
81extern u64 atomic64_inc_return(atomic64_t *ptr); 115{
82extern u64 atomic64_dec_return(atomic64_t *ptr); 116 asm volatile(ATOMIC64_ALTERNATIVE(sub_return)
117 : "+A" (i), "+c" (v)
118 : : "memory"
119 );
120 return i;
121}
122
123static inline long long atomic64_inc_return(atomic64_t *v)
124{
125 long long a;
126 asm volatile(ATOMIC64_ALTERNATIVE(inc_return)
127 : "=A" (a)
128 : "S" (v)
129 : "memory", "ecx"
130 );
131 return a;
132}
133
134static inline long long atomic64_dec_return(atomic64_t *v)
135{
136 long long a;
137 asm volatile(ATOMIC64_ALTERNATIVE(dec_return)
138 : "=A" (a)
139 : "S" (v)
140 : "memory", "ecx"
141 );
142 return a;
143}
83 144
84/** 145/**
85 * atomic64_add - add integer to atomic64 variable 146 * atomic64_add - add integer to atomic64 variable
86 * @delta: integer value to add 147 * @i: integer value to add
87 * @ptr: pointer to type atomic64_t 148 * @v: pointer to type atomic64_t
88 * 149 *
89 * Atomically adds @delta to @ptr. 150 * Atomically adds @i to @v.
90 */ 151 */
91extern void atomic64_add(u64 delta, atomic64_t *ptr); 152static inline long long atomic64_add(long long i, atomic64_t *v)
153{
154 asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return)
155 : "+A" (i), "+c" (v)
156 : : "memory"
157 );
158 return i;
159}
92 160
93/** 161/**
94 * atomic64_sub - subtract the atomic64 variable 162 * atomic64_sub - subtract the atomic64 variable
95 * @delta: integer value to subtract 163 * @i: integer value to subtract
96 * @ptr: pointer to type atomic64_t 164 * @v: pointer to type atomic64_t
97 * 165 *
98 * Atomically subtracts @delta from @ptr. 166 * Atomically subtracts @i from @v.
99 */ 167 */
100extern void atomic64_sub(u64 delta, atomic64_t *ptr); 168static inline long long atomic64_sub(long long i, atomic64_t *v)
169{
170 asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return)
171 : "+A" (i), "+c" (v)
172 : : "memory"
173 );
174 return i;
175}
101 176
102/** 177/**
103 * atomic64_sub_and_test - subtract value from variable and test result 178 * atomic64_sub_and_test - subtract value from variable and test result
104 * @delta: integer value to subtract 179 * @i: integer value to subtract
105 * @ptr: pointer to type atomic64_t 180 * @v: pointer to type atomic64_t
106 * 181 *
107 * Atomically subtracts @delta from @ptr and returns 182 * Atomically subtracts @i from @v and returns
108 * true if the result is zero, or false for all 183 * true if the result is zero, or false for all
109 * other cases. 184 * other cases.
110 */ 185 */
111extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); 186static inline int atomic64_sub_and_test(long long i, atomic64_t *v)
187{
188 return atomic64_sub_return(i, v) == 0;
189}
112 190
113/** 191/**
114 * atomic64_inc - increment atomic64 variable 192 * atomic64_inc - increment atomic64 variable
115 * @ptr: pointer to type atomic64_t 193 * @v: pointer to type atomic64_t
116 * 194 *
117 * Atomically increments @ptr by 1. 195 * Atomically increments @v by 1.
118 */ 196 */
119extern void atomic64_inc(atomic64_t *ptr); 197static inline void atomic64_inc(atomic64_t *v)
198{
199 asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return)
200 : : "S" (v)
201 : "memory", "eax", "ecx", "edx"
202 );
203}
120 204
121/** 205/**
122 * atomic64_dec - decrement atomic64 variable 206 * atomic64_dec - decrement atomic64 variable
@@ -124,37 +208,97 @@ extern void atomic64_inc(atomic64_t *ptr);
124 * 208 *
125 * Atomically decrements @ptr by 1. 209 * Atomically decrements @ptr by 1.
126 */ 210 */
127extern void atomic64_dec(atomic64_t *ptr); 211static inline void atomic64_dec(atomic64_t *v)
212{
213 asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return)
214 : : "S" (v)
215 : "memory", "eax", "ecx", "edx"
216 );
217}
128 218
129/** 219/**
130 * atomic64_dec_and_test - decrement and test 220 * atomic64_dec_and_test - decrement and test
131 * @ptr: pointer to type atomic64_t 221 * @v: pointer to type atomic64_t
132 * 222 *
133 * Atomically decrements @ptr by 1 and 223 * Atomically decrements @v by 1 and
134 * returns true if the result is 0, or false for all other 224 * returns true if the result is 0, or false for all other
135 * cases. 225 * cases.
136 */ 226 */
137extern int atomic64_dec_and_test(atomic64_t *ptr); 227static inline int atomic64_dec_and_test(atomic64_t *v)
228{
229 return atomic64_dec_return(v) == 0;
230}
138 231
139/** 232/**
140 * atomic64_inc_and_test - increment and test 233 * atomic64_inc_and_test - increment and test
141 * @ptr: pointer to type atomic64_t 234 * @v: pointer to type atomic64_t
142 * 235 *
143 * Atomically increments @ptr by 1 236 * Atomically increments @v by 1
144 * and returns true if the result is zero, or false for all 237 * and returns true if the result is zero, or false for all
145 * other cases. 238 * other cases.
146 */ 239 */
147extern int atomic64_inc_and_test(atomic64_t *ptr); 240static inline int atomic64_inc_and_test(atomic64_t *v)
241{
242 return atomic64_inc_return(v) == 0;
243}
148 244
149/** 245/**
150 * atomic64_add_negative - add and test if negative 246 * atomic64_add_negative - add and test if negative
151 * @delta: integer value to add 247 * @i: integer value to add
152 * @ptr: pointer to type atomic64_t 248 * @v: pointer to type atomic64_t
153 * 249 *
154 * Atomically adds @delta to @ptr and returns true 250 * Atomically adds @i to @v and returns true
155 * if the result is negative, or false when 251 * if the result is negative, or false when
156 * result is greater than or equal to zero. 252 * result is greater than or equal to zero.
157 */ 253 */
158extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); 254static inline int atomic64_add_negative(long long i, atomic64_t *v)
255{
256 return atomic64_add_return(i, v) < 0;
257}
258
259/**
260 * atomic64_add_unless - add unless the number is a given value
261 * @v: pointer of type atomic64_t
262 * @a: the amount to add to v...
263 * @u: ...unless v is equal to u.
264 *
265 * Atomically adds @a to @v, so long as it was not @u.
266 * Returns non-zero if @v was not @u, and zero otherwise.
267 */
268static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
269{
270 unsigned low = (unsigned)u;
271 unsigned high = (unsigned)(u >> 32);
272 asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t"
273 : "+A" (a), "+c" (v), "+S" (low), "+D" (high)
274 : : "memory");
275 return (int)a;
276}
277
278
279static inline int atomic64_inc_not_zero(atomic64_t *v)
280{
281 int r;
282 asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero)
283 : "=a" (r)
284 : "S" (v)
285 : "ecx", "edx", "memory"
286 );
287 return r;
288}
289
290static inline long long atomic64_dec_if_positive(atomic64_t *v)
291{
292 long long r;
293 asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive)
294 : "=A" (r)
295 : "S" (v)
296 : "ecx", "memory"
297 );
298 return r;
299}
300
301#undef ATOMIC64_ALTERNATIVE
302#undef ATOMIC64_ALTERNATIVE_
159 303
160#endif /* _ASM_X86_ATOMIC64_32_H */ 304#endif /* _ASM_X86_ATOMIC64_32_H */
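Illustrative note (not part of the patch): ATOMIC64_ALTERNATIVE(read), for example, expands to a plain "call atomic64_read_cx8" when CONFIG_X86_CMPXCHG64 is set, and otherwise to ALTERNATIVE("call atomic64_read_386", "call atomic64_read_cx8", X86_FEATURE_CX8), so CPUs without CMPXCHG8B fall back to the helpers in arch/x86/lib/atomic64_386_32.S while everything else is patched to the cmpxchg8b-based routines in arch/x86/lib/atomic64_cx8_32.S.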
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 51c5b4056929..49fd1ea22951 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
18 */ 18 */
19static inline long atomic64_read(const atomic64_t *v) 19static inline long atomic64_read(const atomic64_t *v)
20{ 20{
21 return v->counter; 21 return (*(volatile long *)&(v)->counter);
22} 22}
23 23
24/** 24/**
@@ -221,4 +221,27 @@ static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
221 221
222#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) 222#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
223 223
224/*
225 * atomic64_dec_if_positive - decrement by 1 if old value positive
226 * @v: pointer of type atomic_t
227 *
228 * The function returns the old value of *v minus 1, even if
229 * the atomic variable, v, was not decremented.
230 */
231static inline long atomic64_dec_if_positive(atomic64_t *v)
232{
233 long c, old, dec;
234 c = atomic64_read(v);
235 for (;;) {
236 dec = c - 1;
237 if (unlikely(dec < 0))
238 break;
239 old = atomic64_cmpxchg((v), c, dec);
240 if (likely(old == c))
241 break;
242 c = old;
243 }
244 return dec;
245}
246
224#endif /* _ASM_X86_ATOMIC64_64_H */ 247#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 02b47a603fc8..545776efeb16 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -444,7 +444,9 @@ static inline int fls(int x)
444 444
445#define ARCH_HAS_FAST_MULTIPLIER 1 445#define ARCH_HAS_FAST_MULTIPLIER 1
446 446
447#include <asm-generic/bitops/hweight.h> 447#include <asm/arch_hweight.h>
448
449#include <asm-generic/bitops/const_hweight.h>
448 450
449#endif /* __KERNEL__ */ 451#endif /* __KERNEL__ */
450 452
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 7a1065958ba9..3b62ab56c7a0 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -24,7 +24,7 @@
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) 24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25 25
26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ 26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
27 (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)) 27 (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
28#error "Invalid value for CONFIG_PHYSICAL_ALIGN" 28#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
29#endif 29#endif
30 30
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ffb9bb6b6c37..8859e12dd3cf 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -271,7 +271,8 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
271 __typeof__(*(ptr)) __ret; \ 271 __typeof__(*(ptr)) __ret; \
272 __typeof__(*(ptr)) __old = (o); \ 272 __typeof__(*(ptr)) __old = (o); \
273 __typeof__(*(ptr)) __new = (n); \ 273 __typeof__(*(ptr)) __new = (n); \
274 alternative_io("call cmpxchg8b_emu", \ 274 alternative_io(LOCK_PREFIX_HERE \
275 "call cmpxchg8b_emu", \
275 "lock; cmpxchg8b (%%esi)" , \ 276 "lock; cmpxchg8b (%%esi)" , \
276 X86_FEATURE_CX8, \ 277 X86_FEATURE_CX8, \
277 "=A" (__ret), \ 278 "=A" (__ret), \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0cd82d068613..dca9c545f44e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -161,6 +161,7 @@
161 */ 161 */
162#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ 162#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
163#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ 163#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
164#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
164 165
165/* Virtualization flags: Linux defined */ 166/* Virtualization flags: Linux defined */
166#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ 167#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
@@ -175,6 +176,7 @@
175 176
176#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 177#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
177 178
179#include <asm/asm.h>
178#include <linux/bitops.h> 180#include <linux/bitops.h>
179 181
180extern const char * const x86_cap_flags[NCAPINTS*32]; 182extern const char * const x86_cap_flags[NCAPINTS*32];
@@ -283,6 +285,62 @@ extern const char * const x86_power_flags[32];
283 285
284#endif /* CONFIG_X86_64 */ 286#endif /* CONFIG_X86_64 */
285 287
288/*
289 * Static testing of CPU features. Used the same as boot_cpu_has().
290 * These are only valid after alternatives have run, but will statically
291 * patch the target code for additional performance.
292 *
293 */
294static __always_inline __pure bool __static_cpu_has(u8 bit)
295{
296#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
297 asm goto("1: jmp %l[t_no]\n"
298 "2:\n"
299 ".section .altinstructions,\"a\"\n"
300 _ASM_ALIGN "\n"
301 _ASM_PTR "1b\n"
302 _ASM_PTR "0\n" /* no replacement */
303 " .byte %P0\n" /* feature bit */
304 " .byte 2b - 1b\n" /* source len */
305 " .byte 0\n" /* replacement len */
306 " .byte 0xff + 0 - (2b-1b)\n" /* padding */
307 ".previous\n"
308 : : "i" (bit) : : t_no);
309 return true;
310 t_no:
311 return false;
312#else
313 u8 flag;
314 /* Open-coded due to __stringify() in ALTERNATIVE() */
315 asm volatile("1: movb $0,%0\n"
316 "2:\n"
317 ".section .altinstructions,\"a\"\n"
318 _ASM_ALIGN "\n"
319 _ASM_PTR "1b\n"
320 _ASM_PTR "3f\n"
321 " .byte %P1\n" /* feature bit */
322 " .byte 2b - 1b\n" /* source len */
323 " .byte 4f - 3f\n" /* replacement len */
324 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */
325 ".previous\n"
326 ".section .altinstr_replacement,\"ax\"\n"
327 "3: movb $1,%0\n"
328 "4:\n"
329 ".previous\n"
330 : "=qm" (flag) : "i" (bit));
331 return flag;
332#endif
333}
334
335#define static_cpu_has(bit) \
336( \
337 __builtin_constant_p(boot_cpu_has(bit)) ? \
338 boot_cpu_has(bit) : \
339 (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \
340 __static_cpu_has(bit) : \
341 boot_cpu_has(bit) \
342)
343
286#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ 344#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
287 345
288#endif /* _ASM_X86_CPUFEATURE_H */ 346#endif /* _ASM_X86_CPUFEATURE_H */
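Illustrative sketch (not part of the patch): static_cpu_has() is described above as a drop-in for boot_cpu_has() once alternatives have run; a hypothetical hot-path check might look like:

	/* Hypothetical helpers, for illustration only. */
	static void save_fpu_state(void)
	{
		if (static_cpu_has(X86_FEATURE_XSAVE))
			do_xsave_path();	/* patched to a direct jump on gcc >= 4.5 */
		else
			do_fxsave_path();
	}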
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
deleted file mode 100644
index 70dac199b093..000000000000
--- a/arch/x86/include/asm/ds.h
+++ /dev/null
@@ -1,302 +0,0 @@
1/*
2 * Debug Store (DS) support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done)
11 * - buffer access
12 *
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
16 *
17 *
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */
21
22#ifndef _ASM_X86_DS_H
23#define _ASM_X86_DS_H
24
25
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/err.h>
29
30
31#ifdef CONFIG_X86_DS
32
33struct task_struct;
34struct ds_context;
35struct ds_tracer;
36struct bts_tracer;
37struct pebs_tracer;
38
39typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
40typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
41
42
43/*
44 * A list of features plus corresponding macros to talk about them in
45 * the ds_request function's flags parameter.
46 *
47 * We use the enum to index an array of corresponding control bits;
48 * we use the macro to index a flags bit-vector.
49 */
50enum ds_feature {
51 dsf_bts = 0,
52 dsf_bts_kernel,
53#define BTS_KERNEL (1 << dsf_bts_kernel)
54 /* trace kernel-mode branches */
55
56 dsf_bts_user,
57#define BTS_USER (1 << dsf_bts_user)
58 /* trace user-mode branches */
59
60 dsf_bts_overflow,
61 dsf_bts_max,
62 dsf_pebs = dsf_bts_max,
63
64 dsf_pebs_max,
65 dsf_ctl_max = dsf_pebs_max,
66 dsf_bts_timestamps = dsf_ctl_max,
67#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
68 /* add timestamps into BTS trace */
69
70#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
71};
72
73
74/*
75 * Request BTS or PEBS
76 *
77 * Due to alignement constraints, the actual buffer may be slightly
78 * smaller than the requested or provided buffer.
79 *
80 * Returns a pointer to a tracer structure on success, or
81 * ERR_PTR(errcode) on failure.
82 *
83 * The interrupt threshold is independent from the overflow callback
84 * to allow users to use their own overflow interrupt handling mechanism.
85 *
86 * The function might sleep.
87 *
88 * task: the task to request recording for
89 * cpu: the cpu to request recording for
90 * base: the base pointer for the (non-pageable) buffer;
91 * size: the size of the provided buffer in bytes
92 * ovfl: pointer to a function to be called on buffer overflow;
93 * NULL if cyclic buffer requested
94 * th: the interrupt threshold in records from the end of the buffer;
95 * -1 if no interrupt threshold is requested.
96 * flags: a bit-mask of the above flags
97 */
98extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
99 void *base, size_t size,
100 bts_ovfl_callback_t ovfl,
101 size_t th, unsigned int flags);
102extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
103 bts_ovfl_callback_t ovfl,
104 size_t th, unsigned int flags);
105extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
106 void *base, size_t size,
107 pebs_ovfl_callback_t ovfl,
108 size_t th, unsigned int flags);
109extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
110 void *base, size_t size,
111 pebs_ovfl_callback_t ovfl,
112 size_t th, unsigned int flags);
113
114/*
115 * Release BTS or PEBS resources
116 * Suspend and resume BTS or PEBS tracing
117 *
118 * Must be called with irq's enabled.
119 *
120 * tracer: the tracer handle returned from ds_request_~()
121 */
122extern void ds_release_bts(struct bts_tracer *tracer);
123extern void ds_suspend_bts(struct bts_tracer *tracer);
124extern void ds_resume_bts(struct bts_tracer *tracer);
125extern void ds_release_pebs(struct pebs_tracer *tracer);
126extern void ds_suspend_pebs(struct pebs_tracer *tracer);
127extern void ds_resume_pebs(struct pebs_tracer *tracer);
128
129/*
130 * Release BTS or PEBS resources
131 * Suspend and resume BTS or PEBS tracing
132 *
133 * Cpu tracers must call this on the traced cpu.
134 * Task tracers must call ds_release_~_noirq() for themselves.
135 *
136 * May be called with irq's disabled.
137 *
138 * Returns 0 if successful;
139 * -EPERM if the cpu tracer does not trace the current cpu.
140 * -EPERM if the task tracer does not trace itself.
141 *
142 * tracer: the tracer handle returned from ds_request_~()
143 */
144extern int ds_release_bts_noirq(struct bts_tracer *tracer);
145extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
146extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
147extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
148extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
149extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
150
151
152/*
153 * The raw DS buffer state as it is used for BTS and PEBS recording.
154 *
155 * This is the low-level, arch-dependent interface for working
156 * directly on the raw trace data.
157 */
158struct ds_trace {
159 /* the number of bts/pebs records */
160 size_t n;
161 /* the size of a bts/pebs record in bytes */
162 size_t size;
163 /* pointers into the raw buffer:
164 - to the first entry */
165 void *begin;
166 /* - one beyond the last entry */
167 void *end;
168 /* - one beyond the newest entry */
169 void *top;
170 /* - the interrupt threshold */
171 void *ith;
172 /* flags given on ds_request() */
173 unsigned int flags;
174};
175
176/*
177 * An arch-independent view on branch trace data.
178 */
179enum bts_qualifier {
180 bts_invalid,
181#define BTS_INVALID bts_invalid
182
183 bts_branch,
184#define BTS_BRANCH bts_branch
185
186 bts_task_arrives,
187#define BTS_TASK_ARRIVES bts_task_arrives
188
189 bts_task_departs,
190#define BTS_TASK_DEPARTS bts_task_departs
191
192 bts_qual_bit_size = 4,
193 bts_qual_max = (1 << bts_qual_bit_size),
194};
195
196struct bts_struct {
197 __u64 qualifier;
198 union {
199 /* BTS_BRANCH */
200 struct {
201 __u64 from;
202 __u64 to;
203 } lbr;
204 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
205 struct {
206 __u64 clock;
207 pid_t pid;
208 } event;
209 } variant;
210};
211
212
213/*
214 * The BTS state.
215 *
216 * This gives access to the raw DS state and adds functions to provide
217 * an arch-independent view of the BTS data.
218 */
219struct bts_trace {
220 struct ds_trace ds;
221
222 int (*read)(struct bts_tracer *tracer, const void *at,
223 struct bts_struct *out);
224 int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
225};
226
227
228/*
229 * The PEBS state.
230 *
231 * This gives access to the raw DS state and the PEBS-specific counter
232 * reset value.
233 */
234struct pebs_trace {
235 struct ds_trace ds;
236
237 /* the number of valid counters in the below array */
238 unsigned int counters;
239
240#define MAX_PEBS_COUNTERS 4
241 /* the counter reset value */
242 unsigned long long counter_reset[MAX_PEBS_COUNTERS];
243};
244
245
246/*
247 * Read the BTS or PEBS trace.
248 *
249 * Returns a view on the trace collected for the parameter tracer.
250 *
251 * The view remains valid as long as the traced task is not running or
252 * the tracer is suspended.
253 * Writes into the trace buffer are not reflected.
254 *
255 * tracer: the tracer handle returned from ds_request_~()
256 */
257extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
258extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
259
260
261/*
262 * Reset the write pointer of the BTS/PEBS buffer.
263 *
264 * Returns 0 on success; -Eerrno on error
265 *
266 * tracer: the tracer handle returned from ds_request_~()
267 */
268extern int ds_reset_bts(struct bts_tracer *tracer);
269extern int ds_reset_pebs(struct pebs_tracer *tracer);
270
271/*
272 * Set the PEBS counter reset value.
273 *
274 * Returns 0 on success; -Eerrno on error
275 *
276 * tracer: the tracer handle returned from ds_request_pebs()
277 * counter: the index of the counter
278 * value: the new counter reset value
279 */
280extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
281 unsigned int counter, u64 value);
282
283/*
284 * Initialization
285 */
286struct cpuinfo_x86;
287extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
288
289/*
290 * Context switch work
291 */
292extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
293
294#else /* CONFIG_X86_DS */
295
296struct cpuinfo_x86;
297static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
298static inline void ds_switch_to(struct task_struct *prev,
299 struct task_struct *next) {}
300
301#endif /* CONFIG_X86_DS */
302#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index ae6253ab9029..733f7e91e7a9 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -34,6 +34,18 @@
34#define CFI_SIGNAL_FRAME 34#define CFI_SIGNAL_FRAME
35#endif 35#endif
36 36
37#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
38 /*
39 * Emit CFI data in .debug_frame sections, not .eh_frame sections.
40 * The latter we currently just discard since we don't do DWARF
41 * unwinding at runtime. So only the offline DWARF information is
42 * useful to anyone. Note we should not use this directive if this
43 * file is used in the vDSO assembly, or if vmlinux.lds.S gets
44 * changed so it doesn't discard .eh_frame.
45 */
46 .cfi_sections .debug_frame
47#endif
48
37#else 49#else
38 50
39/* 51/*
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f8576427cfe..aeab29aee617 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
35 35
36#define __ARCH_IRQ_STAT 36#define __ARCH_IRQ_STAT
37 37
38#define inc_irq_stat(member) percpu_add(irq_stat.member, 1) 38#define inc_irq_stat(member) percpu_inc(irq_stat.member)
39 39
40#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) 40#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
41 41
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 2a1bd8f4f23a..942255310e6a 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -41,12 +41,16 @@ struct arch_hw_breakpoint {
41/* Total number of available HW breakpoint registers */ 41/* Total number of available HW breakpoint registers */
42#define HBP_NUM 4 42#define HBP_NUM 4
43 43
44static inline int hw_breakpoint_slots(int type)
45{
46 return HBP_NUM;
47}
48
44struct perf_event; 49struct perf_event;
45struct pmu; 50struct pmu;
46 51
47extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); 52extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
48extern int arch_validate_hwbkpt_settings(struct perf_event *bp, 53extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
49 struct task_struct *tsk);
50extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, 54extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
51 unsigned long val, void *data); 55 unsigned long val, void *data);
52 56
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index e153a2b3889a..5df477ac3af7 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -1,5 +1,5 @@
1#ifndef _ASM_X86_KVM_HYPERV_H 1#ifndef _ASM_X86_HYPERV_H
2#define _ASM_X86_KVM_HYPERV_H 2#define _ASM_X86_HYPERV_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5 5
@@ -14,6 +14,10 @@
14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
16 16
17#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
18#define HYPERV_CPUID_MIN 0x40000005
19#define HYPERV_CPUID_MAX 0x4000ffff
20
17/* 21/*
18 * Feature identification. EAX indicates which features are available 22 * Feature identification. EAX indicates which features are available
19 * to the partition based upon the current partition privileges. 23 * to the partition based upon the current partition privileges.
@@ -129,6 +133,9 @@
129/* MSR used to provide vcpu index */ 133/* MSR used to provide vcpu index */
130#define HV_X64_MSR_VP_INDEX 0x40000002 134#define HV_X64_MSR_VP_INDEX 0x40000002
131 135
136/* MSR used to read the per-partition time reference counter */
137#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
138
132/* Define the virtual APIC registers */ 139/* Define the virtual APIC registers */
133#define HV_X64_MSR_EOI 0x40000070 140#define HV_X64_MSR_EOI 0x40000070
134#define HV_X64_MSR_ICR 0x40000071 141#define HV_X64_MSR_ICR 0x40000071
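A brief sketch of how the new CPUID range constants might be used to sanity-check the hypervisor leaves. The 0x40000000 vendor/max-function leaf and the cpuid() helper are assumptions not shown in this hunk; only HYPERV_CPUID_MIN and HYPERV_CPUID_MAX come from it.

static bool hyperv_leaves_look_sane(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* leaf 0x40000000 reports the highest hypervisor CPUID leaf in EAX */
        cpuid(0x40000000, &eax, &ebx, &ecx, &edx);

        return eax >= HYPERV_CPUID_MIN && eax <= HYPERV_CPUID_MAX;
}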
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index b78c0941e422..70abda7058c8 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -17,10 +17,33 @@
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 * 18 *
19 */ 19 */
20#ifndef ASM_X86__HYPERVISOR_H 20#ifndef _ASM_X86_HYPERVISOR_H
21#define ASM_X86__HYPERVISOR_H 21#define _ASM_X86_HYPERVISOR_H
22 22
23extern void init_hypervisor(struct cpuinfo_x86 *c); 23extern void init_hypervisor(struct cpuinfo_x86 *c);
24extern void init_hypervisor_platform(void); 24extern void init_hypervisor_platform(void);
25 25
26/*
27 * x86 hypervisor information
28 */
29struct hypervisor_x86 {
30 /* Hypervisor name */
31 const char *name;
32
33 /* Detection routine */
34 bool (*detect)(void);
35
36 /* Adjust CPU feature bits (run once per CPU) */
37 void (*set_cpu_features)(struct cpuinfo_x86 *);
38
39 /* Platform setup (run once per boot) */
40 void (*init_platform)(void);
41};
42
43extern const struct hypervisor_x86 *x86_hyper;
44
45/* Recognized hypervisors */
46extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48
26#endif 49#endif
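To illustrate the new descriptor, a sketch of how a hypothetical guest platform could plug into the x86_hyper machinery; everything named example_* is invented for illustration, only the struct layout comes from the hunk above.

static bool example_detect(void)
{
        /* probe a CPUID leaf, an I/O port, ...; return true when running on it */
        return false;
}

static void example_init_platform(void)
{
        /* one-time platform setup: clocksources, MSRs, ... */
}

static void example_set_cpu_features(struct cpuinfo_x86 *c)
{
        /* per-CPU feature bit adjustments */
}

static const struct hypervisor_x86 x86_hyper_example = {
        .name                   = "Example Hypervisor",
        .detect                 = example_detect,
        .set_cpu_features       = example_set_cpu_features,
        .init_platform          = example_init_platform,
};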
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index da2930924501..c991b3a7b904 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -16,7 +16,9 @@
16#include <linux/kernel_stat.h> 16#include <linux/kernel_stat.h>
17#include <linux/regset.h> 17#include <linux/regset.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/slab.h>
19#include <asm/asm.h> 20#include <asm/asm.h>
21#include <asm/cpufeature.h>
20#include <asm/processor.h> 22#include <asm/processor.h>
21#include <asm/sigcontext.h> 23#include <asm/sigcontext.h>
22#include <asm/user.h> 24#include <asm/user.h>
@@ -56,6 +58,11 @@ extern int restore_i387_xstate_ia32(void __user *buf);
56 58
57#define X87_FSW_ES (1 << 7) /* Exception Summary */ 59#define X87_FSW_ES (1 << 7) /* Exception Summary */
58 60
61static __always_inline __pure bool use_xsave(void)
62{
63 return static_cpu_has(X86_FEATURE_XSAVE);
64}
65
59#ifdef CONFIG_X86_64 66#ifdef CONFIG_X86_64
60 67
61/* Ignore delayed exceptions from user space */ 68/* Ignore delayed exceptions from user space */
@@ -91,15 +98,15 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
91 values. The kernel data segment can be sometimes 0 and sometimes 98 values. The kernel data segment can be sometimes 0 and sometimes
92 new user value. Both should be ok. 99 new user value. Both should be ok.
93 Use the PDA as safe address because it should be already in L1. */ 100 Use the PDA as safe address because it should be already in L1. */
94static inline void clear_fpu_state(struct task_struct *tsk) 101static inline void fpu_clear(struct fpu *fpu)
95{ 102{
96 struct xsave_struct *xstate = &tsk->thread.xstate->xsave; 103 struct xsave_struct *xstate = &fpu->state->xsave;
97 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 104 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
98 105
99 /* 106 /*
100 * xsave header may indicate the init state of the FP. 107 * xsave header may indicate the init state of the FP.
101 */ 108 */
102 if ((task_thread_info(tsk)->status & TS_XSAVE) && 109 if (use_xsave() &&
103 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) 110 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
104 return; 111 return;
105 112
@@ -111,6 +118,11 @@ static inline void clear_fpu_state(struct task_struct *tsk)
111 X86_FEATURE_FXSAVE_LEAK); 118 X86_FEATURE_FXSAVE_LEAK);
112} 119}
113 120
121static inline void clear_fpu_state(struct task_struct *tsk)
122{
123 fpu_clear(&tsk->thread.fpu);
124}
125
114static inline int fxsave_user(struct i387_fxsave_struct __user *fx) 126static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
115{ 127{
116 int err; 128 int err;
@@ -135,7 +147,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
135 return err; 147 return err;
136} 148}
137 149
138static inline void fxsave(struct task_struct *tsk) 150static inline void fpu_fxsave(struct fpu *fpu)
139{ 151{
140 /* Using "rex64; fxsave %0" is broken because, if the memory operand 152 /* Using "rex64; fxsave %0" is broken because, if the memory operand
141 uses any extended registers for addressing, a second REX prefix 153 uses any extended registers for addressing, a second REX prefix
@@ -145,42 +157,45 @@ static inline void fxsave(struct task_struct *tsk)
145 /* Using "fxsaveq %0" would be the ideal choice, but is only supported 157 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
146 starting with gas 2.16. */ 158 starting with gas 2.16. */
147 __asm__ __volatile__("fxsaveq %0" 159 __asm__ __volatile__("fxsaveq %0"
148 : "=m" (tsk->thread.xstate->fxsave)); 160 : "=m" (fpu->state->fxsave));
149#elif 0 161#elif 0
150 /* Using, as a workaround, the properly prefixed form below isn't 162 /* Using, as a workaround, the properly prefixed form below isn't
151 accepted by any binutils version so far released, complaining that 163 accepted by any binutils version so far released, complaining that
152 the same type of prefix is used twice if an extended register is 164 the same type of prefix is used twice if an extended register is
153 needed for addressing (fix submitted to mainline 2005-11-21). */ 165 needed for addressing (fix submitted to mainline 2005-11-21). */
154 __asm__ __volatile__("rex64/fxsave %0" 166 __asm__ __volatile__("rex64/fxsave %0"
155 : "=m" (tsk->thread.xstate->fxsave)); 167 : "=m" (fpu->state->fxsave));
156#else 168#else
157 /* This, however, we can work around by forcing the compiler to select 169 /* This, however, we can work around by forcing the compiler to select
158 an addressing mode that doesn't require extended registers. */ 170 an addressing mode that doesn't require extended registers. */
159 __asm__ __volatile__("rex64/fxsave (%1)" 171 __asm__ __volatile__("rex64/fxsave (%1)"
160 : "=m" (tsk->thread.xstate->fxsave) 172 : "=m" (fpu->state->fxsave)
161 : "cdaSDb" (&tsk->thread.xstate->fxsave)); 173 : "cdaSDb" (&fpu->state->fxsave));
162#endif 174#endif
163} 175}
164 176
165static inline void __save_init_fpu(struct task_struct *tsk) 177static inline void fpu_save_init(struct fpu *fpu)
166{ 178{
167 if (task_thread_info(tsk)->status & TS_XSAVE) 179 if (use_xsave())
168 xsave(tsk); 180 fpu_xsave(fpu);
169 else 181 else
170 fxsave(tsk); 182 fpu_fxsave(fpu);
183
184 fpu_clear(fpu);
185}
171 186
172 clear_fpu_state(tsk); 187static inline void __save_init_fpu(struct task_struct *tsk)
188{
189 fpu_save_init(&tsk->thread.fpu);
173 task_thread_info(tsk)->status &= ~TS_USEDFPU; 190 task_thread_info(tsk)->status &= ~TS_USEDFPU;
174} 191}
175 192
176#else /* CONFIG_X86_32 */ 193#else /* CONFIG_X86_32 */
177 194
178#ifdef CONFIG_MATH_EMULATION 195#ifdef CONFIG_MATH_EMULATION
179extern void finit_task(struct task_struct *tsk); 196extern void finit_soft_fpu(struct i387_soft_struct *soft);
180#else 197#else
181static inline void finit_task(struct task_struct *tsk) 198static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
182{
183}
184#endif 199#endif
185 200
186static inline void tolerant_fwait(void) 201static inline void tolerant_fwait(void)
@@ -216,13 +231,13 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
216/* 231/*
217 * These must be called with preempt disabled 232 * These must be called with preempt disabled
218 */ 233 */
219static inline void __save_init_fpu(struct task_struct *tsk) 234static inline void fpu_save_init(struct fpu *fpu)
220{ 235{
221 if (task_thread_info(tsk)->status & TS_XSAVE) { 236 if (use_xsave()) {
222 struct xsave_struct *xstate = &tsk->thread.xstate->xsave; 237 struct xsave_struct *xstate = &fpu->state->xsave;
223 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 238 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
224 239
225 xsave(tsk); 240 fpu_xsave(fpu);
226 241
227 /* 242 /*
228 * xsave header may indicate the init state of the FP. 243 * xsave header may indicate the init state of the FP.
@@ -246,8 +261,8 @@ static inline void __save_init_fpu(struct task_struct *tsk)
246 "fxsave %[fx]\n" 261 "fxsave %[fx]\n"
247 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", 262 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
248 X86_FEATURE_FXSR, 263 X86_FEATURE_FXSR,
249 [fx] "m" (tsk->thread.xstate->fxsave), 264 [fx] "m" (fpu->state->fxsave),
250 [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory"); 265 [fsw] "m" (fpu->state->fxsave.swd) : "memory");
251clear_state: 266clear_state:
252 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception 267 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
253 is pending. Clear the x87 state here by setting it to fixed 268 is pending. Clear the x87 state here by setting it to fixed
@@ -259,17 +274,34 @@ clear_state:
259 X86_FEATURE_FXSAVE_LEAK, 274 X86_FEATURE_FXSAVE_LEAK,
260 [addr] "m" (safe_address)); 275 [addr] "m" (safe_address));
261end: 276end:
277 ;
278}
279
280static inline void __save_init_fpu(struct task_struct *tsk)
281{
282 fpu_save_init(&tsk->thread.fpu);
262 task_thread_info(tsk)->status &= ~TS_USEDFPU; 283 task_thread_info(tsk)->status &= ~TS_USEDFPU;
263} 284}
264 285
286
265#endif /* CONFIG_X86_64 */ 287#endif /* CONFIG_X86_64 */
266 288
267static inline int restore_fpu_checking(struct task_struct *tsk) 289static inline int fpu_fxrstor_checking(struct fpu *fpu)
268{ 290{
269 if (task_thread_info(tsk)->status & TS_XSAVE) 291 return fxrstor_checking(&fpu->state->fxsave);
270 return xrstor_checking(&tsk->thread.xstate->xsave); 292}
293
294static inline int fpu_restore_checking(struct fpu *fpu)
295{
296 if (use_xsave())
297 return fpu_xrstor_checking(fpu);
271 else 298 else
272 return fxrstor_checking(&tsk->thread.xstate->fxsave); 299 return fpu_fxrstor_checking(fpu);
300}
301
302static inline int restore_fpu_checking(struct task_struct *tsk)
303{
304 return fpu_restore_checking(&tsk->thread.fpu);
273} 305}
274 306
275/* 307/*
@@ -397,30 +429,59 @@ static inline void clear_fpu(struct task_struct *tsk)
397static inline unsigned short get_fpu_cwd(struct task_struct *tsk) 429static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
398{ 430{
399 if (cpu_has_fxsr) { 431 if (cpu_has_fxsr) {
400 return tsk->thread.xstate->fxsave.cwd; 432 return tsk->thread.fpu.state->fxsave.cwd;
401 } else { 433 } else {
402 return (unsigned short)tsk->thread.xstate->fsave.cwd; 434 return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
403 } 435 }
404} 436}
405 437
406static inline unsigned short get_fpu_swd(struct task_struct *tsk) 438static inline unsigned short get_fpu_swd(struct task_struct *tsk)
407{ 439{
408 if (cpu_has_fxsr) { 440 if (cpu_has_fxsr) {
409 return tsk->thread.xstate->fxsave.swd; 441 return tsk->thread.fpu.state->fxsave.swd;
410 } else { 442 } else {
411 return (unsigned short)tsk->thread.xstate->fsave.swd; 443 return (unsigned short)tsk->thread.fpu.state->fsave.swd;
412 } 444 }
413} 445}
414 446
415static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) 447static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
416{ 448{
417 if (cpu_has_xmm) { 449 if (cpu_has_xmm) {
418 return tsk->thread.xstate->fxsave.mxcsr; 450 return tsk->thread.fpu.state->fxsave.mxcsr;
419 } else { 451 } else {
420 return MXCSR_DEFAULT; 452 return MXCSR_DEFAULT;
421 } 453 }
422} 454}
423 455
456static bool fpu_allocated(struct fpu *fpu)
457{
458 return fpu->state != NULL;
459}
460
461static inline int fpu_alloc(struct fpu *fpu)
462{
463 if (fpu_allocated(fpu))
464 return 0;
465 fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
466 if (!fpu->state)
467 return -ENOMEM;
468 WARN_ON((unsigned long)fpu->state & 15);
469 return 0;
470}
471
472static inline void fpu_free(struct fpu *fpu)
473{
474 if (fpu->state) {
475 kmem_cache_free(task_xstate_cachep, fpu->state);
476 fpu->state = NULL;
477 }
478}
479
480static inline void fpu_copy(struct fpu *dst, struct fpu *src)
481{
482 memcpy(dst->state, src->state, xstate_size);
483}
484
424#endif /* __ASSEMBLY__ */ 485#endif /* __ASSEMBLY__ */
425 486
426#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 487#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
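Since thread_struct now carries a struct fpu instead of a bare xstate pointer, state management goes through the helpers added above. A minimal sketch of duplicating FPU state between two fpu objects; the calling context and the decision to skip unallocated sources are assumptions.

static int copy_fpu_state(struct fpu *dst, struct fpu *src)
{
        int err;

        err = fpu_alloc(dst);           /* no-op if dst is already allocated */
        if (err)
                return err;             /* -ENOMEM */

        if (fpu_allocated(src))
                fpu_copy(dst, src);     /* memcpy of xstate_size bytes */

        return 0;
}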
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 1edbf89680fd..fc1f579fb965 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,7 +6,7 @@
6#define PIT_CH0 0x40 6#define PIT_CH0 0x40
7#define PIT_CH2 0x42 7#define PIT_CH2 0x42
8 8
9extern spinlock_t i8253_lock; 9extern raw_spinlock_t i8253_lock;
10 10
11extern struct clock_event_device *global_clock_event; 11extern struct clock_event_device *global_clock_event;
12 12
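With i8253_lock converted to a raw_spinlock_t, users must switch to the raw_spin_lock_* API. A sketch of reading the channel 0 count under the lock; the 0x43 command-port access is standard 8253 programming, not something defined in this header.

static u16 pit_read_ch0_count(void)
{
        unsigned long flags;
        u16 count;

        raw_spin_lock_irqsave(&i8253_lock, flags);
        outb_p(0x00, 0x43);             /* latch channel 0 count */
        count  = inb_p(PIT_CH0);        /* LSB */
        count |= inb_p(PIT_CH0) << 8;   /* MSB */
        raw_spin_unlock_irqrestore(&i8253_lock, flags);

        return count;
}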
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 96c2e0ad04ca..88c765e16410 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -68,6 +68,8 @@ struct insn {
68 const insn_byte_t *next_byte; 68 const insn_byte_t *next_byte;
69}; 69};
70 70
71#define MAX_INSN_SIZE 16
72
71#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) 73#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
72#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) 74#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
73#define X86_MODRM_RM(modrm) ((modrm) & 0x07) 75#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
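MAX_INSN_SIZE now lives next to the decoder types it belongs to, alongside the ModRM field helpers. A tiny, purely illustrative use of the field macros:

static void decode_modrm_example(void)
{
        u8 modrm = 0xd1;                        /* binary 11 010 001 */

        int mod = X86_MODRM_MOD(modrm);         /* 3: register-direct */
        int reg = X86_MODRM_REG(modrm);         /* 2 */
        int rm  = X86_MODRM_RM(modrm);          /* 1 */

        (void)mod; (void)reg; (void)rm;
}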
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 35832a03a515..63cb4096c3dc 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,6 @@ struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 159extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 160 struct io_apic_irq_attr *irq_attr);
161void setup_IO_APIC_irq_extra(u32 gsi); 161void setup_IO_APIC_irq_extra(u32 gsi);
162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
163extern void ioapic_init_mappings(void); 162extern void ioapic_init_mappings(void);
164extern void ioapic_insert_resources(void); 163extern void ioapic_insert_resources(void);
165 164
@@ -180,12 +179,13 @@ extern void ioapic_write_entry(int apic, int pin,
180extern void setup_ioapic_ids_from_mpc(void); 179extern void setup_ioapic_ids_from_mpc(void);
181 180
182struct mp_ioapic_gsi{ 181struct mp_ioapic_gsi{
183 int gsi_base; 182 u32 gsi_base;
184 int gsi_end; 183 u32 gsi_end;
185}; 184};
186extern struct mp_ioapic_gsi mp_gsi_routing[]; 185extern struct mp_ioapic_gsi mp_gsi_routing[];
187int mp_find_ioapic(int gsi); 186extern u32 gsi_end;
188int mp_find_ioapic_pin(int ioapic, int gsi); 187int mp_find_ioapic(u32 gsi);
188int mp_find_ioapic_pin(int ioapic, u32 gsi);
189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
190extern void __init pre_init_apic_IRQ0(void); 190extern void __init pre_init_apic_IRQ0(void);
191 191
@@ -197,7 +197,8 @@ static const int timer_through_8259 = 0;
197static inline void ioapic_init_mappings(void) { } 197static inline void ioapic_init_mappings(void) { }
198static inline void ioapic_insert_resources(void) { } 198static inline void ioapic_insert_resources(void) { }
199static inline void probe_nr_irqs_gsi(void) { } 199static inline void probe_nr_irqs_gsi(void) { }
200static inline int mp_find_ioapic(int gsi) { return 0; } 200#define gsi_end (NR_IRQS_LEGACY - 1)
201static inline int mp_find_ioapic(u32 gsi) { return 0; }
201 202
202struct io_apic_irq_attr; 203struct io_apic_irq_attr;
203static inline int io_apic_set_pci_routing(struct device *dev, int irq, 204static inline int io_apic_set_pci_routing(struct device *dev, int irq,
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index f70e60071fe8..af00bd1d2089 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void); 16extern int k8_scan_nodes(void);
17 17
18#ifdef CONFIG_K8_NB 18#ifdef CONFIG_K8_NB
19extern int num_k8_northbridges;
20
19static inline struct pci_dev *node_to_k8_nb_misc(int node) 21static inline struct pci_dev *node_to_k8_nb_misc(int node)
20{ 22{
21 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; 23 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
22} 24}
25
23#else 26#else
27#define num_k8_northbridges 0
28
24static inline struct pci_dev *node_to_k8_nb_misc(int node) 29static inline struct pci_dev *node_to_k8_nb_misc(int node)
25{ 30{
26 return NULL; 31 return NULL;
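With num_k8_northbridges exported from the header (and defined to 0 when CONFIG_K8_NB is off), per-node iteration over the northbridge devices can be written once and compiles away in the !K8_NB case; a sketch:

static void poke_all_k8_northbridges(void)
{
        int node;

        for (node = 0; node < num_k8_northbridges; node++) {
                struct pci_dev *misc = node_to_k8_nb_misc(node);

                if (!misc)
                        continue;
                /* e.g. pci_read_config_dword(misc, ...) per node */
        }
}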
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4ffa345a8ccb..547882539157 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -24,6 +24,7 @@
24#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/ptrace.h> 25#include <linux/ptrace.h>
26#include <linux/percpu.h> 26#include <linux/percpu.h>
27#include <asm/insn.h>
27 28
28#define __ARCH_WANT_KPROBES_INSN_SLOT 29#define __ARCH_WANT_KPROBES_INSN_SLOT
29 30
@@ -36,7 +37,6 @@ typedef u8 kprobe_opcode_t;
36#define RELATIVEJUMP_SIZE 5 37#define RELATIVEJUMP_SIZE 5
37#define RELATIVECALL_OPCODE 0xe8 38#define RELATIVECALL_OPCODE 0xe8
38#define RELATIVE_ADDR_SIZE 4 39#define RELATIVE_ADDR_SIZE 4
39#define MAX_INSN_SIZE 16
40#define MAX_STACK_SIZE 64 40#define MAX_STACK_SIZE 64
41#define MIN_STACK_SIZE(ADDR) \ 41#define MIN_STACK_SIZE(ADDR) \
42 (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \ 42 (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index d8bf23a88d05..c82868e9f905 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -105,16 +105,6 @@ extern void mp_config_acpi_legacy_irqs(void);
105struct device; 105struct device;
106extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, 106extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
107 int active_high_low); 107 int active_high_low);
108extern int acpi_probe_gsi(void);
109#ifdef CONFIG_X86_IO_APIC
110extern int mp_find_ioapic(int gsi);
111extern int mp_find_ioapic_pin(int ioapic, int gsi);
112#endif
113#else /* !CONFIG_ACPI: */
114static inline int acpi_probe_gsi(void)
115{
116 return 0;
117}
118#endif /* CONFIG_ACPI */ 108#endif /* CONFIG_ACPI */
119 109
120#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) 110#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 000000000000..79ce5685ab64
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_MSHYPER_H
2#define _ASM_X86_MSHYPER_H
3
4#include <linux/types.h>
5#include <asm/hyperv.h>
6
7struct ms_hyperv_info {
8 u32 features;
9 u32 hints;
10};
11
12extern struct ms_hyperv_info ms_hyperv;
13
14#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4604e6a54d36..bc473acfa7f9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -71,11 +71,14 @@
71#define MSR_IA32_LASTINTTOIP 0x000001de 71#define MSR_IA32_LASTINTTOIP 0x000001de
72 72
73/* DEBUGCTLMSR bits (others vary by model): */ 73/* DEBUGCTLMSR bits (others vary by model): */
74#define _DEBUGCTLMSR_LBR 0 /* last branch recording */ 74#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
75#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */ 75#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
76 76#define DEBUGCTLMSR_TR (1UL << 6)
77#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR) 77#define DEBUGCTLMSR_BTS (1UL << 7)
78#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF) 78#define DEBUGCTLMSR_BTINT (1UL << 8)
79#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
80#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
81#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
79 82
80#define MSR_IA32_MC0_CTL 0x00000400 83#define MSR_IA32_MC0_CTL 0x00000400
81#define MSR_IA32_MC0_STATUS 0x00000401 84#define MSR_IA32_MC0_STATUS 0x00000401
@@ -359,6 +362,8 @@
359#define MSR_P4_U2L_ESCR0 0x000003b0 362#define MSR_P4_U2L_ESCR0 0x000003b0
360#define MSR_P4_U2L_ESCR1 0x000003b1 363#define MSR_P4_U2L_ESCR1 0x000003b1
361 364
365#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
366
362/* Intel Core-based CPU performance counters */ 367/* Intel Core-based CPU performance counters */
363#define MSR_CORE_PERF_FIXED_CTR0 0x00000309 368#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
364#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a 369#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
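The BTS-related DEBUGCTL bits are now spelled out as plain masks. A hedged sketch of enabling branch trace store on the current CPU using the get_debugctlmsr()/update_debugctlmsr() helpers that already exist in processor.h; whether the BTS_OFF_OS/BTS_OFF_USR filters should also be set depends on the use case and is left out.

static void enable_bts_on_this_cpu(void)
{
        unsigned long debugctl = get_debugctlmsr();

        debugctl |= DEBUGCTLMSR_TR |    /* emit branch trace messages   */
                    DEBUGCTLMSR_BTS |   /* ...into the BTS buffer       */
                    DEBUGCTLMSR_BTINT;  /* interrupt when the buffer fills */

        update_debugctlmsr(debugctl);
}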
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272dfd8b8..0ec6d12d84e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -190,6 +190,29 @@ do { \
190 pfo_ret__; \ 190 pfo_ret__; \
191}) 191})
192 192
193#define percpu_unary_op(op, var) \
194({ \
195 switch (sizeof(var)) { \
196 case 1: \
197 asm(op "b "__percpu_arg(0) \
198 : "+m" (var)); \
199 break; \
200 case 2: \
201 asm(op "w "__percpu_arg(0) \
202 : "+m" (var)); \
203 break; \
204 case 4: \
205 asm(op "l "__percpu_arg(0) \
206 : "+m" (var)); \
207 break; \
208 case 8: \
209 asm(op "q "__percpu_arg(0) \
210 : "+m" (var)); \
211 break; \
212 default: __bad_percpu_size(); \
213 } \
214})
215
193/* 216/*
194 * percpu_read() makes gcc load the percpu variable every time it is 217 * percpu_read() makes gcc load the percpu variable every time it is
195 * accessed while percpu_read_stable() allows the value to be cached. 218 * accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do { \
207#define percpu_and(var, val) percpu_to_op("and", var, val) 230#define percpu_and(var, val) percpu_to_op("and", var, val)
208#define percpu_or(var, val) percpu_to_op("or", var, val) 231#define percpu_or(var, val) percpu_to_op("or", var, val)
209#define percpu_xor(var, val) percpu_to_op("xor", var, val) 232#define percpu_xor(var, val) percpu_to_op("xor", var, val)
233#define percpu_inc(var) percpu_unary_op("inc", var)
210 234
211#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 235#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
212#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 236#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
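percpu_unary_op() selects the operand-size suffix from sizeof(var), so percpu_inc() emits a single inc instruction on the per-cpu address; the hardirq.h hunk earlier in this patch is its first user. The same pattern on a hypothetical per-cpu counter:

DEFINE_PER_CPU(unsigned int, example_events);

static void count_example_event(void)
{
        /*
         * One inc instruction on the gs-relative address: cheaper than a
         * generic add of 1 and, being a single instruction, it cannot be
         * split by an interrupt on this CPU.
         */
        percpu_inc(example_events);
}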
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index db6109a885a7..254883d0c7e0 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,7 +5,7 @@
5 * Performance event hw details: 5 * Performance event hw details:
6 */ 6 */
7 7
8#define X86_PMC_MAX_GENERIC 8 8#define X86_PMC_MAX_GENERIC 32
9#define X86_PMC_MAX_FIXED 3 9#define X86_PMC_MAX_FIXED 3
10 10
11#define X86_PMC_IDX_GENERIC 0 11#define X86_PMC_IDX_GENERIC 0
@@ -18,39 +18,31 @@
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20 20
21#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22) 21#define ARCH_PERFMON_EVENTSEL_EVENT 0x000000FFULL
22#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) 22#define ARCH_PERFMON_EVENTSEL_UMASK 0x0000FF00ULL
23#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) 23#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16)
24#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) 24#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
25#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) 25#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18)
26 26#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20)
27/* 27#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21)
28 * Includes eventsel and unit mask as well: 28#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
29 */ 29#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
30 30#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
31 31
32#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL 32#define AMD64_EVENTSEL_EVENT \
33#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL 33 (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
34#define INTEL_ARCH_EDGE_MASK 0x00040000ULL 34#define INTEL_ARCH_EVENT_MASK \
35#define INTEL_ARCH_INV_MASK 0x00800000ULL 35 (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
36#define INTEL_ARCH_CNT_MASK 0xFF000000ULL 36
37#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK) 37#define X86_RAW_EVENT_MASK \
38 38 (ARCH_PERFMON_EVENTSEL_EVENT | \
39/* 39 ARCH_PERFMON_EVENTSEL_UMASK | \
40 * filter mask to validate fixed counter events. 40 ARCH_PERFMON_EVENTSEL_EDGE | \
41 * the following filters disqualify for fixed counters: 41 ARCH_PERFMON_EVENTSEL_INV | \
42 * - inv 42 ARCH_PERFMON_EVENTSEL_CMASK)
43 * - edge 43#define AMD64_RAW_EVENT_MASK \
44 * - cnt-mask 44 (X86_RAW_EVENT_MASK | \
45 * The other filters are supported by fixed counters. 45 AMD64_EVENTSEL_EVENT)
46 * The any-thread option is supported starting with v3.
47 */
48#define INTEL_ARCH_FIXED_MASK \
49 (INTEL_ARCH_CNT_MASK| \
50 INTEL_ARCH_INV_MASK| \
51 INTEL_ARCH_EDGE_MASK|\
52 INTEL_ARCH_UNIT_MASK|\
53 INTEL_ARCH_EVENT_MASK)
54 46
55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 47#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
56#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 48#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -67,7 +59,7 @@
67union cpuid10_eax { 59union cpuid10_eax {
68 struct { 60 struct {
69 unsigned int version_id:8; 61 unsigned int version_id:8;
70 unsigned int num_events:8; 62 unsigned int num_counters:8;
71 unsigned int bit_width:8; 63 unsigned int bit_width:8;
72 unsigned int mask_length:8; 64 unsigned int mask_length:8;
73 } split; 65 } split;
@@ -76,7 +68,7 @@ union cpuid10_eax {
76 68
77union cpuid10_edx { 69union cpuid10_edx {
78 struct { 70 struct {
79 unsigned int num_events_fixed:4; 71 unsigned int num_counters_fixed:4;
80 unsigned int reserved:28; 72 unsigned int reserved:28;
81 } split; 73 } split;
82 unsigned int full; 74 unsigned int full;
@@ -136,6 +128,18 @@ extern void perf_events_lapic_init(void);
136 128
137#define PERF_EVENT_INDEX_OFFSET 0 129#define PERF_EVENT_INDEX_OFFSET 0
138 130
131/*
132 * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
133 * This flag is otherwise unused and ABI specified to be 0, so nobody should
134 * care what we do with it.
135 */
136#define PERF_EFLAGS_EXACT (1UL << 3)
137
138struct pt_regs;
139extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
140extern unsigned long perf_misc_flags(struct pt_regs *regs);
141#define perf_misc_flags(regs) perf_misc_flags(regs)
142
139#else 143#else
140static inline void init_hw_perf_events(void) { } 144static inline void init_hw_perf_events(void) { }
141static inline void perf_events_lapic_init(void) { } 145static inline void perf_events_lapic_init(void) { }
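A sketch of how the regrouped ARCH_PERFMON_EVENTSEL_* masks compose: one helper builds the architectural unhalted-core-cycles event-select value, the other shows what X86_RAW_EVENT_MASK is for. Both are illustrations, not code from this patch.

static u64 cycles_eventsel(void)
{
        return ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL   |        /* event 0x3c */
               ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK |        /* umask 0x00 */
               ARCH_PERFMON_EVENTSEL_USR |                      /* ring 3 */
               ARCH_PERFMON_EVENTSEL_OS  |                      /* ring 0 */
               ARCH_PERFMON_EVENTSEL_ENABLE;
}

static u64 sanitize_raw_config(u64 config)
{
        /* keep event, umask, edge, inv and cmask; no enable/int bits */
        return config & X86_RAW_EVENT_MASK;
}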
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 000000000000..b05400a542ff
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,794 @@
1/*
2 * NetBurst Performance Events (P4, old Xeon)
3 */
4
5#ifndef PERF_EVENT_P4_H
6#define PERF_EVENT_P4_H
7
8#include <linux/cpu.h>
9#include <linux/bitops.h>
10
11/*
12 * NetBurst has performance MSRs shared between
13 * threads if HT is turned on, i.e. for both logical
14 * processors (note: on Atom with HT support the
15 * perf-MSRs are not shared and every thread has its
16 * own perf-MSRs set)
17 */
18#define ARCH_P4_TOTAL_ESCR (46)
19#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18)
22#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2)
23
24#define P4_ESCR_EVENT_MASK 0x7e000000U
25#define P4_ESCR_EVENT_SHIFT 25
26#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
27#define P4_ESCR_EVENTMASK_SHIFT 9
28#define P4_ESCR_TAG_MASK 0x000001e0U
29#define P4_ESCR_TAG_SHIFT 5
30#define P4_ESCR_TAG_ENABLE 0x00000010U
31#define P4_ESCR_T0_OS 0x00000008U
32#define P4_ESCR_T0_USR 0x00000004U
33#define P4_ESCR_T1_OS 0x00000002U
34#define P4_ESCR_T1_USR 0x00000001U
35
36#define P4_ESCR_EVENT(v) ((v) << P4_ESCR_EVENT_SHIFT)
37#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
38#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
39
40/* Non HT mask */
41#define P4_ESCR_MASK \
42 (P4_ESCR_EVENT_MASK | \
43 P4_ESCR_EVENTMASK_MASK | \
44 P4_ESCR_TAG_MASK | \
45 P4_ESCR_TAG_ENABLE | \
46 P4_ESCR_T0_OS | \
47 P4_ESCR_T0_USR)
48
49/* HT mask */
50#define P4_ESCR_MASK_HT \
51 (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
52
53#define P4_CCCR_OVF 0x80000000U
54#define P4_CCCR_CASCADE 0x40000000U
55#define P4_CCCR_OVF_PMI_T0 0x04000000U
56#define P4_CCCR_OVF_PMI_T1 0x08000000U
57#define P4_CCCR_FORCE_OVF 0x02000000U
58#define P4_CCCR_EDGE 0x01000000U
59#define P4_CCCR_THRESHOLD_MASK 0x00f00000U
60#define P4_CCCR_THRESHOLD_SHIFT 20
61#define P4_CCCR_COMPLEMENT 0x00080000U
62#define P4_CCCR_COMPARE 0x00040000U
63#define P4_CCCR_ESCR_SELECT_MASK 0x0000e000U
64#define P4_CCCR_ESCR_SELECT_SHIFT 13
65#define P4_CCCR_ENABLE 0x00001000U
66#define P4_CCCR_THREAD_SINGLE 0x00010000U
67#define P4_CCCR_THREAD_BOTH 0x00020000U
68#define P4_CCCR_THREAD_ANY 0x00030000U
69#define P4_CCCR_RESERVED 0x00000fffU
70
71#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
72#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
73
74/* Custom bits in reserved CCCR area */
75#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU
76
77
78/* Non HT mask */
79#define P4_CCCR_MASK \
80 (P4_CCCR_OVF | \
81 P4_CCCR_CASCADE | \
82 P4_CCCR_OVF_PMI_T0 | \
83 P4_CCCR_FORCE_OVF | \
84 P4_CCCR_EDGE | \
85 P4_CCCR_THRESHOLD_MASK | \
86 P4_CCCR_COMPLEMENT | \
87 P4_CCCR_COMPARE | \
88 P4_CCCR_ESCR_SELECT_MASK | \
89 P4_CCCR_ENABLE)
90
91/* HT mask */
92#define P4_CCCR_MASK_HT (P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
93
94#define P4_GEN_ESCR_EMASK(class, name, bit) \
95 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
96#define P4_ESCR_EMASK_BIT(class, name) class##__##name
97
98/*
99 * The config field is 64 bits wide and consists of
100 * HT << 63 | ESCR << 32 | CCCR
101 * where HT is the HyperThreading bit (ESCR has this
102 * bit reserved, so we may use it for our own purpose).
103 *
104 * Note that the config does NOT hold the addresses of the
105 * respective ESCR and CCCR registers; it is a packed value
106 * that has to be unpacked and written to the proper addresses.
107 *
108 * The basic idea is to pack as much info as
109 * possible.
110 */
111#define p4_config_pack_escr(v) (((u64)(v)) << 32)
112#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
113#define p4_config_unpack_escr(v) (((u64)(v)) >> 32)
114#define p4_config_unpack_cccr(v) (((u64)(v)) & 0xffffffffULL)
115
116#define p4_config_unpack_emask(v) \
117 ({ \
118 u32 t = p4_config_unpack_escr((v)); \
119 t = t & P4_ESCR_EVENTMASK_MASK; \
120 t = t >> P4_ESCR_EVENTMASK_SHIFT; \
121 t; \
122 })
123
124#define p4_config_unpack_event(v) \
125 ({ \
126 u32 t = p4_config_unpack_escr((v)); \
127 t = t & P4_ESCR_EVENT_MASK; \
128 t = t >> P4_ESCR_EVENT_SHIFT; \
129 t; \
130 })
131
132#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
133
134#define P4_CONFIG_HT_SHIFT 63
135#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
136
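Following the packing scheme described in the comment above, a sketch of assembling one config value from an ESCR and a CCCR image. The event code, event-mask bit and ESCR select are taken from the INSTR_RETIRED entries in the tables further down; a real setup involves more than this (HT thread handling, OVF_PMI routing, and so on).

static u64 example_p4_config(void)
{
        u32 escr, cccr;

        escr = P4_ESCR_EVENT(0x02)      |       /* INSTR_RETIRED event code */
               P4_ESCR_EMASK(0x01)      |       /* NBOGUSNTAG (bit 0)       */
               P4_ESCR_T0_OS | P4_ESCR_T0_USR;

        cccr = P4_CCCR_ESEL(0x04)       |       /* ESCR select from the opcode */
               P4_CCCR_ENABLE;

        return p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
}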
137static inline bool p4_is_event_cascaded(u64 config)
138{
139 u32 cccr = p4_config_unpack_cccr(config);
140 return !!(cccr & P4_CCCR_CASCADE);
141}
142
143static inline int p4_ht_config_thread(u64 config)
144{
145 return !!(config & P4_CONFIG_HT);
146}
147
148static inline u64 p4_set_ht_bit(u64 config)
149{
150 return config | P4_CONFIG_HT;
151}
152
153static inline u64 p4_clear_ht_bit(u64 config)
154{
155 return config & ~P4_CONFIG_HT;
156}
157
158static inline int p4_ht_active(void)
159{
160#ifdef CONFIG_SMP
161 return smp_num_siblings > 1;
162#endif
163 return 0;
164}
165
166static inline int p4_ht_thread(int cpu)
167{
168#ifdef CONFIG_SMP
169 if (smp_num_siblings == 2)
170 return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
171#endif
172 return 0;
173}
174
175static inline int p4_should_swap_ts(u64 config, int cpu)
176{
177 return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
178}
179
180static inline u32 p4_default_cccr_conf(int cpu)
181{
182 /*
183 * Note that P4_CCCR_THREAD_ANY is "required" on
184 * non-HT machines (on HT machines we count TS events
185 * regardless of the state of the second logical processor).
186 */
187 u32 cccr = P4_CCCR_THREAD_ANY;
188
189 if (!p4_ht_thread(cpu))
190 cccr |= P4_CCCR_OVF_PMI_T0;
191 else
192 cccr |= P4_CCCR_OVF_PMI_T1;
193
194 return cccr;
195}
196
197static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
198{
199 u32 escr = 0;
200
201 if (!p4_ht_thread(cpu)) {
202 if (!exclude_os)
203 escr |= P4_ESCR_T0_OS;
204 if (!exclude_usr)
205 escr |= P4_ESCR_T0_USR;
206 } else {
207 if (!exclude_os)
208 escr |= P4_ESCR_T1_OS;
209 if (!exclude_usr)
210 escr |= P4_ESCR_T1_USR;
211 }
212
213 return escr;
214}
215
216enum P4_EVENTS {
217 P4_EVENT_TC_DELIVER_MODE,
218 P4_EVENT_BPU_FETCH_REQUEST,
219 P4_EVENT_ITLB_REFERENCE,
220 P4_EVENT_MEMORY_CANCEL,
221 P4_EVENT_MEMORY_COMPLETE,
222 P4_EVENT_LOAD_PORT_REPLAY,
223 P4_EVENT_STORE_PORT_REPLAY,
224 P4_EVENT_MOB_LOAD_REPLAY,
225 P4_EVENT_PAGE_WALK_TYPE,
226 P4_EVENT_BSQ_CACHE_REFERENCE,
227 P4_EVENT_IOQ_ALLOCATION,
228 P4_EVENT_IOQ_ACTIVE_ENTRIES,
229 P4_EVENT_FSB_DATA_ACTIVITY,
230 P4_EVENT_BSQ_ALLOCATION,
231 P4_EVENT_BSQ_ACTIVE_ENTRIES,
232 P4_EVENT_SSE_INPUT_ASSIST,
233 P4_EVENT_PACKED_SP_UOP,
234 P4_EVENT_PACKED_DP_UOP,
235 P4_EVENT_SCALAR_SP_UOP,
236 P4_EVENT_SCALAR_DP_UOP,
237 P4_EVENT_64BIT_MMX_UOP,
238 P4_EVENT_128BIT_MMX_UOP,
239 P4_EVENT_X87_FP_UOP,
240 P4_EVENT_TC_MISC,
241 P4_EVENT_GLOBAL_POWER_EVENTS,
242 P4_EVENT_TC_MS_XFER,
243 P4_EVENT_UOP_QUEUE_WRITES,
244 P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
245 P4_EVENT_RETIRED_BRANCH_TYPE,
246 P4_EVENT_RESOURCE_STALL,
247 P4_EVENT_WC_BUFFER,
248 P4_EVENT_B2B_CYCLES,
249 P4_EVENT_BNR,
250 P4_EVENT_SNOOP,
251 P4_EVENT_RESPONSE,
252 P4_EVENT_FRONT_END_EVENT,
253 P4_EVENT_EXECUTION_EVENT,
254 P4_EVENT_REPLAY_EVENT,
255 P4_EVENT_INSTR_RETIRED,
256 P4_EVENT_UOPS_RETIRED,
257 P4_EVENT_UOP_TYPE,
258 P4_EVENT_BRANCH_RETIRED,
259 P4_EVENT_MISPRED_BRANCH_RETIRED,
260 P4_EVENT_X87_ASSIST,
261 P4_EVENT_MACHINE_CLEAR,
262 P4_EVENT_INSTR_COMPLETED,
263};
264
265#define P4_OPCODE(event) event##_OPCODE
266#define P4_OPCODE_ESEL(opcode) ((opcode & 0x00ff) >> 0)
267#define P4_OPCODE_EVNT(opcode) ((opcode & 0xff00) >> 8)
268#define P4_OPCODE_PACK(event, sel) (((event) << 8) | sel)
269
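The opcode helpers pack an event code together with its ESCR select and split them back apart; for instance, with the first entry of the table below:

static void p4_opcode_example(void)
{
        unsigned int opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE);     /* 0x0101 */
        unsigned int event  = P4_OPCODE_EVNT(opcode);                  /* 0x01   */
        unsigned int esel   = P4_OPCODE_ESEL(opcode);                  /* 0x01   */

        (void)event;
        (void)esel;
}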
270/*
271 * The comments below each event describe the ESCR restrictions
272 * for that event and the counter indices available per ESCR.
273 *
274 * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
275 * processor builds (family 0FH, models 01H-02H). These MSRs
276 * are not available on later versions, so we don't use
277 * them at all.
278 *
279 * Also note that CCCR1 does not have the P4_CCCR_ENABLE bit
280 * working properly, so we should not use this CCCR or its
281 * counter as a result.
282 */
283enum P4_EVENT_OPCODES {
284 P4_OPCODE(P4_EVENT_TC_DELIVER_MODE) = P4_OPCODE_PACK(0x01, 0x01),
285 /*
286 * MSR_P4_TC_ESCR0: 4, 5
287 * MSR_P4_TC_ESCR1: 6, 7
288 */
289
290 P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST) = P4_OPCODE_PACK(0x03, 0x00),
291 /*
292 * MSR_P4_BPU_ESCR0: 0, 1
293 * MSR_P4_BPU_ESCR1: 2, 3
294 */
295
296 P4_OPCODE(P4_EVENT_ITLB_REFERENCE) = P4_OPCODE_PACK(0x18, 0x03),
297 /*
298 * MSR_P4_ITLB_ESCR0: 0, 1
299 * MSR_P4_ITLB_ESCR1: 2, 3
300 */
301
302 P4_OPCODE(P4_EVENT_MEMORY_CANCEL) = P4_OPCODE_PACK(0x02, 0x05),
303 /*
304 * MSR_P4_DAC_ESCR0: 8, 9
305 * MSR_P4_DAC_ESCR1: 10, 11
306 */
307
308 P4_OPCODE(P4_EVENT_MEMORY_COMPLETE) = P4_OPCODE_PACK(0x08, 0x02),
309 /*
310 * MSR_P4_SAAT_ESCR0: 8, 9
311 * MSR_P4_SAAT_ESCR1: 10, 11
312 */
313
314 P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY) = P4_OPCODE_PACK(0x04, 0x02),
315 /*
316 * MSR_P4_SAAT_ESCR0: 8, 9
317 * MSR_P4_SAAT_ESCR1: 10, 11
318 */
319
320 P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY) = P4_OPCODE_PACK(0x05, 0x02),
321 /*
322 * MSR_P4_SAAT_ESCR0: 8, 9
323 * MSR_P4_SAAT_ESCR1: 10, 11
324 */
325
326 P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY) = P4_OPCODE_PACK(0x03, 0x02),
327 /*
328 * MSR_P4_MOB_ESCR0: 0, 1
329 * MSR_P4_MOB_ESCR1: 2, 3
330 */
331
332 P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE) = P4_OPCODE_PACK(0x01, 0x04),
333 /*
334 * MSR_P4_PMH_ESCR0: 0, 1
335 * MSR_P4_PMH_ESCR1: 2, 3
336 */
337
338 P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE) = P4_OPCODE_PACK(0x0c, 0x07),
339 /*
340 * MSR_P4_BSU_ESCR0: 0, 1
341 * MSR_P4_BSU_ESCR1: 2, 3
342 */
343
344 P4_OPCODE(P4_EVENT_IOQ_ALLOCATION) = P4_OPCODE_PACK(0x03, 0x06),
345 /*
346 * MSR_P4_FSB_ESCR0: 0, 1
347 * MSR_P4_FSB_ESCR1: 2, 3
348 */
349
350 P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x1a, 0x06),
351 /*
352 * MSR_P4_FSB_ESCR1: 2, 3
353 */
354
355 P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY) = P4_OPCODE_PACK(0x17, 0x06),
356 /*
357 * MSR_P4_FSB_ESCR0: 0, 1
358 * MSR_P4_FSB_ESCR1: 2, 3
359 */
360
361 P4_OPCODE(P4_EVENT_BSQ_ALLOCATION) = P4_OPCODE_PACK(0x05, 0x07),
362 /*
363 * MSR_P4_BSU_ESCR0: 0, 1
364 */
365
366 P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x06, 0x07),
367 /*
368 * NOTE: no ESCR name in docs, it's guessed
369 * MSR_P4_BSU_ESCR1: 2, 3
370 */
371
372 P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST) = P4_OPCODE_PACK(0x34, 0x01),
373 /*
374 * MSR_P4_FIRM_ESCR0: 8, 9
375 * MSR_P4_FIRM_ESCR1: 10, 11
376 */
377
378 P4_OPCODE(P4_EVENT_PACKED_SP_UOP) = P4_OPCODE_PACK(0x08, 0x01),
379 /*
380 * MSR_P4_FIRM_ESCR0: 8, 9
381 * MSR_P4_FIRM_ESCR1: 10, 11
382 */
383
384 P4_OPCODE(P4_EVENT_PACKED_DP_UOP) = P4_OPCODE_PACK(0x0c, 0x01),
385 /*
386 * MSR_P4_FIRM_ESCR0: 8, 9
387 * MSR_P4_FIRM_ESCR1: 10, 11
388 */
389
390 P4_OPCODE(P4_EVENT_SCALAR_SP_UOP) = P4_OPCODE_PACK(0x0a, 0x01),
391 /*
392 * MSR_P4_FIRM_ESCR0: 8, 9
393 * MSR_P4_FIRM_ESCR1: 10, 11
394 */
395
396 P4_OPCODE(P4_EVENT_SCALAR_DP_UOP) = P4_OPCODE_PACK(0x0e, 0x01),
397 /*
398 * MSR_P4_FIRM_ESCR0: 8, 9
399 * MSR_P4_FIRM_ESCR1: 10, 11
400 */
401
402 P4_OPCODE(P4_EVENT_64BIT_MMX_UOP) = P4_OPCODE_PACK(0x02, 0x01),
403 /*
404 * MSR_P4_FIRM_ESCR0: 8, 9
405 * MSR_P4_FIRM_ESCR1: 10, 11
406 */
407
408 P4_OPCODE(P4_EVENT_128BIT_MMX_UOP) = P4_OPCODE_PACK(0x1a, 0x01),
409 /*
410 * MSR_P4_FIRM_ESCR0: 8, 9
411 * MSR_P4_FIRM_ESCR1: 10, 11
412 */
413
414 P4_OPCODE(P4_EVENT_X87_FP_UOP) = P4_OPCODE_PACK(0x04, 0x01),
415 /*
416 * MSR_P4_FIRM_ESCR0: 8, 9
417 * MSR_P4_FIRM_ESCR1: 10, 11
418 */
419
420 P4_OPCODE(P4_EVENT_TC_MISC) = P4_OPCODE_PACK(0x06, 0x01),
421 /*
422 * MSR_P4_TC_ESCR0: 4, 5
423 * MSR_P4_TC_ESCR1: 6, 7
424 */
425
426 P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS) = P4_OPCODE_PACK(0x13, 0x06),
427 /*
428 * MSR_P4_FSB_ESCR0: 0, 1
429 * MSR_P4_FSB_ESCR1: 2, 3
430 */
431
432 P4_OPCODE(P4_EVENT_TC_MS_XFER) = P4_OPCODE_PACK(0x05, 0x00),
433 /*
434 * MSR_P4_MS_ESCR0: 4, 5
435 * MSR_P4_MS_ESCR1: 6, 7
436 */
437
438 P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES) = P4_OPCODE_PACK(0x09, 0x00),
439 /*
440 * MSR_P4_MS_ESCR0: 4, 5
441 * MSR_P4_MS_ESCR1: 6, 7
442 */
443
444 P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x05, 0x02),
445 /*
446 * MSR_P4_TBPU_ESCR0: 4, 5
447 * MSR_P4_TBPU_ESCR1: 6, 7
448 */
449
450 P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x04, 0x02),
451 /*
452 * MSR_P4_TBPU_ESCR0: 4, 5
453 * MSR_P4_TBPU_ESCR1: 6, 7
454 */
455
456 P4_OPCODE(P4_EVENT_RESOURCE_STALL) = P4_OPCODE_PACK(0x01, 0x01),
457 /*
458 * MSR_P4_ALF_ESCR0: 12, 13, 16
459 * MSR_P4_ALF_ESCR1: 14, 15, 17
460 */
461
462 P4_OPCODE(P4_EVENT_WC_BUFFER) = P4_OPCODE_PACK(0x05, 0x05),
463 /*
464 * MSR_P4_DAC_ESCR0: 8, 9
465 * MSR_P4_DAC_ESCR1: 10, 11
466 */
467
468 P4_OPCODE(P4_EVENT_B2B_CYCLES) = P4_OPCODE_PACK(0x16, 0x03),
469 /*
470 * MSR_P4_FSB_ESCR0: 0, 1
471 * MSR_P4_FSB_ESCR1: 2, 3
472 */
473
474 P4_OPCODE(P4_EVENT_BNR) = P4_OPCODE_PACK(0x08, 0x03),
475 /*
476 * MSR_P4_FSB_ESCR0: 0, 1
477 * MSR_P4_FSB_ESCR1: 2, 3
478 */
479
480 P4_OPCODE(P4_EVENT_SNOOP) = P4_OPCODE_PACK(0x06, 0x03),
481 /*
482 * MSR_P4_FSB_ESCR0: 0, 1
483 * MSR_P4_FSB_ESCR1: 2, 3
484 */
485
486 P4_OPCODE(P4_EVENT_RESPONSE) = P4_OPCODE_PACK(0x04, 0x03),
487 /*
488 * MSR_P4_FSB_ESCR0: 0, 1
489 * MSR_P4_FSB_ESCR1: 2, 3
490 */
491
492 P4_OPCODE(P4_EVENT_FRONT_END_EVENT) = P4_OPCODE_PACK(0x08, 0x05),
493 /*
494 * MSR_P4_CRU_ESCR2: 12, 13, 16
495 * MSR_P4_CRU_ESCR3: 14, 15, 17
496 */
497
498 P4_OPCODE(P4_EVENT_EXECUTION_EVENT) = P4_OPCODE_PACK(0x0c, 0x05),
499 /*
500 * MSR_P4_CRU_ESCR2: 12, 13, 16
501 * MSR_P4_CRU_ESCR3: 14, 15, 17
502 */
503
504 P4_OPCODE(P4_EVENT_REPLAY_EVENT) = P4_OPCODE_PACK(0x09, 0x05),
505 /*
506 * MSR_P4_CRU_ESCR2: 12, 13, 16
507 * MSR_P4_CRU_ESCR3: 14, 15, 17
508 */
509
510 P4_OPCODE(P4_EVENT_INSTR_RETIRED) = P4_OPCODE_PACK(0x02, 0x04),
511 /*
512 * MSR_P4_CRU_ESCR0: 12, 13, 16
513 * MSR_P4_CRU_ESCR1: 14, 15, 17
514 */
515
516 P4_OPCODE(P4_EVENT_UOPS_RETIRED) = P4_OPCODE_PACK(0x01, 0x04),
517 /*
518 * MSR_P4_CRU_ESCR0: 12, 13, 16
519 * MSR_P4_CRU_ESCR1: 14, 15, 17
520 */
521
522 P4_OPCODE(P4_EVENT_UOP_TYPE) = P4_OPCODE_PACK(0x02, 0x02),
523 /*
524 * MSR_P4_RAT_ESCR0: 12, 13, 16
525 * MSR_P4_RAT_ESCR1: 14, 15, 17
526 */
527
528 P4_OPCODE(P4_EVENT_BRANCH_RETIRED) = P4_OPCODE_PACK(0x06, 0x05),
529 /*
530 * MSR_P4_CRU_ESCR2: 12, 13, 16
531 * MSR_P4_CRU_ESCR3: 14, 15, 17
532 */
533
534 P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED) = P4_OPCODE_PACK(0x03, 0x04),
535 /*
536 * MSR_P4_CRU_ESCR0: 12, 13, 16
537 * MSR_P4_CRU_ESCR1: 14, 15, 17
538 */
539
540 P4_OPCODE(P4_EVENT_X87_ASSIST) = P4_OPCODE_PACK(0x03, 0x05),
541 /*
542 * MSR_P4_CRU_ESCR2: 12, 13, 16
543 * MSR_P4_CRU_ESCR3: 14, 15, 17
544 */
545
546 P4_OPCODE(P4_EVENT_MACHINE_CLEAR) = P4_OPCODE_PACK(0x02, 0x05),
547 /*
548 * MSR_P4_CRU_ESCR2: 12, 13, 16
549 * MSR_P4_CRU_ESCR3: 14, 15, 17
550 */
551
552 P4_OPCODE(P4_EVENT_INSTR_COMPLETED) = P4_OPCODE_PACK(0x07, 0x04),
553 /*
554 * MSR_P4_CRU_ESCR0: 12, 13, 16
555 * MSR_P4_CRU_ESCR1: 14, 15, 17
556 */
557};
558
559/*
560 * a caller should use P4_ESCR_EMASK_NAME helper to
561 * pick the EventMask needed, for example
562 *
563 * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
564 */
565enum P4_ESCR_EMASKS {
566 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
567 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
568 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
569 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
570 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
571 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
572 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
573
574 P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
575
576 P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
577 P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
578 P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
579
580 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
581 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
582
583 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
584 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
585
586 P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
587
588 P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
589
590 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
591 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
592 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
593 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
594
595 P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
596 P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
597
598 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
599 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
600 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
601 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
602 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
603 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
604 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
605 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
606 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
607
608 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
609 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
610 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
611 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
612 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
613 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
614 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
615 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
616 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
617 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
618 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
619
620 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
621 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
622 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
623 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
624 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
625 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
626 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
627 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
628 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
629 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
630 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
631
632 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
633 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
634 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
635 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
636 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
637 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
638
639 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
640 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
641 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
642 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
643 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
644 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
645 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
646 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
647 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
648 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
649 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
650 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
651 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
652
653 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
654 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
655 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
656 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
657 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
658 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
659 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
660 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
661 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
662 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
663 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
664 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
665 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
666
667 P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
668
669 P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
670
671 P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
672
673 P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
674
675 P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
676
677 P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
678
679 P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
680
681 P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
682
683 P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
684
685 P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
686
687 P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
688
689 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
690 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
691 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
692
693 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
694 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
695 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
696 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
697
698 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
699 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
700 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
701 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
702
703 P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
704
705 P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
706 P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
707
708 P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
709 P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
710
711 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
712 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
713 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
714 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
715 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
716 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
717 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
718 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
719
720 P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
721 P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
722
723 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
724 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
725 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
726 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
727
728 P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
729 P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
730
731 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
732 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
733
734 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
735 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
736 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
737 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
738
739 P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
740
741 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
742 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
743 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
744 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
745 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
746
747 P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
748 P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
749 P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
750
751 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
752 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
753};
754
755/* P4 PEBS: stale for a while */
756#define P4_PEBS_METRIC_MASK 0x00001fffU
757#define P4_PEBS_UOB_TAG 0x01000000U
758#define P4_PEBS_ENABLE 0x02000000U
759
760/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
761#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001
762#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002
763#define P4_PEBS__dtlb_load_miss_retired 0x3000004
764#define P4_PEBS__dtlb_store_miss_retired 0x3000004
765#define P4_PEBS__dtlb_all_miss_retired 0x3000004
766#define P4_PEBS__tagged_mispred_branch 0x3018000
767#define P4_PEBS__mob_load_replay_retired 0x3000200
768#define P4_PEBS__split_load_retired 0x3000400
769#define P4_PEBS__split_store_retired 0x3000400
770
771#define P4_VERT__1stl_cache_load_miss_retired 0x0000001
772#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001
773#define P4_VERT__dtlb_load_miss_retired 0x0000001
774#define P4_VERT__dtlb_store_miss_retired 0x0000002
775#define P4_VERT__dtlb_all_miss_retired 0x0000003
776#define P4_VERT__tagged_mispred_branch 0x0000010
777#define P4_VERT__mob_load_replay_retired 0x0000001
778#define P4_VERT__split_load_retired 0x0000001
779#define P4_VERT__split_store_retired 0x0000002
780
781enum P4_CACHE_EVENTS {
782 P4_CACHE__NONE,
783
784 P4_CACHE__1stl_cache_load_miss_retired,
785 P4_CACHE__2ndl_cache_load_miss_retired,
786 P4_CACHE__dtlb_load_miss_retired,
787 P4_CACHE__dtlb_store_miss_retired,
788 P4_CACHE__itlb_reference_hit,
789 P4_CACHE__itlb_reference_miss,
790
791 P4_CACHE__MAX
792};
793
794#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703a..5a51379dcbe4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,7 +21,6 @@ struct mm_struct;
21#include <asm/msr.h> 21#include <asm/msr.h>
22#include <asm/desc_defs.h> 22#include <asm/desc_defs.h>
23#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/ds.h>
25 24
26#include <linux/personality.h> 25#include <linux/personality.h>
27#include <linux/cpumask.h> 26#include <linux/cpumask.h>
@@ -29,6 +28,7 @@ struct mm_struct;
29#include <linux/threads.h> 28#include <linux/threads.h>
30#include <linux/math64.h> 29#include <linux/math64.h>
31#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/err.h>
32 32
33#define HBP_NUM 4 33#define HBP_NUM 4
34/* 34/*
@@ -113,7 +113,6 @@ struct cpuinfo_x86 {
113 /* Index into per_cpu list: */ 113 /* Index into per_cpu list: */
114 u16 cpu_index; 114 u16 cpu_index;
115#endif 115#endif
116 unsigned int x86_hyper_vendor;
117} __attribute__((__aligned__(SMP_CACHE_BYTES))); 116} __attribute__((__aligned__(SMP_CACHE_BYTES)));
118 117
119#define X86_VENDOR_INTEL 0 118#define X86_VENDOR_INTEL 0
@@ -127,9 +126,6 @@ struct cpuinfo_x86 {
127 126
128#define X86_VENDOR_UNKNOWN 0xff 127#define X86_VENDOR_UNKNOWN 0xff
129 128
130#define X86_HYPER_VENDOR_NONE 0
131#define X86_HYPER_VENDOR_VMWARE 1
132
133/* 129/*
134 * capabilities of CPUs 130 * capabilities of CPUs
135 */ 131 */
@@ -380,6 +376,10 @@ union thread_xstate {
380 struct xsave_struct xsave; 376 struct xsave_struct xsave;
381}; 377};
382 378
379struct fpu {
380 union thread_xstate *state;
381};
382
383#ifdef CONFIG_X86_64 383#ifdef CONFIG_X86_64
384DECLARE_PER_CPU(struct orig_ist, orig_ist); 384DECLARE_PER_CPU(struct orig_ist, orig_ist);
385 385
@@ -457,7 +457,7 @@ struct thread_struct {
457 unsigned long trap_no; 457 unsigned long trap_no;
458 unsigned long error_code; 458 unsigned long error_code;
459 /* floating point and extended processor state */ 459 /* floating point and extended processor state */
460 union thread_xstate *xstate; 460 struct fpu fpu;
461#ifdef CONFIG_X86_32 461#ifdef CONFIG_X86_32
462 /* Virtual 86 mode info */ 462 /* Virtual 86 mode info */
463 struct vm86_struct __user *vm86_info; 463 struct vm86_struct __user *vm86_info;
@@ -473,10 +473,6 @@ struct thread_struct {
473 unsigned long iopl; 473 unsigned long iopl;
474 /* Max allowed port in the bitmap, in bytes: */ 474 /* Max allowed port in the bitmap, in bytes: */
475 unsigned io_bitmap_max; 475 unsigned io_bitmap_max;
476/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
477 unsigned long debugctlmsr;
478 /* Debug Store context; see asm/ds.h */
479 struct ds_context *ds_ctx;
480}; 476};
481 477
482static inline unsigned long native_get_debugreg(int regno) 478static inline unsigned long native_get_debugreg(int regno)
@@ -803,7 +799,7 @@ extern void cpu_init(void);
803 799
804static inline unsigned long get_debugctlmsr(void) 800static inline unsigned long get_debugctlmsr(void)
805{ 801{
806 unsigned long debugctlmsr = 0; 802 unsigned long debugctlmsr = 0;
807 803
808#ifndef CONFIG_X86_DEBUGCTLMSR 804#ifndef CONFIG_X86_DEBUGCTLMSR
809 if (boot_cpu_data.x86 < 6) 805 if (boot_cpu_data.x86 < 6)
@@ -811,21 +807,6 @@ static inline unsigned long get_debugctlmsr(void)
811#endif 807#endif
812 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); 808 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
813 809
814 return debugctlmsr;
815}
816
817static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
818{
819 u64 debugctlmsr = 0;
820 u32 val1, val2;
821
822#ifndef CONFIG_X86_DEBUGCTLMSR
823 if (boot_cpu_data.x86 < 6)
824 return 0;
825#endif
826 rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
827 debugctlmsr = val1 | ((u64)val2 << 32);
828
829 return debugctlmsr; 810 return debugctlmsr;
830} 811}
831 812
@@ -838,18 +819,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
838 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); 819 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
839} 820}
840 821
841static inline void update_debugctlmsr_on_cpu(int cpu,
842 unsigned long debugctlmsr)
843{
844#ifndef CONFIG_X86_DEBUGCTLMSR
845 if (boot_cpu_data.x86 < 6)
846 return;
847#endif
848 wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
849 (u32)((u64)debugctlmsr),
850 (u32)((u64)debugctlmsr >> 32));
851}
852
853/* 822/*
854 * from system description table in BIOS. Mostly for MCA use, but 823 * from system description table in BIOS. Mostly for MCA use, but
855 * others may find it useful: 824 * others may find it useful:
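
A minimal standalone sketch of the indirection change above, not kernel code: thread_struct used to carry "union thread_xstate *xstate" directly and now carries "struct fpu fpu" whose ->state member holds that pointer. The stub types are simplified stand-ins.

#include <stdio.h>

union thread_xstate { int dummy; };

struct fpu {
	union thread_xstate *state;
};

struct thread_struct {
	struct fpu fpu;			/* was: union thread_xstate *xstate; */
};

static union thread_xstate *task_xstate(struct thread_struct *t)
{
	return t->fpu.state;		/* was: return t->xstate; */
}

int main(void)
{
	union thread_xstate xs = { 0 };
	struct thread_struct thread = { .fpu = { .state = &xs } };

	printf("xstate at %p\n", (void *)task_xstate(&thread));
	return 0;
}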
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 86723035a515..52b098a6eebb 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -82,61 +82,6 @@
82 82
83#ifndef __ASSEMBLY__ 83#ifndef __ASSEMBLY__
84#include <linux/types.h> 84#include <linux/types.h>
85 85#endif
86/* configuration/status structure used in PTRACE_BTS_CONFIG and
87 PTRACE_BTS_STATUS commands.
88*/
89struct ptrace_bts_config {
90 /* requested or actual size of BTS buffer in bytes */
91 __u32 size;
92 /* bitmask of below flags */
93 __u32 flags;
94 /* buffer overflow signal */
95 __u32 signal;
96 /* actual size of bts_struct in bytes */
97 __u32 bts_size;
98};
99#endif /* __ASSEMBLY__ */
100
101#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
102#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
103#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
104 instead of wrapping around */
105#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
106
107#define PTRACE_BTS_CONFIG 40
108/* Configure branch trace recording.
109 ADDR points to a struct ptrace_bts_config.
110 DATA gives the size of that buffer.
111 A new buffer is allocated, if requested in the flags.
112 An overflow signal may only be requested for new buffers.
113 Returns the number of bytes read.
114*/
115#define PTRACE_BTS_STATUS 41
116/* Return the current configuration in a struct ptrace_bts_config
117 pointed to by ADDR; DATA gives the size of that buffer.
118 Returns the number of bytes written.
119*/
120#define PTRACE_BTS_SIZE 42
121/* Return the number of available BTS records for draining.
122 DATA and ADDR are ignored.
123*/
124#define PTRACE_BTS_GET 43
125/* Get a single BTS record.
126 DATA defines the index into the BTS array, where 0 is the newest
127 entry, and higher indices refer to older entries.
128 ADDR is pointing to struct bts_struct (see asm/ds.h).
129*/
130#define PTRACE_BTS_CLEAR 44
131/* Clear the BTS buffer.
132 DATA and ADDR are ignored.
133*/
134#define PTRACE_BTS_DRAIN 45
135/* Read all available BTS records and clear the buffer.
136 ADDR points to an array of struct bts_struct.
137 DATA gives the size of that buffer.
138 BTS records are read from oldest to newest.
139 Returns number of BTS records drained.
140*/
141 86
142#endif /* _ASM_X86_PTRACE_ABI_H */ 87#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 69a686a7dff0..78cd1ea94500 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -289,12 +289,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
289extern int do_set_thread_area(struct task_struct *p, int idx, 289extern int do_set_thread_area(struct task_struct *p, int idx,
290 struct user_desc __user *info, int can_allocate); 290 struct user_desc __user *info, int can_allocate);
291 291
292#ifdef CONFIG_X86_PTRACE_BTS
293extern void ptrace_bts_untrace(struct task_struct *tsk);
294
295#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
296#endif /* CONFIG_X86_PTRACE_BTS */
297
298#endif /* __KERNEL__ */ 292#endif /* __KERNEL__ */
299 293
300#endif /* !__ASSEMBLY__ */ 294#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e969..d4092fac226b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,8 +92,7 @@ struct thread_info {
92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
93#define TIF_FREEZE 23 /* is freezing for suspend */ 93#define TIF_FREEZE 23 /* is freezing for suspend */
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ 96#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ 97#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
99 98
@@ -115,8 +114,7 @@ struct thread_info {
115#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 114#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
116#define _TIF_FREEZE (1 << TIF_FREEZE) 115#define _TIF_FREEZE (1 << TIF_FREEZE)
117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 117#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) 118#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
121#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) 119#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
122 120
@@ -147,7 +145,7 @@ struct thread_info {
147 145
148/* flags to check in __switch_to() */ 146/* flags to check in __switch_to() */
149#define _TIF_WORK_CTXSW \ 147#define _TIF_WORK_CTXSW \
150 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 148 (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
151 149
152#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) 150#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
153#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 151#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
@@ -244,7 +242,6 @@ static inline struct thread_info *current_thread_info(void)
244#define TS_POLLING 0x0004 /* true if in idle loop 242#define TS_POLLING 0x0004 /* true if in idle loop
245 and not sleeping */ 243 and not sleeping */
246#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ 244#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
247#define TS_XSAVE 0x0010 /* Use xsave/xrstor */
248 245
249#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) 246#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
250 247
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4da91ad69e0d..f66cda56781d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -79,7 +79,7 @@ static inline int get_si_code(unsigned long condition)
79 79
80extern int panic_on_unrecovered_nmi; 80extern int panic_on_unrecovered_nmi;
81 81
82void math_error(void __user *); 82void math_error(struct pt_regs *, int, int);
83void math_emulate(struct math_emu_info *); 83void math_emulate(struct math_emu_info *);
84#ifndef CONFIG_X86_32 84#ifndef CONFIG_X86_32
85asmlinkage void smp_thermal_interrupt(void); 85asmlinkage void smp_thermal_interrupt(void);
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
deleted file mode 100644
index e49ed6d2fd4e..000000000000
--- a/arch/x86/include/asm/vmware.h
+++ /dev/null
@@ -1,27 +0,0 @@
1/*
2 * Copyright (C) 2008, VMware, Inc.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for more
13 * details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20#ifndef ASM_X86__VMWARE_H
21#define ASM_X86__VMWARE_H
22
23extern void vmware_platform_setup(void);
24extern int vmware_platform(void);
25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
26
27#endif
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index ddc04ccad03b..2c4390cae228 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -37,8 +37,9 @@ extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
37 void __user *fpstate, 37 void __user *fpstate,
38 struct _fpx_sw_bytes *sw); 38 struct _fpx_sw_bytes *sw);
39 39
40static inline int xrstor_checking(struct xsave_struct *fx) 40static inline int fpu_xrstor_checking(struct fpu *fpu)
41{ 41{
42 struct xsave_struct *fx = &fpu->state->xsave;
42 int err; 43 int err;
43 44
44 asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" 45 asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
@@ -110,12 +111,12 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
110 : "memory"); 111 : "memory");
111} 112}
112 113
113static inline void xsave(struct task_struct *tsk) 114static inline void fpu_xsave(struct fpu *fpu)
114{ 115{
115 /* This, however, we can work around by forcing the compiler to select 116 /* This, however, we can work around by forcing the compiler to select
116 an addressing mode that doesn't require extended registers. */ 117 an addressing mode that doesn't require extended registers. */
117 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" 118 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
118 : : "D" (&(tsk->thread.xstate->xsave)), 119 : : "D" (&(fpu->state->xsave)),
119 "a" (-1), "d"(-1) : "memory"); 120 "a" (-1), "d"(-1) : "memory");
120} 121}
121#endif 122#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..e77b22083721 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
47obj-y += process.o 47obj-y += process.o
48obj-y += i387.o xsave.o 48obj-y += i387.o xsave.o
49obj-y += ptrace.o 49obj-y += ptrace.o
50obj-$(CONFIG_X86_DS) += ds.o
51obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 50obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 51obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 52obj-y += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..9a5ed58f09dc 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -94,6 +94,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
94 94
95 95
96/* 96/*
97 * ISA irqs by default are the first 16 gsis but can be
98 * any gsi as specified by an interrupt source override.
99 */
100static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
101 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
102};
103
104static unsigned int gsi_to_irq(unsigned int gsi)
105{
106 unsigned int irq = gsi + NR_IRQS_LEGACY;
107 unsigned int i;
108
109 for (i = 0; i < NR_IRQS_LEGACY; i++) {
110 if (isa_irq_to_gsi[i] == gsi) {
111 return i;
112 }
113 }
114
115 /* Provide an identity mapping of gsi == irq
116 * except on truly weird platforms that have
117 * non isa irqs in the first 16 gsis.
118 */
119 if (gsi >= NR_IRQS_LEGACY)
120 irq = gsi;
121 else
122 irq = gsi_end + 1 + gsi;
123
124 return irq;
125}
126
127static u32 irq_to_gsi(int irq)
128{
129 unsigned int gsi;
130
131 if (irq < NR_IRQS_LEGACY)
132 gsi = isa_irq_to_gsi[irq];
133 else if (irq <= gsi_end)
134 gsi = irq;
135 else if (irq <= (gsi_end + NR_IRQS_LEGACY))
136 gsi = irq - gsi_end;
137 else
138 gsi = 0xffffffff;
139
140 return gsi;
141}
142
143/*
97 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, 144 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
98 * to map the target physical address. The problem is that set_fixmap() 145 * to map the target physical address. The problem is that set_fixmap()
99 * provides a single page, and it is possible that the page is not 146 * provides a single page, and it is possible that the page is not
@@ -313,7 +360,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
313/* 360/*
314 * Parse Interrupt Source Override for the ACPI SCI 361 * Parse Interrupt Source Override for the ACPI SCI
315 */ 362 */
316static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) 363static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
317{ 364{
318 if (trigger == 0) /* compatible SCI trigger is level */ 365 if (trigger == 0) /* compatible SCI trigger is level */
319 trigger = 3; 366 trigger = 3;
@@ -333,7 +380,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
333 * If GSI is < 16, this will update its flags, 380 * If GSI is < 16, this will update its flags,
334 * else it will create a new mp_irqs[] entry. 381 * else it will create a new mp_irqs[] entry.
335 */ 382 */
336 mp_override_legacy_irq(gsi, polarity, trigger, gsi); 383 mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
337 384
338 /* 385 /*
339 * stash over-ride to indicate we've been here 386 * stash over-ride to indicate we've been here
@@ -357,9 +404,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
357 acpi_table_print_madt_entry(header); 404 acpi_table_print_madt_entry(header);
358 405
359 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { 406 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
360 acpi_sci_ioapic_setup(intsrc->global_irq, 407 acpi_sci_ioapic_setup(intsrc->source_irq,
361 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, 408 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
362 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2); 409 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
410 intsrc->global_irq);
363 return 0; 411 return 0;
364 } 412 }
365 413
@@ -448,7 +496,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
448 496
449int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) 497int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
450{ 498{
451 *irq = gsi; 499 *irq = gsi_to_irq(gsi);
452 500
453#ifdef CONFIG_X86_IO_APIC 501#ifdef CONFIG_X86_IO_APIC
454 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) 502 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -458,6 +506,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
458 return 0; 506 return 0;
459} 507}
460 508
509int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
510{
511 if (isa_irq >= 16)
512 return -1;
513 *gsi = irq_to_gsi(isa_irq);
514 return 0;
515}
516
461/* 517/*
462 * success: return IRQ number (>=0) 518 * success: return IRQ number (>=0)
463 * failure: return < 0 519 * failure: return < 0
@@ -482,7 +538,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
482 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); 538 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
483 } 539 }
484#endif 540#endif
485 irq = plat_gsi; 541 irq = gsi_to_irq(plat_gsi);
486 542
487 return irq; 543 return irq;
488} 544}
@@ -867,29 +923,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
867extern int es7000_plat; 923extern int es7000_plat;
868#endif 924#endif
869 925
870int __init acpi_probe_gsi(void)
871{
872 int idx;
873 int gsi;
874 int max_gsi = 0;
875
876 if (acpi_disabled)
877 return 0;
878
879 if (!acpi_ioapic)
880 return 0;
881
882 max_gsi = 0;
883 for (idx = 0; idx < nr_ioapics; idx++) {
884 gsi = mp_gsi_routing[idx].gsi_end;
885
886 if (gsi > max_gsi)
887 max_gsi = gsi;
888 }
889
890 return max_gsi + 1;
891}
892
893static void assign_to_mp_irq(struct mpc_intsrc *m, 926static void assign_to_mp_irq(struct mpc_intsrc *m,
894 struct mpc_intsrc *mp_irq) 927 struct mpc_intsrc *mp_irq)
895{ 928{
@@ -947,13 +980,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
947 mp_irq.dstirq = pin; /* INTIN# */ 980 mp_irq.dstirq = pin; /* INTIN# */
948 981
949 save_mp_irq(&mp_irq); 982 save_mp_irq(&mp_irq);
983
984 isa_irq_to_gsi[bus_irq] = gsi;
950} 985}
951 986
952void __init mp_config_acpi_legacy_irqs(void) 987void __init mp_config_acpi_legacy_irqs(void)
953{ 988{
954 int i; 989 int i;
955 int ioapic;
956 unsigned int dstapic;
957 struct mpc_intsrc mp_irq; 990 struct mpc_intsrc mp_irq;
958 991
959#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 992#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -974,19 +1007,27 @@ void __init mp_config_acpi_legacy_irqs(void)
974#endif 1007#endif
975 1008
976 /* 1009 /*
977 * Locate the IOAPIC that manages the ISA IRQs (0-15).
978 */
979 ioapic = mp_find_ioapic(0);
980 if (ioapic < 0)
981 return;
982 dstapic = mp_ioapics[ioapic].apicid;
983
984 /*
985 * Use the default configuration for the IRQs 0-15. Unless 1010 * Use the default configuration for the IRQs 0-15. Unless
986 * overridden by (MADT) interrupt source override entries. 1011 * overridden by (MADT) interrupt source override entries.
987 */ 1012 */
988 for (i = 0; i < 16; i++) { 1013 for (i = 0; i < 16; i++) {
1014 int ioapic, pin;
1015 unsigned int dstapic;
989 int idx; 1016 int idx;
1017 u32 gsi;
1018
1019 /* Locate the gsi that irq i maps to. */
1020 if (acpi_isa_irq_to_gsi(i, &gsi))
1021 continue;
1022
1023 /*
1024 * Locate the IOAPIC that manages the ISA IRQ.
1025 */
1026 ioapic = mp_find_ioapic(gsi);
1027 if (ioapic < 0)
1028 continue;
1029 pin = mp_find_ioapic_pin(ioapic, gsi);
1030 dstapic = mp_ioapics[ioapic].apicid;
990 1031
991 for (idx = 0; idx < mp_irq_entries; idx++) { 1032 for (idx = 0; idx < mp_irq_entries; idx++) {
992 struct mpc_intsrc *irq = mp_irqs + idx; 1033 struct mpc_intsrc *irq = mp_irqs + idx;
@@ -996,7 +1037,7 @@ void __init mp_config_acpi_legacy_irqs(void)
996 break; 1037 break;
997 1038
998 /* Do we already have a mapping for this IOAPIC pin */ 1039 /* Do we already have a mapping for this IOAPIC pin */
999 if (irq->dstapic == dstapic && irq->dstirq == i) 1040 if (irq->dstapic == dstapic && irq->dstirq == pin)
1000 break; 1041 break;
1001 } 1042 }
1002 1043
@@ -1011,7 +1052,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1011 mp_irq.dstapic = dstapic; 1052 mp_irq.dstapic = dstapic;
1012 mp_irq.irqtype = mp_INT; 1053 mp_irq.irqtype = mp_INT;
1013 mp_irq.srcbusirq = i; /* Identity mapped */ 1054 mp_irq.srcbusirq = i; /* Identity mapped */
1014 mp_irq.dstirq = i; 1055 mp_irq.dstirq = pin;
1015 1056
1016 save_mp_irq(&mp_irq); 1057 save_mp_irq(&mp_irq);
1017 } 1058 }
@@ -1076,11 +1117,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1076 1117
1077 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); 1118 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
1078 1119
1079#ifdef CONFIG_X86_32
1080 if (ioapic_renumber_irq)
1081 gsi = ioapic_renumber_irq(ioapic, gsi);
1082#endif
1083
1084 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1120 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1085 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1121 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1086 "%d-%d\n", mp_ioapics[ioapic].apicid, 1122 "%d-%d\n", mp_ioapics[ioapic].apicid,
@@ -1094,7 +1130,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1094 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, 1130 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1095 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, 1131 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1096 polarity == ACPI_ACTIVE_HIGH ? 0 : 1); 1132 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1097 io_apic_set_pci_routing(dev, gsi, &irq_attr); 1133 io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
1098 1134
1099 return gsi; 1135 return gsi;
1100} 1136}
@@ -1154,7 +1190,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1154 * pretend we got one so we can set the SCI flags. 1190 * pretend we got one so we can set the SCI flags.
1155 */ 1191 */
1156 if (!acpi_sci_override_gsi) 1192 if (!acpi_sci_override_gsi)
1157 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1193 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
1194 acpi_gbl_FADT.sci_interrupt);
1158 1195
1159 /* Fill in identity legacy mappings where no override */ 1196 /* Fill in identity legacy mappings where no override */
1160 mp_config_acpi_legacy_irqs(); 1197 mp_config_acpi_legacy_irqs();
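
Standalone sketch of the isa_irq_to_gsi mapping introduced above (the table, gsi_end and NR_IRQS_LEGACY are mocked here; this is not the kernel code itself). It shows how one interrupt source override, e.g. ISA IRQ 0 routed to GSI 2, affects the gsi_to_irq() lookup.

#include <stdio.h>

#define NR_IRQS_LEGACY	16
static unsigned int gsi_end = 23;		/* one 24-pin IO-APIC */

static unsigned int isa_irq_to_gsi[NR_IRQS_LEGACY] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};

static unsigned int gsi_to_irq(unsigned int gsi)
{
	unsigned int i;

	for (i = 0; i < NR_IRQS_LEGACY; i++)
		if (isa_irq_to_gsi[i] == gsi)
			return i;

	/* identity map above the legacy range, shift the rest past gsi_end */
	return gsi >= NR_IRQS_LEGACY ? gsi : gsi_end + 1 + gsi;
}

int main(void)
{
	isa_irq_to_gsi[0] = 2;				/* typical timer override */
	printf("gsi 2 -> irq %u\n", gsi_to_irq(2));	/* 0           */
	printf("gsi 9 -> irq %u\n", gsi_to_irq(9));	/* 9           */
	printf("gsi 0 -> irq %u\n", gsi_to_irq(0));	/* gsi_end + 1 */
	return 0;
}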
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..70237732a6c7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
194} 194}
195 195
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 196extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern u8 *__smp_locks[], *__smp_locks_end[]; 197extern s32 __smp_locks[], __smp_locks_end[];
198static void *text_poke_early(void *addr, const void *opcode, size_t len); 198static void *text_poke_early(void *addr, const void *opcode, size_t len);
199 199
200/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
235 235
236#ifdef CONFIG_SMP 236#ifdef CONFIG_SMP
237 237
238static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) 238static void alternatives_smp_lock(const s32 *start, const s32 *end,
239 u8 *text, u8 *text_end)
239{ 240{
240 u8 **ptr; 241 const s32 *poff;
241 242
242 mutex_lock(&text_mutex); 243 mutex_lock(&text_mutex);
243 for (ptr = start; ptr < end; ptr++) { 244 for (poff = start; poff < end; poff++) {
244 if (*ptr < text) 245 u8 *ptr = (u8 *)poff + *poff;
245 continue; 246
246 if (*ptr > text_end) 247 if (!*poff || ptr < text || ptr >= text_end)
247 continue; 248 continue;
248 /* turn DS segment override prefix into lock prefix */ 249 /* turn DS segment override prefix into lock prefix */
249 text_poke(*ptr, ((unsigned char []){0xf0}), 1); 250 if (*ptr == 0x3e)
251 text_poke(ptr, ((unsigned char []){0xf0}), 1);
250 }; 252 };
251 mutex_unlock(&text_mutex); 253 mutex_unlock(&text_mutex);
252} 254}
253 255
254static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 256static void alternatives_smp_unlock(const s32 *start, const s32 *end,
257 u8 *text, u8 *text_end)
255{ 258{
256 u8 **ptr; 259 const s32 *poff;
257 260
258 if (noreplace_smp) 261 if (noreplace_smp)
259 return; 262 return;
260 263
261 mutex_lock(&text_mutex); 264 mutex_lock(&text_mutex);
262 for (ptr = start; ptr < end; ptr++) { 265 for (poff = start; poff < end; poff++) {
263 if (*ptr < text) 266 u8 *ptr = (u8 *)poff + *poff;
264 continue; 267
265 if (*ptr > text_end) 268 if (!*poff || ptr < text || ptr >= text_end)
266 continue; 269 continue;
267 /* turn lock prefix into DS segment override prefix */ 270 /* turn lock prefix into DS segment override prefix */
268 text_poke(*ptr, ((unsigned char []){0x3E}), 1); 271 if (*ptr == 0xf0)
272 text_poke(ptr, ((unsigned char []){0x3E}), 1);
269 }; 273 };
270 mutex_unlock(&text_mutex); 274 mutex_unlock(&text_mutex);
271} 275}
@@ -276,8 +280,8 @@ struct smp_alt_module {
276 char *name; 280 char *name;
277 281
278 /* ptrs to lock prefixes */ 282 /* ptrs to lock prefixes */
279 u8 **locks; 283 const s32 *locks;
280 u8 **locks_end; 284 const s32 *locks_end;
281 285
282 /* .text segment, needed to avoid patching init code ;) */ 286 /* .text segment, needed to avoid patching init code ;) */
283 u8 *text; 287 u8 *text;
@@ -398,16 +402,19 @@ void alternatives_smp_switch(int smp)
398int alternatives_text_reserved(void *start, void *end) 402int alternatives_text_reserved(void *start, void *end)
399{ 403{
400 struct smp_alt_module *mod; 404 struct smp_alt_module *mod;
401 u8 **ptr; 405 const s32 *poff;
402 u8 *text_start = start; 406 u8 *text_start = start;
403 u8 *text_end = end; 407 u8 *text_end = end;
404 408
405 list_for_each_entry(mod, &smp_alt_modules, next) { 409 list_for_each_entry(mod, &smp_alt_modules, next) {
406 if (mod->text > text_end || mod->text_end < text_start) 410 if (mod->text > text_end || mod->text_end < text_start)
407 continue; 411 continue;
408 for (ptr = mod->locks; ptr < mod->locks_end; ptr++) 412 for (poff = mod->locks; poff < mod->locks_end; poff++) {
409 if (text_start <= *ptr && text_end >= *ptr) 413 const u8 *ptr = (const u8 *)poff + *poff;
414
415 if (text_start <= ptr && text_end > ptr)
410 return 1; 416 return 1;
417 }
411 } 418 }
412 419
413 return 0; 420 return 0;
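
Toy demonstration, not the kernel code, of the representation change above: the .smp_locks section now stores a signed 32-bit offset from the entry's own location to the lock prefix instead of an absolute pointer, and the consumer recovers the address as (u8 *)poff + *poff. One buffer below plays both ".text" (front) and ".smp_locks" (back).

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static unsigned char image[128];

int main(void)
{
	unsigned char *lock = &image[10];	 /* pretend a lock prefix lives here */
	int32_t *entry = (int32_t *)&image[64];	 /* one .smp_locks slot              */
	int32_t off = (int32_t)(lock - (unsigned char *)entry);

	memcpy(entry, &off, sizeof(off));	 /* what the build emits             */

	/* what alternatives_smp_lock() now does to get the address back */
	unsigned char *ptr = (unsigned char *)entry + *entry;

	printf("recovered %s\n", ptr == lock ? "correct address" : "mismatch");
	return 0;
}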
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f854d89b7edf..fa5a1474cd18 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain,
731 731
732static u64 *alloc_pte(struct protection_domain *domain, 732static u64 *alloc_pte(struct protection_domain *domain,
733 unsigned long address, 733 unsigned long address,
734 int end_lvl, 734 unsigned long page_size,
735 u64 **pte_page, 735 u64 **pte_page,
736 gfp_t gfp) 736 gfp_t gfp)
737{ 737{
738 int level, end_lvl;
738 u64 *pte, *page; 739 u64 *pte, *page;
739 int level; 740
741 BUG_ON(!is_power_of_2(page_size));
740 742
741 while (address > PM_LEVEL_SIZE(domain->mode)) 743 while (address > PM_LEVEL_SIZE(domain->mode))
742 increase_address_space(domain, gfp); 744 increase_address_space(domain, gfp);
743 745
744 level = domain->mode - 1; 746 level = domain->mode - 1;
745 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 747 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
748 address = PAGE_SIZE_ALIGN(address, page_size);
749 end_lvl = PAGE_SIZE_LEVEL(page_size);
746 750
747 while (level > end_lvl) { 751 while (level > end_lvl) {
748 if (!IOMMU_PTE_PRESENT(*pte)) { 752 if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
752 *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); 756 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
753 } 757 }
754 758
759 /* No level skipping support yet */
760 if (PM_PTE_LEVEL(*pte) != level)
761 return NULL;
762
755 level -= 1; 763 level -= 1;
756 764
757 pte = IOMMU_PTE_PAGE(*pte); 765 pte = IOMMU_PTE_PAGE(*pte);
@@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
769 * This function checks if there is a PTE for a given dma address. If 777 * This function checks if there is a PTE for a given dma address. If
770 * there is one, it returns the pointer to it. 778 * there is one, it returns the pointer to it.
771 */ 779 */
772static u64 *fetch_pte(struct protection_domain *domain, 780static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
773 unsigned long address, int map_size)
774{ 781{
775 int level; 782 int level;
776 u64 *pte; 783 u64 *pte;
777 784
778 level = domain->mode - 1; 785 if (address > PM_LEVEL_SIZE(domain->mode))
779 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 786 return NULL;
787
788 level = domain->mode - 1;
789 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
780 790
781 while (level > map_size) { 791 while (level > 0) {
792
793 /* Not Present */
782 if (!IOMMU_PTE_PRESENT(*pte)) 794 if (!IOMMU_PTE_PRESENT(*pte))
783 return NULL; 795 return NULL;
784 796
797 /* Large PTE */
798 if (PM_PTE_LEVEL(*pte) == 0x07) {
799 unsigned long pte_mask, __pte;
800
801 /*
802 * If we have a series of large PTEs, make
803 * sure to return a pointer to the first one.
804 */
805 pte_mask = PTE_PAGE_SIZE(*pte);
806 pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
807 __pte = ((unsigned long)pte) & pte_mask;
808
809 return (u64 *)__pte;
810 }
811
812 /* No level skipping support yet */
813 if (PM_PTE_LEVEL(*pte) != level)
814 return NULL;
815
785 level -= 1; 816 level -= 1;
786 817
818 /* Walk to the next level */
787 pte = IOMMU_PTE_PAGE(*pte); 819 pte = IOMMU_PTE_PAGE(*pte);
788 pte = &pte[PM_LEVEL_INDEX(level, address)]; 820 pte = &pte[PM_LEVEL_INDEX(level, address)];
789
790 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
791 pte = NULL;
792 break;
793 }
794 } 821 }
795 822
796 return pte; 823 return pte;
@@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom,
807 unsigned long bus_addr, 834 unsigned long bus_addr,
808 unsigned long phys_addr, 835 unsigned long phys_addr,
809 int prot, 836 int prot,
810 int map_size) 837 unsigned long page_size)
811{ 838{
812 u64 __pte, *pte; 839 u64 __pte, *pte;
813 840 int i, count;
814 bus_addr = PAGE_ALIGN(bus_addr);
815 phys_addr = PAGE_ALIGN(phys_addr);
816
817 BUG_ON(!PM_ALIGNED(map_size, bus_addr));
818 BUG_ON(!PM_ALIGNED(map_size, phys_addr));
819 841
820 if (!(prot & IOMMU_PROT_MASK)) 842 if (!(prot & IOMMU_PROT_MASK))
821 return -EINVAL; 843 return -EINVAL;
822 844
823 pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); 845 bus_addr = PAGE_ALIGN(bus_addr);
846 phys_addr = PAGE_ALIGN(phys_addr);
847 count = PAGE_SIZE_PTE_COUNT(page_size);
848 pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
849
850 for (i = 0; i < count; ++i)
851 if (IOMMU_PTE_PRESENT(pte[i]))
852 return -EBUSY;
824 853
825 if (IOMMU_PTE_PRESENT(*pte)) 854 if (page_size > PAGE_SIZE) {
826 return -EBUSY; 855 __pte = PAGE_SIZE_PTE(phys_addr, page_size);
856 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
857 } else
858 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
827 859
828 __pte = phys_addr | IOMMU_PTE_P;
829 if (prot & IOMMU_PROT_IR) 860 if (prot & IOMMU_PROT_IR)
830 __pte |= IOMMU_PTE_IR; 861 __pte |= IOMMU_PTE_IR;
831 if (prot & IOMMU_PROT_IW) 862 if (prot & IOMMU_PROT_IW)
832 __pte |= IOMMU_PTE_IW; 863 __pte |= IOMMU_PTE_IW;
833 864
834 *pte = __pte; 865 for (i = 0; i < count; ++i)
866 pte[i] = __pte;
835 867
836 update_domain(dom); 868 update_domain(dom);
837 869
838 return 0; 870 return 0;
839} 871}
840 872
841static void iommu_unmap_page(struct protection_domain *dom, 873static unsigned long iommu_unmap_page(struct protection_domain *dom,
842 unsigned long bus_addr, int map_size) 874 unsigned long bus_addr,
875 unsigned long page_size)
843{ 876{
844 u64 *pte = fetch_pte(dom, bus_addr, map_size); 877 unsigned long long unmap_size, unmapped;
878 u64 *pte;
879
880 BUG_ON(!is_power_of_2(page_size));
881
882 unmapped = 0;
845 883
846 if (pte) 884 while (unmapped < page_size) {
847 *pte = 0; 885
886 pte = fetch_pte(dom, bus_addr);
887
888 if (!pte) {
889 /*
890 * No PTE for this address
891 * move forward in 4kb steps
892 */
893 unmap_size = PAGE_SIZE;
894 } else if (PM_PTE_LEVEL(*pte) == 0) {
895 /* 4kb PTE found for this address */
896 unmap_size = PAGE_SIZE;
897 *pte = 0ULL;
898 } else {
899 int count, i;
900
901 /* Large PTE found which maps this address */
902 unmap_size = PTE_PAGE_SIZE(*pte);
903 count = PAGE_SIZE_PTE_COUNT(unmap_size);
904 for (i = 0; i < count; i++)
905 pte[i] = 0ULL;
906 }
907
908 bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
909 unmapped += unmap_size;
910 }
911
912 BUG_ON(!is_power_of_2(unmapped));
913
914 return unmapped;
848} 915}
849 916
850/* 917/*
@@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
878 for (addr = e->address_start; addr < e->address_end; 945 for (addr = e->address_start; addr < e->address_end;
879 addr += PAGE_SIZE) { 946 addr += PAGE_SIZE) {
880 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, 947 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
881 PM_MAP_4k); 948 PAGE_SIZE);
882 if (ret) 949 if (ret)
883 return ret; 950 return ret;
884 /* 951 /*
@@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1006 u64 *pte, *pte_page; 1073 u64 *pte, *pte_page;
1007 1074
1008 for (i = 0; i < num_ptes; ++i) { 1075 for (i = 0; i < num_ptes; ++i) {
1009 pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, 1076 pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
1010 &pte_page, gfp); 1077 &pte_page, gfp);
1011 if (!pte) 1078 if (!pte)
1012 goto out_free; 1079 goto out_free;
@@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1042 for (i = dma_dom->aperture[index]->offset; 1109 for (i = dma_dom->aperture[index]->offset;
1043 i < dma_dom->aperture_size; 1110 i < dma_dom->aperture_size;
1044 i += PAGE_SIZE) { 1111 i += PAGE_SIZE) {
1045 u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); 1112 u64 *pte = fetch_pte(&dma_dom->domain, i);
1046 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 1113 if (!pte || !IOMMU_PTE_PRESENT(*pte))
1047 continue; 1114 continue;
1048 1115
@@ -1712,7 +1779,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1712 1779
1713 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 1780 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1714 if (!pte) { 1781 if (!pte) {
1715 pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, 1782 pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
1716 GFP_ATOMIC); 1783 GFP_ATOMIC);
1717 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; 1784 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1718 } else 1785 } else
@@ -2439,12 +2506,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2439 return ret; 2506 return ret;
2440} 2507}
2441 2508
2442static int amd_iommu_map_range(struct iommu_domain *dom, 2509static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
2443 unsigned long iova, phys_addr_t paddr, 2510 phys_addr_t paddr, int gfp_order, int iommu_prot)
2444 size_t size, int iommu_prot)
2445{ 2511{
2512 unsigned long page_size = 0x1000UL << gfp_order;
2446 struct protection_domain *domain = dom->priv; 2513 struct protection_domain *domain = dom->priv;
2447 unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
2448 int prot = 0; 2514 int prot = 0;
2449 int ret; 2515 int ret;
2450 2516
@@ -2453,61 +2519,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2453 if (iommu_prot & IOMMU_WRITE) 2519 if (iommu_prot & IOMMU_WRITE)
2454 prot |= IOMMU_PROT_IW; 2520 prot |= IOMMU_PROT_IW;
2455 2521
2456 iova &= PAGE_MASK;
2457 paddr &= PAGE_MASK;
2458
2459 mutex_lock(&domain->api_lock); 2522 mutex_lock(&domain->api_lock);
2460 2523 ret = iommu_map_page(domain, iova, paddr, prot, page_size);
2461 for (i = 0; i < npages; ++i) {
2462 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2463 if (ret)
2464 return ret;
2465
2466 iova += PAGE_SIZE;
2467 paddr += PAGE_SIZE;
2468 }
2469
2470 mutex_unlock(&domain->api_lock); 2524 mutex_unlock(&domain->api_lock);
2471 2525
2472 return 0; 2526 return ret;
2473} 2527}
2474 2528
2475static void amd_iommu_unmap_range(struct iommu_domain *dom, 2529static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2476 unsigned long iova, size_t size) 2530 int gfp_order)
2477{ 2531{
2478
2479 struct protection_domain *domain = dom->priv; 2532 struct protection_domain *domain = dom->priv;
2480 unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE); 2533 unsigned long page_size, unmap_size;
2481 2534
2482 iova &= PAGE_MASK; 2535 page_size = 0x1000UL << gfp_order;
2483 2536
2484 mutex_lock(&domain->api_lock); 2537 mutex_lock(&domain->api_lock);
2485 2538 unmap_size = iommu_unmap_page(domain, iova, page_size);
2486 for (i = 0; i < npages; ++i) { 2539 mutex_unlock(&domain->api_lock);
2487 iommu_unmap_page(domain, iova, PM_MAP_4k);
2488 iova += PAGE_SIZE;
2489 }
2490 2540
2491 iommu_flush_tlb_pde(domain); 2541 iommu_flush_tlb_pde(domain);
2492 2542
2493 mutex_unlock(&domain->api_lock); 2543 return get_order(unmap_size);
2494} 2544}
2495 2545
2496static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2546static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2497 unsigned long iova) 2547 unsigned long iova)
2498{ 2548{
2499 struct protection_domain *domain = dom->priv; 2549 struct protection_domain *domain = dom->priv;
2500 unsigned long offset = iova & ~PAGE_MASK; 2550 unsigned long offset_mask;
2501 phys_addr_t paddr; 2551 phys_addr_t paddr;
2502 u64 *pte; 2552 u64 *pte, __pte;
2503 2553
2504 pte = fetch_pte(domain, iova, PM_MAP_4k); 2554 pte = fetch_pte(domain, iova);
2505 2555
2506 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 2556 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2507 return 0; 2557 return 0;
2508 2558
2509 paddr = *pte & IOMMU_PAGE_MASK; 2559 if (PM_PTE_LEVEL(*pte) == 0)
2510 paddr |= offset; 2560 offset_mask = PAGE_SIZE - 1;
2561 else
2562 offset_mask = PTE_PAGE_SIZE(*pte) - 1;
2563
2564 __pte = *pte & PM_ADDR_MASK;
2565 paddr = (__pte & ~offset_mask) | (iova & offset_mask);
2511 2566
2512 return paddr; 2567 return paddr;
2513} 2568}
@@ -2523,8 +2578,8 @@ static struct iommu_ops amd_iommu_ops = {
2523 .domain_destroy = amd_iommu_domain_destroy, 2578 .domain_destroy = amd_iommu_domain_destroy,
2524 .attach_dev = amd_iommu_attach_device, 2579 .attach_dev = amd_iommu_attach_device,
2525 .detach_dev = amd_iommu_detach_device, 2580 .detach_dev = amd_iommu_detach_device,
2526 .map = amd_iommu_map_range, 2581 .map = amd_iommu_map,
2527 .unmap = amd_iommu_unmap_range, 2582 .unmap = amd_iommu_unmap,
2528 .iova_to_phys = amd_iommu_iova_to_phys, 2583 .iova_to_phys = amd_iommu_iova_to_phys,
2529 .domain_has_cap = amd_iommu_domain_has_cap, 2584 .domain_has_cap = amd_iommu_domain_has_cap,
2530}; 2585};
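
Sketch of the physical-address math used by the reworked amd_iommu_iova_to_phys() above: for a mapping of page_size bytes, the low bits of the IOVA pass through and the PTE supplies the rest. The values below are invented for illustration.

#include <stdio.h>
#include <stdint.h>

static uint64_t iova_to_phys(uint64_t pte_addr, uint64_t iova,
			     uint64_t page_size)
{
	uint64_t offset_mask = page_size - 1;	/* page_size is a power of two */

	return (pte_addr & ~offset_mask) | (iova & offset_mask);
}

int main(void)
{
	/* 2 MiB mapping: PTE points at 0x80000000, IOVA is 0x40123456 */
	printf("phys = 0x%llx\n",
	       (unsigned long long)iova_to_phys(0x80000000ULL, 0x40123456ULL,
						2ULL << 20));
	return 0;
}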
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d4..3bacb4d0844c 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
120bool amd_iommu_dump; 120bool amd_iommu_dump;
121 121
122static int __initdata amd_iommu_detected; 122static int __initdata amd_iommu_detected;
123static bool __initdata amd_iommu_disabled;
123 124
124u16 amd_iommu_last_bdf; /* largest PCI device id we have 125u16 amd_iommu_last_bdf; /* largest PCI device id we have
125 to handle */ 126 to handle */
@@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void)
1372 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 1373 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1373 return; 1374 return;
1374 1375
1376 if (amd_iommu_disabled)
1377 return;
1378
1375 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1379 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1376 iommu_detected = 1; 1380 iommu_detected = 1;
1377 amd_iommu_detected = 1; 1381 amd_iommu_detected = 1;
@@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
1401 for (; *str; ++str) { 1405 for (; *str; ++str) {
1402 if (strncmp(str, "fullflush", 9) == 0) 1406 if (strncmp(str, "fullflush", 9) == 0)
1403 amd_iommu_unmap_flush = true; 1407 amd_iommu_unmap_flush = true;
1408 if (strncmp(str, "off", 3) == 0)
1409 amd_iommu_disabled = true;
1404 } 1410 }
1405 1411
1406 return 1; 1412 return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5e..425e53a87feb 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int es7000_plat;
131 131
132static unsigned int base; 132static unsigned int base;
133 133
134static int
135es7000_rename_gsi(int ioapic, int gsi)
136{
137 if (es7000_plat == ES7000_ZORRO)
138 return gsi;
139
140 if (!base) {
141 int i;
142 for (i = 0; i < nr_ioapics; i++)
143 base += nr_ioapic_registers[i];
144 }
145
146 if (!ioapic && (gsi < 16))
147 gsi += base;
148
149 return gsi;
150}
151
152static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 134static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
153{ 135{
154 unsigned long vect = 0, psaival = 0; 136 unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
190 es7000_plat = ES7000_ZORRO; 172 es7000_plat = ES7000_ZORRO;
191 else 173 else
192 es7000_plat = ES7000_CLASSIC; 174 es7000_plat = ES7000_CLASSIC;
193 ioapic_renumber_irq = es7000_rename_gsi;
194} 175}
195 176
196/* 177/*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eb2789c3f721..33f3563a2a52 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
89/* IO APIC gsi routing info */ 89/* IO APIC gsi routing info */
90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
91 91
92/* The last gsi number used */
93u32 gsi_end;
94
92/* MP IRQ source entries */ 95/* MP IRQ source entries */
93struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; 96struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
94 97
@@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx)
1013 return MPBIOS_trigger(idx); 1016 return MPBIOS_trigger(idx);
1014} 1017}
1015 1018
1016int (*ioapic_renumber_irq)(int ioapic, int irq);
1017static int pin_2_irq(int idx, int apic, int pin) 1019static int pin_2_irq(int idx, int apic, int pin)
1018{ 1020{
1019 int irq, i; 1021 int irq;
1020 int bus = mp_irqs[idx].srcbus; 1022 int bus = mp_irqs[idx].srcbus;
1021 1023
1022 /* 1024 /*
@@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin)
1028 if (test_bit(bus, mp_bus_not_pci)) { 1030 if (test_bit(bus, mp_bus_not_pci)) {
1029 irq = mp_irqs[idx].srcbusirq; 1031 irq = mp_irqs[idx].srcbusirq;
1030 } else { 1032 } else {
1031 /* 1033 u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
1032 * PCI IRQs are mapped in order 1034
1033 */ 1035 if (gsi >= NR_IRQS_LEGACY)
1034 i = irq = 0; 1036 irq = gsi;
1035 while (i < apic) 1037 else
1036 irq += nr_ioapic_registers[i++]; 1038 irq = gsi_end + 1 + gsi;
1037 irq += pin;
1038 /*
1039 * For MPS mode, so far only needed by ES7000 platform
1040 */
1041 if (ioapic_renumber_irq)
1042 irq = ioapic_renumber_irq(apic, irq);
1043 } 1039 }
1044 1040
1045#ifdef CONFIG_X86_32 1041#ifdef CONFIG_X86_32
@@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1950 1946
1951void __init enable_IO_APIC(void) 1947void __init enable_IO_APIC(void)
1952{ 1948{
1953 union IO_APIC_reg_01 reg_01;
1954 int i8259_apic, i8259_pin; 1949 int i8259_apic, i8259_pin;
1955 int apic; 1950 int apic;
1956 unsigned long flags;
1957
1958 /*
1959 * The number of IO-APIC IRQ registers (== #pins):
1960 */
1961 for (apic = 0; apic < nr_ioapics; apic++) {
1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1963 reg_01.raw = io_apic_read(apic, 1);
1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1966 }
1967 1951
1968 if (!legacy_pic->nr_legacy_irqs) 1952 if (!legacy_pic->nr_legacy_irqs)
1969 return; 1953 return;
@@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic)
3858 reg_01.raw = io_apic_read(ioapic, 1); 3842 reg_01.raw = io_apic_read(ioapic, 1);
3859 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 3843 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3860 3844
3861 return reg_01.bits.entries; 3845 /* The register returns the maximum redirection index
3846 * supported, which is one less than the total number of redir
3847 * entries.
3848 */
3849 return reg_01.bits.entries + 1;
3862} 3850}
3863 3851
3864void __init probe_nr_irqs_gsi(void) 3852void __init probe_nr_irqs_gsi(void)
3865{ 3853{
3866 int nr = 0; 3854 int nr;
3867 3855
3868 nr = acpi_probe_gsi(); 3856 nr = gsi_end + 1 + NR_IRQS_LEGACY;
3869 if (nr > nr_irqs_gsi) { 3857 if (nr > nr_irqs_gsi)
3870 nr_irqs_gsi = nr; 3858 nr_irqs_gsi = nr;
3871 } else {
3872 /* for acpi=off or acpi is not compiled in */
3873 int idx;
3874
3875 nr = 0;
3876 for (idx = 0; idx < nr_ioapics; idx++)
3877 nr += io_apic_get_redir_entries(idx) + 1;
3878
3879 if (nr > nr_irqs_gsi)
3880 nr_irqs_gsi = nr;
3881 }
3882 3859
3883 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3860 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3884} 3861}
@@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic)
4085 return reg_01.bits.version; 4062 return reg_01.bits.version;
4086} 4063}
4087 4064
4088int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4065int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
4089{ 4066{
4090 int i; 4067 int ioapic, pin, idx;
4091 4068
4092 if (skip_ioapic_setup) 4069 if (skip_ioapic_setup)
4093 return -1; 4070 return -1;
4094 4071
4095 for (i = 0; i < mp_irq_entries; i++) 4072 ioapic = mp_find_ioapic(gsi);
4096 if (mp_irqs[i].irqtype == mp_INT && 4073 if (ioapic < 0)
4097 mp_irqs[i].srcbusirq == bus_irq)
4098 break;
4099 if (i >= mp_irq_entries)
4100 return -1; 4074 return -1;
4101 4075
4102 *trigger = irq_trigger(i); 4076 pin = mp_find_ioapic_pin(ioapic, gsi);
4103 *polarity = irq_polarity(i); 4077 if (pin < 0)
4078 return -1;
4079
4080 idx = find_irq_entry(ioapic, pin, mp_INT);
4081 if (idx < 0)
4082 return -1;
4083
4084 *trigger = irq_trigger(idx);
4085 *polarity = irq_polarity(idx);
4104 return 0; 4086 return 0;
4105} 4087}
4106 4088
@@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void)
4241 } 4223 }
4242} 4224}
4243 4225
4244int mp_find_ioapic(int gsi) 4226int mp_find_ioapic(u32 gsi)
4245{ 4227{
4246 int i = 0; 4228 int i = 0;
4247 4229
@@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi)
4256 return -1; 4238 return -1;
4257} 4239}
4258 4240
4259int mp_find_ioapic_pin(int ioapic, int gsi) 4241int mp_find_ioapic_pin(int ioapic, u32 gsi)
4260{ 4242{
4261 if (WARN_ON(ioapic == -1)) 4243 if (WARN_ON(ioapic == -1))
4262 return -1; 4244 return -1;
@@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address)
4284void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) 4266void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4285{ 4267{
4286 int idx = 0; 4268 int idx = 0;
4269 int entries;
4287 4270
4288 if (bad_ioapic(address)) 4271 if (bad_ioapic(address))
4289 return; 4272 return;
@@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4302 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 4285 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4303 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 4286 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4304 */ 4287 */
4288 entries = io_apic_get_redir_entries(idx);
4305 mp_gsi_routing[idx].gsi_base = gsi_base; 4289 mp_gsi_routing[idx].gsi_base = gsi_base;
4306 mp_gsi_routing[idx].gsi_end = gsi_base + 4290 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
4307 io_apic_get_redir_entries(idx); 4291
4292 /*
4293 * The number of IO-APIC IRQ registers (== #pins):
4294 */
4295 nr_ioapic_registers[idx] = entries;
4296
4297 if (mp_gsi_routing[idx].gsi_end > gsi_end)
4298 gsi_end = mp_gsi_routing[idx].gsi_end;
4308 4299
4309 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 4300 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4310 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 4301 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
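
Small arithmetic sketch, not kernel code, of the new gsi_end bookkeeping above: each registered IO-APIC contributes gsi_base + entries - 1, the global gsi_end keeps the maximum, and probe_nr_irqs_gsi() then sizes nr_irqs_gsi from it. The two 24-pin IO-APICs are hypothetical.

#include <stdio.h>

#define NR_IRQS_LEGACY 16

struct ioapic { unsigned int gsi_base, entries; };

int main(void)
{
	struct ioapic apics[] = { { 0, 24 }, { 24, 24 } };
	unsigned int gsi_end = 0, i;

	for (i = 0; i < 2; i++) {
		unsigned int end = apics[i].gsi_base + apics[i].entries - 1;

		if (end > gsi_end)
			gsi_end = end;
	}

	printf("gsi_end = %u, nr_irqs_gsi >= %u\n",
	       gsi_end, gsi_end + 1 + NR_IRQS_LEGACY);
	return 0;
}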
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0eb..c4f9182ca3ac 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
1224#ifdef INIT_TIMER_AFTER_SUSPEND 1224#ifdef INIT_TIMER_AFTER_SUSPEND
1225 unsigned long flags; 1225 unsigned long flags;
1226 1226
1227 spin_lock_irqsave(&i8253_lock, flags); 1227 raw_spin_lock_irqsave(&i8253_lock, flags);
1228 /* set the clock to HZ */ 1228 /* set the clock to HZ */
1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1230 udelay(10); 1230 udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
1232 udelay(10); 1232 udelay(10);
1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ 1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
1234 udelay(10); 1234 udelay(10);
1235 spin_unlock_irqrestore(&i8253_lock, flags); 1235 raw_spin_unlock_irqrestore(&i8253_lock, flags);
1236#endif 1236#endif
1237} 1237}
1238 1238
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3a785da34b6f 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o addon_cpuid_features.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf688..10fa5684a662 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,12 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
30 const struct cpuid_bit *cb; 30 const struct cpuid_bit *cb;
31 31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, 35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, 36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, 37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, 38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
39 { 0, 0, 0, 0 } 41 { 0, 0, 0, 0 }
40 }; 42 };
41 43
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..c1c00d0b1692 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1243,10 +1243,7 @@ void __cpuinit cpu_init(void)
1243 /* 1243 /*
1244 * Force FPU initialization: 1244 * Force FPU initialization:
1245 */ 1245 */
1246 if (cpu_has_xsave) 1246 current_thread_info()->status = 0;
1247 current_thread_info()->status = TS_XSAVE;
1248 else
1249 current_thread_info()->status = 0;
1250 clear_used_math(); 1247 clear_used_math();
1251 mxcsr_feature_mask_init(); 1248 mxcsr_feature_mask_init();
1252 1249
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
2# K8 systems. ACPI is preferred to all other hardware-specific drivers. 2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod. 3# speedstep-* is preferred over p4-clockmod.
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o 7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b77..1d3cddaa40ee 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -46,6 +46,7 @@
46#include <asm/msr.h> 46#include <asm/msr.h>
47#include <asm/processor.h> 47#include <asm/processor.h>
48#include <asm/cpufeature.h> 48#include <asm/cpufeature.h>
49#include "mperf.h"
49 50
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ 51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg) 52 "acpi-cpufreq", msg)
@@ -71,8 +72,6 @@ struct acpi_cpufreq_data {
71 72
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); 73static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73 74
74static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
75
76/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
77static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
78 77
@@ -240,45 +239,6 @@ static u32 get_cur_val(const struct cpumask *mask)
240 return cmd.val; 239 return cmd.val;
241} 240}
242 241
243/* Called via smp_call_function_single(), on the target CPU */
244static void read_measured_perf_ctrs(void *_cur)
245{
246 struct aperfmperf *am = _cur;
247
248 get_aperfmperf(am);
249}
250
251/*
252 * Return the measured active (C0) frequency on this CPU since last call
253 * to this function.
254 * Input: cpu number
255 * Return: Average CPU frequency in terms of max frequency (zero on error)
256 *
257 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
258 * over a period of time, while CPU is in C0 state.
259 * IA32_MPERF counts at the rate of max advertised frequency
260 * IA32_APERF counts at the rate of actual CPU frequency
261 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
262 * no meaning should be associated with absolute values of these MSRs.
263 */
264static unsigned int get_measured_perf(struct cpufreq_policy *policy,
265 unsigned int cpu)
266{
267 struct aperfmperf perf;
268 unsigned long ratio;
269 unsigned int retval;
270
271 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
272 return 0;
273
274 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
275 per_cpu(acfreq_old_perf, cpu) = perf;
276
277 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
278
279 return retval;
280}
281
282static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 242static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
283{ 243{
284 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); 244 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -702,7 +662,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
702 662
703 /* Check for APERF/MPERF support in hardware */ 663 /* Check for APERF/MPERF support in hardware */
704 if (cpu_has(c, X86_FEATURE_APERFMPERF)) 664 if (cpu_has(c, X86_FEATURE_APERFMPERF))
705 acpi_cpufreq_driver.getavg = get_measured_perf; 665 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
706 666
707 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 667 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
708 for (i = 0; i < perf->state_count; i++) 668 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
26 * over a period of time while the CPU is in the C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
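
For reference, a minimal user-space sketch of the ratio the new cpufreq_get_measured_perf() helper reports (not part of the patch). It assumes the msr driver is loaded, root access to /dev/cpu/0/msr, and the architectural MSR addresses 0xE7 (IA32_MPERF) and 0xE8 (IA32_APERF); as the comment above notes, only the delta ratio is meaningful, so the absolute counts are discarded:

/* Illustrative user-space sketch, not part of the patch: estimate the
 * average C0 frequency from APERF/MPERF deltas, mirroring what
 * cpufreq_get_measured_perf() does in-kernel.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static uint64_t rdmsr(int fd, uint32_t reg)
{
	uint64_t val = 0;

	pread(fd, &val, sizeof(val), reg);	/* msr device: offset == MSR */
	return val;
}

int main(void)
{
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	uint64_t m0, a0, m1, a1;
	double max_khz = 2000000.0;		/* assumed max frequency */

	if (fd < 0)
		return 1;

	m0 = rdmsr(fd, 0xE7); a0 = rdmsr(fd, 0xE8);
	sleep(1);
	m1 = rdmsr(fd, 0xE7); a1 = rdmsr(fd, 0xE8);

	/* Only the APERF/MPERF ratio is architecturally meaningful. */
	printf("avg C0 freq ~ %.0f kHz\n", max_khz * (a1 - a0) / (m1 - m0));
	close(fd);
	return 0;
}
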
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index b6215b9798e2..6f3dc8fbbfdc 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
1
2/* 1/*
3 * (c) 2003-2006 Advanced Micro Devices, Inc. 2 * (c) 2003-2010 Advanced Micro Devices, Inc.
4 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
5 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
6 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
@@ -46,6 +45,7 @@
46#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
47#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
48#include "powernow-k8.h" 47#include "powernow-k8.h"
48#include "mperf.h"
49 49
50/* serialize freq changes */ 50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex); 51static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54 54
55static int cpu_family = CPU_OPTERON; 55static int cpu_family = CPU_OPTERON;
56 56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
57#ifndef CONFIG_SMP 63#ifndef CONFIG_SMP
58static inline const struct cpumask *cpu_core_mask(int cpu) 64static inline const struct cpumask *cpu_core_mask(int cpu)
59{ 65{
@@ -1249,6 +1255,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1249 struct powernow_k8_data *data; 1255 struct powernow_k8_data *data;
1250 struct init_on_cpu init_on_cpu; 1256 struct init_on_cpu init_on_cpu;
1251 int rc; 1257 int rc;
1258 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1252 1259
1253 if (!cpu_online(pol->cpu)) 1260 if (!cpu_online(pol->cpu))
1254 return -ENODEV; 1261 return -ENODEV;
@@ -1323,6 +1330,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1323 return -EINVAL; 1330 return -EINVAL;
1324 } 1331 }
1325 1332
1333 /* Check for APERF/MPERF support in hardware */
1334 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1335 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1336
1326 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1337 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1327 1338
1328 if (cpu_family == CPU_HW_PSTATE) 1339 if (cpu_family == CPU_HW_PSTATE)
@@ -1394,8 +1405,77 @@ out:
1394 return khz; 1405 return khz;
1395} 1406}
1396 1407
1408static void _cpb_toggle_msrs(bool t)
1409{
1410 int cpu;
1411
1412 get_online_cpus();
1413
1414 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1415
1416 for_each_cpu(cpu, cpu_online_mask) {
1417 struct msr *reg = per_cpu_ptr(msrs, cpu);
1418 if (t)
1419 reg->l &= ~BIT(25);
1420 else
1421 reg->l |= BIT(25);
1422 }
1423 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1424
1425 put_online_cpus();
1426}
1427
1428/*
1429 * Switch on/off core performance boosting.
1430 *
1431 * 0=disable
1432 * 1=enable.
1433 */
1434static void cpb_toggle(bool t)
1435{
1436 if (!cpb_capable)
1437 return;
1438
1439 if (t && !cpb_enabled) {
1440 cpb_enabled = true;
1441 _cpb_toggle_msrs(t);
1442 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1443 } else if (!t && cpb_enabled) {
1444 cpb_enabled = false;
1445 _cpb_toggle_msrs(t);
1446 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1447 }
1448}
1449
1450static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1451 size_t count)
1452{
1453 int ret = -EINVAL;
1454 unsigned long val = 0;
1455
1456 ret = strict_strtoul(buf, 10, &val);
1457 if (!ret && (val == 0 || val == 1) && cpb_capable)
1458 cpb_toggle(val);
1459 else
1460 return -EINVAL;
1461
1462 return count;
1463}
1464
1465static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1466{
1467 return sprintf(buf, "%u\n", cpb_enabled);
1468}
1469
1470#define define_one_rw(_name) \
1471static struct freq_attr _name = \
1472__ATTR(_name, 0644, show_##_name, store_##_name)
1473
1474define_one_rw(cpb);
1475
1397static struct freq_attr *powernow_k8_attr[] = { 1476static struct freq_attr *powernow_k8_attr[] = {
1398 &cpufreq_freq_attr_scaling_available_freqs, 1477 &cpufreq_freq_attr_scaling_available_freqs,
1478 &cpb,
1399 NULL, 1479 NULL,
1400}; 1480};
1401 1481
@@ -1411,10 +1491,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
1411 .attr = powernow_k8_attr, 1491 .attr = powernow_k8_attr,
1412}; 1492};
1413 1493
1494/*
1495 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1496 * cannot block the remaining ones from boosting. On the CPU_UP path we
1497 * simply keep the boost-disable flag in sync with the current global
1498 * state.
1499 */
1500static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
1501 void *hcpu)
1502{
1503 unsigned cpu = (long)hcpu;
1504 u32 lo, hi;
1505
1506 switch (action) {
1507 case CPU_UP_PREPARE:
1508 case CPU_UP_PREPARE_FROZEN:
1509
1510 if (!cpb_enabled) {
1511 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1512 lo |= BIT(25);
1513 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1514 }
1515 break;
1516
1517 case CPU_DOWN_PREPARE:
1518 case CPU_DOWN_PREPARE_FROZEN:
1519 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1520 lo &= ~BIT(25);
1521 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1522 break;
1523
1524 default:
1525 break;
1526 }
1527
1528 return NOTIFY_OK;
1529}
1530
1531static struct notifier_block __cpuinitdata cpb_nb = {
1532 .notifier_call = cpb_notify,
1533};
1534
1414/* driver entry point for init */ 1535/* driver entry point for init */
1415static int __cpuinit powernowk8_init(void) 1536static int __cpuinit powernowk8_init(void)
1416{ 1537{
1417 unsigned int i, supported_cpus = 0; 1538 unsigned int i, supported_cpus = 0, cpu;
1418 1539
1419 for_each_online_cpu(i) { 1540 for_each_online_cpu(i) {
1420 int rc; 1541 int rc;
@@ -1423,15 +1544,36 @@ static int __cpuinit powernowk8_init(void)
1423 supported_cpus++; 1544 supported_cpus++;
1424 } 1545 }
1425 1546
1426 if (supported_cpus == num_online_cpus()) { 1547 if (supported_cpus != num_online_cpus())
1427 printk(KERN_INFO PFX "Found %d %s " 1548 return -ENODEV;
1428 "processors (%d cpu cores) (" VERSION ")\n", 1549
1429 num_online_nodes(), 1550 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1430 boot_cpu_data.x86_model_id, supported_cpus); 1551 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1431 return cpufreq_register_driver(&cpufreq_amd64_driver); 1552
1553 if (boot_cpu_has(X86_FEATURE_CPB)) {
1554
1555 cpb_capable = true;
1556
1557 register_cpu_notifier(&cpb_nb);
1558
1559 msrs = msrs_alloc();
1560 if (!msrs) {
1561 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1562 return -ENOMEM;
1563 }
1564
1565 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1566
1567 for_each_cpu(cpu, cpu_online_mask) {
1568 struct msr *reg = per_cpu_ptr(msrs, cpu);
1569 cpb_enabled |= !(!!(reg->l & BIT(25)));
1570 }
1571
1572 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1573 (cpb_enabled ? "on" : "off"));
1432 } 1574 }
1433 1575
1434 return -ENODEV; 1576 return cpufreq_register_driver(&cpufreq_amd64_driver);
1435} 1577}
1436 1578
1437/* driver entry point for term */ 1579/* driver entry point for term */
@@ -1439,6 +1581,13 @@ static void __exit powernowk8_exit(void)
1439{ 1581{
1440 dprintk("exit\n"); 1582 dprintk("exit\n");
1441 1583
1584 if (boot_cpu_has(X86_FEATURE_CPB)) {
1585 msrs_free(msrs);
1586 msrs = NULL;
1587
1588 unregister_cpu_notifier(&cpb_nb);
1589 }
1590
1442 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1591 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1443} 1592}
1444 1593
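
The new cpb sysfs attribute flips bit 25 (boost disable) of MSR_K7_HWCR on every online core; writing 0 or 1 to cpufreq/cpb under a CPU's sysfs directory disables or enables boosting globally. A user-space sketch to inspect that bit directly (not part of the patch; 0xC0010015 is MSR_K7_HWCR, needs msr.ko and root):

/* Illustrative user-space sketch, not part of the patch: check the
 * boost-disable bit the driver toggles on core 0.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t hwcr = 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &hwcr, sizeof(hwcr), 0xC0010015) != sizeof(hwcr))
		return 1;

	/* bit 25 set means core performance boost is disabled on this core */
	printf("core 0 boost %s\n", (hwcr & (1ULL << 25)) ? "disabled" : "enabled");
	close(fd);
	return 0;
}
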
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 */ 6 */
7 7
8
9enum pstate { 8enum pstate {
10 HW_PSTATE_INVALID = 0xff, 9 HW_PSTATE_INVALID = 0xff,
11 HW_PSTATE_0 = 0, 10 HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
55 struct cpumask *available_cores; 54 struct cpumask *available_cores;
56}; 55};
57 56
58
59/* processor's cpuid instruction support */ 57/* processor's cpuid instruction support */
60#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ 58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
61#define CPUID_XFAM 0x0ff00000 /* extended family */ 59#define CPUID_XFAM 0x0ff00000 /* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..dd531cc56a8f 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,55 @@
21 * 21 *
22 */ 22 */
23 23
24#include <linux/module.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h> 26#include <asm/hypervisor.h>
27 27
28static inline void __cpuinit 28/*
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29 * Hypervisor detection order. This is specified explicitly here because
30 * some hypervisors might implement compatibility modes for other
31 * hypervisors and therefore need to be detected in a specific sequence.
32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] =
30{ 34{
31 if (vmware_platform()) 35 &x86_hyper_vmware,
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 36 &x86_hyper_ms_hyperv,
33 else 37};
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35}
36 38
37static inline void __cpuinit 39const struct hypervisor_x86 *x86_hyper;
38hypervisor_set_feature_bits(struct cpuinfo_x86 *c) 40EXPORT_SYMBOL(x86_hyper);
41
42static inline void __init
43detect_hypervisor_vendor(void)
39{ 44{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { 45 const struct hypervisor_x86 *h, * const *p;
41 vmware_set_feature_bits(c); 46
42 return; 47 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
48 h = *p;
49 if (h->detect()) {
50 x86_hyper = h;
51 printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
52 break;
53 }
43 } 54 }
44} 55}
45 56
46void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) 57void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
47{ 58{
48 detect_hypervisor_vendor(c); 59 if (x86_hyper && x86_hyper->set_cpu_features)
49 hypervisor_set_feature_bits(c); 60 x86_hyper->set_cpu_features(c);
50} 61}
51 62
52void __init init_hypervisor_platform(void) 63void __init init_hypervisor_platform(void)
53{ 64{
65
66 detect_hypervisor_vendor();
67
68 if (!x86_hyper)
69 return;
70
54 init_hypervisor(&boot_cpu_data); 71 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) 72
56 vmware_platform_setup(); 73 if (x86_hyper->init_platform)
74 x86_hyper->init_platform();
57} 75}
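
A small stand-alone model of the pattern introduced here (not part of the patch; the names are illustrative): detection callbacks are probed in array order and the first hit becomes the global handle, which is why the order is spelled out explicitly.

/* Illustrative user-space model of the ordered detection table. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct hyper_ops {
	const char *name;
	bool (*detect)(void);
	void (*init_platform)(void);
};

static bool detect_vmware(void) { return false; }	/* placeholder probes */
static bool detect_hyperv(void) { return true; }
static void init_hyperv(void)   { puts("hyperv platform setup"); }

static const struct hyper_ops vmware_ops = { "VMware", detect_vmware, NULL };
static const struct hyper_ops hyperv_ops = { "Microsoft HyperV", detect_hyperv, init_hyperv };

/* Probed in this order; first detect() that fires wins. */
static const struct hyper_ops *const hypervisors[] = { &vmware_ops, &hyperv_ops };

int main(void)
{
	const struct hyper_ops *hyper = NULL;
	size_t i;

	for (i = 0; i < sizeof(hypervisors) / sizeof(hypervisors[0]); i++) {
		if (hypervisors[i]->detect()) {
			hyper = hypervisors[i];
			printf("Hypervisor detected: %s\n", hyper->name);
			break;
		}
	}

	if (hyper && hyper->init_platform)
		hyper->init_platform();
	return 0;
}
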
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd483..85f69cdeae10 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17#include <asm/cpu.h> 16#include <asm/cpu.h>
18 17
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
373 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 372 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
374 } 373 }
375 374
376 if (c->cpuid_level > 6) {
377 unsigned ecx = cpuid_ecx(6);
378 if (ecx & 0x01)
379 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
380 }
381
382 if (cpu_has_xmm2) 375 if (cpu_has_xmm2)
383 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 376 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
384 if (cpu_has_ds) { 377 if (cpu_has_ds) {
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
388 set_cpu_cap(c, X86_FEATURE_BTS); 381 set_cpu_cap(c, X86_FEATURE_BTS);
389 if (!(l1 & (1<<12))) 382 if (!(l1 & (1<<12)))
390 set_cpu_cap(c, X86_FEATURE_PEBS); 383 set_cpu_cap(c, X86_FEATURE_PEBS);
391 ds_init_intel(c);
392 } 384 }
393 385
394 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) 386 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a51..33eae2062cf5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
148 u32 full; 148 u32 full;
149}; 149};
150 150
151struct amd_l3_cache {
152 struct pci_dev *dev;
153 bool can_disable;
154 unsigned indices;
155 u8 subcaches[4];
156};
157
151struct _cpuid4_info { 158struct _cpuid4_info {
152 union _cpuid4_leaf_eax eax; 159 union _cpuid4_leaf_eax eax;
153 union _cpuid4_leaf_ebx ebx; 160 union _cpuid4_leaf_ebx ebx;
154 union _cpuid4_leaf_ecx ecx; 161 union _cpuid4_leaf_ecx ecx;
155 unsigned long size; 162 unsigned long size;
156 bool can_disable; 163 struct amd_l3_cache *l3;
157 unsigned int l3_indices;
158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
159}; 165};
160 166
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
164 union _cpuid4_leaf_ebx ebx; 170 union _cpuid4_leaf_ebx ebx;
165 union _cpuid4_leaf_ecx ecx; 171 union _cpuid4_leaf_ecx ecx;
166 unsigned long size; 172 unsigned long size;
167 bool can_disable; 173 struct amd_l3_cache *l3;
168 unsigned int l3_indices;
169}; 174};
170 175
171unsigned short num_cache_leaves; 176unsigned short num_cache_leaves;
@@ -302,87 +307,163 @@ struct _cache_attr {
302}; 307};
303 308
304#ifdef CONFIG_CPU_SUP_AMD 309#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void) 310
311/*
312 * L3 cache descriptors
313 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
306{ 317{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3; 318 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0; 319 u32 val = 0;
316 320
317 pci_read_config_dword(dev, 0x1C4, &val); 321 pci_read_config_dword(l3->dev, 0x1C4, &val);
318 322
319 /* calculate subcache sizes */ 323 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0)); 324 l3->subcaches[0] = sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4)); 325 l3->subcaches[1] = sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9)); 326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13)); 327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
324 328
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
330}
331
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
333{
334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
326} 348}
327 349
328static void __cpuinit 350static void __cpuinit
329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 351amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
330{ 352{
331 if (index < 3) 353 int node;
354
355 if (boot_cpu_data.x86 != 0x10)
332 return; 356 return;
333 357
334 if (boot_cpu_data.x86 == 0x11) 358 if (index < 3)
335 return; 359 return;
336 360
337 /* see errata #382 and #388 */ 361 /* see errata #382 and #388 */
338 if ((boot_cpu_data.x86 == 0x10) && 362 if (boot_cpu_data.x86_model < 0x8)
339 ((boot_cpu_data.x86_model < 0x8) || 363 return;
340 (boot_cpu_data.x86_mask < 0x1))) 364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0)
341 return; 373 return;
342 374
343 this_leaf->can_disable = true; 375 /*
344 this_leaf->l3_indices = amd_calc_l3_indices(); 376 * Strictly speaking, the amount in @size below is leaked since it is
377 * never freed but this is done only on shutdown so it doesn't matter.
378 */
379 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
381
382 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches)
384 return;
385 }
386
387 node = amd_get_nb_id(smp_processor_id());
388
389 if (!l3_caches[node]) {
390 l3_caches[node] = amd_init_l3_cache(node);
391 l3_caches[node]->can_disable = true;
392 }
393
394 WARN_ON(!l3_caches[node]);
395
396 this_leaf->l3 = l3_caches[node];
345} 397}
346 398
347static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 399static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
348 unsigned int index) 400 unsigned int slot)
349{ 401{
350 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 402 struct pci_dev *dev = this_leaf->l3->dev;
351 int node = amd_get_nb_id(cpu);
352 struct pci_dev *dev = node_to_k8_nb_misc(node);
353 unsigned int reg = 0; 403 unsigned int reg = 0;
354 404
355 if (!this_leaf->can_disable) 405 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
356 return -EINVAL; 406 return -EINVAL;
357 407
358 if (!dev) 408 if (!dev)
359 return -EINVAL; 409 return -EINVAL;
360 410
361 pci_read_config_dword(dev, 0x1BC + index * 4, &reg); 411 pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
362 return sprintf(buf, "0x%08x\n", reg); 412 return sprintf(buf, "0x%08x\n", reg);
363} 413}
364 414
365#define SHOW_CACHE_DISABLE(index) \ 415#define SHOW_CACHE_DISABLE(slot) \
366static ssize_t \ 416static ssize_t \
367show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ 417show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
368{ \ 418{ \
369 return show_cache_disable(this_leaf, buf, index); \ 419 return show_cache_disable(this_leaf, buf, slot); \
370} 420}
371SHOW_CACHE_DISABLE(0) 421SHOW_CACHE_DISABLE(0)
372SHOW_CACHE_DISABLE(1) 422SHOW_CACHE_DISABLE(1)
373 423
424static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
425 unsigned slot, unsigned long idx)
426{
427 int i;
428
429 idx |= BIT(30);
430
431 /*
432 * disable index in all 4 subcaches
433 */
434 for (i = 0; i < 4; i++) {
435 u32 reg = idx | (i << 20);
436
437 if (!l3->subcaches[i])
438 continue;
439
440 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
441
442 /*
443 * We need to WBINVD on a core of the node containing the L3
444 * cache whose indices we disable; therefore a simple wbinvd()
445 * is not sufficient.
446 */
447 wbinvd_on_cpu(cpu);
448
449 reg |= BIT(31);
450 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
451 }
452}
453
454
374static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 455static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
375 const char *buf, size_t count, unsigned int index) 456 const char *buf, size_t count,
457 unsigned int slot)
376{ 458{
459 struct pci_dev *dev = this_leaf->l3->dev;
377 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 460 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
378 int node = amd_get_nb_id(cpu);
379 struct pci_dev *dev = node_to_k8_nb_misc(node);
380 unsigned long val = 0; 461 unsigned long val = 0;
381 462
382#define SUBCACHE_MASK (3UL << 20) 463#define SUBCACHE_MASK (3UL << 20)
383#define SUBCACHE_INDEX 0xfff 464#define SUBCACHE_INDEX 0xfff
384 465
385 if (!this_leaf->can_disable) 466 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
386 return -EINVAL; 467 return -EINVAL;
387 468
388 if (!capable(CAP_SYS_ADMIN)) 469 if (!capable(CAP_SYS_ADMIN))
@@ -396,26 +477,20 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
396 477
397 /* do not allow writes outside of allowed bits */ 478 /* do not allow writes outside of allowed bits */
398 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 479 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
399 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) 480 ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
400 return -EINVAL; 481 return -EINVAL;
401 482
402 val |= BIT(30); 483 amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
403 pci_write_config_dword(dev, 0x1BC + index * 4, val); 484
404 /*
405 * We need to WBINVD on a core on the node containing the L3 cache which
406 * indices we disable therefore a simple wbinvd() is not sufficient.
407 */
408 wbinvd_on_cpu(cpu);
409 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
410 return count; 485 return count;
411} 486}
412 487
413#define STORE_CACHE_DISABLE(index) \ 488#define STORE_CACHE_DISABLE(slot) \
414static ssize_t \ 489static ssize_t \
415store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ 490store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
416 const char *buf, size_t count) \ 491 const char *buf, size_t count) \
417{ \ 492{ \
418 return store_cache_disable(this_leaf, buf, count, index); \ 493 return store_cache_disable(this_leaf, buf, count, slot); \
419} 494}
420STORE_CACHE_DISABLE(0) 495STORE_CACHE_DISABLE(0)
421STORE_CACHE_DISABLE(1) 496STORE_CACHE_DISABLE(1)
@@ -443,8 +518,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
443 518
444 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 519 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
445 amd_cpuid4(index, &eax, &ebx, &ecx); 520 amd_cpuid4(index, &eax, &ebx, &ecx);
446 if (boot_cpu_data.x86 >= 0x10) 521 amd_check_l3_disable(index, this_leaf);
447 amd_check_l3_disable(index, this_leaf);
448 } else { 522 } else {
449 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 523 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
450 } 524 }
@@ -701,6 +775,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
701 for (i = 0; i < num_cache_leaves; i++) 775 for (i = 0; i < num_cache_leaves; i++)
702 cache_remove_shared_cpu_map(cpu, i); 776 cache_remove_shared_cpu_map(cpu, i);
703 777
778 kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
704 kfree(per_cpu(ici_cpuid4_info, cpu)); 779 kfree(per_cpu(ici_cpuid4_info, cpu));
705 per_cpu(ici_cpuid4_info, cpu) = NULL; 780 per_cpu(ici_cpuid4_info, cpu) = NULL;
706} 781}
@@ -985,7 +1060,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
985 1060
986 this_leaf = CPUID4_INFO_IDX(cpu, i); 1061 this_leaf = CPUID4_INFO_IDX(cpu, i);
987 1062
988 if (this_leaf->can_disable) 1063 if (this_leaf->l3 && this_leaf->l3->can_disable)
989 ktype_cache.default_attrs = default_l3_attrs; 1064 ktype_cache.default_attrs = default_l3_attrs;
990 else 1065 else
991 ktype_cache.default_attrs = default_attrs; 1066 ktype_cache.default_attrs = default_attrs;
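
A short sketch (not part of the patch) of how the register value written by the new amd_l3_disable_index() is assembled before it goes to offset 0x1BC + slot*4 of the northbridge misc device: bit 30 requests the index disable, bits 20-21 select the subcache, and a second write with bit 31 set commits it after the WBINVD. The index and subcache counts below are made-up example values.

/* Illustrative sketch of the per-subcache register values. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned long idx = 0x7af;			/* example cache index */
	unsigned int subcaches[4] = { 1, 1, 2, 2 };	/* example, from reg 0x1C4 */
	int i;

	idx |= 1UL << 30;				/* index-disable request */

	for (i = 0; i < 4; i++) {
		uint32_t reg = idx | (i << 20);		/* select subcache i */

		if (!subcaches[i])
			continue;
		/* first write arms the disable, then WBINVD on a core of the
		 * node, then the same value with bit 31 set commits it */
		printf("slot write: 0x%08x then 0x%08x\n", reg, reg | (1u << 31));
	}
	return 0;
}
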
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..7a355ddcc64b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
539 struct mce m; 539 struct mce m;
540 int i; 540 int i;
541 541
542 __get_cpu_var(mce_poll_count)++; 542 percpu_inc(mce_poll_count);
543 543
544 mce_setup(&m); 544 mce_setup(&m);
545 545
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
934 934
935 atomic_inc(&mce_entry); 935 atomic_inc(&mce_entry);
936 936
937 __get_cpu_var(mce_exception_count)++; 937 percpu_inc(mce_exception_count);
938 938
939 if (notify_die(DIE_NMI, "machine check", regs, error_code, 939 if (notify_die(DIE_NMI, "machine check", regs, error_code,
940 18, SIGKILL) == NOTIFY_STOP) 940 18, SIGKILL) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..16f41bbe46b6
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,55 @@
1/*
2 * HyperV Detection code.
3 *
4 * Copyright (C) 2010, Novell, Inc.
5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 */
12
13#include <linux/types.h>
14#include <linux/module.h>
15#include <asm/processor.h>
16#include <asm/hypervisor.h>
17#include <asm/hyperv.h>
18#include <asm/mshyperv.h>
19
20struct ms_hyperv_info ms_hyperv;
21
22static bool __init ms_hyperv_platform(void)
23{
24 u32 eax;
25 u32 hyp_signature[3];
26
27 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
28 return false;
29
30 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
31 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
32
33 return eax >= HYPERV_CPUID_MIN &&
34 eax <= HYPERV_CPUID_MAX &&
35 !memcmp("Microsoft Hv", hyp_signature, 12);
36}
37
38static void __init ms_hyperv_init_platform(void)
39{
40 /*
41 * Extract the features and hints
42 */
43 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
44 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
45
46 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
47 ms_hyperv.features, ms_hyperv.hints);
48}
49
50const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
51 .name = "Microsoft HyperV",
52 .detect = ms_hyperv_platform,
53 .init_platform = ms_hyperv_init_platform,
54};
55EXPORT_SYMBOL(x86_hyper_ms_hyperv);
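
The same probe, done from user space for illustration (not part of the patch). The numeric leaves are the standard Hyper-V ones: 0x40000000 returns the vendor signature and maximum leaf, 0x40000003 the feature bits, 0x40000004 the enlightenment hints; the CPUID.1 ECX bit 31 check corresponds to X86_FEATURE_HYPERVISOR.

/* Illustrative user-space sketch of the Hyper-V CPUID probe. */
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, sig[3];

	__cpuid(1, eax, ebx, ecx, edx);
	if (!(ecx & (1u << 31))) {		/* hypervisor-present bit */
		puts("not running under a hypervisor");
		return 0;
	}

	__cpuid(0x40000000, eax, sig[0], sig[1], sig[2]);
	if (memcmp(sig, "Microsoft Hv", 12) != 0) {
		puts("hypervisor is not Hyper-V");
		return 0;
	}

	printf("Hyper-V, max leaf 0x%x\n", eax);
	__cpuid(0x40000003, eax, ebx, ecx, edx);
	printf("features 0x%x\n", eax);
	__cpuid(0x40000004, eax, ebx, ecx, edx);
	printf("hints    0x%x\n", eax);
	return 0;
}
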
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf8..fd4db0db3708 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33 33
34static u64 perf_event_mask __read_mostly; 34#if 0
35#undef wrmsrl
36#define wrmsrl(msr, val) \
37do { \
38 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
39 (unsigned long)(val)); \
40 native_write_msr((msr), (u32)((u64)(val)), \
41 (u32)((u64)(val) >> 32)); \
42} while (0)
43#endif
35 44
36/* The maximal number of PEBS events: */ 45/*
37#define MAX_PEBS_EVENTS 4 46 * Best-effort, GUP-based copy_from_user() that assumes IRQ or NMI context
47 */
48static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{
51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0;
54 struct page *page;
55 void *map;
56 int ret;
38 57
39/* The size of a BTS record in bytes: */ 58 do {
40#define BTS_RECORD_SIZE 24 59 ret = __get_user_pages_fast(addr, 1, 0, &page);
60 if (!ret)
61 break;
41 62
42/* The size of a per-cpu BTS buffer in bytes: */ 63 offset = addr & (PAGE_SIZE - 1);
43#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) 64 size = min(PAGE_SIZE - offset, n - len);
44 65
45/* The BTS overflow threshold in bytes from the end of the buffer: */ 66 map = kmap_atomic(page, type);
46#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) 67 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type);
69 put_page(page);
47 70
71 len += size;
72 to += size;
73 addr += size;
48 74
49/* 75 } while (len < n);
50 * Bits in the debugctlmsr controlling branch tracing.
51 */
52#define X86_DEBUGCTL_TR (1 << 6)
53#define X86_DEBUGCTL_BTS (1 << 7)
54#define X86_DEBUGCTL_BTINT (1 << 8)
55#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
56#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
57 76
58/* 77 return len;
59 * A debug store configuration. 78}
60 *
61 * We only support architectures that use 64bit fields.
62 */
63struct debug_store {
64 u64 bts_buffer_base;
65 u64 bts_index;
66 u64 bts_absolute_maximum;
67 u64 bts_interrupt_threshold;
68 u64 pebs_buffer_base;
69 u64 pebs_index;
70 u64 pebs_absolute_maximum;
71 u64 pebs_interrupt_threshold;
72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
73};
74 79
75struct event_constraint { 80struct event_constraint {
76 union { 81 union {
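
The new copy_from_user_nmi() above walks the user buffer one page fragment at a time because each iteration can only map a single page via __get_user_pages_fast()/kmap_atomic(). A stand-alone sketch of just that chunking loop (not part of the patch; plain memcpy stands in for the page mapping):

/* Illustrative user-space sketch of the page-at-a-time chunking. */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL

static unsigned long copy_chunked(void *to, const void *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	unsigned long size, len = 0;

	do {
		/* never cross a page boundary in a single copy */
		offset = addr & (PAGE_SIZE - 1);
		size = PAGE_SIZE - offset;
		if (size > n - len)
			size = n - len;

		memcpy((char *)to + len, (const char *)addr, size);

		len += size;
		addr += size;
	} while (len < n);

	return len;
}

int main(void)
{
	char src[10000], dst[10000];

	memset(src, 'x', sizeof(src));
	printf("copied %lu bytes\n", copy_chunked(dst, src, sizeof(src)));
	return 0;
}
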
@@ -89,18 +94,41 @@ struct amd_nb {
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90}; 95};
91 96
97#define MAX_LBR_ENTRIES 16
98
92struct cpu_hw_events { 99struct cpu_hw_events {
100 /*
101 * Generic x86 PMC bits
102 */
93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
95 unsigned long interrupts;
96 int enabled; 105 int enabled;
97 struct debug_store *ds;
98 106
99 int n_events; 107 int n_events;
100 int n_added; 108 int n_added;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 109 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX]; 110 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 111 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
112
113 unsigned int group_flag;
114
115 /*
116 * Intel DebugStore bits
117 */
118 struct debug_store *ds;
119 u64 pebs_enabled;
120
121 /*
122 * Intel LBR bits
123 */
124 int lbr_users;
125 void *lbr_context;
126 struct perf_branch_stack lbr_stack;
127 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
128
129 /*
130 * AMD specific bits
131 */
104 struct amd_nb *amd_nb; 132 struct amd_nb *amd_nb;
105}; 133};
106 134
@@ -114,44 +142,75 @@ struct cpu_hw_events {
114#define EVENT_CONSTRAINT(c, n, m) \ 142#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 143 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116 144
145/*
146 * Constraint on the Event code.
147 */
117#define INTEL_EVENT_CONSTRAINT(c, n) \ 148#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) 149 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
119 150
151/*
152 * Constraint on the Event code + UMask + fixed-mask
153 *
154 * Filter mask used to validate fixed counter events.
155 * The following filters disqualify an event from the fixed counters:
156 * - inv
157 * - edge
158 * - cnt-mask
159 * The other filters are supported by fixed counters.
160 * The any-thread option is supported starting with v3.
161 */
120#define FIXED_EVENT_CONSTRAINT(c, n) \ 162#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) 163 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
164
165/*
166 * Constraint on the Event code + UMask
167 */
168#define PEBS_EVENT_CONSTRAINT(c, n) \
169 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
122 170
123#define EVENT_CONSTRAINT_END \ 171#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0) 172 EVENT_CONSTRAINT(0, 0, 0)
125 173
126#define for_each_event_constraint(e, c) \ 174#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++) 175 for ((e) = (c); (e)->weight; (e)++)
176
177union perf_capabilities {
178 struct {
179 u64 lbr_format : 6;
180 u64 pebs_trap : 1;
181 u64 pebs_arch_reg : 1;
182 u64 pebs_format : 4;
183 u64 smm_freeze : 1;
184 };
185 u64 capabilities;
186};
128 187
129/* 188/*
130 * struct x86_pmu - generic x86 pmu 189 * struct x86_pmu - generic x86 pmu
131 */ 190 */
132struct x86_pmu { 191struct x86_pmu {
192 /*
193 * Generic x86 PMC bits
194 */
133 const char *name; 195 const char *name;
134 int version; 196 int version;
135 int (*handle_irq)(struct pt_regs *); 197 int (*handle_irq)(struct pt_regs *);
136 void (*disable_all)(void); 198 void (*disable_all)(void);
137 void (*enable_all)(void); 199 void (*enable_all)(int added);
138 void (*enable)(struct perf_event *); 200 void (*enable)(struct perf_event *);
139 void (*disable)(struct perf_event *); 201 void (*disable)(struct perf_event *);
202 int (*hw_config)(struct perf_event *event);
203 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
140 unsigned eventsel; 204 unsigned eventsel;
141 unsigned perfctr; 205 unsigned perfctr;
142 u64 (*event_map)(int); 206 u64 (*event_map)(int);
143 u64 (*raw_event)(u64);
144 int max_events; 207 int max_events;
145 int num_events; 208 int num_counters;
146 int num_events_fixed; 209 int num_counters_fixed;
147 int event_bits; 210 int cntval_bits;
148 u64 event_mask; 211 u64 cntval_mask;
149 int apic; 212 int apic;
150 u64 max_period; 213 u64 max_period;
151 u64 intel_ctrl;
152 void (*enable_bts)(u64 config);
153 void (*disable_bts)(void);
154
155 struct event_constraint * 214 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc, 215 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event); 216 struct perf_event *event);
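
The union perf_capabilities bit layout above can be checked against the hardware from user space. A sketch (not part of the patch) that assumes the usual MSR_IA32_PERF_CAPABILITIES address 0x345, LSB-first bitfield layout, a loaded msr driver and root:

/* Illustrative user-space sketch: decode the perf capabilities bits. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t cap = 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &cap, sizeof(cap), 0x345) != sizeof(cap))
		return 1;

	printf("lbr_format    : %llu\n", (unsigned long long)(cap & 0x3f));
	printf("pebs_trap     : %llu\n", (unsigned long long)((cap >> 6) & 1));
	printf("pebs_arch_reg : %llu\n", (unsigned long long)((cap >> 7) & 1));
	printf("pebs_format   : %llu\n", (unsigned long long)((cap >> 8) & 0xf));
	printf("smm_freeze    : %llu\n", (unsigned long long)((cap >> 12) & 1));
	close(fd);
	return 0;
}
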
@@ -159,11 +218,32 @@ struct x86_pmu {
159 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 218 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
160 struct perf_event *event); 219 struct perf_event *event);
161 struct event_constraint *event_constraints; 220 struct event_constraint *event_constraints;
221 void (*quirks)(void);
162 222
163 int (*cpu_prepare)(int cpu); 223 int (*cpu_prepare)(int cpu);
164 void (*cpu_starting)(int cpu); 224 void (*cpu_starting)(int cpu);
165 void (*cpu_dying)(int cpu); 225 void (*cpu_dying)(int cpu);
166 void (*cpu_dead)(int cpu); 226 void (*cpu_dead)(int cpu);
227
228 /*
229 * Intel Arch Perfmon v2+
230 */
231 u64 intel_ctrl;
232 union perf_capabilities intel_cap;
233
234 /*
235 * Intel DebugStore bits
236 */
237 int bts, pebs;
238 int pebs_record_size;
239 void (*drain_pebs)(struct pt_regs *regs);
240 struct event_constraint *pebs_constraints;
241
242 /*
243 * Intel LBR
244 */
245 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
246 int lbr_nr; /* hardware stack size */
167}; 247};
168 248
169static struct x86_pmu x86_pmu __read_mostly; 249static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +278,7 @@ static u64
198x86_perf_event_update(struct perf_event *event) 278x86_perf_event_update(struct perf_event *event)
199{ 279{
200 struct hw_perf_event *hwc = &event->hw; 280 struct hw_perf_event *hwc = &event->hw;
201 int shift = 64 - x86_pmu.event_bits; 281 int shift = 64 - x86_pmu.cntval_bits;
202 u64 prev_raw_count, new_raw_count; 282 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx; 283 int idx = hwc->idx;
204 s64 delta; 284 s64 delta;
@@ -241,33 +321,32 @@ again:
241static atomic_t active_events; 321static atomic_t active_events;
242static DEFINE_MUTEX(pmc_reserve_mutex); 322static DEFINE_MUTEX(pmc_reserve_mutex);
243 323
324#ifdef CONFIG_X86_LOCAL_APIC
325
244static bool reserve_pmc_hardware(void) 326static bool reserve_pmc_hardware(void)
245{ 327{
246#ifdef CONFIG_X86_LOCAL_APIC
247 int i; 328 int i;
248 329
249 if (nmi_watchdog == NMI_LOCAL_APIC) 330 if (nmi_watchdog == NMI_LOCAL_APIC)
250 disable_lapic_nmi_watchdog(); 331 disable_lapic_nmi_watchdog();
251 332
252 for (i = 0; i < x86_pmu.num_events; i++) { 333 for (i = 0; i < x86_pmu.num_counters; i++) {
253 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
254 goto perfctr_fail; 335 goto perfctr_fail;
255 } 336 }
256 337
257 for (i = 0; i < x86_pmu.num_events; i++) { 338 for (i = 0; i < x86_pmu.num_counters; i++) {
258 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 339 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
259 goto eventsel_fail; 340 goto eventsel_fail;
260 } 341 }
261#endif
262 342
263 return true; 343 return true;
264 344
265#ifdef CONFIG_X86_LOCAL_APIC
266eventsel_fail: 345eventsel_fail:
267 for (i--; i >= 0; i--) 346 for (i--; i >= 0; i--)
268 release_evntsel_nmi(x86_pmu.eventsel + i); 347 release_evntsel_nmi(x86_pmu.eventsel + i);
269 348
270 i = x86_pmu.num_events; 349 i = x86_pmu.num_counters;
271 350
272perfctr_fail: 351perfctr_fail:
273 for (i--; i >= 0; i--) 352 for (i--; i >= 0; i--)
@@ -277,128 +356,36 @@ perfctr_fail:
277 enable_lapic_nmi_watchdog(); 356 enable_lapic_nmi_watchdog();
278 357
279 return false; 358 return false;
280#endif
281} 359}
282 360
283static void release_pmc_hardware(void) 361static void release_pmc_hardware(void)
284{ 362{
285#ifdef CONFIG_X86_LOCAL_APIC
286 int i; 363 int i;
287 364
288 for (i = 0; i < x86_pmu.num_events; i++) { 365 for (i = 0; i < x86_pmu.num_counters; i++) {
289 release_perfctr_nmi(x86_pmu.perfctr + i); 366 release_perfctr_nmi(x86_pmu.perfctr + i);
290 release_evntsel_nmi(x86_pmu.eventsel + i); 367 release_evntsel_nmi(x86_pmu.eventsel + i);
291 } 368 }
292 369
293 if (nmi_watchdog == NMI_LOCAL_APIC) 370 if (nmi_watchdog == NMI_LOCAL_APIC)
294 enable_lapic_nmi_watchdog(); 371 enable_lapic_nmi_watchdog();
295#endif
296}
297
298static inline bool bts_available(void)
299{
300 return x86_pmu.enable_bts != NULL;
301} 372}
302 373
303static void init_debug_store_on_cpu(int cpu) 374#else
304{
305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
306
307 if (!ds)
308 return;
309
310 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
311 (u32)((u64)(unsigned long)ds),
312 (u32)((u64)(unsigned long)ds >> 32));
313}
314
315static void fini_debug_store_on_cpu(int cpu)
316{
317 if (!per_cpu(cpu_hw_events, cpu).ds)
318 return;
319
320 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
321}
322
323static void release_bts_hardware(void)
324{
325 int cpu;
326
327 if (!bts_available())
328 return;
329
330 get_online_cpus();
331
332 for_each_online_cpu(cpu)
333 fini_debug_store_on_cpu(cpu);
334
335 for_each_possible_cpu(cpu) {
336 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
337
338 if (!ds)
339 continue;
340
341 per_cpu(cpu_hw_events, cpu).ds = NULL;
342
343 kfree((void *)(unsigned long)ds->bts_buffer_base);
344 kfree(ds);
345 }
346
347 put_online_cpus();
348}
349
350static int reserve_bts_hardware(void)
351{
352 int cpu, err = 0;
353
354 if (!bts_available())
355 return 0;
356
357 get_online_cpus();
358
359 for_each_possible_cpu(cpu) {
360 struct debug_store *ds;
361 void *buffer;
362
363 err = -ENOMEM;
364 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
365 if (unlikely(!buffer))
366 break;
367
368 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
369 if (unlikely(!ds)) {
370 kfree(buffer);
371 break;
372 }
373
374 ds->bts_buffer_base = (u64)(unsigned long)buffer;
375 ds->bts_index = ds->bts_buffer_base;
376 ds->bts_absolute_maximum =
377 ds->bts_buffer_base + BTS_BUFFER_SIZE;
378 ds->bts_interrupt_threshold =
379 ds->bts_absolute_maximum - BTS_OVFL_TH;
380
381 per_cpu(cpu_hw_events, cpu).ds = ds;
382 err = 0;
383 }
384 375
385 if (err) 376static bool reserve_pmc_hardware(void) { return true; }
386 release_bts_hardware(); 377static void release_pmc_hardware(void) {}
387 else {
388 for_each_online_cpu(cpu)
389 init_debug_store_on_cpu(cpu);
390 }
391 378
392 put_online_cpus(); 379#endif
393 380
394 return err; 381static int reserve_ds_buffers(void);
395} 382static void release_ds_buffers(void);
396 383
397static void hw_perf_event_destroy(struct perf_event *event) 384static void hw_perf_event_destroy(struct perf_event *event)
398{ 385{
399 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 386 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
400 release_pmc_hardware(); 387 release_pmc_hardware();
401 release_bts_hardware(); 388 release_ds_buffers();
402 mutex_unlock(&pmc_reserve_mutex); 389 mutex_unlock(&pmc_reserve_mutex);
403 } 390 }
404} 391}
@@ -441,54 +428,11 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
441 return 0; 428 return 0;
442} 429}
443 430
444/* 431static int x86_setup_perfctr(struct perf_event *event)
445 * Setup the hardware configuration for a given attr_type
446 */
447static int __hw_perf_event_init(struct perf_event *event)
448{ 432{
449 struct perf_event_attr *attr = &event->attr; 433 struct perf_event_attr *attr = &event->attr;
450 struct hw_perf_event *hwc = &event->hw; 434 struct hw_perf_event *hwc = &event->hw;
451 u64 config; 435 u64 config;
452 int err;
453
454 if (!x86_pmu_initialized())
455 return -ENODEV;
456
457 err = 0;
458 if (!atomic_inc_not_zero(&active_events)) {
459 mutex_lock(&pmc_reserve_mutex);
460 if (atomic_read(&active_events) == 0) {
461 if (!reserve_pmc_hardware())
462 err = -EBUSY;
463 else
464 err = reserve_bts_hardware();
465 }
466 if (!err)
467 atomic_inc(&active_events);
468 mutex_unlock(&pmc_reserve_mutex);
469 }
470 if (err)
471 return err;
472
473 event->destroy = hw_perf_event_destroy;
474
475 /*
476 * Generate PMC IRQs:
477 * (keep 'enabled' bit clear for now)
478 */
479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
480
481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
484
485 /*
486 * Count user and OS events unless requested not to.
487 */
488 if (!attr->exclude_user)
489 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
490 if (!attr->exclude_kernel)
491 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
492 436
493 if (!hwc->sample_period) { 437 if (!hwc->sample_period) {
494 hwc->sample_period = x86_pmu.max_period; 438 hwc->sample_period = x86_pmu.max_period;
@@ -505,16 +449,8 @@ static int __hw_perf_event_init(struct perf_event *event)
505 return -EOPNOTSUPP; 449 return -EOPNOTSUPP;
506 } 450 }
507 451
508 /* 452 if (attr->type == PERF_TYPE_RAW)
509 * Raw hw_event type provide the config in the hw_event structure
510 */
511 if (attr->type == PERF_TYPE_RAW) {
512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
516 return 0; 453 return 0;
517 }
518 454
519 if (attr->type == PERF_TYPE_HW_CACHE) 455 if (attr->type == PERF_TYPE_HW_CACHE)
520 return set_ext_hw_attr(hwc, attr); 456 return set_ext_hw_attr(hwc, attr);
@@ -539,11 +475,11 @@ static int __hw_perf_event_init(struct perf_event *event)
539 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 475 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
540 (hwc->sample_period == 1)) { 476 (hwc->sample_period == 1)) {
541 /* BTS is not supported by this architecture. */ 477 /* BTS is not supported by this architecture. */
542 if (!bts_available()) 478 if (!x86_pmu.bts)
543 return -EOPNOTSUPP; 479 return -EOPNOTSUPP;
544 480
545 /* BTS is currently only allowed for user-mode. */ 481 /* BTS is currently only allowed for user-mode. */
546 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 482 if (!attr->exclude_kernel)
547 return -EOPNOTSUPP; 483 return -EOPNOTSUPP;
548 } 484 }
549 485
@@ -552,12 +488,87 @@ static int __hw_perf_event_init(struct perf_event *event)
552 return 0; 488 return 0;
553} 489}
554 490
491static int x86_pmu_hw_config(struct perf_event *event)
492{
493 if (event->attr.precise_ip) {
494 int precise = 0;
495
496 /* Support for constant skid */
497 if (x86_pmu.pebs)
498 precise++;
499
500 /* Support for IP fixup */
501 if (x86_pmu.lbr_nr)
502 precise++;
503
504 if (event->attr.precise_ip > precise)
505 return -EOPNOTSUPP;
506 }
507
508 /*
509 * Generate PMC IRQs:
510 * (keep 'enabled' bit clear for now)
511 */
512 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
513
514 /*
515 * Count user and OS events unless requested not to
516 */
517 if (!event->attr.exclude_user)
518 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
519 if (!event->attr.exclude_kernel)
520 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
521
522 if (event->attr.type == PERF_TYPE_RAW)
523 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
524
525 return x86_setup_perfctr(event);
526}
527
528/*
529 * Setup the hardware configuration for a given attr_type
530 */
531static int __hw_perf_event_init(struct perf_event *event)
532{
533 int err;
534
535 if (!x86_pmu_initialized())
536 return -ENODEV;
537
538 err = 0;
539 if (!atomic_inc_not_zero(&active_events)) {
540 mutex_lock(&pmc_reserve_mutex);
541 if (atomic_read(&active_events) == 0) {
542 if (!reserve_pmc_hardware())
543 err = -EBUSY;
544 else {
545 err = reserve_ds_buffers();
546 if (err)
547 release_pmc_hardware();
548 }
549 }
550 if (!err)
551 atomic_inc(&active_events);
552 mutex_unlock(&pmc_reserve_mutex);
553 }
554 if (err)
555 return err;
556
557 event->destroy = hw_perf_event_destroy;
558
559 event->hw.idx = -1;
560 event->hw.last_cpu = -1;
561 event->hw.last_tag = ~0ULL;
562
563 return x86_pmu.hw_config(event);
564}
565
555static void x86_pmu_disable_all(void) 566static void x86_pmu_disable_all(void)
556{ 567{
557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 568 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
558 int idx; 569 int idx;
559 570
560 for (idx = 0; idx < x86_pmu.num_events; idx++) { 571 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
561 u64 val; 572 u64 val;
562 573
563 if (!test_bit(idx, cpuc->active_mask)) 574 if (!test_bit(idx, cpuc->active_mask))
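
From user space the new precise_ip handling is reached through perf_event_open(). A sketch (not part of the patch) that asks for a precise cycles event on a kernel carrying this series: precise_ip=1 needs PEBS, precise_ip=2 additionally needs the LBR-based IP fixup, and higher values are rejected with EOPNOTSUPP per the check above.

/* Illustrative user-space sketch: open a precise (PEBS-assisted) event. */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.precise_ip = 1;		/* 2 would also require the LBR fixup */
	attr.exclude_kernel = 1;

	/* measure the calling thread on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");	/* EOPNOTSUPP if PEBS is absent */
		return 1;
	}
	printf("precise cycles event opened, fd=%d\n", fd);
	close(fd);
	return 0;
}
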
@@ -587,12 +598,12 @@ void hw_perf_disable(void)
587 x86_pmu.disable_all(); 598 x86_pmu.disable_all();
588} 599}
589 600
590static void x86_pmu_enable_all(void) 601static void x86_pmu_enable_all(int added)
591{ 602{
592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 603 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
593 int idx; 604 int idx;
594 605
595 for (idx = 0; idx < x86_pmu.num_events; idx++) { 606 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
596 struct perf_event *event = cpuc->events[idx]; 607 struct perf_event *event = cpuc->events[idx];
597 u64 val; 608 u64 val;
598 609
@@ -667,14 +678,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
667 * assign events to counters starting with most 678 * assign events to counters starting with most
668 * constrained events. 679 * constrained events.
669 */ 680 */
670 wmax = x86_pmu.num_events; 681 wmax = x86_pmu.num_counters;
671 682
672 /* 683 /*
673 * when fixed event counters are present, 684 * when fixed event counters are present,
674 * wmax is incremented by 1 to account 685 * wmax is incremented by 1 to account
675 * for one more choice 686 * for one more choice
676 */ 687 */
677 if (x86_pmu.num_events_fixed) 688 if (x86_pmu.num_counters_fixed)
678 wmax++; 689 wmax++;
679 690
680 for (w = 1, num = n; num && w <= wmax; w++) { 691 for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +735,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
724 struct perf_event *event; 735 struct perf_event *event;
725 int n, max_count; 736 int n, max_count;
726 737
727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; 738 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
728 739
729 /* current number of events already accepted */ 740 /* current number of events already accepted */
730 n = cpuc->n_events; 741 n = cpuc->n_events;
@@ -795,7 +806,7 @@ void hw_perf_enable(void)
795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 806 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
796 struct perf_event *event; 807 struct perf_event *event;
797 struct hw_perf_event *hwc; 808 struct hw_perf_event *hwc;
798 int i; 809 int i, added = cpuc->n_added;
799 810
800 if (!x86_pmu_initialized()) 811 if (!x86_pmu_initialized())
801 return; 812 return;
@@ -847,19 +858,20 @@ void hw_perf_enable(void)
847 cpuc->enabled = 1; 858 cpuc->enabled = 1;
848 barrier(); 859 barrier();
849 860
850 x86_pmu.enable_all(); 861 x86_pmu.enable_all(added);
851} 862}
852 863
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) 864static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
865 u64 enable_mask)
854{ 866{
855 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 867 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857} 868}
858 869
859static inline void x86_pmu_disable_event(struct perf_event *event) 870static inline void x86_pmu_disable_event(struct perf_event *event)
860{ 871{
861 struct hw_perf_event *hwc = &event->hw; 872 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); 873
874 wrmsrl(hwc->config_base + hwc->idx, hwc->config);
863} 875}
864 876
865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 877static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -874,7 +886,7 @@ x86_perf_event_set_period(struct perf_event *event)
874 struct hw_perf_event *hwc = &event->hw; 886 struct hw_perf_event *hwc = &event->hw;
875 s64 left = atomic64_read(&hwc->period_left); 887 s64 left = atomic64_read(&hwc->period_left);
876 s64 period = hwc->sample_period; 888 s64 period = hwc->sample_period;
877 int err, ret = 0, idx = hwc->idx; 889 int ret = 0, idx = hwc->idx;
878 890
879 if (idx == X86_PMC_IDX_FIXED_BTS) 891 if (idx == X86_PMC_IDX_FIXED_BTS)
880 return 0; 892 return 0;
@@ -912,8 +924,8 @@ x86_perf_event_set_period(struct perf_event *event)
912 */ 924 */
913 atomic64_set(&hwc->prev_count, (u64)-left); 925 atomic64_set(&hwc->prev_count, (u64)-left);
914 926
915 err = checking_wrmsrl(hwc->event_base + idx, 927 wrmsrl(hwc->event_base + idx,
916 (u64)(-left) & x86_pmu.event_mask); 928 (u64)(-left) & x86_pmu.cntval_mask);
917 929
918 perf_event_update_userpage(event); 930 perf_event_update_userpage(event);
919 931
@@ -924,7 +936,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
924{ 936{
925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 937 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
926 if (cpuc->enabled) 938 if (cpuc->enabled)
927 __x86_pmu_enable_event(&event->hw); 939 __x86_pmu_enable_event(&event->hw,
940 ARCH_PERFMON_EVENTSEL_ENABLE);
928} 941}
929 942
930/* 943/*
@@ -950,7 +963,15 @@ static int x86_pmu_enable(struct perf_event *event)
950 if (n < 0) 963 if (n < 0)
951 return n; 964 return n;
952 965
953 ret = x86_schedule_events(cpuc, n, assign); 966 /*
967 * If a group event scheduling transaction was started,
968 * skip the schedulability test here; it will be performed
969 * at commit time (->commit_txn) as a whole
970 */
971 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
972 goto out;
973
974 ret = x86_pmu.schedule_events(cpuc, n, assign);
954 if (ret) 975 if (ret)
955 return ret; 976 return ret;
956 /* 977 /*
@@ -959,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
959 */ 980 */
960 memcpy(cpuc->assign, assign, n*sizeof(int)); 981 memcpy(cpuc->assign, assign, n*sizeof(int));
961 982
983out:
962 cpuc->n_events = n; 984 cpuc->n_events = n;
963 cpuc->n_added += n - n0; 985 cpuc->n_added += n - n0;
964 986
@@ -991,11 +1013,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
991void perf_event_print_debug(void) 1013void perf_event_print_debug(void)
992{ 1014{
993 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1015 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1016 u64 pebs;
994 struct cpu_hw_events *cpuc; 1017 struct cpu_hw_events *cpuc;
995 unsigned long flags; 1018 unsigned long flags;
996 int cpu, idx; 1019 int cpu, idx;
997 1020
998 if (!x86_pmu.num_events) 1021 if (!x86_pmu.num_counters)
999 return; 1022 return;
1000 1023
1001 local_irq_save(flags); 1024 local_irq_save(flags);
@@ -1008,16 +1031,18 @@ void perf_event_print_debug(void)
1008 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1031 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1009 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1032 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1010 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1033 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1034 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1011 1035
1012 pr_info("\n"); 1036 pr_info("\n");
1013 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1037 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1014 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1038 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1039 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1040 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1041 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1017 } 1042 }
1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1043 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1019 1044
1020 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1045 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1046 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1022 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1047 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1023 1048
@@ -1030,7 +1055,7 @@ void perf_event_print_debug(void)
1030 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1055 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1031 cpu, idx, prev_left); 1056 cpu, idx, prev_left);
1032 } 1057 }
1033 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 1058 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1034 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1059 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1035 1060
1036 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1061 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1095,7 +1120,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1095 1120
1096 cpuc = &__get_cpu_var(cpu_hw_events); 1121 cpuc = &__get_cpu_var(cpu_hw_events);
1097 1122
1098 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1123 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1099 if (!test_bit(idx, cpuc->active_mask)) 1124 if (!test_bit(idx, cpuc->active_mask))
1100 continue; 1125 continue;
1101 1126
@@ -1103,7 +1128,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1103 hwc = &event->hw; 1128 hwc = &event->hw;
1104 1129
1105 val = x86_perf_event_update(event); 1130 val = x86_perf_event_update(event);
1106 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1131 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1107 continue; 1132 continue;
1108 1133
1109 /* 1134 /*
@@ -1146,7 +1171,6 @@ void set_perf_event_pending(void)
1146 1171
1147void perf_events_lapic_init(void) 1172void perf_events_lapic_init(void)
1148{ 1173{
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 if (!x86_pmu.apic || !x86_pmu_initialized()) 1174 if (!x86_pmu.apic || !x86_pmu_initialized())
1151 return; 1175 return;
1152 1176
@@ -1154,7 +1178,6 @@ void perf_events_lapic_init(void)
1154 * Always use NMI for PMU 1178 * Always use NMI for PMU
1155 */ 1179 */
1156 apic_write(APIC_LVTPC, APIC_DM_NMI); 1180 apic_write(APIC_LVTPC, APIC_DM_NMI);
1157#endif
1158} 1181}
1159 1182
1160static int __kprobes 1183static int __kprobes
@@ -1178,9 +1201,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1178 1201
1179 regs = args->regs; 1202 regs = args->regs;
1180 1203
1181#ifdef CONFIG_X86_LOCAL_APIC
1182 apic_write(APIC_LVTPC, APIC_DM_NMI); 1204 apic_write(APIC_LVTPC, APIC_DM_NMI);
1183#endif
1184 /* 1205 /*
1185 * Can't rely on the handled return value to say it was our NMI, two 1206 * Can't rely on the handled return value to say it was our NMI, two
1186 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1207 * events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1217,118 +1238,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1217 return &unconstrained; 1238 return &unconstrained;
1218} 1239}
1219 1240
1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
1222{
1223 int ret = 0;
1224
1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
1228
1229 if (!is_x86_event(event))
1230 ret = event->pmu->enable(event);
1231
1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
1239}
1240
1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
1243{
1244 event->state = PERF_EVENT_STATE_INACTIVE;
1245 event->oncpu = -1;
1246
1247 if (!is_x86_event(event))
1248 event->pmu->disable(event);
1249
1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
1251
1252 if (!is_software_event(event))
1253 cpuctx->active_oncpu--;
1254
1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1256 cpuctx->exclusive = 0;
1257}
1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * called with PMU disabled. If successful and return value 1,
1266 * then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
1299 /*
1300 * copy new assignment, now we know it is possible
1301 * will be used by hw_perf_enable()
1302 */
1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
1308
1309 /*
1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
1313 * this way we ensure caller won't try to enable
1314 * individual events
1315 */
1316 return 1;
1317undo:
1318 x86_event_sched_out(leader, cpuctx);
1319 n0 = 1;
1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1322 x86_event_sched_out(sub, cpuctx);
1323 if (++n0 == n1)
1324 break;
1325 }
1326 }
1327 return ret;
1328}
1329
1330#include "perf_event_amd.c" 1241#include "perf_event_amd.c"
1331#include "perf_event_p6.c" 1242#include "perf_event_p6.c"
1243#include "perf_event_p4.c"
1244#include "perf_event_intel_lbr.c"
1245#include "perf_event_intel_ds.c"
1332#include "perf_event_intel.c" 1246#include "perf_event_intel.c"
1333 1247
1334static int __cpuinit 1248static int __cpuinit
@@ -1402,48 +1316,50 @@ void __init init_hw_perf_events(void)
1402 1316
1403 pr_cont("%s PMU driver.\n", x86_pmu.name); 1317 pr_cont("%s PMU driver.\n", x86_pmu.name);
1404 1318
1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1319 if (x86_pmu.quirks)
1320 x86_pmu.quirks();
1321
1322 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1406 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1323 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1407 x86_pmu.num_events, X86_PMC_MAX_GENERIC); 1324 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1408 x86_pmu.num_events = X86_PMC_MAX_GENERIC; 1325 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1409 } 1326 }
1410 perf_event_mask = (1 << x86_pmu.num_events) - 1; 1327 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1411 perf_max_events = x86_pmu.num_events; 1328 perf_max_events = x86_pmu.num_counters;
1412 1329
1413 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { 1330 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1414 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1331 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1415 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); 1332 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1416 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; 1333 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1417 } 1334 }
1418 1335
1419 perf_event_mask |= 1336 x86_pmu.intel_ctrl |=
1420 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; 1337 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1421 x86_pmu.intel_ctrl = perf_event_mask;
1422 1338
1423 perf_events_lapic_init(); 1339 perf_events_lapic_init();
1424 register_die_notifier(&perf_event_nmi_notifier); 1340 register_die_notifier(&perf_event_nmi_notifier);
1425 1341
1426 unconstrained = (struct event_constraint) 1342 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 1343 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1428 0, x86_pmu.num_events); 1344 0, x86_pmu.num_counters);
1429 1345
1430 if (x86_pmu.event_constraints) { 1346 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) { 1347 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK) 1348 if (c->cmask != X86_RAW_EVENT_MASK)
1433 continue; 1349 continue;
1434 1350
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; 1351 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1436 c->weight += x86_pmu.num_events; 1352 c->weight += x86_pmu.num_counters;
1437 } 1353 }
1438 } 1354 }
1439 1355
1440 pr_info("... version: %d\n", x86_pmu.version); 1356 pr_info("... version: %d\n", x86_pmu.version);
1441 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1357 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1442 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1358 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1443 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); 1359 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1360 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1361 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1446 pr_info("... event mask: %016Lx\n", perf_event_mask); 1362 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1447 1363
1448 perf_cpu_notifier(x86_pmu_notifier); 1364 perf_cpu_notifier(x86_pmu_notifier);
1449} 1365}
@@ -1453,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
1453 x86_perf_event_update(event); 1369 x86_perf_event_update(event);
1454} 1370}
1455 1371
1372/*
1373 * Start group events scheduling transaction
1374 * Set the flag to make pmu::enable() not perform the
1375 * schedulability test, it will be performed at commit time
1376 */
1377static void x86_pmu_start_txn(const struct pmu *pmu)
1378{
1379 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1380
1381 cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
1382}
1383
1384/*
1385 * Stop group events scheduling transaction
1386 * Clear the flag and pmu::enable() will perform the
1387 * schedulability test.
1388 */
1389static void x86_pmu_cancel_txn(const struct pmu *pmu)
1390{
1391 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1392
1393 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
1394}
1395
1396/*
1397 * Commit group events scheduling transaction
1398 * Perform the group schedulability test as a whole
1399 * Return 0 if success
1400 */
1401static int x86_pmu_commit_txn(const struct pmu *pmu)
1402{
1403 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1404 int assign[X86_PMC_IDX_MAX];
1405 int n, ret;
1406
1407 n = cpuc->n_events;
1408
1409 if (!x86_pmu_initialized())
1410 return -EAGAIN;
1411
1412 ret = x86_pmu.schedule_events(cpuc, n, assign);
1413 if (ret)
1414 return ret;
1415
1416 /*
1417 * copy new assignment, now we know it is possible
1418 * will be used by hw_perf_enable()
1419 */
1420 memcpy(cpuc->assign, assign, n*sizeof(int));
1421
1422 return 0;
1423}
1424
1456static const struct pmu pmu = { 1425static const struct pmu pmu = {
1457 .enable = x86_pmu_enable, 1426 .enable = x86_pmu_enable,
1458 .disable = x86_pmu_disable, 1427 .disable = x86_pmu_disable,
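
Note on the transaction hooks added above: they replace the open-coded hw_perf_group_sched_in() that this patch removes. The generic core is expected to bracket group scheduling with start_txn()/commit_txn(), so the per-event schedulability test in x86_pmu_enable() is skipped while PERF_EVENT_TXN_STARTED is set and a single x86_pmu.schedule_events() pass validates the whole group at commit. A minimal sketch of that calling pattern follows; group_sched_in_sketch() and the elided error unwinding are illustrative stand-ins for the generic code in kernel/perf_event.c, not part of this patch.

static int group_sched_in_sketch(struct perf_event *leader,
				 const struct pmu *pmu)
{
	struct perf_event *sub;
	int ret;

	pmu->start_txn(pmu);			/* sets PERF_EVENT_TXN_STARTED */

	ret = pmu->enable(leader);		/* x86_pmu_enable(): collect only, no test */
	if (ret)
		goto cancel;

	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		ret = pmu->enable(sub);
		if (ret)
			goto cancel;		/* undoing partially enabled siblings is elided */
	}

	ret = pmu->commit_txn(pmu);		/* one schedule_events() pass for the whole group */
	if (!ret)
		return 0;
cancel:
	pmu->cancel_txn(pmu);			/* clears PERF_EVENT_TXN_STARTED */
	return ret;
}
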
@@ -1460,9 +1429,38 @@ static const struct pmu pmu = {
1460 .stop = x86_pmu_stop, 1429 .stop = x86_pmu_stop,
1461 .read = x86_pmu_read, 1430 .read = x86_pmu_read,
1462 .unthrottle = x86_pmu_unthrottle, 1431 .unthrottle = x86_pmu_unthrottle,
1432 .start_txn = x86_pmu_start_txn,
1433 .cancel_txn = x86_pmu_cancel_txn,
1434 .commit_txn = x86_pmu_commit_txn,
1463}; 1435};
1464 1436
1465/* 1437/*
1438 * validate that we can schedule this event
1439 */
1440static int validate_event(struct perf_event *event)
1441{
1442 struct cpu_hw_events *fake_cpuc;
1443 struct event_constraint *c;
1444 int ret = 0;
1445
1446 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1447 if (!fake_cpuc)
1448 return -ENOMEM;
1449
1450 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1451
1452 if (!c || !c->weight)
1453 ret = -ENOSPC;
1454
1455 if (x86_pmu.put_event_constraints)
1456 x86_pmu.put_event_constraints(fake_cpuc, event);
1457
1458 kfree(fake_cpuc);
1459
1460 return ret;
1461}
1462
1463/*
1466 * validate a single event group 1464 * validate a single event group
1467 * 1465 *
1468 * validation include: 1466 * validation include:
@@ -1502,7 +1500,7 @@ static int validate_group(struct perf_event *event)
1502 1500
1503 fake_cpuc->n_events = n; 1501 fake_cpuc->n_events = n;
1504 1502
1505 ret = x86_schedule_events(fake_cpuc, n, NULL); 1503 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1506 1504
1507out_free: 1505out_free:
1508 kfree(fake_cpuc); 1506 kfree(fake_cpuc);
@@ -1527,6 +1525,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1527 1525
1528 if (event->group_leader != event) 1526 if (event->group_leader != event)
1529 err = validate_group(event); 1527 err = validate_group(event);
1528 else
1529 err = validate_event(event);
1530 1530
1531 event->pmu = tmp; 1531 event->pmu = tmp;
1532 } 1532 }
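
With the hunk above, hw_perf_event_init() runs validate_event() for a standalone counter and validate_group() when the event joins an existing group. From user space the distinction is simply whether perf_event_open() is passed a group leader fd; a small sketch (the event choices are arbitrary):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_open(struct perf_event_attr *attr, int group_fd)
{
	/* pid 0 = calling task, cpu -1 = any cpu, flags 0 */
	return syscall(__NR_perf_event_open, attr, 0, -1, group_fd, 0);
}

int open_cycles_plus_instructions(void)
{
	struct perf_event_attr attr;
	int leader, member;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;

	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	leader = perf_open(&attr, -1);		/* single event: validate_event() path */
	if (leader < 0)
		return leader;

	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	member = perf_open(&attr, leader);	/* joins the group: validate_group() path */
	if (member < 0)
		return member;

	return leader;
}
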
@@ -1574,8 +1574,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1574{ 1574{
1575 struct perf_callchain_entry *entry = data; 1575 struct perf_callchain_entry *entry = data;
1576 1576
1577 if (reliable) 1577 callchain_store(entry, addr);
1578 callchain_store(entry, addr);
1579} 1578}
1580 1579
1581static const struct stacktrace_ops backtrace_ops = { 1580static const struct stacktrace_ops backtrace_ops = {
@@ -1597,41 +1596,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1596 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1598} 1597}
1599 1598
1600/*
1601 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1602 */
1603static unsigned long
1604copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1605{
1606 unsigned long offset, addr = (unsigned long)from;
1607 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1608 unsigned long size, len = 0;
1609 struct page *page;
1610 void *map;
1611 int ret;
1612
1613 do {
1614 ret = __get_user_pages_fast(addr, 1, 0, &page);
1615 if (!ret)
1616 break;
1617
1618 offset = addr & (PAGE_SIZE - 1);
1619 size = min(PAGE_SIZE - offset, n - len);
1620
1621 map = kmap_atomic(page, type);
1622 memcpy(to, map+offset, size);
1623 kunmap_atomic(map, type);
1624 put_page(page);
1625
1626 len += size;
1627 to += size;
1628 addr += size;
1629
1630 } while (len < n);
1631
1632 return len;
1633}
1634
1635#ifdef CONFIG_COMPAT 1599#ifdef CONFIG_COMPAT
1636static inline int 1600static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) 1601perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1691,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1727{ 1691{
1728 struct perf_callchain_entry *entry; 1692 struct perf_callchain_entry *entry;
1729 1693
1694 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1695 /* TODO: We don't support guest os callchain now */
1696 return NULL;
1697 }
1698
1730 if (in_nmi()) 1699 if (in_nmi())
1731 entry = &__get_cpu_var(pmc_nmi_entry); 1700 entry = &__get_cpu_var(pmc_nmi_entry);
1732 else 1701 else
@@ -1750,3 +1719,37 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
1750 regs->cs = __KERNEL_CS; 1719 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags); 1720 local_save_flags(regs->flags);
1752} 1721}
1722
1723unsigned long perf_instruction_pointer(struct pt_regs *regs)
1724{
1725 unsigned long ip;
1726
1727 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1728 ip = perf_guest_cbs->get_guest_ip();
1729 else
1730 ip = instruction_pointer(regs);
1731
1732 return ip;
1733}
1734
1735unsigned long perf_misc_flags(struct pt_regs *regs)
1736{
1737 int misc = 0;
1738
1739 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1740 if (perf_guest_cbs->is_user_mode())
1741 misc |= PERF_RECORD_MISC_GUEST_USER;
1742 else
1743 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1744 } else {
1745 if (user_mode(regs))
1746 misc |= PERF_RECORD_MISC_USER;
1747 else
1748 misc |= PERF_RECORD_MISC_KERNEL;
1749 }
1750
1751 if (regs->flags & PERF_EFLAGS_EXACT)
1752 misc |= PERF_RECORD_MISC_EXACT_IP;
1753
1754 return misc;
1755}
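
perf_instruction_pointer() and perf_misc_flags() above consult perf_guest_cbs so that PMI samples taken while a virtualization host is running guest code are tagged GUEST_USER/GUEST_KERNEL and report the guest IP. A sketch of the callback set a hypervisor module would supply; the struct and registration helper names are assumed to be the perf_guest_info_callbacks interface introduced alongside this series, and the helper bodies are placeholders.

static int sketch_is_in_guest(void)
{
	return 0;	/* "a vcpu is in guest mode on this cpu right now" */
}

static int sketch_is_user_mode(void)
{
	return 0;	/* guest was executing at CPL 3 */
}

static unsigned long sketch_get_guest_ip(void)
{
	return 0;	/* guest instruction pointer at the time of the PMI */
}

static struct perf_guest_info_callbacks sketch_guest_cbs = {
	.is_in_guest	= sketch_is_in_guest,
	.is_user_mode	= sketch_is_user_mode,
	.get_guest_ip	= sketch_get_guest_ip,
};

/* on module init/exit (assumed API):
 *	perf_register_guest_info_callbacks(&sketch_guest_cbs);
 *	perf_unregister_guest_info_callbacks(&sketch_guest_cbs);
 */
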
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e1..611df11ba15e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock); 3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4 4
5static __initconst u64 amd_hw_cache_event_ids 5static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
111 return amd_perfmon_event_map[hw_event]; 111 return amd_perfmon_event_map[hw_event];
112} 112}
113 113
114static u64 amd_pmu_raw_event(u64 hw_event) 114static int amd_pmu_hw_config(struct perf_event *event)
115{ 115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL 116 int ret = x86_pmu_hw_config(event);
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 117
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 118 if (ret)
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL 119 return ret;
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL 120
121 121 if (event->attr.type != PERF_TYPE_RAW)
122#define K7_EVNTSEL_MASK \ 122 return 0;
123 (K7_EVNTSEL_EVENT_MASK | \ 123
124 K7_EVNTSEL_UNIT_MASK | \ 124 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
125 K7_EVNTSEL_EDGE_MASK | \ 125
126 K7_EVNTSEL_INV_MASK | \ 126 return 0;
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130} 127}
131 128
132/* 129/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
165 * be removed on one CPU at a time AND PMU is disabled 162 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here 163 * when we come here
167 */ 164 */
168 for (i = 0; i < x86_pmu.num_events; i++) { 165 for (i = 0; i < x86_pmu.num_counters; i++) {
169 if (nb->owners[i] == event) { 166 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL); 167 cmpxchg(nb->owners+i, event, NULL);
171 break; 168 break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
215 struct hw_perf_event *hwc = &event->hw; 212 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb; 213 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL; 214 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events; 215 int max = x86_pmu.num_counters;
219 int i, j, k = -1; 216 int i, j, k = -1;
220 217
221 /* 218 /*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
293 /* 290 /*
294 * initialize all possible NB constraints 291 * initialize all possible NB constraints
295 */ 292 */
296 for (i = 0; i < x86_pmu.num_events; i++) { 293 for (i = 0; i < x86_pmu.num_counters; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk); 294 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1; 295 nb->event_constraints[i].weight = 1;
299 } 296 }
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
371 raw_spin_unlock(&amd_nb_lock); 368 raw_spin_unlock(&amd_nb_lock);
372} 369}
373 370
374static __initconst struct x86_pmu amd_pmu = { 371static __initconst const struct x86_pmu amd_pmu = {
375 .name = "AMD", 372 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq, 373 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all, 374 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all, 375 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event, 376 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event, 377 .disable = x86_pmu_disable_event,
378 .hw_config = amd_pmu_hw_config,
379 .schedule_events = x86_schedule_events,
381 .eventsel = MSR_K7_EVNTSEL0, 380 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0, 381 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map, 382 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 383 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4, 384 .num_counters = 4,
387 .event_bits = 48, 385 .cntval_bits = 48,
388 .event_mask = (1ULL << 48) - 1, 386 .cntval_mask = (1ULL << 48) - 1,
389 .apic = 1, 387 .apic = 1,
390 /* use highest bit to detect overflow */ 388 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1, 389 .max_period = (1ULL << 47) - 1,
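
The renamed fields make the overflow arithmetic easier to follow: with cntval_bits = 48 the counter counts up from a preload of minus the remaining period, and max_period is capped at 2^47 - 1 so the programmed value keeps its top bit set until the overflow actually happens; that is the bit x86_pmu_handle_irq() tests in the cntval_bits hunk earlier in this patch. A sketch of the relationship, with an illustrative function name standing in for the real set-period logic:

/*
 * Sketch only: shows why max_period < 2^(cntval_bits - 1).
 */
static u64 preload_value_sketch(u64 period, u64 cntval_mask)
{
	/*
	 * The counter is programmed to -period, truncated to the
	 * counter width, so it overflows after 'period' increments.
	 */
	u64 preload = (u64)(-(s64)period) & cntval_mask;

	/*
	 * Because period <= max_period = (1ULL << 47) - 1, the value
	 * written still has bit 47 set; the NMI handler reads the
	 * counter back and treats a clear top bit as "overflowed" and
	 * a set top bit as "still counting".
	 */
	return preload;
}
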
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac87837..fdbc652d3feb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
88 return intel_perfmon_event_map[hw_event]; 88 return intel_perfmon_event_map[hw_event];
89} 89}
90 90
91static __initconst u64 westmere_hw_cache_event_ids 91static __initconst const u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX] 92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX] 93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
179 }, 179 },
180}; 180};
181 181
182static __initconst u64 nehalem_hw_cache_event_ids 182static __initconst const u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX] 183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX] 184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
270 }, 270 },
271}; 271};
272 272
273static __initconst u64 core2_hw_cache_event_ids 273static __initconst const u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX] 274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX] 275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
361 }, 361 },
362}; 362};
363 363
364static __initconst u64 atom_hw_cache_event_ids 364static __initconst const u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX] 365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX] 366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
452 }, 452 },
453}; 453};
454 454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void) 455static void intel_pmu_disable_all(void)
510{ 456{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 457 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +460,17 @@ static void intel_pmu_disable_all(void)
514 460
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 461 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts(); 462 intel_pmu_disable_bts();
463
464 intel_pmu_pebs_disable_all();
465 intel_pmu_lbr_disable_all();
517} 466}
518 467
519static void intel_pmu_enable_all(void) 468static void intel_pmu_enable_all(int added)
520{ 469{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 470 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522 471
472 intel_pmu_pebs_enable_all();
473 intel_pmu_lbr_enable_all();
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 474 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524 475
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 476 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +484,42 @@ static void intel_pmu_enable_all(void)
533 } 484 }
534} 485}
535 486
487/*
488 * Workaround for:
489 * Intel Errata AAK100 (model 26)
490 * Intel Errata AAP53 (model 30)
491 * Intel Errata BD53 (model 44)
492 *
493 * These chips need to be 'reset' when adding counters by programming
494 * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
495 * either in sequence on the same PMC or on different PMCs.
496 */
497static void intel_pmu_nhm_enable_all(int added)
498{
499 if (added) {
500 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
501 int i;
502
503 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
504 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
505 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
506
507 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
508 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
509
510 for (i = 0; i < 3; i++) {
511 struct perf_event *event = cpuc->events[i];
512
513 if (!event)
514 continue;
515
516 __x86_pmu_enable_event(&event->hw,
517 ARCH_PERFMON_EVENTSEL_ENABLE);
518 }
519 }
520 intel_pmu_enable_all(added);
521}
522
536static inline u64 intel_pmu_get_status(void) 523static inline u64 intel_pmu_get_status(void)
537{ 524{
538 u64 status; 525 u64 status;
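
The new 'int added' argument to ->enable_all() is what keeps the Nehalem/Westmere workaround above cheap: the reset sequence with the three magic events only runs when counters were actually added since the last enable. A sketch of where that value is assumed to come from; this mirrors the caller in hw_perf_enable(), which is not part of the hunks shown, so treat the details as an assumption:

static void hw_perf_enable_sketch(struct cpu_hw_events *cpuc)
{
	int added = cpuc->n_added;	/* events added since the last hw_perf_enable() */

	/* ... reprogram the counters according to cpuc->assign[] ... */

	cpuc->n_added = 0;
	x86_pmu.enable_all(added);	/* 0 on a plain re-enable, e.g. from the PMI handler */
}
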
@@ -547,8 +534,7 @@ static inline void intel_pmu_ack_status(u64 ack)
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 534 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548} 535}
549 536
550static inline void 537static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{ 538{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED; 539 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask; 540 u64 ctrl_val, mask;
@@ -557,71 +543,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
557 543
558 rdmsrl(hwc->config_base, ctrl_val); 544 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask; 545 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 546 wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621} 547}
622 548
623static inline void 549static void intel_pmu_disable_event(struct perf_event *event)
624intel_pmu_disable_event(struct perf_event *event)
625{ 550{
626 struct hw_perf_event *hwc = &event->hw; 551 struct hw_perf_event *hwc = &event->hw;
627 552
@@ -637,14 +562,15 @@ intel_pmu_disable_event(struct perf_event *event)
637 } 562 }
638 563
639 x86_pmu_disable_event(event); 564 x86_pmu_disable_event(event);
565
566 if (unlikely(event->attr.precise_ip))
567 intel_pmu_pebs_disable(event);
640} 568}
641 569
642static inline void 570static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{ 571{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED; 572 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask; 573 u64 ctrl_val, bits, mask;
647 int err;
648 574
649 /* 575 /*
650 * Enable IRQ generation (0x8), 576 * Enable IRQ generation (0x8),
@@ -669,7 +595,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
669 rdmsrl(hwc->config_base, ctrl_val); 595 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask; 596 ctrl_val &= ~mask;
671 ctrl_val |= bits; 597 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val); 598 wrmsrl(hwc->config_base, ctrl_val);
673} 599}
674 600
675static void intel_pmu_enable_event(struct perf_event *event) 601static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +615,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
689 return; 615 return;
690 } 616 }
691 617
692 __x86_pmu_enable_event(hwc); 618 if (unlikely(event->attr.precise_ip))
619 intel_pmu_pebs_enable(event);
620
621 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
693} 622}
694 623
695/* 624/*
@@ -708,20 +637,20 @@ static void intel_pmu_reset(void)
708 unsigned long flags; 637 unsigned long flags;
709 int idx; 638 int idx;
710 639
711 if (!x86_pmu.num_events) 640 if (!x86_pmu.num_counters)
712 return; 641 return;
713 642
714 local_irq_save(flags); 643 local_irq_save(flags);
715 644
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 645 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717 646
718 for (idx = 0; idx < x86_pmu.num_events; idx++) { 647 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 648 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 649 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 } 650 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 651 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 652 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 } 653
725 if (ds) 654 if (ds)
726 ds->bts_index = ds->bts_buffer_base; 655 ds->bts_index = ds->bts_buffer_base;
727 656
@@ -747,7 +676,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
747 intel_pmu_drain_bts_buffer(); 676 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status(); 677 status = intel_pmu_get_status();
749 if (!status) { 678 if (!status) {
750 intel_pmu_enable_all(); 679 intel_pmu_enable_all(0);
751 return 0; 680 return 0;
752 } 681 }
753 682
@@ -762,6 +691,15 @@ again:
762 691
763 inc_irq_stat(apic_perf_irqs); 692 inc_irq_stat(apic_perf_irqs);
764 ack = status; 693 ack = status;
694
695 intel_pmu_lbr_read();
696
697 /*
698 * PEBS overflow sets bit 62 in the global status register
699 */
700 if (__test_and_clear_bit(62, (unsigned long *)&status))
701 x86_pmu.drain_pebs(regs);
702
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 703 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit]; 704 struct perf_event *event = cpuc->events[bit];
767 705
@@ -787,26 +725,22 @@ again:
787 goto again; 725 goto again;
788 726
789done: 727done:
790 intel_pmu_enable_all(); 728 intel_pmu_enable_all(0);
791 return 1; 729 return 1;
792} 730}
793 731
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint * 732static struct event_constraint *
798intel_special_constraints(struct perf_event *event) 733intel_bts_constraints(struct perf_event *event)
799{ 734{
800 unsigned int hw_event; 735 struct hw_perf_event *hwc = &event->hw;
801 736 unsigned int hw_event, bts_event;
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803 737
804 if (unlikely((hw_event == 738 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 739 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
806 (event->hw.sample_period == 1))) {
807 740
741 if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
808 return &bts_constraint; 742 return &bts_constraint;
809 } 743
810 return NULL; 744 return NULL;
811} 745}
812 746
@@ -815,24 +749,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
815{ 749{
816 struct event_constraint *c; 750 struct event_constraint *c;
817 751
818 c = intel_special_constraints(event); 752 c = intel_bts_constraints(event);
753 if (c)
754 return c;
755
756 c = intel_pebs_constraints(event);
819 if (c) 757 if (c)
820 return c; 758 return c;
821 759
822 return x86_get_event_constraints(cpuc, event); 760 return x86_get_event_constraints(cpuc, event);
823} 761}
824 762
825static __initconst struct x86_pmu core_pmu = { 763static int intel_pmu_hw_config(struct perf_event *event)
764{
765 int ret = x86_pmu_hw_config(event);
766
767 if (ret)
768 return ret;
769
770 if (event->attr.type != PERF_TYPE_RAW)
771 return 0;
772
773 if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
774 return 0;
775
776 if (x86_pmu.version < 3)
777 return -EINVAL;
778
779 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
780 return -EACCES;
781
782 event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
783
784 return 0;
785}
786
787static __initconst const struct x86_pmu core_pmu = {
826 .name = "core", 788 .name = "core",
827 .handle_irq = x86_pmu_handle_irq, 789 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all, 790 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all, 791 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event, 792 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event, 793 .disable = x86_pmu_disable_event,
794 .hw_config = x86_pmu_hw_config,
795 .schedule_events = x86_schedule_events,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 796 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 797 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map, 798 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 799 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1, 800 .apic = 1,
838 /* 801 /*
@@ -845,17 +808,32 @@ static __initconst struct x86_pmu core_pmu = {
845 .event_constraints = intel_core_event_constraints, 808 .event_constraints = intel_core_event_constraints,
846}; 809};
847 810
848static __initconst struct x86_pmu intel_pmu = { 811static void intel_pmu_cpu_starting(int cpu)
812{
813 init_debug_store_on_cpu(cpu);
814 /*
815 * Deal with CPUs that don't clear their LBRs on power-up.
816 */
817 intel_pmu_lbr_reset();
818}
819
820static void intel_pmu_cpu_dying(int cpu)
821{
822 fini_debug_store_on_cpu(cpu);
823}
824
825static __initconst const struct x86_pmu intel_pmu = {
849 .name = "Intel", 826 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq, 827 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all, 828 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all, 829 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event, 830 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event, 831 .disable = intel_pmu_disable_event,
832 .hw_config = intel_pmu_hw_config,
833 .schedule_events = x86_schedule_events,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 834 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 835 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map, 836 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 837 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1, 838 .apic = 1,
861 /* 839 /*
@@ -864,14 +842,38 @@ static __initconst struct x86_pmu intel_pmu = {
864 * the generic event period: 842 * the generic event period:
865 */ 843 */
866 .max_period = (1ULL << 31) - 1, 844 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints, 845 .get_event_constraints = intel_get_event_constraints,
870 846
871 .cpu_starting = init_debug_store_on_cpu, 847 .cpu_starting = intel_pmu_cpu_starting,
872 .cpu_dying = fini_debug_store_on_cpu, 848 .cpu_dying = intel_pmu_cpu_dying,
873}; 849};
874 850
851static void intel_clovertown_quirks(void)
852{
853 /*
854 * PEBS is unreliable due to:
855 *
856 * AJ67 - PEBS may experience CPL leaks
857 * AJ68 - PEBS PMI may be delayed by one event
858 * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
859 * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
860 *
861 * AJ67 could be worked around by restricting the OS/USR flags.
862 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
863 *
864 * AJ106 could possibly be worked around by not allowing LBR
865 * usage from PEBS, including the fixup.
866 * AJ68 could possibly be worked around by always programming
867 * a pebs_event_reset[0] value and coping with the lost events.
868 *
869 * But taken together it might just make sense to not enable PEBS on
870 * these chips.
871 */
872 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
873 x86_pmu.pebs = 0;
874 x86_pmu.pebs_constraints = NULL;
875}
876
875static __init int intel_pmu_init(void) 877static __init int intel_pmu_init(void)
876{ 878{
877 union cpuid10_edx edx; 879 union cpuid10_edx edx;
@@ -881,12 +883,13 @@ static __init int intel_pmu_init(void)
881 int version; 883 int version;
882 884
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 885 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */ 886 switch (boot_cpu_data.x86) {
885 if (boot_cpu_data.x86 == 6) { 887 case 0x6:
886 return p6_pmu_init(); 888 return p6_pmu_init();
887 } else { 889 case 0xf:
890 return p4_pmu_init();
891 }
888 return -ENODEV; 892 return -ENODEV;
889 }
890 } 893 }
891 894
892 /* 895 /*
@@ -904,16 +907,28 @@ static __init int intel_pmu_init(void)
904 x86_pmu = intel_pmu; 907 x86_pmu = intel_pmu;
905 908
906 x86_pmu.version = version; 909 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events; 910 x86_pmu.num_counters = eax.split.num_counters;
908 x86_pmu.event_bits = eax.split.bit_width; 911 x86_pmu.cntval_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; 912 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
910 913
911 /* 914 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so 915 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events: 916 * assume at least 3 events:
914 */ 917 */
915 if (version > 1) 918 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 919 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
920
921 /*
922 * v2 and above have a perf capabilities MSR
923 */
924 if (version > 1) {
925 u64 capabilities;
926
927 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
928 x86_pmu.intel_cap.capabilities = capabilities;
929 }
930
931 intel_ds_init();
917 932
918 /* 933 /*
919 * Install the hw-cache-events table: 934 * Install the hw-cache-events table:
@@ -924,12 +939,15 @@ static __init int intel_pmu_init(void)
924 break; 939 break;
925 940
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 941 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
942 x86_pmu.quirks = intel_clovertown_quirks;
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 943 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 944 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */ 945 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 946 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids)); 947 sizeof(hw_cache_event_ids));
932 948
949 intel_pmu_lbr_init_core();
950
933 x86_pmu.event_constraints = intel_core2_event_constraints; 951 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, "); 952 pr_cont("Core2 events, ");
935 break; 953 break;
@@ -940,13 +958,19 @@ static __init int intel_pmu_init(void)
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 958 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids)); 959 sizeof(hw_cache_event_ids));
942 960
961 intel_pmu_lbr_init_nhm();
962
943 x86_pmu.event_constraints = intel_nehalem_event_constraints; 963 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, "); 964 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
965 pr_cont("Nehalem events, ");
945 break; 966 break;
967
946 case 28: /* Atom */ 968 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 969 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids)); 970 sizeof(hw_cache_event_ids));
949 971
972 intel_pmu_lbr_init_atom();
973
950 x86_pmu.event_constraints = intel_gen_event_constraints; 974 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, "); 975 pr_cont("Atom events, ");
952 break; 976 break;
@@ -956,7 +980,10 @@ static __init int intel_pmu_init(void)
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 980 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids)); 981 sizeof(hw_cache_event_ids));
958 982
983 intel_pmu_lbr_init_nhm();
984
959 x86_pmu.event_constraints = intel_westmere_event_constraints; 985 x86_pmu.event_constraints = intel_westmere_event_constraints;
986 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
960 pr_cont("Westmere events, "); 987 pr_cont("Westmere events, ");
961 break; 988 break;
962 989
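
intel_pmu_init() above fills num_counters/cntval_bits from CPUID leaf 0xA and, for version 2 and later, reads MSR_IA32_PERF_CAPABILITIES into x86_pmu.intel_cap. The same architectural CPUID fields can be inspected from user space; a small C sketch using GCC's cpuid.h, where the bit layout is the architectural one mirrored by the kernel's union cpuid10_eax/cpuid10_edx:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("version:        %u\n", eax & 0xff);		/* -> x86_pmu.version */
	printf("gp counters:    %u\n", (eax >> 8) & 0xff);	/* -> num_counters */
	printf("counter width:  %u\n", (eax >> 16) & 0xff);	/* -> cntval_bits */
	printf("fixed counters: %u\n", edx & 0x1f);		/* -> num_counters_fixed */
	printf("fixed width:    %u\n", (edx >> 5) & 0xff);

	return 0;
}
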
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/* The maximal number of PEBS events: */
4#define MAX_PEBS_EVENTS 4
5
6/* The size of a BTS record in bytes: */
7#define BTS_RECORD_SIZE 24
8
9#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
10#define PEBS_BUFFER_SIZE PAGE_SIZE
11
12/*
13 * pebs_record_32 for p4 and core not supported
14
15struct pebs_record_32 {
16 u32 flags, ip;
17 u32 ax, bc, cx, dx;
18 u32 si, di, bp, sp;
19};
20
21 */
22
23struct pebs_record_core {
24 u64 flags, ip;
25 u64 ax, bx, cx, dx;
26 u64 si, di, bp, sp;
27 u64 r8, r9, r10, r11;
28 u64 r12, r13, r14, r15;
29};
30
31struct pebs_record_nhm {
32 u64 flags, ip;
33 u64 ax, bx, cx, dx;
34 u64 si, di, bp, sp;
35 u64 r8, r9, r10, r11;
36 u64 r12, r13, r14, r15;
37 u64 status, dla, dse, lat;
38};
39
40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
58{
59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
60
61 if (!ds)
62 return;
63
64 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
65 (u32)((u64)(unsigned long)ds),
66 (u32)((u64)(unsigned long)ds >> 32));
67}
68
69static void fini_debug_store_on_cpu(int cpu)
70{
71 if (!per_cpu(cpu_hw_events, cpu).ds)
72 return;
73
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75}
76
77static void release_ds_buffers(void)
78{
79 int cpu;
80
81 if (!x86_pmu.bts && !x86_pmu.pebs)
82 return;
83
84 get_online_cpus();
85
86 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu);
88
89 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
91
92 if (!ds)
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 }
101
102 put_online_cpus();
103}
104
105static int reserve_ds_buffers(void)
106{
107 int cpu, err = 0;
108
109 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0;
111
112 get_online_cpus();
113
114 for_each_possible_cpu(cpu) {
115 struct debug_store *ds;
116 void *buffer;
117 int max, thresh;
118
119 err = -ENOMEM;
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds;
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140
141 if (x86_pmu.pebs) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
143 if (unlikely(!buffer))
144 break;
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158
159 err = 0;
160 }
161
162 if (err)
163 release_ds_buffers();
164 else {
165 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu);
167 }
168
169 put_online_cpus();
170
171 return err;
172}
173
174/*
175 * BTS
176 */
177
178static struct event_constraint bts_constraint =
179 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
180
181static void intel_pmu_enable_bts(u64 config)
182{
183 unsigned long debugctlmsr;
184
185 debugctlmsr = get_debugctlmsr();
186
187 debugctlmsr |= DEBUGCTLMSR_TR;
188 debugctlmsr |= DEBUGCTLMSR_BTS;
189 debugctlmsr |= DEBUGCTLMSR_BTINT;
190
191 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
192 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
193
194 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
195 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
196
197 update_debugctlmsr(debugctlmsr);
198}
199
200static void intel_pmu_disable_bts(void)
201{
202 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
203 unsigned long debugctlmsr;
204
205 if (!cpuc->ds)
206 return;
207
208 debugctlmsr = get_debugctlmsr();
209
210 debugctlmsr &=
211 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
212 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
213
214 update_debugctlmsr(debugctlmsr);
215}
216
217static void intel_pmu_drain_bts_buffer(void)
218{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds;
221 struct bts_record {
222 u64 from;
223 u64 to;
224 u64 flags;
225 };
226 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
227 struct bts_record *at, *top;
228 struct perf_output_handle handle;
229 struct perf_event_header header;
230 struct perf_sample_data data;
231 struct pt_regs regs;
232
233 if (!event)
234 return;
235
236 if (!ds)
237 return;
238
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index;
241
242 if (top <= at)
243 return;
244
245 ds->bts_index = ds->bts_buffer_base;
246
247 perf_sample_data_init(&data, 0);
248 data.period = event->hw.last_period;
249 regs.ip = 0;
250
251 /*
252 * Prepare a generic sample, i.e. fill in the invariant fields.
253 * We will overwrite the from and to address before we output
254 * the sample.
255 */
256 perf_prepare_sample(&header, &data, event, &regs);
257
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return;
260
261 for (; at < top; at++) {
262 data.ip = at->from;
263 data.addr = at->to;
264
265 perf_output_sample(&handle, &header, &data, event);
266 }
267
268 perf_output_end(&handle);
269
270 /* There's new data available. */
271 event->hw.interrupts++;
272 event->pending_kill = POLL_IN;
273}
274
275/*
276 * PEBS
277 */
278
279static struct event_constraint intel_core_pebs_events[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
283	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
289 EVENT_CONSTRAINT_END
290};
291
292static struct event_constraint intel_nehalem_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
296	PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
302 EVENT_CONSTRAINT_END
303};
304
305static struct event_constraint *
306intel_pebs_constraints(struct perf_event *event)
307{
308 struct event_constraint *c;
309
310 if (!event->attr.precise_ip)
311 return NULL;
312
313 if (x86_pmu.pebs_constraints) {
314 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
315 if ((event->hw.config & c->cmask) == c->code)
316 return c;
317 }
318 }
319
320 return &emptyconstraint;
321}
322
323static void intel_pmu_pebs_enable(struct perf_event *event)
324{
325 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
326 struct hw_perf_event *hwc = &event->hw;
327
328 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
329
330 cpuc->pebs_enabled |= 1ULL << hwc->idx;
331 WARN_ON_ONCE(cpuc->enabled);
332
333 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
334 intel_pmu_lbr_enable(event);
335}
336
337static void intel_pmu_pebs_disable(struct perf_event *event)
338{
339 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
340 struct hw_perf_event *hwc = &event->hw;
341
342 cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
343 if (cpuc->enabled)
344 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
345
346 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
347
348 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
349 intel_pmu_lbr_disable(event);
350}
351
352static void intel_pmu_pebs_enable_all(void)
353{
354 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
355
356 if (cpuc->pebs_enabled)
357 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
358}
359
360static void intel_pmu_pebs_disable_all(void)
361{
362 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
363
364 if (cpuc->pebs_enabled)
365 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
366}
367
368#include <asm/insn.h>
369
370static inline bool kernel_ip(unsigned long ip)
371{
372#ifdef CONFIG_X86_32
373 return ip > PAGE_OFFSET;
374#else
375 return (long)ip < 0;
376#endif
377}
378
379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
380{
381 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
382 unsigned long from = cpuc->lbr_entries[0].from;
383 unsigned long old_to, to = cpuc->lbr_entries[0].to;
384 unsigned long ip = regs->ip;
385
386 /*
387 * We don't need to fixup if the PEBS assist is fault like
388 */
389 if (!x86_pmu.intel_cap.pebs_trap)
390 return 1;
391
392 /*
393 * No LBR entry, no basic block, no rewinding
394 */
395 if (!cpuc->lbr_stack.nr || !from || !to)
396 return 0;
397
398 /*
399 * Basic blocks should never cross user/kernel boundaries
400 */
401 if (kernel_ip(ip) != kernel_ip(to))
402 return 0;
403
404 /*
405 * unsigned math, either ip is before the start (impossible) or
406 * the basic block is larger than 1 page (sanity)
407 */
408 if ((ip - to) > PAGE_SIZE)
409 return 0;
410
411 /*
412 * We sampled a branch insn, rewind using the LBR stack
413 */
414 if (ip == to) {
415 regs->ip = from;
416 return 1;
417 }
418
419 do {
420 struct insn insn;
421 u8 buf[MAX_INSN_SIZE];
422 void *kaddr;
423
424 old_to = to;
425 if (!kernel_ip(ip)) {
426 int bytes, size = MAX_INSN_SIZE;
427
428 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
429 if (bytes != size)
430 return 0;
431
432 kaddr = buf;
433 } else
434 kaddr = (void *)to;
435
436 kernel_insn_init(&insn, kaddr);
437 insn_get_length(&insn);
438 to += insn.length;
439 } while (to < ip);
440
441 if (to == ip) {
442 regs->ip = old_to;
443 return 1;
444 }
445
446 /*
447 * Even though we decoded the basic block, the instruction stream
448 * never matched the given IP, either the TO or the IP got corrupted.
449 */
450 return 0;
451}
452
453static int intel_pmu_save_and_restart(struct perf_event *event);
454
455static void __intel_pmu_pebs_event(struct perf_event *event,
456 struct pt_regs *iregs, void *__pebs)
457{
458 /*
459 * We cast to pebs_record_core since that is a subset of
460 * both formats and we don't use the other fields in this
461 * routine.
462 */
463 struct pebs_record_core *pebs = __pebs;
464 struct perf_sample_data data;
465 struct pt_regs regs;
466
467 if (!intel_pmu_save_and_restart(event))
468 return;
469
470 perf_sample_data_init(&data, 0);
471 data.period = event->hw.last_period;
472
473 /*
474 * We use the interrupt regs as a base because the PEBS record
475 * does not contain a full regs set, specifically it seems to
476 * lack segment descriptors, which get used by things like
477 * user_mode().
478 *
479 * In the simple case fix up only the IP and BP,SP regs, for
480 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
481 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
482 */
483 regs = *iregs;
484 regs.ip = pebs->ip;
485 regs.bp = pebs->bp;
486 regs.sp = pebs->sp;
487
488 if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
489 regs.flags |= PERF_EFLAGS_EXACT;
490 else
491 regs.flags &= ~PERF_EFLAGS_EXACT;
492
493 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event);
495}
496
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
498{
499 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
500 struct debug_store *ds = cpuc->ds;
501 struct perf_event *event = cpuc->events[0]; /* PMC0 only */
502 struct pebs_record_core *at, *top;
503 int n;
504
505 if (!ds || !x86_pmu.pebs)
506 return;
507
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
509 top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
510
511 /*
512 * Whatever else happens, drain the thing
513 */
514 ds->pebs_index = ds->pebs_buffer_base;
515
516 if (!test_bit(0, cpuc->active_mask))
517 return;
518
519 WARN_ON_ONCE(!event);
520
521 if (!event->attr.precise_ip)
522 return;
523
524 n = top - at;
525 if (n <= 0)
526 return;
527
528 /*
529 * Should not happen, we program the threshold at 1 and do not
530 * set a reset value.
531 */
532 WARN_ON_ONCE(n > 1);
533 at += n - 1;
534
535 __intel_pmu_pebs_event(event, iregs, at);
536}
537
538static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
539{
540 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
541 struct debug_store *ds = cpuc->ds;
542 struct pebs_record_nhm *at, *top;
543 struct perf_event *event = NULL;
544 u64 status = 0;
545 int bit, n;
546
547 if (!ds || !x86_pmu.pebs)
548 return;
549
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
551 top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
552
553 ds->pebs_index = ds->pebs_buffer_base;
554
555 n = top - at;
556 if (n <= 0)
557 return;
558
559 /*
560	 * Should not happen: we program the threshold at 1 and do not
561 * set a reset value.
562 */
563 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
564
565 for ( ; at < top; at++) {
566 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
567 event = cpuc->events[bit];
568 if (!test_bit(bit, cpuc->active_mask))
569 continue;
570
571 WARN_ON_ONCE(!event);
572
573 if (!event->attr.precise_ip)
574 continue;
575
576 if (__test_and_set_bit(bit, (unsigned long *)&status))
577 continue;
578
579 break;
580 }
581
582 if (!event || bit >= MAX_PEBS_EVENTS)
583 continue;
584
585 __intel_pmu_pebs_event(event, iregs, at);
586 }
587}
588
589/*
590 * BTS, PEBS probe and setup
591 */
592
593static void intel_ds_init(void)
594{
595 /*
596 * No support for 32bit formats
597 */
598 if (!boot_cpu_has(X86_FEATURE_DTES64))
599 return;
600
601 x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
602 x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
603 if (x86_pmu.pebs) {
604 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
605 int format = x86_pmu.intel_cap.pebs_format;
606
607 switch (format) {
608 case 0:
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break;
614
615 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break;
621
622 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0;
625 break;
626 }
627 }
628}
629
630#else /* CONFIG_CPU_SUP_INTEL */
631
632static int reserve_ds_buffers(void)
633{
634 return 0;
635}
636
637static void release_ds_buffers(void)
638{
639}
640
641#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3enum {
4 LBR_FORMAT_32 = 0x00,
5 LBR_FORMAT_LIP = 0x01,
6 LBR_FORMAT_EIP = 0x02,
7 LBR_FORMAT_EIP_FLAGS = 0x03,
8};
9
10/*
11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI;
12 * otherwise it becomes nearly impossible to get a reliable LBR stack.
13 */
14
15static void __intel_pmu_lbr_enable(void)
16{
17 u64 debugctl;
18
19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
21 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
22}
23
24static void __intel_pmu_lbr_disable(void)
25{
26 u64 debugctl;
27
28 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
29 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
30 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
31}
32
33static void intel_pmu_lbr_reset_32(void)
34{
35 int i;
36
37 for (i = 0; i < x86_pmu.lbr_nr; i++)
38 wrmsrl(x86_pmu.lbr_from + i, 0);
39}
40
41static void intel_pmu_lbr_reset_64(void)
42{
43 int i;
44
45 for (i = 0; i < x86_pmu.lbr_nr; i++) {
46 wrmsrl(x86_pmu.lbr_from + i, 0);
47 wrmsrl(x86_pmu.lbr_to + i, 0);
48 }
49}
50
51static void intel_pmu_lbr_reset(void)
52{
53 if (!x86_pmu.lbr_nr)
54 return;
55
56 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
57 intel_pmu_lbr_reset_32();
58 else
59 intel_pmu_lbr_reset_64();
60}
61
62static void intel_pmu_lbr_enable(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65
66 if (!x86_pmu.lbr_nr)
67 return;
68
69 WARN_ON_ONCE(cpuc->enabled);
70
71 /*
72 * Reset the LBR stack if we changed task context to
73 * avoid data leaks.
74 */
75
76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
77 intel_pmu_lbr_reset();
78 cpuc->lbr_context = event->ctx;
79 }
80
81 cpuc->lbr_users++;
82}
83
84static void intel_pmu_lbr_disable(struct perf_event *event)
85{
86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
87
88 if (!x86_pmu.lbr_nr)
89 return;
90
91 cpuc->lbr_users--;
92 WARN_ON_ONCE(cpuc->lbr_users < 0);
93
94 if (cpuc->enabled && !cpuc->lbr_users)
95 __intel_pmu_lbr_disable();
96}
97
98static void intel_pmu_lbr_enable_all(void)
99{
100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
101
102 if (cpuc->lbr_users)
103 __intel_pmu_lbr_enable();
104}
105
106static void intel_pmu_lbr_disable_all(void)
107{
108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
109
110 if (cpuc->lbr_users)
111 __intel_pmu_lbr_disable();
112}
113
114static inline u64 intel_pmu_lbr_tos(void)
115{
116 u64 tos;
117
118 rdmsrl(x86_pmu.lbr_tos, tos);
119
120 return tos;
121}
122
123static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
124{
125 unsigned long mask = x86_pmu.lbr_nr - 1;
126 u64 tos = intel_pmu_lbr_tos();
127 int i;
128
129 for (i = 0; i < x86_pmu.lbr_nr; i++) {
130 unsigned long lbr_idx = (tos - i) & mask;
131 union {
132 struct {
133 u32 from;
134 u32 to;
135 };
136 u64 lbr;
137 } msr_lastbranch;
138
139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
140
141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
143 cpuc->lbr_entries[i].flags = 0;
144 }
145 cpuc->lbr_stack.nr = i;
146}
147
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
150/*
151 * Due to the lack of segmentation in Linux, the effective address (offset)
152 * is the same as the linear address, allowing us to merge the LIP and EIP
153 * LBR formats.
154 */
155static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
156{
157 unsigned long mask = x86_pmu.lbr_nr - 1;
158 int lbr_format = x86_pmu.intel_cap.lbr_format;
159 u64 tos = intel_pmu_lbr_tos();
160 int i;
161
162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
163 unsigned long lbr_idx = (tos - i) & mask;
164 u64 from, to, flags = 0;
165
166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
168
169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
171 from = (u64)((((s64)from) << 1) >> 1);
172 }
173
174 cpuc->lbr_entries[i].from = from;
175 cpuc->lbr_entries[i].to = to;
176 cpuc->lbr_entries[i].flags = flags;
177 }
178 cpuc->lbr_stack.nr = i;
179}
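
A small stand-alone sketch of the EIP_FLAGS bit trick used above, with a made-up value: bit 63 of the raw "from" word carries the mispredict flag, and a left shift followed by an arithmetic right shift drops that bit while sign-extending bit 62, recovering the canonical address. The shift order differs from the kernel expression only to avoid left-shifting a negative value.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* mispredict flag in bit 63, kernel address in bits 62:0 */
	uint64_t from = (1ULL << 63) | 0x7fffffff81000000ULL;
	int mispred   = !!(from & (1ULL << 63));
	uint64_t addr = (uint64_t)(((int64_t)(from << 1)) >> 1);

	printf("mispredicted=%d addr=%#llx\n",
	       mispred, (unsigned long long)addr);	/* 0xffffffff81000000 */
	return 0;
}
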
180
181static void intel_pmu_lbr_read(void)
182{
183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
184
185 if (!cpuc->lbr_users)
186 return;
187
188 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
189 intel_pmu_lbr_read_32(cpuc);
190 else
191 intel_pmu_lbr_read_64(cpuc);
192}
193
194static void intel_pmu_lbr_init_core(void)
195{
196 x86_pmu.lbr_nr = 4;
197 x86_pmu.lbr_tos = 0x01c9;
198 x86_pmu.lbr_from = 0x40;
199 x86_pmu.lbr_to = 0x60;
200}
201
202static void intel_pmu_lbr_init_nhm(void)
203{
204 x86_pmu.lbr_nr = 16;
205 x86_pmu.lbr_tos = 0x01c9;
206 x86_pmu.lbr_from = 0x680;
207 x86_pmu.lbr_to = 0x6c0;
208}
209
210static void intel_pmu_lbr_init_atom(void)
211{
212 x86_pmu.lbr_nr = 8;
213 x86_pmu.lbr_tos = 0x01c9;
214 x86_pmu.lbr_from = 0x40;
215 x86_pmu.lbr_to = 0x60;
216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..424fc8de68e4
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,857 @@
1/*
2 * Netburst Performance Events (P4, old Xeon)
3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#ifdef CONFIG_CPU_SUP_INTEL
11
12#include <asm/perf_event_p4.h>
13
14#define P4_CNTR_LIMIT 3
15/*
16 * array indices: 0,1 - HT threads, used with an HT-enabled cpu
17 */
18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21	char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on absence */
22};
23
24struct p4_cache_event_bind {
25 unsigned int metric_pebs;
26 unsigned int metric_vert;
27};
28
29#define P4_GEN_CACHE_EVENT_BIND(name) \
30 [P4_CACHE__##name] = { \
31 .metric_pebs = P4_PEBS__##name, \
32 .metric_vert = P4_VERT__##name, \
33 }
34
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
40};
41
42/*
43 * Note that we don't use CCCR1 here; there is an
44 * exception for P4_BSQ_ALLOCATION, but we have no
45 * workaround for it
46 *
47 * Consider this binding as the resources a particular
48 * event may borrow; it doesn't contain the EventMask,
49 * Tags and friends -- they are left to the caller
50 */
51static struct p4_event_bind p4_event_bind_map[] = {
52 [P4_EVENT_TC_DELIVER_MODE] = {
53 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
54 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
55 .cntr = { {4, 5, -1}, {6, 7, -1} },
56 },
57 [P4_EVENT_BPU_FETCH_REQUEST] = {
58 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
59 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
60 .cntr = { {0, -1, -1}, {2, -1, -1} },
61 },
62 [P4_EVENT_ITLB_REFERENCE] = {
63 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
64 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
65 .cntr = { {0, -1, -1}, {2, -1, -1} },
66 },
67 [P4_EVENT_MEMORY_CANCEL] = {
68 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
69 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
70 .cntr = { {8, 9, -1}, {10, 11, -1} },
71 },
72 [P4_EVENT_MEMORY_COMPLETE] = {
73 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
74 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
75 .cntr = { {8, 9, -1}, {10, 11, -1} },
76 },
77 [P4_EVENT_LOAD_PORT_REPLAY] = {
78 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
79 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
80 .cntr = { {8, 9, -1}, {10, 11, -1} },
81 },
82 [P4_EVENT_STORE_PORT_REPLAY] = {
83 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
84 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
85 .cntr = { {8, 9, -1}, {10, 11, -1} },
86 },
87 [P4_EVENT_MOB_LOAD_REPLAY] = {
88 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
89 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
90 .cntr = { {0, -1, -1}, {2, -1, -1} },
91 },
92 [P4_EVENT_PAGE_WALK_TYPE] = {
93 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
94 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
95 .cntr = { {0, -1, -1}, {2, -1, -1} },
96 },
97 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
98 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
99 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
100 .cntr = { {0, -1, -1}, {2, -1, -1} },
101 },
102 [P4_EVENT_IOQ_ALLOCATION] = {
103 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
104 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
105 .cntr = { {0, -1, -1}, {2, -1, -1} },
106 },
107 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
108 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
109 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
110 .cntr = { {2, -1, -1}, {3, -1, -1} },
111 },
112 [P4_EVENT_FSB_DATA_ACTIVITY] = {
113 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
114 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
115 .cntr = { {0, -1, -1}, {2, -1, -1} },
116 },
117 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
118 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
119 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
120 .cntr = { {0, -1, -1}, {1, -1, -1} },
121 },
122 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
123 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
124 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
125 .cntr = { {2, -1, -1}, {3, -1, -1} },
126 },
127 [P4_EVENT_SSE_INPUT_ASSIST] = {
128 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
129 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
130 .cntr = { {8, 9, -1}, {10, 11, -1} },
131 },
132 [P4_EVENT_PACKED_SP_UOP] = {
133 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
134 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
135 .cntr = { {8, 9, -1}, {10, 11, -1} },
136 },
137 [P4_EVENT_PACKED_DP_UOP] = {
138 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
139 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
140 .cntr = { {8, 9, -1}, {10, 11, -1} },
141 },
142 [P4_EVENT_SCALAR_SP_UOP] = {
143 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
144 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
145 .cntr = { {8, 9, -1}, {10, 11, -1} },
146 },
147 [P4_EVENT_SCALAR_DP_UOP] = {
148 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
149 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
150 .cntr = { {8, 9, -1}, {10, 11, -1} },
151 },
152 [P4_EVENT_64BIT_MMX_UOP] = {
153 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
154 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
155 .cntr = { {8, 9, -1}, {10, 11, -1} },
156 },
157 [P4_EVENT_128BIT_MMX_UOP] = {
158 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
159 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
160 .cntr = { {8, 9, -1}, {10, 11, -1} },
161 },
162 [P4_EVENT_X87_FP_UOP] = {
163 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
164 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
165 .cntr = { {8, 9, -1}, {10, 11, -1} },
166 },
167 [P4_EVENT_TC_MISC] = {
168 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
169 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
170 .cntr = { {4, 5, -1}, {6, 7, -1} },
171 },
172 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
173 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
174 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
175 .cntr = { {0, -1, -1}, {2, -1, -1} },
176 },
177 [P4_EVENT_TC_MS_XFER] = {
178 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
179 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
180 .cntr = { {4, 5, -1}, {6, 7, -1} },
181 },
182 [P4_EVENT_UOP_QUEUE_WRITES] = {
183 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
184 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
185 .cntr = { {4, 5, -1}, {6, 7, -1} },
186 },
187 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
188 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
189 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
190 .cntr = { {4, 5, -1}, {6, 7, -1} },
191 },
192 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
193 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
194 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
195 .cntr = { {4, 5, -1}, {6, 7, -1} },
196 },
197 [P4_EVENT_RESOURCE_STALL] = {
198 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
199 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
200 .cntr = { {12, 13, 16}, {14, 15, 17} },
201 },
202 [P4_EVENT_WC_BUFFER] = {
203 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
204 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
205 .cntr = { {8, 9, -1}, {10, 11, -1} },
206 },
207 [P4_EVENT_B2B_CYCLES] = {
208 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
209 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
210 .cntr = { {0, -1, -1}, {2, -1, -1} },
211 },
212 [P4_EVENT_BNR] = {
213 .opcode = P4_OPCODE(P4_EVENT_BNR),
214 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
215 .cntr = { {0, -1, -1}, {2, -1, -1} },
216 },
217 [P4_EVENT_SNOOP] = {
218 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
219 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
220 .cntr = { {0, -1, -1}, {2, -1, -1} },
221 },
222 [P4_EVENT_RESPONSE] = {
223 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
224 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
225 .cntr = { {0, -1, -1}, {2, -1, -1} },
226 },
227 [P4_EVENT_FRONT_END_EVENT] = {
228 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
229 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
230 .cntr = { {12, 13, 16}, {14, 15, 17} },
231 },
232 [P4_EVENT_EXECUTION_EVENT] = {
233 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
234 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
235 .cntr = { {12, 13, 16}, {14, 15, 17} },
236 },
237 [P4_EVENT_REPLAY_EVENT] = {
238 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
239 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
240 .cntr = { {12, 13, 16}, {14, 15, 17} },
241 },
242 [P4_EVENT_INSTR_RETIRED] = {
243 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
244 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
245 .cntr = { {12, 13, 16}, {14, 15, 17} },
246 },
247 [P4_EVENT_UOPS_RETIRED] = {
248 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
249 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
250 .cntr = { {12, 13, 16}, {14, 15, 17} },
251 },
252 [P4_EVENT_UOP_TYPE] = {
253 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
254 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
255 .cntr = { {12, 13, 16}, {14, 15, 17} },
256 },
257 [P4_EVENT_BRANCH_RETIRED] = {
258 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
259 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
260 .cntr = { {12, 13, 16}, {14, 15, 17} },
261 },
262 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
263 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
264 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
265 .cntr = { {12, 13, 16}, {14, 15, 17} },
266 },
267 [P4_EVENT_X87_ASSIST] = {
268 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
269 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
270 .cntr = { {12, 13, 16}, {14, 15, 17} },
271 },
272 [P4_EVENT_MACHINE_CLEAR] = {
273 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
274 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
275 .cntr = { {12, 13, 16}, {14, 15, 17} },
276 },
277 [P4_EVENT_INSTR_COMPLETED] = {
278 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
279 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
280 .cntr = { {12, 13, 16}, {14, 15, 17} },
281 },
282};
283
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289
290static __initconst const u64 p4_hw_cache_event_ids
291 [PERF_COUNT_HW_CACHE_MAX]
292 [PERF_COUNT_HW_CACHE_OP_MAX]
293 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
294{
295 [ C(L1D ) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired),
300 },
301 },
302 [ C(LL ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired),
307 },
308},
309 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired),
314 },
315 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired),
319 },
320 },
321 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss),
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
340 /* non-halted CPU clocks */
341 [PERF_COUNT_HW_CPU_CYCLES] =
342 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
343 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
344
345 /*
346 * retired instructions
347	 * for the sake of simplicity we don't use the FSB tagging
348 */
349 [PERF_COUNT_HW_INSTRUCTIONS] =
350 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
351 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
352 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
353
354 /* cache hits */
355 [PERF_COUNT_HW_CACHE_REFERENCES] =
356 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
359 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
360 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
361 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
362 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
363
364 /* cache misses */
365 [PERF_COUNT_HW_CACHE_MISSES] =
366 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
367 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
368 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
369 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
370
371 /* branch instructions retired */
372 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
373 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
374 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
375 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
376 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
377 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
378
379 /* mispredicted branches retired */
380 [PERF_COUNT_HW_BRANCH_MISSES] =
381 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
382 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
383
384 /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
385 [PERF_COUNT_HW_BUS_CYCLES] =
386 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
387 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
388 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
389 p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
390};
391
392static struct p4_event_bind *p4_config_get_bind(u64 config)
393{
394 unsigned int evnt = p4_config_unpack_event(config);
395 struct p4_event_bind *bind = NULL;
396
397 if (evnt < ARRAY_SIZE(p4_event_bind_map))
398 bind = &p4_event_bind_map[evnt];
399
400 return bind;
401}
402
403static u64 p4_pmu_event_map(int hw_event)
404{
405 struct p4_event_bind *bind;
406 unsigned int esel;
407 u64 config;
408
409 config = p4_general_events[hw_event];
410 bind = p4_config_get_bind(config);
411 esel = P4_OPCODE_ESEL(bind->opcode);
412 config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
413
414 return config;
415}
416
417static int p4_hw_config(struct perf_event *event)
418{
419 int cpu = get_cpu();
420 int rc = 0;
421 unsigned int evnt;
422 u32 escr, cccr;
423
424 /*
425	 * the reason we grab the cpu this early is that, if we get scheduled
426	 * on the same cpu the first time, we will not need to swap the
427	 * thread-specific flags in the config (and will save some cpu cycles)
428 */
429
430 cccr = p4_default_cccr_conf(cpu);
431 escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
432 event->attr.exclude_user);
433 event->hw.config = p4_config_pack_escr(escr) |
434 p4_config_pack_cccr(cccr);
435
436 if (p4_ht_active() && p4_ht_thread(cpu))
437 event->hw.config = p4_set_ht_bit(event->hw.config);
438
439 if (event->attr.type == PERF_TYPE_RAW) {
440
441		/* user data may have an out-of-bounds event index */
442 evnt = p4_config_unpack_event(event->attr.config);
443 if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
444 rc = -EINVAL;
445 goto out;
446 }
447
448 /*
449		 * We don't control raw events, so it's up to the caller
450		 * to pass sane values (and we don't count the thread number
451		 * on an HT machine, but allow HT-compatible specifics to be
452		 * passed on)
453 *
454 * XXX: HT wide things should check perf_paranoid_cpu() &&
455 * CAP_SYS_ADMIN
456 */
457 event->hw.config |= event->attr.config &
458 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
459 p4_config_pack_cccr(P4_CCCR_MASK_HT));
460 }
461
462 rc = x86_setup_perfctr(event);
463out:
464 put_cpu();
465 return rc;
466}
467
468static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
469{
470 unsigned long dummy;
471
472 rdmsrl(hwc->config_base + hwc->idx, dummy);
473 if (dummy & P4_CCCR_OVF) {
474 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
475 ((u64)dummy) & ~P4_CCCR_OVF);
476 }
477}
478
479static inline void p4_pmu_disable_event(struct perf_event *event)
480{
481 struct hw_perf_event *hwc = &event->hw;
482
483 /*
484	 * If the event gets disabled while the counter is in an overflowed
485	 * state, we need to clear P4_CCCR_OVF; otherwise the interrupt gets
486	 * asserted again and again
487 */
488 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
489 (u64)(p4_config_unpack_cccr(hwc->config)) &
490 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
491}
492
493static void p4_pmu_disable_all(void)
494{
495 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
496 int idx;
497
498 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
499 struct perf_event *event = cpuc->events[idx];
500 if (!test_bit(idx, cpuc->active_mask))
501 continue;
502 p4_pmu_disable_event(event);
503 }
504}
505
506static void p4_pmu_enable_event(struct perf_event *event)
507{
508 struct hw_perf_event *hwc = &event->hw;
509 int thread = p4_ht_config_thread(hwc->config);
510 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
511 unsigned int idx = p4_config_unpack_event(hwc->config);
512 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
513 struct p4_event_bind *bind;
514 struct p4_cache_event_bind *bind_cache;
515 u64 escr_addr, cccr;
516
517 bind = &p4_event_bind_map[idx];
518 escr_addr = (u64)bind->escr_msr[thread];
519
520 /*
521	 * - we don't support cascaded counters yet
522 * - and counter 1 is broken (erratum)
523 */
524 WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
525 WARN_ON_ONCE(hwc->idx == 1);
526
527 /* we need a real Event value */
528 escr_conf &= ~P4_ESCR_EVENT_MASK;
529 escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
530
531 cccr = p4_config_unpack_cccr(hwc->config);
532
533 /*
534	 * it could be a cache event, so we need to
535	 * set the metrics in additional MSRs
536 */
537 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
538 if (idx_cache > P4_CACHE__NONE &&
539 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
540 bind_cache = &p4_cache_event_bind_map[idx_cache];
541 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
542 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
543 }
544
545 (void)checking_wrmsrl(escr_addr, escr_conf);
546 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
547 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
548}
549
550static void p4_pmu_enable_all(int added)
551{
552 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
553 int idx;
554
555 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
556 struct perf_event *event = cpuc->events[idx];
557 if (!test_bit(idx, cpuc->active_mask))
558 continue;
559 p4_pmu_enable_event(event);
560 }
561}
562
563static int p4_pmu_handle_irq(struct pt_regs *regs)
564{
565 struct perf_sample_data data;
566 struct cpu_hw_events *cpuc;
567 struct perf_event *event;
568 struct hw_perf_event *hwc;
569 int idx, handled = 0;
570 u64 val;
571
572 data.addr = 0;
573 data.raw = NULL;
574
575 cpuc = &__get_cpu_var(cpu_hw_events);
576
577 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
578
579 if (!test_bit(idx, cpuc->active_mask))
580 continue;
581
582 event = cpuc->events[idx];
583 hwc = &event->hw;
584
585 WARN_ON_ONCE(hwc->idx != idx);
586
587 /*
588 * FIXME: Redundant call, actually not needed
589 * but just to check if we're screwed
590 */
591 p4_pmu_clear_cccr_ovf(hwc);
592
593 val = x86_perf_event_update(event);
594 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
595 continue;
596
597 /*
598 * event overflow
599 */
600 handled = 1;
601 data.period = event->hw.last_period;
602
603 if (!x86_perf_event_set_period(event))
604 continue;
605 if (perf_event_overflow(event, 1, &data, regs))
606 p4_pmu_disable_event(event);
607 }
608
609 if (handled) {
610 /* p4 quirk: unmask it again */
611 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
612 inc_irq_stat(apic_perf_irqs);
613 }
614
615 return handled;
616}
617
618/*
619 * swap the thread-specific fields according to the thread
620 * we are going to run on
621 */
622static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
623{
624 u32 escr, cccr;
625
626 /*
627	 * either we are lucky and continue on the same cpu, or there is no HT support
628 */
629 if (!p4_should_swap_ts(hwc->config, cpu))
630 return;
631
632 /*
633	 * the event has migrated from another logical
634	 * cpu, so we need to swap the thread-specific flags
635 */
636
637 escr = p4_config_unpack_escr(hwc->config);
638 cccr = p4_config_unpack_cccr(hwc->config);
639
640 if (p4_ht_thread(cpu)) {
641 cccr &= ~P4_CCCR_OVF_PMI_T0;
642 cccr |= P4_CCCR_OVF_PMI_T1;
643 if (escr & P4_ESCR_T0_OS) {
644 escr &= ~P4_ESCR_T0_OS;
645 escr |= P4_ESCR_T1_OS;
646 }
647 if (escr & P4_ESCR_T0_USR) {
648 escr &= ~P4_ESCR_T0_USR;
649 escr |= P4_ESCR_T1_USR;
650 }
651 hwc->config = p4_config_pack_escr(escr);
652 hwc->config |= p4_config_pack_cccr(cccr);
653 hwc->config |= P4_CONFIG_HT;
654 } else {
655 cccr &= ~P4_CCCR_OVF_PMI_T1;
656 cccr |= P4_CCCR_OVF_PMI_T0;
657 if (escr & P4_ESCR_T1_OS) {
658 escr &= ~P4_ESCR_T1_OS;
659 escr |= P4_ESCR_T0_OS;
660 }
661 if (escr & P4_ESCR_T1_USR) {
662 escr &= ~P4_ESCR_T1_USR;
663 escr |= P4_ESCR_T0_USR;
664 }
665 hwc->config = p4_config_pack_escr(escr);
666 hwc->config |= p4_config_pack_cccr(cccr);
667 hwc->config &= ~P4_CONFIG_HT;
668 }
669}
670
671/*
672 * ESCR address hashing is tricky: ESCRs are not sequential
673 * in memory, but they all start from MSR_P4_BSU_ESCR0 (0x03a0) and
674 * the low byte of any ESCR address lies in the range [0xa0, 0xe1]
675 *
676 * so we end up with a hash table that is ~70% filled
677 */
678
679#define P4_ESCR_MSR_BASE 0x000003a0
680#define P4_ESCR_MSR_MAX 0x000003e1
681#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
682#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
683#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
684
685static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
686 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
687 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
688 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
689 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
690 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
691 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
692 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
693 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
694 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
695 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
696 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
697 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
698 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
699 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
700 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
701 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
702 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
703 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
704 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
705 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
706 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
707 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
708 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
709 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
710 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
711 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
712 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
713 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
714 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
715 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
716 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
717 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
718 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
719 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
720 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
721 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
722 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
723 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
724 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
725 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
726 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
727 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
728 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
729 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
730 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
731 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
732};
733
734static int p4_get_escr_idx(unsigned int addr)
735{
736 unsigned int idx = P4_ESCR_MSR_IDX(addr);
737
738 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
739 !p4_escr_table[idx])) {
740 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
741 return -1;
742 }
743
744 return idx;
745}
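
As a worked example of the indexing above (the address is only an illustration): an ESCR at MSR address 0x3c4 hashes to

	P4_ESCR_MSR_IDX(0x3c4) == 0x3c4 - P4_ESCR_MSR_BASE == 0x24

and p4_get_escr_idx() returns that slot only if the table entry was populated via P4_ESCR_MSR_TABLE_ENTRY(); any other address triggers the WARN_ONCE() and returns -1.
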
746
747static int p4_next_cntr(int thread, unsigned long *used_mask,
748 struct p4_event_bind *bind)
749{
750 int i, j;
751
752 for (i = 0; i < P4_CNTR_LIMIT; i++) {
753 j = bind->cntr[thread][i];
754 if (j != -1 && !test_bit(j, used_mask))
755 return j;
756 }
757
758 return -1;
759}
760
761static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
762{
763 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
764 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
765 int cpu = raw_smp_processor_id();
766 struct hw_perf_event *hwc;
767 struct p4_event_bind *bind;
768 unsigned int i, thread, num;
769 int cntr_idx, escr_idx;
770
771 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
772 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
773
774 for (i = 0, num = n; i < n; i++, num--) {
775
776 hwc = &cpuc->event_list[i]->hw;
777 thread = p4_ht_thread(cpu);
778 bind = p4_config_get_bind(hwc->config);
779 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
780 if (unlikely(escr_idx == -1))
781 goto done;
782
783 if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
784 cntr_idx = hwc->idx;
785 if (assign)
786 assign[i] = hwc->idx;
787 goto reserve;
788 }
789
790 cntr_idx = p4_next_cntr(thread, used_mask, bind);
791 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
792 goto done;
793
794 p4_pmu_swap_config_ts(hwc, cpu);
795 if (assign)
796 assign[i] = cntr_idx;
797reserve:
798 set_bit(cntr_idx, used_mask);
799 set_bit(escr_idx, escr_mask);
800 }
801
802done:
803 return num ? -ENOSPC : 0;
804}
805
806static __initconst const struct x86_pmu p4_pmu = {
807 .name = "Netburst P4/Xeon",
808 .handle_irq = p4_pmu_handle_irq,
809 .disable_all = p4_pmu_disable_all,
810 .enable_all = p4_pmu_enable_all,
811 .enable = p4_pmu_enable_event,
812 .disable = p4_pmu_disable_event,
813 .eventsel = MSR_P4_BPU_CCCR0,
814 .perfctr = MSR_P4_BPU_PERFCTR0,
815 .event_map = p4_pmu_event_map,
816 .max_events = ARRAY_SIZE(p4_general_events),
817 .get_event_constraints = x86_get_event_constraints,
818 /*
819	 * If HT is disabled we may need to use all
820	 * ARCH_P4_MAX_CCCR counters simultaneously,
821	 * though we leave it restricted for the moment,
822	 * assuming HT is on
823 */
824 .num_counters = ARCH_P4_MAX_CCCR,
825 .apic = 1,
826 .cntval_bits = 40,
827 .cntval_mask = (1ULL << 40) - 1,
828 .max_period = (1ULL << 39) - 1,
829 .hw_config = p4_hw_config,
830 .schedule_events = p4_pmu_schedule_events,
831};
832
833static __init int p4_pmu_init(void)
834{
835 unsigned int low, high;
836
837	/* If we get stripped -- indexing fails */
838 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
839
840 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
841 if (!(low & (1 << 7))) {
842 pr_cont("unsupported Netburst CPU model %d ",
843 boot_cpu_data.x86_model);
844 return -ENODEV;
845 }
846
847 memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
848 sizeof(hw_cache_event_ids));
849
850 pr_cont("Netburst events, ");
851
852 x86_pmu = p4_pmu;
853
854 return 0;
855}
856
857#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
27 */ 27 */
28#define P6_NOP_EVENT 0x0000002EULL 28#define P6_NOP_EVENT 0x0000002EULL
29 29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] = 30static struct event_constraint p6_event_constraints[] =
49{ 31{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ 32 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
66 wrmsrl(MSR_P6_EVNTSEL0, val); 48 wrmsrl(MSR_P6_EVNTSEL0, val);
67} 49}
68 50
69static void p6_pmu_enable_all(void) 51static void p6_pmu_enable_all(int added)
70{ 52{
71 unsigned long val; 53 unsigned long val;
72 54
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103} 85}
104 86
105static __initconst struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
106 .name = "p6", 88 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq, 89 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all, 90 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all, 91 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event, 92 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event, 93 .disable = p6_pmu_disable_event,
94 .hw_config = x86_pmu_hw_config,
95 .schedule_events = x86_schedule_events,
112 .eventsel = MSR_P6_EVNTSEL0, 96 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0, 97 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map, 98 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map), 99 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1, 100 .apic = 1,
118 .max_period = (1ULL << 31) - 1, 101 .max_period = (1ULL << 31) - 1,
119 .version = 0, 102 .version = 0,
120 .num_events = 2, 103 .num_counters = 2,
121 /* 104 /*
122 * Events have 40 bits implemented. However they are designed such 105 * Events have 40 bits implemented. However they are designed such
123 * that bits [32-39] are sign extensions of bit 31. As such the 106 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
125 * 108 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B 109 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */ 110 */
128 .event_bits = 32, 111 .cntval_bits = 32,
129 .event_mask = (1ULL << 32) - 1, 112 .cntval_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints, 113 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints, 114 .event_constraints = p6_event_constraints,
132}; 115};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba2320..b9d1ff588445 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/div64.h> 26#include <asm/div64.h>
27#include <asm/vmware.h>
28#include <asm/x86_init.h> 27#include <asm/x86_init.h>
28#include <asm/hypervisor.h>
29 29
30#define CPUID_VMWARE_INFO_LEAF 0x40000000 30#define CPUID_VMWARE_INFO_LEAF 0x40000000
31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -65,7 +65,7 @@ static unsigned long vmware_get_tsc_khz(void)
65 return tsc_hz; 65 return tsc_hz;
66} 66}
67 67
68void __init vmware_platform_setup(void) 68static void __init vmware_platform_setup(void)
69{ 69{
70 uint32_t eax, ebx, ecx, edx; 70 uint32_t eax, ebx, ecx, edx;
71 71
@@ -83,26 +83,22 @@ void __init vmware_platform_setup(void)
83 * serial key should be enough, as this will always have a VMware 83 * serial key should be enough, as this will always have a VMware
84 * specific string when running under VMware hypervisor. 84 * specific string when running under VMware hypervisor.
85 */ 85 */
86int vmware_platform(void) 86static bool __init vmware_platform(void)
87{ 87{
88 if (cpu_has_hypervisor) { 88 if (cpu_has_hypervisor) {
89 unsigned int eax, ebx, ecx, edx; 89 unsigned int eax;
90 char hyper_vendor_id[13]; 90 unsigned int hyper_vendor_id[3];
91 91
92 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); 92 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
93 memcpy(hyper_vendor_id + 0, &ebx, 4); 93 &hyper_vendor_id[1], &hyper_vendor_id[2]);
94 memcpy(hyper_vendor_id + 4, &ecx, 4); 94 if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
95 memcpy(hyper_vendor_id + 8, &edx, 4); 95 return true;
96 hyper_vendor_id[12] = '\0';
97 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
98 return 1;
99 } else if (dmi_available && dmi_name_in_serial("VMware") && 96 } else if (dmi_available && dmi_name_in_serial("VMware") &&
100 __vmware_platform()) 97 __vmware_platform())
101 return 1; 98 return true;
102 99
103 return 0; 100 return false;
104} 101}
105EXPORT_SYMBOL(vmware_platform);
106 102
107/* 103/*
108 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 104 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +112,16 @@ EXPORT_SYMBOL(vmware_platform);
116 * so that the kernel could just trust the hypervisor with providing a 112 * so that the kernel could just trust the hypervisor with providing a
117 * reliable virtual TSC that is suitable for timekeeping. 113 * reliable virtual TSC that is suitable for timekeeping.
118 */ 114 */
119void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) 115static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
120{ 116{
121 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
122 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 118 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
123} 119}
120
121const __refconst struct hypervisor_x86 x86_hyper_vmware = {
122 .name = "VMware",
123 .detect = vmware_platform,
124 .set_cpu_features = vmware_set_cpu_features,
125 .init_platform = vmware_platform_setup,
126};
127EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e5..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done)
11 * - buffer access
12 *
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
16 *
17 *
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */
21
22#include <linux/kernel.h>
23#include <linux/string.h>
24#include <linux/errno.h>
25#include <linux/sched.h>
26#include <linux/slab.h>
27#include <linux/mm.h>
28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31
32#include "ds_selftest.h"
33
34/*
35 * The configuration for a particular DS hardware implementation:
36 */
37struct ds_configuration {
38 /* The name of the configuration: */
39 const char *name;
40
41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
42 unsigned char sizeof_ptr_field;
43
44 /* The size of a BTS/PEBS record in bytes: */
45 unsigned char sizeof_rec[2];
46
47 /* The number of pebs counter reset values in the DS structure. */
48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
52};
53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
58
59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
61
62/* BTS and PEBS buffer alignment: */
63#define DS_ALIGNMENT (1 << 3)
64
65/* Number of buffer pointers in DS: */
66#define NUM_DS_PTR_FIELDS 8
67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
77
78/*
79 * A BTS or PEBS tracer.
80 *
81 * This holds the configuration of the tracer and serves as a handle
82 * to identify tracers.
83 */
84struct ds_tracer {
85 /* The DS context (partially) owned by this tracer. */
86 struct ds_context *context;
87 /* The buffer provided on ds_request() and its size in bytes. */
88 void *buffer;
89 size_t size;
90};
91
92struct bts_tracer {
93 /* The common DS part: */
94 struct ds_tracer ds;
95
96 /* The trace including the DS configuration: */
97 struct bts_trace trace;
98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
104};
105
106struct pebs_tracer {
107 /* The common DS part: */
108 struct ds_tracer ds;
109
110 /* The trace including the DS configuration: */
111 struct pebs_trace trace;
112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
115};
116
117/*
118 * Debug Store (DS) save area configuration (see Intel64 and IA32
119 * Architectures Software Developer's Manual, section 18.5)
120 *
121 * The DS configuration consists of the following fields; different
122 * architetures vary in the size of those fields.
123 *
124 * - double-word aligned base linear address of the BTS buffer
125 * - write pointer into the BTS buffer
126 * - end linear address of the BTS buffer (one byte beyond the end of
127 * the buffer)
128 * - interrupt pointer into BTS buffer
129 * (interrupt occurs when write pointer passes interrupt pointer)
130 * - double-word aligned base linear address of the PEBS buffer
131 * - write pointer into the PEBS buffer
132 * - end linear address of the PEBS buffer (one byte beyond the end of
133 * the buffer)
134 * - interrupt pointer into PEBS buffer
135 * (interrupt occurs when write pointer passes interrupt pointer)
136 * - value to which counter is reset following counter overflow
137 *
138 * Later architectures use 64bit pointers throughout, whereas earlier
139 * architectures use 32bit pointers in 32bit mode.
140 *
141 *
142 * We compute the base address for the first 8 fields based on:
143 * - the field size stored in the DS configuration
144 * - the relative field position
145 * - an offset giving the start of the respective region
146 *
147 * This offset is further used to index various arrays holding
148 * information for BTS and PEBS at the respective index.
149 *
150 * On later 32bit processors, we only access the lower 32bit of the
151 * 64bit pointer fields. The upper halves will be zeroed out.
152 */
153
154enum ds_field {
155 ds_buffer_base = 0,
156 ds_index,
157 ds_absolute_maximum,
158 ds_interrupt_threshold,
159};
160
161enum ds_qualifier {
162 ds_bts = 0,
163 ds_pebs
164};
165
166static inline unsigned long
167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
168{
169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
170 return *(unsigned long *)base;
171}
172
173static inline void
174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
176{
177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
178 (*(unsigned long *)base) = value;
179}
180
181
182/*
183 * Locking is done only for allocating BTS or PEBS resources.
184 */
185static DEFINE_SPINLOCK(ds_lock);
186
187/*
188 * We either support (system-wide) per-cpu or per-thread allocation.
189 * We distinguish the two based on the task_struct pointer, where a
190 * NULL pointer indicates per-cpu allocation for the current cpu.
191 *
192 * Allocations are use-counted. As soon as resources are allocated,
193 * further allocations must be of the same type (per-cpu or
194 * per-thread). We model this by counting allocations (i.e. the number
195 * of tracers of a certain type) for one type negatively:
196 * =0 no tracers
197 * >0 number of per-thread tracers
198 * <0 number of per-cpu tracers
199 *
200 * Tracers essentially gives the number of ds contexts for a certain
201 * type of allocation.
202 */
203static atomic_t tracers = ATOMIC_INIT(0);
204
205static inline int get_tracer(struct task_struct *task)
206{
207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
215 atomic_inc(&tracers);
216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
227}
228
229static inline void put_tracer(struct task_struct *task)
230{
231 if (task)
232 atomic_dec(&tracers);
233 else
234 atomic_inc(&tracers);
235}
236
237/*
238 * The DS context is either attached to a thread or to a cpu:
239 * - in the former case, the thread_struct contains a pointer to the
240 * attached context.
241 * - in the latter case, we use a static array of per-cpu context
242 * pointers.
243 *
244 * Contexts are use-counted. They are allocated on first access and
245 * deallocated when the last user puts the context.
246 */
247struct ds_context {
248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
249 unsigned char ds[MAX_SIZEOF_DS];
250
251 /* The owner of the BTS and PEBS configuration, respectively: */
252 struct bts_tracer *bts_master;
253 struct pebs_tracer *pebs_master;
254
255 /* Use count: */
256 unsigned long count;
257
258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
267
268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269
270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{
273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL;
277
278 /* Chances are small that we already have a context. */
279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
280 if (!new_context)
281 return NULL;
282
283 spin_lock_irq(&ds_lock);
284
285 context = *p_context;
286 if (likely(!context)) {
287 context = new_context;
288
289 context->this = p_context;
290 context->task = task;
291 context->cpu = cpu;
292 context->count = 0;
293
294 *p_context = context;
295 }
296
297 context->count++;
298
299 spin_unlock_irq(&ds_lock);
300
301 if (context != new_context)
302 kfree(new_context);
303
304 return context;
305}
306
307static void ds_put_context(struct ds_context *context)
308{
309 struct task_struct *task;
310 unsigned long irq;
311
312 if (!context)
313 return;
314
315 spin_lock_irqsave(&ds_lock, irq);
316
317 if (--context->count) {
318 spin_unlock_irqrestore(&ds_lock, irq);
319 return;
320 }
321
322 *(context->this) = NULL;
323
324 task = context->task;
325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
328
329 /*
330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
338
339 spin_unlock_irqrestore(&ds_lock, irq);
340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
345 kfree(context);
346}
347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
361 * the same confiuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
373
374/*
375 * Call the tracer's callback on a buffer overflow.
376 *
377 * context: the ds context
378 * qual: the buffer type
379 */
380static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
381{
382 switch (qual) {
383 case ds_bts:
384 if (context->bts_master &&
385 context->bts_master->ovfl)
386 context->bts_master->ovfl(context->bts_master);
387 break;
388 case ds_pebs:
389 if (context->pebs_master &&
390 context->pebs_master->ovfl)
391 context->pebs_master->ovfl(context->pebs_master);
392 break;
393 }
394}
395
396
397/*
398 * Write raw data into the BTS or PEBS buffer.
399 *
400 * The remainder of any partially written record is zeroed out.
401 *
402 * context: the DS context
403 * qual: the buffer type
404 * record: the data to write
405 * size: the size of the data
406 */
407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
408 const void *record, size_t size)
409{
410 int bytes_written = 0;
411
412 if (!record)
413 return -EINVAL;
414
415 while (size) {
416 unsigned long base, index, end, write_end, int_th;
417 unsigned long write_size, adj_write_size;
418
419 /*
420 * Write as much as possible without producing an
421 * overflow interrupt.
422 *
423 * Interrupt_threshold must either be
424 * - bigger than absolute_maximum or
425 * - point to a record between buffer_base and absolute_maximum
426 *
427 * Index points to a valid record.
428 */
429 base = ds_get(context->ds, qual, ds_buffer_base);
430 index = ds_get(context->ds, qual, ds_index);
431 end = ds_get(context->ds, qual, ds_absolute_maximum);
432 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
433
434 write_end = min(end, int_th);
435
436 /*
437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
440 if (write_end <= index)
441 write_end = end;
442
443 if (write_end <= index)
444 break;
445
446 write_size = min((unsigned long) size, write_end - index);
447 memcpy((void *)index, record, write_size);
448
449 record = (const char *)record + write_size;
450 size -= write_size;
451 bytes_written += write_size;
452
453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
454 adj_write_size *= ds_cfg.sizeof_rec[qual];
455
456 /* Zero out trailing bytes. */
457 memset((char *)index + write_size, 0,
458 adj_write_size - write_size);
459 index += adj_write_size;
460
461 if (index >= end)
462 index = base;
463 ds_set(context->ds, qual, ds_index, index);
464
465 if (index >= int_th)
466 ds_overflow(context, qual);
467 }
468
469 return bytes_written;
470}
471
472
473/*
474 * Branch Trace Store (BTS) uses the following format. Different
475 * architectures vary in the size of those fields.
476 * - source linear address
477 * - destination linear address
478 * - flags
479 *
480 * Later architectures use 64bit pointers throughout, whereas earlier
481 * architectures use 32bit pointers in 32bit mode.
482 *
483 * We compute the base address for the fields based on:
484 * - the field size stored in the DS configuration
485 * - the relative field position
486 *
487 * In order to store additional information in the BTS buffer, we use
488 * a special source address to indicate that the record requires
489 * special interpretation.
490 *
491 * Netburst indicated via a bit in the flags field whether the branch
492 * was predicted; this is ignored.
493 *
494 * We use two levels of abstraction:
495 * - the raw data level defined here
496 * - an arch-independent level defined in ds.h
497 */
498
499enum bts_field {
500 bts_from,
501 bts_to,
502 bts_flags,
503
504 bts_qual = bts_from,
505 bts_clock = bts_to,
506 bts_pid = bts_flags,
507
508 bts_qual_mask = (bts_qual_max - 1),
509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
510};
511
512static inline unsigned long bts_get(const char *base, unsigned long field)
513{
514 base += (ds_cfg.sizeof_ptr_field * field);
515 return *(unsigned long *)base;
516}
517
518static inline void bts_set(char *base, unsigned long field, unsigned long val)
519{
520 base += (ds_cfg.sizeof_ptr_field * field);
521 (*(unsigned long *)base) = val;
522}
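The format comment above describes a BTS record as three consecutive pointer-sized fields, and bts_get()/bts_set() compute each field's offset as the field index times ds_cfg.sizeof_ptr_field. A stand-alone sketch of that layout, assuming 8-byte fields as on DTES64-capable CPUs; the addresses are made up:

#include <stdio.h>
#include <string.h>

#define PTR_FIELD_SIZE	8		/* assumed 8-byte pointer fields */
enum { FIELD_FROM, FIELD_TO, FIELD_FLAGS, NUM_FIELDS };

static unsigned long long field_get(const unsigned char *rec, int field)
{
	unsigned long long val;

	memcpy(&val, rec + (field * PTR_FIELD_SIZE), sizeof(val));
	return val;
}

static void field_set(unsigned char *rec, int field, unsigned long long val)
{
	memcpy(rec + (field * PTR_FIELD_SIZE), &val, sizeof(val));
}

int main(void)
{
	unsigned char rec[NUM_FIELDS * PTR_FIELD_SIZE] = { 0 };

	field_set(rec, FIELD_FROM, 0xffffffff81000000ull);	/* made up */
	field_set(rec, FIELD_TO, 0xffffffff81000040ull);	/* made up */

	printf("from=%#llx to=%#llx flags=%#llx\n",
	       field_get(rec, FIELD_FROM),
	       field_get(rec, FIELD_TO),
	       field_get(rec, FIELD_FLAGS));
	return 0;
}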
523
524
525/*
526 * The raw BTS data is architecture dependent.
527 *
528 * For higher-level users, we give an arch-independent view.
529 * - ds.h defines struct bts_struct
530 * - bts_read translates one raw bts record into a bts_struct
531 * - bts_write translates one bts_struct into the raw format and
532 * writes it into the top of the parameter tracer's buffer.
533 *
534 * return: bytes read/written on success; -Eerrno, otherwise
535 */
536static int
537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
538{
539 if (!tracer)
540 return -EINVAL;
541
542 if (at < tracer->trace.ds.begin)
543 return -EINVAL;
544
545 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
546 return -EINVAL;
547
548 memset(out, 0, sizeof(*out));
549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
551 out->variant.event.clock = bts_get(at, bts_clock);
552 out->variant.event.pid = bts_get(at, bts_pid);
553 } else {
554 out->qualifier = bts_branch;
555 out->variant.lbr.from = bts_get(at, bts_from);
556 out->variant.lbr.to = bts_get(at, bts_to);
557
558 if (!out->variant.lbr.from && !out->variant.lbr.to)
559 out->qualifier = bts_invalid;
560 }
561
562 return ds_cfg.sizeof_rec[ds_bts];
563}
564
565static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
566{
567 unsigned char raw[MAX_SIZEOF_BTS];
568
569 if (!tracer)
570 return -EINVAL;
571
572 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
573 return -EOVERFLOW;
574
575 switch (in->qualifier) {
576 case bts_invalid:
577 bts_set(raw, bts_from, 0);
578 bts_set(raw, bts_to, 0);
579 bts_set(raw, bts_flags, 0);
580 break;
581 case bts_branch:
582 bts_set(raw, bts_from, in->variant.lbr.from);
583 bts_set(raw, bts_to, in->variant.lbr.to);
584 bts_set(raw, bts_flags, 0);
585 break;
586 case bts_task_arrives:
587 case bts_task_departs:
588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
589 bts_set(raw, bts_clock, in->variant.event.clock);
590 bts_set(raw, bts_pid, in->variant.event.pid);
591 break;
592 default:
593 return -EINVAL;
594 }
595
596 return ds_write(tracer->ds.context, ds_bts, raw,
597 ds_cfg.sizeof_rec[ds_bts]);
598}
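bts_write() above encodes event records (task arrives/departs) by storing bts_escape ORed with the qualifier in the 'from' slot, and bts_read() recognizes them by checking that everything outside the low qualifier bits matches the escape value. A small user-space sketch of that check; the qualifier width and the sample values are assumptions for illustration, not taken from ds.h:

#include <stdio.h>

#define QUAL_MAX	4		/* assumed stand-in for bts_qual_max */
#define QUAL_MASK	((unsigned long)(QUAL_MAX - 1))
#define BTS_ESCAPE	((unsigned long)-1 & ~QUAL_MASK)

int main(void)
{
	unsigned long records[2] = {
		0x400123ul,		/* ordinary branch source address */
		BTS_ESCAPE | 2ul,	/* event record with qualifier 2 */
	};
	int i;

	for (i = 0; i < 2; i++) {
		if ((records[i] & ~QUAL_MASK) == BTS_ESCAPE)
			printf("record %d: event, qualifier %lu\n",
			       i, records[i] & QUAL_MASK);
		else
			printf("record %d: branch\n", i);
	}
	return 0;
}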
599
600
601static void ds_write_config(struct ds_context *context,
602 struct ds_trace *cfg, enum ds_qualifier qual)
603{
604 unsigned char *ds = context->ds;
605
606 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
607 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
608 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
609 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
610}
611
612static void ds_read_config(struct ds_context *context,
613 struct ds_trace *cfg, enum ds_qualifier qual)
614{
615 unsigned char *ds = context->ds;
616
617 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
618 cfg->top = (void *)ds_get(ds, qual, ds_index);
619 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
620 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
621}
622
623static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
624 void *base, size_t size, size_t ith,
625 unsigned int flags) {
626 unsigned long buffer, adj;
627
628 /*
629 * Adjust the buffer address and size to meet alignment
630 * constraints:
631 * - buffer is double-word aligned
632 * - size is multiple of record size
633 *
634 * We checked the size at the very beginning; we have enough
635 * space to do the adjustment.
636 */
637 buffer = (unsigned long)base;
638
639 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
640 buffer += adj;
641 size -= adj;
642
643 trace->n = size / ds_cfg.sizeof_rec[qual];
644 trace->size = ds_cfg.sizeof_rec[qual];
645
646 size = (trace->n * trace->size);
647
648 trace->begin = (void *)buffer;
649 trace->top = trace->begin;
650 trace->end = (void *)(buffer + size);
651 /*
652	 * The value for 'no threshold' is -1; the arithmetic below then
653	 * places the threshold outside of the buffer, exactly as we want.
654 */
655 ith *= ds_cfg.sizeof_rec[qual];
656 trace->ith = (void *)(buffer + size - ith);
657
658 trace->flags = flags;
659}
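The 'no threshold' case above relies on unsigned wrap-around: multiplying (size_t)-1 by the record size yields the negative of the record size, so the subtraction places the threshold one record past the end of the buffer, where it can never be reached. A tiny sketch with made-up sizes:

#include <stdio.h>

int main(void)
{
	size_t rec = 24;		/* made-up BTS record size */
	size_t n = 100, size = n * rec;
	size_t ith = (size_t)-1;	/* 'no threshold' request */
	size_t buffer = 0x100000;	/* made-up buffer address */

	ith *= rec;			/* wraps around to -rec */

	printf("end at %#zx, threshold at %#zx\n",
	       buffer + size, buffer + size - ith);
	return 0;
}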
660
661
662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
663 enum ds_qualifier qual, struct task_struct *task,
664 int cpu, void *base, size_t size, size_t th)
665{
666 struct ds_context *context;
667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
673
674 error = -EINVAL;
675 if (!base)
676 goto out;
677
678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
683 error = -EINVAL;
684 if (size < req_size)
685 goto out;
686
687 if (th != (size_t)-1) {
688 th *= ds_cfg.sizeof_rec[qual];
689
690 error = -EINVAL;
691 if (size <= th)
692 goto out;
693 }
694
695 tracer->buffer = base;
696 tracer->size = size;
697
698 error = -ENOMEM;
699 context = ds_get_context(task, cpu);
700 if (!context)
701 goto out;
702 tracer->context = context;
703
704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
708
709 error = 0;
710 out:
711 return error;
712}
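The size check in ds_request() above only demands room for a single record plus, when the buffer is not suitably aligned, the slack that ds_init_ds_trace() will later consume while aligning it. A sketch of that minimum-size computation; the record size and the DS_ALIGNMENT value are assumptions for illustration:

#include <stdio.h>

#define DS_ALIGN	8		/* assumed stand-in for DS_ALIGNMENT */

int main(void)
{
	unsigned long base = 0x100004;	/* made-up, not DS_ALIGN-aligned */
	size_t rec = 24;		/* made-up BTS record size */
	size_t req_size = rec;

	if (base % DS_ALIGN)
		req_size += DS_ALIGN;	/* slack for the later alignment fixup */

	printf("buffer must be at least %zu bytes\n", req_size);
	return 0;
}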
713
714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
715 void *base, size_t size,
716 bts_ovfl_callback_t ovfl, size_t th,
717 unsigned int flags)
718{
719 struct bts_tracer *tracer;
720 int error;
721
722 /* Buffer overflow notification is not yet implemented. */
723 error = -EOPNOTSUPP;
724 if (ovfl)
725 goto out;
726
727 error = get_tracer(task);
728 if (error < 0)
729 goto out;
730
731 error = -ENOMEM;
732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
733 if (!tracer)
734 goto out_put_tracer;
735 tracer->ovfl = ovfl;
736
737 /* Do some more error checking and acquire a tracing context. */
738 error = ds_request(&tracer->ds, &tracer->trace.ds,
739 ds_bts, task, cpu, base, size, th);
740 if (error < 0)
741 goto out_tracer;
742
743 /* Claim the bts part of the tracing context we acquired above. */
744 spin_lock_irq(&ds_lock);
745
746 error = -EPERM;
747 if (tracer->ds.context->bts_master)
748 goto out_unlock;
749 tracer->ds.context->bts_master = tracer;
750
751 spin_unlock_irq(&ds_lock);
752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
760
761 tracer->trace.read = bts_read;
762 tracer->trace.write = bts_write;
763
764 /* Start tracing. */
765 ds_resume_bts(tracer);
766
767 return tracer;
768
769 out_unlock:
770 spin_unlock_irq(&ds_lock);
771 ds_put_context(tracer->ds.context);
772 out_tracer:
773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
776 out:
777 return ERR_PTR(error);
778}
779
780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
781 void *base, size_t size,
782 bts_ovfl_callback_t ovfl,
783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
799{
800 struct pebs_tracer *tracer;
801 int error;
802
803 /* Buffer overflow notification is not yet implemented. */
804 error = -EOPNOTSUPP;
805 if (ovfl)
806 goto out;
807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
812 error = -ENOMEM;
813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
814 if (!tracer)
815 goto out_put_tracer;
816 tracer->ovfl = ovfl;
817
818 /* Do some more error checking and acquire a tracing context. */
819 error = ds_request(&tracer->ds, &tracer->trace.ds,
820 ds_pebs, task, cpu, base, size, th);
821 if (error < 0)
822 goto out_tracer;
823
824 /* Claim the pebs part of the tracing context we acquired above. */
825 spin_lock_irq(&ds_lock);
826
827 error = -EPERM;
828 if (tracer->ds.context->pebs_master)
829 goto out_unlock;
830 tracer->ds.context->pebs_master = tracer;
831
832 spin_unlock_irq(&ds_lock);
833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
843 ds_resume_pebs(tracer);
844
845 return tracer;
846
847 out_unlock:
848 spin_unlock_irq(&ds_lock);
849 ds_put_context(tracer->ds.context);
850 out_tracer:
851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
854 out:
855 return ERR_PTR(error);
856}
857
858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
862{
863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
864}
865
866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
878
879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
880 tracer->ds.context->bts_master = NULL;
881
882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
888
889 kfree(tracer);
890}
891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
945void ds_suspend_bts(struct bts_tracer *tracer)
946{
947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
950
951 if (!tracer)
952 return;
953
954 tracer->flags = 0;
955
956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
958
959 WARN_ON(!task && irqs_disabled());
960
961 debugctlmsr = (task ?
962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
965
966 if (task)
967 update_task_debugctlmsr(task, debugctlmsr);
968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
970}
971
972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
973{
974 struct task_struct *task;
975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
977
978 if (!tracer)
979 return 0;
980
981 tracer->flags = 0;
982
983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
1011
1012 control = ds_cfg.ctl[dsf_bts];
1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
1014 control |= ds_cfg.ctl[dsf_bts_kernel];
1015 if (!(tracer->trace.ds.flags & BTS_USER))
1016 control |= ds_cfg.ctl[dsf_bts_user];
1017
1018 return control;
1019}
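ds_bts_control() above assembles the DEBUGCTL bits from the per-CPU configuration: the base bits enable branch trace store, and the kernel/user suppress bits are set for whichever privilege level the tracer did not ask to trace. A user-space sketch using the Core 2/Atom bit positions from ds_cfg_core2_atom further down; the BTS_KERNEL/BTS_USER flag encodings are made up for this sketch:

#include <stdio.h>

#define BTS_KERNEL	0x1		/* made-up flag encodings */
#define BTS_USER	0x2

int main(void)
{
	/* Core 2/Atom bit positions, as in ds_cfg_core2_atom. */
	unsigned long ctl_bts    = (1UL << 6) | (1UL << 7);
	unsigned long ctl_no_os  = (1UL << 9);	/* suppress kernel branches */
	unsigned long ctl_no_usr = (1UL << 10);	/* suppress user branches */
	unsigned int flags = BTS_USER;		/* request user tracing only */
	unsigned long control = ctl_bts;

	if (!(flags & BTS_KERNEL))
		control |= ctl_no_os;
	if (!(flags & BTS_USER))
		control |= ctl_no_usr;

	printf("DEBUGCTL bits to OR in: %#lx\n", control);
	return 0;
}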
1020
1021void ds_resume_bts(struct bts_tracer *tracer)
1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
1027 if (!tracer)
1028 return;
1029
1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
1089
1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
1091 tracer->ds.context->pebs_master = NULL;
1092
1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
1095
1096 kfree(tracer);
1097}
1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
1141void ds_suspend_pebs(struct pebs_tracer *tracer)
1142{
1143
1144}
1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
1151void ds_resume_pebs(struct pebs_tracer *tracer)
1152{
1153
1154}
1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
1162{
1163 if (!tracer)
1164 return NULL;
1165
1166 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
1167 return &tracer->trace;
1168}
1169
1170const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
1171{
1172 if (!tracer)
1173 return NULL;
1174
1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
1176
1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
1182
1183 return &tracer->trace;
1184}
1185
1186int ds_reset_bts(struct bts_tracer *tracer)
1187{
1188 if (!tracer)
1189 return -EINVAL;
1190
1191 tracer->trace.ds.top = tracer->trace.ds.begin;
1192
1193 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
1194 (unsigned long)tracer->trace.ds.top);
1195
1196 return 0;
1197}
1198
1199int ds_reset_pebs(struct pebs_tracer *tracer)
1200{
1201 if (!tracer)
1202 return -EINVAL;
1203
1204 tracer->trace.ds.top = tracer->trace.ds.begin;
1205
1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
1207 (unsigned long)tracer->trace.ds.top);
1208
1209 return 0;
1210}
1211
1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
1214{
1215 if (!tracer)
1216 return -EINVAL;
1217
1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
1224
1225 return 0;
1226}
1227
1228static const struct ds_configuration ds_cfg_netburst = {
1229 .name = "Netburst",
1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
1231 .ctl[dsf_bts_kernel] = (1 << 5),
1232 .ctl[dsf_bts_user] = (1 << 6),
1233 .nr_counter_reset = 1,
1234};
1235static const struct ds_configuration ds_cfg_pentium_m = {
1236 .name = "Pentium M",
1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1238 .nr_counter_reset = 1,
1239};
1240static const struct ds_configuration ds_cfg_core2_atom = {
1241 .name = "Core 2/Atom",
1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1243 .ctl[dsf_bts_kernel] = (1 << 9),
1244 .ctl[dsf_bts_user] = (1 << 10),
1245 .nr_counter_reset = 1,
1246};
1247static const struct ds_configuration ds_cfg_core_i7 = {
1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
1253};
1254
1255static void
1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
1296 ds_cfg = *cfg;
1297
1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
1303
1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
1305 ds_cfg.sizeof_rec[ds_bts] = 0;
1306 printk(KERN_INFO "[ds] bts not available\n");
1307 }
1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
1317
1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
1319}
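ds_configure() above derives the PEBS record size from the format field in bits 11:8 of IA32_PERF_CAPABILITIES and from the pointer field width, while a BTS record is always three pointer fields. A user-space sketch of that computation with an assumed MSR value (format 1) in place of a real rdmsrl():

#include <stdio.h>

int main(void)
{
	/* Assumed value; the kernel reads IA32_PERF_CAPABILITIES instead. */
	unsigned long long perf_capabilities = 0x100;
	unsigned long format = (unsigned long)((perf_capabilities >> 8) & 0xf);
	unsigned long nr_pebs_fields;
	unsigned int ptr_field = 8;	/* DTES64: 8-byte pointer fields */

	switch (format) {
	case 0:
		nr_pebs_fields = 18;
		break;
	case 1:
		nr_pebs_fields = 22;
		break;
	default:
		nr_pebs_fields = 0;	/* unknown format: disable PEBS */
		break;
	}

	printf("PEBS format %lu: record %lu bytes, BTS record %u bytes\n",
	       format, nr_pebs_fields * ptr_field, 3 * ptr_field);
	return 0;
}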
1320
1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
1327 switch (c->x86) {
1328 case 0x6:
1329 switch (c->x86_model) {
1330 case 0x9:
1331 case 0xd: /* Pentium M */
1332 ds_configure(&ds_cfg_pentium_m, c);
1333 break;
1334 case 0xf:
1335 case 0x17: /* Core2 */
1336 case 0x1c: /* Atom */
1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
1341 break;
1342 default:
1343 /* Sorry, don't know about them. */
1344 break;
1345 }
1346 break;
1347 case 0xf:
1348 switch (c->x86_model) {
1349 case 0x0:
1350 case 0x1:
1351 case 0x2: /* Netburst */
1352 ds_configure(&ds_cfg_netburst, c);
1353 break;
1354 default:
1355 /* Sorry, don't know about them. */
1356 break;
1357 }
1358 break;
1359 default:
1360 /* Sorry, don't know about them. */
1361 break;
1362 }
1363}
1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
1386/*
1387 * Change the DS configuration from tracing prev to tracing next.
1388 */
1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
1390{
1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
1397
1398 if (prev_ctx) {
1399 update_debugctlmsr(0);
1400
1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
1402 }
1403
1404 if (next_ctx) {
1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1406
1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1408 }
1409
1410 update_debugctlmsr(debugctlmsr);
1411}
1412
1413static __init int ds_selftest(void)
1414{
1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1416 int error;
1417
1418 error = ds_selftest_bts();
1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
19#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
20#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
21
22struct ds_selftest_bts_conf {
23 struct bts_tracer *tracer;
24 int error;
25 int (*suspend)(struct bts_tracer *);
26 int (*resume)(struct bts_tracer *);
27};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
59	 * We allow top in [begin; end], since it's not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151	/* By now we should have collected enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167		 * buffer overflow and ended up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216		 * buffer overflow and ended up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
249static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
250{
251 ds_suspend_bts(tracer);
252 return 0;
253}
254
255static int ds_resume_bts_wrap(struct bts_tracer *tracer)
256{
257 ds_resume_bts(tracer);
258 return 0;
259}
260
261static void ds_release_bts_noirq_wrap(void *tracer)
262{
263 (void)ds_release_bts_noirq(tracer);
264}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
303static int ds_selftest_bts_bad_request_task(void *buffer)
304{
305 struct bts_tracer *tracer;
306 int error;
307
308	/* Try to request task tracing while cpu tracing is active. */
309 tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
310 (size_t)-1, BTS_KERNEL);
311 error = PTR_ERR(tracer);
312 if (!IS_ERR(tracer)) {
313 error = 0;
314 ds_release_bts(tracer);
315 }
316
317 if (error != -EPERM)
318 printk(KERN_CONT "task/cpu tracing overlap...");
319
320 return error ? 0 : -1;
321}
322
323int ds_selftest_bts(void)
324{
325 struct ds_selftest_bts_conf conf;
326 unsigned char buffer[BUFFER_SIZE], *small_buffer;
327 unsigned long irq;
328 int cpu;
329
330 printk(KERN_INFO "[ds] bts selftest...");
331 conf.error = 0;
332
333 small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
334
335 get_online_cpus();
336 for_each_online_cpu(cpu) {
337 conf.suspend = ds_suspend_bts_wrap;
338 conf.resume = ds_resume_bts_wrap;
339 conf.tracer =
340 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
341 NULL, (size_t)-1, BTS_KERNEL);
342 ds_selftest_bts_cpu(&conf);
343 if (conf.error >= 0)
344 conf.error = ds_selftest_bts_bad_request_task(buffer);
345 ds_release_bts(conf.tracer);
346 if (conf.error < 0)
347 goto out;
348
349 conf.suspend = ds_suspend_bts_noirq;
350 conf.resume = ds_resume_bts_noirq;
351 conf.tracer =
352 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
353 NULL, (size_t)-1, BTS_KERNEL);
354 smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
355 if (conf.error >= 0) {
356 conf.error =
357 ds_selftest_bts_bad_release_noirq(cpu,
358 conf.tracer);
359 /* We must not release the tracer twice. */
360 if (conf.error < 0)
361 conf.tracer = NULL;
362 }
363 if (conf.error >= 0)
364 conf.error = ds_selftest_bts_bad_request_task(buffer);
365 smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
366 conf.tracer, 1);
367 if (conf.error < 0)
368 goto out;
369 }
370
371 conf.suspend = ds_suspend_bts_wrap;
372 conf.resume = ds_resume_bts_wrap;
373 conf.tracer =
374 ds_request_bts_task(current, buffer, BUFFER_SIZE,
375 NULL, (size_t)-1, BTS_KERNEL);
376 ds_selftest_bts_cpu(&conf);
377 if (conf.error >= 0)
378 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
379 ds_release_bts(conf.tracer);
380 if (conf.error < 0)
381 goto out;
382
383 conf.suspend = ds_suspend_bts_noirq;
384 conf.resume = ds_resume_bts_noirq;
385 conf.tracer =
386 ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
387 NULL, (size_t)-1, BTS_KERNEL);
388 local_irq_save(irq);
389 ds_selftest_bts_cpu(&conf);
390 if (conf.error >= 0)
391 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
392 ds_release_bts_noirq(conf.tracer);
393 local_irq_restore(irq);
394 if (conf.error < 0)
395 goto out;
396
397 conf.error = 0;
398 out:
399 put_online_cpus();
400 printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
401
402 return conf.error;
403}
404
405int ds_selftest_pebs(void)
406{
407 return 0;
408}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#ifdef CONFIG_X86_DS_SELFTEST
10extern int ds_selftest_bts(void);
11extern int ds_selftest_pebs(void);
12#else
13static inline int ds_selftest_bts(void) { return 0; }
14static inline int ds_selftest_pebs(void) { return 0; }
15#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780a..c89a386930b7 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
224 int cpu; 224 int cpu;
225 unsigned long flags; 225 unsigned long flags;
226 226
227 /* notify the hw-branch tracer so it may disable tracing and
228 add the last trace to the trace buffer -
229 the earlier this happens, the more useful the trace. */
230 trace_hw_branch_oops();
231
232 oops_enter(); 227 oops_enter();
233 228
234 /* racy, but better than risking deadlock. */ 229 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc6737..cd49141cf153 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
53#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56#include <asm/cpufeature.h>
56 57
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 58/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h> 59#include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 906 RING0_INT_FRAME
906 pushl $0 907 pushl $0
907 CFI_ADJUST_CFA_OFFSET 4 908 CFI_ADJUST_CFA_OFFSET 4
909#ifdef CONFIG_X86_INVD_BUG
910 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
911661: pushl $do_general_protection
912662:
913.section .altinstructions,"a"
914 .balign 4
915 .long 661b
916 .long 663f
917 .byte X86_FEATURE_XMM
918 .byte 662b-661b
919 .byte 664f-663f
920.previous
921.section .altinstr_replacement,"ax"
922663: pushl $do_simd_coprocessor_error
923664:
924.previous
925#else
908 pushl $do_simd_coprocessor_error 926 pushl $do_simd_coprocessor_error
927#endif
909 CFI_ADJUST_CFA_OFFSET 4 928 CFI_ADJUST_CFA_OFFSET 4
910 jmp error_code 929 jmp error_code
911 CFI_ENDPROC 930 CFI_ENDPROC
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519f..a8f1b803d2fd 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len)
189} 189}
190 190
191/* 191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space. 192 * Check for virtual address in kernel space.
205 */ 193 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) 194int arch_check_bp_in_kernelspace(struct perf_event *bp)
207{ 195{
208 unsigned int len; 196 unsigned int len;
197 unsigned long va;
198 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
209 199
210 len = get_hbp_len(hbp_len); 200 va = info->address;
201 len = get_hbp_len(info->len);
211 202
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 203 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 204}
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
300/* 291/*
301 * Validate the arch-specific HW Breakpoint register settings 292 * Validate the arch-specific HW Breakpoint register settings
302 */ 293 */
303int arch_validate_hwbkpt_settings(struct perf_event *bp, 294int arch_validate_hwbkpt_settings(struct perf_event *bp)
304 struct task_struct *tsk)
305{ 295{
306 struct arch_hw_breakpoint *info = counter_arch_bp(bp); 296 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
307 unsigned int align; 297 unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
314 304
315 ret = -EINVAL; 305 ret = -EINVAL;
316 306
317 if (info->type == X86_BREAKPOINT_EXECUTE)
318 /*
319 * Ptrace-refactoring code
320 * For now, we'll allow instruction breakpoint only for user-space
321 * addresses
322 */
323 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
324 info->len != X86_BREAKPOINT_EXECUTE)
325 return ret;
326
327 switch (info->len) { 307 switch (info->len) {
328 case X86_BREAKPOINT_LEN_1: 308 case X86_BREAKPOINT_LEN_1:
329 align = 0; 309 align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
350 if (info->address & align) 330 if (info->address & align)
351 return -EINVAL; 331 return -EINVAL;
352 332
353 /* Check that the virtual address is in the proper range */
354 if (tsk) {
355 if (!arch_check_va_in_userspace(info->address, info->len))
356 return -EFAULT;
357 } else {
358 if (!arch_check_va_in_kernelspace(info->address, info->len))
359 return -EFAULT;
360 }
361
362 return 0; 333 return 0;
363} 334}
364 335
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c285488..86cef6b32253 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,65 +102,62 @@ void __cpuinit fpu_init(void)
102 102
103 mxcsr_feature_mask_init(); 103 mxcsr_feature_mask_init();
104 /* clean state in init */ 104 /* clean state in init */
105 if (cpu_has_xsave) 105 current_thread_info()->status = 0;
106 current_thread_info()->status = TS_XSAVE;
107 else
108 current_thread_info()->status = 0;
109 clear_used_math(); 106 clear_used_math();
110} 107}
111#endif /* CONFIG_X86_64 */ 108#endif /* CONFIG_X86_64 */
112 109
113/* 110static void fpu_finit(struct fpu *fpu)
114 * The _current_ task is using the FPU for the first time
115 * so initialize it and set the mxcsr to its default
116 * value at reset if we support XMM instructions and then
117 * remember that the current task has used the FPU.
118 */
119int init_fpu(struct task_struct *tsk)
120{ 111{
121 if (tsk_used_math(tsk)) {
122 if (HAVE_HWFP && tsk == current)
123 unlazy_fpu(tsk);
124 return 0;
125 }
126
127 /*
128 * Memory allocation at the first usage of the FPU and other state.
129 */
130 if (!tsk->thread.xstate) {
131 tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
132 GFP_KERNEL);
133 if (!tsk->thread.xstate)
134 return -ENOMEM;
135 }
136
137#ifdef CONFIG_X86_32 112#ifdef CONFIG_X86_32
138 if (!HAVE_HWFP) { 113 if (!HAVE_HWFP) {
139 memset(tsk->thread.xstate, 0, xstate_size); 114 finit_soft_fpu(&fpu->state->soft);
140 finit_task(tsk); 115 return;
141 set_stopped_child_used_math(tsk);
142 return 0;
143 } 116 }
144#endif 117#endif
145 118
146 if (cpu_has_fxsr) { 119 if (cpu_has_fxsr) {
147 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 120 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
148 121
149 memset(fx, 0, xstate_size); 122 memset(fx, 0, xstate_size);
150 fx->cwd = 0x37f; 123 fx->cwd = 0x37f;
151 if (cpu_has_xmm) 124 if (cpu_has_xmm)
152 fx->mxcsr = MXCSR_DEFAULT; 125 fx->mxcsr = MXCSR_DEFAULT;
153 } else { 126 } else {
154 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 127 struct i387_fsave_struct *fp = &fpu->state->fsave;
155 memset(fp, 0, xstate_size); 128 memset(fp, 0, xstate_size);
156 fp->cwd = 0xffff037fu; 129 fp->cwd = 0xffff037fu;
157 fp->swd = 0xffff0000u; 130 fp->swd = 0xffff0000u;
158 fp->twd = 0xffffffffu; 131 fp->twd = 0xffffffffu;
159 fp->fos = 0xffff0000u; 132 fp->fos = 0xffff0000u;
160 } 133 }
134}
135
136/*
137 * The _current_ task is using the FPU for the first time
138 * so initialize it and set the mxcsr to its default
139 * value at reset if we support XMM instructions and then
140 * remember that the current task has used the FPU.
141 */
142int init_fpu(struct task_struct *tsk)
143{
144 int ret;
145
146 if (tsk_used_math(tsk)) {
147 if (HAVE_HWFP && tsk == current)
148 unlazy_fpu(tsk);
149 return 0;
150 }
151
161 /* 152 /*
162 * Only the device not available exception or ptrace can call init_fpu. 153 * Memory allocation at the first usage of the FPU and other state.
163 */ 154 */
155 ret = fpu_alloc(&tsk->thread.fpu);
156 if (ret)
157 return ret;
158
159 fpu_finit(&tsk->thread.fpu);
160
164 set_stopped_child_used_math(tsk); 161 set_stopped_child_used_math(tsk);
165 return 0; 162 return 0;
166} 163}
@@ -194,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
194 return ret; 191 return ret;
195 192
196 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 193 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
197 &target->thread.xstate->fxsave, 0, -1); 194 &target->thread.fpu.state->fxsave, 0, -1);
198} 195}
199 196
200int xfpregs_set(struct task_struct *target, const struct user_regset *regset, 197int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -211,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
211 return ret; 208 return ret;
212 209
213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 210 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
214 &target->thread.xstate->fxsave, 0, -1); 211 &target->thread.fpu.state->fxsave, 0, -1);
215 212
216 /* 213 /*
217 * mxcsr reserved bits must be masked to zero for security reasons. 214 * mxcsr reserved bits must be masked to zero for security reasons.
218 */ 215 */
219 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 216 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
220 217
221 /* 218 /*
222 * update the header bits in the xsave header, indicating the 219 * update the header bits in the xsave header, indicating the
223 * presence of FP and SSE state. 220 * presence of FP and SSE state.
224 */ 221 */
225 if (cpu_has_xsave) 222 if (cpu_has_xsave)
226 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 223 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
227 224
228 return ret; 225 return ret;
229} 226}
@@ -246,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
246 * memory layout in the thread struct, so that we can copy the entire 243 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout(). 244 * xstateregs to the user using one user_regset_copyout().
248 */ 245 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved, 246 memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); 247 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251 248
252 /* 249 /*
253 * Copy the xstate memory layout. 250 * Copy the xstate memory layout.
254 */ 251 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, 252 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1); 253 &target->thread.fpu.state->xsave, 0, -1);
257 return ret; 254 return ret;
258} 255}
259 256
@@ -272,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
272 return ret; 269 return ret;
273 270
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 271 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1); 272 &target->thread.fpu.state->xsave, 0, -1);
276 273
277 /* 274 /*
278 * mxcsr reserved bits must be masked to zero for security reasons. 275 * mxcsr reserved bits must be masked to zero for security reasons.
279 */ 276 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 277 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
281 278
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; 279 xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
283 280
284 xsave_hdr->xstate_bv &= pcntxt_mask; 281 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /* 282 /*
@@ -365,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
365static void 362static void
366convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) 363convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
367{ 364{
368 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 365 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
369 struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; 366 struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
370 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; 367 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
371 int i; 368 int i;
@@ -405,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
405 const struct user_i387_ia32_struct *env) 402 const struct user_i387_ia32_struct *env)
406 403
407{ 404{
408 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 405 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
409 struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; 406 struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
410 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; 407 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
411 int i; 408 int i;
@@ -445,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
445 442
446 if (!cpu_has_fxsr) { 443 if (!cpu_has_fxsr) {
447 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 444 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
448 &target->thread.xstate->fsave, 0, 445 &target->thread.fpu.state->fsave, 0,
449 -1); 446 -1);
450 } 447 }
451 448
@@ -475,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
475 472
476 if (!cpu_has_fxsr) { 473 if (!cpu_has_fxsr) {
477 return user_regset_copyin(&pos, &count, &kbuf, &ubuf, 474 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
478 &target->thread.xstate->fsave, 0, -1); 475 &target->thread.fpu.state->fsave, 0, -1);
479 } 476 }
480 477
481 if (pos > 0 || count < sizeof(env)) 478 if (pos > 0 || count < sizeof(env))
@@ -490,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
490 * presence of FP. 487 * presence of FP.
491 */ 488 */
492 if (cpu_has_xsave) 489 if (cpu_has_xsave)
493 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; 490 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
494 return ret; 491 return ret;
495} 492}
496 493
@@ -501,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
501static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) 498static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
502{ 499{
503 struct task_struct *tsk = current; 500 struct task_struct *tsk = current;
504 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 501 struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
505 502
506 fp->status = fp->swd; 503 fp->status = fp->swd;
507 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 504 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -512,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
512static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) 509static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
513{ 510{
514 struct task_struct *tsk = current; 511 struct task_struct *tsk = current;
515 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 512 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
516 struct user_i387_ia32_struct env; 513 struct user_i387_ia32_struct env;
517 int err = 0; 514 int err = 0;
518 515
@@ -547,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
547 * header as well as change any contents in the memory layout. 544 * header as well as change any contents in the memory layout.
548 * xrestore as part of sigreturn will capture all the changes. 545 * xrestore as part of sigreturn will capture all the changes.
549 */ 546 */
550 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 547 tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
551 548
552 if (save_i387_fxsave(fx) < 0) 549 if (save_i387_fxsave(fx) < 0)
553 return -1; 550 return -1;
@@ -599,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
599{ 596{
600 struct task_struct *tsk = current; 597 struct task_struct *tsk = current;
601 598
602 return __copy_from_user(&tsk->thread.xstate->fsave, buf, 599 return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
603 sizeof(struct i387_fsave_struct)); 600 sizeof(struct i387_fsave_struct));
604} 601}
605 602
@@ -610,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
610 struct user_i387_ia32_struct env; 607 struct user_i387_ia32_struct env;
611 int err; 608 int err;
612 609
613 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 610 err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
614 size); 611 size);
615 /* mxcsr reserved bits must be masked to zero for security reasons */ 612 /* mxcsr reserved bits must be masked to zero for security reasons */
616 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 613 tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
617 if (err || __copy_from_user(&env, buf, sizeof(env))) 614 if (err || __copy_from_user(&env, buf, sizeof(env)))
618 return 1; 615 return 1;
619 convert_to_fxsr(tsk, &env); 616 convert_to_fxsr(tsk, &env);
@@ -629,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
629 struct i387_fxsave_struct __user *fx = 626 struct i387_fxsave_struct __user *fx =
630 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; 627 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
631 struct xsave_hdr_struct *xsave_hdr = 628 struct xsave_hdr_struct *xsave_hdr =
632 &current->thread.xstate->xsave.xsave_hdr; 629 &current->thread.fpu.state->xsave.xsave_hdr;
633 u64 mask; 630 u64 mask;
634 int err; 631 int err;
635 632
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5c..2dfd31597443 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17#include <asm/smp.h> 17#include <asm/smp.h>
18 18
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_RAW_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22/* 22/*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
33static void init_pit_timer(enum clock_event_mode mode, 33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt) 34 struct clock_event_device *evt)
35{ 35{
36 spin_lock(&i8253_lock); 36 raw_spin_lock(&i8253_lock);
37 37
38 switch (mode) { 38 switch (mode) {
39 case CLOCK_EVT_MODE_PERIODIC: 39 case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
62 /* Nothing to do here */ 62 /* Nothing to do here */
63 break; 63 break;
64 } 64 }
65 spin_unlock(&i8253_lock); 65 raw_spin_unlock(&i8253_lock);
66} 66}
67 67
68/* 68/*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
72 */ 72 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 73static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{ 74{
75 spin_lock(&i8253_lock); 75 raw_spin_lock(&i8253_lock);
76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */ 76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */ 77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 spin_unlock(&i8253_lock); 78 raw_spin_unlock(&i8253_lock);
79 79
80 return 0; 80 return 0;
81} 81}
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
130 int count; 130 int count;
131 u32 jifs; 131 u32 jifs;
132 132
133 spin_lock_irqsave(&i8253_lock, flags); 133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /* 134 /*
135 * Although our caller may have the read side of xtime_lock, 135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine 136 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
176 old_count = count; 176 old_count = count;
177 old_jifs = jifs; 177 old_jifs = jifs;
178 178
179 spin_unlock_irqrestore(&i8253_lock, flags); 179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180 180
181 count = (LATCH - 1) - count; 181 count = (LATCH - 1) - count;
182 182
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd46..990ae7cfc578 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
60 outb(0, 0xF0); 60 outb(0, 0xF0);
61 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 61 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
62 return IRQ_NONE; 62 return IRQ_NONE;
63 math_error((void __user *)get_irq_regs()->ip); 63 math_error(get_irq_regs(), 0, 16);
64 return IRQ_HANDLED; 64 return IRQ_HANDLED;
65} 65}
66 66
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c0..345a4b1fe144 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
422 422
423static void __kprobes clear_btf(void) 423static void __kprobes clear_btf(void)
424{ 424{
425 if (test_thread_flag(TIF_DEBUGCTLMSR)) 425 if (test_thread_flag(TIF_BLOCKSTEP)) {
426 update_debugctlmsr(0); 426 unsigned long debugctl = get_debugctlmsr();
427
428 debugctl &= ~DEBUGCTLMSR_BTF;
429 update_debugctlmsr(debugctl);
430 }
427} 431}
428 432
429static void __kprobes restore_btf(void) 433static void __kprobes restore_btf(void)
430{ 434{
431 if (test_thread_flag(TIF_DEBUGCTLMSR)) 435 if (test_thread_flag(TIF_BLOCKSTEP)) {
432 update_debugctlmsr(current->thread.debugctlmsr); 436 unsigned long debugctl = get_debugctlmsr();
437
438 debugctl |= DEBUGCTLMSR_BTF;
439 update_debugctlmsr(debugctl);
440 }
433} 441}
434 442
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 443void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
@@ -534,20 +542,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
534 struct kprobe_ctlblk *kcb; 542 struct kprobe_ctlblk *kcb;
535 543
536 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 544 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
537 if (*addr != BREAKPOINT_INSTRUCTION) {
538 /*
539 * The breakpoint instruction was removed right
540 * after we hit it. Another cpu has removed
541 * either a probepoint or a debugger breakpoint
542 * at this address. In either case, no further
543 * handling of this interrupt is appropriate.
544 * Back up over the (now missing) int3 and run
545 * the original instruction.
546 */
547 regs->ip = (unsigned long)addr;
548 return 1;
549 }
550
551 /* 545 /*
552 * We don't want to be preempted for the entire 546 * We don't want to be preempted for the entire
553 * duration of kprobe processing. We conditionally 547 * duration of kprobe processing. We conditionally
@@ -579,6 +573,19 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
579 setup_singlestep(p, regs, kcb, 0); 573 setup_singlestep(p, regs, kcb, 0);
580 return 1; 574 return 1;
581 } 575 }
576 } else if (*addr != BREAKPOINT_INSTRUCTION) {
577 /*
578 * The breakpoint instruction was removed right
579 * after we hit it. Another cpu has removed
580 * either a probepoint or a debugger breakpoint
581 * at this address. In either case, no further
582 * handling of this interrupt is appropriate.
583 * Back up over the (now missing) int3 and run
584 * the original instruction.
585 */
586 regs->ip = (unsigned long)addr;
587 preempt_enable_no_resched();
588 return 1;
582 } else if (kprobe_running()) { 589 } else if (kprobe_running()) {
583 p = __get_cpu_var(current_kprobe); 590 p = __get_cpu_var(current_kprobe);
584 if (p->break_handler && p->break_handler(p, regs)) { 591 if (p->break_handler && p->break_handler(p, regs)) {
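
[Note] The clear_btf()/restore_btf() hunks above stop writing the whole DEBUGCTL MSR and instead read-modify-write only the BTF (branch trap / block-step) bit, keyed off the new TIF_BLOCKSTEP flag, so any other DEBUGCTL bits owned by perf or the LBR code are left intact. A runnable user-space analog of that pattern; the fake_debugctl variable and the get/update helpers are stand-ins for the real MSR accessors:

    #include <stdio.h>

    #define DEBUGCTLMSR_BTF (1UL << 1)      /* bit 1 of IA32_DEBUGCTL */

    static unsigned long fake_debugctl;     /* stands in for the real MSR */

    static unsigned long get_debugctl(void)         { return fake_debugctl; }
    static void update_debugctl(unsigned long val)  { fake_debugctl = val; }

    /* Drop only the BTF bit, leaving every other DEBUGCTL bit alone. */
    static void clear_btf(void)
    {
        unsigned long debugctl = get_debugctl();

        debugctl &= ~DEBUGCTLMSR_BTF;
        update_debugctl(debugctl);
    }

    /* Re-enable branch single-stepping without clobbering other bits. */
    static void restore_btf(void)
    {
        unsigned long debugctl = get_debugctl();

        debugctl |= DEBUGCTLMSR_BTF;
        update_debugctl(debugctl);
    }

    int main(void)
    {
        fake_debugctl = 0x40 | DEBUGCTLMSR_BTF;    /* some other bit + BTF */
        clear_btf();
        printf("after clear:   0x%lx\n", fake_debugctl);   /* 0x40 */
        restore_btf();
        printf("after restore: 0x%lx\n", fake_debugctl);   /* 0x42 */
        return 0;
    }
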
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c2..2cd8c544e41a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 return error; 201 return error;
202} 202}
203 203
204static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *inode, struct file *file)
205{ 205{
206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
207} 207}
208 208
209static ssize_t microcode_write(struct file *file, const char __user *buf, 209static ssize_t microcode_write(struct file *file, const char __user *buf,
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e28937..356170262a93 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
343 int (*get_ucode_data)(void *, const void *, size_t)) 343 int (*get_ucode_data)(void *, const void *, size_t))
344{ 344{
345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
346 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 346 u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
347 int new_rev = uci->cpu_sig.rev; 347 int new_rev = uci->cpu_sig.rev;
348 unsigned int leftover = size; 348 unsigned int leftover = size;
349 enum ucode_state state = UCODE_OK; 349 enum ucode_state state = UCODE_OK;
350 unsigned int curr_mc_size = 0;
350 351
351 while (leftover) { 352 while (leftover) {
352 struct microcode_header_intel mc_header; 353 struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
361 break; 362 break;
362 } 363 }
363 364
364 mc = vmalloc(mc_size); 365 /* For performance reasons, reuse mc area when possible */
365 if (!mc) 366 if (!mc || mc_size > curr_mc_size) {
366 break; 367 if (mc)
368 vfree(mc);
369 mc = vmalloc(mc_size);
370 if (!mc)
371 break;
372 curr_mc_size = mc_size;
373 }
367 374
368 if (get_ucode_data(mc, ucode_ptr, mc_size) || 375 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
369 microcode_sanity_check(mc) < 0) { 376 microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
376 vfree(new_mc); 383 vfree(new_mc);
377 new_rev = mc_header.rev; 384 new_rev = mc_header.rev;
378 new_mc = mc; 385 new_mc = mc;
379 } else 386 mc = NULL; /* trigger new vmalloc */
380 vfree(mc); 387 }
381 388
382 ucode_ptr += mc_size; 389 ucode_ptr += mc_size;
383 leftover -= mc_size; 390 leftover -= mc_size;
384 } 391 }
385 392
393 if (mc)
394 vfree(mc);
395
386 if (leftover) { 396 if (leftover) {
387 if (new_mc) 397 if (new_mc)
388 vfree(new_mc); 398 vfree(new_mc);
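
[Note] The generic_load_microcode() hunk replaces the per-iteration vmalloc()/vfree() pair with a grow-only buffer: the area is reused whenever the next microcode blob fits, reallocated only when it does not, handed off through new_mc (with mc reset to NULL so the next pass allocates fresh), and freed once after the loop. A user-space sketch of the same reuse pattern with malloc(); the blob sizes below are made up:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        size_t sizes[] = { 4096, 2048, 8192, 1024 };   /* example blob sizes */
        void *buf = NULL;
        size_t cur_size = 0;

        for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
            size_t want = sizes[i];

            /* Reallocate only when the current buffer is too small. */
            if (!buf || want > cur_size) {
                free(buf);
                buf = malloc(want);
                if (!buf)
                    break;
                cur_size = want;
                printf("allocated %zu bytes\n", want);
            } else {
                printf("reused %zu-byte buffer for %zu bytes\n", cur_size, want);
            }
            /* ... load and validate the blob into buf here ... */
        }

        free(buf);    /* a single free after the loop, as in the patch */
        return 0;
    }
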
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8f..5ae5d2426edf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
116} 116}
117 117
118static int bad_ioapic(unsigned long address)
119{
120 if (nr_ioapics >= MAX_IO_APICS) {
121 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
122 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
123 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
124 }
125 if (!address) {
126 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
127 " found in table, skipping!\n");
128 return 1;
129 }
130 return 0;
131}
132
133static void __init MP_ioapic_info(struct mpc_ioapic *m) 118static void __init MP_ioapic_info(struct mpc_ioapic *m)
134{ 119{
135 if (!(m->flags & MPC_APIC_USABLE)) 120 if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
138 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", 123 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
139 m->apicid, m->apicver, m->apicaddr); 124 m->apicid, m->apicver, m->apicaddr);
140 125
141 if (bad_ioapic(m->apicaddr)) 126 mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
142 return;
143
144 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
145 mp_ioapics[nr_ioapics].apicid = m->apicid;
146 mp_ioapics[nr_ioapics].type = m->type;
147 mp_ioapics[nr_ioapics].apicver = m->apicver;
148 mp_ioapics[nr_ioapics].flags = m->flags;
149 nr_ioapics++;
150} 127}
151 128
152static void print_MP_intsrc_info(struct mpc_intsrc *m) 129static void print_MP_intsrc_info(struct mpc_intsrc *m)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b94..e7e35219b32f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/ds.h>
24#include <asm/debugreg.h> 23#include <asm/debugreg.h>
25 24
26unsigned long idle_halt; 25unsigned long idle_halt;
@@ -32,26 +31,22 @@ struct kmem_cache *task_xstate_cachep;
32 31
33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
34{ 33{
34 int ret;
35
35 *dst = *src; 36 *dst = *src;
36 if (src->thread.xstate) { 37 if (fpu_allocated(&src->thread.fpu)) {
37 dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, 38 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
38 GFP_KERNEL); 39 ret = fpu_alloc(&dst->thread.fpu);
39 if (!dst->thread.xstate) 40 if (ret)
40 return -ENOMEM; 41 return ret;
41 WARN_ON((unsigned long)dst->thread.xstate & 15); 42 fpu_copy(&dst->thread.fpu, &src->thread.fpu);
42 memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
43 } 43 }
44 return 0; 44 return 0;
45} 45}
46 46
47void free_thread_xstate(struct task_struct *tsk) 47void free_thread_xstate(struct task_struct *tsk)
48{ 48{
49 if (tsk->thread.xstate) { 49 fpu_free(&tsk->thread.fpu);
50 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
51 tsk->thread.xstate = NULL;
52 }
53
54 WARN(tsk->thread.ds_ctx, "leaking DS context\n");
55} 50}
56 51
57void free_thread_info(struct thread_info *ti) 52void free_thread_info(struct thread_info *ti)
@@ -198,11 +193,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
198 prev = &prev_p->thread; 193 prev = &prev_p->thread;
199 next = &next_p->thread; 194 next = &next_p->thread;
200 195
201 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || 196 if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
202 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) 197 test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
203 ds_switch_to(prev_p, next_p); 198 unsigned long debugctl = get_debugctlmsr();
204 else if (next->debugctlmsr != prev->debugctlmsr) 199
205 update_debugctlmsr(next->debugctlmsr); 200 debugctl &= ~DEBUGCTLMSR_BTF;
201 if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
202 debugctl |= DEBUGCTLMSR_BTF;
203
204 update_debugctlmsr(debugctl);
205 }
206 206
207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
208 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 208 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
546 * check OSVW bit for CPUs that are not affected 546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400 547 * by erratum #400
548 */ 548 */
549 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); 549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 if (val >= 2) { 550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 rdmsrl(MSR_AMD64_OSVW_STATUS, val); 551 if (val >= 2) {
552 if (!(val & BIT(1))) 552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 goto no_c1e_idle; 553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
554 } 556 }
555 return 1; 557 return 1;
556 } 558 }
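
[Note] Two things change in process.c: the OSVW MSR reads are now gated on the OSVW CPUID feature bit so they never touch MSRs the CPU may not implement, and __switch_to_xtra() derives block-step state purely from TIF_BLOCKSTEP, rewriting DEBUGCTL only when the flag differs between the outgoing and incoming task (the XOR test) and recomputing the BTF bit for the incoming one. A small runnable analog of that switch logic, with plain structs standing in for task flags and a variable for the MSR:

    #include <stdbool.h>
    #include <stdio.h>

    #define DEBUGCTLMSR_BTF (1UL << 1)

    struct task { bool blockstep; };

    static unsigned long fake_debugctl;    /* stands in for IA32_DEBUGCTL */

    static void switch_blockstep(const struct task *prev, const struct task *next)
    {
        /* Only touch the MSR when the flag actually changes hands. */
        if (prev->blockstep ^ next->blockstep) {
            unsigned long debugctl = fake_debugctl;

            debugctl &= ~DEBUGCTLMSR_BTF;
            if (next->blockstep)
                debugctl |= DEBUGCTLMSR_BTF;
            fake_debugctl = debugctl;
            printf("MSR rewritten: 0x%lx\n", debugctl);
        } else {
            printf("MSR left alone\n");
        }
    }

    int main(void)
    {
        struct task a = { .blockstep = false }, b = { .blockstep = true };

        switch_blockstep(&a, &b);    /* rewritten, BTF set */
        switch_blockstep(&b, &b);    /* left alone */
        switch_blockstep(&b, &a);    /* rewritten, BTF cleared */
        return 0;
    }
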
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30c..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
55#include <asm/cpu.h> 55#include <asm/cpu.h>
56#include <asm/idle.h> 56#include <asm/idle.h>
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/ds.h>
59#include <asm/debugreg.h> 58#include <asm/debugreg.h>
60 59
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
238 kfree(p->thread.io_bitmap_ptr); 237 kfree(p->thread.io_bitmap_ptr);
239 p->thread.io_bitmap_max = 0; 238 p->thread.io_bitmap_max = 0;
240 } 239 }
241
242 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
243 p->thread.ds_ctx = NULL;
244
245 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
246 p->thread.debugctlmsr = 0;
247
248 return err; 240 return err;
249} 241}
250 242
@@ -317,7 +309,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
317 309
318 /* we're going to use this soon, after a few expensive things */ 310 /* we're going to use this soon, after a few expensive things */
319 if (preload_fpu) 311 if (preload_fpu)
320 prefetch(next->xstate); 312 prefetch(next->fpu.state);
321 313
322 /* 314 /*
323 * Reload esp0. 315 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf7..3c2422a99f1f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
49#include <asm/ia32.h> 49#include <asm/ia32.h>
50#include <asm/idle.h> 50#include <asm/idle.h>
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/ds.h>
53#include <asm/debugreg.h> 52#include <asm/debugreg.h>
54 53
55asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
313 if (err) 312 if (err)
314 goto out; 313 goto out;
315 } 314 }
316
317 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
318 p->thread.ds_ctx = NULL;
319
320 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
321 p->thread.debugctlmsr = 0;
322
323 err = 0; 315 err = 0;
324out: 316out:
325 if (err && p->thread.io_bitmap_ptr) { 317 if (err && p->thread.io_bitmap_ptr) {
@@ -396,7 +388,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
396 388
397 /* we're going to use this soon, after a few expensive things */ 389 /* we're going to use this soon, after a few expensive things */
398 if (preload_fpu) 390 if (preload_fpu)
399 prefetch(next->xstate); 391 prefetch(next->fpu.state);
400 392
401 /* 393 /*
402 * Reload esp0, LDT and the page table pointer: 394 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7e..70c4872cd8aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
2/* 2/*
3 * Pentium III FXSR, SSE support 3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000 4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */ 5 */
9 6
10#include <linux/kernel.h> 7#include <linux/kernel.h>
@@ -22,7 +19,6 @@
22#include <linux/audit.h> 19#include <linux/audit.h>
23#include <linux/seccomp.h> 20#include <linux/seccomp.h>
24#include <linux/signal.h> 21#include <linux/signal.h>
25#include <linux/workqueue.h>
26#include <linux/perf_event.h> 22#include <linux/perf_event.h>
27#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
28 24
@@ -36,7 +32,6 @@
36#include <asm/desc.h> 32#include <asm/desc.h>
37#include <asm/prctl.h> 33#include <asm/prctl.h>
38#include <asm/proto.h> 34#include <asm/proto.h>
39#include <asm/ds.h>
40#include <asm/hw_breakpoint.h> 35#include <asm/hw_breakpoint.h>
41 36
42#include "tls.h" 37#include "tls.h"
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
693 struct perf_event_attr attr; 688 struct perf_event_attr attr;
694 689
695 if (!t->ptrace_bps[nr]) { 690 if (!t->ptrace_bps[nr]) {
696 hw_breakpoint_init(&attr); 691 ptrace_breakpoint_init(&attr);
697 /* 692 /*
698 * Put stub len and type to register (reserve) an inactive but 693 * Put stub len and type to register (reserve) an inactive but
699 * correct bp 694 * correct bp
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
789 0, IO_BITMAP_BYTES); 784 0, IO_BITMAP_BYTES);
790} 785}
791 786
792#ifdef CONFIG_X86_PTRACE_BTS
793/*
794 * A branch trace store context.
795 *
796 * Contexts may only be installed by ptrace_bts_config() and only for
797 * ptraced tasks.
798 *
799 * Contexts are destroyed when the tracee is detached from the tracer.
800 * The actual destruction work requires interrupts enabled, so the
801 * work is deferred and will be scheduled during __ptrace_unlink().
802 *
803 * Contexts hold an additional task_struct reference on the traced
804 * task, as well as a reference on the tracer's mm.
805 *
806 * Ptrace already holds a task_struct for the duration of ptrace operations,
807 * but since destruction is deferred, it may be executed after both
808 * tracer and tracee exited.
809 */
810struct bts_context {
811 /* The branch trace handle. */
812 struct bts_tracer *tracer;
813
814 /* The buffer used to store the branch trace and its size. */
815 void *buffer;
816 unsigned int size;
817
818 /* The mm that paid for the above buffer. */
819 struct mm_struct *mm;
820
821 /* The task this context belongs to. */
822 struct task_struct *task;
823
824 /* The signal to send on a bts buffer overflow. */
825 unsigned int bts_ovfl_signal;
826
827 /* The work struct to destroy a context. */
828 struct work_struct work;
829};
830
831static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
832{
833 void *buffer = NULL;
834 int err = -ENOMEM;
835
836 err = account_locked_memory(current->mm, current->signal->rlim, size);
837 if (err < 0)
838 return err;
839
840 buffer = kzalloc(size, GFP_KERNEL);
841 if (!buffer)
842 goto out_refund;
843
844 context->buffer = buffer;
845 context->size = size;
846 context->mm = get_task_mm(current);
847
848 return 0;
849
850 out_refund:
851 refund_locked_memory(current->mm, size);
852 return err;
853}
854
855static inline void free_bts_buffer(struct bts_context *context)
856{
857 if (!context->buffer)
858 return;
859
860 kfree(context->buffer);
861 context->buffer = NULL;
862
863 refund_locked_memory(context->mm, context->size);
864 context->size = 0;
865
866 mmput(context->mm);
867 context->mm = NULL;
868}
869
870static void free_bts_context_work(struct work_struct *w)
871{
872 struct bts_context *context;
873
874 context = container_of(w, struct bts_context, work);
875
876 ds_release_bts(context->tracer);
877 put_task_struct(context->task);
878 free_bts_buffer(context);
879 kfree(context);
880}
881
882static inline void free_bts_context(struct bts_context *context)
883{
884 INIT_WORK(&context->work, free_bts_context_work);
885 schedule_work(&context->work);
886}
887
888static inline struct bts_context *alloc_bts_context(struct task_struct *task)
889{
890 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
891 if (context) {
892 context->task = task;
893 task->bts = context;
894
895 get_task_struct(task);
896 }
897
898 return context;
899}
900
901static int ptrace_bts_read_record(struct task_struct *child, size_t index,
902 struct bts_struct __user *out)
903{
904 struct bts_context *context;
905 const struct bts_trace *trace;
906 struct bts_struct bts;
907 const unsigned char *at;
908 int error;
909
910 context = child->bts;
911 if (!context)
912 return -ESRCH;
913
914 trace = ds_read_bts(context->tracer);
915 if (!trace)
916 return -ESRCH;
917
918 at = trace->ds.top - ((index + 1) * trace->ds.size);
919 if ((void *)at < trace->ds.begin)
920 at += (trace->ds.n * trace->ds.size);
921
922 if (!trace->read)
923 return -EOPNOTSUPP;
924
925 error = trace->read(context->tracer, at, &bts);
926 if (error < 0)
927 return error;
928
929 if (copy_to_user(out, &bts, sizeof(bts)))
930 return -EFAULT;
931
932 return sizeof(bts);
933}
934
935static int ptrace_bts_drain(struct task_struct *child,
936 long size,
937 struct bts_struct __user *out)
938{
939 struct bts_context *context;
940 const struct bts_trace *trace;
941 const unsigned char *at;
942 int error, drained = 0;
943
944 context = child->bts;
945 if (!context)
946 return -ESRCH;
947
948 trace = ds_read_bts(context->tracer);
949 if (!trace)
950 return -ESRCH;
951
952 if (!trace->read)
953 return -EOPNOTSUPP;
954
955 if (size < (trace->ds.top - trace->ds.begin))
956 return -EIO;
957
958 for (at = trace->ds.begin; (void *)at < trace->ds.top;
959 out++, drained++, at += trace->ds.size) {
960 struct bts_struct bts;
961
962 error = trace->read(context->tracer, at, &bts);
963 if (error < 0)
964 return error;
965
966 if (copy_to_user(out, &bts, sizeof(bts)))
967 return -EFAULT;
968 }
969
970 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
971
972 error = ds_reset_bts(context->tracer);
973 if (error < 0)
974 return error;
975
976 return drained;
977}
978
979static int ptrace_bts_config(struct task_struct *child,
980 long cfg_size,
981 const struct ptrace_bts_config __user *ucfg)
982{
983 struct bts_context *context;
984 struct ptrace_bts_config cfg;
985 unsigned int flags = 0;
986
987 if (cfg_size < sizeof(cfg))
988 return -EIO;
989
990 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
991 return -EFAULT;
992
993 context = child->bts;
994 if (!context)
995 context = alloc_bts_context(child);
996 if (!context)
997 return -ENOMEM;
998
999 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
1000 if (!cfg.signal)
1001 return -EINVAL;
1002
1003 return -EOPNOTSUPP;
1004 context->bts_ovfl_signal = cfg.signal;
1005 }
1006
1007 ds_release_bts(context->tracer);
1008 context->tracer = NULL;
1009
1010 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
1011 int err;
1012
1013 free_bts_buffer(context);
1014 if (!cfg.size)
1015 return 0;
1016
1017 err = alloc_bts_buffer(context, cfg.size);
1018 if (err < 0)
1019 return err;
1020 }
1021
1022 if (cfg.flags & PTRACE_BTS_O_TRACE)
1023 flags |= BTS_USER;
1024
1025 if (cfg.flags & PTRACE_BTS_O_SCHED)
1026 flags |= BTS_TIMESTAMPS;
1027
1028 context->tracer =
1029 ds_request_bts_task(child, context->buffer, context->size,
1030 NULL, (size_t)-1, flags);
1031 if (unlikely(IS_ERR(context->tracer))) {
1032 int error = PTR_ERR(context->tracer);
1033
1034 free_bts_buffer(context);
1035 context->tracer = NULL;
1036 return error;
1037 }
1038
1039 return sizeof(cfg);
1040}
1041
1042static int ptrace_bts_status(struct task_struct *child,
1043 long cfg_size,
1044 struct ptrace_bts_config __user *ucfg)
1045{
1046 struct bts_context *context;
1047 const struct bts_trace *trace;
1048 struct ptrace_bts_config cfg;
1049
1050 context = child->bts;
1051 if (!context)
1052 return -ESRCH;
1053
1054 if (cfg_size < sizeof(cfg))
1055 return -EIO;
1056
1057 trace = ds_read_bts(context->tracer);
1058 if (!trace)
1059 return -ESRCH;
1060
1061 memset(&cfg, 0, sizeof(cfg));
1062 cfg.size = trace->ds.end - trace->ds.begin;
1063 cfg.signal = context->bts_ovfl_signal;
1064 cfg.bts_size = sizeof(struct bts_struct);
1065
1066 if (cfg.signal)
1067 cfg.flags |= PTRACE_BTS_O_SIGNAL;
1068
1069 if (trace->ds.flags & BTS_USER)
1070 cfg.flags |= PTRACE_BTS_O_TRACE;
1071
1072 if (trace->ds.flags & BTS_TIMESTAMPS)
1073 cfg.flags |= PTRACE_BTS_O_SCHED;
1074
1075 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
1076 return -EFAULT;
1077
1078 return sizeof(cfg);
1079}
1080
1081static int ptrace_bts_clear(struct task_struct *child)
1082{
1083 struct bts_context *context;
1084 const struct bts_trace *trace;
1085
1086 context = child->bts;
1087 if (!context)
1088 return -ESRCH;
1089
1090 trace = ds_read_bts(context->tracer);
1091 if (!trace)
1092 return -ESRCH;
1093
1094 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
1095
1096 return ds_reset_bts(context->tracer);
1097}
1098
1099static int ptrace_bts_size(struct task_struct *child)
1100{
1101 struct bts_context *context;
1102 const struct bts_trace *trace;
1103
1104 context = child->bts;
1105 if (!context)
1106 return -ESRCH;
1107
1108 trace = ds_read_bts(context->tracer);
1109 if (!trace)
1110 return -ESRCH;
1111
1112 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
1113}
1114
1115/*
1116 * Called from __ptrace_unlink() after the child has been moved back
1117 * to its original parent.
1118 */
1119void ptrace_bts_untrace(struct task_struct *child)
1120{
1121 if (unlikely(child->bts)) {
1122 free_bts_context(child->bts);
1123 child->bts = NULL;
1124 }
1125}
1126#endif /* CONFIG_X86_PTRACE_BTS */
1127
1128/* 787/*
1129 * Called by kernel/ptrace.c when detaching.. 788 * Called by kernel/ptrace.c when detaching..
1130 * 789 *
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1252 break; 911 break;
1253#endif 912#endif
1254 913
1255 /*
1256 * These bits need more cooking - not enabled yet:
1257 */
1258#ifdef CONFIG_X86_PTRACE_BTS
1259 case PTRACE_BTS_CONFIG:
1260 ret = ptrace_bts_config
1261 (child, data, (struct ptrace_bts_config __user *)addr);
1262 break;
1263
1264 case PTRACE_BTS_STATUS:
1265 ret = ptrace_bts_status
1266 (child, data, (struct ptrace_bts_config __user *)addr);
1267 break;
1268
1269 case PTRACE_BTS_SIZE:
1270 ret = ptrace_bts_size(child);
1271 break;
1272
1273 case PTRACE_BTS_GET:
1274 ret = ptrace_bts_read_record
1275 (child, data, (struct bts_struct __user *) addr);
1276 break;
1277
1278 case PTRACE_BTS_CLEAR:
1279 ret = ptrace_bts_clear(child);
1280 break;
1281
1282 case PTRACE_BTS_DRAIN:
1283 ret = ptrace_bts_drain
1284 (child, data, (struct bts_struct __user *) addr);
1285 break;
1286#endif /* CONFIG_X86_PTRACE_BTS */
1287
1288 default: 914 default:
1289 ret = ptrace_request(child, request, addr, data); 915 ret = ptrace_request(child, request, addr, data);
1290 break; 916 break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1544 1170
1545 case PTRACE_GET_THREAD_AREA: 1171 case PTRACE_GET_THREAD_AREA:
1546 case PTRACE_SET_THREAD_AREA: 1172 case PTRACE_SET_THREAD_AREA:
1547#ifdef CONFIG_X86_PTRACE_BTS
1548 case PTRACE_BTS_CONFIG:
1549 case PTRACE_BTS_STATUS:
1550 case PTRACE_BTS_SIZE:
1551 case PTRACE_BTS_GET:
1552 case PTRACE_BTS_CLEAR:
1553 case PTRACE_BTS_DRAIN:
1554#endif /* CONFIG_X86_PTRACE_BTS */
1555 return arch_ptrace(child, request, addr, data); 1173 return arch_ptrace(child, request, addr, data);
1556 1174
1557 default: 1175 default:
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e099382651..7ded57896c0a 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
81#endif /* CONFIG_X86_LOCAL_APIC */ 81#endif /* CONFIG_X86_LOCAL_APIC */
82 82
83#ifdef CONFIG_X86_IO_APIC 83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85 84
86static int __init sfi_parse_ioapic(struct sfi_table_header *table) 85static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{ 86{
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
94 pentry = (struct sfi_apic_table_entry *)sb->pentry; 93 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95 94
96 for (i = 0; i < num; i++) { 95 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base); 96 mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++; 97 pentry++;
100 } 98 }
101 99
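
[Note] Both the mpparse.c and sfi.c hunks drop their private GSI bookkeeping and register each IO-APIC at gsi_end + 1, i.e. at the first global system interrupt after the highest one already claimed, with the running gsi_end now tracked by the common mp_register_ioapic() path. A tiny runnable sketch of that allocation arithmetic (the per-IOAPIC redirection-entry counts below are illustrative):

    #include <stdio.h>

    int main(void)
    {
        int redir_entries[] = { 24, 24, 32 };   /* per-IOAPIC pin counts (illustrative) */
        int gsi_end = -1;                       /* no GSIs claimed yet */

        for (int i = 0; i < 3; i++) {
            int gsi_base = gsi_end + 1;
            gsi_end = gsi_base + redir_entries[i] - 1;
            printf("IOAPIC %d: GSI %d..%d\n", i, gsi_base, gsi_end);
        }
        return 0;
    }
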
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff107..58de45ee08b6 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
158} 158}
159 159
160/* 160/*
161 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
162 */
163static void write_debugctlmsr(struct task_struct *child, unsigned long val)
164{
165 if (child->thread.debugctlmsr == val)
166 return;
167
168 child->thread.debugctlmsr = val;
169
170 if (child != current)
171 return;
172
173 update_debugctlmsr(val);
174}
175
176/*
177 * Enable single or block step. 161 * Enable single or block step.
178 */ 162 */
179static void enable_step(struct task_struct *child, bool block) 163static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
186 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
187 */ 171 */
188 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
189 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 173 unsigned long debugctl = get_debugctlmsr();
190 write_debugctlmsr(child, 174
191 child->thread.debugctlmsr | DEBUGCTLMSR_BTF); 175 debugctl |= DEBUGCTLMSR_BTF;
192 } else { 176 update_debugctlmsr(debugctl);
193 write_debugctlmsr(child, 177 set_tsk_thread_flag(child, TIF_BLOCKSTEP);
194 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 178 } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
195 179 unsigned long debugctl = get_debugctlmsr();
196 if (!child->thread.debugctlmsr) 180
197 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 181 debugctl &= ~DEBUGCTLMSR_BTF;
182 update_debugctlmsr(debugctl);
183 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
198 } 184 }
199} 185}
200 186
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
213 /* 199 /*
214 * Make sure block stepping (BTF) is disabled. 200 * Make sure block stepping (BTF) is disabled.
215 */ 201 */
216 write_debugctlmsr(child, 202 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
217 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 203 unsigned long debugctl = get_debugctlmsr();
218 204
219 if (!child->thread.debugctlmsr) 205 debugctl &= ~DEBUGCTLMSR_BTF;
220 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 206 update_debugctlmsr(debugctl);
207 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
208 }
221 209
222 /* Always clear TIF_SINGLESTEP... */ 210 /* Always clear TIF_SINGLESTEP... */
223 clear_tsk_thread_flag(child, TIF_SINGLESTEP); 211 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e4454188..02cfb9b8f5b1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,15 +108,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 dec_preempt_count(); 108 dec_preempt_count();
109} 109}
110 110
111#ifdef CONFIG_X86_32
112static inline void
113die_if_kernel(const char *str, struct pt_regs *regs, long err)
114{
115 if (!user_mode_vm(regs))
116 die(str, regs, err);
117}
118#endif
119
120static void __kprobes 111static void __kprobes
121do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, 112do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
122 long error_code, siginfo_t *info) 113 long error_code, siginfo_t *info)
@@ -543,11 +534,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
543 534
544 /* DR6 may or may not be cleared by the CPU */ 535 /* DR6 may or may not be cleared by the CPU */
545 set_debugreg(0, 6); 536 set_debugreg(0, 6);
537
546 /* 538 /*
547 * The processor cleared BTF, so don't mark that we need it set. 539 * The processor cleared BTF, so don't mark that we need it set.
548 */ 540 */
549 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 541 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
550 tsk->thread.debugctlmsr = 0;
551 542
552 /* Store the virtualized DR6 value */ 543 /* Store the virtualized DR6 value */
553 tsk->thread.debugreg6 = dr6; 544 tsk->thread.debugreg6 = dr6;
@@ -585,55 +576,67 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
585 return; 576 return;
586} 577}
587 578
588#ifdef CONFIG_X86_64
589static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
590{
591 if (fixup_exception(regs))
592 return 1;
593
594 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
595 /* Illegal floating point operation in the kernel */
596 current->thread.trap_no = trapnr;
597 die(str, regs, 0);
598 return 0;
599}
600#endif
601
602/* 579/*
603 * Note that we play around with the 'TS' bit in an attempt to get 580 * Note that we play around with the 'TS' bit in an attempt to get
604 * the correct behaviour even in the presence of the asynchronous 581 * the correct behaviour even in the presence of the asynchronous
605 * IRQ13 behaviour 582 * IRQ13 behaviour
606 */ 583 */
607void math_error(void __user *ip) 584void math_error(struct pt_regs *regs, int error_code, int trapnr)
608{ 585{
609 struct task_struct *task; 586 struct task_struct *task = current;
610 siginfo_t info; 587 siginfo_t info;
611 unsigned short cwd, swd, err; 588 unsigned short err;
589 char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
590
591 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
592 return;
593 conditional_sti(regs);
594
595 if (!user_mode_vm(regs))
596 {
597 if (!fixup_exception(regs)) {
598 task->thread.error_code = error_code;
599 task->thread.trap_no = trapnr;
600 die(str, regs, error_code);
601 }
602 return;
603 }
612 604
613 /* 605 /*
614 * Save the info for the exception handler and clear the error. 606 * Save the info for the exception handler and clear the error.
615 */ 607 */
616 task = current;
617 save_init_fpu(task); 608 save_init_fpu(task);
618 task->thread.trap_no = 16; 609 task->thread.trap_no = trapnr;
619 task->thread.error_code = 0; 610 task->thread.error_code = error_code;
620 info.si_signo = SIGFPE; 611 info.si_signo = SIGFPE;
621 info.si_errno = 0; 612 info.si_errno = 0;
622 info.si_addr = ip; 613 info.si_addr = (void __user *)regs->ip;
623 /* 614 if (trapnr == 16) {
624 * (~cwd & swd) will mask out exceptions that are not set to unmasked 615 unsigned short cwd, swd;
625 * status. 0x3f is the exception bits in these regs, 0x200 is the 616 /*
626 * C1 reg you need in case of a stack fault, 0x040 is the stack 617 * (~cwd & swd) will mask out exceptions that are not set to unmasked
627 * fault bit. We should only be taking one exception at a time, 618 * status. 0x3f is the exception bits in these regs, 0x200 is the
628 * so if this combination doesn't produce any single exception, 619 * C1 reg you need in case of a stack fault, 0x040 is the stack
629 * then we have a bad program that isn't synchronizing its FPU usage 620 * fault bit. We should only be taking one exception at a time,
630 * and it will suffer the consequences since we won't be able to 621 * so if this combination doesn't produce any single exception,
631 * fully reproduce the context of the exception 622 * then we have a bad program that isn't synchronizing its FPU usage
632 */ 623 * and it will suffer the consequences since we won't be able to
633 cwd = get_fpu_cwd(task); 624 * fully reproduce the context of the exception
634 swd = get_fpu_swd(task); 625 */
626 cwd = get_fpu_cwd(task);
627 swd = get_fpu_swd(task);
635 628
636 err = swd & ~cwd; 629 err = swd & ~cwd;
630 } else {
631 /*
632 * The SIMD FPU exceptions are handled a little differently, as there
633 * is only a single status/control register. Thus, to determine which
634 * unmasked exception was caught we must mask the exception mask bits
635 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
636 */
637 unsigned short mxcsr = get_fpu_mxcsr(task);
638 err = ~(mxcsr >> 7) & mxcsr;
639 }
637 640
638 if (err & 0x001) { /* Invalid op */ 641 if (err & 0x001) { /* Invalid op */
639 /* 642 /*
@@ -662,97 +665,17 @@ void math_error(void __user *ip)
662 665
663dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 666dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
664{ 667{
665 conditional_sti(regs);
666
667#ifdef CONFIG_X86_32 668#ifdef CONFIG_X86_32
668 ignore_fpu_irq = 1; 669 ignore_fpu_irq = 1;
669#else
670 if (!user_mode(regs) &&
671 kernel_math_error(regs, "kernel x87 math error", 16))
672 return;
673#endif 670#endif
674 671
675 math_error((void __user *)regs->ip); 672 math_error(regs, error_code, 16);
676}
677
678static void simd_math_error(void __user *ip)
679{
680 struct task_struct *task;
681 siginfo_t info;
682 unsigned short mxcsr;
683
684 /*
685 * Save the info for the exception handler and clear the error.
686 */
687 task = current;
688 save_init_fpu(task);
689 task->thread.trap_no = 19;
690 task->thread.error_code = 0;
691 info.si_signo = SIGFPE;
692 info.si_errno = 0;
693 info.si_code = __SI_FAULT;
694 info.si_addr = ip;
695 /*
696 * The SIMD FPU exceptions are handled a little differently, as there
697 * is only a single status/control register. Thus, to determine which
698 * unmasked exception was caught we must mask the exception mask bits
699 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
700 */
701 mxcsr = get_fpu_mxcsr(task);
702 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
703 case 0x000:
704 default:
705 break;
706 case 0x001: /* Invalid Op */
707 info.si_code = FPE_FLTINV;
708 break;
709 case 0x002: /* Denormalize */
710 case 0x010: /* Underflow */
711 info.si_code = FPE_FLTUND;
712 break;
713 case 0x004: /* Zero Divide */
714 info.si_code = FPE_FLTDIV;
715 break;
716 case 0x008: /* Overflow */
717 info.si_code = FPE_FLTOVF;
718 break;
719 case 0x020: /* Precision */
720 info.si_code = FPE_FLTRES;
721 break;
722 }
723 force_sig_info(SIGFPE, &info, task);
724} 673}
725 674
726dotraplinkage void 675dotraplinkage void
727do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 676do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
728{ 677{
729 conditional_sti(regs); 678 math_error(regs, error_code, 19);
730
731#ifdef CONFIG_X86_32
732 if (cpu_has_xmm) {
733 /* Handle SIMD FPU exceptions on PIII+ processors. */
734 ignore_fpu_irq = 1;
735 simd_math_error((void __user *)regs->ip);
736 return;
737 }
738 /*
739 * Handle strange cache flush from user space exception
740 * in all other cases. This is undocumented behaviour.
741 */
742 if (regs->flags & X86_VM_MASK) {
743 handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
744 return;
745 }
746 current->thread.trap_no = 19;
747 current->thread.error_code = error_code;
748 die_if_kernel("cache flush denied", regs, error_code);
749 force_sig(SIGSEGV, current);
750#else
751 if (!user_mode(regs) &&
752 kernel_math_error(regs, "kernel simd math error", 19))
753 return;
754 simd_math_error((void __user *)regs->ip);
755#endif
756} 679}
757 680
758dotraplinkage void 681dotraplinkage void
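
[Note] The unified math_error() computes one "unmasked pending exception" word for both trap 16 (x87) and trap 19 (SIMD): for x87 it is swd & ~cwd, so only exceptions the control word leaves unmasked survive; for SIMD the MXCSR mask bits (bits 7-12) are shifted down over the flag bits (bits 0-5), giving ~(mxcsr >> 7) & mxcsr. The resulting bits are then mapped to FPE_* codes exactly as before. A runnable decoder for that word, using the standard x87/SSE status layouts and example register values:

    #include <stdio.h>

    /* Mirror the if-chain in math_error(): only the low 6 bits are consulted. */
    static const char *fpe_name(unsigned short err)
    {
        if (err & 0x001) return "invalid op (FPE_FLTINV)";
        if (err & 0x004) return "divide by zero (FPE_FLTDIV)";
        if (err & 0x008) return "overflow (FPE_FLTOVF)";
        if (err & 0x012) return "underflow/denormal (FPE_FLTUND)";
        if (err & 0x020) return "inexact (FPE_FLTRES)";
        return "none/unknown";
    }

    int main(void)
    {
        /* x87: status word has ZE pending; control word 0x037b leaves ZE unmasked. */
        unsigned short swd = 0x0004, cwd = 0x037b;
        unsigned short err87 = swd & ~cwd;

        /* SSE: MXCSR with the ZE flag set (bit 2) and the ZM mask clear (bit 9). */
        unsigned short mxcsr = 0x1d84;
        unsigned short errsimd = (~(mxcsr >> 7) & mxcsr) & 0x3f;

        printf("x87 : 0x%03x -> %s\n", (unsigned)err87, fpe_name(err87));
        printf("simd: 0x%03x -> %s\n", (unsigned)errsimd, fpe_name(errsimd));
        return 0;
    }
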
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b22496..1b950d151e58 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55 55
56EXPORT_SYMBOL(empty_zero_page); 56EXPORT_SYMBOL(empty_zero_page);
57EXPORT_SYMBOL(init_level4_pgt);
58#ifndef CONFIG_PARAVIRT 57#ifndef CONFIG_PARAVIRT
59EXPORT_SYMBOL(native_load_gs_index); 58EXPORT_SYMBOL(native_load_gs_index);
60#endif 59#endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec6..37e68fc5e24a 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
99 if (err) 99 if (err)
100 return err; 100 return err;
101 101
102 if (task_thread_info(tsk)->status & TS_XSAVE) 102 if (use_xsave())
103 err = xsave_user(buf); 103 err = xsave_user(buf);
104 else 104 else
105 err = fxsave_user(buf); 105 err = fxsave_user(buf);
@@ -109,14 +109,14 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 110 stts();
111 } else { 111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, 112 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 113 xstate_size))
114 return -1; 114 return -1;
115 } 115 }
116 116
117 clear_used_math(); /* trigger finit */ 117 clear_used_math(); /* trigger finit */
118 118
119 if (task_thread_info(tsk)->status & TS_XSAVE) { 119 if (use_xsave()) {
120 struct _fpstate __user *fx = buf; 120 struct _fpstate __user *fx = buf;
121 struct _xstate __user *x = buf; 121 struct _xstate __user *x = buf;
122 u64 xstate_bv; 122 u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
225 clts(); 225 clts();
226 task_thread_info(current)->status |= TS_USEDFPU; 226 task_thread_info(current)->status |= TS_USEDFPU;
227 } 227 }
228 if (task_thread_info(tsk)->status & TS_XSAVE) 228 if (use_xsave())
229 err = restore_user_xstate(buf); 229 err = restore_user_xstate(buf);
230 else 230 else
231 err = fxrstor_checking((__force struct i387_fxsave_struct *) 231 err = fxrstor_checking((__force struct i387_fxsave_struct *)
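
[Note] use_xsave() turns the old per-thread TS_XSAVE status check into a CPU-feature test: the xsave path is taken whenever the kernel has enabled XSAVE at all, not per task. Whether the processor offers XSAVE is advertised in CPUID leaf 1, ECX bit 26, with OSXSAVE in bit 27 once the OS has set CR4.OSXSAVE; a runnable user-space probe:

    #include <stdio.h>
    #include <cpuid.h>    /* GCC/Clang __get_cpuid() wrapper */

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 1 not available");
            return 1;
        }
        printf("XSAVE supported : %s\n", (ecx & (1u << 26)) ? "yes" : "no");
        printf("OSXSAVE enabled : %s\n", (ecx & (1u << 27)) ? "yes" : "no");
        return 0;
    }
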
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ba58206812a..737361fcd503 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2067,7 +2067,7 @@ static int cpuid_interception(struct vcpu_svm *svm)
2067static int iret_interception(struct vcpu_svm *svm) 2067static int iret_interception(struct vcpu_svm *svm)
2068{ 2068{
2069 ++svm->vcpu.stat.nmi_window_exits; 2069 ++svm->vcpu.stat.nmi_window_exits;
2070 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2070 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2071 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2071 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2072 return 1; 2072 return 1;
2073} 2073}
@@ -2479,7 +2479,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2479 2479
2480 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 2480 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2481 vcpu->arch.hflags |= HF_NMI_MASK; 2481 vcpu->arch.hflags |= HF_NMI_MASK;
2482 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); 2482 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2483 ++vcpu->stat.nmi_injections; 2483 ++vcpu->stat.nmi_injections;
2484} 2484}
2485 2485
@@ -2539,10 +2539,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2539 2539
2540 if (masked) { 2540 if (masked) {
2541 svm->vcpu.arch.hflags |= HF_NMI_MASK; 2541 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2542 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); 2542 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2543 } else { 2543 } else {
2544 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 2544 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2545 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2545 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2546 } 2546 }
2547} 2547}
2548 2548
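
[Note] The svm.c change from 1UL to 1ULL matters on 32-bit hosts: vmcb->control.intercept is a 64-bit field, but ~(1UL << n) is computed in 32-bit arithmetic there, so the &= used to clear an intercept zero-extends and wipes every intercept bit above 31 (VMRUN, VMMCALL and friends); the |= side is merely inconsistent for low bit indices. A short runnable demonstration of the masking bug, using bit 20 purely as a stand-in index and uint32_t to emulate a 32-bit unsigned long:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t intercept = 0xffffffffffffffffULL;    /* all intercepts set */

        /* What a 32-bit kernel computes for ~(1UL << 20): only 32 bits wide. */
        uint32_t bad_mask  = ~(UINT32_C(1) << 20);
        uint64_t good_mask = ~(UINT64_C(1) << 20);

        printf("&= bad  mask: %016llx\n",
               (unsigned long long)(intercept & bad_mask));   /* high word wiped */
        printf("&= good mask: %016llx\n",
               (unsigned long long)(intercept & good_mask));  /* only bit 20 clear */
        return 0;
    }
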
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc933cfb4e66..edca080407a5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2703,8 +2703,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2703 return 0; 2703 return 0;
2704 2704
2705 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2705 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2706 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | 2706 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
2707 GUEST_INTR_STATE_NMI));
2708} 2707}
2709 2708
2710static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 2709static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -3660,8 +3659,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3660 3659
3661 /* We need to handle NMIs before interrupts are enabled */ 3660 /* We need to handle NMIs before interrupts are enabled */
3662 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 3661 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3663 (exit_intr_info & INTR_INFO_VALID_MASK)) 3662 (exit_intr_info & INTR_INFO_VALID_MASK)) {
3663 kvm_before_handle_nmi(&vmx->vcpu);
3664 asm("int $2"); 3664 asm("int $2");
3665 kvm_after_handle_nmi(&vmx->vcpu);
3666 }
3665 3667
3666 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3668 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3667 3669
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4ca98ad27f..dd9bc8fb81ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -40,6 +40,7 @@
40#include <linux/user-return-notifier.h> 40#include <linux/user-return-notifier.h>
41#include <linux/srcu.h> 41#include <linux/srcu.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/perf_event.h>
43#include <trace/events/kvm.h> 44#include <trace/events/kvm.h>
44#undef TRACE_INCLUDE_FILE 45#undef TRACE_INCLUDE_FILE
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
@@ -1712,6 +1713,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1712 if (copy_from_user(cpuid_entries, entries, 1713 if (copy_from_user(cpuid_entries, entries,
1713 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1714 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1714 goto out_free; 1715 goto out_free;
1716 vcpu_load(vcpu);
1715 for (i = 0; i < cpuid->nent; i++) { 1717 for (i = 0; i < cpuid->nent; i++) {
1716 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1718 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1717 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1719 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1729,6 +1731,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1729 r = 0; 1731 r = 0;
1730 kvm_apic_set_version(vcpu); 1732 kvm_apic_set_version(vcpu);
1731 kvm_x86_ops->cpuid_update(vcpu); 1733 kvm_x86_ops->cpuid_update(vcpu);
1734 vcpu_put(vcpu);
1732 1735
1733out_free: 1736out_free:
1734 vfree(cpuid_entries); 1737 vfree(cpuid_entries);
@@ -1749,9 +1752,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1749 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1752 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1750 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1753 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1751 goto out; 1754 goto out;
1755 vcpu_load(vcpu);
1752 vcpu->arch.cpuid_nent = cpuid->nent; 1756 vcpu->arch.cpuid_nent = cpuid->nent;
1753 kvm_apic_set_version(vcpu); 1757 kvm_apic_set_version(vcpu);
1754 kvm_x86_ops->cpuid_update(vcpu); 1758 kvm_x86_ops->cpuid_update(vcpu);
1759 vcpu_put(vcpu);
1755 return 0; 1760 return 0;
1756 1761
1757out: 1762out:
@@ -3743,6 +3748,51 @@ static void kvm_timer_init(void)
3743 } 3748 }
3744} 3749}
3745 3750
3751static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
3752
3753static int kvm_is_in_guest(void)
3754{
3755 return percpu_read(current_vcpu) != NULL;
3756}
3757
3758static int kvm_is_user_mode(void)
3759{
3760 int user_mode = 3;
3761
3762 if (percpu_read(current_vcpu))
3763 user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
3764
3765 return user_mode != 0;
3766}
3767
3768static unsigned long kvm_get_guest_ip(void)
3769{
3770 unsigned long ip = 0;
3771
3772 if (percpu_read(current_vcpu))
3773 ip = kvm_rip_read(percpu_read(current_vcpu));
3774
3775 return ip;
3776}
3777
3778static struct perf_guest_info_callbacks kvm_guest_cbs = {
3779 .is_in_guest = kvm_is_in_guest,
3780 .is_user_mode = kvm_is_user_mode,
3781 .get_guest_ip = kvm_get_guest_ip,
3782};
3783
3784void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
3785{
3786 percpu_write(current_vcpu, vcpu);
3787}
3788EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
3789
3790void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
3791{
3792 percpu_write(current_vcpu, NULL);
3793}
3794EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
3795
3746int kvm_arch_init(void *opaque) 3796int kvm_arch_init(void *opaque)
3747{ 3797{
3748 int r; 3798 int r;
@@ -3779,6 +3829,8 @@ int kvm_arch_init(void *opaque)
3779 3829
3780 kvm_timer_init(); 3830 kvm_timer_init();
3781 3831
3832 perf_register_guest_info_callbacks(&kvm_guest_cbs);
3833
3782 return 0; 3834 return 0;
3783 3835
3784out: 3836out:
@@ -3787,6 +3839,8 @@ out:
3787 3839
3788void kvm_arch_exit(void) 3840void kvm_arch_exit(void)
3789{ 3841{
3842 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
3843
3790 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 3844 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
3791 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 3845 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
3792 CPUFREQ_TRANSITION_NOTIFIER); 3846 CPUFREQ_TRANSITION_NOTIFIER);
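
[Note] The new perf guest-info callbacks let the PMU NMI handler attribute samples to a guest: kvm_before_handle_nmi()/kvm_after_handle_nmi() bracket the reflected NMI (the int $2 in the vmx.c hunk above) by publishing the vCPU in a per-CPU pointer, and the three callbacks answer "are we in a guest, at what privilege level, at which RIP" from that pointer. A runnable single-threaded analog, with a file-scope pointer standing in for the per-CPU variable:

    #include <stdio.h>
    #include <stddef.h>

    struct vcpu { unsigned long rip; int cpl; };

    /* Stand-in for the per-CPU current_vcpu (one CPU, so a plain pointer). */
    static struct vcpu *current_vcpu;

    static int is_in_guest(void)            { return current_vcpu != NULL; }
    static int is_user_mode(void)           { return current_vcpu && current_vcpu->cpl != 0; }
    static unsigned long get_guest_ip(void) { return current_vcpu ? current_vcpu->rip : 0; }

    /* Pretend a PMI/NMI fires and the profiler queries the callbacks. */
    static void fake_pmi(void)
    {
        printf("in_guest=%d user=%d ip=0x%lx\n",
               is_in_guest(), is_user_mode(), get_guest_ip());
    }

    int main(void)
    {
        struct vcpu v = { .rip = 0x400000UL, .cpl = 0 };

        fake_pmi();           /* outside the bracket: host sample */

        current_vcpu = &v;    /* kvm_before_handle_nmi() */
        fake_pmi();           /* attributed to the guest */
        current_vcpu = NULL;  /* kvm_after_handle_nmi() */

        fake_pmi();
        return 0;
    }
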
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d101639bd8d..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,4 +65,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
70
68#endif 71#endif
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 419386c24b82..f871e04b6965 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -20,17 +20,18 @@ lib-y := delay.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
21lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o getuser.o putuser.o
22lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-$(CONFIG_KPROBES) += insn.o inat.o 23lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
24 24
25obj-y += msr.o msr-reg.o msr-reg-export.o 25obj-y += msr.o msr-reg.o msr-reg-export.o
26 26
27ifeq ($(CONFIG_X86_32),y) 27ifeq ($(CONFIG_X86_32),y)
28 obj-y += atomic64_32.o 28 obj-y += atomic64_32.o
29 lib-y += atomic64_cx8_32.o
29 lib-y += checksum_32.o 30 lib-y += checksum_32.o
30 lib-y += strstr_32.o 31 lib-y += strstr_32.o
31 lib-y += semaphore_32.o string_32.o 32 lib-y += semaphore_32.o string_32.o
32ifneq ($(CONFIG_X86_CMPXCHG64),y) 33ifneq ($(CONFIG_X86_CMPXCHG64),y)
33 lib-y += cmpxchg8b_emu.o 34 lib-y += cmpxchg8b_emu.o atomic64_386_32.o
34endif 35endif
35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 36 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
36else 37else
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0be55a3..540179e8e9fa 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -6,225 +6,54 @@
6#include <asm/cmpxchg.h> 6#include <asm/cmpxchg.h>
7#include <asm/atomic.h> 7#include <asm/atomic.h>
8 8
9static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) 9long long atomic64_read_cx8(long long, const atomic64_t *v);
10{ 10EXPORT_SYMBOL(atomic64_read_cx8);
11 u32 low = new; 11long long atomic64_set_cx8(long long, const atomic64_t *v);
12 u32 high = new >> 32; 12EXPORT_SYMBOL(atomic64_set_cx8);
13 13long long atomic64_xchg_cx8(long long, unsigned high);
14 asm volatile( 14EXPORT_SYMBOL(atomic64_xchg_cx8);
15 LOCK_PREFIX "cmpxchg8b %1\n" 15long long atomic64_add_return_cx8(long long a, atomic64_t *v);
16 : "+A" (old), "+m" (*ptr) 16EXPORT_SYMBOL(atomic64_add_return_cx8);
17 : "b" (low), "c" (high) 17long long atomic64_sub_return_cx8(long long a, atomic64_t *v);
18 ); 18EXPORT_SYMBOL(atomic64_sub_return_cx8);
19 return old; 19long long atomic64_inc_return_cx8(long long a, atomic64_t *v);
20} 20EXPORT_SYMBOL(atomic64_inc_return_cx8);
21 21long long atomic64_dec_return_cx8(long long a, atomic64_t *v);
22u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) 22EXPORT_SYMBOL(atomic64_dec_return_cx8);
23{ 23long long atomic64_dec_if_positive_cx8(atomic64_t *v);
24 return cmpxchg8b(&ptr->counter, old_val, new_val); 24EXPORT_SYMBOL(atomic64_dec_if_positive_cx8);
25} 25int atomic64_inc_not_zero_cx8(atomic64_t *v);
26EXPORT_SYMBOL(atomic64_cmpxchg); 26EXPORT_SYMBOL(atomic64_inc_not_zero_cx8);
27 27int atomic64_add_unless_cx8(atomic64_t *v, long long a, long long u);
28/** 28EXPORT_SYMBOL(atomic64_add_unless_cx8);
29 * atomic64_xchg - xchg atomic64 variable 29
30 * @ptr: pointer to type atomic64_t 30#ifndef CONFIG_X86_CMPXCHG64
31 * @new_val: value to assign 31long long atomic64_read_386(long long, const atomic64_t *v);
32 * 32EXPORT_SYMBOL(atomic64_read_386);
33 * Atomically xchgs the value of @ptr to @new_val and returns 33long long atomic64_set_386(long long, const atomic64_t *v);
34 * the old value. 34EXPORT_SYMBOL(atomic64_set_386);
35 */ 35long long atomic64_xchg_386(long long, unsigned high);
36u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) 36EXPORT_SYMBOL(atomic64_xchg_386);
37{ 37long long atomic64_add_return_386(long long a, atomic64_t *v);
38 /* 38EXPORT_SYMBOL(atomic64_add_return_386);
39 * Try first with a (possibly incorrect) assumption about 39long long atomic64_sub_return_386(long long a, atomic64_t *v);
40 * what we have there. We'll do two loops most likely, 40EXPORT_SYMBOL(atomic64_sub_return_386);
41 * but we'll get an ownership MESI transaction straight away 41long long atomic64_inc_return_386(long long a, atomic64_t *v);
42 * instead of a read transaction followed by a 42EXPORT_SYMBOL(atomic64_inc_return_386);
43 * flush-for-ownership transaction: 43long long atomic64_dec_return_386(long long a, atomic64_t *v);
44 */ 44EXPORT_SYMBOL(atomic64_dec_return_386);
45 u64 old_val, real_val = 0; 45long long atomic64_add_386(long long a, atomic64_t *v);
46 46EXPORT_SYMBOL(atomic64_add_386);
47 do { 47long long atomic64_sub_386(long long a, atomic64_t *v);
48 old_val = real_val; 48EXPORT_SYMBOL(atomic64_sub_386);
49 49long long atomic64_inc_386(long long a, atomic64_t *v);
50 real_val = atomic64_cmpxchg(ptr, old_val, new_val); 50EXPORT_SYMBOL(atomic64_inc_386);
51 51long long atomic64_dec_386(long long a, atomic64_t *v);
52 } while (real_val != old_val); 52EXPORT_SYMBOL(atomic64_dec_386);
53 53long long atomic64_dec_if_positive_386(atomic64_t *v);
54 return old_val; 54EXPORT_SYMBOL(atomic64_dec_if_positive_386);
55} 55int atomic64_inc_not_zero_386(atomic64_t *v);
56EXPORT_SYMBOL(atomic64_xchg); 56EXPORT_SYMBOL(atomic64_inc_not_zero_386);
57 57int atomic64_add_unless_386(atomic64_t *v, long long a, long long u);
58/** 58EXPORT_SYMBOL(atomic64_add_unless_386);
59 * atomic64_set - set atomic64 variable 59#endif
60 * @ptr: pointer to type atomic64_t
61 * @new_val: value to assign
62 *
63 * Atomically sets the value of @ptr to @new_val.
64 */
65void atomic64_set(atomic64_t *ptr, u64 new_val)
66{
67 atomic64_xchg(ptr, new_val);
68}
69EXPORT_SYMBOL(atomic64_set);
70
71/**
72EXPORT_SYMBOL(atomic64_read);
73 * atomic64_add_return - add and return
74 * @delta: integer value to add
75 * @ptr: pointer to type atomic64_t
76 *
77 * Atomically adds @delta to @ptr and returns @delta + *@ptr
78 */
79noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
80{
81 /*
82 * Try first with a (possibly incorrect) assumption about
83 * what we have there. We'll do two loops most likely,
84 * but we'll get an ownership MESI transaction straight away
85 * instead of a read transaction followed by a
86 * flush-for-ownership transaction:
87 */
88 u64 old_val, new_val, real_val = 0;
89
90 do {
91 old_val = real_val;
92 new_val = old_val + delta;
93
94 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
95
96 } while (real_val != old_val);
97
98 return new_val;
99}
100EXPORT_SYMBOL(atomic64_add_return);
101
102u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
103{
104 return atomic64_add_return(-delta, ptr);
105}
106EXPORT_SYMBOL(atomic64_sub_return);
107
108u64 atomic64_inc_return(atomic64_t *ptr)
109{
110 return atomic64_add_return(1, ptr);
111}
112EXPORT_SYMBOL(atomic64_inc_return);
113
114u64 atomic64_dec_return(atomic64_t *ptr)
115{
116 return atomic64_sub_return(1, ptr);
117}
118EXPORT_SYMBOL(atomic64_dec_return);
119
120/**
121 * atomic64_add - add integer to atomic64 variable
122 * @delta: integer value to add
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically adds @delta to @ptr.
126 */
127void atomic64_add(u64 delta, atomic64_t *ptr)
128{
129 atomic64_add_return(delta, ptr);
130}
131EXPORT_SYMBOL(atomic64_add);
132
133/**
134 * atomic64_sub - subtract the atomic64 variable
135 * @delta: integer value to subtract
136 * @ptr: pointer to type atomic64_t
137 *
138 * Atomically subtracts @delta from @ptr.
139 */
140void atomic64_sub(u64 delta, atomic64_t *ptr)
141{
142 atomic64_add(-delta, ptr);
143}
144EXPORT_SYMBOL(atomic64_sub);
145
146/**
147 * atomic64_sub_and_test - subtract value from variable and test result
148 * @delta: integer value to subtract
149 * @ptr: pointer to type atomic64_t
150 *
151 * Atomically subtracts @delta from @ptr and returns
152 * true if the result is zero, or false for all
153 * other cases.
154 */
155int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
156{
157 u64 new_val = atomic64_sub_return(delta, ptr);
158
159 return new_val == 0;
160}
161EXPORT_SYMBOL(atomic64_sub_and_test);
162
163/**
164 * atomic64_inc - increment atomic64 variable
165 * @ptr: pointer to type atomic64_t
166 *
167 * Atomically increments @ptr by 1.
168 */
169void atomic64_inc(atomic64_t *ptr)
170{
171 atomic64_add(1, ptr);
172}
173EXPORT_SYMBOL(atomic64_inc);
174
175/**
176 * atomic64_dec - decrement atomic64 variable
177 * @ptr: pointer to type atomic64_t
178 *
179 * Atomically decrements @ptr by 1.
180 */
181void atomic64_dec(atomic64_t *ptr)
182{
183 atomic64_sub(1, ptr);
184}
185EXPORT_SYMBOL(atomic64_dec);
186
187/**
188 * atomic64_dec_and_test - decrement and test
189 * @ptr: pointer to type atomic64_t
190 *
191 * Atomically decrements @ptr by 1 and
192 * returns true if the result is 0, or false for all other
193 * cases.
194 */
195int atomic64_dec_and_test(atomic64_t *ptr)
196{
197 return atomic64_sub_and_test(1, ptr);
198}
199EXPORT_SYMBOL(atomic64_dec_and_test);
200
201/**
202 * atomic64_inc_and_test - increment and test
203 * @ptr: pointer to type atomic64_t
204 *
205 * Atomically increments @ptr by 1
206 * and returns true if the result is zero, or false for all
207 * other cases.
208 */
209int atomic64_inc_and_test(atomic64_t *ptr)
210{
211 return atomic64_sub_and_test(-1, ptr);
212}
213EXPORT_SYMBOL(atomic64_inc_and_test);
214
215/**
216 * atomic64_add_negative - add and test if negative
217 * @delta: integer value to add
218 * @ptr: pointer to type atomic64_t
219 *
220 * Atomically adds @delta to @ptr and returns true
221 * if the result is negative, or false when
222 * result is greater than or equal to zero.
223 */
224int atomic64_add_negative(u64 delta, atomic64_t *ptr)
225{
226 s64 new_val = atomic64_add_return(delta, ptr);
227
228 return new_val < 0;
229}
230EXPORT_SYMBOL(atomic64_add_negative);
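
[Note] The C implementations removed above all funneled through a guess-then-cmpxchg8b retry loop: assume a value for the 64-bit counter, attempt the compare-and-exchange, and loop until the guess matches (which, as the removed comments note, acquires the cache line for ownership up front instead of read-then-flush-for-ownership). Their replacements are the assembly files that follow, one built on cmpxchg8b (_cx8) and one using interrupt-disabled sections for 386/486 (_386). A runnable user-space version of the retry loop, using the GCC/Clang __atomic builtins in place of raw cmpxchg8b:

    #include <stdio.h>
    #include <stdint.h>

    /* Add-and-return on a 64-bit value via compare-and-swap, mirroring the
     * removed atomic64_add_return(): guess, CAS, retry until the guess holds. */
    static uint64_t atomic64_add_return(uint64_t delta, uint64_t *ptr)
    {
        uint64_t old = __atomic_load_n(ptr, __ATOMIC_RELAXED);
        uint64_t new;

        do {
            new = old + delta;
            /* On failure, 'old' is refreshed with the current value. */
        } while (!__atomic_compare_exchange_n(ptr, &old, new, 0,
                                              __ATOMIC_SEQ_CST,
                                              __ATOMIC_RELAXED));
        return new;
    }

    int main(void)
    {
        uint64_t counter = 10;

        printf("%llu\n", (unsigned long long)atomic64_add_return(5, &counter));  /* 15 */
        printf("%llu\n", (unsigned long long)atomic64_add_return(1, &counter));  /* 16 */
        return 0;
    }
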
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 000000000000..4a5979aa6883
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,174 @@
1/*
2 * atomic64_t for 386/486
3 *
4 * Copyright © 2010 Luca Barbieri
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/linkage.h>
13#include <asm/alternative-asm.h>
14#include <asm/dwarf2.h>
15
16/* if you want SMP support, implement these with real spinlocks */
17.macro LOCK reg
18 pushfl
19 CFI_ADJUST_CFA_OFFSET 4
20 cli
21.endm
22
23.macro UNLOCK reg
24 popfl
25 CFI_ADJUST_CFA_OFFSET -4
26.endm
27
28.macro BEGIN func reg
29$v = \reg
30
31ENTRY(atomic64_\func\()_386)
32 CFI_STARTPROC
33 LOCK $v
34
35.macro RETURN
36 UNLOCK $v
37 ret
38.endm
39
40.macro END_
41 CFI_ENDPROC
42ENDPROC(atomic64_\func\()_386)
43.purgem RETURN
44.purgem END_
45.purgem END
46.endm
47
48.macro END
49RETURN
50END_
51.endm
52.endm
53
54BEGIN read %ecx
55 movl ($v), %eax
56 movl 4($v), %edx
57END
58
59BEGIN set %esi
60 movl %ebx, ($v)
61 movl %ecx, 4($v)
62END
63
64BEGIN xchg %esi
65 movl ($v), %eax
66 movl 4($v), %edx
67 movl %ebx, ($v)
68 movl %ecx, 4($v)
69END
70
71BEGIN add %ecx
72 addl %eax, ($v)
73 adcl %edx, 4($v)
74END
75
76BEGIN add_return %ecx
77 addl ($v), %eax
78 adcl 4($v), %edx
79 movl %eax, ($v)
80 movl %edx, 4($v)
81END
82
83BEGIN sub %ecx
84 subl %eax, ($v)
85 sbbl %edx, 4($v)
86END
87
88BEGIN sub_return %ecx
89 negl %edx
90 negl %eax
91 sbbl $0, %edx
92 addl ($v), %eax
93 adcl 4($v), %edx
94 movl %eax, ($v)
95 movl %edx, 4($v)
96END
97
98BEGIN inc %esi
99 addl $1, ($v)
100 adcl $0, 4($v)
101END
102
103BEGIN inc_return %esi
104 movl ($v), %eax
105 movl 4($v), %edx
106 addl $1, %eax
107 adcl $0, %edx
108 movl %eax, ($v)
109 movl %edx, 4($v)
110END
111
112BEGIN dec %esi
113 subl $1, ($v)
114 sbbl $0, 4($v)
115END
116
117BEGIN dec_return %esi
118 movl ($v), %eax
119 movl 4($v), %edx
120 subl $1, %eax
121 sbbl $0, %edx
122 movl %eax, ($v)
123 movl %edx, 4($v)
124END
125
126BEGIN add_unless %ecx
127 addl %eax, %esi
128 adcl %edx, %edi
129 addl ($v), %eax
130 adcl 4($v), %edx
131 cmpl %eax, %esi
132 je 3f
1331:
134 movl %eax, ($v)
135 movl %edx, 4($v)
136 movl $1, %eax
1372:
138RETURN
1393:
140 cmpl %edx, %edi
141 jne 1b
142 xorl %eax, %eax
143 jmp 2b
144END_
145
146BEGIN inc_not_zero %esi
147 movl ($v), %eax
148 movl 4($v), %edx
149 testl %eax, %eax
150 je 3f
1511:
152 addl $1, %eax
153 adcl $0, %edx
154 movl %eax, ($v)
155 movl %edx, 4($v)
156 movl $1, %eax
1572:
158RETURN
1593:
160 testl %edx, %edx
161 jne 1b
162 jmp 2b
163END_
164
165BEGIN dec_if_positive %esi
166 movl ($v), %eax
167 movl 4($v), %edx
168 subl $1, %eax
169 sbbl $0, %edx
170 js 1f
171 movl %eax, ($v)
172 movl %edx, 4($v)
1731:
174END
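
The 386/486 fallback above has no cmpxchg8b to lean on, so each operation simply runs with interrupts disabled (the pushfl/cli ... popfl in LOCK/UNLOCK), which is sufficient on UP; the macros are deliberately the place to drop in real spinlocks if SMP support were ever wanted here. A minimal, non-atomic C sketch of what the addl/adcl pair computes (names are illustrative only):

#include <stdint.h>
#include <stdio.h>

struct u64_halves {
        uint32_t lo;
        uint32_t hi;
};

static void add64(struct u64_halves *v, uint32_t dlo, uint32_t dhi)
{
        uint32_t old_lo = v->lo;

        v->lo += dlo;                     /* addl  */
        v->hi += dhi + (v->lo < old_lo);  /* adcl: fold in the carry from the low word */
}

int main(void)
{
        struct u64_halves v = { 0xffffffffu, 0 };

        add64(&v, 1, 0);
        printf("0x%08x%08x\n", (unsigned)v.hi, (unsigned)v.lo);  /* 0x0000000100000000 */
        return 0;
}
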
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 000000000000..71e080de3352
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,224 @@
1/*
2 * atomic64_t for 586+
3 *
4 * Copyright © 2010 Luca Barbieri
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/linkage.h>
13#include <asm/alternative-asm.h>
14#include <asm/dwarf2.h>
15
16.macro SAVE reg
17 pushl %\reg
18 CFI_ADJUST_CFA_OFFSET 4
19 CFI_REL_OFFSET \reg, 0
20.endm
21
22.macro RESTORE reg
23 popl %\reg
24 CFI_ADJUST_CFA_OFFSET -4
25 CFI_RESTORE \reg
26.endm
27
28.macro read64 reg
29 movl %ebx, %eax
30 movl %ecx, %edx
31/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
32 LOCK_PREFIX
33 cmpxchg8b (\reg)
34.endm
35
36ENTRY(atomic64_read_cx8)
37 CFI_STARTPROC
38
39 read64 %ecx
40 ret
41 CFI_ENDPROC
42ENDPROC(atomic64_read_cx8)
43
44ENTRY(atomic64_set_cx8)
45 CFI_STARTPROC
46
471:
48/* we don't need LOCK_PREFIX since aligned 64-bit writes
49 * are atomic on 586 and newer */
50 cmpxchg8b (%esi)
51 jne 1b
52
53 ret
54 CFI_ENDPROC
55ENDPROC(atomic64_set_cx8)
56
57ENTRY(atomic64_xchg_cx8)
58 CFI_STARTPROC
59
60 movl %ebx, %eax
61 movl %ecx, %edx
621:
63 LOCK_PREFIX
64 cmpxchg8b (%esi)
65 jne 1b
66
67 ret
68 CFI_ENDPROC
69ENDPROC(atomic64_xchg_cx8)
70
71.macro addsub_return func ins insc
72ENTRY(atomic64_\func\()_return_cx8)
73 CFI_STARTPROC
74 SAVE ebp
75 SAVE ebx
76 SAVE esi
77 SAVE edi
78
79 movl %eax, %esi
80 movl %edx, %edi
81 movl %ecx, %ebp
82
83 read64 %ebp
841:
85 movl %eax, %ebx
86 movl %edx, %ecx
87 \ins\()l %esi, %ebx
88 \insc\()l %edi, %ecx
89 LOCK_PREFIX
90 cmpxchg8b (%ebp)
91 jne 1b
92
9310:
94 movl %ebx, %eax
95 movl %ecx, %edx
96 RESTORE edi
97 RESTORE esi
98 RESTORE ebx
99 RESTORE ebp
100 ret
101 CFI_ENDPROC
102ENDPROC(atomic64_\func\()_return_cx8)
103.endm
104
105addsub_return add add adc
106addsub_return sub sub sbb
107
108.macro incdec_return func ins insc
109ENTRY(atomic64_\func\()_return_cx8)
110 CFI_STARTPROC
111 SAVE ebx
112
113 read64 %esi
1141:
115 movl %eax, %ebx
116 movl %edx, %ecx
117 \ins\()l $1, %ebx
118 \insc\()l $0, %ecx
119 LOCK_PREFIX
120 cmpxchg8b (%esi)
121 jne 1b
122
12310:
124 movl %ebx, %eax
125 movl %ecx, %edx
126 RESTORE ebx
127 ret
128 CFI_ENDPROC
129ENDPROC(atomic64_\func\()_return_cx8)
130.endm
131
132incdec_return inc add adc
133incdec_return dec sub sbb
134
135ENTRY(atomic64_dec_if_positive_cx8)
136 CFI_STARTPROC
137 SAVE ebx
138
139 read64 %esi
1401:
141 movl %eax, %ebx
142 movl %edx, %ecx
143 subl $1, %ebx
144 sbb $0, %ecx
145 js 2f
146 LOCK_PREFIX
147 cmpxchg8b (%esi)
148 jne 1b
149
1502:
151 movl %ebx, %eax
152 movl %ecx, %edx
153 RESTORE ebx
154 ret
155 CFI_ENDPROC
156ENDPROC(atomic64_dec_if_positive_cx8)
157
158ENTRY(atomic64_add_unless_cx8)
159 CFI_STARTPROC
160 SAVE ebp
161 SAVE ebx
 162/* these two SAVEs just push the two parameters on the stack */
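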
163 SAVE edi
164 SAVE esi
165
166 movl %ecx, %ebp
167 movl %eax, %esi
168 movl %edx, %edi
169
170 read64 %ebp
1711:
172 cmpl %eax, 0(%esp)
173 je 4f
1742:
175 movl %eax, %ebx
176 movl %edx, %ecx
177 addl %esi, %ebx
178 adcl %edi, %ecx
179 LOCK_PREFIX
180 cmpxchg8b (%ebp)
181 jne 1b
182
183 movl $1, %eax
1843:
185 addl $8, %esp
186 CFI_ADJUST_CFA_OFFSET -8
187 RESTORE ebx
188 RESTORE ebp
189 ret
1904:
191 cmpl %edx, 4(%esp)
192 jne 2b
193 xorl %eax, %eax
194 jmp 3b
195 CFI_ENDPROC
196ENDPROC(atomic64_add_unless_cx8)
197
198ENTRY(atomic64_inc_not_zero_cx8)
199 CFI_STARTPROC
200 SAVE ebx
201
202 read64 %esi
2031:
204 testl %eax, %eax
205 je 4f
2062:
207 movl %eax, %ebx
208 movl %edx, %ecx
209 addl $1, %ebx
210 adcl $0, %ecx
211 LOCK_PREFIX
212 cmpxchg8b (%esi)
213 jne 1b
214
215 movl $1, %eax
2163:
217 RESTORE ebx
218 ret
2194:
220 testl %edx, %edx
221 jne 2b
222 jmp 3b
223 CFI_ENDPROC
224ENDPROC(atomic64_inc_not_zero_cx8)
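
Every *_return_cx8 routine above follows the same shape: read the current 64-bit value (read64 uses a LOCKed cmpxchg8b because cmpxchg8b always drives a write cycle, so even a read must be locked to be atomic), compute the new value in ebx:ecx, then retry cmpxchg8b until no other CPU modified the variable in between. A rough user-space model of that retry loop, built on GCC's __sync builtin, which lowers to lock cmpxchg8b for 8-byte operands on 32-bit x86; add_return64() is a stand-in, not the kernel interface:

#include <stdint.h>
#include <stdio.h>

static uint64_t counter;

static uint64_t add_return64(uint64_t *v, uint64_t delta)
{
        uint64_t old, new;

        do {
                old = *v;            /* snapshot the current value */
                new = old + delta;   /* compute the value we want  */
        } while (__sync_val_compare_and_swap(v, old, new) != old);

        return new;                  /* like addsub_return: hand back the new value */
}

int main(void)
{
        printf("%llu\n", (unsigned long long)add_return64(&counter, 5));
        return 0;
}
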
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index aa0987088774..dc8adad10a2f 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,10 +30,10 @@ static void fclex(void)
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit_task(struct task_struct *tsk) 33void finit_soft_fpu(struct i387_soft_struct *soft)
34{ 34{
35 struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
36 struct address *oaddr, *iaddr; 35 struct address *oaddr, *iaddr;
36 memset(soft, 0, sizeof(*soft));
37 soft->cwd = 0x037f; 37 soft->cwd = 0x037f;
38 soft->swd = 0; 38 soft->swd = 0;
39 soft->ftop = 0; /* We don't keep top in the status word internally. */ 39 soft->ftop = 0; /* We don't keep top in the status word internally. */
@@ -52,7 +52,7 @@ void finit_task(struct task_struct *tsk)
52 52
53void finit(void) 53void finit(void)
54{ 54{
55 finit_task(current); 55 finit_soft_fpu(&current->thread.fpu.state->soft);
56} 56}
57 57
58/* 58/*
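
The 0x037f that finit_soft_fpu() writes into soft->cwd is the x87 default control word, i.e. what a hardware FINIT would load: all six exception classes masked, 64-bit precision, round to nearest. A quick, purely illustrative decode of those fields:

#include <stdio.h>

int main(void)
{
        unsigned int cwd = 0x037f;   /* x87 default control word */

        printf("exception masks  : 0x%02x (IM DM ZM OM UM PM all set)\n", cwd & 0x3f);
        printf("precision control: %u (3 = 64-bit significand)\n", (cwd >> 8) & 3);
        printf("rounding control : %u (0 = round to nearest)\n", (cwd >> 10) & 3);
        return 0;
}
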
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 5d87f586f8d7..7718541541d4 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -681,7 +681,7 @@ int fpregs_soft_set(struct task_struct *target,
681 unsigned int pos, unsigned int count, 681 unsigned int pos, unsigned int count,
682 const void *kbuf, const void __user *ubuf) 682 const void *kbuf, const void __user *ubuf)
683{ 683{
684 struct i387_soft_struct *s387 = &target->thread.xstate->soft; 684 struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
685 void *space = s387->st_space; 685 void *space = s387->st_space;
686 int ret; 686 int ret;
687 int offset, other, i, tags, regnr, tag, newtop; 687 int offset, other, i, tags, regnr, tag, newtop;
@@ -733,7 +733,7 @@ int fpregs_soft_get(struct task_struct *target,
733 unsigned int pos, unsigned int count, 733 unsigned int pos, unsigned int count,
734 void *kbuf, void __user *ubuf) 734 void *kbuf, void __user *ubuf)
735{ 735{
736 struct i387_soft_struct *s387 = &target->thread.xstate->soft; 736 struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
737 const void *space = s387->st_space; 737 const void *space = s387->st_space;
738 int ret; 738 int ret;
739 int offset = (S387->ftop & 7) * 10, other = 80 - offset; 739 int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 50fa0ec2c8a5..2c614410a5f3 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -31,7 +31,7 @@
31#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ 31#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
32 == (1 << 10)) 32 == (1 << 10))
33 33
34#define I387 (current->thread.xstate) 34#define I387 (current->thread.fpu.state)
35#define FPU_info (I387->soft.info) 35#define FPU_info (I387->soft.info)
36 36
37#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) 37#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs))
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648f..f9897f7a9ef1 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -363,6 +363,54 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
363 for (i = 0; i < MAX_NUMNODES; i++) 363 for (i = 0; i < MAX_NUMNODES; i++)
364 cutoff_node(i, start, end); 364 cutoff_node(i, start, end);
365 365
366 /*
367 * Join together blocks on the same node, holes between
368 * which don't overlap with memory on other nodes.
369 */
370 for (i = 0; i < num_node_memblks; ++i) {
371 int j, k;
372
373 for (j = i + 1; j < num_node_memblks; ++j) {
374 unsigned long start, end;
375
376 if (memblk_nodeid[i] != memblk_nodeid[j])
377 continue;
378 start = min(node_memblk_range[i].end,
379 node_memblk_range[j].end);
380 end = max(node_memblk_range[i].start,
381 node_memblk_range[j].start);
382 for (k = 0; k < num_node_memblks; ++k) {
383 if (memblk_nodeid[i] == memblk_nodeid[k])
384 continue;
385 if (start < node_memblk_range[k].end &&
386 end > node_memblk_range[k].start)
387 break;
388 }
389 if (k < num_node_memblks)
390 continue;
391 start = min(node_memblk_range[i].start,
392 node_memblk_range[j].start);
393 end = max(node_memblk_range[i].end,
394 node_memblk_range[j].end);
395 printk(KERN_INFO "SRAT: Node %d "
396 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
397 memblk_nodeid[i],
398 node_memblk_range[i].start,
399 node_memblk_range[i].end,
400 node_memblk_range[j].start,
401 node_memblk_range[j].end,
402 start, end);
403 node_memblk_range[i].start = start;
404 node_memblk_range[i].end = end;
405 k = --num_node_memblks - j;
406 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
407 k * sizeof(*memblk_nodeid));
408 memmove(node_memblk_range + j, node_memblk_range + j+1,
409 k * sizeof(*node_memblk_range));
410 --j;
411 }
412 }
413
366 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 414 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
367 memblk_nodeid); 415 memblk_nodeid);
368 if (memnode_shift < 0) { 416 if (memnode_shift < 0) {
@@ -461,7 +509,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
461 * node, it must now point to the fake node ID. 509 * node, it must now point to the fake node ID.
462 */ 510 */
463 for (j = 0; j < MAX_LOCAL_APIC; j++) 511 for (j = 0; j < MAX_LOCAL_APIC; j++)
464 if (apicid_to_node[j] == nid) 512 if (apicid_to_node[j] == nid &&
513 fake_apicid_to_node[j] == NUMA_NO_NODE)
465 fake_apicid_to_node[j] = i; 514 fake_apicid_to_node[j] = i;
466 } 515 }
467 for (i = 0; i < num_nodes; i++) 516 for (i = 0; i < num_nodes; i++)
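
The loop added to acpi_scan_nodes() joins two memblks only when they belong to the same node and the hole between them does not overlap memory owned by any other node; when the test passes, block i absorbs block j and j is removed with memmove(). A standalone sketch of that admissibility test (struct range, blk[], nid[] and can_merge() are simplified stand-ins for node_memblk_range[] and memblk_nodeid[]):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct range {
        unsigned long start;
        unsigned long end;              /* exclusive */
};

static bool can_merge(const struct range *blk, const int *nid, size_t n,
                      size_t i, size_t j)
{
        unsigned long gap_start, gap_end;
        size_t k;

        if (nid[i] != nid[j])
                return false;

        /* the hole between the two blocks: min of the ends .. max of the starts */
        gap_start = blk[i].end < blk[j].end ? blk[i].end : blk[j].end;
        gap_end   = blk[i].start > blk[j].start ? blk[i].start : blk[j].start;

        for (k = 0; k < n; k++) {
                if (nid[k] == nid[i])
                        continue;
                if (gap_start < blk[k].end && gap_end > blk[k].start)
                        return false;   /* another node's memory sits in the hole */
        }
        return true;
}

int main(void)
{
        struct range blk[] = { { 0x0, 0x1000 }, { 0x3000, 0x4000 }, { 0x1000, 0x2000 } };
        int nid[] = { 0, 0, 1 };

        /* node 1 owns [0x1000,0x2000), which lies in the hole -> no merge (prints 0) */
        printf("%d\n", can_merge(blk, nid, 3, 0, 1));
        return 0;
}
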
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2c505ee71014..b28d2f1253bb 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -31,8 +31,9 @@ static struct op_x86_model_spec *model;
31static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); 31static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
32static DEFINE_PER_CPU(unsigned long, saved_lvtpc); 32static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
33 33
34/* 0 == registered but off, 1 == registered and on */ 34/* must be protected with get_online_cpus()/put_online_cpus(): */
35static int nmi_enabled = 0; 35static int nmi_enabled;
36static int ctr_running;
36 37
37struct op_counter_config counter_config[OP_MAX_COUNTER]; 38struct op_counter_config counter_config[OP_MAX_COUNTER];
38 39
@@ -61,12 +62,16 @@ static int profile_exceptions_notify(struct notifier_block *self,
61{ 62{
62 struct die_args *args = (struct die_args *)data; 63 struct die_args *args = (struct die_args *)data;
63 int ret = NOTIFY_DONE; 64 int ret = NOTIFY_DONE;
64 int cpu = smp_processor_id();
65 65
66 switch (val) { 66 switch (val) {
67 case DIE_NMI: 67 case DIE_NMI:
68 case DIE_NMI_IPI: 68 case DIE_NMI_IPI:
69 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); 69 if (ctr_running)
70 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
71 else if (!nmi_enabled)
72 break;
73 else
74 model->stop(&__get_cpu_var(cpu_msrs));
70 ret = NOTIFY_STOP; 75 ret = NOTIFY_STOP;
71 break; 76 break;
72 default: 77 default:
@@ -95,24 +100,36 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
95static void nmi_cpu_start(void *dummy) 100static void nmi_cpu_start(void *dummy)
96{ 101{
97 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); 102 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
98 model->start(msrs); 103 if (!msrs->controls)
104 WARN_ON_ONCE(1);
105 else
106 model->start(msrs);
99} 107}
100 108
101static int nmi_start(void) 109static int nmi_start(void)
102{ 110{
111 get_online_cpus();
103 on_each_cpu(nmi_cpu_start, NULL, 1); 112 on_each_cpu(nmi_cpu_start, NULL, 1);
113 ctr_running = 1;
114 put_online_cpus();
104 return 0; 115 return 0;
105} 116}
106 117
107static void nmi_cpu_stop(void *dummy) 118static void nmi_cpu_stop(void *dummy)
108{ 119{
109 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); 120 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
110 model->stop(msrs); 121 if (!msrs->controls)
122 WARN_ON_ONCE(1);
123 else
124 model->stop(msrs);
111} 125}
112 126
113static void nmi_stop(void) 127static void nmi_stop(void)
114{ 128{
129 get_online_cpus();
115 on_each_cpu(nmi_cpu_stop, NULL, 1); 130 on_each_cpu(nmi_cpu_stop, NULL, 1);
131 ctr_running = 0;
132 put_online_cpus();
116} 133}
117 134
118#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 135#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
@@ -252,7 +269,10 @@ static int nmi_switch_event(void)
252 if (nmi_multiplex_on() < 0) 269 if (nmi_multiplex_on() < 0)
253 return -EINVAL; /* not necessary */ 270 return -EINVAL; /* not necessary */
254 271
255 on_each_cpu(nmi_cpu_switch, NULL, 1); 272 get_online_cpus();
273 if (ctr_running)
274 on_each_cpu(nmi_cpu_switch, NULL, 1);
275 put_online_cpus();
256 276
257 return 0; 277 return 0;
258} 278}
@@ -295,6 +315,7 @@ static void free_msrs(void)
295 kfree(per_cpu(cpu_msrs, i).controls); 315 kfree(per_cpu(cpu_msrs, i).controls);
296 per_cpu(cpu_msrs, i).controls = NULL; 316 per_cpu(cpu_msrs, i).controls = NULL;
297 } 317 }
318 nmi_shutdown_mux();
298} 319}
299 320
300static int allocate_msrs(void) 321static int allocate_msrs(void)
@@ -307,14 +328,21 @@ static int allocate_msrs(void)
307 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, 328 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
308 GFP_KERNEL); 329 GFP_KERNEL);
309 if (!per_cpu(cpu_msrs, i).counters) 330 if (!per_cpu(cpu_msrs, i).counters)
310 return 0; 331 goto fail;
311 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, 332 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
312 GFP_KERNEL); 333 GFP_KERNEL);
313 if (!per_cpu(cpu_msrs, i).controls) 334 if (!per_cpu(cpu_msrs, i).controls)
314 return 0; 335 goto fail;
315 } 336 }
316 337
338 if (!nmi_setup_mux())
339 goto fail;
340
317 return 1; 341 return 1;
342
343fail:
344 free_msrs();
345 return 0;
318} 346}
319 347
320static void nmi_cpu_setup(void *dummy) 348static void nmi_cpu_setup(void *dummy)
@@ -336,49 +364,6 @@ static struct notifier_block profile_exceptions_nb = {
336 .priority = 2 364 .priority = 2
337}; 365};
338 366
339static int nmi_setup(void)
340{
341 int err = 0;
342 int cpu;
343
344 if (!allocate_msrs())
345 err = -ENOMEM;
346 else if (!nmi_setup_mux())
347 err = -ENOMEM;
348 else
349 err = register_die_notifier(&profile_exceptions_nb);
350
351 if (err) {
352 free_msrs();
353 nmi_shutdown_mux();
354 return err;
355 }
356
357 /* We need to serialize save and setup for HT because the subset
358 * of msrs are distinct for save and setup operations
359 */
360
361 /* Assume saved/restored counters are the same on all CPUs */
362 model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
363 for_each_possible_cpu(cpu) {
364 if (!cpu)
365 continue;
366
367 memcpy(per_cpu(cpu_msrs, cpu).counters,
368 per_cpu(cpu_msrs, 0).counters,
369 sizeof(struct op_msr) * model->num_counters);
370
371 memcpy(per_cpu(cpu_msrs, cpu).controls,
372 per_cpu(cpu_msrs, 0).controls,
373 sizeof(struct op_msr) * model->num_controls);
374
375 mux_clone(cpu);
376 }
377 on_each_cpu(nmi_cpu_setup, NULL, 1);
378 nmi_enabled = 1;
379 return 0;
380}
381
382static void nmi_cpu_restore_registers(struct op_msrs *msrs) 367static void nmi_cpu_restore_registers(struct op_msrs *msrs)
383{ 368{
384 struct op_msr *counters = msrs->counters; 369 struct op_msr *counters = msrs->counters;
@@ -412,20 +397,24 @@ static void nmi_cpu_shutdown(void *dummy)
412 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); 397 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
413 apic_write(APIC_LVTERR, v); 398 apic_write(APIC_LVTERR, v);
414 nmi_cpu_restore_registers(msrs); 399 nmi_cpu_restore_registers(msrs);
400 if (model->cpu_down)
401 model->cpu_down();
415} 402}
416 403
417static void nmi_shutdown(void) 404static void nmi_cpu_up(void *dummy)
418{ 405{
419 struct op_msrs *msrs; 406 if (nmi_enabled)
407 nmi_cpu_setup(dummy);
408 if (ctr_running)
409 nmi_cpu_start(dummy);
410}
420 411
421 nmi_enabled = 0; 412static void nmi_cpu_down(void *dummy)
422 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 413{
423 unregister_die_notifier(&profile_exceptions_nb); 414 if (ctr_running)
424 nmi_shutdown_mux(); 415 nmi_cpu_stop(dummy);
425 msrs = &get_cpu_var(cpu_msrs); 416 if (nmi_enabled)
426 model->shutdown(msrs); 417 nmi_cpu_shutdown(dummy);
427 free_msrs();
428 put_cpu_var(cpu_msrs);
429} 418}
430 419
431static int nmi_create_files(struct super_block *sb, struct dentry *root) 420static int nmi_create_files(struct super_block *sb, struct dentry *root)
@@ -457,7 +446,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
457 return 0; 446 return 0;
458} 447}
459 448
460#ifdef CONFIG_SMP
461static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, 449static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
462 void *data) 450 void *data)
463{ 451{
@@ -465,10 +453,10 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
465 switch (action) { 453 switch (action) {
466 case CPU_DOWN_FAILED: 454 case CPU_DOWN_FAILED:
467 case CPU_ONLINE: 455 case CPU_ONLINE:
468 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); 456 smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
469 break; 457 break;
470 case CPU_DOWN_PREPARE: 458 case CPU_DOWN_PREPARE:
471 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); 459 smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
472 break; 460 break;
473 } 461 }
474 return NOTIFY_DONE; 462 return NOTIFY_DONE;
@@ -477,7 +465,75 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
477static struct notifier_block oprofile_cpu_nb = { 465static struct notifier_block oprofile_cpu_nb = {
478 .notifier_call = oprofile_cpu_notifier 466 .notifier_call = oprofile_cpu_notifier
479}; 467};
480#endif 468
469static int nmi_setup(void)
470{
471 int err = 0;
472 int cpu;
473
474 if (!allocate_msrs())
475 return -ENOMEM;
476
477 /* We need to serialize save and setup for HT because the subset
478 * of msrs are distinct for save and setup operations
479 */
480
481 /* Assume saved/restored counters are the same on all CPUs */
482 err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
483 if (err)
484 goto fail;
485
486 for_each_possible_cpu(cpu) {
487 if (!cpu)
488 continue;
489
490 memcpy(per_cpu(cpu_msrs, cpu).counters,
491 per_cpu(cpu_msrs, 0).counters,
492 sizeof(struct op_msr) * model->num_counters);
493
494 memcpy(per_cpu(cpu_msrs, cpu).controls,
495 per_cpu(cpu_msrs, 0).controls,
496 sizeof(struct op_msr) * model->num_controls);
497
498 mux_clone(cpu);
499 }
500
501 nmi_enabled = 0;
502 ctr_running = 0;
503 barrier();
504 err = register_die_notifier(&profile_exceptions_nb);
505 if (err)
506 goto fail;
507
508 get_online_cpus();
509 register_cpu_notifier(&oprofile_cpu_nb);
510 on_each_cpu(nmi_cpu_setup, NULL, 1);
511 nmi_enabled = 1;
512 put_online_cpus();
513
514 return 0;
515fail:
516 free_msrs();
517 return err;
518}
519
520static void nmi_shutdown(void)
521{
522 struct op_msrs *msrs;
523
524 get_online_cpus();
525 unregister_cpu_notifier(&oprofile_cpu_nb);
526 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
527 nmi_enabled = 0;
528 ctr_running = 0;
529 put_online_cpus();
530 barrier();
531 unregister_die_notifier(&profile_exceptions_nb);
532 msrs = &get_cpu_var(cpu_msrs);
533 model->shutdown(msrs);
534 free_msrs();
535 put_cpu_var(cpu_msrs);
536}
481 537
482#ifdef CONFIG_PM 538#ifdef CONFIG_PM
483 539
@@ -687,9 +743,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
687 return -ENODEV; 743 return -ENODEV;
688 } 744 }
689 745
690#ifdef CONFIG_SMP
691 register_cpu_notifier(&oprofile_cpu_nb);
692#endif
693 /* default values, can be overwritten by model */ 746 /* default values, can be overwritten by model */
694 ops->create_files = nmi_create_files; 747 ops->create_files = nmi_create_files;
695 ops->setup = nmi_setup; 748 ops->setup = nmi_setup;
@@ -716,12 +769,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
716 769
717void op_nmi_exit(void) 770void op_nmi_exit(void)
718{ 771{
719 if (using_nmi) { 772 if (using_nmi)
720 exit_sysfs(); 773 exit_sysfs();
721#ifdef CONFIG_SMP
722 unregister_cpu_notifier(&oprofile_cpu_nb);
723#endif
724 }
725 if (model->exit)
726 model->exit();
727} 774}
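
The reworked nmi_int.c keeps two pieces of global state, nmi_enabled (die notifier registered and per-CPU MSRs programmed) and ctr_running (counters currently ticking), and only changes them under get_online_cpus()/put_online_cpus(); a CPU coming online replays setup and start according to those flags, and a CPU going down does the reverse, which is what nmi_cpu_up()/nmi_cpu_down() implement. A toy user-space model of that state machine (every function here is a stand-in, not the oprofile code):

#include <stdio.h>

static int nmi_enabled;    /* profiling set up on the CPUs    */
static int ctr_running;    /* counters are currently counting */

static void cpu_setup(int cpu)    { printf("cpu%d: program counter MSRs\n", cpu); }
static void cpu_start(int cpu)    { printf("cpu%d: start counters\n", cpu); }
static void cpu_stop(int cpu)     { printf("cpu%d: stop counters\n", cpu); }
static void cpu_shutdown(int cpu) { printf("cpu%d: restore saved MSRs\n", cpu); }

static void cpu_up(int cpu)       /* CPU_ONLINE / CPU_DOWN_FAILED */
{
        if (nmi_enabled)
                cpu_setup(cpu);
        if (ctr_running)
                cpu_start(cpu);
}

static void cpu_down(int cpu)     /* CPU_DOWN_PREPARE */
{
        if (ctr_running)
                cpu_stop(cpu);
        if (nmi_enabled)
                cpu_shutdown(cpu);
}

int main(void)
{
        nmi_enabled = 1;
        ctr_running = 1;
        cpu_up(2);
        cpu_down(2);
        return 0;
}
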
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 090cbbec7dbd..b67a6b5aa8d4 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -30,13 +30,10 @@
30#include "op_counter.h" 30#include "op_counter.h"
31 31
32#define NUM_COUNTERS 4 32#define NUM_COUNTERS 4
33#define NUM_CONTROLS 4
34#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 33#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
35#define NUM_VIRT_COUNTERS 32 34#define NUM_VIRT_COUNTERS 32
36#define NUM_VIRT_CONTROLS 32
37#else 35#else
38#define NUM_VIRT_COUNTERS NUM_COUNTERS 36#define NUM_VIRT_COUNTERS NUM_COUNTERS
39#define NUM_VIRT_CONTROLS NUM_CONTROLS
40#endif 37#endif
41 38
42#define OP_EVENT_MASK 0x0FFF 39#define OP_EVENT_MASK 0x0FFF
@@ -105,102 +102,6 @@ static u32 get_ibs_caps(void)
105 return ibs_caps; 102 return ibs_caps;
106} 103}
107 104
108#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
109
110static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
111 struct op_msrs const * const msrs)
112{
113 u64 val;
114 int i;
115
116 /* enable active counters */
117 for (i = 0; i < NUM_COUNTERS; ++i) {
118 int virt = op_x86_phys_to_virt(i);
119 if (!reset_value[virt])
120 continue;
121 rdmsrl(msrs->controls[i].addr, val);
122 val &= model->reserved;
123 val |= op_x86_get_ctrl(model, &counter_config[virt]);
124 wrmsrl(msrs->controls[i].addr, val);
125 }
126}
127
128#endif
129
130/* functions for op_amd_spec */
131
132static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
133{
134 int i;
135
136 for (i = 0; i < NUM_COUNTERS; i++) {
137 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
138 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
139 }
140
141 for (i = 0; i < NUM_CONTROLS; i++) {
142 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
143 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
144 }
145}
146
147static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
148 struct op_msrs const * const msrs)
149{
150 u64 val;
151 int i;
152
153 /* setup reset_value */
154 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
155 if (counter_config[i].enabled
156 && msrs->counters[op_x86_virt_to_phys(i)].addr)
157 reset_value[i] = counter_config[i].count;
158 else
159 reset_value[i] = 0;
160 }
161
162 /* clear all counters */
163 for (i = 0; i < NUM_CONTROLS; ++i) {
164 if (unlikely(!msrs->controls[i].addr)) {
165 if (counter_config[i].enabled && !smp_processor_id())
166 /*
167 * counter is reserved, this is on all
168 * cpus, so report only for cpu #0
169 */
170 op_x86_warn_reserved(i);
171 continue;
172 }
173 rdmsrl(msrs->controls[i].addr, val);
174 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
175 op_x86_warn_in_use(i);
176 val &= model->reserved;
177 wrmsrl(msrs->controls[i].addr, val);
178 }
179
180 /* avoid a false detection of ctr overflows in NMI handler */
181 for (i = 0; i < NUM_COUNTERS; ++i) {
182 if (unlikely(!msrs->counters[i].addr))
183 continue;
184 wrmsrl(msrs->counters[i].addr, -1LL);
185 }
186
187 /* enable active counters */
188 for (i = 0; i < NUM_COUNTERS; ++i) {
189 int virt = op_x86_phys_to_virt(i);
190 if (!reset_value[virt])
191 continue;
192
193 /* setup counter registers */
194 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
195
196 /* setup control registers */
197 rdmsrl(msrs->controls[i].addr, val);
198 val &= model->reserved;
199 val |= op_x86_get_ctrl(model, &counter_config[virt]);
200 wrmsrl(msrs->controls[i].addr, val);
201 }
202}
203
204/* 105/*
205 * 16-bit Linear Feedback Shift Register (LFSR) 106 * 16-bit Linear Feedback Shift Register (LFSR)
206 * 107 *
@@ -365,6 +266,125 @@ static void op_amd_stop_ibs(void)
365 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 266 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
366} 267}
367 268
269#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
270
271static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
272 struct op_msrs const * const msrs)
273{
274 u64 val;
275 int i;
276
277 /* enable active counters */
278 for (i = 0; i < NUM_COUNTERS; ++i) {
279 int virt = op_x86_phys_to_virt(i);
280 if (!reset_value[virt])
281 continue;
282 rdmsrl(msrs->controls[i].addr, val);
283 val &= model->reserved;
284 val |= op_x86_get_ctrl(model, &counter_config[virt]);
285 wrmsrl(msrs->controls[i].addr, val);
286 }
287}
288
289#endif
290
291/* functions for op_amd_spec */
292
293static void op_amd_shutdown(struct op_msrs const * const msrs)
294{
295 int i;
296
297 for (i = 0; i < NUM_COUNTERS; ++i) {
298 if (!msrs->counters[i].addr)
299 continue;
300 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
301 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
302 }
303}
304
305static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
306{
307 int i;
308
309 for (i = 0; i < NUM_COUNTERS; i++) {
310 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
311 goto fail;
312 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
313 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
314 goto fail;
315 }
316 /* both registers must be reserved */
317 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
318 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
319 continue;
320 fail:
321 if (!counter_config[i].enabled)
322 continue;
323 op_x86_warn_reserved(i);
324 op_amd_shutdown(msrs);
325 return -EBUSY;
326 }
327
328 return 0;
329}
330
331static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
332 struct op_msrs const * const msrs)
333{
334 u64 val;
335 int i;
336
337 /* setup reset_value */
338 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
339 if (counter_config[i].enabled
340 && msrs->counters[op_x86_virt_to_phys(i)].addr)
341 reset_value[i] = counter_config[i].count;
342 else
343 reset_value[i] = 0;
344 }
345
346 /* clear all counters */
347 for (i = 0; i < NUM_COUNTERS; ++i) {
348 if (!msrs->controls[i].addr)
349 continue;
350 rdmsrl(msrs->controls[i].addr, val);
351 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
352 op_x86_warn_in_use(i);
353 val &= model->reserved;
354 wrmsrl(msrs->controls[i].addr, val);
355 /*
356 * avoid a false detection of ctr overflows in NMI
357 * handler
358 */
359 wrmsrl(msrs->counters[i].addr, -1LL);
360 }
361
362 /* enable active counters */
363 for (i = 0; i < NUM_COUNTERS; ++i) {
364 int virt = op_x86_phys_to_virt(i);
365 if (!reset_value[virt])
366 continue;
367
368 /* setup counter registers */
369 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
370
371 /* setup control registers */
372 rdmsrl(msrs->controls[i].addr, val);
373 val &= model->reserved;
374 val |= op_x86_get_ctrl(model, &counter_config[virt]);
375 wrmsrl(msrs->controls[i].addr, val);
376 }
377
378 if (ibs_caps)
379 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
380}
381
382static void op_amd_cpu_shutdown(void)
383{
384 if (ibs_caps)
385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
386}
387
368static int op_amd_check_ctrs(struct pt_regs * const regs, 388static int op_amd_check_ctrs(struct pt_regs * const regs,
369 struct op_msrs const * const msrs) 389 struct op_msrs const * const msrs)
370{ 390{
@@ -425,42 +445,16 @@ static void op_amd_stop(struct op_msrs const * const msrs)
425 op_amd_stop_ibs(); 445 op_amd_stop_ibs();
426} 446}
427 447
428static void op_amd_shutdown(struct op_msrs const * const msrs) 448static int __init_ibs_nmi(void)
429{
430 int i;
431
432 for (i = 0; i < NUM_COUNTERS; ++i) {
433 if (msrs->counters[i].addr)
434 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
435 }
436 for (i = 0; i < NUM_CONTROLS; ++i) {
437 if (msrs->controls[i].addr)
438 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
439 }
440}
441
442static u8 ibs_eilvt_off;
443
444static inline void apic_init_ibs_nmi_per_cpu(void *arg)
445{
446 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
447}
448
449static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
450{
451 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
452}
453
454static int init_ibs_nmi(void)
455{ 449{
456#define IBSCTL_LVTOFFSETVAL (1 << 8) 450#define IBSCTL_LVTOFFSETVAL (1 << 8)
457#define IBSCTL 0x1cc 451#define IBSCTL 0x1cc
458 struct pci_dev *cpu_cfg; 452 struct pci_dev *cpu_cfg;
459 int nodes; 453 int nodes;
460 u32 value = 0; 454 u32 value = 0;
455 u8 ibs_eilvt_off;
461 456
462 /* per CPU setup */ 457 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
463 on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
464 458
465 nodes = 0; 459 nodes = 0;
466 cpu_cfg = NULL; 460 cpu_cfg = NULL;
@@ -490,22 +484,15 @@ static int init_ibs_nmi(void)
490 return 0; 484 return 0;
491} 485}
492 486
493/* uninitialize the APIC for the IBS interrupts if needed */
494static void clear_ibs_nmi(void)
495{
496 if (ibs_caps)
497 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
498}
499
500/* initialize the APIC for the IBS interrupts if available */ 487/* initialize the APIC for the IBS interrupts if available */
501static void ibs_init(void) 488static void init_ibs(void)
502{ 489{
503 ibs_caps = get_ibs_caps(); 490 ibs_caps = get_ibs_caps();
504 491
505 if (!ibs_caps) 492 if (!ibs_caps)
506 return; 493 return;
507 494
508 if (init_ibs_nmi()) { 495 if (__init_ibs_nmi()) {
509 ibs_caps = 0; 496 ibs_caps = 0;
510 return; 497 return;
511 } 498 }
@@ -514,14 +501,6 @@ static void ibs_init(void)
514 (unsigned)ibs_caps); 501 (unsigned)ibs_caps);
515} 502}
516 503
517static void ibs_exit(void)
518{
519 if (!ibs_caps)
520 return;
521
522 clear_ibs_nmi();
523}
524
525static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 504static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
526 505
527static int setup_ibs_files(struct super_block *sb, struct dentry *root) 506static int setup_ibs_files(struct super_block *sb, struct dentry *root)
@@ -570,27 +549,22 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
570 549
571static int op_amd_init(struct oprofile_operations *ops) 550static int op_amd_init(struct oprofile_operations *ops)
572{ 551{
573 ibs_init(); 552 init_ibs();
574 create_arch_files = ops->create_files; 553 create_arch_files = ops->create_files;
575 ops->create_files = setup_ibs_files; 554 ops->create_files = setup_ibs_files;
576 return 0; 555 return 0;
577} 556}
578 557
579static void op_amd_exit(void)
580{
581 ibs_exit();
582}
583
584struct op_x86_model_spec op_amd_spec = { 558struct op_x86_model_spec op_amd_spec = {
585 .num_counters = NUM_COUNTERS, 559 .num_counters = NUM_COUNTERS,
586 .num_controls = NUM_CONTROLS, 560 .num_controls = NUM_COUNTERS,
587 .num_virt_counters = NUM_VIRT_COUNTERS, 561 .num_virt_counters = NUM_VIRT_COUNTERS,
588 .reserved = MSR_AMD_EVENTSEL_RESERVED, 562 .reserved = MSR_AMD_EVENTSEL_RESERVED,
589 .event_mask = OP_EVENT_MASK, 563 .event_mask = OP_EVENT_MASK,
590 .init = op_amd_init, 564 .init = op_amd_init,
591 .exit = op_amd_exit,
592 .fill_in_addresses = &op_amd_fill_in_addresses, 565 .fill_in_addresses = &op_amd_fill_in_addresses,
593 .setup_ctrs = &op_amd_setup_ctrs, 566 .setup_ctrs = &op_amd_setup_ctrs,
567 .cpu_down = &op_amd_cpu_shutdown,
594 .check_ctrs = &op_amd_check_ctrs, 568 .check_ctrs = &op_amd_check_ctrs,
595 .start = &op_amd_start, 569 .start = &op_amd_start,
596 .stop = &op_amd_stop, 570 .stop = &op_amd_stop,
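
op_amd_fill_in_addresses() now treats a counter as usable only when both its perfctr and its evntsel MSR could be reserved, releases the half-done reservation when the second one fails, and returns -EBUSY so nmi_setup() can bail out instead of silently profiling with missing counters (the kernel version additionally releases everything reserved so far and only errors out when the affected counter was actually enabled). A reduced sketch of that reserve-both-or-roll-back pattern, with hypothetical stand-ins for reserve_perfctr_nmi()/reserve_evntsel_nmi():

#include <stdbool.h>
#include <stdio.h>

#define NUM_COUNTERS 4

static bool reserve_perfctr(int i) { return true; }
static bool reserve_evntsel(int i) { return i != 2; }   /* pretend counter 2 is taken */
static void release_perfctr(int i) { printf("counter %d: releasing perfctr again\n", i); }

static int usable[NUM_COUNTERS];

static int fill_in_addresses(void)
{
        int i;

        for (i = 0; i < NUM_COUNTERS; i++) {
                if (!reserve_perfctr(i))
                        goto fail;
                if (!reserve_evntsel(i)) {
                        release_perfctr(i);   /* roll back the half-done reservation */
                        goto fail;
                }
                usable[i] = 1;                /* both MSRs reserved: counter is usable */
                continue;
        fail:
                printf("counter %d unavailable\n", i);
                return -1;                    /* the kernel returns -EBUSY here */
        }
        return 0;
}

int main(void)
{
        printf("fill_in_addresses() = %d\n", fill_in_addresses());
        return 0;
}
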
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index e6a160a4684a..182558dd5515 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -385,8 +385,26 @@ static unsigned int get_stagger(void)
385 385
386static unsigned long reset_value[NUM_COUNTERS_NON_HT]; 386static unsigned long reset_value[NUM_COUNTERS_NON_HT];
387 387
388static void p4_shutdown(struct op_msrs const * const msrs)
389{
390 int i;
388 391
389static void p4_fill_in_addresses(struct op_msrs * const msrs) 392 for (i = 0; i < num_counters; ++i) {
393 if (msrs->counters[i].addr)
394 release_perfctr_nmi(msrs->counters[i].addr);
395 }
396 /*
397 * some of the control registers are specially reserved in
398 * conjunction with the counter registers (hence the starting offset).
399 * This saves a few bits.
400 */
401 for (i = num_counters; i < num_controls; ++i) {
402 if (msrs->controls[i].addr)
403 release_evntsel_nmi(msrs->controls[i].addr);
404 }
405}
406
407static int p4_fill_in_addresses(struct op_msrs * const msrs)
390{ 408{
391 unsigned int i; 409 unsigned int i;
392 unsigned int addr, cccraddr, stag; 410 unsigned int addr, cccraddr, stag;
@@ -468,6 +486,18 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; 486 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
469 } 487 }
470 } 488 }
489
490 for (i = 0; i < num_counters; ++i) {
491 if (!counter_config[i].enabled)
492 continue;
493 if (msrs->controls[i].addr)
494 continue;
495 op_x86_warn_reserved(i);
496 p4_shutdown(msrs);
497 return -EBUSY;
498 }
499
500 return 0;
471} 501}
472 502
473 503
@@ -668,26 +698,6 @@ static void p4_stop(struct op_msrs const * const msrs)
668 } 698 }
669} 699}
670 700
671static void p4_shutdown(struct op_msrs const * const msrs)
672{
673 int i;
674
675 for (i = 0; i < num_counters; ++i) {
676 if (msrs->counters[i].addr)
677 release_perfctr_nmi(msrs->counters[i].addr);
678 }
679 /*
680 * some of the control registers are specially reserved in
681 * conjunction with the counter registers (hence the starting offset).
682 * This saves a few bits.
683 */
684 for (i = num_counters; i < num_controls; ++i) {
685 if (msrs->controls[i].addr)
686 release_evntsel_nmi(msrs->controls[i].addr);
687 }
688}
689
690
691#ifdef CONFIG_SMP 701#ifdef CONFIG_SMP
692struct op_x86_model_spec op_p4_ht2_spec = { 702struct op_x86_model_spec op_p4_ht2_spec = {
693 .num_counters = NUM_COUNTERS_HT2, 703 .num_counters = NUM_COUNTERS_HT2,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b5..d769cda54082 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -30,19 +30,46 @@ static int counter_width = 32;
30 30
31static u64 *reset_value; 31static u64 *reset_value;
32 32
33static void ppro_fill_in_addresses(struct op_msrs * const msrs) 33static void ppro_shutdown(struct op_msrs const * const msrs)
34{ 34{
35 int i; 35 int i;
36 36
37 for (i = 0; i < num_counters; i++) { 37 for (i = 0; i < num_counters; ++i) {
38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 38 if (!msrs->counters[i].addr)
39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 39 continue;
40 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
41 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
42 }
43 if (reset_value) {
44 kfree(reset_value);
45 reset_value = NULL;
40 } 46 }
47}
48
49static int ppro_fill_in_addresses(struct op_msrs * const msrs)
50{
51 int i;
41 52
42 for (i = 0; i < num_counters; i++) { 53 for (i = 0; i < num_counters; i++) {
43 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 54 if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
44 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 55 goto fail;
56 if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
57 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
58 goto fail;
59 }
60 /* both registers must be reserved */
61 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
62 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
63 continue;
64 fail:
65 if (!counter_config[i].enabled)
66 continue;
67 op_x86_warn_reserved(i);
68 ppro_shutdown(msrs);
69 return -EBUSY;
45 } 70 }
71
72 return 0;
46} 73}
47 74
48 75
@@ -78,26 +105,17 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
78 105
79 /* clear all counters */ 106 /* clear all counters */
80 for (i = 0; i < num_counters; ++i) { 107 for (i = 0; i < num_counters; ++i) {
81 if (unlikely(!msrs->controls[i].addr)) { 108 if (!msrs->controls[i].addr)
82 if (counter_config[i].enabled && !smp_processor_id())
83 /*
84 * counter is reserved, this is on all
85 * cpus, so report only for cpu #0
86 */
87 op_x86_warn_reserved(i);
88 continue; 109 continue;
89 }
90 rdmsrl(msrs->controls[i].addr, val); 110 rdmsrl(msrs->controls[i].addr, val);
91 if (val & ARCH_PERFMON_EVENTSEL_ENABLE) 111 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
92 op_x86_warn_in_use(i); 112 op_x86_warn_in_use(i);
93 val &= model->reserved; 113 val &= model->reserved;
94 wrmsrl(msrs->controls[i].addr, val); 114 wrmsrl(msrs->controls[i].addr, val);
95 } 115 /*
96 116 * avoid a false detection of ctr overflows in NMI *
97 /* avoid a false detection of ctr overflows in NMI handler */ 117 * handler
98 for (i = 0; i < num_counters; ++i) { 118 */
99 if (unlikely(!msrs->counters[i].addr))
100 continue;
101 wrmsrl(msrs->counters[i].addr, -1LL); 119 wrmsrl(msrs->counters[i].addr, -1LL);
102 } 120 }
103 121
@@ -189,25 +207,6 @@ static void ppro_stop(struct op_msrs const * const msrs)
189 } 207 }
190} 208}
191 209
192static void ppro_shutdown(struct op_msrs const * const msrs)
193{
194 int i;
195
196 for (i = 0; i < num_counters; ++i) {
197 if (msrs->counters[i].addr)
198 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
199 }
200 for (i = 0; i < num_counters; ++i) {
201 if (msrs->controls[i].addr)
202 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
203 }
204 if (reset_value) {
205 kfree(reset_value);
206 reset_value = NULL;
207 }
208}
209
210
211struct op_x86_model_spec op_ppro_spec = { 210struct op_x86_model_spec op_ppro_spec = {
212 .num_counters = 2, 211 .num_counters = 2,
213 .num_controls = 2, 212 .num_controls = 2,
@@ -239,11 +238,11 @@ static void arch_perfmon_setup_counters(void)
239 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && 238 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
240 current_cpu_data.x86_model == 15) { 239 current_cpu_data.x86_model == 15) {
241 eax.split.version_id = 2; 240 eax.split.version_id = 2;
242 eax.split.num_events = 2; 241 eax.split.num_counters = 2;
243 eax.split.bit_width = 40; 242 eax.split.bit_width = 40;
244 } 243 }
245 244
246 num_counters = eax.split.num_events; 245 num_counters = eax.split.num_counters;
247 246
248 op_arch_perfmon_spec.num_counters = num_counters; 247 op_arch_perfmon_spec.num_counters = num_counters;
249 op_arch_perfmon_spec.num_controls = num_counters; 248 op_arch_perfmon_spec.num_controls = num_counters;
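
The arch-perfmon path derives num_counters from CPUID leaf 0xA, where EAX[7:0] is the architectural perfmon version, EAX[15:8] the number of general-purpose counters and EAX[23:16] their bit width; the rename from num_events to num_counters just makes the union field match what those bits mean. A user-space peek at the same leaf, using GCC's <cpuid.h> rather than the kernel helpers:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
                printf("CPUID leaf 0xA not available\n");
                return 1;
        }
        printf("version id   : %u\n", eax & 0xff);          /* bits 7:0   */
        printf("num counters : %u\n", (eax >> 8) & 0xff);   /* bits 15:8  */
        printf("counter width: %u\n", (eax >> 16) & 0xff);  /* bits 23:16 */
        return 0;
}
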
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index ff82a755edd4..89017fa1fd63 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -40,10 +40,10 @@ struct op_x86_model_spec {
40 u64 reserved; 40 u64 reserved;
41 u16 event_mask; 41 u16 event_mask;
42 int (*init)(struct oprofile_operations *ops); 42 int (*init)(struct oprofile_operations *ops);
43 void (*exit)(void); 43 int (*fill_in_addresses)(struct op_msrs * const msrs);
44 void (*fill_in_addresses)(struct op_msrs * const msrs);
45 void (*setup_ctrs)(struct op_x86_model_spec const *model, 44 void (*setup_ctrs)(struct op_x86_model_spec const *model,
46 struct op_msrs const * const msrs); 45 struct op_msrs const * const msrs);
46 void (*cpu_down)(void);
47 int (*check_ctrs)(struct pt_regs * const regs, 47 int (*check_ctrs)(struct pt_regs * const regs,
48 struct op_msrs const * const msrs); 48 struct op_msrs const * const msrs);
49 void (*start)(struct op_msrs const * const msrs); 49 void (*start)(struct op_msrs const * const msrs);
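
The op_x86_model_spec changes follow the usual optional-callback convention: fill_in_addresses() can now fail (it returns 0 or a negative errno), and cpu_down is simply skipped when a model leaves it NULL, which is exactly how nmi_cpu_shutdown() guards its call. A reduced stand-alone version of that pattern (all names below are illustrative):

#include <stdio.h>

struct model_spec {
        int  (*fill_in_addresses)(void);   /* may fail: returns 0 or a negative errno */
        void (*cpu_down)(void);            /* optional, may be NULL */
};

static int  amd_fill(void) { puts("reserve AMD counter MSRs"); return 0; }
static void amd_down(void) { puts("switch IBS LVT entry back to fixed"); }

static const struct model_spec amd  = { .fill_in_addresses = amd_fill, .cpu_down = amd_down };
static const struct model_spec ppro = { .fill_in_addresses = amd_fill };  /* no cpu_down hook */

static void offline_cpu(const struct model_spec *m)
{
        if (m->cpu_down)        /* guard exactly as nmi_cpu_shutdown() does */
                m->cpu_down();
}

int main(void)
{
        offline_cpu(&amd);
        offline_cpu(&ppro);
        return 0;
}
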
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index d5c7aefe56ff..7ef3a2735df3 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -247,6 +247,10 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
247 u32 size; 247 u32 size;
248 int i; 248 int i;
249 249
250 /* Must have extended configuration space */
251 if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
252 return;
253
250 /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ 254 /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
251 offset = fixed_bar_cap(dev->bus, dev->devfn); 255 offset = fixed_bar_cap(dev->bus, dev->devfn);
252 if (!offset || PCI_DEVFN(2, 0) == dev->devfn || 256 if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||