aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig73
-rw-r--r--arch/x86/Kconfig.cpu24
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S115
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c130
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/alternative.h20
-rw-r--r--arch/x86/include/asm/apic.h13
-rw-r--r--arch/x86/include/asm/arch_hweight.h61
-rw-r--r--arch/x86/include/asm/atomic.h25
-rw-r--r--arch/x86/include/asm/atomic64_32.h278
-rw-r--r--arch/x86/include/asm/atomic64_64.h25
-rw-r--r--arch/x86/include/asm/bitops.h4
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/cacheflush.h46
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h3
-rw-r--r--arch/x86/include/asm/cpufeature.h65
-rw-r--r--arch/x86/include/asm/ds.h302
-rw-r--r--arch/x86/include/asm/dwarf2.h12
-rw-r--r--arch/x86/include/asm/e820.h7
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hpet.h1
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h10
-rw-r--r--arch/x86/include/asm/hyperv.h11
-rw-r--r--arch/x86/include/asm/hypervisor.h27
-rw-r--r--arch/x86/include/asm/i387.h129
-rw-r--r--arch/x86/include/asm/i8253.h2
-rw-r--r--arch/x86/include/asm/insn.h2
-rw-r--r--arch/x86/include/asm/inst.h96
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h55
-rw-r--r--arch/x86/include/asm/io_apic.h13
-rw-r--r--arch/x86/include/asm/k8.h5
-rw-r--r--arch/x86/include/asm/kgdb.h3
-rw-r--r--arch/x86/include/asm/kprobes.h2
-rw-r--r--arch/x86/include/asm/kvm.h17
-rw-r--r--arch/x86/include/asm/kvm_emulate.h46
-rw-r--r--arch/x86/include/asm/kvm_host.h80
-rw-r--r--arch/x86/include/asm/kvm_para.h13
-rw-r--r--arch/x86/include/asm/mce.h8
-rw-r--r--arch/x86/include/asm/mpspec.h10
-rw-r--r--arch/x86/include/asm/mshyperv.h14
-rw-r--r--arch/x86/include/asm/msr-index.h22
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/percpu.h26
-rw-r--r--arch/x86/include/asm/perf_event.h76
-rw-r--r--arch/x86/include/asm/perf_event_p4.h795
-rw-r--r--arch/x86/include/asm/processor.h47
-rw-r--r--arch/x86/include/asm/ptrace-abi.h57
-rw-r--r--arch/x86/include/asm/ptrace.h6
-rw-r--r--arch/x86/include/asm/pvclock-abi.h4
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/rdc321x_defs.h12
-rw-r--r--arch/x86/include/asm/scatterlist.h5
-rw-r--r--arch/x86/include/asm/svm.h9
-rw-r--r--arch/x86/include/asm/thread_info.h15
-rw-r--r--arch/x86/include/asm/topology.h26
-rw-r--r--arch/x86/include/asm/traps.h2
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h247
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h2
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h528
-rw-r--r--arch/x86/include/asm/vmware.h27
-rw-r--r--arch/x86/include/asm/vmx.h12
-rw-r--r--arch/x86/include/asm/xsave.h7
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c156
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/alternative.c47
-rw-r--r--arch/x86/kernel/apic/apic.c41
-rw-r--r--arch/x86/kernel/apic/es7000_32.c19
-rw-r--r--arch/x86/kernel/apic/io_apic.c99
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c3
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c14
-rw-r--r--arch/x86/kernel/cpu/bugs.c2
-rw-r--r--arch/x86/kernel/cpu/common.c40
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c169
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h2
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c52
-rw-r--r--arch/x86/kernel/cpu/intel.c8
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c181
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c138
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h23
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c83
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c55
-rw-r--r--arch/x86/kernel/cpu/perf_event.c821
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c46
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c357
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c641
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c218
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c858
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c31
-rw-r--r--arch/x86/kernel/cpu/vmware.c38
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/ds.c1437
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/early_printk.c8
-rw-r--r--arch/x86/kernel/entry_32.S19
-rw-r--r--arch/x86/kernel/hpet.c29
-rw-r--r--arch/x86/kernel/hw_breakpoint.c41
-rw-r--r--arch/x86/kernel/i387.c107
-rw-r--r--arch/x86/kernel/i8253.c14
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kgdb.c103
-rw-r--r--arch/x86/kernel/kprobes.c43
-rw-r--r--arch/x86/kernel/kvmclock.c56
-rw-r--r--arch/x86/kernel/microcode_core.c5
-rw-r--r--arch/x86/kernel/microcode_intel.c22
-rw-r--r--arch/x86/kernel/mpparse.c25
-rw-r--r--arch/x86/kernel/mrst.c5
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c50
-rw-r--r--arch/x86/kernel/process_32.c10
-rw-r--r--arch/x86/kernel/process_64.c10
-rw-r--r--arch/x86/kernel/ptrace.c384
-rw-r--r--arch/x86/kernel/pvclock.c37
-rw-r--r--arch/x86/kernel/quirks.c8
-rw-r--r--arch/x86/kernel/setup.c12
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/sfi.c4
-rw-r--r--arch/x86/kernel/smpboot.c26
-rw-r--r--arch/x86/kernel/step.c46
-rw-r--r--arch/x86/kernel/tboot.c21
-rw-r--r--arch/x86/kernel/tlb_uv.c1280
-rw-r--r--arch/x86/kernel/traps.c197
-rw-r--r--arch/x86/kernel/uv_irq.c12
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/xsave.c8
-rw-r--r--arch/x86/kvm/emulate.c1247
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_timer.h4
-rw-r--r--arch/x86/kvm/mmu.c225
-rw-r--r--arch/x86/kvm/mmutrace.h84
-rw-r--r--arch/x86/kvm/paging_tmpl.h46
-rw-r--r--arch/x86/kvm/svm.c952
-rw-r--r--arch/x86/kvm/timer.c3
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c386
-rw-r--r--arch/x86/kvm/x86.c1653
-rw-r--r--arch/x86/kvm/x86.h10
-rw-r--r--arch/x86/lguest/boot.c1
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/atomic64_32.c273
-rw-r--r--arch/x86/lib/atomic64_386_32.S174
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S224
-rw-r--r--arch/x86/math-emu/fpu_aux.c6
-rw-r--r--arch/x86/math-emu/fpu_entry.c4
-rw-r--r--arch/x86/math-emu/fpu_system.h2
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/numa_64.c9
-rw-r--r--arch/x86/mm/pageattr.c53
-rw-r--r--arch/x86/mm/pat.c247
-rw-r--r--arch/x86/mm/pat_internal.h46
-rw-r--r--arch/x86/mm/pat_rbtree.c274
-rw-r--r--arch/x86/mm/pf_in.c2
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/mm/srat_64.c51
-rw-r--r--arch/x86/oprofile/nmi_int.c199
-rw-r--r--arch/x86/oprofile/op_model_amd.c280
-rw-r--r--arch/x86/oprofile/op_model_p4.c52
-rw-r--r--arch/x86/oprofile/op_model_ppro.c81
-rw-r--r--arch/x86/oprofile/op_x86_model.h4
-rw-r--r--arch/x86/pci/Makefile2
-rw-r--r--arch/x86/pci/acpi.c8
-rw-r--r--arch/x86/pci/broadcom_bus.c101
-rw-r--r--arch/x86/pci/common.c2
-rw-r--r--arch/x86/pci/direct.c16
-rw-r--r--arch/x86/pci/irq.c9
-rw-r--r--arch/x86/pci/mmconfig-shared.c17
-rw-r--r--arch/x86/pci/mmconfig_32.c8
-rw-r--r--arch/x86/pci/mrst.c6
-rw-r--r--arch/x86/pci/numaq_32.c8
-rw-r--r--arch/x86/pci/pcbios.c8
-rw-r--r--arch/x86/xen/time.c6
186 files changed, 11259 insertions, 8175 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685902b..dcb0593b4a6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,11 +53,15 @@ config X86
53 select HAVE_KERNEL_LZMA 53 select HAVE_KERNEL_LZMA
54 select HAVE_KERNEL_LZO 54 select HAVE_KERNEL_LZO
55 select HAVE_HW_BREAKPOINT 55 select HAVE_HW_BREAKPOINT
56 select HAVE_MIXED_BREAKPOINTS_REGS
56 select PERF_EVENTS 57 select PERF_EVENTS
57 select ANON_INODES 58 select ANON_INODES
58 select HAVE_ARCH_KMEMCHECK 59 select HAVE_ARCH_KMEMCHECK
59 select HAVE_USER_RETURN_NOTIFIER 60 select HAVE_USER_RETURN_NOTIFIER
60 61
62config INSTRUCTION_DECODER
63 def_bool (KPROBES || PERF_EVENTS)
64
61config OUTPUT_FORMAT 65config OUTPUT_FORMAT
62 string 66 string
63 default "elf32-i386" if X86_32 67 default "elf32-i386" if X86_32
@@ -105,6 +109,9 @@ config SBUS
105config NEED_DMA_MAP_STATE 109config NEED_DMA_MAP_STATE
106 def_bool (X86_64 || DMAR || DMA_API_DEBUG) 110 def_bool (X86_64 || DMAR || DMA_API_DEBUG)
107 111
112config NEED_SG_DMA_LENGTH
113 def_bool y
114
108config GENERIC_ISA_DMA 115config GENERIC_ISA_DMA
109 def_bool y 116 def_bool y
110 117
@@ -197,20 +204,17 @@ config HAVE_INTEL_TXT
197 204
198# Use the generic interrupt handling code in kernel/irq/: 205# Use the generic interrupt handling code in kernel/irq/:
199config GENERIC_HARDIRQS 206config GENERIC_HARDIRQS
200 bool 207 def_bool y
201 default y
202 208
203config GENERIC_HARDIRQS_NO__DO_IRQ 209config GENERIC_HARDIRQS_NO__DO_IRQ
204 def_bool y 210 def_bool y
205 211
206config GENERIC_IRQ_PROBE 212config GENERIC_IRQ_PROBE
207 bool 213 def_bool y
208 default y
209 214
210config GENERIC_PENDING_IRQ 215config GENERIC_PENDING_IRQ
211 bool 216 def_bool y
212 depends on GENERIC_HARDIRQS && SMP 217 depends on GENERIC_HARDIRQS && SMP
213 default y
214 218
215config USE_GENERIC_SMP_HELPERS 219config USE_GENERIC_SMP_HELPERS
216 def_bool y 220 def_bool y
@@ -225,19 +229,22 @@ config X86_64_SMP
225 depends on X86_64 && SMP 229 depends on X86_64 && SMP
226 230
227config X86_HT 231config X86_HT
228 bool 232 def_bool y
229 depends on SMP 233 depends on SMP
230 default y
231 234
232config X86_TRAMPOLINE 235config X86_TRAMPOLINE
233 bool 236 def_bool y
234 depends on SMP || (64BIT && ACPI_SLEEP) 237 depends on SMP || (64BIT && ACPI_SLEEP)
235 default y
236 238
237config X86_32_LAZY_GS 239config X86_32_LAZY_GS
238 def_bool y 240 def_bool y
239 depends on X86_32 && !CC_STACKPROTECTOR 241 depends on X86_32 && !CC_STACKPROTECTOR
240 242
243config ARCH_HWEIGHT_CFLAGS
244 string
245 default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
246 default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
247
241config KTIME_SCALAR 248config KTIME_SCALAR
242 def_bool X86_32 249 def_bool X86_32
243source "init/Kconfig" 250source "init/Kconfig"
@@ -447,7 +454,7 @@ config X86_NUMAQ
447 firmware with - send email to <Martin.Bligh@us.ibm.com>. 454 firmware with - send email to <Martin.Bligh@us.ibm.com>.
448 455
449config X86_SUPPORTS_MEMORY_FAILURE 456config X86_SUPPORTS_MEMORY_FAILURE
450 bool 457 def_bool y
451 # MCE code calls memory_failure(): 458 # MCE code calls memory_failure():
452 depends on X86_MCE 459 depends on X86_MCE
453 # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags: 460 # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
@@ -455,7 +462,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
455 # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH: 462 # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
456 depends on X86_64 || !SPARSEMEM 463 depends on X86_64 || !SPARSEMEM
457 select ARCH_SUPPORTS_MEMORY_FAILURE 464 select ARCH_SUPPORTS_MEMORY_FAILURE
458 default y
459 465
460config X86_VISWS 466config X86_VISWS
461 bool "SGI 320/540 (Visual Workstation)" 467 bool "SGI 320/540 (Visual Workstation)"
@@ -570,7 +576,6 @@ config PARAVIRT_SPINLOCKS
570 576
571config PARAVIRT_CLOCK 577config PARAVIRT_CLOCK
572 bool 578 bool
573 default n
574 579
575endif 580endif
576 581
@@ -749,7 +754,6 @@ config MAXSMP
749 bool "Configure Maximum number of SMP Processors and NUMA Nodes" 754 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
750 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL 755 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
751 select CPUMASK_OFFSTACK 756 select CPUMASK_OFFSTACK
752 default n
753 ---help--- 757 ---help---
754 Configure maximum number of CPUS and NUMA Nodes for this architecture. 758 Configure maximum number of CPUS and NUMA Nodes for this architecture.
755 If unsure, say N. 759 If unsure, say N.
@@ -829,7 +833,6 @@ config X86_VISWS_APIC
829 833
830config X86_REROUTE_FOR_BROKEN_BOOT_IRQS 834config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
831 bool "Reroute for broken boot IRQs" 835 bool "Reroute for broken boot IRQs"
832 default n
833 depends on X86_IO_APIC 836 depends on X86_IO_APIC
834 ---help--- 837 ---help---
835 This option enables a workaround that fixes a source of 838 This option enables a workaround that fixes a source of
@@ -876,9 +879,8 @@ config X86_MCE_AMD
876 the DRAM Error Threshold. 879 the DRAM Error Threshold.
877 880
878config X86_ANCIENT_MCE 881config X86_ANCIENT_MCE
879 def_bool n 882 bool "Support for old Pentium 5 / WinChip machine checks"
880 depends on X86_32 && X86_MCE 883 depends on X86_32 && X86_MCE
881 prompt "Support for old Pentium 5 / WinChip machine checks"
882 ---help--- 884 ---help---
883 Include support for machine check handling on old Pentium 5 or WinChip 885 Include support for machine check handling on old Pentium 5 or WinChip
884 systems. These typically need to be enabled explicitely on the command 886 systems. These typically need to be enabled explicitely on the command
@@ -886,8 +888,7 @@ config X86_ANCIENT_MCE
886 888
887config X86_MCE_THRESHOLD 889config X86_MCE_THRESHOLD
888 depends on X86_MCE_AMD || X86_MCE_INTEL 890 depends on X86_MCE_AMD || X86_MCE_INTEL
889 bool 891 def_bool y
890 default y
891 892
892config X86_MCE_INJECT 893config X86_MCE_INJECT
893 depends on X86_MCE 894 depends on X86_MCE
@@ -1026,8 +1027,8 @@ config X86_CPUID
1026 1027
1027choice 1028choice
1028 prompt "High Memory Support" 1029 prompt "High Memory Support"
1029 default HIGHMEM4G if !X86_NUMAQ
1030 default HIGHMEM64G if X86_NUMAQ 1030 default HIGHMEM64G if X86_NUMAQ
1031 default HIGHMEM4G
1031 depends on X86_32 1032 depends on X86_32
1032 1033
1033config NOHIGHMEM 1034config NOHIGHMEM
@@ -1285,7 +1286,7 @@ source "mm/Kconfig"
1285 1286
1286config HIGHPTE 1287config HIGHPTE
1287 bool "Allocate 3rd-level pagetables from highmem" 1288 bool "Allocate 3rd-level pagetables from highmem"
1288 depends on X86_32 && (HIGHMEM4G || HIGHMEM64G) 1289 depends on HIGHMEM
1289 ---help--- 1290 ---help---
1290 The VM uses one page table entry for each page of physical memory. 1291 The VM uses one page table entry for each page of physical memory.
1291 For systems with a lot of RAM, this can be wasteful of precious 1292 For systems with a lot of RAM, this can be wasteful of precious
@@ -1369,8 +1370,7 @@ config MATH_EMULATION
1369 kernel, it won't hurt. 1370 kernel, it won't hurt.
1370 1371
1371config MTRR 1372config MTRR
1372 bool 1373 def_bool y
1373 default y
1374 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED 1374 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
1375 ---help--- 1375 ---help---
1376 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1376 On Intel P6 family processors (Pentium Pro, Pentium II and later)
@@ -1436,8 +1436,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1436 mtrr_spare_reg_nr=N on the kernel command line. 1436 mtrr_spare_reg_nr=N on the kernel command line.
1437 1437
1438config X86_PAT 1438config X86_PAT
1439 bool 1439 def_bool y
1440 default y
1441 prompt "x86 PAT support" if EMBEDDED 1440 prompt "x86 PAT support" if EMBEDDED
1442 depends on MTRR 1441 depends on MTRR
1443 ---help--- 1442 ---help---
@@ -1605,8 +1604,7 @@ config X86_NEED_RELOCS
1605 depends on X86_32 && RELOCATABLE 1604 depends on X86_32 && RELOCATABLE
1606 1605
1607config PHYSICAL_ALIGN 1606config PHYSICAL_ALIGN
1608 hex 1607 hex "Alignment value to which kernel should be aligned" if X86_32
1609 prompt "Alignment value to which kernel should be aligned" if X86_32
1610 default "0x1000000" 1608 default "0x1000000"
1611 range 0x2000 0x1000000 1609 range 0x2000 0x1000000
1612 ---help--- 1610 ---help---
@@ -1653,7 +1651,6 @@ config COMPAT_VDSO
1653 1651
1654config CMDLINE_BOOL 1652config CMDLINE_BOOL
1655 bool "Built-in kernel command line" 1653 bool "Built-in kernel command line"
1656 default n
1657 ---help--- 1654 ---help---
1658 Allow for specifying boot arguments to the kernel at 1655 Allow for specifying boot arguments to the kernel at
1659 build time. On some systems (e.g. embedded ones), it is 1656 build time. On some systems (e.g. embedded ones), it is
@@ -1687,7 +1684,6 @@ config CMDLINE
1687 1684
1688config CMDLINE_OVERRIDE 1685config CMDLINE_OVERRIDE
1689 bool "Built-in command line overrides boot loader arguments" 1686 bool "Built-in command line overrides boot loader arguments"
1690 default n
1691 depends on CMDLINE_BOOL 1687 depends on CMDLINE_BOOL
1692 ---help--- 1688 ---help---
1693 Set this option to 'Y' to have the kernel ignore the boot loader 1689 Set this option to 'Y' to have the kernel ignore the boot loader
@@ -1710,6 +1706,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
1710 def_bool X86_64 1706 def_bool X86_64
1711 depends on NUMA 1707 depends on NUMA
1712 1708
1709config USE_PERCPU_NUMA_NODE_ID
1710 def_bool X86_64
1711 depends on NUMA
1712
1713menu "Power management and ACPI options" 1713menu "Power management and ACPI options"
1714 1714
1715config ARCH_HIBERNATION_HEADER 1715config ARCH_HIBERNATION_HEADER
@@ -1723,8 +1723,7 @@ source "drivers/acpi/Kconfig"
1723source "drivers/sfi/Kconfig" 1723source "drivers/sfi/Kconfig"
1724 1724
1725config X86_APM_BOOT 1725config X86_APM_BOOT
1726 bool 1726 def_bool y
1727 default y
1728 depends on APM || APM_MODULE 1727 depends on APM || APM_MODULE
1729 1728
1730menuconfig APM 1729menuconfig APM
@@ -1931,6 +1930,14 @@ config PCI_MMCONFIG
1931 bool "Support mmconfig PCI config space access" 1930 bool "Support mmconfig PCI config space access"
1932 depends on X86_64 && PCI && ACPI 1931 depends on X86_64 && PCI && ACPI
1933 1932
1933config PCI_CNB20LE_QUIRK
1934 bool "Read CNB20LE Host Bridge Windows"
1935 depends on PCI
1936 help
1937 Read the PCI windows out of the CNB20LE host bridge. This allows
1938 PCI hotplug to work on systems with the CNB20LE chipset which do
1939 not have ACPI.
1940
1934config DMAR 1941config DMAR
1935 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1942 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1936 depends on PCI_MSI && ACPI && EXPERIMENTAL 1943 depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -1953,8 +1960,7 @@ config DMAR_DEFAULT_ON
1953 experimental. 1960 experimental.
1954 1961
1955config DMAR_BROKEN_GFX_WA 1962config DMAR_BROKEN_GFX_WA
1956 def_bool n 1963 bool "Workaround broken graphics drivers (going away soon)"
1957 prompt "Workaround broken graphics drivers (going away soon)"
1958 depends on DMAR && BROKEN 1964 depends on DMAR && BROKEN
1959 ---help--- 1965 ---help---
1960 Current Graphics drivers tend to use physical address 1966 Current Graphics drivers tend to use physical address
@@ -2052,7 +2058,6 @@ config SCx200HR_TIMER
2052config OLPC 2058config OLPC
2053 bool "One Laptop Per Child support" 2059 bool "One Laptop Per Child support"
2054 select GPIOLIB 2060 select GPIOLIB
2055 default n
2056 ---help--- 2061 ---help---
2057 Add support for detecting the unique features of the OLPC 2062 Add support for detecting the unique features of the OLPC
2058 XO hardware. 2063 XO hardware.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6..2ac9069890c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -338,6 +338,10 @@ config X86_F00F_BUG
338 def_bool y 338 def_bool y
339 depends on M586MMX || M586TSC || M586 || M486 || M386 339 depends on M586MMX || M586TSC || M586 || M486 || M386
340 340
341config X86_INVD_BUG
342 def_bool y
343 depends on M486 || M386
344
341config X86_WP_WORKS_OK 345config X86_WP_WORKS_OK
342 def_bool y 346 def_bool y
343 depends on !M386 347 depends on !M386
@@ -502,23 +506,3 @@ config CPU_SUP_UMC_32
502 CPU might render the kernel unbootable. 506 CPU might render the kernel unbootable.
503 507
504 If unsure, say N. 508 If unsure, say N.
505
506config X86_DS
507 def_bool X86_PTRACE_BTS
508 depends on X86_DEBUGCTLMSR
509 select HAVE_HW_BRANCH_TRACER
510
511config X86_PTRACE_BTS
512 bool "Branch Trace Store"
513 default y
514 depends on X86_DEBUGCTLMSR
515 depends on BROKEN
516 ---help---
517 This adds a ptrace interface to the hardware's branch trace store.
518
519 Debuggers may use it to collect an execution trace of the debugged
520 application in order to answer the question 'how did I get here?'.
521 Debuggers may trace user mode as well as kernel mode.
522
523 Say Y unless there is no application development on this machine
524 and you want to save a small amount of code size.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb..75085080b63 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -45,7 +45,6 @@ config EARLY_PRINTK
45 45
46config EARLY_PRINTK_DBGP 46config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port" 47 bool "Early printk via EHCI debug port"
48 default n
49 depends on EARLY_PRINTK && PCI 48 depends on EARLY_PRINTK && PCI
50 ---help--- 49 ---help---
51 Write kernel log output directly into the EHCI debug port. 50 Write kernel log output directly into the EHCI debug port.
@@ -76,7 +75,6 @@ config DEBUG_PER_CPU_MAPS
76 bool "Debug access to per_cpu maps" 75 bool "Debug access to per_cpu maps"
77 depends on DEBUG_KERNEL 76 depends on DEBUG_KERNEL
78 depends on SMP 77 depends on SMP
79 default n
80 ---help--- 78 ---help---
81 Say Y to verify that the per_cpu map being accessed has 79 Say Y to verify that the per_cpu map being accessed has
82 been setup. Adds a fair amount of code to kernel memory 80 been setup. Adds a fair amount of code to kernel memory
@@ -174,15 +172,6 @@ config IOMMU_LEAK
174 Add a simple leak tracer to the IOMMU code. This is useful when you 172 Add a simple leak tracer to the IOMMU code. This is useful when you
175 are debugging a buggy device driver that leaks IOMMU mappings. 173 are debugging a buggy device driver that leaks IOMMU mappings.
176 174
177config X86_DS_SELFTEST
178 bool "DS selftest"
179 default y
180 depends on DEBUG_KERNEL
181 depends on X86_DS
182 ---help---
183 Perform Debug Store selftests at boot time.
184 If in doubt, say "N".
185
186config HAVE_MMIOTRACE_SUPPORT 175config HAVE_MMIOTRACE_SUPPORT
187 def_bool y 176 def_bool y
188 177
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0a43dc515e4..8aa1b59b907 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -95,8 +95,9 @@ sp-$(CONFIG_X86_64) := rsp
95cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) 95cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
96# is .cfi_signal_frame supported too? 96# is .cfi_signal_frame supported too?
97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) 97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
98KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) 98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
99KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) 99KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
100KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
100 101
101LDFLAGS := -m elf_$(UTS_MACHINE) 102LDFLAGS := -m elf_$(UTS_MACHINE)
102 103
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 20bb0e1ac68..ff16756a51c 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,9 @@
32#define IN IN1 32#define IN IN1
33#define KEY %xmm2 33#define KEY %xmm2
34#define IV %xmm3 34#define IV %xmm3
35#define BSWAP_MASK %xmm10
36#define CTR %xmm11
37#define INC %xmm12
35 38
36#define KEYP %rdi 39#define KEYP %rdi
37#define OUTP %rsi 40#define OUTP %rsi
@@ -42,6 +45,7 @@
42#define T1 %r10 45#define T1 %r10
43#define TKEYP T1 46#define TKEYP T1
44#define T2 %r11 47#define T2 %r11
48#define TCTR_LOW T2
45 49
46_key_expansion_128: 50_key_expansion_128:
47_key_expansion_256a: 51_key_expansion_256a:
@@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec)
724 movups IV, (IVP) 728 movups IV, (IVP)
725.Lcbc_dec_just_ret: 729.Lcbc_dec_just_ret:
726 ret 730 ret
731
732.align 16
733.Lbswap_mask:
734 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
735
736/*
737 * _aesni_inc_init: internal ABI
738 * setup registers used by _aesni_inc
739 * input:
740 * IV
741 * output:
742 * CTR: == IV, in little endian
743 * TCTR_LOW: == lower qword of CTR
744 * INC: == 1, in little endian
745 * BSWAP_MASK == endian swapping mask
746 */
747_aesni_inc_init:
748 movaps .Lbswap_mask, BSWAP_MASK
749 movaps IV, CTR
750 PSHUFB_XMM BSWAP_MASK CTR
751 mov $1, TCTR_LOW
752 MOVQ_R64_XMM TCTR_LOW INC
753 MOVQ_R64_XMM CTR TCTR_LOW
754 ret
755
756/*
757 * _aesni_inc: internal ABI
758 * Increase IV by 1, IV is in big endian
759 * input:
760 * IV
761 * CTR: == IV, in little endian
762 * TCTR_LOW: == lower qword of CTR
763 * INC: == 1, in little endian
764 * BSWAP_MASK == endian swapping mask
765 * output:
766 * IV: Increase by 1
767 * changed:
768 * CTR: == output IV, in little endian
769 * TCTR_LOW: == lower qword of CTR
770 */
771_aesni_inc:
772 paddq INC, CTR
773 add $1, TCTR_LOW
774 jnc .Linc_low
775 pslldq $8, INC
776 paddq INC, CTR
777 psrldq $8, INC
778.Linc_low:
779 movaps CTR, IV
780 PSHUFB_XMM BSWAP_MASK IV
781 ret
782
783/*
784 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
785 * size_t len, u8 *iv)
786 */
787ENTRY(aesni_ctr_enc)
788 cmp $16, LEN
789 jb .Lctr_enc_just_ret
790 mov 480(KEYP), KLEN
791 movups (IVP), IV
792 call _aesni_inc_init
793 cmp $64, LEN
794 jb .Lctr_enc_loop1
795.align 4
796.Lctr_enc_loop4:
797 movaps IV, STATE1
798 call _aesni_inc
799 movups (INP), IN1
800 movaps IV, STATE2
801 call _aesni_inc
802 movups 0x10(INP), IN2
803 movaps IV, STATE3
804 call _aesni_inc
805 movups 0x20(INP), IN3
806 movaps IV, STATE4
807 call _aesni_inc
808 movups 0x30(INP), IN4
809 call _aesni_enc4
810 pxor IN1, STATE1
811 movups STATE1, (OUTP)
812 pxor IN2, STATE2
813 movups STATE2, 0x10(OUTP)
814 pxor IN3, STATE3
815 movups STATE3, 0x20(OUTP)
816 pxor IN4, STATE4
817 movups STATE4, 0x30(OUTP)
818 sub $64, LEN
819 add $64, INP
820 add $64, OUTP
821 cmp $64, LEN
822 jge .Lctr_enc_loop4
823 cmp $16, LEN
824 jb .Lctr_enc_ret
825.align 4
826.Lctr_enc_loop1:
827 movaps IV, STATE
828 call _aesni_inc
829 movups (INP), IN
830 call _aesni_enc1
831 pxor IN, STATE
832 movups STATE, (OUTP)
833 sub $16, LEN
834 add $16, INP
835 add $16, OUTP
836 cmp $16, LEN
837 jge .Lctr_enc_loop1
838.Lctr_enc_ret:
839 movups IV, (IVP)
840.Lctr_enc_just_ret:
841 ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 49c552c060e..2cb3dcc4490 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -18,6 +18,7 @@
18#include <crypto/algapi.h> 18#include <crypto/algapi.h>
19#include <crypto/aes.h> 19#include <crypto/aes.h>
20#include <crypto/cryptd.h> 20#include <crypto/cryptd.h>
21#include <crypto/ctr.h>
21#include <asm/i387.h> 22#include <asm/i387.h>
22#include <asm/aes.h> 23#include <asm/aes.h>
23 24
@@ -58,6 +59,8 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
58 const u8 *in, unsigned int len, u8 *iv); 59 const u8 *in, unsigned int len, u8 *iv);
59asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 60asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
60 const u8 *in, unsigned int len, u8 *iv); 61 const u8 *in, unsigned int len, u8 *iv);
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv);
61 64
62static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
63{ 66{
@@ -321,6 +324,72 @@ static struct crypto_alg blk_cbc_alg = {
321 }, 324 },
322}; 325};
323 326
327static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
328 struct blkcipher_walk *walk)
329{
330 u8 *ctrblk = walk->iv;
331 u8 keystream[AES_BLOCK_SIZE];
332 u8 *src = walk->src.virt.addr;
333 u8 *dst = walk->dst.virt.addr;
334 unsigned int nbytes = walk->nbytes;
335
336 aesni_enc(ctx, keystream, ctrblk);
337 crypto_xor(keystream, src, nbytes);
338 memcpy(dst, keystream, nbytes);
339 crypto_inc(ctrblk, AES_BLOCK_SIZE);
340}
341
342static int ctr_crypt(struct blkcipher_desc *desc,
343 struct scatterlist *dst, struct scatterlist *src,
344 unsigned int nbytes)
345{
346 struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
347 struct blkcipher_walk walk;
348 int err;
349
350 blkcipher_walk_init(&walk, dst, src, nbytes);
351 err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
352 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
353
354 kernel_fpu_begin();
355 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
356 aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
357 nbytes & AES_BLOCK_MASK, walk.iv);
358 nbytes &= AES_BLOCK_SIZE - 1;
359 err = blkcipher_walk_done(desc, &walk, nbytes);
360 }
361 if (walk.nbytes) {
362 ctr_crypt_final(ctx, &walk);
363 err = blkcipher_walk_done(desc, &walk, 0);
364 }
365 kernel_fpu_end();
366
367 return err;
368}
369
370static struct crypto_alg blk_ctr_alg = {
371 .cra_name = "__ctr-aes-aesni",
372 .cra_driver_name = "__driver-ctr-aes-aesni",
373 .cra_priority = 0,
374 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
375 .cra_blocksize = 1,
376 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
377 .cra_alignmask = 0,
378 .cra_type = &crypto_blkcipher_type,
379 .cra_module = THIS_MODULE,
380 .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
381 .cra_u = {
382 .blkcipher = {
383 .min_keysize = AES_MIN_KEY_SIZE,
384 .max_keysize = AES_MAX_KEY_SIZE,
385 .ivsize = AES_BLOCK_SIZE,
386 .setkey = aes_set_key,
387 .encrypt = ctr_crypt,
388 .decrypt = ctr_crypt,
389 },
390 },
391};
392
324static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, 393static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
325 unsigned int key_len) 394 unsigned int key_len)
326{ 395{
@@ -467,13 +536,11 @@ static struct crypto_alg ablk_cbc_alg = {
467 }, 536 },
468}; 537};
469 538
470#ifdef HAS_CTR
471static int ablk_ctr_init(struct crypto_tfm *tfm) 539static int ablk_ctr_init(struct crypto_tfm *tfm)
472{ 540{
473 struct cryptd_ablkcipher *cryptd_tfm; 541 struct cryptd_ablkcipher *cryptd_tfm;
474 542
475 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", 543 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0);
476 0, 0);
477 if (IS_ERR(cryptd_tfm)) 544 if (IS_ERR(cryptd_tfm))
478 return PTR_ERR(cryptd_tfm); 545 return PTR_ERR(cryptd_tfm);
479 ablk_init_common(tfm, cryptd_tfm); 546 ablk_init_common(tfm, cryptd_tfm);
@@ -500,11 +567,50 @@ static struct crypto_alg ablk_ctr_alg = {
500 .ivsize = AES_BLOCK_SIZE, 567 .ivsize = AES_BLOCK_SIZE,
501 .setkey = ablk_set_key, 568 .setkey = ablk_set_key,
502 .encrypt = ablk_encrypt, 569 .encrypt = ablk_encrypt,
503 .decrypt = ablk_decrypt, 570 .decrypt = ablk_encrypt,
504 .geniv = "chainiv", 571 .geniv = "chainiv",
505 }, 572 },
506 }, 573 },
507}; 574};
575
576#ifdef HAS_CTR
577static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
578{
579 struct cryptd_ablkcipher *cryptd_tfm;
580
581 cryptd_tfm = cryptd_alloc_ablkcipher(
582 "rfc3686(__driver-ctr-aes-aesni)", 0, 0);
583 if (IS_ERR(cryptd_tfm))
584 return PTR_ERR(cryptd_tfm);
585 ablk_init_common(tfm, cryptd_tfm);
586 return 0;
587}
588
589static struct crypto_alg ablk_rfc3686_ctr_alg = {
590 .cra_name = "rfc3686(ctr(aes))",
591 .cra_driver_name = "rfc3686-ctr-aes-aesni",
592 .cra_priority = 400,
593 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
594 .cra_blocksize = 1,
595 .cra_ctxsize = sizeof(struct async_aes_ctx),
596 .cra_alignmask = 0,
597 .cra_type = &crypto_ablkcipher_type,
598 .cra_module = THIS_MODULE,
599 .cra_list = LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list),
600 .cra_init = ablk_rfc3686_ctr_init,
601 .cra_exit = ablk_exit,
602 .cra_u = {
603 .ablkcipher = {
604 .min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
605 .max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
606 .ivsize = CTR_RFC3686_IV_SIZE,
607 .setkey = ablk_set_key,
608 .encrypt = ablk_encrypt,
609 .decrypt = ablk_decrypt,
610 .geniv = "seqiv",
611 },
612 },
613};
508#endif 614#endif
509 615
510#ifdef HAS_LRW 616#ifdef HAS_LRW
@@ -640,13 +746,17 @@ static int __init aesni_init(void)
640 goto blk_ecb_err; 746 goto blk_ecb_err;
641 if ((err = crypto_register_alg(&blk_cbc_alg))) 747 if ((err = crypto_register_alg(&blk_cbc_alg)))
642 goto blk_cbc_err; 748 goto blk_cbc_err;
749 if ((err = crypto_register_alg(&blk_ctr_alg)))
750 goto blk_ctr_err;
643 if ((err = crypto_register_alg(&ablk_ecb_alg))) 751 if ((err = crypto_register_alg(&ablk_ecb_alg)))
644 goto ablk_ecb_err; 752 goto ablk_ecb_err;
645 if ((err = crypto_register_alg(&ablk_cbc_alg))) 753 if ((err = crypto_register_alg(&ablk_cbc_alg)))
646 goto ablk_cbc_err; 754 goto ablk_cbc_err;
647#ifdef HAS_CTR
648 if ((err = crypto_register_alg(&ablk_ctr_alg))) 755 if ((err = crypto_register_alg(&ablk_ctr_alg)))
649 goto ablk_ctr_err; 756 goto ablk_ctr_err;
757#ifdef HAS_CTR
758 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
759 goto ablk_rfc3686_ctr_err;
650#endif 760#endif
651#ifdef HAS_LRW 761#ifdef HAS_LRW
652 if ((err = crypto_register_alg(&ablk_lrw_alg))) 762 if ((err = crypto_register_alg(&ablk_lrw_alg)))
@@ -675,13 +785,17 @@ ablk_pcbc_err:
675ablk_lrw_err: 785ablk_lrw_err:
676#endif 786#endif
677#ifdef HAS_CTR 787#ifdef HAS_CTR
788 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
789ablk_rfc3686_ctr_err:
790#endif
678 crypto_unregister_alg(&ablk_ctr_alg); 791 crypto_unregister_alg(&ablk_ctr_alg);
679ablk_ctr_err: 792ablk_ctr_err:
680#endif
681 crypto_unregister_alg(&ablk_cbc_alg); 793 crypto_unregister_alg(&ablk_cbc_alg);
682ablk_cbc_err: 794ablk_cbc_err:
683 crypto_unregister_alg(&ablk_ecb_alg); 795 crypto_unregister_alg(&ablk_ecb_alg);
684ablk_ecb_err: 796ablk_ecb_err:
797 crypto_unregister_alg(&blk_ctr_alg);
798blk_ctr_err:
685 crypto_unregister_alg(&blk_cbc_alg); 799 crypto_unregister_alg(&blk_cbc_alg);
686blk_cbc_err: 800blk_cbc_err:
687 crypto_unregister_alg(&blk_ecb_alg); 801 crypto_unregister_alg(&blk_ecb_alg);
@@ -705,10 +819,12 @@ static void __exit aesni_exit(void)
705 crypto_unregister_alg(&ablk_lrw_alg); 819 crypto_unregister_alg(&ablk_lrw_alg);
706#endif 820#endif
707#ifdef HAS_CTR 821#ifdef HAS_CTR
708 crypto_unregister_alg(&ablk_ctr_alg); 822 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
709#endif 823#endif
824 crypto_unregister_alg(&ablk_ctr_alg);
710 crypto_unregister_alg(&ablk_cbc_alg); 825 crypto_unregister_alg(&ablk_cbc_alg);
711 crypto_unregister_alg(&ablk_ecb_alg); 826 crypto_unregister_alg(&ablk_ecb_alg);
827 crypto_unregister_alg(&blk_ctr_alg);
712 crypto_unregister_alg(&blk_cbc_alg); 828 crypto_unregister_alg(&blk_cbc_alg);
713 crypto_unregister_alg(&blk_ecb_alg); 829 crypto_unregister_alg(&blk_ecb_alg);
714 crypto_unregister_alg(&__aesni_alg); 830 crypto_unregister_alg(&__aesni_alg);
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 56f462cf22d..aa2c39d968f 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -85,7 +85,6 @@ extern int acpi_ioapic;
85extern int acpi_noirq; 85extern int acpi_noirq;
86extern int acpi_strict; 86extern int acpi_strict;
87extern int acpi_disabled; 87extern int acpi_disabled;
88extern int acpi_ht;
89extern int acpi_pci_disabled; 88extern int acpi_pci_disabled;
90extern int acpi_skip_timer_override; 89extern int acpi_skip_timer_override;
91extern int acpi_use_timer_override; 90extern int acpi_use_timer_override;
@@ -97,7 +96,6 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
97static inline void disable_acpi(void) 96static inline void disable_acpi(void)
98{ 97{
99 acpi_disabled = 1; 98 acpi_disabled = 1;
100 acpi_ht = 0;
101 acpi_pci_disabled = 1; 99 acpi_pci_disabled = 1;
102 acpi_noirq = 1; 100 acpi_noirq = 1;
103} 101}
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index b97f786a48d..a63a68be1cc 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -6,8 +6,8 @@
6 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
71: lock 71: lock
8 .section .smp_locks,"a" 8 .section .smp_locks,"a"
9 _ASM_ALIGN 9 .balign 4
10 _ASM_PTR 1b 10 .long 1b - .
11 .previous 11 .previous
12 .endm 12 .endm
13#else 13#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b..03b6bb5394a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -28,20 +28,20 @@
28 */ 28 */
29 29
30#ifdef CONFIG_SMP 30#ifdef CONFIG_SMP
31#define LOCK_PREFIX \ 31#define LOCK_PREFIX_HERE \
32 ".section .smp_locks,\"a\"\n" \ 32 ".section .smp_locks,\"a\"\n" \
33 _ASM_ALIGN "\n" \ 33 ".balign 4\n" \
34 _ASM_PTR "661f\n" /* address */ \ 34 ".long 671f - .\n" /* offset */ \
35 ".previous\n" \ 35 ".previous\n" \
36 "661:\n\tlock; " 36 "671:"
37
38#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
37 39
38#else /* ! CONFIG_SMP */ 40#else /* ! CONFIG_SMP */
41#define LOCK_PREFIX_HERE ""
39#define LOCK_PREFIX "" 42#define LOCK_PREFIX ""
40#endif 43#endif
41 44
42/* This must be included *after* the definition of LOCK_PREFIX */
43#include <asm/cpufeature.h>
44
45struct alt_instr { 45struct alt_instr {
46 u8 *instr; /* original instruction */ 46 u8 *instr; /* original instruction */
47 u8 *replacement; 47 u8 *replacement;
@@ -96,6 +96,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
96 ".previous" 96 ".previous"
97 97
98/* 98/*
99 * This must be included *after* the definition of ALTERNATIVE due to
100 * <asm/arch_hweight.h>
101 */
102#include <asm/cpufeature.h>
103
104/*
99 * Alternative instructions for different CPU types or capabilities. 105 * Alternative instructions for different CPU types or capabilities.
100 * 106 *
101 * This allows to use optimized instructions even on generic binary 107 * This allows to use optimized instructions even on generic binary
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index b4ac2cdcb64..1fa03e04ae4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -373,6 +373,7 @@ extern atomic_t init_deasserted;
373extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); 373extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
374#endif 374#endif
375 375
376#ifdef CONFIG_X86_LOCAL_APIC
376static inline u32 apic_read(u32 reg) 377static inline u32 apic_read(u32 reg)
377{ 378{
378 return apic->read(reg); 379 return apic->read(reg);
@@ -403,10 +404,19 @@ static inline u32 safe_apic_wait_icr_idle(void)
403 return apic->safe_wait_icr_idle(); 404 return apic->safe_wait_icr_idle();
404} 405}
405 406
407#else /* CONFIG_X86_LOCAL_APIC */
408
409static inline u32 apic_read(u32 reg) { return 0; }
410static inline void apic_write(u32 reg, u32 val) { }
411static inline u64 apic_icr_read(void) { return 0; }
412static inline void apic_icr_write(u32 low, u32 high) { }
413static inline void apic_wait_icr_idle(void) { }
414static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
415
416#endif /* CONFIG_X86_LOCAL_APIC */
406 417
407static inline void ack_APIC_irq(void) 418static inline void ack_APIC_irq(void)
408{ 419{
409#ifdef CONFIG_X86_LOCAL_APIC
410 /* 420 /*
411 * ack_APIC_irq() actually gets compiled as a single instruction 421 * ack_APIC_irq() actually gets compiled as a single instruction
412 * ... yummie. 422 * ... yummie.
@@ -414,7 +424,6 @@ static inline void ack_APIC_irq(void)
414 424
415 /* Docs say use 0 for future compatibility */ 425 /* Docs say use 0 for future compatibility */
416 apic_write(APIC_EOI, 0); 426 apic_write(APIC_EOI, 0);
417#endif
418} 427}
419 428
420static inline unsigned default_get_apic_id(unsigned long x) 429static inline unsigned default_get_apic_id(unsigned long x)
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
new file mode 100644
index 00000000000..9686c3d9ff7
--- /dev/null
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -0,0 +1,61 @@
1#ifndef _ASM_X86_HWEIGHT_H
2#define _ASM_X86_HWEIGHT_H
3
4#ifdef CONFIG_64BIT
5/* popcnt %edi, %eax -- redundant REX prefix for alignment */
6#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
7/* popcnt %rdi, %rax */
8#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
9#define REG_IN "D"
10#define REG_OUT "a"
11#else
12/* popcnt %eax, %eax */
13#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
14#define REG_IN "a"
15#define REG_OUT "a"
16#endif
17
18/*
19 * __sw_hweightXX are called from within the alternatives below
20 * and callee-clobbered registers need to be taken care of. See
21 * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
22 * compiler switches.
23 */
24static inline unsigned int __arch_hweight32(unsigned int w)
25{
26 unsigned int res = 0;
27
28 asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
29 : "="REG_OUT (res)
30 : REG_IN (w));
31
32 return res;
33}
34
35static inline unsigned int __arch_hweight16(unsigned int w)
36{
37 return __arch_hweight32(w & 0xffff);
38}
39
40static inline unsigned int __arch_hweight8(unsigned int w)
41{
42 return __arch_hweight32(w & 0xff);
43}
44
45static inline unsigned long __arch_hweight64(__u64 w)
46{
47 unsigned long res = 0;
48
49#ifdef CONFIG_X86_32
50 return __arch_hweight32((u32)w) +
51 __arch_hweight32((u32)(w >> 32));
52#else
53 asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
54 : "="REG_OUT (res)
55 : REG_IN (w));
56#endif /* CONFIG_X86_32 */
57
58 return res;
59}
60
61#endif
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 8f8217b9bda..952a826ac4e 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -22,7 +22,7 @@
22 */ 22 */
23static inline int atomic_read(const atomic_t *v) 23static inline int atomic_read(const atomic_t *v)
24{ 24{
25 return v->counter; 25 return (*(volatile int *)&(v)->counter);
26} 26}
27 27
28/** 28/**
@@ -246,6 +246,29 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
246 246
247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) 247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
248 248
249/*
250 * atomic_dec_if_positive - decrement by 1 if old value positive
251 * @v: pointer of type atomic_t
252 *
253 * The function returns the old value of *v minus 1, even if
254 * the atomic variable, v, was not decremented.
255 */
256static inline int atomic_dec_if_positive(atomic_t *v)
257{
258 int c, old, dec;
259 c = atomic_read(v);
260 for (;;) {
261 dec = c - 1;
262 if (unlikely(dec < 0))
263 break;
264 old = atomic_cmpxchg((v), c, dec);
265 if (likely(old == c))
266 break;
267 c = old;
268 }
269 return dec;
270}
271
249/** 272/**
250 * atomic_inc_short - increment of a short integer 273 * atomic_inc_short - increment of a short integer
251 * @v: pointer to type int 274 * @v: pointer to type int
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 03027bf28de..2a934aa19a4 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,109 +14,193 @@ typedef struct {
14 14
15#define ATOMIC64_INIT(val) { (val) } 15#define ATOMIC64_INIT(val) { (val) }
16 16
17extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); 17#ifdef CONFIG_X86_CMPXCHG64
18#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8"
19#else
20#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8)
21#endif
22
23#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f)
24
25/**
26 * atomic64_cmpxchg - cmpxchg atomic64 variable
27 * @p: pointer to type atomic64_t
28 * @o: expected value
29 * @n: new value
30 *
31 * Atomically sets @v to @n if it was equal to @o and returns
32 * the old value.
33 */
34
35static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
36{
37 return cmpxchg64(&v->counter, o, n);
38}
18 39
19/** 40/**
20 * atomic64_xchg - xchg atomic64 variable 41 * atomic64_xchg - xchg atomic64 variable
21 * @ptr: pointer to type atomic64_t 42 * @v: pointer to type atomic64_t
22 * @new_val: value to assign 43 * @n: value to assign
23 * 44 *
24 * Atomically xchgs the value of @ptr to @new_val and returns 45 * Atomically xchgs the value of @v to @n and returns
25 * the old value. 46 * the old value.
26 */ 47 */
27extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); 48static inline long long atomic64_xchg(atomic64_t *v, long long n)
49{
50 long long o;
51 unsigned high = (unsigned)(n >> 32);
52 unsigned low = (unsigned)n;
53 asm volatile(ATOMIC64_ALTERNATIVE(xchg)
54 : "=A" (o), "+b" (low), "+c" (high)
55 : "S" (v)
56 : "memory"
57 );
58 return o;
59}
28 60
29/** 61/**
30 * atomic64_set - set atomic64 variable 62 * atomic64_set - set atomic64 variable
31 * @ptr: pointer to type atomic64_t 63 * @v: pointer to type atomic64_t
32 * @new_val: value to assign 64 * @n: value to assign
33 * 65 *
34 * Atomically sets the value of @ptr to @new_val. 66 * Atomically sets the value of @v to @n.
35 */ 67 */
36extern void atomic64_set(atomic64_t *ptr, u64 new_val); 68static inline void atomic64_set(atomic64_t *v, long long i)
69{
70 unsigned high = (unsigned)(i >> 32);
71 unsigned low = (unsigned)i;
72 asm volatile(ATOMIC64_ALTERNATIVE(set)
73 : "+b" (low), "+c" (high)
74 : "S" (v)
75 : "eax", "edx", "memory"
76 );
77}
37 78
38/** 79/**
39 * atomic64_read - read atomic64 variable 80 * atomic64_read - read atomic64 variable
40 * @ptr: pointer to type atomic64_t 81 * @v: pointer to type atomic64_t
41 * 82 *
42 * Atomically reads the value of @ptr and returns it. 83 * Atomically reads the value of @v and returns it.
43 */ 84 */
44static inline u64 atomic64_read(atomic64_t *ptr) 85static inline long long atomic64_read(atomic64_t *v)
45{ 86{
46 u64 res; 87 long long r;
47 88 asm volatile(ATOMIC64_ALTERNATIVE(read)
48 /* 89 : "=A" (r), "+c" (v)
49 * Note, we inline this atomic64_t primitive because 90 : : "memory"
50 * it only clobbers EAX/EDX and leaves the others 91 );
51 * untouched. We also (somewhat subtly) rely on the 92 return r;
52 * fact that cmpxchg8b returns the current 64-bit value 93 }
53 * of the memory location we are touching:
54 */
55 asm volatile(
56 "mov %%ebx, %%eax\n\t"
57 "mov %%ecx, %%edx\n\t"
58 LOCK_PREFIX "cmpxchg8b %1\n"
59 : "=&A" (res)
60 : "m" (*ptr)
61 );
62
63 return res;
64}
65
66extern u64 atomic64_read(atomic64_t *ptr);
67 94
68/** 95/**
69 * atomic64_add_return - add and return 96 * atomic64_add_return - add and return
70 * @delta: integer value to add 97 * @i: integer value to add
71 * @ptr: pointer to type atomic64_t 98 * @v: pointer to type atomic64_t
72 * 99 *
73 * Atomically adds @delta to @ptr and returns @delta + *@ptr 100 * Atomically adds @i to @v and returns @i + *@v
74 */ 101 */
75extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); 102static inline long long atomic64_add_return(long long i, atomic64_t *v)
103{
104 asm volatile(ATOMIC64_ALTERNATIVE(add_return)
105 : "+A" (i), "+c" (v)
106 : : "memory"
107 );
108 return i;
109}
76 110
77/* 111/*
78 * Other variants with different arithmetic operators: 112 * Other variants with different arithmetic operators:
79 */ 113 */
80extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); 114static inline long long atomic64_sub_return(long long i, atomic64_t *v)
81extern u64 atomic64_inc_return(atomic64_t *ptr); 115{
82extern u64 atomic64_dec_return(atomic64_t *ptr); 116 asm volatile(ATOMIC64_ALTERNATIVE(sub_return)
117 : "+A" (i), "+c" (v)
118 : : "memory"
119 );
120 return i;
121}
122
123static inline long long atomic64_inc_return(atomic64_t *v)
124{
125 long long a;
126 asm volatile(ATOMIC64_ALTERNATIVE(inc_return)
127 : "=A" (a)
128 : "S" (v)
129 : "memory", "ecx"
130 );
131 return a;
132}
133
134static inline long long atomic64_dec_return(atomic64_t *v)
135{
136 long long a;
137 asm volatile(ATOMIC64_ALTERNATIVE(dec_return)
138 : "=A" (a)
139 : "S" (v)
140 : "memory", "ecx"
141 );
142 return a;
143}
83 144
84/** 145/**
85 * atomic64_add - add integer to atomic64 variable 146 * atomic64_add - add integer to atomic64 variable
86 * @delta: integer value to add 147 * @i: integer value to add
87 * @ptr: pointer to type atomic64_t 148 * @v: pointer to type atomic64_t
88 * 149 *
89 * Atomically adds @delta to @ptr. 150 * Atomically adds @i to @v.
90 */ 151 */
91extern void atomic64_add(u64 delta, atomic64_t *ptr); 152static inline long long atomic64_add(long long i, atomic64_t *v)
153{
154 asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return)
155 : "+A" (i), "+c" (v)
156 : : "memory"
157 );
158 return i;
159}
92 160
93/** 161/**
94 * atomic64_sub - subtract the atomic64 variable 162 * atomic64_sub - subtract the atomic64 variable
95 * @delta: integer value to subtract 163 * @i: integer value to subtract
96 * @ptr: pointer to type atomic64_t 164 * @v: pointer to type atomic64_t
97 * 165 *
98 * Atomically subtracts @delta from @ptr. 166 * Atomically subtracts @i from @v.
99 */ 167 */
100extern void atomic64_sub(u64 delta, atomic64_t *ptr); 168static inline long long atomic64_sub(long long i, atomic64_t *v)
169{
170 asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return)
171 : "+A" (i), "+c" (v)
172 : : "memory"
173 );
174 return i;
175}
101 176
102/** 177/**
103 * atomic64_sub_and_test - subtract value from variable and test result 178 * atomic64_sub_and_test - subtract value from variable and test result
104 * @delta: integer value to subtract 179 * @i: integer value to subtract
105 * @ptr: pointer to type atomic64_t 180 * @v: pointer to type atomic64_t
106 * 181 *
107 * Atomically subtracts @delta from @ptr and returns 182 * Atomically subtracts @i from @v and returns
108 * true if the result is zero, or false for all 183 * true if the result is zero, or false for all
109 * other cases. 184 * other cases.
110 */ 185 */
111extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); 186static inline int atomic64_sub_and_test(long long i, atomic64_t *v)
187{
188 return atomic64_sub_return(i, v) == 0;
189}
112 190
113/** 191/**
114 * atomic64_inc - increment atomic64 variable 192 * atomic64_inc - increment atomic64 variable
115 * @ptr: pointer to type atomic64_t 193 * @v: pointer to type atomic64_t
116 * 194 *
117 * Atomically increments @ptr by 1. 195 * Atomically increments @v by 1.
118 */ 196 */
119extern void atomic64_inc(atomic64_t *ptr); 197static inline void atomic64_inc(atomic64_t *v)
198{
199 asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return)
200 : : "S" (v)
201 : "memory", "eax", "ecx", "edx"
202 );
203}
120 204
121/** 205/**
122 * atomic64_dec - decrement atomic64 variable 206 * atomic64_dec - decrement atomic64 variable
@@ -124,37 +208,97 @@ extern void atomic64_inc(atomic64_t *ptr);
124 * 208 *
125 * Atomically decrements @ptr by 1. 209 * Atomically decrements @ptr by 1.
126 */ 210 */
127extern void atomic64_dec(atomic64_t *ptr); 211static inline void atomic64_dec(atomic64_t *v)
212{
213 asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return)
214 : : "S" (v)
215 : "memory", "eax", "ecx", "edx"
216 );
217}
128 218
129/** 219/**
130 * atomic64_dec_and_test - decrement and test 220 * atomic64_dec_and_test - decrement and test
131 * @ptr: pointer to type atomic64_t 221 * @v: pointer to type atomic64_t
132 * 222 *
133 * Atomically decrements @ptr by 1 and 223 * Atomically decrements @v by 1 and
134 * returns true if the result is 0, or false for all other 224 * returns true if the result is 0, or false for all other
135 * cases. 225 * cases.
136 */ 226 */
137extern int atomic64_dec_and_test(atomic64_t *ptr); 227static inline int atomic64_dec_and_test(atomic64_t *v)
228{
229 return atomic64_dec_return(v) == 0;
230}
138 231
139/** 232/**
140 * atomic64_inc_and_test - increment and test 233 * atomic64_inc_and_test - increment and test
141 * @ptr: pointer to type atomic64_t 234 * @v: pointer to type atomic64_t
142 * 235 *
143 * Atomically increments @ptr by 1 236 * Atomically increments @v by 1
144 * and returns true if the result is zero, or false for all 237 * and returns true if the result is zero, or false for all
145 * other cases. 238 * other cases.
146 */ 239 */
147extern int atomic64_inc_and_test(atomic64_t *ptr); 240static inline int atomic64_inc_and_test(atomic64_t *v)
241{
242 return atomic64_inc_return(v) == 0;
243}
148 244
149/** 245/**
150 * atomic64_add_negative - add and test if negative 246 * atomic64_add_negative - add and test if negative
151 * @delta: integer value to add 247 * @i: integer value to add
152 * @ptr: pointer to type atomic64_t 248 * @v: pointer to type atomic64_t
153 * 249 *
154 * Atomically adds @delta to @ptr and returns true 250 * Atomically adds @i to @v and returns true
155 * if the result is negative, or false when 251 * if the result is negative, or false when
156 * result is greater than or equal to zero. 252 * result is greater than or equal to zero.
157 */ 253 */
158extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); 254static inline int atomic64_add_negative(long long i, atomic64_t *v)
255{
256 return atomic64_add_return(i, v) < 0;
257}
258
259/**
260 * atomic64_add_unless - add unless the number is a given value
261 * @v: pointer of type atomic64_t
262 * @a: the amount to add to v...
263 * @u: ...unless v is equal to u.
264 *
265 * Atomically adds @a to @v, so long as it was not @u.
266 * Returns non-zero if @v was not @u, and zero otherwise.
267 */
268static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
269{
270 unsigned low = (unsigned)u;
271 unsigned high = (unsigned)(u >> 32);
272 asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t"
273 : "+A" (a), "+c" (v), "+S" (low), "+D" (high)
274 : : "memory");
275 return (int)a;
276}
277
278
279static inline int atomic64_inc_not_zero(atomic64_t *v)
280{
281 int r;
282 asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero)
283 : "=a" (r)
284 : "S" (v)
285 : "ecx", "edx", "memory"
286 );
287 return r;
288}
289
290static inline long long atomic64_dec_if_positive(atomic64_t *v)
291{
292 long long r;
293 asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive)
294 : "=A" (r)
295 : "S" (v)
296 : "ecx", "memory"
297 );
298 return r;
299}
300
301#undef ATOMIC64_ALTERNATIVE
302#undef ATOMIC64_ALTERNATIVE_
159 303
160#endif /* _ASM_X86_ATOMIC64_32_H */ 304#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 51c5b405692..49fd1ea2295 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
18 */ 18 */
19static inline long atomic64_read(const atomic64_t *v) 19static inline long atomic64_read(const atomic64_t *v)
20{ 20{
21 return v->counter; 21 return (*(volatile long *)&(v)->counter);
22} 22}
23 23
24/** 24/**
@@ -221,4 +221,27 @@ static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
221 221
222#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) 222#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
223 223
224/*
225 * atomic64_dec_if_positive - decrement by 1 if old value positive
226 * @v: pointer of type atomic_t
227 *
228 * The function returns the old value of *v minus 1, even if
229 * the atomic variable, v, was not decremented.
230 */
231static inline long atomic64_dec_if_positive(atomic64_t *v)
232{
233 long c, old, dec;
234 c = atomic64_read(v);
235 for (;;) {
236 dec = c - 1;
237 if (unlikely(dec < 0))
238 break;
239 old = atomic64_cmpxchg((v), c, dec);
240 if (likely(old == c))
241 break;
242 c = old;
243 }
244 return dec;
245}
246
224#endif /* _ASM_X86_ATOMIC64_64_H */ 247#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 02b47a603fc..545776efeb1 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -444,7 +444,9 @@ static inline int fls(int x)
444 444
445#define ARCH_HAS_FAST_MULTIPLIER 1 445#define ARCH_HAS_FAST_MULTIPLIER 1
446 446
447#include <asm-generic/bitops/hweight.h> 447#include <asm/arch_hweight.h>
448
449#include <asm-generic/bitops/const_hweight.h>
448 450
449#endif /* __KERNEL__ */ 451#endif /* __KERNEL__ */
450 452
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 7a1065958ba..3b62ab56c7a 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -24,7 +24,7 @@
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) 24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25 25
26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ 26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
27 (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)) 27 (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
28#error "Invalid value for CONFIG_PHYSICAL_ALIGN" 28#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
29#endif 29#endif
30 30
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 634c40a739a..63e35ec9075 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -44,9 +44,6 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
44 memcpy(dst, src, len); 44 memcpy(dst, src, len);
45} 45}
46 46
47#define PG_WC PG_arch_1
48PAGEFLAG(WC, WC)
49
50#ifdef CONFIG_X86_PAT 47#ifdef CONFIG_X86_PAT
51/* 48/*
52 * X86 PAT uses page flags WC and Uncached together to keep track of 49 * X86 PAT uses page flags WC and Uncached together to keep track of
@@ -55,16 +52,24 @@ PAGEFLAG(WC, WC)
55 * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not 52 * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
56 * been changed from its default (value of -1 used to denote this). 53 * been changed from its default (value of -1 used to denote this).
57 * Note we do not support _PAGE_CACHE_UC here. 54 * Note we do not support _PAGE_CACHE_UC here.
58 *
59 * Caller must hold memtype_lock for atomicity.
60 */ 55 */
56
57#define _PGMT_DEFAULT 0
58#define _PGMT_WC (1UL << PG_arch_1)
59#define _PGMT_UC_MINUS (1UL << PG_uncached)
60#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1)
61#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
62#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
63
61static inline unsigned long get_page_memtype(struct page *pg) 64static inline unsigned long get_page_memtype(struct page *pg)
62{ 65{
63 if (!PageUncached(pg) && !PageWC(pg)) 66 unsigned long pg_flags = pg->flags & _PGMT_MASK;
67
68 if (pg_flags == _PGMT_DEFAULT)
64 return -1; 69 return -1;
65 else if (!PageUncached(pg) && PageWC(pg)) 70 else if (pg_flags == _PGMT_WC)
66 return _PAGE_CACHE_WC; 71 return _PAGE_CACHE_WC;
67 else if (PageUncached(pg) && !PageWC(pg)) 72 else if (pg_flags == _PGMT_UC_MINUS)
68 return _PAGE_CACHE_UC_MINUS; 73 return _PAGE_CACHE_UC_MINUS;
69 else 74 else
70 return _PAGE_CACHE_WB; 75 return _PAGE_CACHE_WB;
@@ -72,25 +77,26 @@ static inline unsigned long get_page_memtype(struct page *pg)
72 77
73static inline void set_page_memtype(struct page *pg, unsigned long memtype) 78static inline void set_page_memtype(struct page *pg, unsigned long memtype)
74{ 79{
80 unsigned long memtype_flags = _PGMT_DEFAULT;
81 unsigned long old_flags;
82 unsigned long new_flags;
83
75 switch (memtype) { 84 switch (memtype) {
76 case _PAGE_CACHE_WC: 85 case _PAGE_CACHE_WC:
77 ClearPageUncached(pg); 86 memtype_flags = _PGMT_WC;
78 SetPageWC(pg);
79 break; 87 break;
80 case _PAGE_CACHE_UC_MINUS: 88 case _PAGE_CACHE_UC_MINUS:
81 SetPageUncached(pg); 89 memtype_flags = _PGMT_UC_MINUS;
82 ClearPageWC(pg);
83 break; 90 break;
84 case _PAGE_CACHE_WB: 91 case _PAGE_CACHE_WB:
85 SetPageUncached(pg); 92 memtype_flags = _PGMT_WB;
86 SetPageWC(pg);
87 break;
88 default:
89 case -1:
90 ClearPageUncached(pg);
91 ClearPageWC(pg);
92 break; 93 break;
93 } 94 }
95
96 do {
97 old_flags = pg->flags;
98 new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
99 } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
94} 100}
95#else 101#else
96static inline unsigned long get_page_memtype(struct page *pg) { return -1; } 102static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
@@ -139,9 +145,11 @@ int set_memory_np(unsigned long addr, int numpages);
139int set_memory_4k(unsigned long addr, int numpages); 145int set_memory_4k(unsigned long addr, int numpages);
140 146
141int set_memory_array_uc(unsigned long *addr, int addrinarray); 147int set_memory_array_uc(unsigned long *addr, int addrinarray);
148int set_memory_array_wc(unsigned long *addr, int addrinarray);
142int set_memory_array_wb(unsigned long *addr, int addrinarray); 149int set_memory_array_wb(unsigned long *addr, int addrinarray);
143 150
144int set_pages_array_uc(struct page **pages, int addrinarray); 151int set_pages_array_uc(struct page **pages, int addrinarray);
152int set_pages_array_wc(struct page **pages, int addrinarray);
145int set_pages_array_wb(struct page **pages, int addrinarray); 153int set_pages_array_wb(struct page **pages, int addrinarray);
146 154
147/* 155/*
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ffb9bb6b6c3..8859e12dd3c 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -271,7 +271,8 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
271 __typeof__(*(ptr)) __ret; \ 271 __typeof__(*(ptr)) __ret; \
272 __typeof__(*(ptr)) __old = (o); \ 272 __typeof__(*(ptr)) __old = (o); \
273 __typeof__(*(ptr)) __new = (n); \ 273 __typeof__(*(ptr)) __new = (n); \
274 alternative_io("call cmpxchg8b_emu", \ 274 alternative_io(LOCK_PREFIX_HERE \
275 "call cmpxchg8b_emu", \
275 "lock; cmpxchg8b (%%esi)" , \ 276 "lock; cmpxchg8b (%%esi)" , \
276 X86_FEATURE_CX8, \ 277 X86_FEATURE_CX8, \
277 "=A" (__ret), \ 278 "=A" (__ret), \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0cd82d06861..46814591438 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -161,6 +161,7 @@
161 */ 161 */
162#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ 162#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
163#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ 163#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
164#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
164 165
165/* Virtualization flags: Linux defined */ 166/* Virtualization flags: Linux defined */
166#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ 167#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
@@ -175,6 +176,7 @@
175 176
176#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 177#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
177 178
179#include <asm/asm.h>
178#include <linux/bitops.h> 180#include <linux/bitops.h>
179 181
180extern const char * const x86_cap_flags[NCAPINTS*32]; 182extern const char * const x86_cap_flags[NCAPINTS*32];
@@ -283,6 +285,69 @@ extern const char * const x86_power_flags[32];
283 285
284#endif /* CONFIG_X86_64 */ 286#endif /* CONFIG_X86_64 */
285 287
288/*
289 * Static testing of CPU features. Used the same as boot_cpu_has().
290 * These are only valid after alternatives have run, but will statically
291 * patch the target code for additional performance.
292 *
293 */
294static __always_inline __pure bool __static_cpu_has(u8 bit)
295{
296#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
297 asm goto("1: jmp %l[t_no]\n"
298 "2:\n"
299 ".section .altinstructions,\"a\"\n"
300 _ASM_ALIGN "\n"
301 _ASM_PTR "1b\n"
302 _ASM_PTR "0\n" /* no replacement */
303 " .byte %P0\n" /* feature bit */
304 " .byte 2b - 1b\n" /* source len */
305 " .byte 0\n" /* replacement len */
306 " .byte 0xff + 0 - (2b-1b)\n" /* padding */
307 ".previous\n"
308 : : "i" (bit) : : t_no);
309 return true;
310 t_no:
311 return false;
312#else
313 u8 flag;
314 /* Open-coded due to __stringify() in ALTERNATIVE() */
315 asm volatile("1: movb $0,%0\n"
316 "2:\n"
317 ".section .altinstructions,\"a\"\n"
318 _ASM_ALIGN "\n"
319 _ASM_PTR "1b\n"
320 _ASM_PTR "3f\n"
321 " .byte %P1\n" /* feature bit */
322 " .byte 2b - 1b\n" /* source len */
323 " .byte 4f - 3f\n" /* replacement len */
324 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */
325 ".previous\n"
326 ".section .altinstr_replacement,\"ax\"\n"
327 "3: movb $1,%0\n"
328 "4:\n"
329 ".previous\n"
330 : "=qm" (flag) : "i" (bit));
331 return flag;
332#endif
333}
334
335#if __GNUC__ >= 4
336#define static_cpu_has(bit) \
337( \
338 __builtin_constant_p(boot_cpu_has(bit)) ? \
339 boot_cpu_has(bit) : \
340 (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \
341 __static_cpu_has(bit) : \
342 boot_cpu_has(bit) \
343)
344#else
345/*
346 * gcc 3.x is too stupid to do the static test; fall back to dynamic.
347 */
348#define static_cpu_has(bit) boot_cpu_has(bit)
349#endif
350
286#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ 351#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
287 352
288#endif /* _ASM_X86_CPUFEATURE_H */ 353#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
deleted file mode 100644
index 70dac199b09..00000000000
--- a/arch/x86/include/asm/ds.h
+++ /dev/null
@@ -1,302 +0,0 @@
1/*
2 * Debug Store (DS) support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done)
11 * - buffer access
12 *
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
16 *
17 *
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */
21
22#ifndef _ASM_X86_DS_H
23#define _ASM_X86_DS_H
24
25
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/err.h>
29
30
31#ifdef CONFIG_X86_DS
32
33struct task_struct;
34struct ds_context;
35struct ds_tracer;
36struct bts_tracer;
37struct pebs_tracer;
38
39typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
40typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
41
42
43/*
44 * A list of features plus corresponding macros to talk about them in
45 * the ds_request function's flags parameter.
46 *
47 * We use the enum to index an array of corresponding control bits;
48 * we use the macro to index a flags bit-vector.
49 */
50enum ds_feature {
51 dsf_bts = 0,
52 dsf_bts_kernel,
53#define BTS_KERNEL (1 << dsf_bts_kernel)
54 /* trace kernel-mode branches */
55
56 dsf_bts_user,
57#define BTS_USER (1 << dsf_bts_user)
58 /* trace user-mode branches */
59
60 dsf_bts_overflow,
61 dsf_bts_max,
62 dsf_pebs = dsf_bts_max,
63
64 dsf_pebs_max,
65 dsf_ctl_max = dsf_pebs_max,
66 dsf_bts_timestamps = dsf_ctl_max,
67#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
68 /* add timestamps into BTS trace */
69
70#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
71};
72
73
74/*
75 * Request BTS or PEBS
76 *
77 * Due to alignement constraints, the actual buffer may be slightly
78 * smaller than the requested or provided buffer.
79 *
80 * Returns a pointer to a tracer structure on success, or
81 * ERR_PTR(errcode) on failure.
82 *
83 * The interrupt threshold is independent from the overflow callback
84 * to allow users to use their own overflow interrupt handling mechanism.
85 *
86 * The function might sleep.
87 *
88 * task: the task to request recording for
89 * cpu: the cpu to request recording for
90 * base: the base pointer for the (non-pageable) buffer;
91 * size: the size of the provided buffer in bytes
92 * ovfl: pointer to a function to be called on buffer overflow;
93 * NULL if cyclic buffer requested
94 * th: the interrupt threshold in records from the end of the buffer;
95 * -1 if no interrupt threshold is requested.
96 * flags: a bit-mask of the above flags
97 */
98extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
99 void *base, size_t size,
100 bts_ovfl_callback_t ovfl,
101 size_t th, unsigned int flags);
102extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
103 bts_ovfl_callback_t ovfl,
104 size_t th, unsigned int flags);
105extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
106 void *base, size_t size,
107 pebs_ovfl_callback_t ovfl,
108 size_t th, unsigned int flags);
109extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
110 void *base, size_t size,
111 pebs_ovfl_callback_t ovfl,
112 size_t th, unsigned int flags);
113
114/*
115 * Release BTS or PEBS resources
116 * Suspend and resume BTS or PEBS tracing
117 *
118 * Must be called with irq's enabled.
119 *
120 * tracer: the tracer handle returned from ds_request_~()
121 */
122extern void ds_release_bts(struct bts_tracer *tracer);
123extern void ds_suspend_bts(struct bts_tracer *tracer);
124extern void ds_resume_bts(struct bts_tracer *tracer);
125extern void ds_release_pebs(struct pebs_tracer *tracer);
126extern void ds_suspend_pebs(struct pebs_tracer *tracer);
127extern void ds_resume_pebs(struct pebs_tracer *tracer);
128
129/*
130 * Release BTS or PEBS resources
131 * Suspend and resume BTS or PEBS tracing
132 *
133 * Cpu tracers must call this on the traced cpu.
134 * Task tracers must call ds_release_~_noirq() for themselves.
135 *
136 * May be called with irq's disabled.
137 *
138 * Returns 0 if successful;
139 * -EPERM if the cpu tracer does not trace the current cpu.
140 * -EPERM if the task tracer does not trace itself.
141 *
142 * tracer: the tracer handle returned from ds_request_~()
143 */
144extern int ds_release_bts_noirq(struct bts_tracer *tracer);
145extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
146extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
147extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
148extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
149extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
150
151
152/*
153 * The raw DS buffer state as it is used for BTS and PEBS recording.
154 *
155 * This is the low-level, arch-dependent interface for working
156 * directly on the raw trace data.
157 */
158struct ds_trace {
159 /* the number of bts/pebs records */
160 size_t n;
161 /* the size of a bts/pebs record in bytes */
162 size_t size;
163 /* pointers into the raw buffer:
164 - to the first entry */
165 void *begin;
166 /* - one beyond the last entry */
167 void *end;
168 /* - one beyond the newest entry */
169 void *top;
170 /* - the interrupt threshold */
171 void *ith;
172 /* flags given on ds_request() */
173 unsigned int flags;
174};
175
176/*
177 * An arch-independent view on branch trace data.
178 */
179enum bts_qualifier {
180 bts_invalid,
181#define BTS_INVALID bts_invalid
182
183 bts_branch,
184#define BTS_BRANCH bts_branch
185
186 bts_task_arrives,
187#define BTS_TASK_ARRIVES bts_task_arrives
188
189 bts_task_departs,
190#define BTS_TASK_DEPARTS bts_task_departs
191
192 bts_qual_bit_size = 4,
193 bts_qual_max = (1 << bts_qual_bit_size),
194};
195
196struct bts_struct {
197 __u64 qualifier;
198 union {
199 /* BTS_BRANCH */
200 struct {
201 __u64 from;
202 __u64 to;
203 } lbr;
204 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
205 struct {
206 __u64 clock;
207 pid_t pid;
208 } event;
209 } variant;
210};
211
212
213/*
214 * The BTS state.
215 *
216 * This gives access to the raw DS state and adds functions to provide
217 * an arch-independent view of the BTS data.
218 */
219struct bts_trace {
220 struct ds_trace ds;
221
222 int (*read)(struct bts_tracer *tracer, const void *at,
223 struct bts_struct *out);
224 int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
225};
226
227
228/*
229 * The PEBS state.
230 *
231 * This gives access to the raw DS state and the PEBS-specific counter
232 * reset value.
233 */
234struct pebs_trace {
235 struct ds_trace ds;
236
237 /* the number of valid counters in the below array */
238 unsigned int counters;
239
240#define MAX_PEBS_COUNTERS 4
241 /* the counter reset value */
242 unsigned long long counter_reset[MAX_PEBS_COUNTERS];
243};
244
245
246/*
247 * Read the BTS or PEBS trace.
248 *
249 * Returns a view on the trace collected for the parameter tracer.
250 *
251 * The view remains valid as long as the traced task is not running or
252 * the tracer is suspended.
253 * Writes into the trace buffer are not reflected.
254 *
255 * tracer: the tracer handle returned from ds_request_~()
256 */
257extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
258extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
259
260
261/*
262 * Reset the write pointer of the BTS/PEBS buffer.
263 *
264 * Returns 0 on success; -Eerrno on error
265 *
266 * tracer: the tracer handle returned from ds_request_~()
267 */
268extern int ds_reset_bts(struct bts_tracer *tracer);
269extern int ds_reset_pebs(struct pebs_tracer *tracer);
270
271/*
272 * Set the PEBS counter reset value.
273 *
274 * Returns 0 on success; -Eerrno on error
275 *
276 * tracer: the tracer handle returned from ds_request_pebs()
277 * counter: the index of the counter
278 * value: the new counter reset value
279 */
280extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
281 unsigned int counter, u64 value);
282
283/*
284 * Initialization
285 */
286struct cpuinfo_x86;
287extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
288
289/*
290 * Context switch work
291 */
292extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
293
294#else /* CONFIG_X86_DS */
295
296struct cpuinfo_x86;
297static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
298static inline void ds_switch_to(struct task_struct *prev,
299 struct task_struct *next) {}
300
301#endif /* CONFIG_X86_DS */
302#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index ae6253ab902..733f7e91e7a 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -34,6 +34,18 @@
34#define CFI_SIGNAL_FRAME 34#define CFI_SIGNAL_FRAME
35#endif 35#endif
36 36
37#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
38 /*
39 * Emit CFI data in .debug_frame sections, not .eh_frame sections.
40 * The latter we currently just discard since we don't do DWARF
41 * unwinding at runtime. So only the offline DWARF information is
42 * useful to anyone. Note we should not use this directive if this
43 * file is used in the vDSO assembly, or if vmlinux.lds.S gets
44 * changed so it doesn't discard .eh_frame.
45 */
46 .cfi_sections .debug_frame
47#endif
48
37#else 49#else
38 50
39/* 51/*
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 0e22296790d..ec8a52d14ab 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -45,7 +45,12 @@
45#define E820_NVS 4 45#define E820_NVS 4
46#define E820_UNUSABLE 5 46#define E820_UNUSABLE 5
47 47
48/* reserved RAM used by kernel itself */ 48/*
49 * reserved RAM used by kernel itself
50 * if CONFIG_INTEL_TXT is enabled, memory of this type will be
51 * included in the S3 integrity calculation and so should not include
52 * any memory that BIOS might alter over the S3 transition
53 */
49#define E820_RESERVED_KERN 128 54#define E820_RESERVED_KERN 128
50 55
51#ifndef __ASSEMBLY__ 56#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f8576427cf..aeab29aee61 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
35 35
36#define __ARCH_IRQ_STAT 36#define __ARCH_IRQ_STAT
37 37
38#define inc_irq_stat(member) percpu_add(irq_stat.member, 1) 38#define inc_irq_stat(member) percpu_inc(irq_stat.member)
39 39
40#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) 40#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
41 41
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1d5c08a1bdf..004e6e25e91 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address;
68extern u8 hpet_blockid; 68extern u8 hpet_blockid;
69extern int hpet_force_user; 69extern int hpet_force_user;
70extern u8 hpet_msi_disable; 70extern u8 hpet_msi_disable;
71extern u8 hpet_readback_cmp;
71extern int is_hpet_enabled(void); 72extern int is_hpet_enabled(void);
72extern int hpet_enable(void); 73extern int hpet_enable(void);
73extern void hpet_disable(void); 74extern void hpet_disable(void);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 2a1bd8f4f23..942255310e6 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -41,12 +41,16 @@ struct arch_hw_breakpoint {
41/* Total number of available HW breakpoint registers */ 41/* Total number of available HW breakpoint registers */
42#define HBP_NUM 4 42#define HBP_NUM 4
43 43
44static inline int hw_breakpoint_slots(int type)
45{
46 return HBP_NUM;
47}
48
44struct perf_event; 49struct perf_event;
45struct pmu; 50struct pmu;
46 51
47extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); 52extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
48extern int arch_validate_hwbkpt_settings(struct perf_event *bp, 53extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
49 struct task_struct *tsk);
50extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, 54extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
51 unsigned long val, void *data); 55 unsigned long val, void *data);
52 56
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index e153a2b3889..5df477ac3af 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -1,5 +1,5 @@
1#ifndef _ASM_X86_KVM_HYPERV_H 1#ifndef _ASM_X86_HYPERV_H
2#define _ASM_X86_KVM_HYPERV_H 2#define _ASM_X86_HYPERV_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5 5
@@ -14,6 +14,10 @@
14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
16 16
17#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
18#define HYPERV_CPUID_MIN 0x40000005
19#define HYPERV_CPUID_MAX 0x4000ffff
20
17/* 21/*
18 * Feature identification. EAX indicates which features are available 22 * Feature identification. EAX indicates which features are available
19 * to the partition based upon the current partition privileges. 23 * to the partition based upon the current partition privileges.
@@ -129,6 +133,9 @@
129/* MSR used to provide vcpu index */ 133/* MSR used to provide vcpu index */
130#define HV_X64_MSR_VP_INDEX 0x40000002 134#define HV_X64_MSR_VP_INDEX 0x40000002
131 135
136/* MSR used to read the per-partition time reference counter */
137#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
138
132/* Define the virtual APIC registers */ 139/* Define the virtual APIC registers */
133#define HV_X64_MSR_EOI 0x40000070 140#define HV_X64_MSR_EOI 0x40000070
134#define HV_X64_MSR_ICR 0x40000071 141#define HV_X64_MSR_ICR 0x40000071
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index b78c0941e42..70abda7058c 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -17,10 +17,33 @@
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 * 18 *
19 */ 19 */
20#ifndef ASM_X86__HYPERVISOR_H 20#ifndef _ASM_X86_HYPERVISOR_H
21#define ASM_X86__HYPERVISOR_H 21#define _ASM_X86_HYPERVISOR_H
22 22
23extern void init_hypervisor(struct cpuinfo_x86 *c); 23extern void init_hypervisor(struct cpuinfo_x86 *c);
24extern void init_hypervisor_platform(void); 24extern void init_hypervisor_platform(void);
25 25
26/*
27 * x86 hypervisor information
28 */
29struct hypervisor_x86 {
30 /* Hypervisor name */
31 const char *name;
32
33 /* Detection routine */
34 bool (*detect)(void);
35
36 /* Adjust CPU feature bits (run once per CPU) */
37 void (*set_cpu_features)(struct cpuinfo_x86 *);
38
39 /* Platform setup (run once per boot) */
40 void (*init_platform)(void);
41};
42
43extern const struct hypervisor_x86 *x86_hyper;
44
45/* Recognized hypervisors */
46extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48
26#endif 49#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index da293092450..c991b3a7b90 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -16,7 +16,9 @@
16#include <linux/kernel_stat.h> 16#include <linux/kernel_stat.h>
17#include <linux/regset.h> 17#include <linux/regset.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/slab.h>
19#include <asm/asm.h> 20#include <asm/asm.h>
21#include <asm/cpufeature.h>
20#include <asm/processor.h> 22#include <asm/processor.h>
21#include <asm/sigcontext.h> 23#include <asm/sigcontext.h>
22#include <asm/user.h> 24#include <asm/user.h>
@@ -56,6 +58,11 @@ extern int restore_i387_xstate_ia32(void __user *buf);
56 58
57#define X87_FSW_ES (1 << 7) /* Exception Summary */ 59#define X87_FSW_ES (1 << 7) /* Exception Summary */
58 60
61static __always_inline __pure bool use_xsave(void)
62{
63 return static_cpu_has(X86_FEATURE_XSAVE);
64}
65
59#ifdef CONFIG_X86_64 66#ifdef CONFIG_X86_64
60 67
61/* Ignore delayed exceptions from user space */ 68/* Ignore delayed exceptions from user space */
@@ -91,15 +98,15 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
91 values. The kernel data segment can be sometimes 0 and sometimes 98 values. The kernel data segment can be sometimes 0 and sometimes
92 new user value. Both should be ok. 99 new user value. Both should be ok.
93 Use the PDA as safe address because it should be already in L1. */ 100 Use the PDA as safe address because it should be already in L1. */
94static inline void clear_fpu_state(struct task_struct *tsk) 101static inline void fpu_clear(struct fpu *fpu)
95{ 102{
96 struct xsave_struct *xstate = &tsk->thread.xstate->xsave; 103 struct xsave_struct *xstate = &fpu->state->xsave;
97 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 104 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
98 105
99 /* 106 /*
100 * xsave header may indicate the init state of the FP. 107 * xsave header may indicate the init state of the FP.
101 */ 108 */
102 if ((task_thread_info(tsk)->status & TS_XSAVE) && 109 if (use_xsave() &&
103 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) 110 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
104 return; 111 return;
105 112
@@ -111,6 +118,11 @@ static inline void clear_fpu_state(struct task_struct *tsk)
111 X86_FEATURE_FXSAVE_LEAK); 118 X86_FEATURE_FXSAVE_LEAK);
112} 119}
113 120
121static inline void clear_fpu_state(struct task_struct *tsk)
122{
123 fpu_clear(&tsk->thread.fpu);
124}
125
114static inline int fxsave_user(struct i387_fxsave_struct __user *fx) 126static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
115{ 127{
116 int err; 128 int err;
@@ -135,7 +147,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
135 return err; 147 return err;
136} 148}
137 149
138static inline void fxsave(struct task_struct *tsk) 150static inline void fpu_fxsave(struct fpu *fpu)
139{ 151{
140 /* Using "rex64; fxsave %0" is broken because, if the memory operand 152 /* Using "rex64; fxsave %0" is broken because, if the memory operand
141 uses any extended registers for addressing, a second REX prefix 153 uses any extended registers for addressing, a second REX prefix
@@ -145,42 +157,45 @@ static inline void fxsave(struct task_struct *tsk)
145 /* Using "fxsaveq %0" would be the ideal choice, but is only supported 157 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
146 starting with gas 2.16. */ 158 starting with gas 2.16. */
147 __asm__ __volatile__("fxsaveq %0" 159 __asm__ __volatile__("fxsaveq %0"
148 : "=m" (tsk->thread.xstate->fxsave)); 160 : "=m" (fpu->state->fxsave));
149#elif 0 161#elif 0
150 /* Using, as a workaround, the properly prefixed form below isn't 162 /* Using, as a workaround, the properly prefixed form below isn't
151 accepted by any binutils version so far released, complaining that 163 accepted by any binutils version so far released, complaining that
152 the same type of prefix is used twice if an extended register is 164 the same type of prefix is used twice if an extended register is
153 needed for addressing (fix submitted to mainline 2005-11-21). */ 165 needed for addressing (fix submitted to mainline 2005-11-21). */
154 __asm__ __volatile__("rex64/fxsave %0" 166 __asm__ __volatile__("rex64/fxsave %0"
155 : "=m" (tsk->thread.xstate->fxsave)); 167 : "=m" (fpu->state->fxsave));
156#else 168#else
157 /* This, however, we can work around by forcing the compiler to select 169 /* This, however, we can work around by forcing the compiler to select
158 an addressing mode that doesn't require extended registers. */ 170 an addressing mode that doesn't require extended registers. */
159 __asm__ __volatile__("rex64/fxsave (%1)" 171 __asm__ __volatile__("rex64/fxsave (%1)"
160 : "=m" (tsk->thread.xstate->fxsave) 172 : "=m" (fpu->state->fxsave)
161 : "cdaSDb" (&tsk->thread.xstate->fxsave)); 173 : "cdaSDb" (&fpu->state->fxsave));
162#endif 174#endif
163} 175}
164 176
165static inline void __save_init_fpu(struct task_struct *tsk) 177static inline void fpu_save_init(struct fpu *fpu)
166{ 178{
167 if (task_thread_info(tsk)->status & TS_XSAVE) 179 if (use_xsave())
168 xsave(tsk); 180 fpu_xsave(fpu);
169 else 181 else
170 fxsave(tsk); 182 fpu_fxsave(fpu);
183
184 fpu_clear(fpu);
185}
171 186
172 clear_fpu_state(tsk); 187static inline void __save_init_fpu(struct task_struct *tsk)
188{
189 fpu_save_init(&tsk->thread.fpu);
173 task_thread_info(tsk)->status &= ~TS_USEDFPU; 190 task_thread_info(tsk)->status &= ~TS_USEDFPU;
174} 191}
175 192
176#else /* CONFIG_X86_32 */ 193#else /* CONFIG_X86_32 */
177 194
178#ifdef CONFIG_MATH_EMULATION 195#ifdef CONFIG_MATH_EMULATION
179extern void finit_task(struct task_struct *tsk); 196extern void finit_soft_fpu(struct i387_soft_struct *soft);
180#else 197#else
181static inline void finit_task(struct task_struct *tsk) 198static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
182{
183}
184#endif 199#endif
185 200
186static inline void tolerant_fwait(void) 201static inline void tolerant_fwait(void)
@@ -216,13 +231,13 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
216/* 231/*
217 * These must be called with preempt disabled 232 * These must be called with preempt disabled
218 */ 233 */
219static inline void __save_init_fpu(struct task_struct *tsk) 234static inline void fpu_save_init(struct fpu *fpu)
220{ 235{
221 if (task_thread_info(tsk)->status & TS_XSAVE) { 236 if (use_xsave()) {
222 struct xsave_struct *xstate = &tsk->thread.xstate->xsave; 237 struct xsave_struct *xstate = &fpu->state->xsave;
223 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 238 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
224 239
225 xsave(tsk); 240 fpu_xsave(fpu);
226 241
227 /* 242 /*
228 * xsave header may indicate the init state of the FP. 243 * xsave header may indicate the init state of the FP.
@@ -246,8 +261,8 @@ static inline void __save_init_fpu(struct task_struct *tsk)
246 "fxsave %[fx]\n" 261 "fxsave %[fx]\n"
247 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", 262 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
248 X86_FEATURE_FXSR, 263 X86_FEATURE_FXSR,
249 [fx] "m" (tsk->thread.xstate->fxsave), 264 [fx] "m" (fpu->state->fxsave),
250 [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory"); 265 [fsw] "m" (fpu->state->fxsave.swd) : "memory");
251clear_state: 266clear_state:
252 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception 267 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
253 is pending. Clear the x87 state here by setting it to fixed 268 is pending. Clear the x87 state here by setting it to fixed
@@ -259,17 +274,34 @@ clear_state:
259 X86_FEATURE_FXSAVE_LEAK, 274 X86_FEATURE_FXSAVE_LEAK,
260 [addr] "m" (safe_address)); 275 [addr] "m" (safe_address));
261end: 276end:
277 ;
278}
279
280static inline void __save_init_fpu(struct task_struct *tsk)
281{
282 fpu_save_init(&tsk->thread.fpu);
262 task_thread_info(tsk)->status &= ~TS_USEDFPU; 283 task_thread_info(tsk)->status &= ~TS_USEDFPU;
263} 284}
264 285
286
265#endif /* CONFIG_X86_64 */ 287#endif /* CONFIG_X86_64 */
266 288
267static inline int restore_fpu_checking(struct task_struct *tsk) 289static inline int fpu_fxrstor_checking(struct fpu *fpu)
268{ 290{
269 if (task_thread_info(tsk)->status & TS_XSAVE) 291 return fxrstor_checking(&fpu->state->fxsave);
270 return xrstor_checking(&tsk->thread.xstate->xsave); 292}
293
294static inline int fpu_restore_checking(struct fpu *fpu)
295{
296 if (use_xsave())
297 return fpu_xrstor_checking(fpu);
271 else 298 else
272 return fxrstor_checking(&tsk->thread.xstate->fxsave); 299 return fpu_fxrstor_checking(fpu);
300}
301
302static inline int restore_fpu_checking(struct task_struct *tsk)
303{
304 return fpu_restore_checking(&tsk->thread.fpu);
273} 305}
274 306
275/* 307/*
@@ -397,30 +429,59 @@ static inline void clear_fpu(struct task_struct *tsk)
397static inline unsigned short get_fpu_cwd(struct task_struct *tsk) 429static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
398{ 430{
399 if (cpu_has_fxsr) { 431 if (cpu_has_fxsr) {
400 return tsk->thread.xstate->fxsave.cwd; 432 return tsk->thread.fpu.state->fxsave.cwd;
401 } else { 433 } else {
402 return (unsigned short)tsk->thread.xstate->fsave.cwd; 434 return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
403 } 435 }
404} 436}
405 437
406static inline unsigned short get_fpu_swd(struct task_struct *tsk) 438static inline unsigned short get_fpu_swd(struct task_struct *tsk)
407{ 439{
408 if (cpu_has_fxsr) { 440 if (cpu_has_fxsr) {
409 return tsk->thread.xstate->fxsave.swd; 441 return tsk->thread.fpu.state->fxsave.swd;
410 } else { 442 } else {
411 return (unsigned short)tsk->thread.xstate->fsave.swd; 443 return (unsigned short)tsk->thread.fpu.state->fsave.swd;
412 } 444 }
413} 445}
414 446
415static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) 447static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
416{ 448{
417 if (cpu_has_xmm) { 449 if (cpu_has_xmm) {
418 return tsk->thread.xstate->fxsave.mxcsr; 450 return tsk->thread.fpu.state->fxsave.mxcsr;
419 } else { 451 } else {
420 return MXCSR_DEFAULT; 452 return MXCSR_DEFAULT;
421 } 453 }
422} 454}
423 455
456static bool fpu_allocated(struct fpu *fpu)
457{
458 return fpu->state != NULL;
459}
460
461static inline int fpu_alloc(struct fpu *fpu)
462{
463 if (fpu_allocated(fpu))
464 return 0;
465 fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
466 if (!fpu->state)
467 return -ENOMEM;
468 WARN_ON((unsigned long)fpu->state & 15);
469 return 0;
470}
471
472static inline void fpu_free(struct fpu *fpu)
473{
474 if (fpu->state) {
475 kmem_cache_free(task_xstate_cachep, fpu->state);
476 fpu->state = NULL;
477 }
478}
479
480static inline void fpu_copy(struct fpu *dst, struct fpu *src)
481{
482 memcpy(dst->state, src->state, xstate_size);
483}
484
424#endif /* __ASSEMBLY__ */ 485#endif /* __ASSEMBLY__ */
425 486
426#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 487#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 1edbf89680f..fc1f579fb96 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,7 +6,7 @@
6#define PIT_CH0 0x40 6#define PIT_CH0 0x40
7#define PIT_CH2 0x42 7#define PIT_CH2 0x42
8 8
9extern spinlock_t i8253_lock; 9extern raw_spinlock_t i8253_lock;
10 10
11extern struct clock_event_device *global_clock_event; 11extern struct clock_event_device *global_clock_event;
12 12
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 96c2e0ad04c..88c765e1641 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -68,6 +68,8 @@ struct insn {
68 const insn_byte_t *next_byte; 68 const insn_byte_t *next_byte;
69}; 69};
70 70
71#define MAX_INSN_SIZE 16
72
71#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) 73#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
72#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) 74#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
73#define X86_MODRM_RM(modrm) ((modrm) & 0x07) 75#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 14cf526091f..280bf7fb6ab 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -7,7 +7,66 @@
7 7
8#ifdef __ASSEMBLY__ 8#ifdef __ASSEMBLY__
9 9
10#define REG_NUM_INVALID 100
11
12#define REG_TYPE_R64 0
13#define REG_TYPE_XMM 1
14#define REG_TYPE_INVALID 100
15
16 .macro R64_NUM opd r64
17 \opd = REG_NUM_INVALID
18 .ifc \r64,%rax
19 \opd = 0
20 .endif
21 .ifc \r64,%rcx
22 \opd = 1
23 .endif
24 .ifc \r64,%rdx
25 \opd = 2
26 .endif
27 .ifc \r64,%rbx
28 \opd = 3
29 .endif
30 .ifc \r64,%rsp
31 \opd = 4
32 .endif
33 .ifc \r64,%rbp
34 \opd = 5
35 .endif
36 .ifc \r64,%rsi
37 \opd = 6
38 .endif
39 .ifc \r64,%rdi
40 \opd = 7
41 .endif
42 .ifc \r64,%r8
43 \opd = 8
44 .endif
45 .ifc \r64,%r9
46 \opd = 9
47 .endif
48 .ifc \r64,%r10
49 \opd = 10
50 .endif
51 .ifc \r64,%r11
52 \opd = 11
53 .endif
54 .ifc \r64,%r12
55 \opd = 12
56 .endif
57 .ifc \r64,%r13
58 \opd = 13
59 .endif
60 .ifc \r64,%r14
61 \opd = 14
62 .endif
63 .ifc \r64,%r15
64 \opd = 15
65 .endif
66 .endm
67
10 .macro XMM_NUM opd xmm 68 .macro XMM_NUM opd xmm
69 \opd = REG_NUM_INVALID
11 .ifc \xmm,%xmm0 70 .ifc \xmm,%xmm0
12 \opd = 0 71 \opd = 0
13 .endif 72 .endif
@@ -58,13 +117,25 @@
58 .endif 117 .endif
59 .endm 118 .endm
60 119
120 .macro REG_TYPE type reg
121 R64_NUM reg_type_r64 \reg
122 XMM_NUM reg_type_xmm \reg
123 .if reg_type_r64 <> REG_NUM_INVALID
124 \type = REG_TYPE_R64
125 .elseif reg_type_xmm <> REG_NUM_INVALID
126 \type = REG_TYPE_XMM
127 .else
128 \type = REG_TYPE_INVALID
129 .endif
130 .endm
131
61 .macro PFX_OPD_SIZE 132 .macro PFX_OPD_SIZE
62 .byte 0x66 133 .byte 0x66
63 .endm 134 .endm
64 135
65 .macro PFX_REX opd1 opd2 136 .macro PFX_REX opd1 opd2 W=0
66 .if (\opd1 | \opd2) & 8 137 .if ((\opd1 | \opd2) & 8) || \W
67 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) 138 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
68 .endif 139 .endif
69 .endm 140 .endm
70 141
@@ -145,6 +216,25 @@
145 .byte 0x0f, 0x38, 0xdf 216 .byte 0x0f, 0x38, 0xdf
146 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 217 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
147 .endm 218 .endm
219
220 .macro MOVQ_R64_XMM opd1 opd2
221 REG_TYPE movq_r64_xmm_opd1_type \opd1
222 .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
223 XMM_NUM movq_r64_xmm_opd1 \opd1
224 R64_NUM movq_r64_xmm_opd2 \opd2
225 .else
226 R64_NUM movq_r64_xmm_opd1 \opd1
227 XMM_NUM movq_r64_xmm_opd2 \opd2
228 .endif
229 PFX_OPD_SIZE
230 PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
231 .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
232 .byte 0x0f, 0x7e
233 .else
234 .byte 0x0f, 0x6e
235 .endif
236 MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
237 .endm
148#endif 238#endif
149 239
150#endif 240#endif
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
new file mode 100644
index 00000000000..4470c9ad4a3
--- /dev/null
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -0,0 +1,55 @@
1#ifndef _ASM_X86_INTEL_SCU_IPC_H_
2#define _ASM_X86_INTEL_SCU_IPC_H_
3
4/* Read single register */
5int intel_scu_ipc_ioread8(u16 addr, u8 *data);
6
7/* Read two sequential registers */
8int intel_scu_ipc_ioread16(u16 addr, u16 *data);
9
10/* Read four sequential registers */
11int intel_scu_ipc_ioread32(u16 addr, u32 *data);
12
13/* Read a vector */
14int intel_scu_ipc_readv(u16 *addr, u8 *data, int len);
15
16/* Write single register */
17int intel_scu_ipc_iowrite8(u16 addr, u8 data);
18
19/* Write two sequential registers */
20int intel_scu_ipc_iowrite16(u16 addr, u16 data);
21
22/* Write four sequential registers */
23int intel_scu_ipc_iowrite32(u16 addr, u32 data);
24
25/* Write a vector */
26int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
27
28/* Update single register based on the mask */
29int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
30
31/*
32 * Indirect register read
33 * Can be used when SCCB(System Controller Configuration Block) register
34 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
35 */
36int intel_scu_ipc_register_read(u32 addr, u32 *data);
37
38/*
39 * Indirect register write
40 * Can be used when SCCB(System Controller Configuration Block) register
41 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
42 */
43int intel_scu_ipc_register_write(u32 addr, u32 data);
44
45/* Issue commands to the SCU with or without data */
46int intel_scu_ipc_simple_command(int cmd, int sub);
47int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
48 u32 *out, int outlen);
49/* I2C control api */
50int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data);
51
52/* Update FW version */
53int intel_scu_ipc_fw_update(u8 *buffer, u32 length);
54
55#endif
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 35832a03a51..63cb4096c3d 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,6 @@ struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 159extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 160 struct io_apic_irq_attr *irq_attr);
161void setup_IO_APIC_irq_extra(u32 gsi); 161void setup_IO_APIC_irq_extra(u32 gsi);
162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
163extern void ioapic_init_mappings(void); 162extern void ioapic_init_mappings(void);
164extern void ioapic_insert_resources(void); 163extern void ioapic_insert_resources(void);
165 164
@@ -180,12 +179,13 @@ extern void ioapic_write_entry(int apic, int pin,
180extern void setup_ioapic_ids_from_mpc(void); 179extern void setup_ioapic_ids_from_mpc(void);
181 180
182struct mp_ioapic_gsi{ 181struct mp_ioapic_gsi{
183 int gsi_base; 182 u32 gsi_base;
184 int gsi_end; 183 u32 gsi_end;
185}; 184};
186extern struct mp_ioapic_gsi mp_gsi_routing[]; 185extern struct mp_ioapic_gsi mp_gsi_routing[];
187int mp_find_ioapic(int gsi); 186extern u32 gsi_end;
188int mp_find_ioapic_pin(int ioapic, int gsi); 187int mp_find_ioapic(u32 gsi);
188int mp_find_ioapic_pin(int ioapic, u32 gsi);
189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
190extern void __init pre_init_apic_IRQ0(void); 190extern void __init pre_init_apic_IRQ0(void);
191 191
@@ -197,7 +197,8 @@ static const int timer_through_8259 = 0;
197static inline void ioapic_init_mappings(void) { } 197static inline void ioapic_init_mappings(void) { }
198static inline void ioapic_insert_resources(void) { } 198static inline void ioapic_insert_resources(void) { }
199static inline void probe_nr_irqs_gsi(void) { } 199static inline void probe_nr_irqs_gsi(void) { }
200static inline int mp_find_ioapic(int gsi) { return 0; } 200#define gsi_end (NR_IRQS_LEGACY - 1)
201static inline int mp_find_ioapic(u32 gsi) { return 0; }
201 202
202struct io_apic_irq_attr; 203struct io_apic_irq_attr;
203static inline int io_apic_set_pci_routing(struct device *dev, int irq, 204static inline int io_apic_set_pci_routing(struct device *dev, int irq,
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index f70e60071fe..af00bd1d208 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void); 16extern int k8_scan_nodes(void);
17 17
18#ifdef CONFIG_K8_NB 18#ifdef CONFIG_K8_NB
19extern int num_k8_northbridges;
20
19static inline struct pci_dev *node_to_k8_nb_misc(int node) 21static inline struct pci_dev *node_to_k8_nb_misc(int node)
20{ 22{
21 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; 23 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
22} 24}
25
23#else 26#else
27#define num_k8_northbridges 0
28
24static inline struct pci_dev *node_to_k8_nb_misc(int node) 29static inline struct pci_dev *node_to_k8_nb_misc(int node)
25{ 30{
26 return NULL; 31 return NULL;
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index e6c6c808489..006da3687cd 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -76,4 +76,7 @@ static inline void arch_kgdb_breakpoint(void)
76#define BREAK_INSTR_SIZE 1 76#define BREAK_INSTR_SIZE 1
77#define CACHE_FLUSH_IS_SAFE 1 77#define CACHE_FLUSH_IS_SAFE 1
78 78
79extern int kgdb_ll_trap(int cmd, const char *str,
80 struct pt_regs *regs, long err, int trap, int sig);
81
79#endif /* _ASM_X86_KGDB_H */ 82#endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4ffa345a8cc..54788253915 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -24,6 +24,7 @@
24#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/ptrace.h> 25#include <linux/ptrace.h>
26#include <linux/percpu.h> 26#include <linux/percpu.h>
27#include <asm/insn.h>
27 28
28#define __ARCH_WANT_KPROBES_INSN_SLOT 29#define __ARCH_WANT_KPROBES_INSN_SLOT
29 30
@@ -36,7 +37,6 @@ typedef u8 kprobe_opcode_t;
36#define RELATIVEJUMP_SIZE 5 37#define RELATIVEJUMP_SIZE 5
37#define RELATIVECALL_OPCODE 0xe8 38#define RELATIVECALL_OPCODE 0xe8
38#define RELATIVE_ADDR_SIZE 4 39#define RELATIVE_ADDR_SIZE 4
39#define MAX_INSN_SIZE 16
40#define MAX_STACK_SIZE 64 40#define MAX_STACK_SIZE 64
41#define MIN_STACK_SIZE(ADDR) \ 41#define MIN_STACK_SIZE(ADDR) \
42 (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \ 42 (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f6c16..ff90055c7f0 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS
24 25
25/* Architectural interrupt line count. */ 26/* Architectural interrupt line count. */
26#define KVM_NR_INTERRUPTS 256 27#define KVM_NR_INTERRUPTS 256
@@ -257,6 +258,11 @@ struct kvm_reinject_control {
257/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ 258/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
258#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 259#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
259#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 260#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
261#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
262
263/* Interrupt shadow states */
264#define KVM_X86_SHADOW_INT_MOV_SS 0x01
265#define KVM_X86_SHADOW_INT_STI 0x02
260 266
261/* for KVM_GET/SET_VCPU_EVENTS */ 267/* for KVM_GET/SET_VCPU_EVENTS */
262struct kvm_vcpu_events { 268struct kvm_vcpu_events {
@@ -271,7 +277,7 @@ struct kvm_vcpu_events {
271 __u8 injected; 277 __u8 injected;
272 __u8 nr; 278 __u8 nr;
273 __u8 soft; 279 __u8 soft;
274 __u8 pad; 280 __u8 shadow;
275 } interrupt; 281 } interrupt;
276 struct { 282 struct {
277 __u8 injected; 283 __u8 injected;
@@ -284,4 +290,13 @@ struct kvm_vcpu_events {
284 __u32 reserved[10]; 290 __u32 reserved[10];
285}; 291};
286 292
293/* for KVM_GET/SET_DEBUGREGS */
294struct kvm_debugregs {
295 __u64 db[4];
296 __u64 dr6;
297 __u64 dr7;
298 __u64 flags;
299 __u64 reserved[9];
300};
301
287#endif /* _ASM_X86_KVM_H */ 302#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54fa13b..0b2729bf207 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
11#ifndef _ASM_X86_KVM_X86_EMULATE_H 11#ifndef _ASM_X86_KVM_X86_EMULATE_H
12#define _ASM_X86_KVM_X86_EMULATE_H 12#define _ASM_X86_KVM_X86_EMULATE_H
13 13
14#include <asm/desc_defs.h>
15
14struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
15 17
16/* 18/*
@@ -63,6 +65,15 @@ struct x86_emulate_ops {
63 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 65 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64 66
65 /* 67 /*
68 * write_std: Write bytes of standard (non-emulated/special) memory.
69 * Used for descriptor writing.
70 * @addr: [IN ] Linear address to which to write.
71 * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
72 * @bytes: [IN ] Number of bytes to write to memory.
73 */
74 int (*write_std)(unsigned long addr, void *val,
75 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
76 /*
66 * fetch: Read bytes of standard (non-emulated/special) memory. 77 * fetch: Read bytes of standard (non-emulated/special) memory.
67 * Used for instruction fetch. 78 * Used for instruction fetch.
68 * @addr: [IN ] Linear address from which to read. 79 * @addr: [IN ] Linear address from which to read.
@@ -109,6 +120,23 @@ struct x86_emulate_ops {
109 unsigned int bytes, 120 unsigned int bytes,
110 struct kvm_vcpu *vcpu); 121 struct kvm_vcpu *vcpu);
111 122
123 int (*pio_in_emulated)(int size, unsigned short port, void *val,
124 unsigned int count, struct kvm_vcpu *vcpu);
125
126 int (*pio_out_emulated)(int size, unsigned short port, const void *val,
127 unsigned int count, struct kvm_vcpu *vcpu);
128
129 bool (*get_cached_descriptor)(struct desc_struct *desc,
130 int seg, struct kvm_vcpu *vcpu);
131 void (*set_cached_descriptor)(struct desc_struct *desc,
132 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
112}; 140};
113 141
114/* Type, address-of, and value of an instruction's operand. */ 142/* Type, address-of, and value of an instruction's operand. */
@@ -124,6 +152,12 @@ struct fetch_cache {
124 unsigned long end; 152 unsigned long end;
125}; 153};
126 154
155struct read_cache {
156 u8 data[1024];
157 unsigned long pos;
158 unsigned long end;
159};
160
127struct decode_cache { 161struct decode_cache {
128 u8 twobyte; 162 u8 twobyte;
129 u8 b; 163 u8 b;
@@ -139,7 +173,7 @@ struct decode_cache {
139 u8 seg_override; 173 u8 seg_override;
140 unsigned int d; 174 unsigned int d;
141 unsigned long regs[NR_VCPU_REGS]; 175 unsigned long regs[NR_VCPU_REGS];
142 unsigned long eip, eip_orig; 176 unsigned long eip;
143 /* modrm */ 177 /* modrm */
144 u8 modrm; 178 u8 modrm;
145 u8 modrm_mod; 179 u8 modrm_mod;
@@ -151,16 +185,15 @@ struct decode_cache {
151 void *modrm_ptr; 185 void *modrm_ptr;
152 unsigned long modrm_val; 186 unsigned long modrm_val;
153 struct fetch_cache fetch; 187 struct fetch_cache fetch;
188 struct read_cache io_read;
154}; 189};
155 190
156#define X86_SHADOW_INT_MOV_SS 1
157#define X86_SHADOW_INT_STI 2
158
159struct x86_emulate_ctxt { 191struct x86_emulate_ctxt {
160 /* Register state before/after emulation. */ 192 /* Register state before/after emulation. */
161 struct kvm_vcpu *vcpu; 193 struct kvm_vcpu *vcpu;
162 194
163 unsigned long eflags; 195 unsigned long eflags;
196 unsigned long eip; /* eip before instruction emulation */
164 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 197 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
165 int mode; 198 int mode;
166 u32 cs_base; 199 u32 cs_base;
@@ -168,6 +201,7 @@ struct x86_emulate_ctxt {
168 /* interruptibility state, as a result of execution of STI or MOV SS */ 201 /* interruptibility state, as a result of execution of STI or MOV SS */
169 int interruptibility; 202 int interruptibility;
170 203
204 bool restart; /* restart string instruction after writeback */
171 /* decode cache */ 205 /* decode cache */
172 struct decode_cache decode; 206 struct decode_cache decode;
173}; 207};
@@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
194 struct x86_emulate_ops *ops); 228 struct x86_emulate_ops *ops);
195int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, 229int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
196 struct x86_emulate_ops *ops); 230 struct x86_emulate_ops *ops);
231int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
232 struct x86_emulate_ops *ops,
233 u16 tss_selector, int reason,
234 bool has_error_code, u32 error_code);
197 235
198#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 236#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79ca37..76f5483cffe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,15 +171,15 @@ struct kvm_pte_chain {
171union kvm_mmu_page_role { 171union kvm_mmu_page_role {
172 unsigned word; 172 unsigned word;
173 struct { 173 struct {
174 unsigned glevels:4;
175 unsigned level:4; 174 unsigned level:4;
175 unsigned cr4_pae:1;
176 unsigned quadrant:2; 176 unsigned quadrant:2;
177 unsigned pad_for_nice_hex_output:6; 177 unsigned pad_for_nice_hex_output:6;
178 unsigned direct:1; 178 unsigned direct:1;
179 unsigned access:3; 179 unsigned access:3;
180 unsigned invalid:1; 180 unsigned invalid:1;
181 unsigned cr4_pge:1;
182 unsigned nxe:1; 181 unsigned nxe:1;
182 unsigned cr0_wp:1;
183 }; 183 };
184}; 184};
185 185
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
187 struct list_head link; 187 struct list_head link;
188 struct hlist_node hash_link; 188 struct hlist_node hash_link;
189 189
190 struct list_head oos_link;
191
192 /* 190 /*
193 * The following two entries are used to key the shadow page in the 191 * The following two entries are used to key the shadow page in the
194 * hash table. 192 * hash table.
@@ -204,9 +202,9 @@ struct kvm_mmu_page {
204 * in this shadow page. 202 * in this shadow page.
205 */ 203 */
206 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 204 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
207 int multimapped; /* More than one parent_pte? */ 205 bool multimapped; /* More than one parent_pte? */
208 int root_count; /* Currently serving as active root */
209 bool unsync; 206 bool unsync;
207 int root_count; /* Currently serving as active root */
210 unsigned int unsync_children; 208 unsigned int unsync_children;
211 union { 209 union {
212 u64 *parent_pte; /* !multimapped */ 210 u64 *parent_pte; /* !multimapped */
@@ -224,14 +222,9 @@ struct kvm_pv_mmu_op_buffer {
224 222
225struct kvm_pio_request { 223struct kvm_pio_request {
226 unsigned long count; 224 unsigned long count;
227 int cur_count;
228 gva_t guest_gva;
229 int in; 225 int in;
230 int port; 226 int port;
231 int size; 227 int size;
232 int string;
233 int down;
234 int rep;
235}; 228};
236 229
237/* 230/*
@@ -320,6 +313,7 @@ struct kvm_vcpu_arch {
320 struct kvm_queued_exception { 313 struct kvm_queued_exception {
321 bool pending; 314 bool pending;
322 bool has_error_code; 315 bool has_error_code;
316 bool reinject;
323 u8 nr; 317 u8 nr;
324 u32 error_code; 318 u32 error_code;
325 } exception; 319 } exception;
@@ -362,8 +356,8 @@ struct kvm_vcpu_arch {
362 u64 *mce_banks; 356 u64 *mce_banks;
363 357
364 /* used for guest single stepping over the given code position */ 358 /* used for guest single stepping over the given code position */
365 u16 singlestep_cs;
366 unsigned long singlestep_rip; 359 unsigned long singlestep_rip;
360
367 /* fields used by HYPER-V emulation */ 361 /* fields used by HYPER-V emulation */
368 u64 hv_vapic; 362 u64 hv_vapic;
369}; 363};
@@ -389,6 +383,7 @@ struct kvm_arch {
389 unsigned int n_free_mmu_pages; 383 unsigned int n_free_mmu_pages;
390 unsigned int n_requested_mmu_pages; 384 unsigned int n_requested_mmu_pages;
391 unsigned int n_alloc_mmu_pages; 385 unsigned int n_alloc_mmu_pages;
386 atomic_t invlpg_counter;
392 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 387 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
393 /* 388 /*
394 * Hash table of struct kvm_mmu_page. 389 * Hash table of struct kvm_mmu_page.
@@ -461,11 +456,6 @@ struct kvm_vcpu_stat {
461 u32 nmi_injections; 456 u32 nmi_injections;
462}; 457};
463 458
464struct descriptor_table {
465 u16 limit;
466 unsigned long base;
467} __attribute__((packed));
468
469struct kvm_x86_ops { 459struct kvm_x86_ops {
470 int (*cpu_has_kvm_support)(void); /* __init */ 460 int (*cpu_has_kvm_support)(void); /* __init */
471 int (*disabled_by_bios)(void); /* __init */ 461 int (*disabled_by_bios)(void); /* __init */
@@ -503,12 +493,11 @@ struct kvm_x86_ops {
503 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 493 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
504 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); 494 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
505 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); 495 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
506 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 496 void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
507 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 497 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
508 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 498 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
509 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 499 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
510 int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); 500 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
511 int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
512 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 501 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
513 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 502 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
514 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 503 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -527,7 +516,8 @@ struct kvm_x86_ops {
527 void (*set_irq)(struct kvm_vcpu *vcpu); 516 void (*set_irq)(struct kvm_vcpu *vcpu);
528 void (*set_nmi)(struct kvm_vcpu *vcpu); 517 void (*set_nmi)(struct kvm_vcpu *vcpu);
529 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 518 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
530 bool has_error_code, u32 error_code); 519 bool has_error_code, u32 error_code,
520 bool reinject);
531 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 521 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
532 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 522 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
533 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 523 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -541,6 +531,8 @@ struct kvm_x86_ops {
541 int (*get_lpage_level)(void); 531 int (*get_lpage_level)(void);
542 bool (*rdtscp_supported)(void); 532 bool (*rdtscp_supported)(void);
543 533
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535
544 const struct trace_print_flags *exit_reasons_str; 536 const struct trace_print_flags *exit_reasons_str;
545}; 537};
546 538
@@ -587,23 +579,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
587void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
588void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
589void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
590void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
591 unsigned long *rflags);
592 582
593unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
594void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
595 unsigned long *rflags);
596void kvm_enable_efer_bits(u64); 583void kvm_enable_efer_bits(u64);
597int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 584int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
598int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 585int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
599 586
600struct x86_emulate_ctxt; 587struct x86_emulate_ctxt;
601 588
602int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, 589int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
603 int size, unsigned port);
604int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
605 int size, unsigned long count, int down,
606 gva_t address, int rep, unsigned port);
607void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 590void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
608int kvm_emulate_halt(struct kvm_vcpu *vcpu); 591int kvm_emulate_halt(struct kvm_vcpu *vcpu);
609int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -616,12 +599,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
616void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
617int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
618 601
619int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); 602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code);
620 604
621void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
622void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
623void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
624void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
625unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
626void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
627void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -634,6 +620,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
634 620
635void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 621void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
636void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 622void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
623void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
624void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
637void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 625void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
638 u32 error_code); 626 u32 error_code);
639bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 627bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
@@ -649,8 +637,6 @@ int emulator_write_emulated(unsigned long addr,
649 unsigned int bytes, 637 unsigned int bytes,
650 struct kvm_vcpu *vcpu); 638 struct kvm_vcpu *vcpu);
651 639
652unsigned long segment_base(u16 selector);
653
654void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
655void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
656 const u8 *new, int bytes, 642 const u8 *new, int bytes,
@@ -675,7 +661,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
675void kvm_enable_tdp(void); 661void kvm_enable_tdp(void);
676void kvm_disable_tdp(void); 662void kvm_disable_tdp(void);
677 663
678int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
679int complete_pio(struct kvm_vcpu *vcpu); 664int complete_pio(struct kvm_vcpu *vcpu);
680bool kvm_check_iopl(struct kvm_vcpu *vcpu); 665bool kvm_check_iopl(struct kvm_vcpu *vcpu);
681 666
@@ -724,23 +709,6 @@ static inline void kvm_load_ldt(u16 sel)
724 asm("lldt %0" : : "rm"(sel)); 709 asm("lldt %0" : : "rm"(sel));
725} 710}
726 711
727static inline void kvm_get_idt(struct descriptor_table *table)
728{
729 asm("sidt %0" : "=m"(*table));
730}
731
732static inline void kvm_get_gdt(struct descriptor_table *table)
733{
734 asm("sgdt %0" : "=m"(*table));
735}
736
737static inline unsigned long kvm_read_tr_base(void)
738{
739 u16 tr;
740 asm("str %0" : "=g"(tr));
741 return segment_base(tr);
742}
743
744#ifdef CONFIG_X86_64 712#ifdef CONFIG_X86_64
745static inline unsigned long read_msr(unsigned long msr) 713static inline unsigned long read_msr(unsigned long msr)
746{ 714{
@@ -826,4 +794,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
826void kvm_define_shared_msr(unsigned index, u32 msr); 794void kvm_define_shared_msr(unsigned index, u32 msr);
827void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 795void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
828 796
797bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
798
829#endif /* _ASM_X86_KVM_HOST_H */ 799#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ffae1420e7d..05eba5e9a8e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,10 +16,23 @@
16#define KVM_FEATURE_CLOCKSOURCE 0 16#define KVM_FEATURE_CLOCKSOURCE 0
17#define KVM_FEATURE_NOP_IO_DELAY 1 17#define KVM_FEATURE_NOP_IO_DELAY 1
18#define KVM_FEATURE_MMU_OP 2 18#define KVM_FEATURE_MMU_OP 2
19/* This indicates that the new set of kvmclock msrs
20 * are available. The use of 0x11 and 0x12 is deprecated
21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3
23
24/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored.
26 */
27#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
19 28
20#define MSR_KVM_WALL_CLOCK 0x11 29#define MSR_KVM_WALL_CLOCK 0x11
21#define MSR_KVM_SYSTEM_TIME 0x12 30#define MSR_KVM_SYSTEM_TIME 0x12
22 31
32/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
35
23#define KVM_MAX_MMU_OP_BATCH 32 36#define KVM_MAX_MMU_OP_BATCH 32
24 37
25/* Operations for KVM_HC_MMU_OP */ 38/* Operations for KVM_HC_MMU_OP */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6c3fdd631ed..f32a4301c4d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
225static inline void mcheck_intel_therm_init(void) { } 225static inline void mcheck_intel_therm_init(void) { }
226#endif 226#endif
227 227
228/*
229 * Used by APEI to report memory error via /dev/mcelog
230 */
231
232struct cper_sec_mem_err;
233extern void apei_mce_report_mem_error(int corrected,
234 struct cper_sec_mem_err *mem_err);
235
228#endif /* __KERNEL__ */ 236#endif /* __KERNEL__ */
229#endif /* _ASM_X86_MCE_H */ 237#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index d8bf23a88d0..c82868e9f90 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -105,16 +105,6 @@ extern void mp_config_acpi_legacy_irqs(void);
105struct device; 105struct device;
106extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, 106extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
107 int active_high_low); 107 int active_high_low);
108extern int acpi_probe_gsi(void);
109#ifdef CONFIG_X86_IO_APIC
110extern int mp_find_ioapic(int gsi);
111extern int mp_find_ioapic_pin(int ioapic, int gsi);
112#endif
113#else /* !CONFIG_ACPI: */
114static inline int acpi_probe_gsi(void)
115{
116 return 0;
117}
118#endif /* CONFIG_ACPI */ 108#endif /* CONFIG_ACPI */
119 109
120#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) 110#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 00000000000..79ce5685ab6
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_MSHYPER_H
2#define _ASM_X86_MSHYPER_H
3
4#include <linux/types.h>
5#include <asm/hyperv.h>
6
7struct ms_hyperv_info {
8 u32 features;
9 u32 hints;
10};
11
12extern struct ms_hyperv_info ms_hyperv;
13
14#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4604e6a54d3..b49d8ca228f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -71,11 +71,14 @@
71#define MSR_IA32_LASTINTTOIP 0x000001de 71#define MSR_IA32_LASTINTTOIP 0x000001de
72 72
73/* DEBUGCTLMSR bits (others vary by model): */ 73/* DEBUGCTLMSR bits (others vary by model): */
74#define _DEBUGCTLMSR_LBR 0 /* last branch recording */ 74#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
75#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */ 75#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
76 76#define DEBUGCTLMSR_TR (1UL << 6)
77#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR) 77#define DEBUGCTLMSR_BTS (1UL << 7)
78#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF) 78#define DEBUGCTLMSR_BTINT (1UL << 8)
79#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
80#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
81#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
79 82
80#define MSR_IA32_MC0_CTL 0x00000400 83#define MSR_IA32_MC0_CTL 0x00000400
81#define MSR_IA32_MC0_STATUS 0x00000401 84#define MSR_IA32_MC0_STATUS 0x00000401
@@ -199,8 +202,9 @@
199#define MSR_IA32_EBL_CR_POWERON 0x0000002a 202#define MSR_IA32_EBL_CR_POWERON 0x0000002a
200#define MSR_IA32_FEATURE_CONTROL 0x0000003a 203#define MSR_IA32_FEATURE_CONTROL 0x0000003a
201 204
202#define FEATURE_CONTROL_LOCKED (1<<0) 205#define FEATURE_CONTROL_LOCKED (1<<0)
203#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) 206#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
207#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
204 208
205#define MSR_IA32_APICBASE 0x0000001b 209#define MSR_IA32_APICBASE 0x0000001b
206#define MSR_IA32_APICBASE_BSP (1<<8) 210#define MSR_IA32_APICBASE_BSP (1<<8)
@@ -232,6 +236,8 @@
232 236
233#define MSR_IA32_MISC_ENABLE 0x000001a0 237#define MSR_IA32_MISC_ENABLE 0x000001a0
234 238
239#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
240
235/* MISC_ENABLE bits: architectural */ 241/* MISC_ENABLE bits: architectural */
236#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 242#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
237#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 243#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
@@ -359,6 +365,8 @@
359#define MSR_P4_U2L_ESCR0 0x000003b0 365#define MSR_P4_U2L_ESCR0 0x000003b0
360#define MSR_P4_U2L_ESCR1 0x000003b1 366#define MSR_P4_U2L_ESCR1 0x000003b1
361 367
368#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
369
362/* Intel Core-based CPU performance counters */ 370/* Intel Core-based CPU performance counters */
363#define MSR_CORE_PERF_FIXED_CTR0 0x00000309 371#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
364#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a 372#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 1a0422348d6..8d8797eae5d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -83,7 +83,7 @@ struct irq_routing_table {
83 83
84extern unsigned int pcibios_irq_mask; 84extern unsigned int pcibios_irq_mask;
85 85
86extern spinlock_t pci_config_lock; 86extern raw_spinlock_t pci_config_lock;
87 87
88extern int (*pcibios_enable_irq)(struct pci_dev *dev); 88extern int (*pcibios_enable_irq)(struct pci_dev *dev);
89extern void (*pcibios_disable_irq)(struct pci_dev *dev); 89extern void (*pcibios_disable_irq)(struct pci_dev *dev);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272dfd8b..0797e748d28 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -105,7 +105,7 @@ do { \
105 105
106/* 106/*
107 * Generate a percpu add to memory instruction and optimize code 107 * Generate a percpu add to memory instruction and optimize code
108 * if a one is added or subtracted. 108 * if one is added or subtracted.
109 */ 109 */
110#define percpu_add_op(var, val) \ 110#define percpu_add_op(var, val) \
111do { \ 111do { \
@@ -190,6 +190,29 @@ do { \
190 pfo_ret__; \ 190 pfo_ret__; \
191}) 191})
192 192
193#define percpu_unary_op(op, var) \
194({ \
195 switch (sizeof(var)) { \
196 case 1: \
197 asm(op "b "__percpu_arg(0) \
198 : "+m" (var)); \
199 break; \
200 case 2: \
201 asm(op "w "__percpu_arg(0) \
202 : "+m" (var)); \
203 break; \
204 case 4: \
205 asm(op "l "__percpu_arg(0) \
206 : "+m" (var)); \
207 break; \
208 case 8: \
209 asm(op "q "__percpu_arg(0) \
210 : "+m" (var)); \
211 break; \
212 default: __bad_percpu_size(); \
213 } \
214})
215
193/* 216/*
194 * percpu_read() makes gcc load the percpu variable every time it is 217 * percpu_read() makes gcc load the percpu variable every time it is
195 * accessed while percpu_read_stable() allows the value to be cached. 218 * accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do { \
207#define percpu_and(var, val) percpu_to_op("and", var, val) 230#define percpu_and(var, val) percpu_to_op("and", var, val)
208#define percpu_or(var, val) percpu_to_op("or", var, val) 231#define percpu_or(var, val) percpu_to_op("or", var, val)
209#define percpu_xor(var, val) percpu_to_op("xor", var, val) 232#define percpu_xor(var, val) percpu_to_op("xor", var, val)
233#define percpu_inc(var) percpu_unary_op("inc", var)
210 234
211#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 235#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
212#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 236#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index db6109a885a..254883d0c7e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,7 +5,7 @@
5 * Performance event hw details: 5 * Performance event hw details:
6 */ 6 */
7 7
8#define X86_PMC_MAX_GENERIC 8 8#define X86_PMC_MAX_GENERIC 32
9#define X86_PMC_MAX_FIXED 3 9#define X86_PMC_MAX_FIXED 3
10 10
11#define X86_PMC_IDX_GENERIC 0 11#define X86_PMC_IDX_GENERIC 0
@@ -18,39 +18,31 @@
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20 20
21#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22) 21#define ARCH_PERFMON_EVENTSEL_EVENT 0x000000FFULL
22#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) 22#define ARCH_PERFMON_EVENTSEL_UMASK 0x0000FF00ULL
23#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) 23#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16)
24#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) 24#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
25#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) 25#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18)
26 26#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20)
27/* 27#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21)
28 * Includes eventsel and unit mask as well: 28#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
29 */ 29#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
30 30#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
31 31
32#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL 32#define AMD64_EVENTSEL_EVENT \
33#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL 33 (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
34#define INTEL_ARCH_EDGE_MASK 0x00040000ULL 34#define INTEL_ARCH_EVENT_MASK \
35#define INTEL_ARCH_INV_MASK 0x00800000ULL 35 (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
36#define INTEL_ARCH_CNT_MASK 0xFF000000ULL 36
37#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK) 37#define X86_RAW_EVENT_MASK \
38 38 (ARCH_PERFMON_EVENTSEL_EVENT | \
39/* 39 ARCH_PERFMON_EVENTSEL_UMASK | \
40 * filter mask to validate fixed counter events. 40 ARCH_PERFMON_EVENTSEL_EDGE | \
41 * the following filters disqualify for fixed counters: 41 ARCH_PERFMON_EVENTSEL_INV | \
42 * - inv 42 ARCH_PERFMON_EVENTSEL_CMASK)
43 * - edge 43#define AMD64_RAW_EVENT_MASK \
44 * - cnt-mask 44 (X86_RAW_EVENT_MASK | \
45 * The other filters are supported by fixed counters. 45 AMD64_EVENTSEL_EVENT)
46 * The any-thread option is supported starting with v3.
47 */
48#define INTEL_ARCH_FIXED_MASK \
49 (INTEL_ARCH_CNT_MASK| \
50 INTEL_ARCH_INV_MASK| \
51 INTEL_ARCH_EDGE_MASK|\
52 INTEL_ARCH_UNIT_MASK|\
53 INTEL_ARCH_EVENT_MASK)
54 46
55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 47#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
56#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 48#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -67,7 +59,7 @@
67union cpuid10_eax { 59union cpuid10_eax {
68 struct { 60 struct {
69 unsigned int version_id:8; 61 unsigned int version_id:8;
70 unsigned int num_events:8; 62 unsigned int num_counters:8;
71 unsigned int bit_width:8; 63 unsigned int bit_width:8;
72 unsigned int mask_length:8; 64 unsigned int mask_length:8;
73 } split; 65 } split;
@@ -76,7 +68,7 @@ union cpuid10_eax {
76 68
77union cpuid10_edx { 69union cpuid10_edx {
78 struct { 70 struct {
79 unsigned int num_events_fixed:4; 71 unsigned int num_counters_fixed:4;
80 unsigned int reserved:28; 72 unsigned int reserved:28;
81 } split; 73 } split;
82 unsigned int full; 74 unsigned int full;
@@ -136,6 +128,18 @@ extern void perf_events_lapic_init(void);
136 128
137#define PERF_EVENT_INDEX_OFFSET 0 129#define PERF_EVENT_INDEX_OFFSET 0
138 130
131/*
132 * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
133 * This flag is otherwise unused and ABI specified to be 0, so nobody should
134 * care what we do with it.
135 */
136#define PERF_EFLAGS_EXACT (1UL << 3)
137
138struct pt_regs;
139extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
140extern unsigned long perf_misc_flags(struct pt_regs *regs);
141#define perf_misc_flags(regs) perf_misc_flags(regs)
142
139#else 143#else
140static inline void init_hw_perf_events(void) { } 144static inline void init_hw_perf_events(void) { }
141static inline void perf_events_lapic_init(void) { } 145static inline void perf_events_lapic_init(void) { }
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 00000000000..64a8ebff06f
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,795 @@
1/*
2 * Netburst Perfomance Events (P4, old Xeon)
3 */
4
5#ifndef PERF_EVENT_P4_H
6#define PERF_EVENT_P4_H
7
8#include <linux/cpu.h>
9#include <linux/bitops.h>
10
11/*
12 * NetBurst has perfomance MSRs shared between
13 * threads if HT is turned on, ie for both logical
14 * processors (mem: in turn in Atom with HT support
15 * perf-MSRs are not shared and every thread has its
16 * own perf-MSRs set)
17 */
18#define ARCH_P4_TOTAL_ESCR (46)
19#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18)
22#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2)
23
24#define P4_ESCR_EVENT_MASK 0x7e000000U
25#define P4_ESCR_EVENT_SHIFT 25
26#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
27#define P4_ESCR_EVENTMASK_SHIFT 9
28#define P4_ESCR_TAG_MASK 0x000001e0U
29#define P4_ESCR_TAG_SHIFT 5
30#define P4_ESCR_TAG_ENABLE 0x00000010U
31#define P4_ESCR_T0_OS 0x00000008U
32#define P4_ESCR_T0_USR 0x00000004U
33#define P4_ESCR_T1_OS 0x00000002U
34#define P4_ESCR_T1_USR 0x00000001U
35
36#define P4_ESCR_EVENT(v) ((v) << P4_ESCR_EVENT_SHIFT)
37#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
38#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
39
40/* Non HT mask */
41#define P4_ESCR_MASK \
42 (P4_ESCR_EVENT_MASK | \
43 P4_ESCR_EVENTMASK_MASK | \
44 P4_ESCR_TAG_MASK | \
45 P4_ESCR_TAG_ENABLE | \
46 P4_ESCR_T0_OS | \
47 P4_ESCR_T0_USR)
48
49/* HT mask */
50#define P4_ESCR_MASK_HT \
51 (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
52
53#define P4_CCCR_OVF 0x80000000U
54#define P4_CCCR_CASCADE 0x40000000U
55#define P4_CCCR_OVF_PMI_T0 0x04000000U
56#define P4_CCCR_OVF_PMI_T1 0x08000000U
57#define P4_CCCR_FORCE_OVF 0x02000000U
58#define P4_CCCR_EDGE 0x01000000U
59#define P4_CCCR_THRESHOLD_MASK 0x00f00000U
60#define P4_CCCR_THRESHOLD_SHIFT 20
61#define P4_CCCR_COMPLEMENT 0x00080000U
62#define P4_CCCR_COMPARE 0x00040000U
63#define P4_CCCR_ESCR_SELECT_MASK 0x0000e000U
64#define P4_CCCR_ESCR_SELECT_SHIFT 13
65#define P4_CCCR_ENABLE 0x00001000U
66#define P4_CCCR_THREAD_SINGLE 0x00010000U
67#define P4_CCCR_THREAD_BOTH 0x00020000U
68#define P4_CCCR_THREAD_ANY 0x00030000U
69#define P4_CCCR_RESERVED 0x00000fffU
70
71#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
72#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
73
74/* Custom bits in reerved CCCR area */
75#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU
76
77
78/* Non HT mask */
79#define P4_CCCR_MASK \
80 (P4_CCCR_OVF | \
81 P4_CCCR_CASCADE | \
82 P4_CCCR_OVF_PMI_T0 | \
83 P4_CCCR_FORCE_OVF | \
84 P4_CCCR_EDGE | \
85 P4_CCCR_THRESHOLD_MASK | \
86 P4_CCCR_COMPLEMENT | \
87 P4_CCCR_COMPARE | \
88 P4_CCCR_ESCR_SELECT_MASK | \
89 P4_CCCR_ENABLE)
90
91/* HT mask */
92#define P4_CCCR_MASK_HT \
93 (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
94
95#define P4_GEN_ESCR_EMASK(class, name, bit) \
96 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
97#define P4_ESCR_EMASK_BIT(class, name) class##__##name
98
99/*
100 * config field is 64bit width and consists of
101 * HT << 63 | ESCR << 32 | CCCR
102 * where HT is HyperThreading bit (since ESCR
103 * has it reserved we may use it for own purpose)
104 *
105 * note that this is NOT the addresses of respective
106 * ESCR and CCCR but rather an only packed value should
107 * be unpacked and written to a proper addresses
108 *
109 * the base idea is to pack as much info as
110 * possible
111 */
112#define p4_config_pack_escr(v) (((u64)(v)) << 32)
113#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
114#define p4_config_unpack_escr(v) (((u64)(v)) >> 32)
115#define p4_config_unpack_cccr(v) (((u64)(v)) & 0xffffffffULL)
116
117#define p4_config_unpack_emask(v) \
118 ({ \
119 u32 t = p4_config_unpack_escr((v)); \
120 t = t & P4_ESCR_EVENTMASK_MASK; \
121 t = t >> P4_ESCR_EVENTMASK_SHIFT; \
122 t; \
123 })
124
125#define p4_config_unpack_event(v) \
126 ({ \
127 u32 t = p4_config_unpack_escr((v)); \
128 t = t & P4_ESCR_EVENT_MASK; \
129 t = t >> P4_ESCR_EVENT_SHIFT; \
130 t; \
131 })
132
133#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
134
135#define P4_CONFIG_HT_SHIFT 63
136#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
137
138static inline bool p4_is_event_cascaded(u64 config)
139{
140 u32 cccr = p4_config_unpack_cccr(config);
141 return !!(cccr & P4_CCCR_CASCADE);
142}
143
144static inline int p4_ht_config_thread(u64 config)
145{
146 return !!(config & P4_CONFIG_HT);
147}
148
149static inline u64 p4_set_ht_bit(u64 config)
150{
151 return config | P4_CONFIG_HT;
152}
153
154static inline u64 p4_clear_ht_bit(u64 config)
155{
156 return config & ~P4_CONFIG_HT;
157}
158
159static inline int p4_ht_active(void)
160{
161#ifdef CONFIG_SMP
162 return smp_num_siblings > 1;
163#endif
164 return 0;
165}
166
167static inline int p4_ht_thread(int cpu)
168{
169#ifdef CONFIG_SMP
170 if (smp_num_siblings == 2)
171 return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
172#endif
173 return 0;
174}
175
176static inline int p4_should_swap_ts(u64 config, int cpu)
177{
178 return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
179}
180
181static inline u32 p4_default_cccr_conf(int cpu)
182{
183 /*
184 * Note that P4_CCCR_THREAD_ANY is "required" on
185 * non-HT machines (on HT machines we count TS events
186 * regardless the state of second logical processor
187 */
188 u32 cccr = P4_CCCR_THREAD_ANY;
189
190 if (!p4_ht_thread(cpu))
191 cccr |= P4_CCCR_OVF_PMI_T0;
192 else
193 cccr |= P4_CCCR_OVF_PMI_T1;
194
195 return cccr;
196}
197
198static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
199{
200 u32 escr = 0;
201
202 if (!p4_ht_thread(cpu)) {
203 if (!exclude_os)
204 escr |= P4_ESCR_T0_OS;
205 if (!exclude_usr)
206 escr |= P4_ESCR_T0_USR;
207 } else {
208 if (!exclude_os)
209 escr |= P4_ESCR_T1_OS;
210 if (!exclude_usr)
211 escr |= P4_ESCR_T1_USR;
212 }
213
214 return escr;
215}
216
217enum P4_EVENTS {
218 P4_EVENT_TC_DELIVER_MODE,
219 P4_EVENT_BPU_FETCH_REQUEST,
220 P4_EVENT_ITLB_REFERENCE,
221 P4_EVENT_MEMORY_CANCEL,
222 P4_EVENT_MEMORY_COMPLETE,
223 P4_EVENT_LOAD_PORT_REPLAY,
224 P4_EVENT_STORE_PORT_REPLAY,
225 P4_EVENT_MOB_LOAD_REPLAY,
226 P4_EVENT_PAGE_WALK_TYPE,
227 P4_EVENT_BSQ_CACHE_REFERENCE,
228 P4_EVENT_IOQ_ALLOCATION,
229 P4_EVENT_IOQ_ACTIVE_ENTRIES,
230 P4_EVENT_FSB_DATA_ACTIVITY,
231 P4_EVENT_BSQ_ALLOCATION,
232 P4_EVENT_BSQ_ACTIVE_ENTRIES,
233 P4_EVENT_SSE_INPUT_ASSIST,
234 P4_EVENT_PACKED_SP_UOP,
235 P4_EVENT_PACKED_DP_UOP,
236 P4_EVENT_SCALAR_SP_UOP,
237 P4_EVENT_SCALAR_DP_UOP,
238 P4_EVENT_64BIT_MMX_UOP,
239 P4_EVENT_128BIT_MMX_UOP,
240 P4_EVENT_X87_FP_UOP,
241 P4_EVENT_TC_MISC,
242 P4_EVENT_GLOBAL_POWER_EVENTS,
243 P4_EVENT_TC_MS_XFER,
244 P4_EVENT_UOP_QUEUE_WRITES,
245 P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
246 P4_EVENT_RETIRED_BRANCH_TYPE,
247 P4_EVENT_RESOURCE_STALL,
248 P4_EVENT_WC_BUFFER,
249 P4_EVENT_B2B_CYCLES,
250 P4_EVENT_BNR,
251 P4_EVENT_SNOOP,
252 P4_EVENT_RESPONSE,
253 P4_EVENT_FRONT_END_EVENT,
254 P4_EVENT_EXECUTION_EVENT,
255 P4_EVENT_REPLAY_EVENT,
256 P4_EVENT_INSTR_RETIRED,
257 P4_EVENT_UOPS_RETIRED,
258 P4_EVENT_UOP_TYPE,
259 P4_EVENT_BRANCH_RETIRED,
260 P4_EVENT_MISPRED_BRANCH_RETIRED,
261 P4_EVENT_X87_ASSIST,
262 P4_EVENT_MACHINE_CLEAR,
263 P4_EVENT_INSTR_COMPLETED,
264};
265
266#define P4_OPCODE(event) event##_OPCODE
267#define P4_OPCODE_ESEL(opcode) ((opcode & 0x00ff) >> 0)
268#define P4_OPCODE_EVNT(opcode) ((opcode & 0xff00) >> 8)
269#define P4_OPCODE_PACK(event, sel) (((event) << 8) | sel)
270
271/*
272 * Comments below the event represent ESCR restriction
273 * for this event and counter index per ESCR
274 *
275 * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
276 * processor builds (family 0FH, models 01H-02H). These MSRs
277 * are not available on later versions, so that we don't use
278 * them completely
279 *
280 * Also note that CCCR1 do not have P4_CCCR_ENABLE bit properly
281 * working so that we should not use this CCCR and respective
282 * counter as result
283 */
284enum P4_EVENT_OPCODES {
285 P4_OPCODE(P4_EVENT_TC_DELIVER_MODE) = P4_OPCODE_PACK(0x01, 0x01),
286 /*
287 * MSR_P4_TC_ESCR0: 4, 5
288 * MSR_P4_TC_ESCR1: 6, 7
289 */
290
291 P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST) = P4_OPCODE_PACK(0x03, 0x00),
292 /*
293 * MSR_P4_BPU_ESCR0: 0, 1
294 * MSR_P4_BPU_ESCR1: 2, 3
295 */
296
297 P4_OPCODE(P4_EVENT_ITLB_REFERENCE) = P4_OPCODE_PACK(0x18, 0x03),
298 /*
299 * MSR_P4_ITLB_ESCR0: 0, 1
300 * MSR_P4_ITLB_ESCR1: 2, 3
301 */
302
303 P4_OPCODE(P4_EVENT_MEMORY_CANCEL) = P4_OPCODE_PACK(0x02, 0x05),
304 /*
305 * MSR_P4_DAC_ESCR0: 8, 9
306 * MSR_P4_DAC_ESCR1: 10, 11
307 */
308
309 P4_OPCODE(P4_EVENT_MEMORY_COMPLETE) = P4_OPCODE_PACK(0x08, 0x02),
310 /*
311 * MSR_P4_SAAT_ESCR0: 8, 9
312 * MSR_P4_SAAT_ESCR1: 10, 11
313 */
314
315 P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY) = P4_OPCODE_PACK(0x04, 0x02),
316 /*
317 * MSR_P4_SAAT_ESCR0: 8, 9
318 * MSR_P4_SAAT_ESCR1: 10, 11
319 */
320
321 P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY) = P4_OPCODE_PACK(0x05, 0x02),
322 /*
323 * MSR_P4_SAAT_ESCR0: 8, 9
324 * MSR_P4_SAAT_ESCR1: 10, 11
325 */
326
327 P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY) = P4_OPCODE_PACK(0x03, 0x02),
328 /*
329 * MSR_P4_MOB_ESCR0: 0, 1
330 * MSR_P4_MOB_ESCR1: 2, 3
331 */
332
333 P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE) = P4_OPCODE_PACK(0x01, 0x04),
334 /*
335 * MSR_P4_PMH_ESCR0: 0, 1
336 * MSR_P4_PMH_ESCR1: 2, 3
337 */
338
339 P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE) = P4_OPCODE_PACK(0x0c, 0x07),
340 /*
341 * MSR_P4_BSU_ESCR0: 0, 1
342 * MSR_P4_BSU_ESCR1: 2, 3
343 */
344
345 P4_OPCODE(P4_EVENT_IOQ_ALLOCATION) = P4_OPCODE_PACK(0x03, 0x06),
346 /*
347 * MSR_P4_FSB_ESCR0: 0, 1
348 * MSR_P4_FSB_ESCR1: 2, 3
349 */
350
351 P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x1a, 0x06),
352 /*
353 * MSR_P4_FSB_ESCR1: 2, 3
354 */
355
356 P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY) = P4_OPCODE_PACK(0x17, 0x06),
357 /*
358 * MSR_P4_FSB_ESCR0: 0, 1
359 * MSR_P4_FSB_ESCR1: 2, 3
360 */
361
362 P4_OPCODE(P4_EVENT_BSQ_ALLOCATION) = P4_OPCODE_PACK(0x05, 0x07),
363 /*
364 * MSR_P4_BSU_ESCR0: 0, 1
365 */
366
367 P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x06, 0x07),
368 /*
369 * NOTE: no ESCR name in docs, it's guessed
370 * MSR_P4_BSU_ESCR1: 2, 3
371 */
372
373 P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST) = P4_OPCODE_PACK(0x34, 0x01),
374 /*
375 * MSR_P4_FIRM_ESCR0: 8, 9
376 * MSR_P4_FIRM_ESCR1: 10, 11
377 */
378
379 P4_OPCODE(P4_EVENT_PACKED_SP_UOP) = P4_OPCODE_PACK(0x08, 0x01),
380 /*
381 * MSR_P4_FIRM_ESCR0: 8, 9
382 * MSR_P4_FIRM_ESCR1: 10, 11
383 */
384
385 P4_OPCODE(P4_EVENT_PACKED_DP_UOP) = P4_OPCODE_PACK(0x0c, 0x01),
386 /*
387 * MSR_P4_FIRM_ESCR0: 8, 9
388 * MSR_P4_FIRM_ESCR1: 10, 11
389 */
390
391 P4_OPCODE(P4_EVENT_SCALAR_SP_UOP) = P4_OPCODE_PACK(0x0a, 0x01),
392 /*
393 * MSR_P4_FIRM_ESCR0: 8, 9
394 * MSR_P4_FIRM_ESCR1: 10, 11
395 */
396
397 P4_OPCODE(P4_EVENT_SCALAR_DP_UOP) = P4_OPCODE_PACK(0x0e, 0x01),
398 /*
399 * MSR_P4_FIRM_ESCR0: 8, 9
400 * MSR_P4_FIRM_ESCR1: 10, 11
401 */
402
403 P4_OPCODE(P4_EVENT_64BIT_MMX_UOP) = P4_OPCODE_PACK(0x02, 0x01),
404 /*
405 * MSR_P4_FIRM_ESCR0: 8, 9
406 * MSR_P4_FIRM_ESCR1: 10, 11
407 */
408
409 P4_OPCODE(P4_EVENT_128BIT_MMX_UOP) = P4_OPCODE_PACK(0x1a, 0x01),
410 /*
411 * MSR_P4_FIRM_ESCR0: 8, 9
412 * MSR_P4_FIRM_ESCR1: 10, 11
413 */
414
415 P4_OPCODE(P4_EVENT_X87_FP_UOP) = P4_OPCODE_PACK(0x04, 0x01),
416 /*
417 * MSR_P4_FIRM_ESCR0: 8, 9
418 * MSR_P4_FIRM_ESCR1: 10, 11
419 */
420
421 P4_OPCODE(P4_EVENT_TC_MISC) = P4_OPCODE_PACK(0x06, 0x01),
422 /*
423 * MSR_P4_TC_ESCR0: 4, 5
424 * MSR_P4_TC_ESCR1: 6, 7
425 */
426
427 P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS) = P4_OPCODE_PACK(0x13, 0x06),
428 /*
429 * MSR_P4_FSB_ESCR0: 0, 1
430 * MSR_P4_FSB_ESCR1: 2, 3
431 */
432
433 P4_OPCODE(P4_EVENT_TC_MS_XFER) = P4_OPCODE_PACK(0x05, 0x00),
434 /*
435 * MSR_P4_MS_ESCR0: 4, 5
436 * MSR_P4_MS_ESCR1: 6, 7
437 */
438
439 P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES) = P4_OPCODE_PACK(0x09, 0x00),
440 /*
441 * MSR_P4_MS_ESCR0: 4, 5
442 * MSR_P4_MS_ESCR1: 6, 7
443 */
444
445 P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x05, 0x02),
446 /*
447 * MSR_P4_TBPU_ESCR0: 4, 5
448 * MSR_P4_TBPU_ESCR1: 6, 7
449 */
450
451 P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x04, 0x02),
452 /*
453 * MSR_P4_TBPU_ESCR0: 4, 5
454 * MSR_P4_TBPU_ESCR1: 6, 7
455 */
456
457 P4_OPCODE(P4_EVENT_RESOURCE_STALL) = P4_OPCODE_PACK(0x01, 0x01),
458 /*
459 * MSR_P4_ALF_ESCR0: 12, 13, 16
460 * MSR_P4_ALF_ESCR1: 14, 15, 17
461 */
462
463 P4_OPCODE(P4_EVENT_WC_BUFFER) = P4_OPCODE_PACK(0x05, 0x05),
464 /*
465 * MSR_P4_DAC_ESCR0: 8, 9
466 * MSR_P4_DAC_ESCR1: 10, 11
467 */
468
469 P4_OPCODE(P4_EVENT_B2B_CYCLES) = P4_OPCODE_PACK(0x16, 0x03),
470 /*
471 * MSR_P4_FSB_ESCR0: 0, 1
472 * MSR_P4_FSB_ESCR1: 2, 3
473 */
474
475 P4_OPCODE(P4_EVENT_BNR) = P4_OPCODE_PACK(0x08, 0x03),
476 /*
477 * MSR_P4_FSB_ESCR0: 0, 1
478 * MSR_P4_FSB_ESCR1: 2, 3
479 */
480
481 P4_OPCODE(P4_EVENT_SNOOP) = P4_OPCODE_PACK(0x06, 0x03),
482 /*
483 * MSR_P4_FSB_ESCR0: 0, 1
484 * MSR_P4_FSB_ESCR1: 2, 3
485 */
486
487 P4_OPCODE(P4_EVENT_RESPONSE) = P4_OPCODE_PACK(0x04, 0x03),
488 /*
489 * MSR_P4_FSB_ESCR0: 0, 1
490 * MSR_P4_FSB_ESCR1: 2, 3
491 */
492
493 P4_OPCODE(P4_EVENT_FRONT_END_EVENT) = P4_OPCODE_PACK(0x08, 0x05),
494 /*
495 * MSR_P4_CRU_ESCR2: 12, 13, 16
496 * MSR_P4_CRU_ESCR3: 14, 15, 17
497 */
498
499 P4_OPCODE(P4_EVENT_EXECUTION_EVENT) = P4_OPCODE_PACK(0x0c, 0x05),
500 /*
501 * MSR_P4_CRU_ESCR2: 12, 13, 16
502 * MSR_P4_CRU_ESCR3: 14, 15, 17
503 */
504
505 P4_OPCODE(P4_EVENT_REPLAY_EVENT) = P4_OPCODE_PACK(0x09, 0x05),
506 /*
507 * MSR_P4_CRU_ESCR2: 12, 13, 16
508 * MSR_P4_CRU_ESCR3: 14, 15, 17
509 */
510
511 P4_OPCODE(P4_EVENT_INSTR_RETIRED) = P4_OPCODE_PACK(0x02, 0x04),
512 /*
513 * MSR_P4_CRU_ESCR0: 12, 13, 16
514 * MSR_P4_CRU_ESCR1: 14, 15, 17
515 */
516
517 P4_OPCODE(P4_EVENT_UOPS_RETIRED) = P4_OPCODE_PACK(0x01, 0x04),
518 /*
519 * MSR_P4_CRU_ESCR0: 12, 13, 16
520 * MSR_P4_CRU_ESCR1: 14, 15, 17
521 */
522
523 P4_OPCODE(P4_EVENT_UOP_TYPE) = P4_OPCODE_PACK(0x02, 0x02),
524 /*
525 * MSR_P4_RAT_ESCR0: 12, 13, 16
526 * MSR_P4_RAT_ESCR1: 14, 15, 17
527 */
528
529 P4_OPCODE(P4_EVENT_BRANCH_RETIRED) = P4_OPCODE_PACK(0x06, 0x05),
530 /*
531 * MSR_P4_CRU_ESCR2: 12, 13, 16
532 * MSR_P4_CRU_ESCR3: 14, 15, 17
533 */
534
535 P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED) = P4_OPCODE_PACK(0x03, 0x04),
536 /*
537 * MSR_P4_CRU_ESCR0: 12, 13, 16
538 * MSR_P4_CRU_ESCR1: 14, 15, 17
539 */
540
541 P4_OPCODE(P4_EVENT_X87_ASSIST) = P4_OPCODE_PACK(0x03, 0x05),
542 /*
543 * MSR_P4_CRU_ESCR2: 12, 13, 16
544 * MSR_P4_CRU_ESCR3: 14, 15, 17
545 */
546
547 P4_OPCODE(P4_EVENT_MACHINE_CLEAR) = P4_OPCODE_PACK(0x02, 0x05),
548 /*
549 * MSR_P4_CRU_ESCR2: 12, 13, 16
550 * MSR_P4_CRU_ESCR3: 14, 15, 17
551 */
552
553 P4_OPCODE(P4_EVENT_INSTR_COMPLETED) = P4_OPCODE_PACK(0x07, 0x04),
554 /*
555 * MSR_P4_CRU_ESCR0: 12, 13, 16
556 * MSR_P4_CRU_ESCR1: 14, 15, 17
557 */
558};
559
560/*
561 * a caller should use P4_ESCR_EMASK_NAME helper to
562 * pick the EventMask needed, for example
563 *
564 * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
565 */
566enum P4_ESCR_EMASKS {
567 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
568 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
569 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
570 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
571 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
572 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
573 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
574
575 P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
576
577 P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
578 P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
579 P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
580
581 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
582 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
583
584 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
585 P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
586
587 P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
588
589 P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
590
591 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
592 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
593 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
594 P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
595
596 P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
597 P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
598
599 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
600 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
601 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
602 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
603 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
604 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
605 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
606 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
607 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
608
609 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
610 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
611 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
612 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
613 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
614 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
615 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
616 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
617 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
618 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
619 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
620
621 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
622 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
623 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
624 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
625 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
626 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
627 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
628 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
629 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
630 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
631 P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
632
633 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
634 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
635 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
636 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
637 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
638 P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
639
640 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
641 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
642 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
643 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
644 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
645 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
646 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
647 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
648 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
649 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
650 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
651 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
652 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
653
654 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
655 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
656 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
657 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
658 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
659 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
660 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
661 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
662 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
663 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
664 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
665 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
666 P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
667
668 P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
669
670 P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
671
672 P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
673
674 P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
675
676 P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
677
678 P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
679
680 P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
681
682 P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
683
684 P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
685
686 P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
687
688 P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
689
690 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
691 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
692 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
693
694 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
695 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
696 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
697 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
698
699 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
700 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
701 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
702 P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
703
704 P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
705
706 P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
707 P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
708
709 P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
710 P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
711
712 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
713 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
714 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
715 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
716 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
717 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
718 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
719 P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
720
721 P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
722 P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
723
724 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
725 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
726 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
727 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
728
729 P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
730 P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
731
732 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
733 P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
734
735 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
736 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
737 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
738 P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
739
740 P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
741
742 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
743 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
744 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
745 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
746 P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
747
748 P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
749 P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
750 P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
751
752 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
753 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
754};
755
756/* P4 PEBS: stale for a while */
757#define P4_PEBS_METRIC_MASK 0x00001fffU
758#define P4_PEBS_UOB_TAG 0x01000000U
759#define P4_PEBS_ENABLE 0x02000000U
760
761/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
762#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001
763#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002
764#define P4_PEBS__dtlb_load_miss_retired 0x3000004
765#define P4_PEBS__dtlb_store_miss_retired 0x3000004
766#define P4_PEBS__dtlb_all_miss_retired 0x3000004
767#define P4_PEBS__tagged_mispred_branch 0x3018000
768#define P4_PEBS__mob_load_replay_retired 0x3000200
769#define P4_PEBS__split_load_retired 0x3000400
770#define P4_PEBS__split_store_retired 0x3000400
771
772#define P4_VERT__1stl_cache_load_miss_retired 0x0000001
773#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001
774#define P4_VERT__dtlb_load_miss_retired 0x0000001
775#define P4_VERT__dtlb_store_miss_retired 0x0000002
776#define P4_VERT__dtlb_all_miss_retired 0x0000003
777#define P4_VERT__tagged_mispred_branch 0x0000010
778#define P4_VERT__mob_load_replay_retired 0x0000001
779#define P4_VERT__split_load_retired 0x0000001
780#define P4_VERT__split_store_retired 0x0000002
781
782enum P4_CACHE_EVENTS {
783 P4_CACHE__NONE,
784
785 P4_CACHE__1stl_cache_load_miss_retired,
786 P4_CACHE__2ndl_cache_load_miss_retired,
787 P4_CACHE__dtlb_load_miss_retired,
788 P4_CACHE__dtlb_store_miss_retired,
789 P4_CACHE__itlb_reference_hit,
790 P4_CACHE__itlb_reference_miss,
791
792 P4_CACHE__MAX
793};
794
795#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703..7e5c6a60b8e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,7 +21,6 @@ struct mm_struct;
21#include <asm/msr.h> 21#include <asm/msr.h>
22#include <asm/desc_defs.h> 22#include <asm/desc_defs.h>
23#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/ds.h>
25 24
26#include <linux/personality.h> 25#include <linux/personality.h>
27#include <linux/cpumask.h> 26#include <linux/cpumask.h>
@@ -29,6 +28,7 @@ struct mm_struct;
29#include <linux/threads.h> 28#include <linux/threads.h>
30#include <linux/math64.h> 29#include <linux/math64.h>
31#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/err.h>
32 32
33#define HBP_NUM 4 33#define HBP_NUM 4
34/* 34/*
@@ -113,7 +113,6 @@ struct cpuinfo_x86 {
113 /* Index into per_cpu list: */ 113 /* Index into per_cpu list: */
114 u16 cpu_index; 114 u16 cpu_index;
115#endif 115#endif
116 unsigned int x86_hyper_vendor;
117} __attribute__((__aligned__(SMP_CACHE_BYTES))); 116} __attribute__((__aligned__(SMP_CACHE_BYTES)));
118 117
119#define X86_VENDOR_INTEL 0 118#define X86_VENDOR_INTEL 0
@@ -127,9 +126,6 @@ struct cpuinfo_x86 {
127 126
128#define X86_VENDOR_UNKNOWN 0xff 127#define X86_VENDOR_UNKNOWN 0xff
129 128
130#define X86_HYPER_VENDOR_NONE 0
131#define X86_HYPER_VENDOR_VMWARE 1
132
133/* 129/*
134 * capabilities of CPUs 130 * capabilities of CPUs
135 */ 131 */
@@ -380,6 +376,10 @@ union thread_xstate {
380 struct xsave_struct xsave; 376 struct xsave_struct xsave;
381}; 377};
382 378
379struct fpu {
380 union thread_xstate *state;
381};
382
383#ifdef CONFIG_X86_64 383#ifdef CONFIG_X86_64
384DECLARE_PER_CPU(struct orig_ist, orig_ist); 384DECLARE_PER_CPU(struct orig_ist, orig_ist);
385 385
@@ -457,7 +457,7 @@ struct thread_struct {
457 unsigned long trap_no; 457 unsigned long trap_no;
458 unsigned long error_code; 458 unsigned long error_code;
459 /* floating point and extended processor state */ 459 /* floating point and extended processor state */
460 union thread_xstate *xstate; 460 struct fpu fpu;
461#ifdef CONFIG_X86_32 461#ifdef CONFIG_X86_32
462 /* Virtual 86 mode info */ 462 /* Virtual 86 mode info */
463 struct vm86_struct __user *vm86_info; 463 struct vm86_struct __user *vm86_info;
@@ -473,10 +473,6 @@ struct thread_struct {
473 unsigned long iopl; 473 unsigned long iopl;
474 /* Max allowed port in the bitmap, in bytes: */ 474 /* Max allowed port in the bitmap, in bytes: */
475 unsigned io_bitmap_max; 475 unsigned io_bitmap_max;
476/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
477 unsigned long debugctlmsr;
478 /* Debug Store context; see asm/ds.h */
479 struct ds_context *ds_ctx;
480}; 476};
481 477
482static inline unsigned long native_get_debugreg(int regno) 478static inline unsigned long native_get_debugreg(int regno)
@@ -793,6 +789,8 @@ static inline void wbinvd_halt(void)
793extern void enable_sep_cpu(void); 789extern void enable_sep_cpu(void);
794extern int sysenter_setup(void); 790extern int sysenter_setup(void);
795 791
792extern void early_trap_init(void);
793
796/* Defined in head.S */ 794/* Defined in head.S */
797extern struct desc_ptr early_gdt_descr; 795extern struct desc_ptr early_gdt_descr;
798 796
@@ -803,7 +801,7 @@ extern void cpu_init(void);
803 801
804static inline unsigned long get_debugctlmsr(void) 802static inline unsigned long get_debugctlmsr(void)
805{ 803{
806 unsigned long debugctlmsr = 0; 804 unsigned long debugctlmsr = 0;
807 805
808#ifndef CONFIG_X86_DEBUGCTLMSR 806#ifndef CONFIG_X86_DEBUGCTLMSR
809 if (boot_cpu_data.x86 < 6) 807 if (boot_cpu_data.x86 < 6)
@@ -811,21 +809,6 @@ static inline unsigned long get_debugctlmsr(void)
811#endif 809#endif
812 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); 810 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
813 811
814 return debugctlmsr;
815}
816
817static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
818{
819 u64 debugctlmsr = 0;
820 u32 val1, val2;
821
822#ifndef CONFIG_X86_DEBUGCTLMSR
823 if (boot_cpu_data.x86 < 6)
824 return 0;
825#endif
826 rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
827 debugctlmsr = val1 | ((u64)val2 << 32);
828
829 return debugctlmsr; 812 return debugctlmsr;
830} 813}
831 814
@@ -838,18 +821,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
838 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); 821 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
839} 822}
840 823
841static inline void update_debugctlmsr_on_cpu(int cpu,
842 unsigned long debugctlmsr)
843{
844#ifndef CONFIG_X86_DEBUGCTLMSR
845 if (boot_cpu_data.x86 < 6)
846 return;
847#endif
848 wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
849 (u32)((u64)debugctlmsr),
850 (u32)((u64)debugctlmsr >> 32));
851}
852
853/* 824/*
854 * from system description table in BIOS. Mostly for MCA use, but 825 * from system description table in BIOS. Mostly for MCA use, but
855 * others may find it useful: 826 * others may find it useful:
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 86723035a51..52b098a6eeb 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -82,61 +82,6 @@
82 82
83#ifndef __ASSEMBLY__ 83#ifndef __ASSEMBLY__
84#include <linux/types.h> 84#include <linux/types.h>
85 85#endif
86/* configuration/status structure used in PTRACE_BTS_CONFIG and
87 PTRACE_BTS_STATUS commands.
88*/
89struct ptrace_bts_config {
90 /* requested or actual size of BTS buffer in bytes */
91 __u32 size;
92 /* bitmask of below flags */
93 __u32 flags;
94 /* buffer overflow signal */
95 __u32 signal;
96 /* actual size of bts_struct in bytes */
97 __u32 bts_size;
98};
99#endif /* __ASSEMBLY__ */
100
101#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
102#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
103#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
104 instead of wrapping around */
105#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
106
107#define PTRACE_BTS_CONFIG 40
108/* Configure branch trace recording.
109 ADDR points to a struct ptrace_bts_config.
110 DATA gives the size of that buffer.
111 A new buffer is allocated, if requested in the flags.
112 An overflow signal may only be requested for new buffers.
113 Returns the number of bytes read.
114*/
115#define PTRACE_BTS_STATUS 41
116/* Return the current configuration in a struct ptrace_bts_config
117 pointed to by ADDR; DATA gives the size of that buffer.
118 Returns the number of bytes written.
119*/
120#define PTRACE_BTS_SIZE 42
121/* Return the number of available BTS records for draining.
122 DATA and ADDR are ignored.
123*/
124#define PTRACE_BTS_GET 43
125/* Get a single BTS record.
126 DATA defines the index into the BTS array, where 0 is the newest
127 entry, and higher indices refer to older entries.
128 ADDR is pointing to struct bts_struct (see asm/ds.h).
129*/
130#define PTRACE_BTS_CLEAR 44
131/* Clear the BTS buffer.
132 DATA and ADDR are ignored.
133*/
134#define PTRACE_BTS_DRAIN 45
135/* Read all available BTS records and clear the buffer.
136 ADDR points to an array of struct bts_struct.
137 DATA gives the size of that buffer.
138 BTS records are read from oldest to newest.
139 Returns number of BTS records drained.
140*/
141 86
142#endif /* _ASM_X86_PTRACE_ABI_H */ 87#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 69a686a7dff..78cd1ea9450 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -289,12 +289,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
289extern int do_set_thread_area(struct task_struct *p, int idx, 289extern int do_set_thread_area(struct task_struct *p, int idx,
290 struct user_desc __user *info, int can_allocate); 290 struct user_desc __user *info, int can_allocate);
291 291
292#ifdef CONFIG_X86_PTRACE_BTS
293extern void ptrace_bts_untrace(struct task_struct *tsk);
294
295#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
296#endif /* CONFIG_X86_PTRACE_BTS */
297
298#endif /* __KERNEL__ */ 292#endif /* __KERNEL__ */
299 293
300#endif /* !__ASSEMBLY__ */ 294#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f262..35f2d1948ad 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
29 u64 system_time; 29 u64 system_time;
30 u32 tsc_to_system_mul; 30 u32 tsc_to_system_mul;
31 s8 tsc_shift; 31 s8 tsc_shift;
32 u8 pad[3]; 32 u8 flags;
33 u8 pad[2];
33} __attribute__((__packed__)); /* 32 bytes */ 34} __attribute__((__packed__)); /* 32 bytes */
34 35
35struct pvclock_wall_clock { 36struct pvclock_wall_clock {
@@ -38,5 +39,6 @@ struct pvclock_wall_clock {
38 u32 nsec; 39 u32 nsec;
39} __attribute__((__packed__)); 40} __attribute__((__packed__));
40 41
42#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
41#endif /* __ASSEMBLY__ */ 43#endif /* __ASSEMBLY__ */
42#endif /* _ASM_X86_PVCLOCK_ABI_H */ 44#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8c..cd02f324aa6 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
6 6
7/* some helper functions for xen and kvm pv clock sources */ 7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); 8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9void pvclock_set_flags(u8 flags);
9unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); 10unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
10void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 11void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
11 struct pvclock_vcpu_time_info *vcpu, 12 struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/include/asm/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
deleted file mode 100644
index c8e9c8bed3d..00000000000
--- a/arch/x86/include/asm/rdc321x_defs.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#define PFX "rdc321x: "
2
3/* General purpose configuration and data registers */
4#define RDC3210_CFGREG_ADDR 0x0CF8
5#define RDC3210_CFGREG_DATA 0x0CFC
6
7#define RDC321X_GPIO_CTRL_REG1 0x48
8#define RDC321X_GPIO_CTRL_REG2 0x84
9#define RDC321X_GPIO_DATA_REG1 0x4c
10#define RDC321X_GPIO_DATA_REG2 0x88
11
12#define RDC321X_MAX_GPIO 58
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index 75af592677e..fb0b1874396 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -1,8 +1,9 @@
1#ifndef _ASM_X86_SCATTERLIST_H 1#ifndef _ASM_X86_SCATTERLIST_H
2#define _ASM_X86_SCATTERLIST_H 2#define _ASM_X86_SCATTERLIST_H
3 3
4#define ISA_DMA_THRESHOLD (0x00ffffff)
5
6#include <asm-generic/scatterlist.h> 4#include <asm-generic/scatterlist.h>
7 5
6#define ISA_DMA_THRESHOLD (0x00ffffff)
7#define ARCH_HAS_SG_CHAIN
8
8#endif /* _ASM_X86_SCATTERLIST_H */ 9#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd2fa4..0e831059ac5 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 81 u32 event_inj_err;
82 u64 nested_cr3; 82 u64 nested_cr3;
83 u64 lbr_ctl; 83 u64 lbr_ctl;
84 u8 reserved_5[832]; 84 u64 reserved_5;
85 u64 next_rip;
86 u8 reserved_6[816];
85}; 87};
86 88
87 89
@@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
115#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) 117#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
116#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) 118#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
117 119
120#define SVM_VM_CR_VALID_MASK 0x001fULL
121#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
122#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
123
118struct __attribute__ ((__packed__)) vmcb_seg { 124struct __attribute__ ((__packed__)) vmcb_seg {
119 u16 selector; 125 u16 selector;
120 u16 attrib; 126 u16 attrib;
@@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
238 244
239#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 245#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
240#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
241 248
242#define SVM_EXIT_READ_CR0 0x000 249#define SVM_EXIT_READ_CR0 0x000
243#define SVM_EXIT_READ_CR3 0x003 250#define SVM_EXIT_READ_CR3 0x003
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e96..f0b6e5dbc5a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,13 +87,12 @@ struct thread_info {
87#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
88#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
89#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
90#define TIF_MEMDIE 20 90#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
91#define TIF_DEBUG 21 /* uses debug registers */ 91#define TIF_DEBUG 21 /* uses debug registers */
92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
93#define TIF_FREEZE 23 /* is freezing for suspend */ 93#define TIF_FREEZE 23 /* is freezing for suspend */
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ 96#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ 97#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
99 98
@@ -115,8 +114,7 @@ struct thread_info {
115#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 114#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
116#define _TIF_FREEZE (1 << TIF_FREEZE) 115#define _TIF_FREEZE (1 << TIF_FREEZE)
117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 117#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) 118#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
121#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) 119#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
122 120
@@ -147,7 +145,7 @@ struct thread_info {
147 145
148/* flags to check in __switch_to() */ 146/* flags to check in __switch_to() */
149#define _TIF_WORK_CTXSW \ 147#define _TIF_WORK_CTXSW \
150 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 148 (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
151 149
152#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) 150#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
153#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 151#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
@@ -241,10 +239,9 @@ static inline struct thread_info *current_thread_info(void)
241#define TS_USEDFPU 0x0001 /* FPU was used by this task 239#define TS_USEDFPU 0x0001 /* FPU was used by this task
242 this quantum (SMP) */ 240 this quantum (SMP) */
243#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ 241#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
244#define TS_POLLING 0x0004 /* true if in idle loop 242#define TS_POLLING 0x0004 /* idle task polling need_resched,
245 and not sleeping */ 243 skip sending interrupt */
246#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ 244#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
247#define TS_XSAVE 0x0010 /* Use xsave/xrstor */
248 245
249#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) 246#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
250 247
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c5087d79658..21899cc31e5 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -53,33 +53,29 @@
53extern int cpu_to_node_map[]; 53extern int cpu_to_node_map[];
54 54
55/* Returns the number of the node containing CPU 'cpu' */ 55/* Returns the number of the node containing CPU 'cpu' */
56static inline int cpu_to_node(int cpu) 56static inline int __cpu_to_node(int cpu)
57{ 57{
58 return cpu_to_node_map[cpu]; 58 return cpu_to_node_map[cpu];
59} 59}
60#define early_cpu_to_node(cpu) cpu_to_node(cpu) 60#define early_cpu_to_node __cpu_to_node
61#define cpu_to_node __cpu_to_node
61 62
62#else /* CONFIG_X86_64 */ 63#else /* CONFIG_X86_64 */
63 64
64/* Mappings between logical cpu number and node number */ 65/* Mappings between logical cpu number and node number */
65DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 66DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
66 67
67/* Returns the number of the current Node. */
68DECLARE_PER_CPU(int, node_number);
69#define numa_node_id() percpu_read(node_number)
70
71#ifdef CONFIG_DEBUG_PER_CPU_MAPS 68#ifdef CONFIG_DEBUG_PER_CPU_MAPS
72extern int cpu_to_node(int cpu); 69/*
70 * override generic percpu implementation of cpu_to_node
71 */
72extern int __cpu_to_node(int cpu);
73#define cpu_to_node __cpu_to_node
74
73extern int early_cpu_to_node(int cpu); 75extern int early_cpu_to_node(int cpu);
74 76
75#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 77#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
76 78
77/* Returns the number of the node containing CPU 'cpu' */
78static inline int cpu_to_node(int cpu)
79{
80 return per_cpu(x86_cpu_to_node_map, cpu);
81}
82
83/* Same function but used if called before per_cpu areas are setup */ 79/* Same function but used if called before per_cpu areas are setup */
84static inline int early_cpu_to_node(int cpu) 80static inline int early_cpu_to_node(int cpu)
85{ 81{
@@ -170,6 +166,10 @@ static inline int numa_node_id(void)
170{ 166{
171 return 0; 167 return 0;
172} 168}
169/*
170 * indicate override:
171 */
172#define numa_node_id numa_node_id
173 173
174static inline int early_cpu_to_node(int cpu) 174static inline int early_cpu_to_node(int cpu)
175{ 175{
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4da91ad69e0..f66cda56781 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -79,7 +79,7 @@ static inline int get_si_code(unsigned long condition)
79 79
80extern int panic_on_unrecovered_nmi; 80extern int panic_on_unrecovered_nmi;
81 81
82void math_error(void __user *); 82void math_error(struct pt_regs *, int, int);
83void math_emulate(struct math_emu_info *); 83void math_emulate(struct math_emu_info *);
84#ifndef CONFIG_X86_32 84#ifndef CONFIG_X86_32
85asmlinkage void smp_thermal_interrupt(void); 85asmlinkage void smp_thermal_interrupt(void);
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index b414d2b401f..aa558ac0306 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -27,13 +27,14 @@
27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. 27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
28 * 28 *
29 * We will use 31 sets, one for sending BAU messages from each of the 32 29 * We will use 31 sets, one for sending BAU messages from each of the 32
30 * cpu's on the node. 30 * cpu's on the uvhub.
31 * 31 *
32 * TLB shootdown will use the first of the 8 descriptors of each set. 32 * TLB shootdown will use the first of the 8 descriptors of each set.
33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). 33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
34 */ 34 */
35 35
36#define UV_ITEMS_PER_DESCRIPTOR 8 36#define UV_ITEMS_PER_DESCRIPTOR 8
37#define MAX_BAU_CONCURRENT 3
37#define UV_CPUS_PER_ACT_STATUS 32 38#define UV_CPUS_PER_ACT_STATUS 32
38#define UV_ACT_STATUS_MASK 0x3 39#define UV_ACT_STATUS_MASK 0x3
39#define UV_ACT_STATUS_SIZE 2 40#define UV_ACT_STATUS_SIZE 2
@@ -45,6 +46,9 @@
45#define UV_PAYLOADQ_PNODE_SHIFT 49 46#define UV_PAYLOADQ_PNODE_SHIFT 49
46#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" 47#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
47#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) 48#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
49#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
50#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
51#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
48 52
49/* 53/*
50 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 54 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
@@ -55,15 +59,29 @@
55#define DESC_STATUS_SOURCE_TIMEOUT 3 59#define DESC_STATUS_SOURCE_TIMEOUT 3
56 60
57/* 61/*
58 * source side thresholds at which message retries print a warning 62 * source side threshholds at which message retries print a warning
59 */ 63 */
60#define SOURCE_TIMEOUT_LIMIT 20 64#define SOURCE_TIMEOUT_LIMIT 20
61#define DESTINATION_TIMEOUT_LIMIT 20 65#define DESTINATION_TIMEOUT_LIMIT 20
62 66
63/* 67/*
68 * misc. delays, in microseconds
69 */
70#define THROTTLE_DELAY 10
71#define TIMEOUT_DELAY 10
72#define BIOS_TO 1000
73/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
74
75/*
76 * threshholds at which to use IPI to free resources
77 */
78#define PLUGSB4RESET 100
79#define TIMEOUTSB4RESET 100
80
81/*
64 * number of entries in the destination side payload queue 82 * number of entries in the destination side payload queue
65 */ 83 */
66#define DEST_Q_SIZE 17 84#define DEST_Q_SIZE 20
67/* 85/*
68 * number of destination side software ack resources 86 * number of destination side software ack resources
69 */ 87 */
@@ -72,9 +90,10 @@
72/* 90/*
73 * completion statuses for sending a TLB flush message 91 * completion statuses for sending a TLB flush message
74 */ 92 */
75#define FLUSH_RETRY 1 93#define FLUSH_RETRY_PLUGGED 1
76#define FLUSH_GIVEUP 2 94#define FLUSH_RETRY_TIMEOUT 2
77#define FLUSH_COMPLETE 3 95#define FLUSH_GIVEUP 3
96#define FLUSH_COMPLETE 4
78 97
79/* 98/*
80 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) 99 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -86,14 +105,14 @@
86 * 'base_dest_nodeid' field of the header corresponds to the 105 * 'base_dest_nodeid' field of the header corresponds to the
87 * destination nodeID associated with that specified bit. 106 * destination nodeID associated with that specified bit.
88 */ 107 */
89struct bau_target_nodemask { 108struct bau_target_uvhubmask {
90 unsigned long bits[BITS_TO_LONGS(256)]; 109 unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
91}; 110};
92 111
93/* 112/*
94 * mask of cpu's on a node 113 * mask of cpu's on a uvhub
95 * (during initialization we need to check that unsigned long has 114 * (during initialization we need to check that unsigned long has
96 * enough bits for max. cpu's per node) 115 * enough bits for max. cpu's per uvhub)
97 */ 116 */
98struct bau_local_cpumask { 117struct bau_local_cpumask {
99 unsigned long bits; 118 unsigned long bits;
@@ -135,8 +154,8 @@ struct bau_msg_payload {
135struct bau_msg_header { 154struct bau_msg_header {
136 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 155 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
137 /* bits 5:0 */ 156 /* bits 5:0 */
138 unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ 157 unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */
139 /* bits 20:6 */ /* first bit in node_map */ 158 /* bits 20:6 */ /* first bit in uvhub map */
140 unsigned int command:8; /* message type */ 159 unsigned int command:8; /* message type */
141 /* bits 28:21 */ 160 /* bits 28:21 */
142 /* 0x38: SN3net EndPoint Message */ 161 /* 0x38: SN3net EndPoint Message */
@@ -146,26 +165,38 @@ struct bau_msg_header {
146 unsigned int rsvd_2:9; /* must be zero */ 165 unsigned int rsvd_2:9; /* must be zero */
147 /* bits 40:32 */ 166 /* bits 40:32 */
148 /* Suppl_A is 56-41 */ 167 /* Suppl_A is 56-41 */
149 unsigned int payload_2a:8;/* becomes byte 16 of msg */ 168 unsigned int sequence:16;/* message sequence number */
150 /* bits 48:41 */ /* not currently using */ 169 /* bits 56:41 */ /* becomes bytes 16-17 of msg */
151 unsigned int payload_2b:8;/* becomes byte 17 of msg */
152 /* bits 56:49 */ /* not currently using */
153 /* Address field (96:57) is never used as an 170 /* Address field (96:57) is never used as an
154 address (these are address bits 42:3) */ 171 address (these are address bits 42:3) */
172
155 unsigned int rsvd_3:1; /* must be zero */ 173 unsigned int rsvd_3:1; /* must be zero */
156 /* bit 57 */ 174 /* bit 57 */
157 /* address bits 27:4 are payload */ 175 /* address bits 27:4 are payload */
158 /* these 24 bits become bytes 12-14 of msg */ 176 /* these next 24 (58-81) bits become bytes 12-14 of msg */
177
178 /* bits 65:58 land in byte 12 */
159 unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ 179 unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
160 /* bit 58 */ 180 /* bit 58 */
161 181 unsigned int msg_type:3; /* software type of the message*/
162 unsigned int payload_1a:5;/* not currently used */ 182 /* bits 61:59 */
163 /* bits 63:59 */ 183 unsigned int canceled:1; /* message canceled, resource to be freed*/
164 unsigned int payload_1b:8;/* not currently used */ 184 /* bit 62 */
165 /* bits 71:64 */ 185 unsigned int payload_1a:1;/* not currently used */
166 unsigned int payload_1c:8;/* not currently used */ 186 /* bit 63 */
167 /* bits 79:72 */ 187 unsigned int payload_1b:2;/* not currently used */
168 unsigned int payload_1d:2;/* not currently used */ 188 /* bits 65:64 */
189
190 /* bits 73:66 land in byte 13 */
191 unsigned int payload_1ca:6;/* not currently used */
192 /* bits 71:66 */
193 unsigned int payload_1c:2;/* not currently used */
194 /* bits 73:72 */
195
196 /* bits 81:74 land in byte 14 */
197 unsigned int payload_1d:6;/* not currently used */
198 /* bits 79:74 */
199 unsigned int payload_1e:2;/* not currently used */
169 /* bits 81:80 */ 200 /* bits 81:80 */
170 201
171 unsigned int rsvd_4:7; /* must be zero */ 202 unsigned int rsvd_4:7; /* must be zero */
@@ -178,7 +209,7 @@ struct bau_msg_header {
178 /* bits 95:90 */ 209 /* bits 95:90 */
179 unsigned int rsvd_6:5; /* must be zero */ 210 unsigned int rsvd_6:5; /* must be zero */
180 /* bits 100:96 */ 211 /* bits 100:96 */
181 unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */ 212 unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */
182 /* bit 101*/ 213 /* bit 101*/
183 unsigned int fairness:3;/* usually zero */ 214 unsigned int fairness:3;/* usually zero */
184 /* bits 104:102 */ 215 /* bits 104:102 */
@@ -191,13 +222,18 @@ struct bau_msg_header {
191 /* bits 127:107 */ 222 /* bits 127:107 */
192}; 223};
193 224
225/* see msg_type: */
226#define MSG_NOOP 0
227#define MSG_REGULAR 1
228#define MSG_RETRY 2
229
194/* 230/*
195 * The activation descriptor: 231 * The activation descriptor:
196 * The format of the message to send, plus all accompanying control 232 * The format of the message to send, plus all accompanying control
197 * Should be 64 bytes 233 * Should be 64 bytes
198 */ 234 */
199struct bau_desc { 235struct bau_desc {
200 struct bau_target_nodemask distribution; 236 struct bau_target_uvhubmask distribution;
201 /* 237 /*
202 * message template, consisting of header and payload: 238 * message template, consisting of header and payload:
203 */ 239 */
@@ -237,19 +273,25 @@ struct bau_payload_queue_entry {
237 unsigned short acknowledge_count; /* filled in by destination */ 273 unsigned short acknowledge_count; /* filled in by destination */
238 /* 16 bits, bytes 10-11 */ 274 /* 16 bits, bytes 10-11 */
239 275
240 unsigned short replied_to:1; /* sent as 0 by the source */ 276 /* these next 3 bytes come from bits 58-81 of the message header */
241 /* 1 bit */ 277 unsigned short replied_to:1; /* sent as 0 by the source */
242 unsigned short unused1:7; /* not currently using */ 278 unsigned short msg_type:3; /* software message type */
243 /* 7 bits: byte 12) */ 279 unsigned short canceled:1; /* sent as 0 by the source */
280 unsigned short unused1:3; /* not currently using */
281 /* byte 12 */
244 282
245 unsigned char unused2[2]; /* not currently using */ 283 unsigned char unused2a; /* not currently using */
246 /* bytes 13-14 */ 284 /* byte 13 */
285 unsigned char unused2; /* not currently using */
286 /* byte 14 */
247 287
248 unsigned char sw_ack_vector; /* filled in by the hardware */ 288 unsigned char sw_ack_vector; /* filled in by the hardware */
249 /* byte 15 (bits 127:120) */ 289 /* byte 15 (bits 127:120) */
250 290
251 unsigned char unused4[3]; /* not currently using bytes 17-19 */ 291 unsigned short sequence; /* message sequence number */
252 /* bytes 17-19 */ 292 /* bytes 16-17 */
293 unsigned char unused4[2]; /* not currently using bytes 18-19 */
294 /* bytes 18-19 */
253 295
254 int number_of_cpus; /* filled in at destination */ 296 int number_of_cpus; /* filled in at destination */
255 /* 32 bits, bytes 20-23 (aligned) */ 297 /* 32 bits, bytes 20-23 (aligned) */
@@ -259,63 +301,93 @@ struct bau_payload_queue_entry {
259}; 301};
260 302
261/* 303/*
262 * one for every slot in the destination payload queue 304 * one per-cpu; to locate the software tables
263 */
264struct bau_msg_status {
265 struct bau_local_cpumask seen_by; /* map of cpu's */
266};
267
268/*
269 * one for every slot in the destination software ack resources
270 */
271struct bau_sw_ack_status {
272 struct bau_payload_queue_entry *msg; /* associated message */
273 int watcher; /* cpu monitoring, or -1 */
274};
275
276/*
277 * one on every node and per-cpu; to locate the software tables
278 */ 305 */
279struct bau_control { 306struct bau_control {
280 struct bau_desc *descriptor_base; 307 struct bau_desc *descriptor_base;
281 struct bau_payload_queue_entry *bau_msg_head;
282 struct bau_payload_queue_entry *va_queue_first; 308 struct bau_payload_queue_entry *va_queue_first;
283 struct bau_payload_queue_entry *va_queue_last; 309 struct bau_payload_queue_entry *va_queue_last;
284 struct bau_msg_status *msg_statuses; 310 struct bau_payload_queue_entry *bau_msg_head;
285 int *watching; /* pointer to array */ 311 struct bau_control *uvhub_master;
312 struct bau_control *socket_master;
313 unsigned long timeout_interval;
314 atomic_t active_descriptor_count;
315 int max_concurrent;
316 int max_concurrent_constant;
317 int retry_message_scans;
318 int plugged_tries;
319 int timeout_tries;
320 int ipi_attempts;
321 int conseccompletes;
322 short cpu;
323 short uvhub_cpu;
324 short uvhub;
325 short cpus_in_socket;
326 short cpus_in_uvhub;
327 unsigned short message_number;
328 unsigned short uvhub_quiesce;
329 short socket_acknowledge_count[DEST_Q_SIZE];
330 cycles_t send_message;
331 spinlock_t masks_lock;
332 spinlock_t uvhub_lock;
333 spinlock_t queue_lock;
286}; 334};
287 335
288/* 336/*
289 * This structure is allocated per_cpu for UV TLB shootdown statistics. 337 * This structure is allocated per_cpu for UV TLB shootdown statistics.
290 */ 338 */
291struct ptc_stats { 339struct ptc_stats {
292 unsigned long ptc_i; /* number of IPI-style flushes */ 340 /* sender statistics */
293 unsigned long requestor; /* number of nodes this cpu sent to */ 341 unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
294 unsigned long requestee; /* times cpu was remotely requested */ 342 unsigned long s_requestor; /* number of shootdown requests */
295 unsigned long alltlb; /* times all tlb's on this cpu were flushed */ 343 unsigned long s_stimeout; /* source side timeouts */
296 unsigned long onetlb; /* times just one tlb on this cpu was flushed */ 344 unsigned long s_dtimeout; /* destination side timeouts */
297 unsigned long s_retry; /* retries on source side timeouts */ 345 unsigned long s_time; /* time spent in sending side */
298 unsigned long d_retry; /* retries on destination side timeouts */ 346 unsigned long s_retriesok; /* successful retries */
299 unsigned long sflush; /* cycles spent in uv_flush_tlb_others */ 347 unsigned long s_ntargcpu; /* number of cpus targeted */
300 unsigned long dflush; /* cycles spent on destination side */ 348 unsigned long s_ntarguvhub; /* number of uvhubs targeted */
301 unsigned long retriesok; /* successes on retries */ 349 unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */
302 unsigned long nomsg; /* interrupts with no message */ 350 unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */
303 unsigned long multmsg; /* interrupts with multiple messages */ 351 unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */
304 unsigned long ntargeted;/* nodes targeted */ 352 unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */
353 unsigned long s_ntarguvhub1; /* number of times == 1 target hub */
354 unsigned long s_resets_plug; /* ipi-style resets from plug state */
355 unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
356 unsigned long s_busy; /* status stayed busy past s/w timer */
357 unsigned long s_throttles; /* waits in throttle */
358 unsigned long s_retry_messages; /* retry broadcasts */
359 /* destination statistics */
360 unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
361 unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
362 unsigned long d_multmsg; /* interrupts with multiple messages */
363 unsigned long d_nomsg; /* interrupts with no message */
364 unsigned long d_time; /* time spent on destination side */
365 unsigned long d_requestee; /* number of messages processed */
366 unsigned long d_retries; /* number of retry messages processed */
367 unsigned long d_canceled; /* number of messages canceled by retries */
368 unsigned long d_nocanceled; /* retries that found nothing to cancel */
369 unsigned long d_resets; /* number of ipi-style requests processed */
370 unsigned long d_rcanceled; /* number of messages canceled by resets */
305}; 371};
306 372
307static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp) 373static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
308{ 374{
309 return constant_test_bit(node, &dstp->bits[0]); 375 return constant_test_bit(uvhub, &dstp->bits[0]);
310} 376}
311static inline void bau_node_set(int node, struct bau_target_nodemask *dstp) 377static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
312{ 378{
313 __set_bit(node, &dstp->bits[0]); 379 __set_bit(uvhub, &dstp->bits[0]);
314} 380}
315static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits) 381static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
382 int nbits)
316{ 383{
317 bitmap_zero(&dstp->bits[0], nbits); 384 bitmap_zero(&dstp->bits[0], nbits);
318} 385}
386static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp)
387{
388 return bitmap_weight((unsigned long *)&dstp->bits[0],
389 UV_DISTRIBUTION_SIZE);
390}
319 391
320static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) 392static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
321{ 393{
@@ -328,4 +400,35 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
328extern void uv_bau_message_intr1(void); 400extern void uv_bau_message_intr1(void);
329extern void uv_bau_timeout_intr1(void); 401extern void uv_bau_timeout_intr1(void);
330 402
403struct atomic_short {
404 short counter;
405};
406
407/**
408 * atomic_read_short - read a short atomic variable
409 * @v: pointer of type atomic_short
410 *
411 * Atomically reads the value of @v.
412 */
413static inline int atomic_read_short(const struct atomic_short *v)
414{
415 return v->counter;
416}
417
418/**
419 * atomic_add_short_return - add and return a short int
420 * @i: short value to add
421 * @v: pointer of type atomic_short
422 *
423 * Atomically adds @i to @v and returns @i + @v
424 */
425static inline int atomic_add_short_return(short i, struct atomic_short *v)
426{
427 short __i = i;
428 asm volatile(LOCK_PREFIX "xaddw %0, %1"
429 : "+r" (i), "+m" (v->counter)
430 : : "memory");
431 return i + __i;
432}
433
331#endif /* _ASM_X86_UV_UV_BAU_H */ 434#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 14cc74ba5d2..bf6b88ef8ee 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -307,7 +307,7 @@ static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset
307 * Access Global MMR space using the MMR space located at the top of physical 307 * Access Global MMR space using the MMR space located at the top of physical
308 * memory. 308 * memory.
309 */ 309 */
310static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) 310static inline volatile void __iomem *uv_global_mmr64_address(int pnode, unsigned long offset)
311{ 311{
312 return __va(UV_GLOBAL_MMR64_BASE | 312 return __va(UV_GLOBAL_MMR64_BASE |
313 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); 313 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 2cae46c7c8a..b2f2d2e05ce 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -1,4 +1,3 @@
1
2/* 1/*
3 * This file is subject to the terms and conditions of the GNU General Public 2 * This file is subject to the terms and conditions of the GNU General Public
4 * License. See the file "COPYING" in the main directory of this archive 3 * License. See the file "COPYING" in the main directory of this archive
@@ -15,13 +14,25 @@
15#define UV_MMR_ENABLE (1UL << 63) 14#define UV_MMR_ENABLE (1UL << 63)
16 15
17/* ========================================================================= */ 16/* ========================================================================= */
17/* UVH_BAU_DATA_BROADCAST */
18/* ========================================================================= */
19#define UVH_BAU_DATA_BROADCAST 0x61688UL
20#define UVH_BAU_DATA_BROADCAST_32 0x0440
21
22#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
23#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
24
25union uvh_bau_data_broadcast_u {
26 unsigned long v;
27 struct uvh_bau_data_broadcast_s {
28 unsigned long enable : 1; /* RW */
29 unsigned long rsvd_1_63: 63; /* */
30 } s;
31};
32
33/* ========================================================================= */
18/* UVH_BAU_DATA_CONFIG */ 34/* UVH_BAU_DATA_CONFIG */
19/* ========================================================================= */ 35/* ========================================================================= */
20#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
21#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
22#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
23#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
24/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
25#define UVH_BAU_DATA_CONFIG 0x61680UL 36#define UVH_BAU_DATA_CONFIG 0x61680UL
26#define UVH_BAU_DATA_CONFIG_32 0x0438 37#define UVH_BAU_DATA_CONFIG_32 0x0438
27 38
@@ -604,6 +615,68 @@ union uvh_lb_bau_intd_software_acknowledge_u {
604#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70 615#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
605 616
606/* ========================================================================= */ 617/* ========================================================================= */
618/* UVH_LB_BAU_MISC_CONTROL */
619/* ========================================================================= */
620#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
621#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10
622
623#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
624#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
625#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
626#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
627#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
628#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
629#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
630#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
631#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11
632#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
633#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
634#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
635#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
636#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
637#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
638#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
639#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
640#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
641#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
642#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
643#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
644#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
645#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
646#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
647#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
648#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
649#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
650#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
651#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
652#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
653#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
654#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
655
656union uvh_lb_bau_misc_control_u {
657 unsigned long v;
658 struct uvh_lb_bau_misc_control_s {
659 unsigned long rejection_delay : 8; /* RW */
660 unsigned long apic_mode : 1; /* RW */
661 unsigned long force_broadcast : 1; /* RW */
662 unsigned long force_lock_nop : 1; /* RW */
663 unsigned long csi_agent_presence_vector : 3; /* RW */
664 unsigned long descriptor_fetch_mode : 1; /* RW */
665 unsigned long enable_intd_soft_ack_mode : 1; /* RW */
666 unsigned long intd_soft_ack_timeout_period : 4; /* RW */
667 unsigned long enable_dual_mapping_mode : 1; /* RW */
668 unsigned long vga_io_port_decode_enable : 1; /* RW */
669 unsigned long vga_io_port_16_bit_decode : 1; /* RW */
670 unsigned long suppress_dest_registration : 1; /* RW */
671 unsigned long programmed_initial_priority : 3; /* RW */
672 unsigned long use_incoming_priority : 1; /* RW */
673 unsigned long enable_programmed_initial_priority : 1; /* RW */
674 unsigned long rsvd_29_47 : 19; /* */
675 unsigned long fun : 16; /* RW */
676 } s;
677};
678
679/* ========================================================================= */
607/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ 680/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
608/* ========================================================================= */ 681/* ========================================================================= */
609#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL 682#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
@@ -681,334 +754,6 @@ union uvh_lb_bau_sb_descriptor_base_u {
681}; 754};
682 755
683/* ========================================================================= */ 756/* ========================================================================= */
684/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */
685/* ========================================================================= */
686#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
687
688#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
689#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
690#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
691#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
692#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
693#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
694#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
695#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
696#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
697#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
698#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
699#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
700#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
701#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
702#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
703#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
704#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
705#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
706#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
707#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
708#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
709#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
710#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
711#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
712#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
713#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
714#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
715#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
716#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
717#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
718#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
719#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
720#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
721#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
722#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
723#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
724#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
725#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
726#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
727#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
728#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
729#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
730#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
731#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
732#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
733#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
734#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
735#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
736#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
737#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
738#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
739#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
740#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
741#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
742#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
743#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
744#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
745#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
746#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
747#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
748#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
749#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
750#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
751#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
752#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
753#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
754#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
755#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
756#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
757#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
758#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
759#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
760#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
761#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
762#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
763#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
764#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
765#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
766#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
767#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
768#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
769#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
770#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
771#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
772#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
773#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
774
775union uvh_lb_mcast_aoerr0_rpt_enable_u {
776 unsigned long v;
777 struct uvh_lb_mcast_aoerr0_rpt_enable_s {
778 unsigned long mcast_obese_msg : 1; /* RW */
779 unsigned long mcast_data_sb_err : 1; /* RW */
780 unsigned long mcast_nack_buff_parity : 1; /* RW */
781 unsigned long mcast_timeout : 1; /* RW */
782 unsigned long mcast_inactive_reply : 1; /* RW */
783 unsigned long mcast_upgrade_error : 1; /* RW */
784 unsigned long mcast_reg_count_underflow : 1; /* RW */
785 unsigned long mcast_rep_obese_msg : 1; /* RW */
786 unsigned long ucache_req_runt_msg : 1; /* RW */
787 unsigned long ucache_req_obese_msg : 1; /* RW */
788 unsigned long ucache_req_data_sb_err : 1; /* RW */
789 unsigned long ucache_rep_runt_msg : 1; /* RW */
790 unsigned long ucache_rep_obese_msg : 1; /* RW */
791 unsigned long ucache_rep_data_sb_err : 1; /* RW */
792 unsigned long ucache_rep_command_err : 1; /* RW */
793 unsigned long ucache_pend_timeout : 1; /* RW */
794 unsigned long macc_req_runt_msg : 1; /* RW */
795 unsigned long macc_req_obese_msg : 1; /* RW */
796 unsigned long macc_req_data_sb_err : 1; /* RW */
797 unsigned long macc_rep_runt_msg : 1; /* RW */
798 unsigned long macc_rep_obese_msg : 1; /* RW */
799 unsigned long macc_rep_data_sb_err : 1; /* RW */
800 unsigned long macc_amo_timeout : 1; /* RW */
801 unsigned long macc_put_timeout : 1; /* RW */
802 unsigned long macc_spurious_event : 1; /* RW */
803 unsigned long ioh_destination_table_parity : 1; /* RW */
804 unsigned long get_had_error_reply : 1; /* RW */
805 unsigned long get_timeout : 1; /* RW */
806 unsigned long lock_manager_had_error_reply : 1; /* RW */
807 unsigned long put_had_error_reply : 1; /* RW */
808 unsigned long put_timeout : 1; /* RW */
809 unsigned long sb_activation_overrun : 1; /* RW */
810 unsigned long completed_gb_activation_had_error_reply : 1; /* RW */
811 unsigned long completed_gb_activation_timeout : 1; /* RW */
812 unsigned long descriptor_buffer_0_parity : 1; /* RW */
813 unsigned long descriptor_buffer_1_parity : 1; /* RW */
814 unsigned long socket_destination_table_parity : 1; /* RW */
815 unsigned long bau_reply_payload_corruption : 1; /* RW */
816 unsigned long io_port_destination_table_parity : 1; /* RW */
817 unsigned long intd_soft_ack_timeout : 1; /* RW */
818 unsigned long int_rep_obese_msg : 1; /* RW */
819 unsigned long int_rep_command_err : 1; /* RW */
820 unsigned long int_timeout : 1; /* RW */
821 unsigned long rsvd_43_63 : 21; /* */
822 } s;
823};
824
825/* ========================================================================= */
826/* UVH_LOCAL_INT0_CONFIG */
827/* ========================================================================= */
828#define UVH_LOCAL_INT0_CONFIG 0x61000UL
829
830#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
831#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
832#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
833#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
834#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
835#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
836#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
837#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
838#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
839#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
840#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
841#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
842#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
843#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
844#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
845#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
846
847union uvh_local_int0_config_u {
848 unsigned long v;
849 struct uvh_local_int0_config_s {
850 unsigned long vector_ : 8; /* RW */
851 unsigned long dm : 3; /* RW */
852 unsigned long destmode : 1; /* RW */
853 unsigned long status : 1; /* RO */
854 unsigned long p : 1; /* RO */
855 unsigned long rsvd_14 : 1; /* */
856 unsigned long t : 1; /* RO */
857 unsigned long m : 1; /* RW */
858 unsigned long rsvd_17_31: 15; /* */
859 unsigned long apic_id : 32; /* RW */
860 } s;
861};
862
863/* ========================================================================= */
864/* UVH_LOCAL_INT0_ENABLE */
865/* ========================================================================= */
866#define UVH_LOCAL_INT0_ENABLE 0x65000UL
867
868#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
869#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
870#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
871#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
872#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
873#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
874#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
875#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
876#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
877#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
878#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
879#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
880#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
881#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
882#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
883#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
884#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
885#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
886#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
887#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
888#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
889#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
890#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
891#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
892#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
893#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
894#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
895#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
896#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
897#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
898#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
899#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
900#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
901#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
902#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
903#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
904#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
905#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
906#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
907#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
908#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
909#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
910#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
911#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
912#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
913#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
914#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
915#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
916#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
917#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
918#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
919#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
920#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
921#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
922#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
923#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
924#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
925#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
926#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
927#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
928#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
929#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
930#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
931#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
932#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
933#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
934#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
935#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
936#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
937#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
938#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
939#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
940#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
941#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
942#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
943#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
944#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
945#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
946#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
947#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
948#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
949#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
950#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
951#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
952#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
953#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
954#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
955#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
956#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
957#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
958
959union uvh_local_int0_enable_u {
960 unsigned long v;
961 struct uvh_local_int0_enable_s {
962 unsigned long lb_hcerr : 1; /* RW */
963 unsigned long gr0_hcerr : 1; /* RW */
964 unsigned long gr1_hcerr : 1; /* RW */
965 unsigned long lh_hcerr : 1; /* RW */
966 unsigned long rh_hcerr : 1; /* RW */
967 unsigned long xn_hcerr : 1; /* RW */
968 unsigned long si_hcerr : 1; /* RW */
969 unsigned long lb_aoerr0 : 1; /* RW */
970 unsigned long gr0_aoerr0 : 1; /* RW */
971 unsigned long gr1_aoerr0 : 1; /* RW */
972 unsigned long lh_aoerr0 : 1; /* RW */
973 unsigned long rh_aoerr0 : 1; /* RW */
974 unsigned long xn_aoerr0 : 1; /* RW */
975 unsigned long si_aoerr0 : 1; /* RW */
976 unsigned long lb_aoerr1 : 1; /* RW */
977 unsigned long gr0_aoerr1 : 1; /* RW */
978 unsigned long gr1_aoerr1 : 1; /* RW */
979 unsigned long lh_aoerr1 : 1; /* RW */
980 unsigned long rh_aoerr1 : 1; /* RW */
981 unsigned long xn_aoerr1 : 1; /* RW */
982 unsigned long si_aoerr1 : 1; /* RW */
983 unsigned long rh_vpi_int : 1; /* RW */
984 unsigned long system_shutdown_int : 1; /* RW */
985 unsigned long lb_irq_int_0 : 1; /* RW */
986 unsigned long lb_irq_int_1 : 1; /* RW */
987 unsigned long lb_irq_int_2 : 1; /* RW */
988 unsigned long lb_irq_int_3 : 1; /* RW */
989 unsigned long lb_irq_int_4 : 1; /* RW */
990 unsigned long lb_irq_int_5 : 1; /* RW */
991 unsigned long lb_irq_int_6 : 1; /* RW */
992 unsigned long lb_irq_int_7 : 1; /* RW */
993 unsigned long lb_irq_int_8 : 1; /* RW */
994 unsigned long lb_irq_int_9 : 1; /* RW */
995 unsigned long lb_irq_int_10 : 1; /* RW */
996 unsigned long lb_irq_int_11 : 1; /* RW */
997 unsigned long lb_irq_int_12 : 1; /* RW */
998 unsigned long lb_irq_int_13 : 1; /* RW */
999 unsigned long lb_irq_int_14 : 1; /* RW */
1000 unsigned long lb_irq_int_15 : 1; /* RW */
1001 unsigned long l1_nmi_int : 1; /* RW */
1002 unsigned long stop_clock : 1; /* RW */
1003 unsigned long asic_to_l1 : 1; /* RW */
1004 unsigned long l1_to_asic : 1; /* RW */
1005 unsigned long ltc_int : 1; /* RW */
1006 unsigned long la_seq_trigger : 1; /* RW */
1007 unsigned long rsvd_45_63 : 19; /* */
1008 } s;
1009};
1010
1011/* ========================================================================= */
1012/* UVH_NODE_ID */ 757/* UVH_NODE_ID */
1013/* ========================================================================= */ 758/* ========================================================================= */
1014#define UVH_NODE_ID 0x0UL 759#define UVH_NODE_ID 0x0UL
@@ -1112,26 +857,6 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
1112}; 857};
1113 858
1114/* ========================================================================= */ 859/* ========================================================================= */
1115/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */
1116/* ========================================================================= */
1117#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
1118
1119#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1120#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1121#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1122#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1123
1124union uvh_rh_gam_cfg_overlay_config_mmr_u {
1125 unsigned long v;
1126 struct uvh_rh_gam_cfg_overlay_config_mmr_s {
1127 unsigned long rsvd_0_25: 26; /* */
1128 unsigned long base : 20; /* RW */
1129 unsigned long rsvd_46_62: 17; /* */
1130 unsigned long enable : 1; /* RW */
1131 } s;
1132};
1133
1134/* ========================================================================= */
1135/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ 860/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
1136/* ========================================================================= */ 861/* ========================================================================= */
1137#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL 862#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
@@ -1263,101 +988,6 @@ union uvh_rtc1_int_config_u {
1263}; 988};
1264 989
1265/* ========================================================================= */ 990/* ========================================================================= */
1266/* UVH_RTC2_INT_CONFIG */
1267/* ========================================================================= */
1268#define UVH_RTC2_INT_CONFIG 0x61600UL
1269
1270#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
1271#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1272#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
1273#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
1274#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
1275#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1276#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
1277#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1278#define UVH_RTC2_INT_CONFIG_P_SHFT 13
1279#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
1280#define UVH_RTC2_INT_CONFIG_T_SHFT 15
1281#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
1282#define UVH_RTC2_INT_CONFIG_M_SHFT 16
1283#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
1284#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
1285#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1286
1287union uvh_rtc2_int_config_u {
1288 unsigned long v;
1289 struct uvh_rtc2_int_config_s {
1290 unsigned long vector_ : 8; /* RW */
1291 unsigned long dm : 3; /* RW */
1292 unsigned long destmode : 1; /* RW */
1293 unsigned long status : 1; /* RO */
1294 unsigned long p : 1; /* RO */
1295 unsigned long rsvd_14 : 1; /* */
1296 unsigned long t : 1; /* RO */
1297 unsigned long m : 1; /* RW */
1298 unsigned long rsvd_17_31: 15; /* */
1299 unsigned long apic_id : 32; /* RW */
1300 } s;
1301};
1302
1303/* ========================================================================= */
1304/* UVH_RTC3_INT_CONFIG */
1305/* ========================================================================= */
1306#define UVH_RTC3_INT_CONFIG 0x61640UL
1307
1308#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
1309#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1310#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
1311#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
1312#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
1313#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1314#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
1315#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1316#define UVH_RTC3_INT_CONFIG_P_SHFT 13
1317#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
1318#define UVH_RTC3_INT_CONFIG_T_SHFT 15
1319#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
1320#define UVH_RTC3_INT_CONFIG_M_SHFT 16
1321#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
1322#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
1323#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1324
1325union uvh_rtc3_int_config_u {
1326 unsigned long v;
1327 struct uvh_rtc3_int_config_s {
1328 unsigned long vector_ : 8; /* RW */
1329 unsigned long dm : 3; /* RW */
1330 unsigned long destmode : 1; /* RW */
1331 unsigned long status : 1; /* RO */
1332 unsigned long p : 1; /* RO */
1333 unsigned long rsvd_14 : 1; /* */
1334 unsigned long t : 1; /* RO */
1335 unsigned long m : 1; /* RW */
1336 unsigned long rsvd_17_31: 15; /* */
1337 unsigned long apic_id : 32; /* RW */
1338 } s;
1339};
1340
1341/* ========================================================================= */
1342/* UVH_RTC_INC_RATIO */
1343/* ========================================================================= */
1344#define UVH_RTC_INC_RATIO 0x350000UL
1345
1346#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
1347#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
1348#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
1349#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
1350
1351union uvh_rtc_inc_ratio_u {
1352 unsigned long v;
1353 struct uvh_rtc_inc_ratio_s {
1354 unsigned long fraction : 20; /* RW */
1355 unsigned long ratio : 3; /* RW */
1356 unsigned long rsvd_23_63: 41; /* */
1357 } s;
1358};
1359
1360/* ========================================================================= */
1361/* UVH_SI_ADDR_MAP_CONFIG */ 991/* UVH_SI_ADDR_MAP_CONFIG */
1362/* ========================================================================= */ 992/* ========================================================================= */
1363#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL 993#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
deleted file mode 100644
index e49ed6d2fd4..00000000000
--- a/arch/x86/include/asm/vmware.h
+++ /dev/null
@@ -1,27 +0,0 @@
1/*
2 * Copyright (C) 2008, VMware, Inc.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for more
13 * details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20#ifndef ASM_X86__VMWARE_H
21#define ASM_X86__VMWARE_H
22
23extern void vmware_platform_setup(void);
24extern int vmware_platform(void);
25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
26
27#endif
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fb9a080740e..9e6779f7cf2 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,8 @@
25 * 25 *
26 */ 26 */
27 27
28#include <linux/types.h>
29
28/* 30/*
29 * Definitions of Primary Processor-Based VM-Execution Controls. 31 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */ 32 */
@@ -120,6 +122,8 @@ enum vmcs_field {
120 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, 122 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
121 GUEST_IA32_PAT = 0x00002804, 123 GUEST_IA32_PAT = 0x00002804,
122 GUEST_IA32_PAT_HIGH = 0x00002805, 124 GUEST_IA32_PAT_HIGH = 0x00002805,
125 GUEST_IA32_EFER = 0x00002806,
126 GUEST_IA32_EFER_HIGH = 0x00002807,
123 GUEST_PDPTR0 = 0x0000280a, 127 GUEST_PDPTR0 = 0x0000280a,
124 GUEST_PDPTR0_HIGH = 0x0000280b, 128 GUEST_PDPTR0_HIGH = 0x0000280b,
125 GUEST_PDPTR1 = 0x0000280c, 129 GUEST_PDPTR1 = 0x0000280c,
@@ -130,6 +134,8 @@ enum vmcs_field {
130 GUEST_PDPTR3_HIGH = 0x00002811, 134 GUEST_PDPTR3_HIGH = 0x00002811,
131 HOST_IA32_PAT = 0x00002c00, 135 HOST_IA32_PAT = 0x00002c00,
132 HOST_IA32_PAT_HIGH = 0x00002c01, 136 HOST_IA32_PAT_HIGH = 0x00002c01,
137 HOST_IA32_EFER = 0x00002c02,
138 HOST_IA32_EFER_HIGH = 0x00002c03,
133 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 139 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
134 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 140 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
135 EXCEPTION_BITMAP = 0x00004004, 141 EXCEPTION_BITMAP = 0x00004004,
@@ -394,6 +400,10 @@ enum vmcs_field {
394#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" 400#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
395#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" 401#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
396 402
397 403struct vmx_msr_entry {
404 u32 index;
405 u32 reserved;
406 u64 value;
407} __aligned(16);
398 408
399#endif 409#endif
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index ddc04ccad03..2c4390cae22 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -37,8 +37,9 @@ extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
37 void __user *fpstate, 37 void __user *fpstate,
38 struct _fpx_sw_bytes *sw); 38 struct _fpx_sw_bytes *sw);
39 39
40static inline int xrstor_checking(struct xsave_struct *fx) 40static inline int fpu_xrstor_checking(struct fpu *fpu)
41{ 41{
42 struct xsave_struct *fx = &fpu->state->xsave;
42 int err; 43 int err;
43 44
44 asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" 45 asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
@@ -110,12 +111,12 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
110 : "memory"); 111 : "memory");
111} 112}
112 113
113static inline void xsave(struct task_struct *tsk) 114static inline void fpu_xsave(struct fpu *fpu)
114{ 115{
115 /* This, however, we can work around by forcing the compiler to select 116 /* This, however, we can work around by forcing the compiler to select
116 an addressing mode that doesn't require extended registers. */ 117 an addressing mode that doesn't require extended registers. */
117 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" 118 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
118 : : "D" (&(tsk->thread.xstate->xsave)), 119 : : "D" (&(fpu->state->xsave)),
119 "a" (-1), "d"(-1) : "memory"); 120 "a" (-1), "d"(-1) : "memory");
120} 121}
121#endif 122#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e..e77b2208372 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
47obj-y += process.o 47obj-y += process.o
48obj-y += i387.o xsave.o 48obj-y += i387.o xsave.o
49obj-y += ptrace.o 49obj-y += ptrace.o
50obj-$(CONFIG_X86_DS) += ds.o
51obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 50obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 51obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 52obj-y += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa9..60cc4058ed5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled);
63int acpi_noirq; /* skip ACPI IRQ initialization */ 63int acpi_noirq; /* skip ACPI IRQ initialization */
64int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ 64int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
65EXPORT_SYMBOL(acpi_pci_disabled); 65EXPORT_SYMBOL(acpi_pci_disabled);
66int acpi_ht __initdata = 1; /* enable HT */
67 66
68int acpi_lapic; 67int acpi_lapic;
69int acpi_ioapic; 68int acpi_ioapic;
@@ -94,6 +93,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
94 93
95 94
96/* 95/*
96 * ISA irqs by default are the first 16 gsis but can be
97 * any gsi as specified by an interrupt source override.
98 */
99static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
100 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
101};
102
103static unsigned int gsi_to_irq(unsigned int gsi)
104{
105 unsigned int irq = gsi + NR_IRQS_LEGACY;
106 unsigned int i;
107
108 for (i = 0; i < NR_IRQS_LEGACY; i++) {
109 if (isa_irq_to_gsi[i] == gsi) {
110 return i;
111 }
112 }
113
114 /* Provide an identity mapping of gsi == irq
115 * except on truly weird platforms that have
116 * non isa irqs in the first 16 gsis.
117 */
118 if (gsi >= NR_IRQS_LEGACY)
119 irq = gsi;
120 else
121 irq = gsi_end + 1 + gsi;
122
123 return irq;
124}
125
126static u32 irq_to_gsi(int irq)
127{
128 unsigned int gsi;
129
130 if (irq < NR_IRQS_LEGACY)
131 gsi = isa_irq_to_gsi[irq];
132 else if (irq <= gsi_end)
133 gsi = irq;
134 else if (irq <= (gsi_end + NR_IRQS_LEGACY))
135 gsi = irq - gsi_end;
136 else
137 gsi = 0xffffffff;
138
139 return gsi;
140}
141
142/*
97 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, 143 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
98 * to map the target physical address. The problem is that set_fixmap() 144 * to map the target physical address. The problem is that set_fixmap()
99 * provides a single page, and it is possible that the page is not 145 * provides a single page, and it is possible that the page is not
@@ -313,7 +359,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
313/* 359/*
314 * Parse Interrupt Source Override for the ACPI SCI 360 * Parse Interrupt Source Override for the ACPI SCI
315 */ 361 */
316static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) 362static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
317{ 363{
318 if (trigger == 0) /* compatible SCI trigger is level */ 364 if (trigger == 0) /* compatible SCI trigger is level */
319 trigger = 3; 365 trigger = 3;
@@ -333,7 +379,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
333 * If GSI is < 16, this will update its flags, 379 * If GSI is < 16, this will update its flags,
334 * else it will create a new mp_irqs[] entry. 380 * else it will create a new mp_irqs[] entry.
335 */ 381 */
336 mp_override_legacy_irq(gsi, polarity, trigger, gsi); 382 mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
337 383
338 /* 384 /*
339 * stash over-ride to indicate we've been here 385 * stash over-ride to indicate we've been here
@@ -357,9 +403,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
357 acpi_table_print_madt_entry(header); 403 acpi_table_print_madt_entry(header);
358 404
359 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { 405 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
360 acpi_sci_ioapic_setup(intsrc->global_irq, 406 acpi_sci_ioapic_setup(intsrc->source_irq,
361 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, 407 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
362 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2); 408 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
409 intsrc->global_irq);
363 return 0; 410 return 0;
364 } 411 }
365 412
@@ -448,7 +495,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
448 495
449int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) 496int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
450{ 497{
451 *irq = gsi; 498 *irq = gsi_to_irq(gsi);
452 499
453#ifdef CONFIG_X86_IO_APIC 500#ifdef CONFIG_X86_IO_APIC
454 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) 501 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -458,6 +505,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
458 return 0; 505 return 0;
459} 506}
460 507
508int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
509{
510 if (isa_irq >= 16)
511 return -1;
512 *gsi = irq_to_gsi(isa_irq);
513 return 0;
514}
515
461/* 516/*
462 * success: return IRQ number (>=0) 517 * success: return IRQ number (>=0)
463 * failure: return < 0 518 * failure: return < 0
@@ -482,7 +537,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
482 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); 537 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
483 } 538 }
484#endif 539#endif
485 irq = plat_gsi; 540 irq = gsi_to_irq(plat_gsi);
486 541
487 return irq; 542 return irq;
488} 543}
@@ -867,29 +922,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
867extern int es7000_plat; 922extern int es7000_plat;
868#endif 923#endif
869 924
870int __init acpi_probe_gsi(void)
871{
872 int idx;
873 int gsi;
874 int max_gsi = 0;
875
876 if (acpi_disabled)
877 return 0;
878
879 if (!acpi_ioapic)
880 return 0;
881
882 max_gsi = 0;
883 for (idx = 0; idx < nr_ioapics; idx++) {
884 gsi = mp_gsi_routing[idx].gsi_end;
885
886 if (gsi > max_gsi)
887 max_gsi = gsi;
888 }
889
890 return max_gsi + 1;
891}
892
893static void assign_to_mp_irq(struct mpc_intsrc *m, 925static void assign_to_mp_irq(struct mpc_intsrc *m,
894 struct mpc_intsrc *mp_irq) 926 struct mpc_intsrc *mp_irq)
895{ 927{
@@ -947,13 +979,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
947 mp_irq.dstirq = pin; /* INTIN# */ 979 mp_irq.dstirq = pin; /* INTIN# */
948 980
949 save_mp_irq(&mp_irq); 981 save_mp_irq(&mp_irq);
982
983 isa_irq_to_gsi[bus_irq] = gsi;
950} 984}
951 985
952void __init mp_config_acpi_legacy_irqs(void) 986void __init mp_config_acpi_legacy_irqs(void)
953{ 987{
954 int i; 988 int i;
955 int ioapic;
956 unsigned int dstapic;
957 struct mpc_intsrc mp_irq; 989 struct mpc_intsrc mp_irq;
958 990
959#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 991#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -974,19 +1006,27 @@ void __init mp_config_acpi_legacy_irqs(void)
974#endif 1006#endif
975 1007
976 /* 1008 /*
977 * Locate the IOAPIC that manages the ISA IRQs (0-15).
978 */
979 ioapic = mp_find_ioapic(0);
980 if (ioapic < 0)
981 return;
982 dstapic = mp_ioapics[ioapic].apicid;
983
984 /*
985 * Use the default configuration for the IRQs 0-15. Unless 1009 * Use the default configuration for the IRQs 0-15. Unless
986 * overridden by (MADT) interrupt source override entries. 1010 * overridden by (MADT) interrupt source override entries.
987 */ 1011 */
988 for (i = 0; i < 16; i++) { 1012 for (i = 0; i < 16; i++) {
1013 int ioapic, pin;
1014 unsigned int dstapic;
989 int idx; 1015 int idx;
1016 u32 gsi;
1017
1018 /* Locate the gsi that irq i maps to. */
1019 if (acpi_isa_irq_to_gsi(i, &gsi))
1020 continue;
1021
1022 /*
1023 * Locate the IOAPIC that manages the ISA IRQ.
1024 */
1025 ioapic = mp_find_ioapic(gsi);
1026 if (ioapic < 0)
1027 continue;
1028 pin = mp_find_ioapic_pin(ioapic, gsi);
1029 dstapic = mp_ioapics[ioapic].apicid;
990 1030
991 for (idx = 0; idx < mp_irq_entries; idx++) { 1031 for (idx = 0; idx < mp_irq_entries; idx++) {
992 struct mpc_intsrc *irq = mp_irqs + idx; 1032 struct mpc_intsrc *irq = mp_irqs + idx;
@@ -996,7 +1036,7 @@ void __init mp_config_acpi_legacy_irqs(void)
996 break; 1036 break;
997 1037
998 /* Do we already have a mapping for this IOAPIC pin */ 1038 /* Do we already have a mapping for this IOAPIC pin */
999 if (irq->dstapic == dstapic && irq->dstirq == i) 1039 if (irq->dstapic == dstapic && irq->dstirq == pin)
1000 break; 1040 break;
1001 } 1041 }
1002 1042
@@ -1011,7 +1051,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1011 mp_irq.dstapic = dstapic; 1051 mp_irq.dstapic = dstapic;
1012 mp_irq.irqtype = mp_INT; 1052 mp_irq.irqtype = mp_INT;
1013 mp_irq.srcbusirq = i; /* Identity mapped */ 1053 mp_irq.srcbusirq = i; /* Identity mapped */
1014 mp_irq.dstirq = i; 1054 mp_irq.dstirq = pin;
1015 1055
1016 save_mp_irq(&mp_irq); 1056 save_mp_irq(&mp_irq);
1017 } 1057 }
@@ -1076,11 +1116,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1076 1116
1077 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); 1117 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
1078 1118
1079#ifdef CONFIG_X86_32
1080 if (ioapic_renumber_irq)
1081 gsi = ioapic_renumber_irq(ioapic, gsi);
1082#endif
1083
1084 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1119 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1085 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1120 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1086 "%d-%d\n", mp_ioapics[ioapic].apicid, 1121 "%d-%d\n", mp_ioapics[ioapic].apicid,
@@ -1094,7 +1129,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1094 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, 1129 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1095 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, 1130 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1096 polarity == ACPI_ACTIVE_HIGH ? 0 : 1); 1131 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1097 io_apic_set_pci_routing(dev, gsi, &irq_attr); 1132 io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
1098 1133
1099 return gsi; 1134 return gsi;
1100} 1135}
@@ -1154,7 +1189,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1154 * pretend we got one so we can set the SCI flags. 1189 * pretend we got one so we can set the SCI flags.
1155 */ 1190 */
1156 if (!acpi_sci_override_gsi) 1191 if (!acpi_sci_override_gsi)
1157 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1192 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
1193 acpi_gbl_FADT.sci_interrupt);
1158 1194
1159 /* Fill in identity legacy mappings where no override */ 1195 /* Fill in identity legacy mappings where no override */
1160 mp_config_acpi_legacy_irqs(); 1196 mp_config_acpi_legacy_irqs();
@@ -1464,9 +1500,8 @@ void __init acpi_boot_table_init(void)
1464 1500
1465 /* 1501 /*
1466 * If acpi_disabled, bail out 1502 * If acpi_disabled, bail out
1467 * One exception: acpi=ht continues far enough to enumerate LAPICs
1468 */ 1503 */
1469 if (acpi_disabled && !acpi_ht) 1504 if (acpi_disabled)
1470 return; 1505 return;
1471 1506
1472 /* 1507 /*
@@ -1497,9 +1532,8 @@ int __init early_acpi_boot_init(void)
1497{ 1532{
1498 /* 1533 /*
1499 * If acpi_disabled, bail out 1534 * If acpi_disabled, bail out
1500 * One exception: acpi=ht continues far enough to enumerate LAPICs
1501 */ 1535 */
1502 if (acpi_disabled && !acpi_ht) 1536 if (acpi_disabled)
1503 return 1; 1537 return 1;
1504 1538
1505 /* 1539 /*
@@ -1517,9 +1551,8 @@ int __init acpi_boot_init(void)
1517 1551
1518 /* 1552 /*
1519 * If acpi_disabled, bail out 1553 * If acpi_disabled, bail out
1520 * One exception: acpi=ht continues far enough to enumerate LAPICs
1521 */ 1554 */
1522 if (acpi_disabled && !acpi_ht) 1555 if (acpi_disabled)
1523 return 1; 1556 return 1;
1524 1557
1525 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); 1558 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1554,21 +1587,12 @@ static int __init parse_acpi(char *arg)
1554 /* acpi=force to over-ride black-list */ 1587 /* acpi=force to over-ride black-list */
1555 else if (strcmp(arg, "force") == 0) { 1588 else if (strcmp(arg, "force") == 0) {
1556 acpi_force = 1; 1589 acpi_force = 1;
1557 acpi_ht = 1;
1558 acpi_disabled = 0; 1590 acpi_disabled = 0;
1559 } 1591 }
1560 /* acpi=strict disables out-of-spec workarounds */ 1592 /* acpi=strict disables out-of-spec workarounds */
1561 else if (strcmp(arg, "strict") == 0) { 1593 else if (strcmp(arg, "strict") == 0) {
1562 acpi_strict = 1; 1594 acpi_strict = 1;
1563 } 1595 }
1564 /* Limit ACPI just to boot-time to enable HT */
1565 else if (strcmp(arg, "ht") == 0) {
1566 if (!acpi_force) {
1567 printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
1568 disable_acpi();
1569 }
1570 acpi_ht = 1;
1571 }
1572 /* acpi=rsdt use RSDT instead of XSDT */ 1596 /* acpi=rsdt use RSDT instead of XSDT */
1573 else if (strcmp(arg, "rsdt") == 0) { 1597 else if (strcmp(arg, "rsdt") == 0) {
1574 acpi_rsdt_forced = 1; 1598 acpi_rsdt_forced = 1;
@@ -1576,6 +1600,10 @@ static int __init parse_acpi(char *arg)
1576 /* "acpi=noirq" disables ACPI interrupt routing */ 1600 /* "acpi=noirq" disables ACPI interrupt routing */
1577 else if (strcmp(arg, "noirq") == 0) { 1601 else if (strcmp(arg, "noirq") == 0) {
1578 acpi_noirq_set(); 1602 acpi_noirq_set();
1603 }
1604 /* "acpi=copy_dsdt" copys DSDT */
1605 else if (strcmp(arg, "copy_dsdt") == 0) {
1606 acpi_gbl_copy_dsdt_locally = 1;
1579 } else { 1607 } else {
1580 /* Core will printk when we return error. */ 1608 /* Core will printk when we return error. */
1581 return -EINVAL; 1609 return -EINVAL;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e55..82e508677b9 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str)
162#endif 162#endif
163 if (strncmp(str, "old_ordering", 12) == 0) 163 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 164 acpi_old_suspend_ordering();
165 if (strncmp(str, "sci_force_enable", 16) == 0)
166 acpi_set_sci_en_on_resume();
167 str = strchr(str, ','); 165 str = strchr(str, ',');
168 if (str != NULL) 166 if (str != NULL)
169 str += strspn(str, ", \t"); 167 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d..70237732a6c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
194} 194}
195 195
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 196extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern u8 *__smp_locks[], *__smp_locks_end[]; 197extern s32 __smp_locks[], __smp_locks_end[];
198static void *text_poke_early(void *addr, const void *opcode, size_t len); 198static void *text_poke_early(void *addr, const void *opcode, size_t len);
199 199
200/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
235 235
236#ifdef CONFIG_SMP 236#ifdef CONFIG_SMP
237 237
238static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) 238static void alternatives_smp_lock(const s32 *start, const s32 *end,
239 u8 *text, u8 *text_end)
239{ 240{
240 u8 **ptr; 241 const s32 *poff;
241 242
242 mutex_lock(&text_mutex); 243 mutex_lock(&text_mutex);
243 for (ptr = start; ptr < end; ptr++) { 244 for (poff = start; poff < end; poff++) {
244 if (*ptr < text) 245 u8 *ptr = (u8 *)poff + *poff;
245 continue; 246
246 if (*ptr > text_end) 247 if (!*poff || ptr < text || ptr >= text_end)
247 continue; 248 continue;
248 /* turn DS segment override prefix into lock prefix */ 249 /* turn DS segment override prefix into lock prefix */
249 text_poke(*ptr, ((unsigned char []){0xf0}), 1); 250 if (*ptr == 0x3e)
251 text_poke(ptr, ((unsigned char []){0xf0}), 1);
250 }; 252 };
251 mutex_unlock(&text_mutex); 253 mutex_unlock(&text_mutex);
252} 254}
253 255
254static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 256static void alternatives_smp_unlock(const s32 *start, const s32 *end,
257 u8 *text, u8 *text_end)
255{ 258{
256 u8 **ptr; 259 const s32 *poff;
257 260
258 if (noreplace_smp) 261 if (noreplace_smp)
259 return; 262 return;
260 263
261 mutex_lock(&text_mutex); 264 mutex_lock(&text_mutex);
262 for (ptr = start; ptr < end; ptr++) { 265 for (poff = start; poff < end; poff++) {
263 if (*ptr < text) 266 u8 *ptr = (u8 *)poff + *poff;
264 continue; 267
265 if (*ptr > text_end) 268 if (!*poff || ptr < text || ptr >= text_end)
266 continue; 269 continue;
267 /* turn lock prefix into DS segment override prefix */ 270 /* turn lock prefix into DS segment override prefix */
268 text_poke(*ptr, ((unsigned char []){0x3E}), 1); 271 if (*ptr == 0xf0)
272 text_poke(ptr, ((unsigned char []){0x3E}), 1);
269 }; 273 };
270 mutex_unlock(&text_mutex); 274 mutex_unlock(&text_mutex);
271} 275}
@@ -276,8 +280,8 @@ struct smp_alt_module {
276 char *name; 280 char *name;
277 281
278 /* ptrs to lock prefixes */ 282 /* ptrs to lock prefixes */
279 u8 **locks; 283 const s32 *locks;
280 u8 **locks_end; 284 const s32 *locks_end;
281 285
282 /* .text segment, needed to avoid patching init code ;) */ 286 /* .text segment, needed to avoid patching init code ;) */
283 u8 *text; 287 u8 *text;
@@ -398,16 +402,19 @@ void alternatives_smp_switch(int smp)
398int alternatives_text_reserved(void *start, void *end) 402int alternatives_text_reserved(void *start, void *end)
399{ 403{
400 struct smp_alt_module *mod; 404 struct smp_alt_module *mod;
401 u8 **ptr; 405 const s32 *poff;
402 u8 *text_start = start; 406 u8 *text_start = start;
403 u8 *text_end = end; 407 u8 *text_end = end;
404 408
405 list_for_each_entry(mod, &smp_alt_modules, next) { 409 list_for_each_entry(mod, &smp_alt_modules, next) {
406 if (mod->text > text_end || mod->text_end < text_start) 410 if (mod->text > text_end || mod->text_end < text_start)
407 continue; 411 continue;
408 for (ptr = mod->locks; ptr < mod->locks_end; ptr++) 412 for (poff = mod->locks; poff < mod->locks_end; poff++) {
409 if (text_start <= *ptr && text_end >= *ptr) 413 const u8 *ptr = (const u8 *)poff + *poff;
414
415 if (text_start <= ptr && text_end > ptr)
410 return 1; 416 return 1;
417 }
411 } 418 }
412 419
413 return 0; 420 return 0;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e5a4a1e0161..c02cc692985 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
51#include <asm/smp.h> 51#include <asm/smp.h>
52#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h> 53#include <asm/kvm_para.h>
54#include <asm/tsc.h>
54 55
55unsigned int num_processors; 56unsigned int num_processors;
56 57
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
1151 */ 1152 */
1152void __cpuinit setup_local_APIC(void) 1153void __cpuinit setup_local_APIC(void)
1153{ 1154{
1154 unsigned int value; 1155 unsigned int value, queued;
1155 int i, j; 1156 int i, j, acked = 0;
1157 unsigned long long tsc = 0, ntsc;
1158 long long max_loops = cpu_khz;
1159
1160 if (cpu_has_tsc)
1161 rdtscll(tsc);
1156 1162
1157 if (disable_apic) { 1163 if (disable_apic) {
1158 arch_disable_smp_support(); 1164 arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
1204 * the interrupt. Hence a vector might get locked. It was noticed 1210 * the interrupt. Hence a vector might get locked. It was noticed
1205 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. 1211 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
1206 */ 1212 */
1207 for (i = APIC_ISR_NR - 1; i >= 0; i--) { 1213 do {
1208 value = apic_read(APIC_ISR + i*0x10); 1214 queued = 0;
1209 for (j = 31; j >= 0; j--) { 1215 for (i = APIC_ISR_NR - 1; i >= 0; i--)
1210 if (value & (1<<j)) 1216 queued |= apic_read(APIC_IRR + i*0x10);
1211 ack_APIC_irq(); 1217
1218 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
1219 value = apic_read(APIC_ISR + i*0x10);
1220 for (j = 31; j >= 0; j--) {
1221 if (value & (1<<j)) {
1222 ack_APIC_irq();
1223 acked++;
1224 }
1225 }
1212 } 1226 }
1213 } 1227 if (acked > 256) {
1228 printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
1229 acked);
1230 break;
1231 }
1232 if (cpu_has_tsc) {
1233 rdtscll(ntsc);
1234 max_loops = (cpu_khz << 10) - (ntsc - tsc);
1235 } else
1236 max_loops--;
1237 } while (queued && max_loops > 0);
1238 WARN_ON(max_loops <= 0);
1214 1239
1215 /* 1240 /*
1216 * Now that we are all set up, enable the APIC 1241 * Now that we are all set up, enable the APIC
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5..425e53a87fe 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int es7000_plat;
131 131
132static unsigned int base; 132static unsigned int base;
133 133
134static int
135es7000_rename_gsi(int ioapic, int gsi)
136{
137 if (es7000_plat == ES7000_ZORRO)
138 return gsi;
139
140 if (!base) {
141 int i;
142 for (i = 0; i < nr_ioapics; i++)
143 base += nr_ioapic_registers[i];
144 }
145
146 if (!ioapic && (gsi < 16))
147 gsi += base;
148
149 return gsi;
150}
151
152static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 134static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
153{ 135{
154 unsigned long vect = 0, psaival = 0; 136 unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
190 es7000_plat = ES7000_ZORRO; 172 es7000_plat = ES7000_ZORRO;
191 else 173 else
192 es7000_plat = ES7000_CLASSIC; 174 es7000_plat = ES7000_CLASSIC;
193 ioapic_renumber_irq = es7000_rename_gsi;
194} 175}
195 176
196/* 177/*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eb2789c3f72..33f3563a2a5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
89/* IO APIC gsi routing info */ 89/* IO APIC gsi routing info */
90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
91 91
92/* The last gsi number used */
93u32 gsi_end;
94
92/* MP IRQ source entries */ 95/* MP IRQ source entries */
93struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; 96struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
94 97
@@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx)
1013 return MPBIOS_trigger(idx); 1016 return MPBIOS_trigger(idx);
1014} 1017}
1015 1018
1016int (*ioapic_renumber_irq)(int ioapic, int irq);
1017static int pin_2_irq(int idx, int apic, int pin) 1019static int pin_2_irq(int idx, int apic, int pin)
1018{ 1020{
1019 int irq, i; 1021 int irq;
1020 int bus = mp_irqs[idx].srcbus; 1022 int bus = mp_irqs[idx].srcbus;
1021 1023
1022 /* 1024 /*
@@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin)
1028 if (test_bit(bus, mp_bus_not_pci)) { 1030 if (test_bit(bus, mp_bus_not_pci)) {
1029 irq = mp_irqs[idx].srcbusirq; 1031 irq = mp_irqs[idx].srcbusirq;
1030 } else { 1032 } else {
1031 /* 1033 u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
1032 * PCI IRQs are mapped in order 1034
1033 */ 1035 if (gsi >= NR_IRQS_LEGACY)
1034 i = irq = 0; 1036 irq = gsi;
1035 while (i < apic) 1037 else
1036 irq += nr_ioapic_registers[i++]; 1038 irq = gsi_end + 1 + gsi;
1037 irq += pin;
1038 /*
1039 * For MPS mode, so far only needed by ES7000 platform
1040 */
1041 if (ioapic_renumber_irq)
1042 irq = ioapic_renumber_irq(apic, irq);
1043 } 1039 }
1044 1040
1045#ifdef CONFIG_X86_32 1041#ifdef CONFIG_X86_32
@@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1950 1946
1951void __init enable_IO_APIC(void) 1947void __init enable_IO_APIC(void)
1952{ 1948{
1953 union IO_APIC_reg_01 reg_01;
1954 int i8259_apic, i8259_pin; 1949 int i8259_apic, i8259_pin;
1955 int apic; 1950 int apic;
1956 unsigned long flags;
1957
1958 /*
1959 * The number of IO-APIC IRQ registers (== #pins):
1960 */
1961 for (apic = 0; apic < nr_ioapics; apic++) {
1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1963 reg_01.raw = io_apic_read(apic, 1);
1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1966 }
1967 1951
1968 if (!legacy_pic->nr_legacy_irqs) 1952 if (!legacy_pic->nr_legacy_irqs)
1969 return; 1953 return;
@@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic)
3858 reg_01.raw = io_apic_read(ioapic, 1); 3842 reg_01.raw = io_apic_read(ioapic, 1);
3859 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 3843 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3860 3844
3861 return reg_01.bits.entries; 3845 /* The register returns the maximum index redir index
3846 * supported, which is one less than the total number of redir
3847 * entries.
3848 */
3849 return reg_01.bits.entries + 1;
3862} 3850}
3863 3851
3864void __init probe_nr_irqs_gsi(void) 3852void __init probe_nr_irqs_gsi(void)
3865{ 3853{
3866 int nr = 0; 3854 int nr;
3867 3855
3868 nr = acpi_probe_gsi(); 3856 nr = gsi_end + 1 + NR_IRQS_LEGACY;
3869 if (nr > nr_irqs_gsi) { 3857 if (nr > nr_irqs_gsi)
3870 nr_irqs_gsi = nr; 3858 nr_irqs_gsi = nr;
3871 } else {
3872 /* for acpi=off or acpi is not compiled in */
3873 int idx;
3874
3875 nr = 0;
3876 for (idx = 0; idx < nr_ioapics; idx++)
3877 nr += io_apic_get_redir_entries(idx) + 1;
3878
3879 if (nr > nr_irqs_gsi)
3880 nr_irqs_gsi = nr;
3881 }
3882 3859
3883 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3860 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3884} 3861}
@@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic)
4085 return reg_01.bits.version; 4062 return reg_01.bits.version;
4086} 4063}
4087 4064
4088int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4065int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
4089{ 4066{
4090 int i; 4067 int ioapic, pin, idx;
4091 4068
4092 if (skip_ioapic_setup) 4069 if (skip_ioapic_setup)
4093 return -1; 4070 return -1;
4094 4071
4095 for (i = 0; i < mp_irq_entries; i++) 4072 ioapic = mp_find_ioapic(gsi);
4096 if (mp_irqs[i].irqtype == mp_INT && 4073 if (ioapic < 0)
4097 mp_irqs[i].srcbusirq == bus_irq)
4098 break;
4099 if (i >= mp_irq_entries)
4100 return -1; 4074 return -1;
4101 4075
4102 *trigger = irq_trigger(i); 4076 pin = mp_find_ioapic_pin(ioapic, gsi);
4103 *polarity = irq_polarity(i); 4077 if (pin < 0)
4078 return -1;
4079
4080 idx = find_irq_entry(ioapic, pin, mp_INT);
4081 if (idx < 0)
4082 return -1;
4083
4084 *trigger = irq_trigger(idx);
4085 *polarity = irq_polarity(idx);
4104 return 0; 4086 return 0;
4105} 4087}
4106 4088
@@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void)
4241 } 4223 }
4242} 4224}
4243 4225
4244int mp_find_ioapic(int gsi) 4226int mp_find_ioapic(u32 gsi)
4245{ 4227{
4246 int i = 0; 4228 int i = 0;
4247 4229
@@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi)
4256 return -1; 4238 return -1;
4257} 4239}
4258 4240
4259int mp_find_ioapic_pin(int ioapic, int gsi) 4241int mp_find_ioapic_pin(int ioapic, u32 gsi)
4260{ 4242{
4261 if (WARN_ON(ioapic == -1)) 4243 if (WARN_ON(ioapic == -1))
4262 return -1; 4244 return -1;
@@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address)
4284void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) 4266void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4285{ 4267{
4286 int idx = 0; 4268 int idx = 0;
4269 int entries;
4287 4270
4288 if (bad_ioapic(address)) 4271 if (bad_ioapic(address))
4289 return; 4272 return;
@@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4302 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 4285 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4303 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 4286 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4304 */ 4287 */
4288 entries = io_apic_get_redir_entries(idx);
4305 mp_gsi_routing[idx].gsi_base = gsi_base; 4289 mp_gsi_routing[idx].gsi_base = gsi_base;
4306 mp_gsi_routing[idx].gsi_end = gsi_base + 4290 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
4307 io_apic_get_redir_entries(idx); 4291
4292 /*
4293 * The number of IO-APIC IRQ registers (== #pins):
4294 */
4295 nr_ioapic_registers[idx] = entries;
4296
4297 if (mp_gsi_routing[idx].gsi_end > gsi_end)
4298 gsi_end = mp_gsi_routing[idx].gsi_end;
4308 4299
4309 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 4300 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4310 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 4301 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c085d52dbaf..e46f98f36e3 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -735,9 +735,6 @@ void __init uv_system_init(void)
735 uv_node_to_blade[nid] = blade; 735 uv_node_to_blade[nid] = blade;
736 uv_cpu_to_blade[cpu] = blade; 736 uv_cpu_to_blade[cpu] = blade;
737 max_pnode = max(pnode, max_pnode); 737 max_pnode = max(pnode, max_pnode);
738
739 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
740 cpu, apicid, pnode, nid, lcpu, blade);
741 } 738 }
742 739
743 /* Add blade/pnode info for nodes without cpus */ 740 /* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0e..c4f9182ca3a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
1224#ifdef INIT_TIMER_AFTER_SUSPEND 1224#ifdef INIT_TIMER_AFTER_SUSPEND
1225 unsigned long flags; 1225 unsigned long flags;
1226 1226
1227 spin_lock_irqsave(&i8253_lock, flags); 1227 raw_spin_lock_irqsave(&i8253_lock, flags);
1228 /* set the clock to HZ */ 1228 /* set the clock to HZ */
1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1230 udelay(10); 1230 udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
1232 udelay(10); 1232 udelay(10);
1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ 1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
1234 udelay(10); 1234 udelay(10);
1235 spin_unlock_irqrestore(&i8253_lock, flags); 1235 raw_spin_unlock_irqrestore(&i8253_lock, flags);
1236#endif 1236#endif
1237} 1237}
1238 1238
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f367..3a785da34b6 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o addon_cpuid_features.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf68..10fa5684a66 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,12 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
30 const struct cpuid_bit *cb; 30 const struct cpuid_bit *cb;
31 31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, 35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, 36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, 37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, 38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
39 { 0, 0, 0, 0 } 41 { 0, 0, 0, 0 }
40 }; 42 };
41 43
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a26521239..c39576cb301 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87static void __init check_hlt(void) 87static void __init check_hlt(void)
88{ 88{
89 if (paravirt_enabled()) 89 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
90 return; 90 return;
91 91
92 printk(KERN_INFO "Checking 'hlt' instruction... "); 92 printk(KERN_INFO "Checking 'hlt' instruction... ");
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951e..68e4a6f2211 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1084,6 +1084,20 @@ static void clear_all_debug_regs(void)
1084 } 1084 }
1085} 1085}
1086 1086
1087#ifdef CONFIG_KGDB
1088/*
1089 * Restore debug regs if using kgdbwait and you have a kernel debugger
1090 * connection established.
1091 */
1092static void dbg_restore_debug_regs(void)
1093{
1094 if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
1095 arch_kgdb_ops.correct_hw_break();
1096}
1097#else /* ! CONFIG_KGDB */
1098#define dbg_restore_debug_regs()
1099#endif /* ! CONFIG_KGDB */
1100
1087/* 1101/*
1088 * cpu_init() initializes state that is per-CPU. Some data is already 1102 * cpu_init() initializes state that is per-CPU. Some data is already
1089 * initialized (naturally) in the bootstrap process, such as the GDT 1103 * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1107,9 +1121,9 @@ void __cpuinit cpu_init(void)
1107 oist = &per_cpu(orig_ist, cpu); 1121 oist = &per_cpu(orig_ist, cpu);
1108 1122
1109#ifdef CONFIG_NUMA 1123#ifdef CONFIG_NUMA
1110 if (cpu != 0 && percpu_read(node_number) == 0 && 1124 if (cpu != 0 && percpu_read(numa_node) == 0 &&
1111 cpu_to_node(cpu) != NUMA_NO_NODE) 1125 early_cpu_to_node(cpu) != NUMA_NO_NODE)
1112 percpu_write(node_number, cpu_to_node(cpu)); 1126 set_numa_node(early_cpu_to_node(cpu));
1113#endif 1127#endif
1114 1128
1115 me = current; 1129 me = current;
@@ -1174,18 +1188,8 @@ void __cpuinit cpu_init(void)
1174 load_TR_desc(); 1188 load_TR_desc();
1175 load_LDT(&init_mm.context); 1189 load_LDT(&init_mm.context);
1176 1190
1177#ifdef CONFIG_KGDB 1191 clear_all_debug_regs();
1178 /* 1192 dbg_restore_debug_regs();
1179 * If the kgdb is connected no debug regs should be altered. This
1180 * is only applicable when KGDB and a KGDB I/O module are built
1181 * into the kernel and you are using early debugging with
1182 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1183 */
1184 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1185 arch_kgdb_ops.correct_hw_break();
1186 else
1187#endif
1188 clear_all_debug_regs();
1189 1193
1190 fpu_init(); 1194 fpu_init();
1191 1195
@@ -1239,14 +1243,12 @@ void __cpuinit cpu_init(void)
1239#endif 1243#endif
1240 1244
1241 clear_all_debug_regs(); 1245 clear_all_debug_regs();
1246 dbg_restore_debug_regs();
1242 1247
1243 /* 1248 /*
1244 * Force FPU initialization: 1249 * Force FPU initialization:
1245 */ 1250 */
1246 if (cpu_has_xsave) 1251 current_thread_info()->status = 0;
1247 current_thread_info()->status = TS_XSAVE;
1248 else
1249 current_thread_info()->status = 0;
1250 clear_used_math(); 1252 clear_used_math();
1251 mxcsr_feature_mask_init(); 1253 mxcsr_feature_mask_init();
1252 1254
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170..bd54bf67e6f 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
2# K8 systems. ACPI is preferred to all other hardware-specific drivers. 2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod. 3# speedstep-* is preferred over p4-clockmod.
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o 7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b7..1d3cddaa40e 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -46,6 +46,7 @@
46#include <asm/msr.h> 46#include <asm/msr.h>
47#include <asm/processor.h> 47#include <asm/processor.h>
48#include <asm/cpufeature.h> 48#include <asm/cpufeature.h>
49#include "mperf.h"
49 50
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ 51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg) 52 "acpi-cpufreq", msg)
@@ -71,8 +72,6 @@ struct acpi_cpufreq_data {
71 72
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); 73static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73 74
74static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
75
76/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
77static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
78 77
@@ -240,45 +239,6 @@ static u32 get_cur_val(const struct cpumask *mask)
240 return cmd.val; 239 return cmd.val;
241} 240}
242 241
243/* Called via smp_call_function_single(), on the target CPU */
244static void read_measured_perf_ctrs(void *_cur)
245{
246 struct aperfmperf *am = _cur;
247
248 get_aperfmperf(am);
249}
250
251/*
252 * Return the measured active (C0) frequency on this CPU since last call
253 * to this function.
254 * Input: cpu number
255 * Return: Average CPU frequency in terms of max frequency (zero on error)
256 *
257 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
258 * over a period of time, while CPU is in C0 state.
259 * IA32_MPERF counts at the rate of max advertised frequency
260 * IA32_APERF counts at the rate of actual CPU frequency
261 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
262 * no meaning should be associated with absolute values of these MSRs.
263 */
264static unsigned int get_measured_perf(struct cpufreq_policy *policy,
265 unsigned int cpu)
266{
267 struct aperfmperf perf;
268 unsigned long ratio;
269 unsigned int retval;
270
271 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
272 return 0;
273
274 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
275 per_cpu(acfreq_old_perf, cpu) = perf;
276
277 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
278
279 return retval;
280}
281
282static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 242static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
283{ 243{
284 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); 244 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -702,7 +662,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
702 662
703 /* Check for APERF/MPERF support in hardware */ 663 /* Check for APERF/MPERF support in hardware */
704 if (cpu_has(c, X86_FEATURE_APERFMPERF)) 664 if (cpu_has(c, X86_FEATURE_APERFMPERF))
705 acpi_cpufreq_driver.getavg = get_measured_perf; 665 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
706 666
707 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 667 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
708 for (i = 0; i < perf->state_count; i++) 668 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 00000000000..911e193018a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 00000000000..5dbf2950dc2
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index b6215b9798e..7ec2123838e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
1
2/* 1/*
3 * (c) 2003-2006 Advanced Micro Devices, Inc. 2 * (c) 2003-2010 Advanced Micro Devices, Inc.
4 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
5 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
6 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
@@ -46,6 +45,7 @@
46#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
47#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
48#include "powernow-k8.h" 47#include "powernow-k8.h"
48#include "mperf.h"
49 49
50/* serialize freq changes */ 50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex); 51static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54 54
55static int cpu_family = CPU_OPTERON; 55static int cpu_family = CPU_OPTERON;
56 56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
57#ifndef CONFIG_SMP 63#ifndef CONFIG_SMP
58static inline const struct cpumask *cpu_core_mask(int cpu) 64static inline const struct cpumask *cpu_core_mask(int cpu)
59{ 65{
@@ -1249,6 +1255,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1249 struct powernow_k8_data *data; 1255 struct powernow_k8_data *data;
1250 struct init_on_cpu init_on_cpu; 1256 struct init_on_cpu init_on_cpu;
1251 int rc; 1257 int rc;
1258 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1252 1259
1253 if (!cpu_online(pol->cpu)) 1260 if (!cpu_online(pol->cpu))
1254 return -ENODEV; 1261 return -ENODEV;
@@ -1323,6 +1330,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1323 return -EINVAL; 1330 return -EINVAL;
1324 } 1331 }
1325 1332
1333 /* Check for APERF/MPERF support in hardware */
1334 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1335 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1336
1326 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1337 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1327 1338
1328 if (cpu_family == CPU_HW_PSTATE) 1339 if (cpu_family == CPU_HW_PSTATE)
@@ -1394,8 +1405,77 @@ out:
1394 return khz; 1405 return khz;
1395} 1406}
1396 1407
1408static void _cpb_toggle_msrs(bool t)
1409{
1410 int cpu;
1411
1412 get_online_cpus();
1413
1414 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1415
1416 for_each_cpu(cpu, cpu_online_mask) {
1417 struct msr *reg = per_cpu_ptr(msrs, cpu);
1418 if (t)
1419 reg->l &= ~BIT(25);
1420 else
1421 reg->l |= BIT(25);
1422 }
1423 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1424
1425 put_online_cpus();
1426}
1427
1428/*
1429 * Switch on/off core performance boosting.
1430 *
1431 * 0=disable
1432 * 1=enable.
1433 */
1434static void cpb_toggle(bool t)
1435{
1436 if (!cpb_capable)
1437 return;
1438
1439 if (t && !cpb_enabled) {
1440 cpb_enabled = true;
1441 _cpb_toggle_msrs(t);
1442 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1443 } else if (!t && cpb_enabled) {
1444 cpb_enabled = false;
1445 _cpb_toggle_msrs(t);
1446 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1447 }
1448}
1449
1450static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1451 size_t count)
1452{
1453 int ret = -EINVAL;
1454 unsigned long val = 0;
1455
1456 ret = strict_strtoul(buf, 10, &val);
1457 if (!ret && (val == 0 || val == 1) && cpb_capable)
1458 cpb_toggle(val);
1459 else
1460 return -EINVAL;
1461
1462 return count;
1463}
1464
1465static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1466{
1467 return sprintf(buf, "%u\n", cpb_enabled);
1468}
1469
1470#define define_one_rw(_name) \
1471static struct freq_attr _name = \
1472__ATTR(_name, 0644, show_##_name, store_##_name)
1473
1474define_one_rw(cpb);
1475
1397static struct freq_attr *powernow_k8_attr[] = { 1476static struct freq_attr *powernow_k8_attr[] = {
1398 &cpufreq_freq_attr_scaling_available_freqs, 1477 &cpufreq_freq_attr_scaling_available_freqs,
1478 &cpb,
1399 NULL, 1479 NULL,
1400}; 1480};
1401 1481
@@ -1411,10 +1491,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
1411 .attr = powernow_k8_attr, 1491 .attr = powernow_k8_attr,
1412}; 1492};
1413 1493
1494/*
1495 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1496 * cannot block the remaining ones from boosting. On the CPU_UP path we
1497 * simply keep the boost-disable flag in sync with the current global
1498 * state.
1499 */
1500static int cpb_notify(struct notifier_block *nb, unsigned long action,
1501 void *hcpu)
1502{
1503 unsigned cpu = (long)hcpu;
1504 u32 lo, hi;
1505
1506 switch (action) {
1507 case CPU_UP_PREPARE:
1508 case CPU_UP_PREPARE_FROZEN:
1509
1510 if (!cpb_enabled) {
1511 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1512 lo |= BIT(25);
1513 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1514 }
1515 break;
1516
1517 case CPU_DOWN_PREPARE:
1518 case CPU_DOWN_PREPARE_FROZEN:
1519 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1520 lo &= ~BIT(25);
1521 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1522 break;
1523
1524 default:
1525 break;
1526 }
1527
1528 return NOTIFY_OK;
1529}
1530
1531static struct notifier_block cpb_nb = {
1532 .notifier_call = cpb_notify,
1533};
1534
1414/* driver entry point for init */ 1535/* driver entry point for init */
1415static int __cpuinit powernowk8_init(void) 1536static int __cpuinit powernowk8_init(void)
1416{ 1537{
1417 unsigned int i, supported_cpus = 0; 1538 unsigned int i, supported_cpus = 0, cpu;
1418 1539
1419 for_each_online_cpu(i) { 1540 for_each_online_cpu(i) {
1420 int rc; 1541 int rc;
@@ -1423,15 +1544,36 @@ static int __cpuinit powernowk8_init(void)
1423 supported_cpus++; 1544 supported_cpus++;
1424 } 1545 }
1425 1546
1426 if (supported_cpus == num_online_cpus()) { 1547 if (supported_cpus != num_online_cpus())
1427 printk(KERN_INFO PFX "Found %d %s " 1548 return -ENODEV;
1428 "processors (%d cpu cores) (" VERSION ")\n", 1549
1429 num_online_nodes(), 1550 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1430 boot_cpu_data.x86_model_id, supported_cpus); 1551 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1431 return cpufreq_register_driver(&cpufreq_amd64_driver); 1552
1553 if (boot_cpu_has(X86_FEATURE_CPB)) {
1554
1555 cpb_capable = true;
1556
1557 register_cpu_notifier(&cpb_nb);
1558
1559 msrs = msrs_alloc();
1560 if (!msrs) {
1561 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1562 return -ENOMEM;
1563 }
1564
1565 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1566
1567 for_each_cpu(cpu, cpu_online_mask) {
1568 struct msr *reg = per_cpu_ptr(msrs, cpu);
1569 cpb_enabled |= !(!!(reg->l & BIT(25)));
1570 }
1571
1572 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1573 (cpb_enabled ? "on" : "off"));
1432 } 1574 }
1433 1575
1434 return -ENODEV; 1576 return cpufreq_register_driver(&cpufreq_amd64_driver);
1435} 1577}
1436 1578
1437/* driver entry point for term */ 1579/* driver entry point for term */
@@ -1439,6 +1581,13 @@ static void __exit powernowk8_exit(void)
1439{ 1581{
1440 dprintk("exit\n"); 1582 dprintk("exit\n");
1441 1583
1584 if (boot_cpu_has(X86_FEATURE_CPB)) {
1585 msrs_free(msrs);
1586 msrs = NULL;
1587
1588 unregister_cpu_notifier(&cpb_nb);
1589 }
1590
1442 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1591 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1443} 1592}
1444 1593
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073c..df3529b1c02 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 */ 6 */
7 7
8
9enum pstate { 8enum pstate {
10 HW_PSTATE_INVALID = 0xff, 9 HW_PSTATE_INVALID = 0xff,
11 HW_PSTATE_0 = 0, 10 HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
55 struct cpumask *available_cores; 54 struct cpumask *available_cores;
56}; 55};
57 56
58
59/* processor's cpuid instruction support */ 57/* processor's cpuid instruction support */
60#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ 58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
61#define CPUID_XFAM 0x0ff00000 /* extended family */ 59#define CPUID_XFAM 0x0ff00000 /* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33..dd531cc56a8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,55 @@
21 * 21 *
22 */ 22 */
23 23
24#include <linux/module.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h> 26#include <asm/hypervisor.h>
27 27
28static inline void __cpuinit 28/*
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29 * Hypervisor detect order. This is specified explicitly here because
30 * some hypervisors might implement compatibility modes for other
31 * hypervisors and therefore need to be detected in specific sequence.
32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] =
30{ 34{
31 if (vmware_platform()) 35 &x86_hyper_vmware,
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 36 &x86_hyper_ms_hyperv,
33 else 37};
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35}
36 38
37static inline void __cpuinit 39const struct hypervisor_x86 *x86_hyper;
38hypervisor_set_feature_bits(struct cpuinfo_x86 *c) 40EXPORT_SYMBOL(x86_hyper);
41
42static inline void __init
43detect_hypervisor_vendor(void)
39{ 44{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { 45 const struct hypervisor_x86 *h, * const *p;
41 vmware_set_feature_bits(c); 46
42 return; 47 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
48 h = *p;
49 if (h->detect()) {
50 x86_hyper = h;
51 printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
52 break;
53 }
43 } 54 }
44} 55}
45 56
46void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) 57void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
47{ 58{
48 detect_hypervisor_vendor(c); 59 if (x86_hyper && x86_hyper->set_cpu_features)
49 hypervisor_set_feature_bits(c); 60 x86_hyper->set_cpu_features(c);
50} 61}
51 62
52void __init init_hypervisor_platform(void) 63void __init init_hypervisor_platform(void)
53{ 64{
65
66 detect_hypervisor_vendor();
67
68 if (!x86_hyper)
69 return;
70
54 init_hypervisor(&boot_cpu_data); 71 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) 72
56 vmware_platform_setup(); 73 if (x86_hyper->init_platform)
74 x86_hyper->init_platform();
57} 75}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd48..85f69cdeae1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17#include <asm/cpu.h> 16#include <asm/cpu.h>
18 17
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
373 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 372 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
374 } 373 }
375 374
376 if (c->cpuid_level > 6) {
377 unsigned ecx = cpuid_ecx(6);
378 if (ecx & 0x01)
379 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
380 }
381
382 if (cpu_has_xmm2) 375 if (cpu_has_xmm2)
383 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 376 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
384 if (cpu_has_ds) { 377 if (cpu_has_ds) {
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
388 set_cpu_cap(c, X86_FEATURE_BTS); 381 set_cpu_cap(c, X86_FEATURE_BTS);
389 if (!(l1 & (1<<12))) 382 if (!(l1 & (1<<12)))
390 set_cpu_cap(c, X86_FEATURE_PEBS); 383 set_cpu_cap(c, X86_FEATURE_PEBS);
391 ds_init_intel(c);
392 } 384 }
393 385
394 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) 386 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a5..33eae2062cf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
148 u32 full; 148 u32 full;
149}; 149};
150 150
151struct amd_l3_cache {
152 struct pci_dev *dev;
153 bool can_disable;
154 unsigned indices;
155 u8 subcaches[4];
156};
157
151struct _cpuid4_info { 158struct _cpuid4_info {
152 union _cpuid4_leaf_eax eax; 159 union _cpuid4_leaf_eax eax;
153 union _cpuid4_leaf_ebx ebx; 160 union _cpuid4_leaf_ebx ebx;
154 union _cpuid4_leaf_ecx ecx; 161 union _cpuid4_leaf_ecx ecx;
155 unsigned long size; 162 unsigned long size;
156 bool can_disable; 163 struct amd_l3_cache *l3;
157 unsigned int l3_indices;
158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
159}; 165};
160 166
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
164 union _cpuid4_leaf_ebx ebx; 170 union _cpuid4_leaf_ebx ebx;
165 union _cpuid4_leaf_ecx ecx; 171 union _cpuid4_leaf_ecx ecx;
166 unsigned long size; 172 unsigned long size;
167 bool can_disable; 173 struct amd_l3_cache *l3;
168 unsigned int l3_indices;
169}; 174};
170 175
171unsigned short num_cache_leaves; 176unsigned short num_cache_leaves;
@@ -302,87 +307,163 @@ struct _cache_attr {
302}; 307};
303 308
304#ifdef CONFIG_CPU_SUP_AMD 309#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void) 310
311/*
312 * L3 cache descriptors
313 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
306{ 317{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3; 318 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0; 319 u32 val = 0;
316 320
317 pci_read_config_dword(dev, 0x1C4, &val); 321 pci_read_config_dword(l3->dev, 0x1C4, &val);
318 322
319 /* calculate subcache sizes */ 323 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0)); 324 l3->subcaches[0] = sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4)); 325 l3->subcaches[1] = sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9)); 326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13)); 327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
324 328
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
330}
331
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
333{
334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
326} 348}
327 349
328static void __cpuinit 350static void __cpuinit
329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 351amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
330{ 352{
331 if (index < 3) 353 int node;
354
355 if (boot_cpu_data.x86 != 0x10)
332 return; 356 return;
333 357
334 if (boot_cpu_data.x86 == 0x11) 358 if (index < 3)
335 return; 359 return;
336 360
337 /* see errata #382 and #388 */ 361 /* see errata #382 and #388 */
338 if ((boot_cpu_data.x86 == 0x10) && 362 if (boot_cpu_data.x86_model < 0x8)
339 ((boot_cpu_data.x86_model < 0x8) || 363 return;
340 (boot_cpu_data.x86_mask < 0x1))) 364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0)
341 return; 373 return;
342 374
343 this_leaf->can_disable = true; 375 /*
344 this_leaf->l3_indices = amd_calc_l3_indices(); 376 * Strictly speaking, the amount in @size below is leaked since it is
377 * never freed but this is done only on shutdown so it doesn't matter.
378 */
379 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
381
382 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches)
384 return;
385 }
386
387 node = amd_get_nb_id(smp_processor_id());
388
389 if (!l3_caches[node]) {
390 l3_caches[node] = amd_init_l3_cache(node);
391 l3_caches[node]->can_disable = true;
392 }
393
394 WARN_ON(!l3_caches[node]);
395
396 this_leaf->l3 = l3_caches[node];
345} 397}
346 398
347static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 399static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
348 unsigned int index) 400 unsigned int slot)
349{ 401{
350 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 402 struct pci_dev *dev = this_leaf->l3->dev;
351 int node = amd_get_nb_id(cpu);
352 struct pci_dev *dev = node_to_k8_nb_misc(node);
353 unsigned int reg = 0; 403 unsigned int reg = 0;
354 404
355 if (!this_leaf->can_disable) 405 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
356 return -EINVAL; 406 return -EINVAL;
357 407
358 if (!dev) 408 if (!dev)
359 return -EINVAL; 409 return -EINVAL;
360 410
361 pci_read_config_dword(dev, 0x1BC + index * 4, &reg); 411 pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
362 return sprintf(buf, "0x%08x\n", reg); 412 return sprintf(buf, "0x%08x\n", reg);
363} 413}
364 414
365#define SHOW_CACHE_DISABLE(index) \ 415#define SHOW_CACHE_DISABLE(slot) \
366static ssize_t \ 416static ssize_t \
367show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ 417show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
368{ \ 418{ \
369 return show_cache_disable(this_leaf, buf, index); \ 419 return show_cache_disable(this_leaf, buf, slot); \
370} 420}
371SHOW_CACHE_DISABLE(0) 421SHOW_CACHE_DISABLE(0)
372SHOW_CACHE_DISABLE(1) 422SHOW_CACHE_DISABLE(1)
373 423
424static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
425 unsigned slot, unsigned long idx)
426{
427 int i;
428
429 idx |= BIT(30);
430
431 /*
432 * disable index in all 4 subcaches
433 */
434 for (i = 0; i < 4; i++) {
435 u32 reg = idx | (i << 20);
436
437 if (!l3->subcaches[i])
438 continue;
439
440 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
441
442 /*
443 * We need to WBINVD on a core on the node containing the L3
444 * cache which indices we disable therefore a simple wbinvd()
445 * is not sufficient.
446 */
447 wbinvd_on_cpu(cpu);
448
449 reg |= BIT(31);
450 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
451 }
452}
453
454
374static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 455static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
375 const char *buf, size_t count, unsigned int index) 456 const char *buf, size_t count,
457 unsigned int slot)
376{ 458{
459 struct pci_dev *dev = this_leaf->l3->dev;
377 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 460 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
378 int node = amd_get_nb_id(cpu);
379 struct pci_dev *dev = node_to_k8_nb_misc(node);
380 unsigned long val = 0; 461 unsigned long val = 0;
381 462
382#define SUBCACHE_MASK (3UL << 20) 463#define SUBCACHE_MASK (3UL << 20)
383#define SUBCACHE_INDEX 0xfff 464#define SUBCACHE_INDEX 0xfff
384 465
385 if (!this_leaf->can_disable) 466 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
386 return -EINVAL; 467 return -EINVAL;
387 468
388 if (!capable(CAP_SYS_ADMIN)) 469 if (!capable(CAP_SYS_ADMIN))
@@ -396,26 +477,20 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
396 477
397 /* do not allow writes outside of allowed bits */ 478 /* do not allow writes outside of allowed bits */
398 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 479 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
399 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) 480 ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
400 return -EINVAL; 481 return -EINVAL;
401 482
402 val |= BIT(30); 483 amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
403 pci_write_config_dword(dev, 0x1BC + index * 4, val); 484
404 /*
405 * We need to WBINVD on a core on the node containing the L3 cache which
406 * indices we disable therefore a simple wbinvd() is not sufficient.
407 */
408 wbinvd_on_cpu(cpu);
409 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
410 return count; 485 return count;
411} 486}
412 487
413#define STORE_CACHE_DISABLE(index) \ 488#define STORE_CACHE_DISABLE(slot) \
414static ssize_t \ 489static ssize_t \
415store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ 490store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
416 const char *buf, size_t count) \ 491 const char *buf, size_t count) \
417{ \ 492{ \
418 return store_cache_disable(this_leaf, buf, count, index); \ 493 return store_cache_disable(this_leaf, buf, count, slot); \
419} 494}
420STORE_CACHE_DISABLE(0) 495STORE_CACHE_DISABLE(0)
421STORE_CACHE_DISABLE(1) 496STORE_CACHE_DISABLE(1)
@@ -443,8 +518,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
443 518
444 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 519 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
445 amd_cpuid4(index, &eax, &ebx, &ecx); 520 amd_cpuid4(index, &eax, &ebx, &ecx);
446 if (boot_cpu_data.x86 >= 0x10) 521 amd_check_l3_disable(index, this_leaf);
447 amd_check_l3_disable(index, this_leaf);
448 } else { 522 } else {
449 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 523 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
450 } 524 }
@@ -701,6 +775,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
701 for (i = 0; i < num_cache_leaves; i++) 775 for (i = 0; i < num_cache_leaves; i++)
702 cache_remove_shared_cpu_map(cpu, i); 776 cache_remove_shared_cpu_map(cpu, i);
703 777
778 kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
704 kfree(per_cpu(ici_cpuid4_info, cpu)); 779 kfree(per_cpu(ici_cpuid4_info, cpu));
705 per_cpu(ici_cpuid4_info, cpu) = NULL; 780 per_cpu(ici_cpuid4_info, cpu) = NULL;
706} 781}
@@ -985,7 +1060,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
985 1060
986 this_leaf = CPUID4_INFO_IDX(cpu, i); 1061 this_leaf = CPUID4_INFO_IDX(cpu, i);
987 1062
988 if (this_leaf->can_disable) 1063 if (this_leaf->l3 && this_leaf->l3->can_disable)
989 ktype_cache.default_attrs = default_l3_attrs; 1064 ktype_cache.default_attrs = default_l3_attrs;
990 else 1065 else
991 ktype_cache.default_attrs = default_attrs; 1066 ktype_cache.default_attrs = default_attrs;
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11..bb34b03af25 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
8 8
9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o 9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
10
11obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 00000000000..745b54f9be8
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
1/*
2 * Bridge between MCE and APEI
3 *
4 * On some machine, corrected memory errors are reported via APEI
5 * generic hardware error source (GHES) instead of corrected Machine
6 * Check. These corrected memory errors can be reported to user space
7 * through /dev/mcelog via faking a corrected Machine Check, so that
8 * the error memory page can be offlined by /sbin/mcelog if the error
9 * count for one page is beyond the threshold.
10 *
11 * For fatal MCE, save MCE record into persistent storage via ERST, so
12 * that the MCE record can be logged after reboot via ERST.
13 *
14 * Copyright 2010 Intel Corp.
15 * Author: Huang Ying <ying.huang@intel.com>
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public License version
19 * 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 */
30
31#include <linux/kernel.h>
32#include <linux/acpi.h>
33#include <linux/cper.h>
34#include <acpi/apei.h>
35#include <asm/mce.h>
36
37#include "mce-internal.h"
38
39void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
40{
41 struct mce m;
42
43 /* Only corrected MC is reported */
44 if (!corrected)
45 return;
46
47 mce_setup(&m);
48 m.bank = 1;
49 /* Fake a memory read corrected error with unknown channel */
50 m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
51 m.addr = mem_err->physical_addr;
52 mce_log(&m);
53 mce_notify_irq();
54}
55EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
56
57#define CPER_CREATOR_MCE \
58 UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
59 0x64, 0x90, 0xb8, 0x9d)
60#define CPER_SECTION_TYPE_MCE \
61 UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
62 0x04, 0x4a, 0x38, 0xfc)
63
64/*
65 * CPER specification (in UEFI specification 2.3 appendix N) requires
66 * byte-packed.
67 */
68struct cper_mce_record {
69 struct cper_record_header hdr;
70 struct cper_section_descriptor sec_hdr;
71 struct mce mce;
72} __packed;
73
74int apei_write_mce(struct mce *m)
75{
76 struct cper_mce_record rcd;
77
78 memset(&rcd, 0, sizeof(rcd));
79 memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
80 rcd.hdr.revision = CPER_RECORD_REV;
81 rcd.hdr.signature_end = CPER_SIG_END;
82 rcd.hdr.section_count = 1;
83 rcd.hdr.error_severity = CPER_SER_FATAL;
84 /* timestamp, platform_id, partition_id are all invalid */
85 rcd.hdr.validation_bits = 0;
86 rcd.hdr.record_length = sizeof(rcd);
87 rcd.hdr.creator_id = CPER_CREATOR_MCE;
88 rcd.hdr.notification_type = CPER_NOTIFY_MCE;
89 rcd.hdr.record_id = cper_next_record_id();
90 rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
91
92 rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
93 rcd.sec_hdr.section_length = sizeof(rcd.mce);
94 rcd.sec_hdr.revision = CPER_SEC_REV;
95 /* fru_id and fru_text is invalid */
96 rcd.sec_hdr.validation_bits = 0;
97 rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
98 rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
99 rcd.sec_hdr.section_severity = CPER_SER_FATAL;
100
101 memcpy(&rcd.mce, m, sizeof(*m));
102
103 return erst_write(&rcd.hdr);
104}
105
106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{
108 struct cper_mce_record rcd;
109 ssize_t len;
110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd));
112 if (len <= 0)
113 return len;
114 /* Can not skip other records in storage via ERST unless clear them */
115 else if (len != sizeof(rcd) ||
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
117 if (printk_ratelimit())
118 pr_warning(
119 "MCE-APEI: Can not skip the unknown record in ERST");
120 return -EIO;
121 }
122
123 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id;
125
126 return sizeof(*m);
127}
128
129/* Check whether there is record in ERST */
130int apei_check_mce(void)
131{
132 return erst_get_record_count();
133}
134
135int apei_clear_mce(u64 record_id)
136{
137 return erst_clear(record_id);
138}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab6..fefcc69ee8b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
28 28
29extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
30 30
31#ifdef CONFIG_ACPI_APEI
32int apei_write_mce(struct mce *m);
33ssize_t apei_read_mce(struct mce *m, u64 *record_id);
34int apei_check_mce(void);
35int apei_clear_mce(u64 record_id);
36#else
37static inline int apei_write_mce(struct mce *m)
38{
39 return -EINVAL;
40}
41static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
42{
43 return 0;
44}
45static inline int apei_check_mce(void)
46{
47 return 0;
48}
49static inline int apei_clear_mce(u64 record_id)
50{
51 return -EINVAL;
52}
53#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767..707165dbc20 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -264,7 +264,7 @@ static void wait_for_panic(void)
264 264
265static void mce_panic(char *msg, struct mce *final, char *exp) 265static void mce_panic(char *msg, struct mce *final, char *exp)
266{ 266{
267 int i; 267 int i, apei_err = 0;
268 268
269 if (!fake_panic) { 269 if (!fake_panic) {
270 /* 270 /*
@@ -287,8 +287,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
287 struct mce *m = &mcelog.entry[i]; 287 struct mce *m = &mcelog.entry[i];
288 if (!(m->status & MCI_STATUS_VAL)) 288 if (!(m->status & MCI_STATUS_VAL))
289 continue; 289 continue;
290 if (!(m->status & MCI_STATUS_UC)) 290 if (!(m->status & MCI_STATUS_UC)) {
291 print_mce(m); 291 print_mce(m);
292 if (!apei_err)
293 apei_err = apei_write_mce(m);
294 }
292 } 295 }
293 /* Now print uncorrected but with the final one last */ 296 /* Now print uncorrected but with the final one last */
294 for (i = 0; i < MCE_LOG_LEN; i++) { 297 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,11 +300,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
297 continue; 300 continue;
298 if (!(m->status & MCI_STATUS_UC)) 301 if (!(m->status & MCI_STATUS_UC))
299 continue; 302 continue;
300 if (!final || memcmp(m, final, sizeof(struct mce))) 303 if (!final || memcmp(m, final, sizeof(struct mce))) {
301 print_mce(m); 304 print_mce(m);
305 if (!apei_err)
306 apei_err = apei_write_mce(m);
307 }
302 } 308 }
303 if (final) 309 if (final) {
304 print_mce(final); 310 print_mce(final);
311 if (!apei_err)
312 apei_err = apei_write_mce(final);
313 }
305 if (cpu_missing) 314 if (cpu_missing)
306 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 315 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
307 print_mce_tail(); 316 print_mce_tail();
@@ -539,7 +548,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
539 struct mce m; 548 struct mce m;
540 int i; 549 int i;
541 550
542 __get_cpu_var(mce_poll_count)++; 551 percpu_inc(mce_poll_count);
543 552
544 mce_setup(&m); 553 mce_setup(&m);
545 554
@@ -934,7 +943,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
934 943
935 atomic_inc(&mce_entry); 944 atomic_inc(&mce_entry);
936 945
937 __get_cpu_var(mce_exception_count)++; 946 percpu_inc(mce_exception_count);
938 947
939 if (notify_die(DIE_NMI, "machine check", regs, error_code, 948 if (notify_die(DIE_NMI, "machine check", regs, error_code,
940 18, SIGKILL) == NOTIFY_STOP) 949 18, SIGKILL) == NOTIFY_STOP)
@@ -1493,6 +1502,43 @@ static void collect_tscs(void *data)
1493 rdtscll(cpu_tsc[smp_processor_id()]); 1502 rdtscll(cpu_tsc[smp_processor_id()]);
1494} 1503}
1495 1504
1505static int mce_apei_read_done;
1506
1507/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1508static int __mce_read_apei(char __user **ubuf, size_t usize)
1509{
1510 int rc;
1511 u64 record_id;
1512 struct mce m;
1513
1514 if (usize < sizeof(struct mce))
1515 return -EINVAL;
1516
1517 rc = apei_read_mce(&m, &record_id);
1518 /* Error or no more MCE record */
1519 if (rc <= 0) {
1520 mce_apei_read_done = 1;
1521 return rc;
1522 }
1523 rc = -EFAULT;
1524 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1525 return rc;
1526 /*
1527 * In fact, we should have cleared the record after that has
1528 * been flushed to the disk or sent to network in
1529 * /sbin/mcelog, but we have no interface to support that now,
1530 * so just clear it to avoid duplication.
1531 */
1532 rc = apei_clear_mce(record_id);
1533 if (rc) {
1534 mce_apei_read_done = 1;
1535 return rc;
1536 }
1537 *ubuf += sizeof(struct mce);
1538
1539 return 0;
1540}
1541
1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1542static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1497 loff_t *off) 1543 loff_t *off)
1498{ 1544{
@@ -1506,15 +1552,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1506 return -ENOMEM; 1552 return -ENOMEM;
1507 1553
1508 mutex_lock(&mce_read_mutex); 1554 mutex_lock(&mce_read_mutex);
1555
1556 if (!mce_apei_read_done) {
1557 err = __mce_read_apei(&buf, usize);
1558 if (err || buf != ubuf)
1559 goto out;
1560 }
1561
1509 next = rcu_dereference_check_mce(mcelog.next); 1562 next = rcu_dereference_check_mce(mcelog.next);
1510 1563
1511 /* Only supports full reads right now */ 1564 /* Only supports full reads right now */
1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1565 err = -EINVAL;
1513 mutex_unlock(&mce_read_mutex); 1566 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1514 kfree(cpu_tsc); 1567 goto out;
1515
1516 return -EINVAL;
1517 }
1518 1568
1519 err = 0; 1569 err = 0;
1520 prev = 0; 1570 prev = 0;
@@ -1562,10 +1612,15 @@ timeout:
1562 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1612 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1563 } 1613 }
1564 } 1614 }
1615
1616 if (err)
1617 err = -EFAULT;
1618
1619out:
1565 mutex_unlock(&mce_read_mutex); 1620 mutex_unlock(&mce_read_mutex);
1566 kfree(cpu_tsc); 1621 kfree(cpu_tsc);
1567 1622
1568 return err ? -EFAULT : buf - ubuf; 1623 return err ? err : buf - ubuf;
1569} 1624}
1570 1625
1571static unsigned int mce_poll(struct file *file, poll_table *wait) 1626static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1628,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1573 poll_wait(file, &mce_wait, wait); 1628 poll_wait(file, &mce_wait, wait);
1574 if (rcu_dereference_check_mce(mcelog.next)) 1629 if (rcu_dereference_check_mce(mcelog.next))
1575 return POLLIN | POLLRDNORM; 1630 return POLLIN | POLLRDNORM;
1631 if (!mce_apei_read_done && apei_check_mce())
1632 return POLLIN | POLLRDNORM;
1576 return 0; 1633 return 0;
1577} 1634}
1578 1635
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb2..e1a0a3bf971 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
190 mutex_unlock(&therm_cpu_lock); 190 mutex_unlock(&therm_cpu_lock);
191 break; 191 break;
192 } 192 }
193 return err ? NOTIFY_BAD : NOTIFY_OK; 193 return notifier_from_errno(err);
194} 194}
195 195
196static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = 196static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 00000000000..16f41bbe46b
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,55 @@
1/*
2 * HyperV Detection code.
3 *
4 * Copyright (C) 2010, Novell, Inc.
5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 */
12
13#include <linux/types.h>
14#include <linux/module.h>
15#include <asm/processor.h>
16#include <asm/hypervisor.h>
17#include <asm/hyperv.h>
18#include <asm/mshyperv.h>
19
20struct ms_hyperv_info ms_hyperv;
21
22static bool __init ms_hyperv_platform(void)
23{
24 u32 eax;
25 u32 hyp_signature[3];
26
27 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
28 return false;
29
30 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
31 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
32
33 return eax >= HYPERV_CPUID_MIN &&
34 eax <= HYPERV_CPUID_MAX &&
35 !memcmp("Microsoft Hv", hyp_signature, 12);
36}
37
38static void __init ms_hyperv_init_platform(void)
39{
40 /*
41 * Extract the features and hints
42 */
43 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
44 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
45
46 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
47 ms_hyperv.features, ms_hyperv.hints);
48}
49
50const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
51 .name = "Microsoft HyperV",
52 .detect = ms_hyperv_platform,
53 .init_platform = ms_hyperv_init_platform,
54};
55EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf..c77586061bc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33 33
34static u64 perf_event_mask __read_mostly; 34#if 0
35#undef wrmsrl
36#define wrmsrl(msr, val) \
37do { \
38 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
39 (unsigned long)(val)); \
40 native_write_msr((msr), (u32)((u64)(val)), \
41 (u32)((u64)(val) >> 32)); \
42} while (0)
43#endif
35 44
36/* The maximal number of PEBS events: */ 45/*
37#define MAX_PEBS_EVENTS 4 46 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
47 */
48static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{
51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0;
54 struct page *page;
55 void *map;
56 int ret;
38 57
39/* The size of a BTS record in bytes: */ 58 do {
40#define BTS_RECORD_SIZE 24 59 ret = __get_user_pages_fast(addr, 1, 0, &page);
60 if (!ret)
61 break;
41 62
42/* The size of a per-cpu BTS buffer in bytes: */ 63 offset = addr & (PAGE_SIZE - 1);
43#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) 64 size = min(PAGE_SIZE - offset, n - len);
44 65
45/* The BTS overflow threshold in bytes from the end of the buffer: */ 66 map = kmap_atomic(page, type);
46#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) 67 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type);
69 put_page(page);
47 70
71 len += size;
72 to += size;
73 addr += size;
48 74
49/* 75 } while (len < n);
50 * Bits in the debugctlmsr controlling branch tracing.
51 */
52#define X86_DEBUGCTL_TR (1 << 6)
53#define X86_DEBUGCTL_BTS (1 << 7)
54#define X86_DEBUGCTL_BTINT (1 << 8)
55#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
56#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
57 76
58/* 77 return len;
59 * A debug store configuration. 78}
60 *
61 * We only support architectures that use 64bit fields.
62 */
63struct debug_store {
64 u64 bts_buffer_base;
65 u64 bts_index;
66 u64 bts_absolute_maximum;
67 u64 bts_interrupt_threshold;
68 u64 pebs_buffer_base;
69 u64 pebs_index;
70 u64 pebs_absolute_maximum;
71 u64 pebs_interrupt_threshold;
72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
73};
74 79
75struct event_constraint { 80struct event_constraint {
76 union { 81 union {
@@ -89,18 +94,41 @@ struct amd_nb {
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90}; 95};
91 96
97#define MAX_LBR_ENTRIES 16
98
92struct cpu_hw_events { 99struct cpu_hw_events {
100 /*
101 * Generic x86 PMC bits
102 */
93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
95 unsigned long interrupts;
96 int enabled; 105 int enabled;
97 struct debug_store *ds;
98 106
99 int n_events; 107 int n_events;
100 int n_added; 108 int n_added;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 109 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX]; 110 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 111 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
112
113 unsigned int group_flag;
114
115 /*
116 * Intel DebugStore bits
117 */
118 struct debug_store *ds;
119 u64 pebs_enabled;
120
121 /*
122 * Intel LBR bits
123 */
124 int lbr_users;
125 void *lbr_context;
126 struct perf_branch_stack lbr_stack;
127 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
128
129 /*
130 * AMD specific bits
131 */
104 struct amd_nb *amd_nb; 132 struct amd_nb *amd_nb;
105}; 133};
106 134
@@ -114,44 +142,75 @@ struct cpu_hw_events {
114#define EVENT_CONSTRAINT(c, n, m) \ 142#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 143 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116 144
145/*
146 * Constraint on the Event code.
147 */
117#define INTEL_EVENT_CONSTRAINT(c, n) \ 148#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) 149 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
119 150
151/*
152 * Constraint on the Event code + UMask + fixed-mask
153 *
154 * filter mask to validate fixed counter events.
155 * the following filters disqualify for fixed counters:
156 * - inv
157 * - edge
158 * - cnt-mask
159 * The other filters are supported by fixed counters.
160 * The any-thread option is supported starting with v3.
161 */
120#define FIXED_EVENT_CONSTRAINT(c, n) \ 162#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) 163 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
164
165/*
166 * Constraint on the Event code + UMask
167 */
168#define PEBS_EVENT_CONSTRAINT(c, n) \
169 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
122 170
123#define EVENT_CONSTRAINT_END \ 171#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0) 172 EVENT_CONSTRAINT(0, 0, 0)
125 173
126#define for_each_event_constraint(e, c) \ 174#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++) 175 for ((e) = (c); (e)->weight; (e)++)
176
177union perf_capabilities {
178 struct {
179 u64 lbr_format : 6;
180 u64 pebs_trap : 1;
181 u64 pebs_arch_reg : 1;
182 u64 pebs_format : 4;
183 u64 smm_freeze : 1;
184 };
185 u64 capabilities;
186};
128 187
129/* 188/*
130 * struct x86_pmu - generic x86 pmu 189 * struct x86_pmu - generic x86 pmu
131 */ 190 */
132struct x86_pmu { 191struct x86_pmu {
192 /*
193 * Generic x86 PMC bits
194 */
133 const char *name; 195 const char *name;
134 int version; 196 int version;
135 int (*handle_irq)(struct pt_regs *); 197 int (*handle_irq)(struct pt_regs *);
136 void (*disable_all)(void); 198 void (*disable_all)(void);
137 void (*enable_all)(void); 199 void (*enable_all)(int added);
138 void (*enable)(struct perf_event *); 200 void (*enable)(struct perf_event *);
139 void (*disable)(struct perf_event *); 201 void (*disable)(struct perf_event *);
202 int (*hw_config)(struct perf_event *event);
203 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
140 unsigned eventsel; 204 unsigned eventsel;
141 unsigned perfctr; 205 unsigned perfctr;
142 u64 (*event_map)(int); 206 u64 (*event_map)(int);
143 u64 (*raw_event)(u64);
144 int max_events; 207 int max_events;
145 int num_events; 208 int num_counters;
146 int num_events_fixed; 209 int num_counters_fixed;
147 int event_bits; 210 int cntval_bits;
148 u64 event_mask; 211 u64 cntval_mask;
149 int apic; 212 int apic;
150 u64 max_period; 213 u64 max_period;
151 u64 intel_ctrl;
152 void (*enable_bts)(u64 config);
153 void (*disable_bts)(void);
154
155 struct event_constraint * 214 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc, 215 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event); 216 struct perf_event *event);
@@ -159,11 +218,32 @@ struct x86_pmu {
159 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 218 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
160 struct perf_event *event); 219 struct perf_event *event);
161 struct event_constraint *event_constraints; 220 struct event_constraint *event_constraints;
221 void (*quirks)(void);
162 222
163 int (*cpu_prepare)(int cpu); 223 int (*cpu_prepare)(int cpu);
164 void (*cpu_starting)(int cpu); 224 void (*cpu_starting)(int cpu);
165 void (*cpu_dying)(int cpu); 225 void (*cpu_dying)(int cpu);
166 void (*cpu_dead)(int cpu); 226 void (*cpu_dead)(int cpu);
227
228 /*
229 * Intel Arch Perfmon v2+
230 */
231 u64 intel_ctrl;
232 union perf_capabilities intel_cap;
233
234 /*
235 * Intel DebugStore bits
236 */
237 int bts, pebs;
238 int pebs_record_size;
239 void (*drain_pebs)(struct pt_regs *regs);
240 struct event_constraint *pebs_constraints;
241
242 /*
243 * Intel LBR
244 */
245 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
246 int lbr_nr; /* hardware stack size */
167}; 247};
168 248
169static struct x86_pmu x86_pmu __read_mostly; 249static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +278,7 @@ static u64
198x86_perf_event_update(struct perf_event *event) 278x86_perf_event_update(struct perf_event *event)
199{ 279{
200 struct hw_perf_event *hwc = &event->hw; 280 struct hw_perf_event *hwc = &event->hw;
201 int shift = 64 - x86_pmu.event_bits; 281 int shift = 64 - x86_pmu.cntval_bits;
202 u64 prev_raw_count, new_raw_count; 282 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx; 283 int idx = hwc->idx;
204 s64 delta; 284 s64 delta;
@@ -241,33 +321,32 @@ again:
241static atomic_t active_events; 321static atomic_t active_events;
242static DEFINE_MUTEX(pmc_reserve_mutex); 322static DEFINE_MUTEX(pmc_reserve_mutex);
243 323
324#ifdef CONFIG_X86_LOCAL_APIC
325
244static bool reserve_pmc_hardware(void) 326static bool reserve_pmc_hardware(void)
245{ 327{
246#ifdef CONFIG_X86_LOCAL_APIC
247 int i; 328 int i;
248 329
249 if (nmi_watchdog == NMI_LOCAL_APIC) 330 if (nmi_watchdog == NMI_LOCAL_APIC)
250 disable_lapic_nmi_watchdog(); 331 disable_lapic_nmi_watchdog();
251 332
252 for (i = 0; i < x86_pmu.num_events; i++) { 333 for (i = 0; i < x86_pmu.num_counters; i++) {
253 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
254 goto perfctr_fail; 335 goto perfctr_fail;
255 } 336 }
256 337
257 for (i = 0; i < x86_pmu.num_events; i++) { 338 for (i = 0; i < x86_pmu.num_counters; i++) {
258 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 339 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
259 goto eventsel_fail; 340 goto eventsel_fail;
260 } 341 }
261#endif
262 342
263 return true; 343 return true;
264 344
265#ifdef CONFIG_X86_LOCAL_APIC
266eventsel_fail: 345eventsel_fail:
267 for (i--; i >= 0; i--) 346 for (i--; i >= 0; i--)
268 release_evntsel_nmi(x86_pmu.eventsel + i); 347 release_evntsel_nmi(x86_pmu.eventsel + i);
269 348
270 i = x86_pmu.num_events; 349 i = x86_pmu.num_counters;
271 350
272perfctr_fail: 351perfctr_fail:
273 for (i--; i >= 0; i--) 352 for (i--; i >= 0; i--)
@@ -277,128 +356,36 @@ perfctr_fail:
277 enable_lapic_nmi_watchdog(); 356 enable_lapic_nmi_watchdog();
278 357
279 return false; 358 return false;
280#endif
281} 359}
282 360
283static void release_pmc_hardware(void) 361static void release_pmc_hardware(void)
284{ 362{
285#ifdef CONFIG_X86_LOCAL_APIC
286 int i; 363 int i;
287 364
288 for (i = 0; i < x86_pmu.num_events; i++) { 365 for (i = 0; i < x86_pmu.num_counters; i++) {
289 release_perfctr_nmi(x86_pmu.perfctr + i); 366 release_perfctr_nmi(x86_pmu.perfctr + i);
290 release_evntsel_nmi(x86_pmu.eventsel + i); 367 release_evntsel_nmi(x86_pmu.eventsel + i);
291 } 368 }
292 369
293 if (nmi_watchdog == NMI_LOCAL_APIC) 370 if (nmi_watchdog == NMI_LOCAL_APIC)
294 enable_lapic_nmi_watchdog(); 371 enable_lapic_nmi_watchdog();
295#endif
296}
297
298static inline bool bts_available(void)
299{
300 return x86_pmu.enable_bts != NULL;
301}
302
303static void init_debug_store_on_cpu(int cpu)
304{
305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
306
307 if (!ds)
308 return;
309
310 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
311 (u32)((u64)(unsigned long)ds),
312 (u32)((u64)(unsigned long)ds >> 32));
313}
314
315static void fini_debug_store_on_cpu(int cpu)
316{
317 if (!per_cpu(cpu_hw_events, cpu).ds)
318 return;
319
320 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
321}
322
323static void release_bts_hardware(void)
324{
325 int cpu;
326
327 if (!bts_available())
328 return;
329
330 get_online_cpus();
331
332 for_each_online_cpu(cpu)
333 fini_debug_store_on_cpu(cpu);
334
335 for_each_possible_cpu(cpu) {
336 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
337
338 if (!ds)
339 continue;
340
341 per_cpu(cpu_hw_events, cpu).ds = NULL;
342
343 kfree((void *)(unsigned long)ds->bts_buffer_base);
344 kfree(ds);
345 }
346
347 put_online_cpus();
348} 372}
349 373
350static int reserve_bts_hardware(void) 374#else
351{
352 int cpu, err = 0;
353
354 if (!bts_available())
355 return 0;
356
357 get_online_cpus();
358
359 for_each_possible_cpu(cpu) {
360 struct debug_store *ds;
361 void *buffer;
362
363 err = -ENOMEM;
364 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
365 if (unlikely(!buffer))
366 break;
367
368 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
369 if (unlikely(!ds)) {
370 kfree(buffer);
371 break;
372 }
373
374 ds->bts_buffer_base = (u64)(unsigned long)buffer;
375 ds->bts_index = ds->bts_buffer_base;
376 ds->bts_absolute_maximum =
377 ds->bts_buffer_base + BTS_BUFFER_SIZE;
378 ds->bts_interrupt_threshold =
379 ds->bts_absolute_maximum - BTS_OVFL_TH;
380
381 per_cpu(cpu_hw_events, cpu).ds = ds;
382 err = 0;
383 }
384 375
385 if (err) 376static bool reserve_pmc_hardware(void) { return true; }
386 release_bts_hardware(); 377static void release_pmc_hardware(void) {}
387 else {
388 for_each_online_cpu(cpu)
389 init_debug_store_on_cpu(cpu);
390 }
391 378
392 put_online_cpus(); 379#endif
393 380
394 return err; 381static int reserve_ds_buffers(void);
395} 382static void release_ds_buffers(void);
396 383
397static void hw_perf_event_destroy(struct perf_event *event) 384static void hw_perf_event_destroy(struct perf_event *event)
398{ 385{
399 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 386 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
400 release_pmc_hardware(); 387 release_pmc_hardware();
401 release_bts_hardware(); 388 release_ds_buffers();
402 mutex_unlock(&pmc_reserve_mutex); 389 mutex_unlock(&pmc_reserve_mutex);
403 } 390 }
404} 391}
@@ -441,54 +428,11 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
441 return 0; 428 return 0;
442} 429}
443 430
444/* 431static int x86_setup_perfctr(struct perf_event *event)
445 * Setup the hardware configuration for a given attr_type
446 */
447static int __hw_perf_event_init(struct perf_event *event)
448{ 432{
449 struct perf_event_attr *attr = &event->attr; 433 struct perf_event_attr *attr = &event->attr;
450 struct hw_perf_event *hwc = &event->hw; 434 struct hw_perf_event *hwc = &event->hw;
451 u64 config; 435 u64 config;
452 int err;
453
454 if (!x86_pmu_initialized())
455 return -ENODEV;
456
457 err = 0;
458 if (!atomic_inc_not_zero(&active_events)) {
459 mutex_lock(&pmc_reserve_mutex);
460 if (atomic_read(&active_events) == 0) {
461 if (!reserve_pmc_hardware())
462 err = -EBUSY;
463 else
464 err = reserve_bts_hardware();
465 }
466 if (!err)
467 atomic_inc(&active_events);
468 mutex_unlock(&pmc_reserve_mutex);
469 }
470 if (err)
471 return err;
472
473 event->destroy = hw_perf_event_destroy;
474
475 /*
476 * Generate PMC IRQs:
477 * (keep 'enabled' bit clear for now)
478 */
479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
480
481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
484
485 /*
486 * Count user and OS events unless requested not to.
487 */
488 if (!attr->exclude_user)
489 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
490 if (!attr->exclude_kernel)
491 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
492 436
493 if (!hwc->sample_period) { 437 if (!hwc->sample_period) {
494 hwc->sample_period = x86_pmu.max_period; 438 hwc->sample_period = x86_pmu.max_period;
@@ -505,16 +449,8 @@ static int __hw_perf_event_init(struct perf_event *event)
505 return -EOPNOTSUPP; 449 return -EOPNOTSUPP;
506 } 450 }
507 451
508 /* 452 if (attr->type == PERF_TYPE_RAW)
509 * Raw hw_event type provide the config in the hw_event structure
510 */
511 if (attr->type == PERF_TYPE_RAW) {
512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
516 return 0; 453 return 0;
517 }
518 454
519 if (attr->type == PERF_TYPE_HW_CACHE) 455 if (attr->type == PERF_TYPE_HW_CACHE)
520 return set_ext_hw_attr(hwc, attr); 456 return set_ext_hw_attr(hwc, attr);
@@ -539,11 +475,11 @@ static int __hw_perf_event_init(struct perf_event *event)
539 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 475 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
540 (hwc->sample_period == 1)) { 476 (hwc->sample_period == 1)) {
541 /* BTS is not supported by this architecture. */ 477 /* BTS is not supported by this architecture. */
542 if (!bts_available()) 478 if (!x86_pmu.bts)
543 return -EOPNOTSUPP; 479 return -EOPNOTSUPP;
544 480
545 /* BTS is currently only allowed for user-mode. */ 481 /* BTS is currently only allowed for user-mode. */
546 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 482 if (!attr->exclude_kernel)
547 return -EOPNOTSUPP; 483 return -EOPNOTSUPP;
548 } 484 }
549 485
@@ -552,12 +488,87 @@ static int __hw_perf_event_init(struct perf_event *event)
552 return 0; 488 return 0;
553} 489}
554 490
491static int x86_pmu_hw_config(struct perf_event *event)
492{
493 if (event->attr.precise_ip) {
494 int precise = 0;
495
496 /* Support for constant skid */
497 if (x86_pmu.pebs)
498 precise++;
499
500 /* Support for IP fixup */
501 if (x86_pmu.lbr_nr)
502 precise++;
503
504 if (event->attr.precise_ip > precise)
505 return -EOPNOTSUPP;
506 }
507
508 /*
509 * Generate PMC IRQs:
510 * (keep 'enabled' bit clear for now)
511 */
512 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
513
514 /*
515 * Count user and OS events unless requested not to
516 */
517 if (!event->attr.exclude_user)
518 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
519 if (!event->attr.exclude_kernel)
520 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
521
522 if (event->attr.type == PERF_TYPE_RAW)
523 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
524
525 return x86_setup_perfctr(event);
526}
527
528/*
529 * Setup the hardware configuration for a given attr_type
530 */
531static int __hw_perf_event_init(struct perf_event *event)
532{
533 int err;
534
535 if (!x86_pmu_initialized())
536 return -ENODEV;
537
538 err = 0;
539 if (!atomic_inc_not_zero(&active_events)) {
540 mutex_lock(&pmc_reserve_mutex);
541 if (atomic_read(&active_events) == 0) {
542 if (!reserve_pmc_hardware())
543 err = -EBUSY;
544 else {
545 err = reserve_ds_buffers();
546 if (err)
547 release_pmc_hardware();
548 }
549 }
550 if (!err)
551 atomic_inc(&active_events);
552 mutex_unlock(&pmc_reserve_mutex);
553 }
554 if (err)
555 return err;
556
557 event->destroy = hw_perf_event_destroy;
558
559 event->hw.idx = -1;
560 event->hw.last_cpu = -1;
561 event->hw.last_tag = ~0ULL;
562
563 return x86_pmu.hw_config(event);
564}
565
555static void x86_pmu_disable_all(void) 566static void x86_pmu_disable_all(void)
556{ 567{
557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 568 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
558 int idx; 569 int idx;
559 570
560 for (idx = 0; idx < x86_pmu.num_events; idx++) { 571 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
561 u64 val; 572 u64 val;
562 573
563 if (!test_bit(idx, cpuc->active_mask)) 574 if (!test_bit(idx, cpuc->active_mask))
@@ -587,12 +598,12 @@ void hw_perf_disable(void)
587 x86_pmu.disable_all(); 598 x86_pmu.disable_all();
588} 599}
589 600
590static void x86_pmu_enable_all(void) 601static void x86_pmu_enable_all(int added)
591{ 602{
592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 603 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
593 int idx; 604 int idx;
594 605
595 for (idx = 0; idx < x86_pmu.num_events; idx++) { 606 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
596 struct perf_event *event = cpuc->events[idx]; 607 struct perf_event *event = cpuc->events[idx];
597 u64 val; 608 u64 val;
598 609
@@ -667,14 +678,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
667 * assign events to counters starting with most 678 * assign events to counters starting with most
668 * constrained events. 679 * constrained events.
669 */ 680 */
670 wmax = x86_pmu.num_events; 681 wmax = x86_pmu.num_counters;
671 682
672 /* 683 /*
673 * when fixed event counters are present, 684 * when fixed event counters are present,
674 * wmax is incremented by 1 to account 685 * wmax is incremented by 1 to account
675 * for one more choice 686 * for one more choice
676 */ 687 */
677 if (x86_pmu.num_events_fixed) 688 if (x86_pmu.num_counters_fixed)
678 wmax++; 689 wmax++;
679 690
680 for (w = 1, num = n; num && w <= wmax; w++) { 691 for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +735,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
724 struct perf_event *event; 735 struct perf_event *event;
725 int n, max_count; 736 int n, max_count;
726 737
727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; 738 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
728 739
729 /* current number of events already accepted */ 740 /* current number of events already accepted */
730 n = cpuc->n_events; 741 n = cpuc->n_events;
@@ -795,7 +806,7 @@ void hw_perf_enable(void)
795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 806 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
796 struct perf_event *event; 807 struct perf_event *event;
797 struct hw_perf_event *hwc; 808 struct hw_perf_event *hwc;
798 int i; 809 int i, added = cpuc->n_added;
799 810
800 if (!x86_pmu_initialized()) 811 if (!x86_pmu_initialized())
801 return; 812 return;
@@ -847,19 +858,20 @@ void hw_perf_enable(void)
847 cpuc->enabled = 1; 858 cpuc->enabled = 1;
848 barrier(); 859 barrier();
849 860
850 x86_pmu.enable_all(); 861 x86_pmu.enable_all(added);
851} 862}
852 863
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) 864static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
865 u64 enable_mask)
854{ 866{
855 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 867 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857} 868}
858 869
859static inline void x86_pmu_disable_event(struct perf_event *event) 870static inline void x86_pmu_disable_event(struct perf_event *event)
860{ 871{
861 struct hw_perf_event *hwc = &event->hw; 872 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); 873
874 wrmsrl(hwc->config_base + hwc->idx, hwc->config);
863} 875}
864 876
865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 877static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -874,7 +886,7 @@ x86_perf_event_set_period(struct perf_event *event)
874 struct hw_perf_event *hwc = &event->hw; 886 struct hw_perf_event *hwc = &event->hw;
875 s64 left = atomic64_read(&hwc->period_left); 887 s64 left = atomic64_read(&hwc->period_left);
876 s64 period = hwc->sample_period; 888 s64 period = hwc->sample_period;
877 int err, ret = 0, idx = hwc->idx; 889 int ret = 0, idx = hwc->idx;
878 890
879 if (idx == X86_PMC_IDX_FIXED_BTS) 891 if (idx == X86_PMC_IDX_FIXED_BTS)
880 return 0; 892 return 0;
@@ -912,8 +924,8 @@ x86_perf_event_set_period(struct perf_event *event)
912 */ 924 */
913 atomic64_set(&hwc->prev_count, (u64)-left); 925 atomic64_set(&hwc->prev_count, (u64)-left);
914 926
915 err = checking_wrmsrl(hwc->event_base + idx, 927 wrmsrl(hwc->event_base + idx,
916 (u64)(-left) & x86_pmu.event_mask); 928 (u64)(-left) & x86_pmu.cntval_mask);
917 929
918 perf_event_update_userpage(event); 930 perf_event_update_userpage(event);
919 931
@@ -924,7 +936,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
924{ 936{
925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 937 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
926 if (cpuc->enabled) 938 if (cpuc->enabled)
927 __x86_pmu_enable_event(&event->hw); 939 __x86_pmu_enable_event(&event->hw,
940 ARCH_PERFMON_EVENTSEL_ENABLE);
928} 941}
929 942
930/* 943/*
@@ -950,7 +963,15 @@ static int x86_pmu_enable(struct perf_event *event)
950 if (n < 0) 963 if (n < 0)
951 return n; 964 return n;
952 965
953 ret = x86_schedule_events(cpuc, n, assign); 966 /*
967 * If group events scheduling transaction was started,
968 * skip the schedulability test here, it will be peformed
969 * at commit time(->commit_txn) as a whole
970 */
971 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
972 goto out;
973
974 ret = x86_pmu.schedule_events(cpuc, n, assign);
954 if (ret) 975 if (ret)
955 return ret; 976 return ret;
956 /* 977 /*
@@ -959,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
959 */ 980 */
960 memcpy(cpuc->assign, assign, n*sizeof(int)); 981 memcpy(cpuc->assign, assign, n*sizeof(int));
961 982
983out:
962 cpuc->n_events = n; 984 cpuc->n_events = n;
963 cpuc->n_added += n - n0; 985 cpuc->n_added += n - n0;
964 986
@@ -991,11 +1013,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
991void perf_event_print_debug(void) 1013void perf_event_print_debug(void)
992{ 1014{
993 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1015 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1016 u64 pebs;
994 struct cpu_hw_events *cpuc; 1017 struct cpu_hw_events *cpuc;
995 unsigned long flags; 1018 unsigned long flags;
996 int cpu, idx; 1019 int cpu, idx;
997 1020
998 if (!x86_pmu.num_events) 1021 if (!x86_pmu.num_counters)
999 return; 1022 return;
1000 1023
1001 local_irq_save(flags); 1024 local_irq_save(flags);
@@ -1008,16 +1031,18 @@ void perf_event_print_debug(void)
1008 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1031 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1009 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1032 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1010 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1033 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1034 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1011 1035
1012 pr_info("\n"); 1036 pr_info("\n");
1013 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1037 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1014 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1038 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1039 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1040 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1041 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1017 } 1042 }
1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1043 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1019 1044
1020 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1045 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1046 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1022 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1047 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1023 1048
@@ -1030,7 +1055,7 @@ void perf_event_print_debug(void)
1030 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1055 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1031 cpu, idx, prev_left); 1056 cpu, idx, prev_left);
1032 } 1057 }
1033 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 1058 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1034 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1059 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1035 1060
1036 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1061 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1095,7 +1120,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1095 1120
1096 cpuc = &__get_cpu_var(cpu_hw_events); 1121 cpuc = &__get_cpu_var(cpu_hw_events);
1097 1122
1098 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1123 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1099 if (!test_bit(idx, cpuc->active_mask)) 1124 if (!test_bit(idx, cpuc->active_mask))
1100 continue; 1125 continue;
1101 1126
@@ -1103,7 +1128,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1103 hwc = &event->hw; 1128 hwc = &event->hw;
1104 1129
1105 val = x86_perf_event_update(event); 1130 val = x86_perf_event_update(event);
1106 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1131 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1107 continue; 1132 continue;
1108 1133
1109 /* 1134 /*
@@ -1146,7 +1171,6 @@ void set_perf_event_pending(void)
1146 1171
1147void perf_events_lapic_init(void) 1172void perf_events_lapic_init(void)
1148{ 1173{
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 if (!x86_pmu.apic || !x86_pmu_initialized()) 1174 if (!x86_pmu.apic || !x86_pmu_initialized())
1151 return; 1175 return;
1152 1176
@@ -1154,7 +1178,6 @@ void perf_events_lapic_init(void)
1154 * Always use NMI for PMU 1178 * Always use NMI for PMU
1155 */ 1179 */
1156 apic_write(APIC_LVTPC, APIC_DM_NMI); 1180 apic_write(APIC_LVTPC, APIC_DM_NMI);
1157#endif
1158} 1181}
1159 1182
1160static int __kprobes 1183static int __kprobes
@@ -1178,9 +1201,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1178 1201
1179 regs = args->regs; 1202 regs = args->regs;
1180 1203
1181#ifdef CONFIG_X86_LOCAL_APIC
1182 apic_write(APIC_LVTPC, APIC_DM_NMI); 1204 apic_write(APIC_LVTPC, APIC_DM_NMI);
1183#endif
1184 /* 1205 /*
1185 * Can't rely on the handled return value to say it was our NMI, two 1206 * Can't rely on the handled return value to say it was our NMI, two
1186 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1207 * events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1217,118 +1238,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1217 return &unconstrained; 1238 return &unconstrained;
1218} 1239}
1219 1240
1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
1222{
1223 int ret = 0;
1224
1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
1228
1229 if (!is_x86_event(event))
1230 ret = event->pmu->enable(event);
1231
1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
1239}
1240
1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
1243{
1244 event->state = PERF_EVENT_STATE_INACTIVE;
1245 event->oncpu = -1;
1246
1247 if (!is_x86_event(event))
1248 event->pmu->disable(event);
1249
1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
1251
1252 if (!is_software_event(event))
1253 cpuctx->active_oncpu--;
1254
1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1256 cpuctx->exclusive = 0;
1257}
1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * called with PMU disabled. If successful and return value 1,
1266 * then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
1299 /*
1300 * copy new assignment, now we know it is possible
1301 * will be used by hw_perf_enable()
1302 */
1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
1308
1309 /*
1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
1313 * this way we* ensure caller won't try to enable
1314 * individual events
1315 */
1316 return 1;
1317undo:
1318 x86_event_sched_out(leader, cpuctx);
1319 n0 = 1;
1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1322 x86_event_sched_out(sub, cpuctx);
1323 if (++n0 == n1)
1324 break;
1325 }
1326 }
1327 return ret;
1328}
1329
1330#include "perf_event_amd.c" 1241#include "perf_event_amd.c"
1331#include "perf_event_p6.c" 1242#include "perf_event_p6.c"
1243#include "perf_event_p4.c"
1244#include "perf_event_intel_lbr.c"
1245#include "perf_event_intel_ds.c"
1332#include "perf_event_intel.c" 1246#include "perf_event_intel.c"
1333 1247
1334static int __cpuinit 1248static int __cpuinit
@@ -1402,48 +1316,50 @@ void __init init_hw_perf_events(void)
1402 1316
1403 pr_cont("%s PMU driver.\n", x86_pmu.name); 1317 pr_cont("%s PMU driver.\n", x86_pmu.name);
1404 1318
1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1319 if (x86_pmu.quirks)
1320 x86_pmu.quirks();
1321
1322 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1406 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1323 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1407 x86_pmu.num_events, X86_PMC_MAX_GENERIC); 1324 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1408 x86_pmu.num_events = X86_PMC_MAX_GENERIC; 1325 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1409 } 1326 }
1410 perf_event_mask = (1 << x86_pmu.num_events) - 1; 1327 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1411 perf_max_events = x86_pmu.num_events; 1328 perf_max_events = x86_pmu.num_counters;
1412 1329
1413 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { 1330 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1414 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1331 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1415 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); 1332 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1416 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; 1333 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1417 } 1334 }
1418 1335
1419 perf_event_mask |= 1336 x86_pmu.intel_ctrl |=
1420 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; 1337 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1421 x86_pmu.intel_ctrl = perf_event_mask;
1422 1338
1423 perf_events_lapic_init(); 1339 perf_events_lapic_init();
1424 register_die_notifier(&perf_event_nmi_notifier); 1340 register_die_notifier(&perf_event_nmi_notifier);
1425 1341
1426 unconstrained = (struct event_constraint) 1342 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 1343 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1428 0, x86_pmu.num_events); 1344 0, x86_pmu.num_counters);
1429 1345
1430 if (x86_pmu.event_constraints) { 1346 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) { 1347 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK) 1348 if (c->cmask != X86_RAW_EVENT_MASK)
1433 continue; 1349 continue;
1434 1350
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; 1351 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1436 c->weight += x86_pmu.num_events; 1352 c->weight += x86_pmu.num_counters;
1437 } 1353 }
1438 } 1354 }
1439 1355
1440 pr_info("... version: %d\n", x86_pmu.version); 1356 pr_info("... version: %d\n", x86_pmu.version);
1441 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1357 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1442 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1358 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1443 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); 1359 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1360 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1361 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1446 pr_info("... event mask: %016Lx\n", perf_event_mask); 1362 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1447 1363
1448 perf_cpu_notifier(x86_pmu_notifier); 1364 perf_cpu_notifier(x86_pmu_notifier);
1449} 1365}
@@ -1453,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
1453 x86_perf_event_update(event); 1369 x86_perf_event_update(event);
1454} 1370}
1455 1371
1372/*
1373 * Start group events scheduling transaction
1374 * Set the flag to make pmu::enable() not perform the
1375 * schedulability test, it will be performed at commit time
1376 */
1377static void x86_pmu_start_txn(const struct pmu *pmu)
1378{
1379 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1380
1381 cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
1382}
1383
1384/*
1385 * Stop group events scheduling transaction
1386 * Clear the flag and pmu::enable() will perform the
1387 * schedulability test.
1388 */
1389static void x86_pmu_cancel_txn(const struct pmu *pmu)
1390{
1391 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1392
1393 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
1394}
1395
1396/*
1397 * Commit group events scheduling transaction
1398 * Perform the group schedulability test as a whole
1399 * Return 0 if success
1400 */
1401static int x86_pmu_commit_txn(const struct pmu *pmu)
1402{
1403 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1404 int assign[X86_PMC_IDX_MAX];
1405 int n, ret;
1406
1407 n = cpuc->n_events;
1408
1409 if (!x86_pmu_initialized())
1410 return -EAGAIN;
1411
1412 ret = x86_pmu.schedule_events(cpuc, n, assign);
1413 if (ret)
1414 return ret;
1415
1416 /*
1417 * copy new assignment, now we know it is possible
1418 * will be used by hw_perf_enable()
1419 */
1420 memcpy(cpuc->assign, assign, n*sizeof(int));
1421
1422 return 0;
1423}
1424
1456static const struct pmu pmu = { 1425static const struct pmu pmu = {
1457 .enable = x86_pmu_enable, 1426 .enable = x86_pmu_enable,
1458 .disable = x86_pmu_disable, 1427 .disable = x86_pmu_disable,
@@ -1460,9 +1429,38 @@ static const struct pmu pmu = {
1460 .stop = x86_pmu_stop, 1429 .stop = x86_pmu_stop,
1461 .read = x86_pmu_read, 1430 .read = x86_pmu_read,
1462 .unthrottle = x86_pmu_unthrottle, 1431 .unthrottle = x86_pmu_unthrottle,
1432 .start_txn = x86_pmu_start_txn,
1433 .cancel_txn = x86_pmu_cancel_txn,
1434 .commit_txn = x86_pmu_commit_txn,
1463}; 1435};
1464 1436
1465/* 1437/*
1438 * validate that we can schedule this event
1439 */
1440static int validate_event(struct perf_event *event)
1441{
1442 struct cpu_hw_events *fake_cpuc;
1443 struct event_constraint *c;
1444 int ret = 0;
1445
1446 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1447 if (!fake_cpuc)
1448 return -ENOMEM;
1449
1450 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1451
1452 if (!c || !c->weight)
1453 ret = -ENOSPC;
1454
1455 if (x86_pmu.put_event_constraints)
1456 x86_pmu.put_event_constraints(fake_cpuc, event);
1457
1458 kfree(fake_cpuc);
1459
1460 return ret;
1461}
1462
1463/*
1466 * validate a single event group 1464 * validate a single event group
1467 * 1465 *
1468 * validation include: 1466 * validation include:
@@ -1502,7 +1500,7 @@ static int validate_group(struct perf_event *event)
1502 1500
1503 fake_cpuc->n_events = n; 1501 fake_cpuc->n_events = n;
1504 1502
1505 ret = x86_schedule_events(fake_cpuc, n, NULL); 1503 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1506 1504
1507out_free: 1505out_free:
1508 kfree(fake_cpuc); 1506 kfree(fake_cpuc);
@@ -1527,6 +1525,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1527 1525
1528 if (event->group_leader != event) 1526 if (event->group_leader != event)
1529 err = validate_group(event); 1527 err = validate_group(event);
1528 else
1529 err = validate_event(event);
1530 1530
1531 event->pmu = tmp; 1531 event->pmu = tmp;
1532 } 1532 }
@@ -1574,8 +1574,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1574{ 1574{
1575 struct perf_callchain_entry *entry = data; 1575 struct perf_callchain_entry *entry = data;
1576 1576
1577 if (reliable) 1577 callchain_store(entry, addr);
1578 callchain_store(entry, addr);
1579} 1578}
1580 1579
1581static const struct stacktrace_ops backtrace_ops = { 1580static const struct stacktrace_ops backtrace_ops = {
@@ -1597,41 +1596,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1596 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1598} 1597}
1599 1598
1600/*
1601 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1602 */
1603static unsigned long
1604copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1605{
1606 unsigned long offset, addr = (unsigned long)from;
1607 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1608 unsigned long size, len = 0;
1609 struct page *page;
1610 void *map;
1611 int ret;
1612
1613 do {
1614 ret = __get_user_pages_fast(addr, 1, 0, &page);
1615 if (!ret)
1616 break;
1617
1618 offset = addr & (PAGE_SIZE - 1);
1619 size = min(PAGE_SIZE - offset, n - len);
1620
1621 map = kmap_atomic(page, type);
1622 memcpy(to, map+offset, size);
1623 kunmap_atomic(map, type);
1624 put_page(page);
1625
1626 len += size;
1627 to += size;
1628 addr += size;
1629
1630 } while (len < n);
1631
1632 return len;
1633}
1634
1635#ifdef CONFIG_COMPAT 1599#ifdef CONFIG_COMPAT
1636static inline int 1600static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) 1601perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1691,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1727{ 1691{
1728 struct perf_callchain_entry *entry; 1692 struct perf_callchain_entry *entry;
1729 1693
1694 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1695 /* TODO: We don't support guest os callchain now */
1696 return NULL;
1697 }
1698
1730 if (in_nmi()) 1699 if (in_nmi())
1731 entry = &__get_cpu_var(pmc_nmi_entry); 1700 entry = &__get_cpu_var(pmc_nmi_entry);
1732 else 1701 else
@@ -1748,5 +1717,43 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
1748 */ 1717 */
1749 regs->bp = rewind_frame_pointer(skip + 1); 1718 regs->bp = rewind_frame_pointer(skip + 1);
1750 regs->cs = __KERNEL_CS; 1719 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags); 1720 /*
1721 * We abuse bit 3 to pass exact information, see perf_misc_flags
1722 * and the comment with PERF_EFLAGS_EXACT.
1723 */
1724 regs->flags = 0;
1725}
1726
1727unsigned long perf_instruction_pointer(struct pt_regs *regs)
1728{
1729 unsigned long ip;
1730
1731 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1732 ip = perf_guest_cbs->get_guest_ip();
1733 else
1734 ip = instruction_pointer(regs);
1735
1736 return ip;
1737}
1738
1739unsigned long perf_misc_flags(struct pt_regs *regs)
1740{
1741 int misc = 0;
1742
1743 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1744 if (perf_guest_cbs->is_user_mode())
1745 misc |= PERF_RECORD_MISC_GUEST_USER;
1746 else
1747 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1748 } else {
1749 if (user_mode(regs))
1750 misc |= PERF_RECORD_MISC_USER;
1751 else
1752 misc |= PERF_RECORD_MISC_KERNEL;
1753 }
1754
1755 if (regs->flags & PERF_EFLAGS_EXACT)
1756 misc |= PERF_RECORD_MISC_EXACT_IP;
1757
1758 return misc;
1752} 1759}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e..611df11ba15 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock); 3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4 4
5static __initconst u64 amd_hw_cache_event_ids 5static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
111 return amd_perfmon_event_map[hw_event]; 111 return amd_perfmon_event_map[hw_event];
112} 112}
113 113
114static u64 amd_pmu_raw_event(u64 hw_event) 114static int amd_pmu_hw_config(struct perf_event *event)
115{ 115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL 116 int ret = x86_pmu_hw_config(event);
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 117
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 118 if (ret)
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL 119 return ret;
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL 120
121 121 if (event->attr.type != PERF_TYPE_RAW)
122#define K7_EVNTSEL_MASK \ 122 return 0;
123 (K7_EVNTSEL_EVENT_MASK | \ 123
124 K7_EVNTSEL_UNIT_MASK | \ 124 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
125 K7_EVNTSEL_EDGE_MASK | \ 125
126 K7_EVNTSEL_INV_MASK | \ 126 return 0;
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130} 127}
131 128
132/* 129/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
165 * be removed on one CPU at a time AND PMU is disabled 162 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here 163 * when we come here
167 */ 164 */
168 for (i = 0; i < x86_pmu.num_events; i++) { 165 for (i = 0; i < x86_pmu.num_counters; i++) {
169 if (nb->owners[i] == event) { 166 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL); 167 cmpxchg(nb->owners+i, event, NULL);
171 break; 168 break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
215 struct hw_perf_event *hwc = &event->hw; 212 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb; 213 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL; 214 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events; 215 int max = x86_pmu.num_counters;
219 int i, j, k = -1; 216 int i, j, k = -1;
220 217
221 /* 218 /*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
293 /* 290 /*
294 * initialize all possible NB constraints 291 * initialize all possible NB constraints
295 */ 292 */
296 for (i = 0; i < x86_pmu.num_events; i++) { 293 for (i = 0; i < x86_pmu.num_counters; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk); 294 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1; 295 nb->event_constraints[i].weight = 1;
299 } 296 }
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
371 raw_spin_unlock(&amd_nb_lock); 368 raw_spin_unlock(&amd_nb_lock);
372} 369}
373 370
374static __initconst struct x86_pmu amd_pmu = { 371static __initconst const struct x86_pmu amd_pmu = {
375 .name = "AMD", 372 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq, 373 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all, 374 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all, 375 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event, 376 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event, 377 .disable = x86_pmu_disable_event,
378 .hw_config = amd_pmu_hw_config,
379 .schedule_events = x86_schedule_events,
381 .eventsel = MSR_K7_EVNTSEL0, 380 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0, 381 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map, 382 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 383 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4, 384 .num_counters = 4,
387 .event_bits = 48, 385 .cntval_bits = 48,
388 .event_mask = (1ULL << 48) - 1, 386 .cntval_mask = (1ULL << 48) - 1,
389 .apic = 1, 387 .apic = 1,
390 /* use highest bit to detect overflow */ 388 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1, 389 .max_period = (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac8783..fdbc652d3fe 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
88 return intel_perfmon_event_map[hw_event]; 88 return intel_perfmon_event_map[hw_event];
89} 89}
90 90
91static __initconst u64 westmere_hw_cache_event_ids 91static __initconst const u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX] 92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX] 93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
179 }, 179 },
180}; 180};
181 181
182static __initconst u64 nehalem_hw_cache_event_ids 182static __initconst const u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX] 183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX] 184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
270 }, 270 },
271}; 271};
272 272
273static __initconst u64 core2_hw_cache_event_ids 273static __initconst const u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX] 274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX] 275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
361 }, 361 },
362}; 362};
363 363
364static __initconst u64 atom_hw_cache_event_ids 364static __initconst const u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX] 365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX] 366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
452 }, 452 },
453}; 453};
454 454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void) 455static void intel_pmu_disable_all(void)
510{ 456{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 457 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +460,17 @@ static void intel_pmu_disable_all(void)
514 460
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 461 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts(); 462 intel_pmu_disable_bts();
463
464 intel_pmu_pebs_disable_all();
465 intel_pmu_lbr_disable_all();
517} 466}
518 467
519static void intel_pmu_enable_all(void) 468static void intel_pmu_enable_all(int added)
520{ 469{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 470 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522 471
472 intel_pmu_pebs_enable_all();
473 intel_pmu_lbr_enable_all();
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 474 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524 475
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 476 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +484,42 @@ static void intel_pmu_enable_all(void)
533 } 484 }
534} 485}
535 486
487/*
488 * Workaround for:
489 * Intel Errata AAK100 (model 26)
490 * Intel Errata AAP53 (model 30)
491 * Intel Errata BD53 (model 44)
492 *
493 * These chips need to be 'reset' when adding counters by programming
494 * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
495 * either in sequence on the same PMC or on different PMCs.
496 */
497static void intel_pmu_nhm_enable_all(int added)
498{
499 if (added) {
500 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
501 int i;
502
503 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
504 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
505 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
506
507 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
508 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
509
510 for (i = 0; i < 3; i++) {
511 struct perf_event *event = cpuc->events[i];
512
513 if (!event)
514 continue;
515
516 __x86_pmu_enable_event(&event->hw,
517 ARCH_PERFMON_EVENTSEL_ENABLE);
518 }
519 }
520 intel_pmu_enable_all(added);
521}
522
536static inline u64 intel_pmu_get_status(void) 523static inline u64 intel_pmu_get_status(void)
537{ 524{
538 u64 status; 525 u64 status;
@@ -547,8 +534,7 @@ static inline void intel_pmu_ack_status(u64 ack)
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 534 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548} 535}
549 536
550static inline void 537static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{ 538{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED; 539 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask; 540 u64 ctrl_val, mask;
@@ -557,71 +543,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
557 543
558 rdmsrl(hwc->config_base, ctrl_val); 544 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask; 545 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 546 wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621} 547}
622 548
623static inline void 549static void intel_pmu_disable_event(struct perf_event *event)
624intel_pmu_disable_event(struct perf_event *event)
625{ 550{
626 struct hw_perf_event *hwc = &event->hw; 551 struct hw_perf_event *hwc = &event->hw;
627 552
@@ -637,14 +562,15 @@ intel_pmu_disable_event(struct perf_event *event)
637 } 562 }
638 563
639 x86_pmu_disable_event(event); 564 x86_pmu_disable_event(event);
565
566 if (unlikely(event->attr.precise_ip))
567 intel_pmu_pebs_disable(event);
640} 568}
641 569
642static inline void 570static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{ 571{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED; 572 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask; 573 u64 ctrl_val, bits, mask;
647 int err;
648 574
649 /* 575 /*
650 * Enable IRQ generation (0x8), 576 * Enable IRQ generation (0x8),
@@ -669,7 +595,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
669 rdmsrl(hwc->config_base, ctrl_val); 595 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask; 596 ctrl_val &= ~mask;
671 ctrl_val |= bits; 597 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val); 598 wrmsrl(hwc->config_base, ctrl_val);
673} 599}
674 600
675static void intel_pmu_enable_event(struct perf_event *event) 601static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +615,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
689 return; 615 return;
690 } 616 }
691 617
692 __x86_pmu_enable_event(hwc); 618 if (unlikely(event->attr.precise_ip))
619 intel_pmu_pebs_enable(event);
620
621 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
693} 622}
694 623
695/* 624/*
@@ -708,20 +637,20 @@ static void intel_pmu_reset(void)
708 unsigned long flags; 637 unsigned long flags;
709 int idx; 638 int idx;
710 639
711 if (!x86_pmu.num_events) 640 if (!x86_pmu.num_counters)
712 return; 641 return;
713 642
714 local_irq_save(flags); 643 local_irq_save(flags);
715 644
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 645 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717 646
718 for (idx = 0; idx < x86_pmu.num_events; idx++) { 647 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 648 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 649 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 } 650 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 651 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 652 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 } 653
725 if (ds) 654 if (ds)
726 ds->bts_index = ds->bts_buffer_base; 655 ds->bts_index = ds->bts_buffer_base;
727 656
@@ -747,7 +676,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
747 intel_pmu_drain_bts_buffer(); 676 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status(); 677 status = intel_pmu_get_status();
749 if (!status) { 678 if (!status) {
750 intel_pmu_enable_all(); 679 intel_pmu_enable_all(0);
751 return 0; 680 return 0;
752 } 681 }
753 682
@@ -762,6 +691,15 @@ again:
762 691
763 inc_irq_stat(apic_perf_irqs); 692 inc_irq_stat(apic_perf_irqs);
764 ack = status; 693 ack = status;
694
695 intel_pmu_lbr_read();
696
697 /*
698 * PEBS overflow sets bit 62 in the global status register
699 */
700 if (__test_and_clear_bit(62, (unsigned long *)&status))
701 x86_pmu.drain_pebs(regs);
702
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 703 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit]; 704 struct perf_event *event = cpuc->events[bit];
767 705
@@ -787,26 +725,22 @@ again:
787 goto again; 725 goto again;
788 726
789done: 727done:
790 intel_pmu_enable_all(); 728 intel_pmu_enable_all(0);
791 return 1; 729 return 1;
792} 730}
793 731
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint * 732static struct event_constraint *
798intel_special_constraints(struct perf_event *event) 733intel_bts_constraints(struct perf_event *event)
799{ 734{
800 unsigned int hw_event; 735 struct hw_perf_event *hwc = &event->hw;
801 736 unsigned int hw_event, bts_event;
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803 737
804 if (unlikely((hw_event == 738 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 739 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
806 (event->hw.sample_period == 1))) {
807 740
741 if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
808 return &bts_constraint; 742 return &bts_constraint;
809 } 743
810 return NULL; 744 return NULL;
811} 745}
812 746
@@ -815,24 +749,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
815{ 749{
816 struct event_constraint *c; 750 struct event_constraint *c;
817 751
818 c = intel_special_constraints(event); 752 c = intel_bts_constraints(event);
753 if (c)
754 return c;
755
756 c = intel_pebs_constraints(event);
819 if (c) 757 if (c)
820 return c; 758 return c;
821 759
822 return x86_get_event_constraints(cpuc, event); 760 return x86_get_event_constraints(cpuc, event);
823} 761}
824 762
825static __initconst struct x86_pmu core_pmu = { 763static int intel_pmu_hw_config(struct perf_event *event)
764{
765 int ret = x86_pmu_hw_config(event);
766
767 if (ret)
768 return ret;
769
770 if (event->attr.type != PERF_TYPE_RAW)
771 return 0;
772
773 if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
774 return 0;
775
776 if (x86_pmu.version < 3)
777 return -EINVAL;
778
779 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
780 return -EACCES;
781
782 event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
783
784 return 0;
785}
786
787static __initconst const struct x86_pmu core_pmu = {
826 .name = "core", 788 .name = "core",
827 .handle_irq = x86_pmu_handle_irq, 789 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all, 790 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all, 791 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event, 792 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event, 793 .disable = x86_pmu_disable_event,
794 .hw_config = x86_pmu_hw_config,
795 .schedule_events = x86_schedule_events,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 796 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 797 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map, 798 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 799 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1, 800 .apic = 1,
838 /* 801 /*
@@ -845,17 +808,32 @@ static __initconst struct x86_pmu core_pmu = {
845 .event_constraints = intel_core_event_constraints, 808 .event_constraints = intel_core_event_constraints,
846}; 809};
847 810
848static __initconst struct x86_pmu intel_pmu = { 811static void intel_pmu_cpu_starting(int cpu)
812{
813 init_debug_store_on_cpu(cpu);
814 /*
815 * Deal with CPUs that don't clear their LBRs on power-up.
816 */
817 intel_pmu_lbr_reset();
818}
819
820static void intel_pmu_cpu_dying(int cpu)
821{
822 fini_debug_store_on_cpu(cpu);
823}
824
825static __initconst const struct x86_pmu intel_pmu = {
849 .name = "Intel", 826 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq, 827 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all, 828 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all, 829 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event, 830 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event, 831 .disable = intel_pmu_disable_event,
832 .hw_config = intel_pmu_hw_config,
833 .schedule_events = x86_schedule_events,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 834 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 835 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map, 836 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 837 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1, 838 .apic = 1,
861 /* 839 /*
@@ -864,14 +842,38 @@ static __initconst struct x86_pmu intel_pmu = {
864 * the generic event period: 842 * the generic event period:
865 */ 843 */
866 .max_period = (1ULL << 31) - 1, 844 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints, 845 .get_event_constraints = intel_get_event_constraints,
870 846
871 .cpu_starting = init_debug_store_on_cpu, 847 .cpu_starting = intel_pmu_cpu_starting,
872 .cpu_dying = fini_debug_store_on_cpu, 848 .cpu_dying = intel_pmu_cpu_dying,
873}; 849};
874 850
851static void intel_clovertown_quirks(void)
852{
853 /*
854 * PEBS is unreliable due to:
855 *
856 * AJ67 - PEBS may experience CPL leaks
857 * AJ68 - PEBS PMI may be delayed by one event
858 * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
859 * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
860 *
861 * AJ67 could be worked around by restricting the OS/USR flags.
862 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
863 *
864 * AJ106 could possibly be worked around by not allowing LBR
865 * usage from PEBS, including the fixup.
866 * AJ68 could possibly be worked around by always programming
867 * a pebs_event_reset[0] value and coping with the lost events.
868 *
869 * But taken together it might just make sense to not enable PEBS on
870 * these chips.
871 */
872 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
873 x86_pmu.pebs = 0;
874 x86_pmu.pebs_constraints = NULL;
875}
876
875static __init int intel_pmu_init(void) 877static __init int intel_pmu_init(void)
876{ 878{
877 union cpuid10_edx edx; 879 union cpuid10_edx edx;
@@ -881,12 +883,13 @@ static __init int intel_pmu_init(void)
881 int version; 883 int version;
882 884
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 885 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */ 886 switch (boot_cpu_data.x86) {
885 if (boot_cpu_data.x86 == 6) { 887 case 0x6:
886 return p6_pmu_init(); 888 return p6_pmu_init();
887 } else { 889 case 0xf:
890 return p4_pmu_init();
891 }
888 return -ENODEV; 892 return -ENODEV;
889 }
890 } 893 }
891 894
892 /* 895 /*
@@ -904,16 +907,28 @@ static __init int intel_pmu_init(void)
904 x86_pmu = intel_pmu; 907 x86_pmu = intel_pmu;
905 908
906 x86_pmu.version = version; 909 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events; 910 x86_pmu.num_counters = eax.split.num_counters;
908 x86_pmu.event_bits = eax.split.bit_width; 911 x86_pmu.cntval_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; 912 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
910 913
911 /* 914 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so 915 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events: 916 * assume at least 3 events:
914 */ 917 */
915 if (version > 1) 918 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 919 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
920
921 /*
922 * v2 and above have a perf capabilities MSR
923 */
924 if (version > 1) {
925 u64 capabilities;
926
927 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
928 x86_pmu.intel_cap.capabilities = capabilities;
929 }
930
931 intel_ds_init();
917 932
918 /* 933 /*
919 * Install the hw-cache-events table: 934 * Install the hw-cache-events table:
@@ -924,12 +939,15 @@ static __init int intel_pmu_init(void)
924 break; 939 break;
925 940
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 941 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
942 x86_pmu.quirks = intel_clovertown_quirks;
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 943 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 944 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */ 945 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 946 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids)); 947 sizeof(hw_cache_event_ids));
932 948
949 intel_pmu_lbr_init_core();
950
933 x86_pmu.event_constraints = intel_core2_event_constraints; 951 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, "); 952 pr_cont("Core2 events, ");
935 break; 953 break;
@@ -940,13 +958,19 @@ static __init int intel_pmu_init(void)
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 958 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids)); 959 sizeof(hw_cache_event_ids));
942 960
961 intel_pmu_lbr_init_nhm();
962
943 x86_pmu.event_constraints = intel_nehalem_event_constraints; 963 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, "); 964 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
965 pr_cont("Nehalem events, ");
945 break; 966 break;
967
946 case 28: /* Atom */ 968 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 969 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids)); 970 sizeof(hw_cache_event_ids));
949 971
972 intel_pmu_lbr_init_atom();
973
950 x86_pmu.event_constraints = intel_gen_event_constraints; 974 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, "); 975 pr_cont("Atom events, ");
952 break; 976 break;
@@ -956,7 +980,10 @@ static __init int intel_pmu_init(void)
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 980 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids)); 981 sizeof(hw_cache_event_ids));
958 982
983 intel_pmu_lbr_init_nhm();
984
959 x86_pmu.event_constraints = intel_westmere_event_constraints; 985 x86_pmu.event_constraints = intel_westmere_event_constraints;
986 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
960 pr_cont("Westmere events, "); 987 pr_cont("Westmere events, ");
961 break; 988 break;
962 989
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 00000000000..18018d1311c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/* The maximal number of PEBS events: */
4#define MAX_PEBS_EVENTS 4
5
6/* The size of a BTS record in bytes: */
7#define BTS_RECORD_SIZE 24
8
9#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
10#define PEBS_BUFFER_SIZE PAGE_SIZE
11
12/*
13 * pebs_record_32 for p4 and core not supported
14
15struct pebs_record_32 {
16 u32 flags, ip;
17 u32 ax, bc, cx, dx;
18 u32 si, di, bp, sp;
19};
20
21 */
22
23struct pebs_record_core {
24 u64 flags, ip;
25 u64 ax, bx, cx, dx;
26 u64 si, di, bp, sp;
27 u64 r8, r9, r10, r11;
28 u64 r12, r13, r14, r15;
29};
30
31struct pebs_record_nhm {
32 u64 flags, ip;
33 u64 ax, bx, cx, dx;
34 u64 si, di, bp, sp;
35 u64 r8, r9, r10, r11;
36 u64 r12, r13, r14, r15;
37 u64 status, dla, dse, lat;
38};
39
40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
58{
59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
60
61 if (!ds)
62 return;
63
64 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
65 (u32)((u64)(unsigned long)ds),
66 (u32)((u64)(unsigned long)ds >> 32));
67}
68
69static void fini_debug_store_on_cpu(int cpu)
70{
71 if (!per_cpu(cpu_hw_events, cpu).ds)
72 return;
73
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75}
76
77static void release_ds_buffers(void)
78{
79 int cpu;
80
81 if (!x86_pmu.bts && !x86_pmu.pebs)
82 return;
83
84 get_online_cpus();
85
86 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu);
88
89 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
91
92 if (!ds)
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 }
101
102 put_online_cpus();
103}
104
105static int reserve_ds_buffers(void)
106{
107 int cpu, err = 0;
108
109 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0;
111
112 get_online_cpus();
113
114 for_each_possible_cpu(cpu) {
115 struct debug_store *ds;
116 void *buffer;
117 int max, thresh;
118
119 err = -ENOMEM;
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds;
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140
141 if (x86_pmu.pebs) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
143 if (unlikely(!buffer))
144 break;
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158
159 err = 0;
160 }
161
162 if (err)
163 release_ds_buffers();
164 else {
165 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu);
167 }
168
169 put_online_cpus();
170
171 return err;
172}
173
174/*
175 * BTS
176 */
177
178static struct event_constraint bts_constraint =
179 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
180
181static void intel_pmu_enable_bts(u64 config)
182{
183 unsigned long debugctlmsr;
184
185 debugctlmsr = get_debugctlmsr();
186
187 debugctlmsr |= DEBUGCTLMSR_TR;
188 debugctlmsr |= DEBUGCTLMSR_BTS;
189 debugctlmsr |= DEBUGCTLMSR_BTINT;
190
191 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
192 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
193
194 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
195 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
196
197 update_debugctlmsr(debugctlmsr);
198}
199
200static void intel_pmu_disable_bts(void)
201{
202 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
203 unsigned long debugctlmsr;
204
205 if (!cpuc->ds)
206 return;
207
208 debugctlmsr = get_debugctlmsr();
209
210 debugctlmsr &=
211 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
212 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
213
214 update_debugctlmsr(debugctlmsr);
215}
216
217static void intel_pmu_drain_bts_buffer(void)
218{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds;
221 struct bts_record {
222 u64 from;
223 u64 to;
224 u64 flags;
225 };
226 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
227 struct bts_record *at, *top;
228 struct perf_output_handle handle;
229 struct perf_event_header header;
230 struct perf_sample_data data;
231 struct pt_regs regs;
232
233 if (!event)
234 return;
235
236 if (!ds)
237 return;
238
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index;
241
242 if (top <= at)
243 return;
244
245 ds->bts_index = ds->bts_buffer_base;
246
247 perf_sample_data_init(&data, 0);
248 data.period = event->hw.last_period;
249 regs.ip = 0;
250
251 /*
252 * Prepare a generic sample, i.e. fill in the invariant fields.
253 * We will overwrite the from and to address before we output
254 * the sample.
255 */
256 perf_prepare_sample(&header, &data, event, &regs);
257
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return;
260
261 for (; at < top; at++) {
262 data.ip = at->from;
263 data.addr = at->to;
264
265 perf_output_sample(&handle, &header, &data, event);
266 }
267
268 perf_output_end(&handle);
269
270 /* There's new data available. */
271 event->hw.interrupts++;
272 event->pending_kill = POLL_IN;
273}
274
275/*
276 * PEBS
277 */
278
279static struct event_constraint intel_core_pebs_events[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
283 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
289 EVENT_CONSTRAINT_END
290};
291
292static struct event_constraint intel_nehalem_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
296 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
302 EVENT_CONSTRAINT_END
303};
304
305static struct event_constraint *
306intel_pebs_constraints(struct perf_event *event)
307{
308 struct event_constraint *c;
309
310 if (!event->attr.precise_ip)
311 return NULL;
312
313 if (x86_pmu.pebs_constraints) {
314 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
315 if ((event->hw.config & c->cmask) == c->code)
316 return c;
317 }
318 }
319
320 return &emptyconstraint;
321}
322
323static void intel_pmu_pebs_enable(struct perf_event *event)
324{
325 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
326 struct hw_perf_event *hwc = &event->hw;
327
328 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
329
330 cpuc->pebs_enabled |= 1ULL << hwc->idx;
331 WARN_ON_ONCE(cpuc->enabled);
332
333 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
334 intel_pmu_lbr_enable(event);
335}
336
337static void intel_pmu_pebs_disable(struct perf_event *event)
338{
339 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
340 struct hw_perf_event *hwc = &event->hw;
341
342 cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
343 if (cpuc->enabled)
344 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
345
346 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
347
348 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
349 intel_pmu_lbr_disable(event);
350}
351
352static void intel_pmu_pebs_enable_all(void)
353{
354 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
355
356 if (cpuc->pebs_enabled)
357 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
358}
359
360static void intel_pmu_pebs_disable_all(void)
361{
362 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
363
364 if (cpuc->pebs_enabled)
365 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
366}
367
368#include <asm/insn.h>
369
370static inline bool kernel_ip(unsigned long ip)
371{
372#ifdef CONFIG_X86_32
373 return ip > PAGE_OFFSET;
374#else
375 return (long)ip < 0;
376#endif
377}
378
379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
380{
381 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
382 unsigned long from = cpuc->lbr_entries[0].from;
383 unsigned long old_to, to = cpuc->lbr_entries[0].to;
384 unsigned long ip = regs->ip;
385
386 /*
387 * We don't need to fixup if the PEBS assist is fault like
388 */
389 if (!x86_pmu.intel_cap.pebs_trap)
390 return 1;
391
392 /*
393 * No LBR entry, no basic block, no rewinding
394 */
395 if (!cpuc->lbr_stack.nr || !from || !to)
396 return 0;
397
398 /*
399 * Basic blocks should never cross user/kernel boundaries
400 */
401 if (kernel_ip(ip) != kernel_ip(to))
402 return 0;
403
404 /*
405 * unsigned math, either ip is before the start (impossible) or
406 * the basic block is larger than 1 page (sanity)
407 */
408 if ((ip - to) > PAGE_SIZE)
409 return 0;
410
411 /*
412 * We sampled a branch insn, rewind using the LBR stack
413 */
414 if (ip == to) {
415 regs->ip = from;
416 return 1;
417 }
418
419 do {
420 struct insn insn;
421 u8 buf[MAX_INSN_SIZE];
422 void *kaddr;
423
424 old_to = to;
425 if (!kernel_ip(ip)) {
426 int bytes, size = MAX_INSN_SIZE;
427
428 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
429 if (bytes != size)
430 return 0;
431
432 kaddr = buf;
433 } else
434 kaddr = (void *)to;
435
436 kernel_insn_init(&insn, kaddr);
437 insn_get_length(&insn);
438 to += insn.length;
439 } while (to < ip);
440
441 if (to == ip) {
442 regs->ip = old_to;
443 return 1;
444 }
445
446 /*
447 * Even though we decoded the basic block, the instruction stream
448 * never matched the given IP, either the TO or the IP got corrupted.
449 */
450 return 0;
451}
452
453static int intel_pmu_save_and_restart(struct perf_event *event);
454
455static void __intel_pmu_pebs_event(struct perf_event *event,
456 struct pt_regs *iregs, void *__pebs)
457{
458 /*
459 * We cast to pebs_record_core since that is a subset of
460 * both formats and we don't use the other fields in this
461 * routine.
462 */
463 struct pebs_record_core *pebs = __pebs;
464 struct perf_sample_data data;
465 struct pt_regs regs;
466
467 if (!intel_pmu_save_and_restart(event))
468 return;
469
470 perf_sample_data_init(&data, 0);
471 data.period = event->hw.last_period;
472
473 /*
474 * We use the interrupt regs as a base because the PEBS record
475 * does not contain a full regs set, specifically it seems to
476 * lack segment descriptors, which get used by things like
477 * user_mode().
478 *
479 * In the simple case fix up only the IP and BP,SP regs, for
480 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
481 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
482 */
483 regs = *iregs;
484 regs.ip = pebs->ip;
485 regs.bp = pebs->bp;
486 regs.sp = pebs->sp;
487
488 if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
489 regs.flags |= PERF_EFLAGS_EXACT;
490 else
491 regs.flags &= ~PERF_EFLAGS_EXACT;
492
493 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event);
495}
496
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
498{
499 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
500 struct debug_store *ds = cpuc->ds;
501 struct perf_event *event = cpuc->events[0]; /* PMC0 only */
502 struct pebs_record_core *at, *top;
503 int n;
504
505 if (!ds || !x86_pmu.pebs)
506 return;
507
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
509 top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
510
511 /*
512 * Whatever else happens, drain the thing
513 */
514 ds->pebs_index = ds->pebs_buffer_base;
515
516 if (!test_bit(0, cpuc->active_mask))
517 return;
518
519 WARN_ON_ONCE(!event);
520
521 if (!event->attr.precise_ip)
522 return;
523
524 n = top - at;
525 if (n <= 0)
526 return;
527
528 /*
529 * Should not happen, we program the threshold at 1 and do not
530 * set a reset value.
531 */
532 WARN_ON_ONCE(n > 1);
533 at += n - 1;
534
535 __intel_pmu_pebs_event(event, iregs, at);
536}
537
538static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
539{
540 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
541 struct debug_store *ds = cpuc->ds;
542 struct pebs_record_nhm *at, *top;
543 struct perf_event *event = NULL;
544 u64 status = 0;
545 int bit, n;
546
547 if (!ds || !x86_pmu.pebs)
548 return;
549
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
551 top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
552
553 ds->pebs_index = ds->pebs_buffer_base;
554
555 n = top - at;
556 if (n <= 0)
557 return;
558
559 /*
560 * Should not happen, we program the threshold at 1 and do not
561 * set a reset value.
562 */
563 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
564
565 for ( ; at < top; at++) {
566 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
567 event = cpuc->events[bit];
568 if (!test_bit(bit, cpuc->active_mask))
569 continue;
570
571 WARN_ON_ONCE(!event);
572
573 if (!event->attr.precise_ip)
574 continue;
575
576 if (__test_and_set_bit(bit, (unsigned long *)&status))
577 continue;
578
579 break;
580 }
581
582 if (!event || bit >= MAX_PEBS_EVENTS)
583 continue;
584
585 __intel_pmu_pebs_event(event, iregs, at);
586 }
587}
588
589/*
590 * BTS, PEBS probe and setup
591 */
592
593static void intel_ds_init(void)
594{
595 /*
596 * No support for 32bit formats
597 */
598 if (!boot_cpu_has(X86_FEATURE_DTES64))
599 return;
600
601 x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
602 x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
603 if (x86_pmu.pebs) {
604 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
605 int format = x86_pmu.intel_cap.pebs_format;
606
607 switch (format) {
608 case 0:
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break;
614
615 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break;
621
622 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0;
625 break;
626 }
627 }
628}
629
630#else /* CONFIG_CPU_SUP_INTEL */
631
632static int reserve_ds_buffers(void)
633{
634 return 0;
635}
636
637static void release_ds_buffers(void)
638{
639}
640
641#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 00000000000..d202c1bece1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3enum {
4 LBR_FORMAT_32 = 0x00,
5 LBR_FORMAT_LIP = 0x01,
6 LBR_FORMAT_EIP = 0x02,
7 LBR_FORMAT_EIP_FLAGS = 0x03,
8};
9
10/*
11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
12 * otherwise it becomes near impossible to get a reliable stack.
13 */
14
15static void __intel_pmu_lbr_enable(void)
16{
17 u64 debugctl;
18
19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
21 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
22}
23
24static void __intel_pmu_lbr_disable(void)
25{
26 u64 debugctl;
27
28 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
29 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
30 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
31}
32
33static void intel_pmu_lbr_reset_32(void)
34{
35 int i;
36
37 for (i = 0; i < x86_pmu.lbr_nr; i++)
38 wrmsrl(x86_pmu.lbr_from + i, 0);
39}
40
41static void intel_pmu_lbr_reset_64(void)
42{
43 int i;
44
45 for (i = 0; i < x86_pmu.lbr_nr; i++) {
46 wrmsrl(x86_pmu.lbr_from + i, 0);
47 wrmsrl(x86_pmu.lbr_to + i, 0);
48 }
49}
50
51static void intel_pmu_lbr_reset(void)
52{
53 if (!x86_pmu.lbr_nr)
54 return;
55
56 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
57 intel_pmu_lbr_reset_32();
58 else
59 intel_pmu_lbr_reset_64();
60}
61
62static void intel_pmu_lbr_enable(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65
66 if (!x86_pmu.lbr_nr)
67 return;
68
69 WARN_ON_ONCE(cpuc->enabled);
70
71 /*
72 * Reset the LBR stack if we changed task context to
73 * avoid data leaks.
74 */
75
76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
77 intel_pmu_lbr_reset();
78 cpuc->lbr_context = event->ctx;
79 }
80
81 cpuc->lbr_users++;
82}
83
84static void intel_pmu_lbr_disable(struct perf_event *event)
85{
86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
87
88 if (!x86_pmu.lbr_nr)
89 return;
90
91 cpuc->lbr_users--;
92 WARN_ON_ONCE(cpuc->lbr_users < 0);
93
94 if (cpuc->enabled && !cpuc->lbr_users)
95 __intel_pmu_lbr_disable();
96}
97
98static void intel_pmu_lbr_enable_all(void)
99{
100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
101
102 if (cpuc->lbr_users)
103 __intel_pmu_lbr_enable();
104}
105
106static void intel_pmu_lbr_disable_all(void)
107{
108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
109
110 if (cpuc->lbr_users)
111 __intel_pmu_lbr_disable();
112}
113
114static inline u64 intel_pmu_lbr_tos(void)
115{
116 u64 tos;
117
118 rdmsrl(x86_pmu.lbr_tos, tos);
119
120 return tos;
121}
122
123static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
124{
125 unsigned long mask = x86_pmu.lbr_nr - 1;
126 u64 tos = intel_pmu_lbr_tos();
127 int i;
128
129 for (i = 0; i < x86_pmu.lbr_nr; i++) {
130 unsigned long lbr_idx = (tos - i) & mask;
131 union {
132 struct {
133 u32 from;
134 u32 to;
135 };
136 u64 lbr;
137 } msr_lastbranch;
138
139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
140
141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
143 cpuc->lbr_entries[i].flags = 0;
144 }
145 cpuc->lbr_stack.nr = i;
146}
147
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
150/*
151 * Due to lack of segmentation in Linux the effective address (offset)
152 * is the same as the linear address, allowing us to merge the LIP and EIP
153 * LBR formats.
154 */
155static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
156{
157 unsigned long mask = x86_pmu.lbr_nr - 1;
158 int lbr_format = x86_pmu.intel_cap.lbr_format;
159 u64 tos = intel_pmu_lbr_tos();
160 int i;
161
162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
163 unsigned long lbr_idx = (tos - i) & mask;
164 u64 from, to, flags = 0;
165
166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
168
169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
171 from = (u64)((((s64)from) << 1) >> 1);
172 }
173
174 cpuc->lbr_entries[i].from = from;
175 cpuc->lbr_entries[i].to = to;
176 cpuc->lbr_entries[i].flags = flags;
177 }
178 cpuc->lbr_stack.nr = i;
179}
180
181static void intel_pmu_lbr_read(void)
182{
183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
184
185 if (!cpuc->lbr_users)
186 return;
187
188 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
189 intel_pmu_lbr_read_32(cpuc);
190 else
191 intel_pmu_lbr_read_64(cpuc);
192}
193
194static void intel_pmu_lbr_init_core(void)
195{
196 x86_pmu.lbr_nr = 4;
197 x86_pmu.lbr_tos = 0x01c9;
198 x86_pmu.lbr_from = 0x40;
199 x86_pmu.lbr_to = 0x60;
200}
201
202static void intel_pmu_lbr_init_nhm(void)
203{
204 x86_pmu.lbr_nr = 16;
205 x86_pmu.lbr_tos = 0x01c9;
206 x86_pmu.lbr_from = 0x680;
207 x86_pmu.lbr_to = 0x6c0;
208}
209
210static void intel_pmu_lbr_init_atom(void)
211{
212 x86_pmu.lbr_nr = 8;
213 x86_pmu.lbr_tos = 0x01c9;
214 x86_pmu.lbr_from = 0x40;
215 x86_pmu.lbr_to = 0x60;
216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 00000000000..ae85d69644d
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,858 @@
1/*
2 * Netburst Perfomance Events (P4, old Xeon)
3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#ifdef CONFIG_CPU_SUP_INTEL
11
12#include <asm/perf_event_p4.h>
13
14#define P4_CNTR_LIMIT 3
15/*
16 * array indices: 0,1 - HT threads, used with HT enabled cpu
17 */
18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
22};
23
24struct p4_cache_event_bind {
25 unsigned int metric_pebs;
26 unsigned int metric_vert;
27};
28
29#define P4_GEN_CACHE_EVENT_BIND(name) \
30 [P4_CACHE__##name] = { \
31 .metric_pebs = P4_PEBS__##name, \
32 .metric_vert = P4_VERT__##name, \
33 }
34
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
40};
41
42/*
43 * Note that we don't use CCCR1 here, there is an
44 * exception for P4_BSQ_ALLOCATION but we just have
45 * no workaround
46 *
47 * consider this binding as resources which particular
48 * event may borrow, it doesn't contain EventMask,
49 * Tags and friends -- they are left to a caller
50 */
51static struct p4_event_bind p4_event_bind_map[] = {
52 [P4_EVENT_TC_DELIVER_MODE] = {
53 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
54 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
55 .cntr = { {4, 5, -1}, {6, 7, -1} },
56 },
57 [P4_EVENT_BPU_FETCH_REQUEST] = {
58 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
59 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
60 .cntr = { {0, -1, -1}, {2, -1, -1} },
61 },
62 [P4_EVENT_ITLB_REFERENCE] = {
63 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
64 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
65 .cntr = { {0, -1, -1}, {2, -1, -1} },
66 },
67 [P4_EVENT_MEMORY_CANCEL] = {
68 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
69 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
70 .cntr = { {8, 9, -1}, {10, 11, -1} },
71 },
72 [P4_EVENT_MEMORY_COMPLETE] = {
73 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
74 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
75 .cntr = { {8, 9, -1}, {10, 11, -1} },
76 },
77 [P4_EVENT_LOAD_PORT_REPLAY] = {
78 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
79 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
80 .cntr = { {8, 9, -1}, {10, 11, -1} },
81 },
82 [P4_EVENT_STORE_PORT_REPLAY] = {
83 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
84 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
85 .cntr = { {8, 9, -1}, {10, 11, -1} },
86 },
87 [P4_EVENT_MOB_LOAD_REPLAY] = {
88 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
89 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
90 .cntr = { {0, -1, -1}, {2, -1, -1} },
91 },
92 [P4_EVENT_PAGE_WALK_TYPE] = {
93 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
94 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
95 .cntr = { {0, -1, -1}, {2, -1, -1} },
96 },
97 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
98 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
99 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
100 .cntr = { {0, -1, -1}, {2, -1, -1} },
101 },
102 [P4_EVENT_IOQ_ALLOCATION] = {
103 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
104 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
105 .cntr = { {0, -1, -1}, {2, -1, -1} },
106 },
107 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
108 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
109 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
110 .cntr = { {2, -1, -1}, {3, -1, -1} },
111 },
112 [P4_EVENT_FSB_DATA_ACTIVITY] = {
113 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
114 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
115 .cntr = { {0, -1, -1}, {2, -1, -1} },
116 },
117 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
118 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
119 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
120 .cntr = { {0, -1, -1}, {1, -1, -1} },
121 },
122 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
123 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
124 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
125 .cntr = { {2, -1, -1}, {3, -1, -1} },
126 },
127 [P4_EVENT_SSE_INPUT_ASSIST] = {
128 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
129 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
130 .cntr = { {8, 9, -1}, {10, 11, -1} },
131 },
132 [P4_EVENT_PACKED_SP_UOP] = {
133 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
134 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
135 .cntr = { {8, 9, -1}, {10, 11, -1} },
136 },
137 [P4_EVENT_PACKED_DP_UOP] = {
138 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
139 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
140 .cntr = { {8, 9, -1}, {10, 11, -1} },
141 },
142 [P4_EVENT_SCALAR_SP_UOP] = {
143 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
144 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
145 .cntr = { {8, 9, -1}, {10, 11, -1} },
146 },
147 [P4_EVENT_SCALAR_DP_UOP] = {
148 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
149 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
150 .cntr = { {8, 9, -1}, {10, 11, -1} },
151 },
152 [P4_EVENT_64BIT_MMX_UOP] = {
153 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
154 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
155 .cntr = { {8, 9, -1}, {10, 11, -1} },
156 },
157 [P4_EVENT_128BIT_MMX_UOP] = {
158 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
159 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
160 .cntr = { {8, 9, -1}, {10, 11, -1} },
161 },
162 [P4_EVENT_X87_FP_UOP] = {
163 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
164 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
165 .cntr = { {8, 9, -1}, {10, 11, -1} },
166 },
167 [P4_EVENT_TC_MISC] = {
168 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
169 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
170 .cntr = { {4, 5, -1}, {6, 7, -1} },
171 },
172 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
173 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
174 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
175 .cntr = { {0, -1, -1}, {2, -1, -1} },
176 },
177 [P4_EVENT_TC_MS_XFER] = {
178 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
179 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
180 .cntr = { {4, 5, -1}, {6, 7, -1} },
181 },
182 [P4_EVENT_UOP_QUEUE_WRITES] = {
183 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
184 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
185 .cntr = { {4, 5, -1}, {6, 7, -1} },
186 },
187 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
188 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
189 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
190 .cntr = { {4, 5, -1}, {6, 7, -1} },
191 },
192 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
193 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
194 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
195 .cntr = { {4, 5, -1}, {6, 7, -1} },
196 },
197 [P4_EVENT_RESOURCE_STALL] = {
198 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
199 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
200 .cntr = { {12, 13, 16}, {14, 15, 17} },
201 },
202 [P4_EVENT_WC_BUFFER] = {
203 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
204 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
205 .cntr = { {8, 9, -1}, {10, 11, -1} },
206 },
207 [P4_EVENT_B2B_CYCLES] = {
208 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
209 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
210 .cntr = { {0, -1, -1}, {2, -1, -1} },
211 },
212 [P4_EVENT_BNR] = {
213 .opcode = P4_OPCODE(P4_EVENT_BNR),
214 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
215 .cntr = { {0, -1, -1}, {2, -1, -1} },
216 },
217 [P4_EVENT_SNOOP] = {
218 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
219 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
220 .cntr = { {0, -1, -1}, {2, -1, -1} },
221 },
222 [P4_EVENT_RESPONSE] = {
223 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
224 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
225 .cntr = { {0, -1, -1}, {2, -1, -1} },
226 },
227 [P4_EVENT_FRONT_END_EVENT] = {
228 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
229 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
230 .cntr = { {12, 13, 16}, {14, 15, 17} },
231 },
232 [P4_EVENT_EXECUTION_EVENT] = {
233 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
234 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
235 .cntr = { {12, 13, 16}, {14, 15, 17} },
236 },
237 [P4_EVENT_REPLAY_EVENT] = {
238 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
239 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
240 .cntr = { {12, 13, 16}, {14, 15, 17} },
241 },
242 [P4_EVENT_INSTR_RETIRED] = {
243 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
244 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
245 .cntr = { {12, 13, 16}, {14, 15, 17} },
246 },
247 [P4_EVENT_UOPS_RETIRED] = {
248 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
249 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
250 .cntr = { {12, 13, 16}, {14, 15, 17} },
251 },
252 [P4_EVENT_UOP_TYPE] = {
253 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
254 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
255 .cntr = { {12, 13, 16}, {14, 15, 17} },
256 },
257 [P4_EVENT_BRANCH_RETIRED] = {
258 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
259 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
260 .cntr = { {12, 13, 16}, {14, 15, 17} },
261 },
262 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
263 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
264 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
265 .cntr = { {12, 13, 16}, {14, 15, 17} },
266 },
267 [P4_EVENT_X87_ASSIST] = {
268 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
269 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
270 .cntr = { {12, 13, 16}, {14, 15, 17} },
271 },
272 [P4_EVENT_MACHINE_CLEAR] = {
273 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
274 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
275 .cntr = { {12, 13, 16}, {14, 15, 17} },
276 },
277 [P4_EVENT_INSTR_COMPLETED] = {
278 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
279 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
280 .cntr = { {12, 13, 16}, {14, 15, 17} },
281 },
282};
283
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289
290static __initconst const u64 p4_hw_cache_event_ids
291 [PERF_COUNT_HW_CACHE_MAX]
292 [PERF_COUNT_HW_CACHE_OP_MAX]
293 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
294{
295 [ C(L1D ) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired),
300 },
301 },
302 [ C(LL ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired),
307 },
308},
309 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired),
314 },
315 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired),
319 },
320 },
321 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss),
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
340 /* non-halted CPU clocks */
341 [PERF_COUNT_HW_CPU_CYCLES] =
342 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
343 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
344
345 /*
346 * retired instructions
347 * in a sake of simplicity we don't use the FSB tagging
348 */
349 [PERF_COUNT_HW_INSTRUCTIONS] =
350 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
351 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
352 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
353
354 /* cache hits */
355 [PERF_COUNT_HW_CACHE_REFERENCES] =
356 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
359 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
360 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
361 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
362 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
363
364 /* cache misses */
365 [PERF_COUNT_HW_CACHE_MISSES] =
366 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
367 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
368 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
369 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
370
371 /* branch instructions retired */
372 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
373 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
374 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
375 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
376 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
377 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
378
379 /* mispredicted branches retired */
380 [PERF_COUNT_HW_BRANCH_MISSES] =
381 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
382 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
383
384 /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
385 [PERF_COUNT_HW_BUS_CYCLES] =
386 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
387 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
388 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
389 p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
390};
391
392static struct p4_event_bind *p4_config_get_bind(u64 config)
393{
394 unsigned int evnt = p4_config_unpack_event(config);
395 struct p4_event_bind *bind = NULL;
396
397 if (evnt < ARRAY_SIZE(p4_event_bind_map))
398 bind = &p4_event_bind_map[evnt];
399
400 return bind;
401}
402
403static u64 p4_pmu_event_map(int hw_event)
404{
405 struct p4_event_bind *bind;
406 unsigned int esel;
407 u64 config;
408
409 config = p4_general_events[hw_event];
410 bind = p4_config_get_bind(config);
411 esel = P4_OPCODE_ESEL(bind->opcode);
412 config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
413
414 return config;
415}
416
417static int p4_hw_config(struct perf_event *event)
418{
419 int cpu = get_cpu();
420 int rc = 0;
421 unsigned int evnt;
422 u32 escr, cccr;
423
424 /*
425 * the reason we use cpu that early is that: if we get scheduled
426 * first time on the same cpu -- we will not need swap thread
427 * specific flags in config (and will save some cpu cycles)
428 */
429
430 cccr = p4_default_cccr_conf(cpu);
431 escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
432 event->attr.exclude_user);
433 event->hw.config = p4_config_pack_escr(escr) |
434 p4_config_pack_cccr(cccr);
435
436 if (p4_ht_active() && p4_ht_thread(cpu))
437 event->hw.config = p4_set_ht_bit(event->hw.config);
438
439 if (event->attr.type == PERF_TYPE_RAW) {
440
441 /* user data may have out-of-bound event index */
442 evnt = p4_config_unpack_event(event->attr.config);
443 if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
444 rc = -EINVAL;
445 goto out;
446 }
447
448 /*
449 * We don't control raw events so it's up to the caller
450 * to pass sane values (and we don't count the thread number
451 * on HT machine but allow HT-compatible specifics to be
452 * passed on)
453 *
454 * XXX: HT wide things should check perf_paranoid_cpu() &&
455 * CAP_SYS_ADMIN
456 */
457 event->hw.config |= event->attr.config &
458 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
459 p4_config_pack_cccr(P4_CCCR_MASK_HT));
460 }
461
462 rc = x86_setup_perfctr(event);
463out:
464 put_cpu();
465 return rc;
466}
467
468static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
469{
470 int overflow = 0;
471 u32 low, high;
472
473 rdmsr(hwc->config_base + hwc->idx, low, high);
474
475 /* we need to check high bit for unflagged overflows */
476 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
477 overflow = 1;
478 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
479 ((u64)low) & ~P4_CCCR_OVF);
480 }
481
482 return overflow;
483}
484
485static inline void p4_pmu_disable_event(struct perf_event *event)
486{
487 struct hw_perf_event *hwc = &event->hw;
488
489 /*
490 * If event gets disabled while counter is in overflowed
491 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
492 * asserted again and again
493 */
494 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
495 (u64)(p4_config_unpack_cccr(hwc->config)) &
496 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
497}
498
499static void p4_pmu_disable_all(void)
500{
501 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
502 int idx;
503
504 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
505 struct perf_event *event = cpuc->events[idx];
506 if (!test_bit(idx, cpuc->active_mask))
507 continue;
508 p4_pmu_disable_event(event);
509 }
510}
511
512static void p4_pmu_enable_event(struct perf_event *event)
513{
514 struct hw_perf_event *hwc = &event->hw;
515 int thread = p4_ht_config_thread(hwc->config);
516 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
517 unsigned int idx = p4_config_unpack_event(hwc->config);
518 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
519 struct p4_event_bind *bind;
520 struct p4_cache_event_bind *bind_cache;
521 u64 escr_addr, cccr;
522
523 bind = &p4_event_bind_map[idx];
524 escr_addr = (u64)bind->escr_msr[thread];
525
526 /*
527 * - we dont support cascaded counters yet
528 * - and counter 1 is broken (erratum)
529 */
530 WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
531 WARN_ON_ONCE(hwc->idx == 1);
532
533 /* we need a real Event value */
534 escr_conf &= ~P4_ESCR_EVENT_MASK;
535 escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
536
537 cccr = p4_config_unpack_cccr(hwc->config);
538
539 /*
540 * it could be Cache event so that we need to
541 * set metrics into additional MSRs
542 */
543 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
544 if (idx_cache > P4_CACHE__NONE &&
545 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
546 bind_cache = &p4_cache_event_bind_map[idx_cache];
547 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
548 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
549 }
550
551 (void)checking_wrmsrl(escr_addr, escr_conf);
552 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
553 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
554}
555
556static void p4_pmu_enable_all(int added)
557{
558 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
559 int idx;
560
561 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
562 struct perf_event *event = cpuc->events[idx];
563 if (!test_bit(idx, cpuc->active_mask))
564 continue;
565 p4_pmu_enable_event(event);
566 }
567}
568
569static int p4_pmu_handle_irq(struct pt_regs *regs)
570{
571 struct perf_sample_data data;
572 struct cpu_hw_events *cpuc;
573 struct perf_event *event;
574 struct hw_perf_event *hwc;
575 int idx, handled = 0;
576 u64 val;
577
578 data.addr = 0;
579 data.raw = NULL;
580
581 cpuc = &__get_cpu_var(cpu_hw_events);
582
583 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
584
585 if (!test_bit(idx, cpuc->active_mask))
586 continue;
587
588 event = cpuc->events[idx];
589 hwc = &event->hw;
590
591 WARN_ON_ONCE(hwc->idx != idx);
592
593 /* it might be unflagged overflow */
594 handled = p4_pmu_clear_cccr_ovf(hwc);
595
596 val = x86_perf_event_update(event);
597 if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
598 continue;
599
600 /* event overflow for sure */
601 data.period = event->hw.last_period;
602
603 if (!x86_perf_event_set_period(event))
604 continue;
605 if (perf_event_overflow(event, 1, &data, regs))
606 p4_pmu_disable_event(event);
607 }
608
609 if (handled) {
610 /* p4 quirk: unmask it again */
611 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
612 inc_irq_stat(apic_perf_irqs);
613 }
614
615 return handled;
616}
617
618/*
619 * swap thread specific fields according to a thread
620 * we are going to run on
621 */
622static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
623{
624 u32 escr, cccr;
625
626 /*
627 * we either lucky and continue on same cpu or no HT support
628 */
629 if (!p4_should_swap_ts(hwc->config, cpu))
630 return;
631
632 /*
633 * the event is migrated from an another logical
634 * cpu, so we need to swap thread specific flags
635 */
636
637 escr = p4_config_unpack_escr(hwc->config);
638 cccr = p4_config_unpack_cccr(hwc->config);
639
640 if (p4_ht_thread(cpu)) {
641 cccr &= ~P4_CCCR_OVF_PMI_T0;
642 cccr |= P4_CCCR_OVF_PMI_T1;
643 if (escr & P4_ESCR_T0_OS) {
644 escr &= ~P4_ESCR_T0_OS;
645 escr |= P4_ESCR_T1_OS;
646 }
647 if (escr & P4_ESCR_T0_USR) {
648 escr &= ~P4_ESCR_T0_USR;
649 escr |= P4_ESCR_T1_USR;
650 }
651 hwc->config = p4_config_pack_escr(escr);
652 hwc->config |= p4_config_pack_cccr(cccr);
653 hwc->config |= P4_CONFIG_HT;
654 } else {
655 cccr &= ~P4_CCCR_OVF_PMI_T1;
656 cccr |= P4_CCCR_OVF_PMI_T0;
657 if (escr & P4_ESCR_T1_OS) {
658 escr &= ~P4_ESCR_T1_OS;
659 escr |= P4_ESCR_T0_OS;
660 }
661 if (escr & P4_ESCR_T1_USR) {
662 escr &= ~P4_ESCR_T1_USR;
663 escr |= P4_ESCR_T0_USR;
664 }
665 hwc->config = p4_config_pack_escr(escr);
666 hwc->config |= p4_config_pack_cccr(cccr);
667 hwc->config &= ~P4_CONFIG_HT;
668 }
669}
670
671/*
672 * ESCR address hashing is tricky, ESCRs are not sequential
673 * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
674 * the metric between any ESCRs is laid in range [0xa0,0xe1]
675 *
676 * so we make ~70% filled hashtable
677 */
678
679#define P4_ESCR_MSR_BASE 0x000003a0
680#define P4_ESCR_MSR_MAX 0x000003e1
681#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
682#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
683#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
684
685static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
686 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
687 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
688 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
689 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
690 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
691 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
692 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
693 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
694 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
695 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
696 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
697 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
698 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
699 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
700 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
701 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
702 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
703 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
704 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
705 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
706 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
707 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
708 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
709 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
710 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
711 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
712 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
713 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
714 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
715 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
716 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
717 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
718 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
719 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
720 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
721 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
722 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
723 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
724 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
725 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
726 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
727 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
728 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
729 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
730 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
731 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
732};
733
734static int p4_get_escr_idx(unsigned int addr)
735{
736 unsigned int idx = P4_ESCR_MSR_IDX(addr);
737
738 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
739 !p4_escr_table[idx] ||
740 p4_escr_table[idx] != addr)) {
741 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
742 return -1;
743 }
744
745 return idx;
746}
747
748static int p4_next_cntr(int thread, unsigned long *used_mask,
749 struct p4_event_bind *bind)
750{
751 int i, j;
752
753 for (i = 0; i < P4_CNTR_LIMIT; i++) {
754 j = bind->cntr[thread][i];
755 if (j != -1 && !test_bit(j, used_mask))
756 return j;
757 }
758
759 return -1;
760}
761
762static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
763{
764 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
765 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
766 int cpu = smp_processor_id();
767 struct hw_perf_event *hwc;
768 struct p4_event_bind *bind;
769 unsigned int i, thread, num;
770 int cntr_idx, escr_idx;
771
772 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
773 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
774
775 for (i = 0, num = n; i < n; i++, num--) {
776
777 hwc = &cpuc->event_list[i]->hw;
778 thread = p4_ht_thread(cpu);
779 bind = p4_config_get_bind(hwc->config);
780 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
781 if (unlikely(escr_idx == -1))
782 goto done;
783
784 if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
785 cntr_idx = hwc->idx;
786 if (assign)
787 assign[i] = hwc->idx;
788 goto reserve;
789 }
790
791 cntr_idx = p4_next_cntr(thread, used_mask, bind);
792 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
793 goto done;
794
795 p4_pmu_swap_config_ts(hwc, cpu);
796 if (assign)
797 assign[i] = cntr_idx;
798reserve:
799 set_bit(cntr_idx, used_mask);
800 set_bit(escr_idx, escr_mask);
801 }
802
803done:
804 return num ? -ENOSPC : 0;
805}
806
807static __initconst const struct x86_pmu p4_pmu = {
808 .name = "Netburst P4/Xeon",
809 .handle_irq = p4_pmu_handle_irq,
810 .disable_all = p4_pmu_disable_all,
811 .enable_all = p4_pmu_enable_all,
812 .enable = p4_pmu_enable_event,
813 .disable = p4_pmu_disable_event,
814 .eventsel = MSR_P4_BPU_CCCR0,
815 .perfctr = MSR_P4_BPU_PERFCTR0,
816 .event_map = p4_pmu_event_map,
817 .max_events = ARRAY_SIZE(p4_general_events),
818 .get_event_constraints = x86_get_event_constraints,
819 /*
820 * IF HT disabled we may need to use all
821 * ARCH_P4_MAX_CCCR counters simulaneously
822 * though leave it restricted at moment assuming
823 * HT is on
824 */
825 .num_counters = ARCH_P4_MAX_CCCR,
826 .apic = 1,
827 .cntval_bits = 40,
828 .cntval_mask = (1ULL << 40) - 1,
829 .max_period = (1ULL << 39) - 1,
830 .hw_config = p4_hw_config,
831 .schedule_events = p4_pmu_schedule_events,
832};
833
834static __init int p4_pmu_init(void)
835{
836 unsigned int low, high;
837
838 /* If we get stripped -- indexig fails */
839 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
840
841 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
842 if (!(low & (1 << 7))) {
843 pr_cont("unsupported Netburst CPU model %d ",
844 boot_cpu_data.x86_model);
845 return -ENODEV;
846 }
847
848 memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
849 sizeof(hw_cache_event_ids));
850
851 pr_cont("Netburst events, ");
852
853 x86_pmu = p4_pmu;
854
855 return 0;
856}
857
858#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14d..34ba07be2cd 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
27 */ 27 */
28#define P6_NOP_EVENT 0x0000002EULL 28#define P6_NOP_EVENT 0x0000002EULL
29 29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] = 30static struct event_constraint p6_event_constraints[] =
49{ 31{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ 32 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
66 wrmsrl(MSR_P6_EVNTSEL0, val); 48 wrmsrl(MSR_P6_EVNTSEL0, val);
67} 49}
68 50
69static void p6_pmu_enable_all(void) 51static void p6_pmu_enable_all(int added)
70{ 52{
71 unsigned long val; 53 unsigned long val;
72 54
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103} 85}
104 86
105static __initconst struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
106 .name = "p6", 88 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq, 89 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all, 90 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all, 91 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event, 92 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event, 93 .disable = p6_pmu_disable_event,
94 .hw_config = x86_pmu_hw_config,
95 .schedule_events = x86_schedule_events,
112 .eventsel = MSR_P6_EVNTSEL0, 96 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0, 97 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map, 98 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map), 99 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1, 100 .apic = 1,
118 .max_period = (1ULL << 31) - 1, 101 .max_period = (1ULL << 31) - 1,
119 .version = 0, 102 .version = 0,
120 .num_events = 2, 103 .num_counters = 2,
121 /* 104 /*
122 * Events have 40 bits implemented. However they are designed such 105 * Events have 40 bits implemented. However they are designed such
123 * that bits [32-39] are sign extensions of bit 31. As such the 106 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
125 * 108 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B 109 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */ 110 */
128 .event_bits = 32, 111 .cntval_bits = 32,
129 .event_mask = (1ULL << 32) - 1, 112 .cntval_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints, 113 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints, 114 .event_constraints = p6_event_constraints,
132}; 115};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba232..b9d1ff58844 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/div64.h> 26#include <asm/div64.h>
27#include <asm/vmware.h>
28#include <asm/x86_init.h> 27#include <asm/x86_init.h>
28#include <asm/hypervisor.h>
29 29
30#define CPUID_VMWARE_INFO_LEAF 0x40000000 30#define CPUID_VMWARE_INFO_LEAF 0x40000000
31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -65,7 +65,7 @@ static unsigned long vmware_get_tsc_khz(void)
65 return tsc_hz; 65 return tsc_hz;
66} 66}
67 67
68void __init vmware_platform_setup(void) 68static void __init vmware_platform_setup(void)
69{ 69{
70 uint32_t eax, ebx, ecx, edx; 70 uint32_t eax, ebx, ecx, edx;
71 71
@@ -83,26 +83,22 @@ void __init vmware_platform_setup(void)
83 * serial key should be enough, as this will always have a VMware 83 * serial key should be enough, as this will always have a VMware
84 * specific string when running under VMware hypervisor. 84 * specific string when running under VMware hypervisor.
85 */ 85 */
86int vmware_platform(void) 86static bool __init vmware_platform(void)
87{ 87{
88 if (cpu_has_hypervisor) { 88 if (cpu_has_hypervisor) {
89 unsigned int eax, ebx, ecx, edx; 89 unsigned int eax;
90 char hyper_vendor_id[13]; 90 unsigned int hyper_vendor_id[3];
91 91
92 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); 92 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
93 memcpy(hyper_vendor_id + 0, &ebx, 4); 93 &hyper_vendor_id[1], &hyper_vendor_id[2]);
94 memcpy(hyper_vendor_id + 4, &ecx, 4); 94 if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
95 memcpy(hyper_vendor_id + 8, &edx, 4); 95 return true;
96 hyper_vendor_id[12] = '\0';
97 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
98 return 1;
99 } else if (dmi_available && dmi_name_in_serial("VMware") && 96 } else if (dmi_available && dmi_name_in_serial("VMware") &&
100 __vmware_platform()) 97 __vmware_platform())
101 return 1; 98 return true;
102 99
103 return 0; 100 return false;
104} 101}
105EXPORT_SYMBOL(vmware_platform);
106 102
107/* 103/*
108 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 104 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +112,16 @@ EXPORT_SYMBOL(vmware_platform);
116 * so that the kernel could just trust the hypervisor with providing a 112 * so that the kernel could just trust the hypervisor with providing a
117 * reliable virtual TSC that is suitable for timekeeping. 113 * reliable virtual TSC that is suitable for timekeeping.
118 */ 114 */
119void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) 115static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
120{ 116{
121 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
122 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 118 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
123} 119}
120
121const __refconst struct hypervisor_x86 x86_hyper_vmware = {
122 .name = "VMware",
123 .detect = vmware_platform,
124 .set_cpu_features = vmware_set_cpu_features,
125 .init_platform = vmware_platform_setup,
126};
127EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8b862d5900f..1b7b31ab7d8 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
170 cpuid_device_destroy(cpu); 170 cpuid_device_destroy(cpu);
171 break; 171 break;
172 } 172 }
173 return err ? NOTIFY_BAD : NOTIFY_OK; 173 return notifier_from_errno(err);
174} 174}
175 175
176static struct notifier_block __refdata cpuid_class_cpu_notifier = 176static struct notifier_block __refdata cpuid_class_cpu_notifier =
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e..00000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done)
11 * - buffer access
12 *
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
16 *
17 *
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */
21
22#include <linux/kernel.h>
23#include <linux/string.h>
24#include <linux/errno.h>
25#include <linux/sched.h>
26#include <linux/slab.h>
27#include <linux/mm.h>
28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31
32#include "ds_selftest.h"
33
34/*
35 * The configuration for a particular DS hardware implementation:
36 */
37struct ds_configuration {
38 /* The name of the configuration: */
39 const char *name;
40
41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
42 unsigned char sizeof_ptr_field;
43
44 /* The size of a BTS/PEBS record in bytes: */
45 unsigned char sizeof_rec[2];
46
47 /* The number of pebs counter reset values in the DS structure. */
48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
52};
53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
58
59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
61
62/* BTS and PEBS buffer alignment: */
63#define DS_ALIGNMENT (1 << 3)
64
65/* Number of buffer pointers in DS: */
66#define NUM_DS_PTR_FIELDS 8
67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
77
78/*
79 * A BTS or PEBS tracer.
80 *
81 * This holds the configuration of the tracer and serves as a handle
82 * to identify tracers.
83 */
84struct ds_tracer {
85 /* The DS context (partially) owned by this tracer. */
86 struct ds_context *context;
87 /* The buffer provided on ds_request() and its size in bytes. */
88 void *buffer;
89 size_t size;
90};
91
92struct bts_tracer {
93 /* The common DS part: */
94 struct ds_tracer ds;
95
96 /* The trace including the DS configuration: */
97 struct bts_trace trace;
98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
104};
105
106struct pebs_tracer {
107 /* The common DS part: */
108 struct ds_tracer ds;
109
110 /* The trace including the DS configuration: */
111 struct pebs_trace trace;
112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
115};
116
117/*
118 * Debug Store (DS) save area configuration (see Intel64 and IA32
119 * Architectures Software Developer's Manual, section 18.5)
120 *
121 * The DS configuration consists of the following fields; different
122 * architetures vary in the size of those fields.
123 *
124 * - double-word aligned base linear address of the BTS buffer
125 * - write pointer into the BTS buffer
126 * - end linear address of the BTS buffer (one byte beyond the end of
127 * the buffer)
128 * - interrupt pointer into BTS buffer
129 * (interrupt occurs when write pointer passes interrupt pointer)
130 * - double-word aligned base linear address of the PEBS buffer
131 * - write pointer into the PEBS buffer
132 * - end linear address of the PEBS buffer (one byte beyond the end of
133 * the buffer)
134 * - interrupt pointer into PEBS buffer
135 * (interrupt occurs when write pointer passes interrupt pointer)
136 * - value to which counter is reset following counter overflow
137 *
138 * Later architectures use 64bit pointers throughout, whereas earlier
139 * architectures use 32bit pointers in 32bit mode.
140 *
141 *
142 * We compute the base address for the first 8 fields based on:
143 * - the field size stored in the DS configuration
144 * - the relative field position
145 * - an offset giving the start of the respective region
146 *
147 * This offset is further used to index various arrays holding
148 * information for BTS and PEBS at the respective index.
149 *
150 * On later 32bit processors, we only access the lower 32bit of the
151 * 64bit pointer fields. The upper halves will be zeroed out.
152 */
153
154enum ds_field {
155 ds_buffer_base = 0,
156 ds_index,
157 ds_absolute_maximum,
158 ds_interrupt_threshold,
159};
160
161enum ds_qualifier {
162 ds_bts = 0,
163 ds_pebs
164};
165
166static inline unsigned long
167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
168{
169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
170 return *(unsigned long *)base;
171}
172
173static inline void
174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
176{
177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
178 (*(unsigned long *)base) = value;
179}
180
181
182/*
183 * Locking is done only for allocating BTS or PEBS resources.
184 */
185static DEFINE_SPINLOCK(ds_lock);
186
187/*
188 * We either support (system-wide) per-cpu or per-thread allocation.
189 * We distinguish the two based on the task_struct pointer, where a
190 * NULL pointer indicates per-cpu allocation for the current cpu.
191 *
192 * Allocations are use-counted. As soon as resources are allocated,
193 * further allocations must be of the same type (per-cpu or
194 * per-thread). We model this by counting allocations (i.e. the number
195 * of tracers of a certain type) for one type negatively:
196 * =0 no tracers
197 * >0 number of per-thread tracers
198 * <0 number of per-cpu tracers
199 *
200 * Tracers essentially gives the number of ds contexts for a certain
201 * type of allocation.
202 */
203static atomic_t tracers = ATOMIC_INIT(0);
204
205static inline int get_tracer(struct task_struct *task)
206{
207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
215 atomic_inc(&tracers);
216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
227}
228
229static inline void put_tracer(struct task_struct *task)
230{
231 if (task)
232 atomic_dec(&tracers);
233 else
234 atomic_inc(&tracers);
235}
236
237/*
238 * The DS context is either attached to a thread or to a cpu:
239 * - in the former case, the thread_struct contains a pointer to the
240 * attached context.
241 * - in the latter case, we use a static array of per-cpu context
242 * pointers.
243 *
244 * Contexts are use-counted. They are allocated on first access and
245 * deallocated when the last user puts the context.
246 */
247struct ds_context {
248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
249 unsigned char ds[MAX_SIZEOF_DS];
250
251 /* The owner of the BTS and PEBS configuration, respectively: */
252 struct bts_tracer *bts_master;
253 struct pebs_tracer *pebs_master;
254
255 /* Use count: */
256 unsigned long count;
257
258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
267
268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269
270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{
273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL;
277
278 /* Chances are small that we already have a context. */
279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
280 if (!new_context)
281 return NULL;
282
283 spin_lock_irq(&ds_lock);
284
285 context = *p_context;
286 if (likely(!context)) {
287 context = new_context;
288
289 context->this = p_context;
290 context->task = task;
291 context->cpu = cpu;
292 context->count = 0;
293
294 *p_context = context;
295 }
296
297 context->count++;
298
299 spin_unlock_irq(&ds_lock);
300
301 if (context != new_context)
302 kfree(new_context);
303
304 return context;
305}
306
307static void ds_put_context(struct ds_context *context)
308{
309 struct task_struct *task;
310 unsigned long irq;
311
312 if (!context)
313 return;
314
315 spin_lock_irqsave(&ds_lock, irq);
316
317 if (--context->count) {
318 spin_unlock_irqrestore(&ds_lock, irq);
319 return;
320 }
321
322 *(context->this) = NULL;
323
324 task = context->task;
325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
328
329 /*
330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
338
339 spin_unlock_irqrestore(&ds_lock, irq);
340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
345 kfree(context);
346}
347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
361 * the same confiuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
373
374/*
375 * Call the tracer's callback on a buffer overflow.
376 *
377 * context: the ds context
378 * qual: the buffer type
379 */
380static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
381{
382 switch (qual) {
383 case ds_bts:
384 if (context->bts_master &&
385 context->bts_master->ovfl)
386 context->bts_master->ovfl(context->bts_master);
387 break;
388 case ds_pebs:
389 if (context->pebs_master &&
390 context->pebs_master->ovfl)
391 context->pebs_master->ovfl(context->pebs_master);
392 break;
393 }
394}
395
396
397/*
398 * Write raw data into the BTS or PEBS buffer.
399 *
400 * The remainder of any partially written record is zeroed out.
401 *
402 * context: the DS context
403 * qual: the buffer type
404 * record: the data to write
405 * size: the size of the data
406 */
407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
408 const void *record, size_t size)
409{
410 int bytes_written = 0;
411
412 if (!record)
413 return -EINVAL;
414
415 while (size) {
416 unsigned long base, index, end, write_end, int_th;
417 unsigned long write_size, adj_write_size;
418
419 /*
420 * Write as much as possible without producing an
421 * overflow interrupt.
422 *
423 * Interrupt_threshold must either be
424 * - bigger than absolute_maximum or
425 * - point to a record between buffer_base and absolute_maximum
426 *
427 * Index points to a valid record.
428 */
429 base = ds_get(context->ds, qual, ds_buffer_base);
430 index = ds_get(context->ds, qual, ds_index);
431 end = ds_get(context->ds, qual, ds_absolute_maximum);
432 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
433
434 write_end = min(end, int_th);
435
436 /*
437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
440 if (write_end <= index)
441 write_end = end;
442
443 if (write_end <= index)
444 break;
445
446 write_size = min((unsigned long) size, write_end - index);
447 memcpy((void *)index, record, write_size);
448
449 record = (const char *)record + write_size;
450 size -= write_size;
451 bytes_written += write_size;
452
453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
454 adj_write_size *= ds_cfg.sizeof_rec[qual];
455
456 /* Zero out trailing bytes. */
457 memset((char *)index + write_size, 0,
458 adj_write_size - write_size);
459 index += adj_write_size;
460
461 if (index >= end)
462 index = base;
463 ds_set(context->ds, qual, ds_index, index);
464
465 if (index >= int_th)
466 ds_overflow(context, qual);
467 }
468
469 return bytes_written;
470}
471
472
473/*
474 * Branch Trace Store (BTS) uses the following format. Different
475 * architectures vary in the size of those fields.
476 * - source linear address
477 * - destination linear address
478 * - flags
479 *
480 * Later architectures use 64bit pointers throughout, whereas earlier
481 * architectures use 32bit pointers in 32bit mode.
482 *
483 * We compute the base address for the fields based on:
484 * - the field size stored in the DS configuration
485 * - the relative field position
486 *
487 * In order to store additional information in the BTS buffer, we use
488 * a special source address to indicate that the record requires
489 * special interpretation.
490 *
491 * Netburst indicated via a bit in the flags field whether the branch
492 * was predicted; this is ignored.
493 *
494 * We use two levels of abstraction:
495 * - the raw data level defined here
496 * - an arch-independent level defined in ds.h
497 */
498
499enum bts_field {
500 bts_from,
501 bts_to,
502 bts_flags,
503
504 bts_qual = bts_from,
505 bts_clock = bts_to,
506 bts_pid = bts_flags,
507
508 bts_qual_mask = (bts_qual_max - 1),
509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
510};
511
512static inline unsigned long bts_get(const char *base, unsigned long field)
513{
514 base += (ds_cfg.sizeof_ptr_field * field);
515 return *(unsigned long *)base;
516}
517
518static inline void bts_set(char *base, unsigned long field, unsigned long val)
519{
520 base += (ds_cfg.sizeof_ptr_field * field);
521 (*(unsigned long *)base) = val;
522}
523
524
525/*
526 * The raw BTS data is architecture dependent.
527 *
528 * For higher-level users, we give an arch-independent view.
529 * - ds.h defines struct bts_struct
530 * - bts_read translates one raw bts record into a bts_struct
531 * - bts_write translates one bts_struct into the raw format and
532 * writes it into the top of the parameter tracer's buffer.
533 *
534 * return: bytes read/written on success; -Eerrno, otherwise
535 */
536static int
537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
538{
539 if (!tracer)
540 return -EINVAL;
541
542 if (at < tracer->trace.ds.begin)
543 return -EINVAL;
544
545 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
546 return -EINVAL;
547
548 memset(out, 0, sizeof(*out));
549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
551 out->variant.event.clock = bts_get(at, bts_clock);
552 out->variant.event.pid = bts_get(at, bts_pid);
553 } else {
554 out->qualifier = bts_branch;
555 out->variant.lbr.from = bts_get(at, bts_from);
556 out->variant.lbr.to = bts_get(at, bts_to);
557
558 if (!out->variant.lbr.from && !out->variant.lbr.to)
559 out->qualifier = bts_invalid;
560 }
561
562 return ds_cfg.sizeof_rec[ds_bts];
563}
564
565static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
566{
567 unsigned char raw[MAX_SIZEOF_BTS];
568
569 if (!tracer)
570 return -EINVAL;
571
572 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
573 return -EOVERFLOW;
574
575 switch (in->qualifier) {
576 case bts_invalid:
577 bts_set(raw, bts_from, 0);
578 bts_set(raw, bts_to, 0);
579 bts_set(raw, bts_flags, 0);
580 break;
581 case bts_branch:
582 bts_set(raw, bts_from, in->variant.lbr.from);
583 bts_set(raw, bts_to, in->variant.lbr.to);
584 bts_set(raw, bts_flags, 0);
585 break;
586 case bts_task_arrives:
587 case bts_task_departs:
588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
589 bts_set(raw, bts_clock, in->variant.event.clock);
590 bts_set(raw, bts_pid, in->variant.event.pid);
591 break;
592 default:
593 return -EINVAL;
594 }
595
596 return ds_write(tracer->ds.context, ds_bts, raw,
597 ds_cfg.sizeof_rec[ds_bts]);
598}
599
600
601static void ds_write_config(struct ds_context *context,
602 struct ds_trace *cfg, enum ds_qualifier qual)
603{
604 unsigned char *ds = context->ds;
605
606 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
607 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
608 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
609 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
610}
611
612static void ds_read_config(struct ds_context *context,
613 struct ds_trace *cfg, enum ds_qualifier qual)
614{
615 unsigned char *ds = context->ds;
616
617 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
618 cfg->top = (void *)ds_get(ds, qual, ds_index);
619 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
620 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
621}
622
623static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
624 void *base, size_t size, size_t ith,
625 unsigned int flags) {
626 unsigned long buffer, adj;
627
628 /*
629 * Adjust the buffer address and size to meet alignment
630 * constraints:
631 * - buffer is double-word aligned
632 * - size is multiple of record size
633 *
634 * We checked the size at the very beginning; we have enough
635 * space to do the adjustment.
636 */
637 buffer = (unsigned long)base;
638
639 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
640 buffer += adj;
641 size -= adj;
642
643 trace->n = size / ds_cfg.sizeof_rec[qual];
644 trace->size = ds_cfg.sizeof_rec[qual];
645
646 size = (trace->n * trace->size);
647
648 trace->begin = (void *)buffer;
649 trace->top = trace->begin;
650 trace->end = (void *)(buffer + size);
651 /*
652 * The value for 'no threshold' is -1, which will set the
653 * threshold outside of the buffer, just like we want it.
654 */
655 ith *= ds_cfg.sizeof_rec[qual];
656 trace->ith = (void *)(buffer + size - ith);
657
658 trace->flags = flags;
659}
660
661
662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
663 enum ds_qualifier qual, struct task_struct *task,
664 int cpu, void *base, size_t size, size_t th)
665{
666 struct ds_context *context;
667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
673
674 error = -EINVAL;
675 if (!base)
676 goto out;
677
678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
683 error = -EINVAL;
684 if (size < req_size)
685 goto out;
686
687 if (th != (size_t)-1) {
688 th *= ds_cfg.sizeof_rec[qual];
689
690 error = -EINVAL;
691 if (size <= th)
692 goto out;
693 }
694
695 tracer->buffer = base;
696 tracer->size = size;
697
698 error = -ENOMEM;
699 context = ds_get_context(task, cpu);
700 if (!context)
701 goto out;
702 tracer->context = context;
703
704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
708
709 error = 0;
710 out:
711 return error;
712}
713
714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
715 void *base, size_t size,
716 bts_ovfl_callback_t ovfl, size_t th,
717 unsigned int flags)
718{
719 struct bts_tracer *tracer;
720 int error;
721
722 /* Buffer overflow notification is not yet implemented. */
723 error = -EOPNOTSUPP;
724 if (ovfl)
725 goto out;
726
727 error = get_tracer(task);
728 if (error < 0)
729 goto out;
730
731 error = -ENOMEM;
732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
733 if (!tracer)
734 goto out_put_tracer;
735 tracer->ovfl = ovfl;
736
737 /* Do some more error checking and acquire a tracing context. */
738 error = ds_request(&tracer->ds, &tracer->trace.ds,
739 ds_bts, task, cpu, base, size, th);
740 if (error < 0)
741 goto out_tracer;
742
743 /* Claim the bts part of the tracing context we acquired above. */
744 spin_lock_irq(&ds_lock);
745
746 error = -EPERM;
747 if (tracer->ds.context->bts_master)
748 goto out_unlock;
749 tracer->ds.context->bts_master = tracer;
750
751 spin_unlock_irq(&ds_lock);
752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
760
761 tracer->trace.read = bts_read;
762 tracer->trace.write = bts_write;
763
764 /* Start tracing. */
765 ds_resume_bts(tracer);
766
767 return tracer;
768
769 out_unlock:
770 spin_unlock_irq(&ds_lock);
771 ds_put_context(tracer->ds.context);
772 out_tracer:
773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
776 out:
777 return ERR_PTR(error);
778}
779
780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
781 void *base, size_t size,
782 bts_ovfl_callback_t ovfl,
783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
799{
800 struct pebs_tracer *tracer;
801 int error;
802
803 /* Buffer overflow notification is not yet implemented. */
804 error = -EOPNOTSUPP;
805 if (ovfl)
806 goto out;
807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
812 error = -ENOMEM;
813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
814 if (!tracer)
815 goto out_put_tracer;
816 tracer->ovfl = ovfl;
817
818 /* Do some more error checking and acquire a tracing context. */
819 error = ds_request(&tracer->ds, &tracer->trace.ds,
820 ds_pebs, task, cpu, base, size, th);
821 if (error < 0)
822 goto out_tracer;
823
824 /* Claim the pebs part of the tracing context we acquired above. */
825 spin_lock_irq(&ds_lock);
826
827 error = -EPERM;
828 if (tracer->ds.context->pebs_master)
829 goto out_unlock;
830 tracer->ds.context->pebs_master = tracer;
831
832 spin_unlock_irq(&ds_lock);
833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
843 ds_resume_pebs(tracer);
844
845 return tracer;
846
847 out_unlock:
848 spin_unlock_irq(&ds_lock);
849 ds_put_context(tracer->ds.context);
850 out_tracer:
851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
854 out:
855 return ERR_PTR(error);
856}
857
858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
862{
863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
864}
865
866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
878
879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
880 tracer->ds.context->bts_master = NULL;
881
882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
888
889 kfree(tracer);
890}
891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
945void ds_suspend_bts(struct bts_tracer *tracer)
946{
947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
950
951 if (!tracer)
952 return;
953
954 tracer->flags = 0;
955
956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
958
959 WARN_ON(!task && irqs_disabled());
960
961 debugctlmsr = (task ?
962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
965
966 if (task)
967 update_task_debugctlmsr(task, debugctlmsr);
968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
970}
971
972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
973{
974 struct task_struct *task;
975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
977
978 if (!tracer)
979 return 0;
980
981 tracer->flags = 0;
982
983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
1011
1012 control = ds_cfg.ctl[dsf_bts];
1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
1014 control |= ds_cfg.ctl[dsf_bts_kernel];
1015 if (!(tracer->trace.ds.flags & BTS_USER))
1016 control |= ds_cfg.ctl[dsf_bts_user];
1017
1018 return control;
1019}
1020
1021void ds_resume_bts(struct bts_tracer *tracer)
1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
1027 if (!tracer)
1028 return;
1029
1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
1089
1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
1091 tracer->ds.context->pebs_master = NULL;
1092
1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
1095
1096 kfree(tracer);
1097}
1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
1141void ds_suspend_pebs(struct pebs_tracer *tracer)
1142{
1143
1144}
1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
1151void ds_resume_pebs(struct pebs_tracer *tracer)
1152{
1153
1154}
1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
1162{
1163 if (!tracer)
1164 return NULL;
1165
1166 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
1167 return &tracer->trace;
1168}
1169
1170const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
1171{
1172 if (!tracer)
1173 return NULL;
1174
1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
1176
1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
1182
1183 return &tracer->trace;
1184}
1185
1186int ds_reset_bts(struct bts_tracer *tracer)
1187{
1188 if (!tracer)
1189 return -EINVAL;
1190
1191 tracer->trace.ds.top = tracer->trace.ds.begin;
1192
1193 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
1194 (unsigned long)tracer->trace.ds.top);
1195
1196 return 0;
1197}
1198
1199int ds_reset_pebs(struct pebs_tracer *tracer)
1200{
1201 if (!tracer)
1202 return -EINVAL;
1203
1204 tracer->trace.ds.top = tracer->trace.ds.begin;
1205
1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
1207 (unsigned long)tracer->trace.ds.top);
1208
1209 return 0;
1210}
1211
1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
1214{
1215 if (!tracer)
1216 return -EINVAL;
1217
1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
1224
1225 return 0;
1226}
1227
1228static const struct ds_configuration ds_cfg_netburst = {
1229 .name = "Netburst",
1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
1231 .ctl[dsf_bts_kernel] = (1 << 5),
1232 .ctl[dsf_bts_user] = (1 << 6),
1233 .nr_counter_reset = 1,
1234};
1235static const struct ds_configuration ds_cfg_pentium_m = {
1236 .name = "Pentium M",
1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1238 .nr_counter_reset = 1,
1239};
1240static const struct ds_configuration ds_cfg_core2_atom = {
1241 .name = "Core 2/Atom",
1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1243 .ctl[dsf_bts_kernel] = (1 << 9),
1244 .ctl[dsf_bts_user] = (1 << 10),
1245 .nr_counter_reset = 1,
1246};
1247static const struct ds_configuration ds_cfg_core_i7 = {
1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
1253};
1254
1255static void
1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
1296 ds_cfg = *cfg;
1297
1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
1303
1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
1305 ds_cfg.sizeof_rec[ds_bts] = 0;
1306 printk(KERN_INFO "[ds] bts not available\n");
1307 }
1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
1317
1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
1319}
1320
1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
1327 switch (c->x86) {
1328 case 0x6:
1329 switch (c->x86_model) {
1330 case 0x9:
1331 case 0xd: /* Pentium M */
1332 ds_configure(&ds_cfg_pentium_m, c);
1333 break;
1334 case 0xf:
1335 case 0x17: /* Core2 */
1336 case 0x1c: /* Atom */
1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
1341 break;
1342 default:
1343 /* Sorry, don't know about them. */
1344 break;
1345 }
1346 break;
1347 case 0xf:
1348 switch (c->x86_model) {
1349 case 0x0:
1350 case 0x1:
1351 case 0x2: /* Netburst */
1352 ds_configure(&ds_cfg_netburst, c);
1353 break;
1354 default:
1355 /* Sorry, don't know about them. */
1356 break;
1357 }
1358 break;
1359 default:
1360 /* Sorry, don't know about them. */
1361 break;
1362 }
1363}
1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
1386/*
1387 * Change the DS configuration from tracing prev to tracing next.
1388 */
1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
1390{
1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
1397
1398 if (prev_ctx) {
1399 update_debugctlmsr(0);
1400
1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
1402 }
1403
1404 if (next_ctx) {
1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1406
1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1408 }
1409
1410 update_debugctlmsr(debugctlmsr);
1411}
1412
1413static __init int ds_selftest(void)
1414{
1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1416 int error;
1417
1418 error = ds_selftest_bts();
1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab9..00000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
19#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
20#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
21
22struct ds_selftest_bts_conf {
23 struct bts_tracer *tracer;
24 int error;
25 int (*suspend)(struct bts_tracer *);
26 int (*resume)(struct bts_tracer *);
27};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
59 * We allow top in [begin; end], since its not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151 /* We should meanwhile have enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167 * buffer overflow and end up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216 * buffer overflow and end up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
249static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
250{
251 ds_suspend_bts(tracer);
252 return 0;
253}
254
255static int ds_resume_bts_wrap(struct bts_tracer *tracer)
256{
257 ds_resume_bts(tracer);
258 return 0;
259}
260
261static void ds_release_bts_noirq_wrap(void *tracer)
262{
263 (void)ds_release_bts_noirq(tracer);
264}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
303static int ds_selftest_bts_bad_request_task(void *buffer)
304{
305 struct bts_tracer *tracer;
306 int error;
307
308 /* Try to request cpu tracing while task tracing is active. */
309 tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
310 (size_t)-1, BTS_KERNEL);
311 error = PTR_ERR(tracer);
312 if (!IS_ERR(tracer)) {
313 error = 0;
314 ds_release_bts(tracer);
315 }
316
317 if (error != -EPERM)
318 printk(KERN_CONT "task/cpu tracing overlap...");
319
320 return error ? 0 : -1;
321}
322
323int ds_selftest_bts(void)
324{
325 struct ds_selftest_bts_conf conf;
326 unsigned char buffer[BUFFER_SIZE], *small_buffer;
327 unsigned long irq;
328 int cpu;
329
330 printk(KERN_INFO "[ds] bts selftest...");
331 conf.error = 0;
332
333 small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
334
335 get_online_cpus();
336 for_each_online_cpu(cpu) {
337 conf.suspend = ds_suspend_bts_wrap;
338 conf.resume = ds_resume_bts_wrap;
339 conf.tracer =
340 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
341 NULL, (size_t)-1, BTS_KERNEL);
342 ds_selftest_bts_cpu(&conf);
343 if (conf.error >= 0)
344 conf.error = ds_selftest_bts_bad_request_task(buffer);
345 ds_release_bts(conf.tracer);
346 if (conf.error < 0)
347 goto out;
348
349 conf.suspend = ds_suspend_bts_noirq;
350 conf.resume = ds_resume_bts_noirq;
351 conf.tracer =
352 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
353 NULL, (size_t)-1, BTS_KERNEL);
354 smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
355 if (conf.error >= 0) {
356 conf.error =
357 ds_selftest_bts_bad_release_noirq(cpu,
358 conf.tracer);
359 /* We must not release the tracer twice. */
360 if (conf.error < 0)
361 conf.tracer = NULL;
362 }
363 if (conf.error >= 0)
364 conf.error = ds_selftest_bts_bad_request_task(buffer);
365 smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
366 conf.tracer, 1);
367 if (conf.error < 0)
368 goto out;
369 }
370
371 conf.suspend = ds_suspend_bts_wrap;
372 conf.resume = ds_resume_bts_wrap;
373 conf.tracer =
374 ds_request_bts_task(current, buffer, BUFFER_SIZE,
375 NULL, (size_t)-1, BTS_KERNEL);
376 ds_selftest_bts_cpu(&conf);
377 if (conf.error >= 0)
378 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
379 ds_release_bts(conf.tracer);
380 if (conf.error < 0)
381 goto out;
382
383 conf.suspend = ds_suspend_bts_noirq;
384 conf.resume = ds_resume_bts_noirq;
385 conf.tracer =
386 ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
387 NULL, (size_t)-1, BTS_KERNEL);
388 local_irq_save(irq);
389 ds_selftest_bts_cpu(&conf);
390 if (conf.error >= 0)
391 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
392 ds_release_bts_noirq(conf.tracer);
393 local_irq_restore(irq);
394 if (conf.error < 0)
395 goto out;
396
397 conf.error = 0;
398 out:
399 put_online_cpus();
400 printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
401
402 return conf.error;
403}
404
405int ds_selftest_pebs(void)
406{
407 return 0;
408}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c666..00000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#ifdef CONFIG_X86_DS_SELFTEST
10extern int ds_selftest_bts(void);
11extern int ds_selftest_pebs(void);
12#else
13static inline int ds_selftest_bts(void) { return 0; }
14static inline int ds_selftest_pebs(void) { return 0; }
15#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780..c89a386930b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
224 int cpu; 224 int cpu;
225 unsigned long flags; 225 unsigned long flags;
226 226
227 /* notify the hw-branch tracer so it may disable tracing and
228 add the last trace to the trace buffer -
229 the earlier this happens, the more useful the trace. */
230 trace_hw_branch_oops();
231
232 oops_enter(); 227 oops_enter();
233 228
234 /* racy, but better than risking deadlock. */ 229 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b9c830c12b4..fa99bae75ac 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n)
41 writew(0x720, VGABASE + 2*(max_xpos*j + i)); 41 writew(0x720, VGABASE + 2*(max_xpos*j + i));
42 current_ypos = max_ypos-1; 42 current_ypos = max_ypos-1;
43 } 43 }
44#ifdef CONFIG_KGDB_KDB
45 if (c == '\b') {
46 if (current_xpos > 0)
47 current_xpos--;
48 } else if (c == '\r') {
49 current_xpos = 0;
50 } else
51#endif
44 if (c == '\n') { 52 if (c == '\n') {
45 current_xpos = 0; 53 current_xpos = 0;
46 current_ypos++; 54 current_ypos++;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc673..cd49141cf15 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
53#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56#include <asm/cpufeature.h>
56 57
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 58/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h> 59#include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 906 RING0_INT_FRAME
906 pushl $0 907 pushl $0
907 CFI_ADJUST_CFA_OFFSET 4 908 CFI_ADJUST_CFA_OFFSET 4
909#ifdef CONFIG_X86_INVD_BUG
910 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
911661: pushl $do_general_protection
912662:
913.section .altinstructions,"a"
914 .balign 4
915 .long 661b
916 .long 663f
917 .byte X86_FEATURE_XMM
918 .byte 662b-661b
919 .byte 664f-663f
920.previous
921.section .altinstr_replacement,"ax"
922663: pushl $do_simd_coprocessor_error
923664:
924.previous
925#else
908 pushl $do_simd_coprocessor_error 926 pushl $do_simd_coprocessor_error
927#endif
909 CFI_ADJUST_CFA_OFFSET 4 928 CFI_ADJUST_CFA_OFFSET 4
910 jmp error_code 929 jmp error_code
911 CFI_ENDPROC 930 CFI_ENDPROC
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 23b4ecdffa9..a198b7c87a1 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -36,6 +36,7 @@
36unsigned long hpet_address; 36unsigned long hpet_address;
37u8 hpet_blockid; /* OS timer block num */ 37u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable; 38u8 hpet_msi_disable;
39u8 hpet_readback_cmp;
39 40
40#ifdef CONFIG_PCI_MSI 41#ifdef CONFIG_PCI_MSI
41static unsigned long hpet_num_timers; 42static unsigned long hpet_num_timers;
@@ -395,19 +396,23 @@ static int hpet_next_event(unsigned long delta,
395 * at that point and we would wait for the next hpet interrupt 396 * at that point and we would wait for the next hpet interrupt
396 * forever. We found out that reading the CMP register back 397 * forever. We found out that reading the CMP register back
397 * forces the transfer so we can rely on the comparison with 398 * forces the transfer so we can rely on the comparison with
398 * the counter register below. If the read back from the 399 * the counter register below.
399 * compare register does not match the value we programmed 400 *
400 * then we might have a real hardware problem. We can not do 401 * That works fine on those ATI chipsets, but on newer Intel
401 * much about it here, but at least alert the user/admin with 402 * chipsets (ICH9...) this triggers due to an erratum: Reading
402 * a prominent warning. 403 * the comparator immediately following a write is returning
403 * An erratum on some chipsets (ICH9,..), results in comparator read 404 * the old value.
404 * immediately following a write returning old value. Workaround 405 *
405 * for this is to read this value second time, when first 406 * We restrict the read back to the affected ATI chipsets (set
406 * read returns old value. 407 * by quirks) and also run it with hpet=verbose for debugging
408 * purposes.
407 */ 409 */
408 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { 410 if (hpet_readback_cmp || hpet_verbose) {
409 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, 411 u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
410 KERN_WARNING "hpet: compare register read back failed.\n"); 412
413 if (cmp != cnt)
414 printk_once(KERN_WARNING
415 "hpet: compare register read back failed.\n");
411 } 416 }
412 417
413 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 418 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519..a8f1b803d2f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len)
189} 189}
190 190
191/* 191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space. 192 * Check for virtual address in kernel space.
205 */ 193 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) 194int arch_check_bp_in_kernelspace(struct perf_event *bp)
207{ 195{
208 unsigned int len; 196 unsigned int len;
197 unsigned long va;
198 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
209 199
210 len = get_hbp_len(hbp_len); 200 va = info->address;
201 len = get_hbp_len(info->len);
211 202
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 203 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 204}
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
300/* 291/*
301 * Validate the arch-specific HW Breakpoint register settings 292 * Validate the arch-specific HW Breakpoint register settings
302 */ 293 */
303int arch_validate_hwbkpt_settings(struct perf_event *bp, 294int arch_validate_hwbkpt_settings(struct perf_event *bp)
304 struct task_struct *tsk)
305{ 295{
306 struct arch_hw_breakpoint *info = counter_arch_bp(bp); 296 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
307 unsigned int align; 297 unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
314 304
315 ret = -EINVAL; 305 ret = -EINVAL;
316 306
317 if (info->type == X86_BREAKPOINT_EXECUTE)
318 /*
319 * Ptrace-refactoring code
320 * For now, we'll allow instruction breakpoint only for user-space
321 * addresses
322 */
323 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
324 info->len != X86_BREAKPOINT_EXECUTE)
325 return ret;
326
327 switch (info->len) { 307 switch (info->len) {
328 case X86_BREAKPOINT_LEN_1: 308 case X86_BREAKPOINT_LEN_1:
329 align = 0; 309 align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
350 if (info->address & align) 330 if (info->address & align)
351 return -EINVAL; 331 return -EINVAL;
352 332
353 /* Check that the virtual address is in the proper range */
354 if (tsk) {
355 if (!arch_check_va_in_userspace(info->address, info->len))
356 return -EFAULT;
357 } else {
358 if (!arch_check_va_in_kernelspace(info->address, info->len))
359 return -EFAULT;
360 }
361
362 return 0; 333 return 0;
363} 334}
364 335
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c28548..86cef6b3225 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,65 +102,62 @@ void __cpuinit fpu_init(void)
102 102
103 mxcsr_feature_mask_init(); 103 mxcsr_feature_mask_init();
104 /* clean state in init */ 104 /* clean state in init */
105 if (cpu_has_xsave) 105 current_thread_info()->status = 0;
106 current_thread_info()->status = TS_XSAVE;
107 else
108 current_thread_info()->status = 0;
109 clear_used_math(); 106 clear_used_math();
110} 107}
111#endif /* CONFIG_X86_64 */ 108#endif /* CONFIG_X86_64 */
112 109
113/* 110static void fpu_finit(struct fpu *fpu)
114 * The _current_ task is using the FPU for the first time
115 * so initialize it and set the mxcsr to its default
116 * value at reset if we support XMM instructions and then
117 * remeber the current task has used the FPU.
118 */
119int init_fpu(struct task_struct *tsk)
120{ 111{
121 if (tsk_used_math(tsk)) {
122 if (HAVE_HWFP && tsk == current)
123 unlazy_fpu(tsk);
124 return 0;
125 }
126
127 /*
128 * Memory allocation at the first usage of the FPU and other state.
129 */
130 if (!tsk->thread.xstate) {
131 tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
132 GFP_KERNEL);
133 if (!tsk->thread.xstate)
134 return -ENOMEM;
135 }
136
137#ifdef CONFIG_X86_32 112#ifdef CONFIG_X86_32
138 if (!HAVE_HWFP) { 113 if (!HAVE_HWFP) {
139 memset(tsk->thread.xstate, 0, xstate_size); 114 finit_soft_fpu(&fpu->state->soft);
140 finit_task(tsk); 115 return;
141 set_stopped_child_used_math(tsk);
142 return 0;
143 } 116 }
144#endif 117#endif
145 118
146 if (cpu_has_fxsr) { 119 if (cpu_has_fxsr) {
147 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 120 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
148 121
149 memset(fx, 0, xstate_size); 122 memset(fx, 0, xstate_size);
150 fx->cwd = 0x37f; 123 fx->cwd = 0x37f;
151 if (cpu_has_xmm) 124 if (cpu_has_xmm)
152 fx->mxcsr = MXCSR_DEFAULT; 125 fx->mxcsr = MXCSR_DEFAULT;
153 } else { 126 } else {
154 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 127 struct i387_fsave_struct *fp = &fpu->state->fsave;
155 memset(fp, 0, xstate_size); 128 memset(fp, 0, xstate_size);
156 fp->cwd = 0xffff037fu; 129 fp->cwd = 0xffff037fu;
157 fp->swd = 0xffff0000u; 130 fp->swd = 0xffff0000u;
158 fp->twd = 0xffffffffu; 131 fp->twd = 0xffffffffu;
159 fp->fos = 0xffff0000u; 132 fp->fos = 0xffff0000u;
160 } 133 }
134}
135
136/*
137 * The _current_ task is using the FPU for the first time
138 * so initialize it and set the mxcsr to its default
139 * value at reset if we support XMM instructions and then
140 * remeber the current task has used the FPU.
141 */
142int init_fpu(struct task_struct *tsk)
143{
144 int ret;
145
146 if (tsk_used_math(tsk)) {
147 if (HAVE_HWFP && tsk == current)
148 unlazy_fpu(tsk);
149 return 0;
150 }
151
161 /* 152 /*
162 * Only the device not available exception or ptrace can call init_fpu. 153 * Memory allocation at the first usage of the FPU and other state.
163 */ 154 */
155 ret = fpu_alloc(&tsk->thread.fpu);
156 if (ret)
157 return ret;
158
159 fpu_finit(&tsk->thread.fpu);
160
164 set_stopped_child_used_math(tsk); 161 set_stopped_child_used_math(tsk);
165 return 0; 162 return 0;
166} 163}
@@ -194,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
194 return ret; 191 return ret;
195 192
196 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 193 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
197 &target->thread.xstate->fxsave, 0, -1); 194 &target->thread.fpu.state->fxsave, 0, -1);
198} 195}
199 196
200int xfpregs_set(struct task_struct *target, const struct user_regset *regset, 197int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -211,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
211 return ret; 208 return ret;
212 209
213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 210 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
214 &target->thread.xstate->fxsave, 0, -1); 211 &target->thread.fpu.state->fxsave, 0, -1);
215 212
216 /* 213 /*
217 * mxcsr reserved bits must be masked to zero for security reasons. 214 * mxcsr reserved bits must be masked to zero for security reasons.
218 */ 215 */
219 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 216 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
220 217
221 /* 218 /*
222 * update the header bits in the xsave header, indicating the 219 * update the header bits in the xsave header, indicating the
223 * presence of FP and SSE state. 220 * presence of FP and SSE state.
224 */ 221 */
225 if (cpu_has_xsave) 222 if (cpu_has_xsave)
226 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 223 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
227 224
228 return ret; 225 return ret;
229} 226}
@@ -246,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
246 * memory layout in the thread struct, so that we can copy the entire 243 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout(). 244 * xstateregs to the user using one user_regset_copyout().
248 */ 245 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved, 246 memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); 247 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251 248
252 /* 249 /*
253 * Copy the xstate memory layout. 250 * Copy the xstate memory layout.
254 */ 251 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, 252 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1); 253 &target->thread.fpu.state->xsave, 0, -1);
257 return ret; 254 return ret;
258} 255}
259 256
@@ -272,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
272 return ret; 269 return ret;
273 270
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 271 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1); 272 &target->thread.fpu.state->xsave, 0, -1);
276 273
277 /* 274 /*
278 * mxcsr reserved bits must be masked to zero for security reasons. 275 * mxcsr reserved bits must be masked to zero for security reasons.
279 */ 276 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 277 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
281 278
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; 279 xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
283 280
284 xsave_hdr->xstate_bv &= pcntxt_mask; 281 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /* 282 /*
@@ -365,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
365static void 362static void
366convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) 363convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
367{ 364{
368 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 365 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
369 struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; 366 struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
370 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; 367 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
371 int i; 368 int i;
@@ -405,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
405 const struct user_i387_ia32_struct *env) 402 const struct user_i387_ia32_struct *env)
406 403
407{ 404{
408 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 405 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
409 struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; 406 struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
410 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; 407 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
411 int i; 408 int i;
@@ -445,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
445 442
446 if (!cpu_has_fxsr) { 443 if (!cpu_has_fxsr) {
447 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 444 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
448 &target->thread.xstate->fsave, 0, 445 &target->thread.fpu.state->fsave, 0,
449 -1); 446 -1);
450 } 447 }
451 448
@@ -475,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
475 472
476 if (!cpu_has_fxsr) { 473 if (!cpu_has_fxsr) {
477 return user_regset_copyin(&pos, &count, &kbuf, &ubuf, 474 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
478 &target->thread.xstate->fsave, 0, -1); 475 &target->thread.fpu.state->fsave, 0, -1);
479 } 476 }
480 477
481 if (pos > 0 || count < sizeof(env)) 478 if (pos > 0 || count < sizeof(env))
@@ -490,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
490 * presence of FP. 487 * presence of FP.
491 */ 488 */
492 if (cpu_has_xsave) 489 if (cpu_has_xsave)
493 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; 490 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
494 return ret; 491 return ret;
495} 492}
496 493
@@ -501,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
501static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) 498static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
502{ 499{
503 struct task_struct *tsk = current; 500 struct task_struct *tsk = current;
504 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 501 struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
505 502
506 fp->status = fp->swd; 503 fp->status = fp->swd;
507 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 504 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -512,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
512static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) 509static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
513{ 510{
514 struct task_struct *tsk = current; 511 struct task_struct *tsk = current;
515 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 512 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
516 struct user_i387_ia32_struct env; 513 struct user_i387_ia32_struct env;
517 int err = 0; 514 int err = 0;
518 515
@@ -547,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
547 * header as well as change any contents in the memory layout. 544 * header as well as change any contents in the memory layout.
548 * xrestore as part of sigreturn will capture all the changes. 545 * xrestore as part of sigreturn will capture all the changes.
549 */ 546 */
550 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 547 tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
551 548
552 if (save_i387_fxsave(fx) < 0) 549 if (save_i387_fxsave(fx) < 0)
553 return -1; 550 return -1;
@@ -599,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
599{ 596{
600 struct task_struct *tsk = current; 597 struct task_struct *tsk = current;
601 598
602 return __copy_from_user(&tsk->thread.xstate->fsave, buf, 599 return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
603 sizeof(struct i387_fsave_struct)); 600 sizeof(struct i387_fsave_struct));
604} 601}
605 602
@@ -610,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
610 struct user_i387_ia32_struct env; 607 struct user_i387_ia32_struct env;
611 int err; 608 int err;
612 609
613 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 610 err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
614 size); 611 size);
615 /* mxcsr reserved bits must be masked to zero for security reasons */ 612 /* mxcsr reserved bits must be masked to zero for security reasons */
616 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 613 tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
617 if (err || __copy_from_user(&env, buf, sizeof(env))) 614 if (err || __copy_from_user(&env, buf, sizeof(env)))
618 return 1; 615 return 1;
619 convert_to_fxsr(tsk, &env); 616 convert_to_fxsr(tsk, &env);
@@ -629,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
629 struct i387_fxsave_struct __user *fx = 626 struct i387_fxsave_struct __user *fx =
630 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; 627 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
631 struct xsave_hdr_struct *xsave_hdr = 628 struct xsave_hdr_struct *xsave_hdr =
632 &current->thread.xstate->xsave.xsave_hdr; 629 &current->thread.fpu.state->xsave.xsave_hdr;
633 u64 mask; 630 u64 mask;
634 int err; 631 int err;
635 632
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5..2dfd3159744 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17#include <asm/smp.h> 17#include <asm/smp.h>
18 18
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_RAW_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22/* 22/*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
33static void init_pit_timer(enum clock_event_mode mode, 33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt) 34 struct clock_event_device *evt)
35{ 35{
36 spin_lock(&i8253_lock); 36 raw_spin_lock(&i8253_lock);
37 37
38 switch (mode) { 38 switch (mode) {
39 case CLOCK_EVT_MODE_PERIODIC: 39 case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
62 /* Nothing to do here */ 62 /* Nothing to do here */
63 break; 63 break;
64 } 64 }
65 spin_unlock(&i8253_lock); 65 raw_spin_unlock(&i8253_lock);
66} 66}
67 67
68/* 68/*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
72 */ 72 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 73static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{ 74{
75 spin_lock(&i8253_lock); 75 raw_spin_lock(&i8253_lock);
76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */ 76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */ 77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 spin_unlock(&i8253_lock); 78 raw_spin_unlock(&i8253_lock);
79 79
80 return 0; 80 return 0;
81} 81}
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
130 int count; 130 int count;
131 u32 jifs; 131 u32 jifs;
132 132
133 spin_lock_irqsave(&i8253_lock, flags); 133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /* 134 /*
135 * Although our caller may have the read side of xtime_lock, 135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine 136 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
176 old_count = count; 176 old_count = count;
177 old_jifs = jifs; 177 old_jifs = jifs;
178 178
179 spin_unlock_irqrestore(&i8253_lock, flags); 179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180 180
181 count = (LATCH - 1) - count; 181 count = (LATCH - 1) - count;
182 182
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd4..990ae7cfc57 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
60 outb(0, 0xF0); 60 outb(0, 0xF0);
61 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 61 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
62 return IRQ_NONE; 62 return IRQ_NONE;
63 math_error((void __user *)get_irq_regs()->ip); 63 math_error(get_irq_regs(), 0, 16);
64 return IRQ_HANDLED; 64 return IRQ_HANDLED;
65} 65}
66 66
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b2258ca9100..4f4af75b948 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -47,20 +47,8 @@
47#include <asm/debugreg.h> 47#include <asm/debugreg.h>
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50
51#include <asm/apic.h> 50#include <asm/apic.h>
52 51
53/*
54 * Put the error code here just in case the user cares:
55 */
56static int gdb_x86errcode;
57
58/*
59 * Likewise, the vector number here (since GDB only gets the signal
60 * number through the usual means, and that's not very specific):
61 */
62static int gdb_x86vector = -1;
63
64/** 52/**
65 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs 53 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
66 * @gdb_regs: A pointer to hold the registers in the order GDB wants. 54 * @gdb_regs: A pointer to hold the registers in the order GDB wants.
@@ -211,6 +199,8 @@ static struct hw_breakpoint {
211 struct perf_event **pev; 199 struct perf_event **pev;
212} breakinfo[4]; 200} breakinfo[4];
213 201
202static unsigned long early_dr7;
203
214static void kgdb_correct_hw_break(void) 204static void kgdb_correct_hw_break(void)
215{ 205{
216 int breakno; 206 int breakno;
@@ -222,6 +212,14 @@ static void kgdb_correct_hw_break(void)
222 int cpu = raw_smp_processor_id(); 212 int cpu = raw_smp_processor_id();
223 if (!breakinfo[breakno].enabled) 213 if (!breakinfo[breakno].enabled)
224 continue; 214 continue;
215 if (dbg_is_early) {
216 set_debugreg(breakinfo[breakno].addr, breakno);
217 early_dr7 |= encode_dr7(breakno,
218 breakinfo[breakno].len,
219 breakinfo[breakno].type);
220 set_debugreg(early_dr7, 7);
221 continue;
222 }
225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); 223 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
226 info = counter_arch_bp(bp); 224 info = counter_arch_bp(bp);
227 if (bp->attr.disabled != 1) 225 if (bp->attr.disabled != 1)
@@ -236,7 +234,8 @@ static void kgdb_correct_hw_break(void)
236 if (!val) 234 if (!val)
237 bp->attr.disabled = 0; 235 bp->attr.disabled = 0;
238 } 236 }
239 hw_breakpoint_restore(); 237 if (!dbg_is_early)
238 hw_breakpoint_restore();
240} 239}
241 240
242static int hw_break_reserve_slot(int breakno) 241static int hw_break_reserve_slot(int breakno)
@@ -245,6 +244,9 @@ static int hw_break_reserve_slot(int breakno)
245 int cnt = 0; 244 int cnt = 0;
246 struct perf_event **pevent; 245 struct perf_event **pevent;
247 246
247 if (dbg_is_early)
248 return 0;
249
248 for_each_online_cpu(cpu) { 250 for_each_online_cpu(cpu) {
249 cnt++; 251 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 252 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
@@ -270,6 +272,9 @@ static int hw_break_release_slot(int breakno)
270 struct perf_event **pevent; 272 struct perf_event **pevent;
271 int cpu; 273 int cpu;
272 274
275 if (dbg_is_early)
276 return 0;
277
273 for_each_online_cpu(cpu) { 278 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 279 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent)) 280 if (dbg_release_bp_slot(*pevent))
@@ -314,7 +319,11 @@ static void kgdb_remove_all_hw_break(void)
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 319 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1) 320 if (bp->attr.disabled == 1)
316 continue; 321 continue;
317 arch_uninstall_hw_breakpoint(bp); 322 if (dbg_is_early)
323 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
324 breakinfo[i].type);
325 else
326 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1; 327 bp->attr.disabled = 1;
319 } 328 }
320} 329}
@@ -391,6 +400,11 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
391 for (i = 0; i < 4; i++) { 400 for (i = 0; i < 4; i++) {
392 if (!breakinfo[i].enabled) 401 if (!breakinfo[i].enabled)
393 continue; 402 continue;
403 if (dbg_is_early) {
404 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
405 breakinfo[i].type);
406 continue;
407 }
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 408 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1) 409 if (bp->attr.disabled == 1)
396 continue; 410 continue;
@@ -399,23 +413,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
399 } 413 }
400} 414}
401 415
402/**
403 * kgdb_post_primary_code - Save error vector/code numbers.
404 * @regs: Original pt_regs.
405 * @e_vector: Original error vector.
406 * @err_code: Original error code.
407 *
408 * This is needed on architectures which support SMP and KGDB.
409 * This function is called after all the slave cpus have been put
410 * to a know spin state and the primary CPU has control over KGDB.
411 */
412void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
413{
414 /* primary processor is completely in the debugger */
415 gdb_x86vector = e_vector;
416 gdb_x86errcode = err_code;
417}
418
419#ifdef CONFIG_SMP 416#ifdef CONFIG_SMP
420/** 417/**
421 * kgdb_roundup_cpus - Get other CPUs into a holding pattern 418 * kgdb_roundup_cpus - Get other CPUs into a holding pattern
@@ -567,7 +564,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
567 return NOTIFY_DONE; 564 return NOTIFY_DONE;
568 } 565 }
569 566
570 if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs)) 567 if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
571 return NOTIFY_DONE; 568 return NOTIFY_DONE;
572 569
573 /* Must touch watchdog before return to normal operation */ 570 /* Must touch watchdog before return to normal operation */
@@ -575,6 +572,26 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
575 return NOTIFY_STOP; 572 return NOTIFY_STOP;
576} 573}
577 574
575#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
576int kgdb_ll_trap(int cmd, const char *str,
577 struct pt_regs *regs, long err, int trap, int sig)
578{
579 struct die_args args = {
580 .regs = regs,
581 .str = str,
582 .err = err,
583 .trapnr = trap,
584 .signr = sig,
585
586 };
587
588 if (!kgdb_io_module_registered)
589 return NOTIFY_DONE;
590
591 return __kgdb_notify(&args, cmd);
592}
593#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
594
578static int 595static int
579kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) 596kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
580{ 597{
@@ -605,14 +622,15 @@ static struct notifier_block kgdb_notifier = {
605 */ 622 */
606int kgdb_arch_init(void) 623int kgdb_arch_init(void)
607{ 624{
625 return register_die_notifier(&kgdb_notifier);
626}
627
628void kgdb_arch_late(void)
629{
608 int i, cpu; 630 int i, cpu;
609 int ret;
610 struct perf_event_attr attr; 631 struct perf_event_attr attr;
611 struct perf_event **pevent; 632 struct perf_event **pevent;
612 633
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /* 634 /*
617 * Pre-allocate the hw breakpoint structions in the non-atomic 635 * Pre-allocate the hw breakpoint structions in the non-atomic
618 * portion of kgdb because this operation requires mutexs to 636 * portion of kgdb because this operation requires mutexs to
@@ -624,12 +642,15 @@ int kgdb_arch_init(void)
624 attr.bp_type = HW_BREAKPOINT_W; 642 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1; 643 attr.disabled = 1;
626 for (i = 0; i < 4; i++) { 644 for (i = 0; i < 4; i++) {
645 if (breakinfo[i].pev)
646 continue;
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 647 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) { 648 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); 649 printk(KERN_ERR "kgdb: Could not allocate hw"
650 "breakpoints\nDisabling the kernel debugger\n");
630 breakinfo[i].pev = NULL; 651 breakinfo[i].pev = NULL;
631 kgdb_arch_exit(); 652 kgdb_arch_exit();
632 return -1; 653 return;
633 } 654 }
634 for_each_online_cpu(cpu) { 655 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu); 656 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -640,7 +661,6 @@ int kgdb_arch_init(void)
640 } 661 }
641 } 662 }
642 } 663 }
643 return ret;
644} 664}
645 665
646/** 666/**
@@ -690,6 +710,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
690 return instruction_pointer(regs); 710 return instruction_pointer(regs);
691} 711}
692 712
713void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
714{
715 regs->ip = ip;
716}
717
693struct kgdb_arch arch_kgdb_ops = { 718struct kgdb_arch arch_kgdb_ops = {
694 /* Breakpoint instruction: */ 719 /* Breakpoint instruction: */
695 .gdb_bpt_instr = { 0xcc }, 720 .gdb_bpt_instr = { 0xcc },
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c..345a4b1fe14 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
422 422
423static void __kprobes clear_btf(void) 423static void __kprobes clear_btf(void)
424{ 424{
425 if (test_thread_flag(TIF_DEBUGCTLMSR)) 425 if (test_thread_flag(TIF_BLOCKSTEP)) {
426 update_debugctlmsr(0); 426 unsigned long debugctl = get_debugctlmsr();
427
428 debugctl &= ~DEBUGCTLMSR_BTF;
429 update_debugctlmsr(debugctl);
430 }
427} 431}
428 432
429static void __kprobes restore_btf(void) 433static void __kprobes restore_btf(void)
430{ 434{
431 if (test_thread_flag(TIF_DEBUGCTLMSR)) 435 if (test_thread_flag(TIF_BLOCKSTEP)) {
432 update_debugctlmsr(current->thread.debugctlmsr); 436 unsigned long debugctl = get_debugctlmsr();
437
438 debugctl |= DEBUGCTLMSR_BTF;
439 update_debugctlmsr(debugctl);
440 }
433} 441}
434 442
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 443void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
@@ -534,20 +542,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
534 struct kprobe_ctlblk *kcb; 542 struct kprobe_ctlblk *kcb;
535 543
536 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 544 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
537 if (*addr != BREAKPOINT_INSTRUCTION) {
538 /*
539 * The breakpoint instruction was removed right
540 * after we hit it. Another cpu has removed
541 * either a probepoint or a debugger breakpoint
542 * at this address. In either case, no further
543 * handling of this interrupt is appropriate.
544 * Back up over the (now missing) int3 and run
545 * the original instruction.
546 */
547 regs->ip = (unsigned long)addr;
548 return 1;
549 }
550
551 /* 545 /*
552 * We don't want to be preempted for the entire 546 * We don't want to be preempted for the entire
553 * duration of kprobe processing. We conditionally 547 * duration of kprobe processing. We conditionally
@@ -579,6 +573,19 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
579 setup_singlestep(p, regs, kcb, 0); 573 setup_singlestep(p, regs, kcb, 0);
580 return 1; 574 return 1;
581 } 575 }
576 } else if (*addr != BREAKPOINT_INSTRUCTION) {
577 /*
578 * The breakpoint instruction was removed right
579 * after we hit it. Another cpu has removed
580 * either a probepoint or a debugger breakpoint
581 * at this address. In either case, no further
582 * handling of this interrupt is appropriate.
583 * Back up over the (now missing) int3 and run
584 * the original instruction.
585 */
586 regs->ip = (unsigned long)addr;
587 preempt_enable_no_resched();
588 return 1;
582 } else if (kprobe_running()) { 589 } else if (kprobe_running()) {
583 p = __get_cpu_var(current_kprobe); 590 p = __get_cpu_var(current_kprobe);
584 if (p->break_handler && p->break_handler(p, regs)) { 591 if (p->break_handler && p->break_handler(p, regs)) {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4..eb9b76c716c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
29#define KVM_SCALE 22 29#define KVM_SCALE 22
30 30
31static int kvmclock = 1; 31static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
32 34
33static int parse_no_kvmclock(char *arg) 35static int parse_no_kvmclock(char *arg)
34{ 36{
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
54 56
55 low = (int)__pa_symbol(&wall_clock); 57 low = (int)__pa_symbol(&wall_clock);
56 high = ((u64)__pa_symbol(&wall_clock) >> 32); 58 high = ((u64)__pa_symbol(&wall_clock) >> 32);
57 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 59
60 native_write_msr(msr_kvm_wall_clock, low, high);
58 61
59 vcpu_time = &get_cpu_var(hv_clock); 62 vcpu_time = &get_cpu_var(hv_clock);
60 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 63 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
130 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
131 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
132 cpu, high, low, txt); 135 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 136
137 return native_write_msr_safe(msr_kvm_system_time, low, high);
134} 138}
135 139
136#ifdef CONFIG_X86_LOCAL_APIC 140#ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
165#ifdef CONFIG_KEXEC 169#ifdef CONFIG_KEXEC
166static void kvm_crash_shutdown(struct pt_regs *regs) 170static void kvm_crash_shutdown(struct pt_regs *regs)
167{ 171{
168 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 172 native_write_msr(msr_kvm_system_time, 0, 0);
169 native_machine_crash_shutdown(regs); 173 native_machine_crash_shutdown(regs);
170} 174}
171#endif 175#endif
172 176
173static void kvm_shutdown(void) 177static void kvm_shutdown(void)
174{ 178{
175 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 179 native_write_msr(msr_kvm_system_time, 0, 0);
176 native_machine_shutdown(); 180 native_machine_shutdown();
177} 181}
178 182
@@ -181,27 +185,37 @@ void __init kvmclock_init(void)
181 if (!kvm_para_available()) 185 if (!kvm_para_available())
182 return; 186 return;
183 187
184 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 188 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
185 if (kvm_register_clock("boot clock")) 189 msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
186 return; 190 msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
187 pv_time_ops.sched_clock = kvm_clock_read; 191 } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
188 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 192 return;
189 x86_platform.get_wallclock = kvm_get_wallclock; 193
190 x86_platform.set_wallclock = kvm_set_wallclock; 194 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
195 msr_kvm_system_time, msr_kvm_wall_clock);
196
197 if (kvm_register_clock("boot clock"))
198 return;
199 pv_time_ops.sched_clock = kvm_clock_read;
200 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
201 x86_platform.get_wallclock = kvm_get_wallclock;
202 x86_platform.set_wallclock = kvm_set_wallclock;
191#ifdef CONFIG_X86_LOCAL_APIC 203#ifdef CONFIG_X86_LOCAL_APIC
192 x86_cpuinit.setup_percpu_clockev = 204 x86_cpuinit.setup_percpu_clockev =
193 kvm_setup_secondary_clock; 205 kvm_setup_secondary_clock;
194#endif 206#endif
195#ifdef CONFIG_SMP 207#ifdef CONFIG_SMP
196 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 208 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
197#endif 209#endif
198 machine_ops.shutdown = kvm_shutdown; 210 machine_ops.shutdown = kvm_shutdown;
199#ifdef CONFIG_KEXEC 211#ifdef CONFIG_KEXEC
200 machine_ops.crash_shutdown = kvm_crash_shutdown; 212 machine_ops.crash_shutdown = kvm_crash_shutdown;
201#endif 213#endif
202 kvm_get_preset_lpj(); 214 kvm_get_preset_lpj();
203 clocksource_register(&kvm_clock); 215 clocksource_register(&kvm_clock);
204 pv_info.paravirt_enabled = 1; 216 pv_info.paravirt_enabled = 1;
205 pv_info.name = "KVM"; 217 pv_info.name = "KVM";
206 } 218
219 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
220 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
207} 221}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c..fa6551d36c1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 return error; 201 return error;
202} 202}
203 203
204static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *inode, struct file *file)
205{ 205{
206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
207} 207}
208 208
209static ssize_t microcode_write(struct file *file, const char __user *buf, 209static ssize_t microcode_write(struct file *file, const char __user *buf,
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void)
260} 260}
261 261
262MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); 262MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
263MODULE_ALIAS("devname:cpu/microcode");
263#else 264#else
264#define microcode_dev_init() 0 265#define microcode_dev_init() 0
265#define microcode_dev_exit() do { } while (0) 266#define microcode_dev_exit() do { } while (0)
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e2893..356170262a9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
343 int (*get_ucode_data)(void *, const void *, size_t)) 343 int (*get_ucode_data)(void *, const void *, size_t))
344{ 344{
345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
346 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 346 u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
347 int new_rev = uci->cpu_sig.rev; 347 int new_rev = uci->cpu_sig.rev;
348 unsigned int leftover = size; 348 unsigned int leftover = size;
349 enum ucode_state state = UCODE_OK; 349 enum ucode_state state = UCODE_OK;
350 unsigned int curr_mc_size = 0;
350 351
351 while (leftover) { 352 while (leftover) {
352 struct microcode_header_intel mc_header; 353 struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
361 break; 362 break;
362 } 363 }
363 364
364 mc = vmalloc(mc_size); 365 /* For performance reasons, reuse mc area when possible */
365 if (!mc) 366 if (!mc || mc_size > curr_mc_size) {
366 break; 367 if (mc)
368 vfree(mc);
369 mc = vmalloc(mc_size);
370 if (!mc)
371 break;
372 curr_mc_size = mc_size;
373 }
367 374
368 if (get_ucode_data(mc, ucode_ptr, mc_size) || 375 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
369 microcode_sanity_check(mc) < 0) { 376 microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
376 vfree(new_mc); 383 vfree(new_mc);
377 new_rev = mc_header.rev; 384 new_rev = mc_header.rev;
378 new_mc = mc; 385 new_mc = mc;
379 } else 386 mc = NULL; /* trigger new vmalloc */
380 vfree(mc); 387 }
381 388
382 ucode_ptr += mc_size; 389 ucode_ptr += mc_size;
383 leftover -= mc_size; 390 leftover -= mc_size;
384 } 391 }
385 392
393 if (mc)
394 vfree(mc);
395
386 if (leftover) { 396 if (leftover) {
387 if (new_mc) 397 if (new_mc)
388 vfree(new_mc); 398 vfree(new_mc);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8..5ae5d2426ed 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
116} 116}
117 117
118static int bad_ioapic(unsigned long address)
119{
120 if (nr_ioapics >= MAX_IO_APICS) {
121 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
122 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
123 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
124 }
125 if (!address) {
126 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
127 " found in table, skipping!\n");
128 return 1;
129 }
130 return 0;
131}
132
133static void __init MP_ioapic_info(struct mpc_ioapic *m) 118static void __init MP_ioapic_info(struct mpc_ioapic *m)
134{ 119{
135 if (!(m->flags & MPC_APIC_USABLE)) 120 if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
138 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", 123 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
139 m->apicid, m->apicver, m->apicaddr); 124 m->apicid, m->apicver, m->apicaddr);
140 125
141 if (bad_ioapic(m->apicaddr)) 126 mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
142 return;
143
144 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
145 mp_ioapics[nr_ioapics].apicid = m->apicid;
146 mp_ioapics[nr_ioapics].type = m->type;
147 mp_ioapics[nr_ioapics].apicver = m->apicver;
148 mp_ioapics[nr_ioapics].flags = m->flags;
149 nr_ioapics++;
150} 127}
151 128
152static void print_MP_intsrc_info(struct mpc_intsrc *m) 129static void print_MP_intsrc_info(struct mpc_intsrc *m)
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858..e796448f0eb 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -237,4 +237,9 @@ void __init x86_mrst_early_setup(void)
237 x86_init.pci.fixup_irqs = x86_init_noop; 237 x86_init.pci.fixup_irqs = x86_init_noop;
238 238
239 legacy_pic = &null_legacy_pic; 239 legacy_pic = &null_legacy_pic;
240
241 /* Avoid searching for BIOS MP tables */
242 x86_init.mpparse.find_smp_config = x86_init_noop;
243 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
244
240} 245}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4d4468e9f47..7bf2dc4c8f7 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
230 msr_device_destroy(cpu); 230 msr_device_destroy(cpu);
231 break; 231 break;
232 } 232 }
233 return err ? NOTIFY_BAD : NOTIFY_OK; 233 return notifier_from_errno(err);
234} 234}
235 235
236static struct notifier_block __refdata msr_class_cpu_notifier = { 236static struct notifier_block __refdata msr_class_cpu_notifier = {
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7d2829dde20..a5bc528d432 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = {
31 .free_coherent = swiotlb_free_coherent, 31 .free_coherent = swiotlb_free_coherent,
32 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 32 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
33 .sync_single_for_device = swiotlb_sync_single_for_device, 33 .sync_single_for_device = swiotlb_sync_single_for_device,
34 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
35 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
36 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 34 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
37 .sync_sg_for_device = swiotlb_sync_sg_for_device, 35 .sync_sg_for_device = swiotlb_sync_sg_for_device,
38 .map_sg = swiotlb_map_sg_attrs, 36 .map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b9..e7e35219b32 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/ds.h>
24#include <asm/debugreg.h> 23#include <asm/debugreg.h>
25 24
26unsigned long idle_halt; 25unsigned long idle_halt;
@@ -32,26 +31,22 @@ struct kmem_cache *task_xstate_cachep;
32 31
33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
34{ 33{
34 int ret;
35
35 *dst = *src; 36 *dst = *src;
36 if (src->thread.xstate) { 37 if (fpu_allocated(&src->thread.fpu)) {
37 dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, 38 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
38 GFP_KERNEL); 39 ret = fpu_alloc(&dst->thread.fpu);
39 if (!dst->thread.xstate) 40 if (ret)
40 return -ENOMEM; 41 return ret;
41 WARN_ON((unsigned long)dst->thread.xstate & 15); 42 fpu_copy(&dst->thread.fpu, &src->thread.fpu);
42 memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
43 } 43 }
44 return 0; 44 return 0;
45} 45}
46 46
47void free_thread_xstate(struct task_struct *tsk) 47void free_thread_xstate(struct task_struct *tsk)
48{ 48{
49 if (tsk->thread.xstate) { 49 fpu_free(&tsk->thread.fpu);
50 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
51 tsk->thread.xstate = NULL;
52 }
53
54 WARN(tsk->thread.ds_ctx, "leaking DS context\n");
55} 50}
56 51
57void free_thread_info(struct thread_info *ti) 52void free_thread_info(struct thread_info *ti)
@@ -198,11 +193,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
198 prev = &prev_p->thread; 193 prev = &prev_p->thread;
199 next = &next_p->thread; 194 next = &next_p->thread;
200 195
201 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || 196 if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
202 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) 197 test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
203 ds_switch_to(prev_p, next_p); 198 unsigned long debugctl = get_debugctlmsr();
204 else if (next->debugctlmsr != prev->debugctlmsr) 199
205 update_debugctlmsr(next->debugctlmsr); 200 debugctl &= ~DEBUGCTLMSR_BTF;
201 if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
202 debugctl |= DEBUGCTLMSR_BTF;
203
204 update_debugctlmsr(debugctl);
205 }
206 206
207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
208 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 208 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
546 * check OSVW bit for CPUs that are not affected 546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400 547 * by erratum #400
548 */ 548 */
549 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); 549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 if (val >= 2) { 550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 rdmsrl(MSR_AMD64_OSVW_STATUS, val); 551 if (val >= 2) {
552 if (!(val & BIT(1))) 552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 goto no_c1e_idle; 553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
554 } 556 }
555 return 1; 557 return 1;
556 } 558 }
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30..8d128783af4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
55#include <asm/cpu.h> 55#include <asm/cpu.h>
56#include <asm/idle.h> 56#include <asm/idle.h>
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/ds.h>
59#include <asm/debugreg.h> 58#include <asm/debugreg.h>
60 59
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
238 kfree(p->thread.io_bitmap_ptr); 237 kfree(p->thread.io_bitmap_ptr);
239 p->thread.io_bitmap_max = 0; 238 p->thread.io_bitmap_max = 0;
240 } 239 }
241
242 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
243 p->thread.ds_ctx = NULL;
244
245 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
246 p->thread.debugctlmsr = 0;
247
248 return err; 240 return err;
249} 241}
250 242
@@ -317,7 +309,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
317 309
318 /* we're going to use this soon, after a few expensive things */ 310 /* we're going to use this soon, after a few expensive things */
319 if (preload_fpu) 311 if (preload_fpu)
320 prefetch(next->xstate); 312 prefetch(next->fpu.state);
321 313
322 /* 314 /*
323 * Reload esp0. 315 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf..3c2422a99f1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
49#include <asm/ia32.h> 49#include <asm/ia32.h>
50#include <asm/idle.h> 50#include <asm/idle.h>
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/ds.h>
53#include <asm/debugreg.h> 52#include <asm/debugreg.h>
54 53
55asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
313 if (err) 312 if (err)
314 goto out; 313 goto out;
315 } 314 }
316
317 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
318 p->thread.ds_ctx = NULL;
319
320 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
321 p->thread.debugctlmsr = 0;
322
323 err = 0; 315 err = 0;
324out: 316out:
325 if (err && p->thread.io_bitmap_ptr) { 317 if (err && p->thread.io_bitmap_ptr) {
@@ -396,7 +388,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
396 388
397 /* we're going to use this soon, after a few expensive things */ 389 /* we're going to use this soon, after a few expensive things */
398 if (preload_fpu) 390 if (preload_fpu)
399 prefetch(next->xstate); 391 prefetch(next->fpu.state);
400 392
401 /* 393 /*
402 * Reload esp0, LDT and the page table pointer: 394 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7..70c4872cd8a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
2/* 2/*
3 * Pentium III FXSR, SSE support 3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000 4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */ 5 */
9 6
10#include <linux/kernel.h> 7#include <linux/kernel.h>
@@ -22,7 +19,6 @@
22#include <linux/audit.h> 19#include <linux/audit.h>
23#include <linux/seccomp.h> 20#include <linux/seccomp.h>
24#include <linux/signal.h> 21#include <linux/signal.h>
25#include <linux/workqueue.h>
26#include <linux/perf_event.h> 22#include <linux/perf_event.h>
27#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
28 24
@@ -36,7 +32,6 @@
36#include <asm/desc.h> 32#include <asm/desc.h>
37#include <asm/prctl.h> 33#include <asm/prctl.h>
38#include <asm/proto.h> 34#include <asm/proto.h>
39#include <asm/ds.h>
40#include <asm/hw_breakpoint.h> 35#include <asm/hw_breakpoint.h>
41 36
42#include "tls.h" 37#include "tls.h"
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
693 struct perf_event_attr attr; 688 struct perf_event_attr attr;
694 689
695 if (!t->ptrace_bps[nr]) { 690 if (!t->ptrace_bps[nr]) {
696 hw_breakpoint_init(&attr); 691 ptrace_breakpoint_init(&attr);
697 /* 692 /*
698 * Put stub len and type to register (reserve) an inactive but 693 * Put stub len and type to register (reserve) an inactive but
699 * correct bp 694 * correct bp
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
789 0, IO_BITMAP_BYTES); 784 0, IO_BITMAP_BYTES);
790} 785}
791 786
792#ifdef CONFIG_X86_PTRACE_BTS
793/*
794 * A branch trace store context.
795 *
796 * Contexts may only be installed by ptrace_bts_config() and only for
797 * ptraced tasks.
798 *
799 * Contexts are destroyed when the tracee is detached from the tracer.
800 * The actual destruction work requires interrupts enabled, so the
801 * work is deferred and will be scheduled during __ptrace_unlink().
802 *
803 * Contexts hold an additional task_struct reference on the traced
804 * task, as well as a reference on the tracer's mm.
805 *
806 * Ptrace already holds a task_struct for the duration of ptrace operations,
807 * but since destruction is deferred, it may be executed after both
808 * tracer and tracee exited.
809 */
810struct bts_context {
811 /* The branch trace handle. */
812 struct bts_tracer *tracer;
813
814 /* The buffer used to store the branch trace and its size. */
815 void *buffer;
816 unsigned int size;
817
818 /* The mm that paid for the above buffer. */
819 struct mm_struct *mm;
820
821 /* The task this context belongs to. */
822 struct task_struct *task;
823
824 /* The signal to send on a bts buffer overflow. */
825 unsigned int bts_ovfl_signal;
826
827 /* The work struct to destroy a context. */
828 struct work_struct work;
829};
830
831static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
832{
833 void *buffer = NULL;
834 int err = -ENOMEM;
835
836 err = account_locked_memory(current->mm, current->signal->rlim, size);
837 if (err < 0)
838 return err;
839
840 buffer = kzalloc(size, GFP_KERNEL);
841 if (!buffer)
842 goto out_refund;
843
844 context->buffer = buffer;
845 context->size = size;
846 context->mm = get_task_mm(current);
847
848 return 0;
849
850 out_refund:
851 refund_locked_memory(current->mm, size);
852 return err;
853}
854
855static inline void free_bts_buffer(struct bts_context *context)
856{
857 if (!context->buffer)
858 return;
859
860 kfree(context->buffer);
861 context->buffer = NULL;
862
863 refund_locked_memory(context->mm, context->size);
864 context->size = 0;
865
866 mmput(context->mm);
867 context->mm = NULL;
868}
869
870static void free_bts_context_work(struct work_struct *w)
871{
872 struct bts_context *context;
873
874 context = container_of(w, struct bts_context, work);
875
876 ds_release_bts(context->tracer);
877 put_task_struct(context->task);
878 free_bts_buffer(context);
879 kfree(context);
880}
881
882static inline void free_bts_context(struct bts_context *context)
883{
884 INIT_WORK(&context->work, free_bts_context_work);
885 schedule_work(&context->work);
886}
887
888static inline struct bts_context *alloc_bts_context(struct task_struct *task)
889{
890 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
891 if (context) {
892 context->task = task;
893 task->bts = context;
894
895 get_task_struct(task);
896 }
897
898 return context;
899}
900
901static int ptrace_bts_read_record(struct task_struct *child, size_t index,
902 struct bts_struct __user *out)
903{
904 struct bts_context *context;
905 const struct bts_trace *trace;
906 struct bts_struct bts;
907 const unsigned char *at;
908 int error;
909
910 context = child->bts;
911 if (!context)
912 return -ESRCH;
913
914 trace = ds_read_bts(context->tracer);
915 if (!trace)
916 return -ESRCH;
917
918 at = trace->ds.top - ((index + 1) * trace->ds.size);
919 if ((void *)at < trace->ds.begin)
920 at += (trace->ds.n * trace->ds.size);
921
922 if (!trace->read)
923 return -EOPNOTSUPP;
924
925 error = trace->read(context->tracer, at, &bts);
926 if (error < 0)
927 return error;
928
929 if (copy_to_user(out, &bts, sizeof(bts)))
930 return -EFAULT;
931
932 return sizeof(bts);
933}
934
935static int ptrace_bts_drain(struct task_struct *child,
936 long size,
937 struct bts_struct __user *out)
938{
939 struct bts_context *context;
940 const struct bts_trace *trace;
941 const unsigned char *at;
942 int error, drained = 0;
943
944 context = child->bts;
945 if (!context)
946 return -ESRCH;
947
948 trace = ds_read_bts(context->tracer);
949 if (!trace)
950 return -ESRCH;
951
952 if (!trace->read)
953 return -EOPNOTSUPP;
954
955 if (size < (trace->ds.top - trace->ds.begin))
956 return -EIO;
957
958 for (at = trace->ds.begin; (void *)at < trace->ds.top;
959 out++, drained++, at += trace->ds.size) {
960 struct bts_struct bts;
961
962 error = trace->read(context->tracer, at, &bts);
963 if (error < 0)
964 return error;
965
966 if (copy_to_user(out, &bts, sizeof(bts)))
967 return -EFAULT;
968 }
969
970 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
971
972 error = ds_reset_bts(context->tracer);
973 if (error < 0)
974 return error;
975
976 return drained;
977}
978
979static int ptrace_bts_config(struct task_struct *child,
980 long cfg_size,
981 const struct ptrace_bts_config __user *ucfg)
982{
983 struct bts_context *context;
984 struct ptrace_bts_config cfg;
985 unsigned int flags = 0;
986
987 if (cfg_size < sizeof(cfg))
988 return -EIO;
989
990 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
991 return -EFAULT;
992
993 context = child->bts;
994 if (!context)
995 context = alloc_bts_context(child);
996 if (!context)
997 return -ENOMEM;
998
999 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
1000 if (!cfg.signal)
1001 return -EINVAL;
1002
1003 return -EOPNOTSUPP;
1004 context->bts_ovfl_signal = cfg.signal;
1005 }
1006
1007 ds_release_bts(context->tracer);
1008 context->tracer = NULL;
1009
1010 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
1011 int err;
1012
1013 free_bts_buffer(context);
1014 if (!cfg.size)
1015 return 0;
1016
1017 err = alloc_bts_buffer(context, cfg.size);
1018 if (err < 0)
1019 return err;
1020 }
1021
1022 if (cfg.flags & PTRACE_BTS_O_TRACE)
1023 flags |= BTS_USER;
1024
1025 if (cfg.flags & PTRACE_BTS_O_SCHED)
1026 flags |= BTS_TIMESTAMPS;
1027
1028 context->tracer =
1029 ds_request_bts_task(child, context->buffer, context->size,
1030 NULL, (size_t)-1, flags);
1031 if (unlikely(IS_ERR(context->tracer))) {
1032 int error = PTR_ERR(context->tracer);
1033
1034 free_bts_buffer(context);
1035 context->tracer = NULL;
1036 return error;
1037 }
1038
1039 return sizeof(cfg);
1040}
1041
1042static int ptrace_bts_status(struct task_struct *child,
1043 long cfg_size,
1044 struct ptrace_bts_config __user *ucfg)
1045{
1046 struct bts_context *context;
1047 const struct bts_trace *trace;
1048 struct ptrace_bts_config cfg;
1049
1050 context = child->bts;
1051 if (!context)
1052 return -ESRCH;
1053
1054 if (cfg_size < sizeof(cfg))
1055 return -EIO;
1056
1057 trace = ds_read_bts(context->tracer);
1058 if (!trace)
1059 return -ESRCH;
1060
1061 memset(&cfg, 0, sizeof(cfg));
1062 cfg.size = trace->ds.end - trace->ds.begin;
1063 cfg.signal = context->bts_ovfl_signal;
1064 cfg.bts_size = sizeof(struct bts_struct);
1065
1066 if (cfg.signal)
1067 cfg.flags |= PTRACE_BTS_O_SIGNAL;
1068
1069 if (trace->ds.flags & BTS_USER)
1070 cfg.flags |= PTRACE_BTS_O_TRACE;
1071
1072 if (trace->ds.flags & BTS_TIMESTAMPS)
1073 cfg.flags |= PTRACE_BTS_O_SCHED;
1074
1075 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
1076 return -EFAULT;
1077
1078 return sizeof(cfg);
1079}
1080
1081static int ptrace_bts_clear(struct task_struct *child)
1082{
1083 struct bts_context *context;
1084 const struct bts_trace *trace;
1085
1086 context = child->bts;
1087 if (!context)
1088 return -ESRCH;
1089
1090 trace = ds_read_bts(context->tracer);
1091 if (!trace)
1092 return -ESRCH;
1093
1094 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
1095
1096 return ds_reset_bts(context->tracer);
1097}
1098
1099static int ptrace_bts_size(struct task_struct *child)
1100{
1101 struct bts_context *context;
1102 const struct bts_trace *trace;
1103
1104 context = child->bts;
1105 if (!context)
1106 return -ESRCH;
1107
1108 trace = ds_read_bts(context->tracer);
1109 if (!trace)
1110 return -ESRCH;
1111
1112 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
1113}
1114
1115/*
1116 * Called from __ptrace_unlink() after the child has been moved back
1117 * to its original parent.
1118 */
1119void ptrace_bts_untrace(struct task_struct *child)
1120{
1121 if (unlikely(child->bts)) {
1122 free_bts_context(child->bts);
1123 child->bts = NULL;
1124 }
1125}
1126#endif /* CONFIG_X86_PTRACE_BTS */
1127
1128/* 787/*
1129 * Called by kernel/ptrace.c when detaching.. 788 * Called by kernel/ptrace.c when detaching..
1130 * 789 *
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1252 break; 911 break;
1253#endif 912#endif
1254 913
1255 /*
1256 * These bits need more cooking - not enabled yet:
1257 */
1258#ifdef CONFIG_X86_PTRACE_BTS
1259 case PTRACE_BTS_CONFIG:
1260 ret = ptrace_bts_config
1261 (child, data, (struct ptrace_bts_config __user *)addr);
1262 break;
1263
1264 case PTRACE_BTS_STATUS:
1265 ret = ptrace_bts_status
1266 (child, data, (struct ptrace_bts_config __user *)addr);
1267 break;
1268
1269 case PTRACE_BTS_SIZE:
1270 ret = ptrace_bts_size(child);
1271 break;
1272
1273 case PTRACE_BTS_GET:
1274 ret = ptrace_bts_read_record
1275 (child, data, (struct bts_struct __user *) addr);
1276 break;
1277
1278 case PTRACE_BTS_CLEAR:
1279 ret = ptrace_bts_clear(child);
1280 break;
1281
1282 case PTRACE_BTS_DRAIN:
1283 ret = ptrace_bts_drain
1284 (child, data, (struct bts_struct __user *) addr);
1285 break;
1286#endif /* CONFIG_X86_PTRACE_BTS */
1287
1288 default: 914 default:
1289 ret = ptrace_request(child, request, addr, data); 915 ret = ptrace_request(child, request, addr, data);
1290 break; 916 break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1544 1170
1545 case PTRACE_GET_THREAD_AREA: 1171 case PTRACE_GET_THREAD_AREA:
1546 case PTRACE_SET_THREAD_AREA: 1172 case PTRACE_SET_THREAD_AREA:
1547#ifdef CONFIG_X86_PTRACE_BTS
1548 case PTRACE_BTS_CONFIG:
1549 case PTRACE_BTS_STATUS:
1550 case PTRACE_BTS_SIZE:
1551 case PTRACE_BTS_GET:
1552 case PTRACE_BTS_CLEAR:
1553 case PTRACE_BTS_DRAIN:
1554#endif /* CONFIG_X86_PTRACE_BTS */
1555 return arch_ptrace(child, request, addr, data); 1173 return arch_ptrace(child, request, addr, data);
1556 1174
1557 default: 1175 default:
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761..239427ca02a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
31 u32 tsc_to_nsec_mul; 31 u32 tsc_to_nsec_mul;
32 int tsc_shift; 32 int tsc_shift;
33 u32 version; 33 u32 version;
34 u8 flags;
34}; 35};
35 36
37static u8 valid_flags __read_mostly = 0;
38
39void pvclock_set_flags(u8 flags)
40{
41 valid_flags = flags;
42}
43
36/* 44/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, 45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result. 46 * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
91 dst->system_timestamp = src->system_time; 99 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul; 100 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift; 101 dst->tsc_shift = src->tsc_shift;
102 dst->flags = src->flags;
94 rmb(); /* test version after fetching data */ 103 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version)); 104 } while ((src->version & 1) || (dst->version != src->version));
96 105
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
109 return pv_tsc_khz; 118 return pv_tsc_khz;
110} 119}
111 120
121static atomic64_t last_value = ATOMIC64_INIT(0);
122
112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 123cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
113{ 124{
114 struct pvclock_shadow_time shadow; 125 struct pvclock_shadow_time shadow;
115 unsigned version; 126 unsigned version;
116 cycle_t ret, offset; 127 cycle_t ret, offset;
128 u64 last;
117 129
118 do { 130 do {
119 version = pvclock_get_time_values(&shadow, src); 131 version = pvclock_get_time_values(&shadow, src);
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
123 barrier(); 135 barrier();
124 } while (version != src->version); 136 } while (version != src->version);
125 137
138 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
139 (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
140 return ret;
141
142 /*
143 * Assumption here is that last_value, a global accumulator, always goes
144 * forward. If we are less than that, we should not be much smaller.
145 * We assume there is an error marging we're inside, and then the correction
146 * does not sacrifice accuracy.
147 *
148 * For reads: global may have changed between test and return,
149 * but this means someone else updated poked the clock at a later time.
150 * We just need to make sure we are not seeing a backwards event.
151 *
152 * For updates: last_value = ret is not enough, since two vcpus could be
153 * updating at the same time, and one of them could be slightly behind,
154 * making the assumption that last_value always go forward fail to hold.
155 */
156 last = atomic64_read(&last_value);
157 do {
158 if (ret < last)
159 return last;
160 last = atomic64_cmpxchg(&last_value, last, ret);
161 } while (unlikely(last != ret));
162
126 return ret; 163 return ret;
127} 164}
128 165
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 12e9feaa2f7..e72d3fc6547 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -495,10 +495,18 @@ void force_hpet_resume(void)
495/* 495/*
496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on 496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on
497 * floppy DMA. Disable HPET MSI on such platforms. 497 * floppy DMA. Disable HPET MSI on such platforms.
498 * See erratum #27 (Misinterpreted MSI Requests May Result in
499 * Corrupted LPC DMA Data) in AMD Publication #46837,
500 * "SB700 Family Product Errata", Rev. 1.0, March 2010.
501 *
502 * Also force the read back of the CMP register in hpet_next_event()
503 * to work around the problem that the CMP register write seems to be
504 * delayed. See hpet_next_event() for details.
498 */ 505 */
499static void force_disable_hpet_msi(struct pci_dev *unused) 506static void force_disable_hpet_msi(struct pci_dev *unused)
500{ 507{
501 hpet_msi_disable = 1; 508 hpet_msi_disable = 1;
509 hpet_readback_cmp = 1;
502} 510}
503 511
504DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, 512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4851eff57b..b4ae4acbd03 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -676,6 +676,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), 676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 }, 677 },
678 }, 678 },
679 /*
680 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
681 * match on the product name.
682 */
683 {
684 .callback = dmi_low_memory_corruption,
685 .ident = "Phoenix BIOS",
686 .matches = {
687 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
688 },
689 },
679#endif 690#endif
680 {} 691 {}
681}; 692};
@@ -725,6 +736,7 @@ void __init setup_arch(char **cmdline_p)
725 /* VMI may relocate the fixmap; do this before touching ioremap area */ 736 /* VMI may relocate the fixmap; do this before touching ioremap area */
726 vmi_init(); 737 vmi_init();
727 738
739 early_trap_init();
728 early_cpu_init(); 740 early_cpu_init();
729 early_ioremap_init(); 741 early_ioremap_init();
730 742
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef6370b00e7..a867940a6df 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -265,10 +265,10 @@ void __init setup_per_cpu_areas(void)
265 265
266#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 266#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
267 /* 267 /*
268 * make sure boot cpu node_number is right, when boot cpu is on the 268 * make sure boot cpu numa_node is right, when boot cpu is on the
269 * node that doesn't have mem installed 269 * node that doesn't have mem installed
270 */ 270 */
271 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); 271 set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
272#endif 272#endif
273 273
274 /* Setup node to cpumask map */ 274 /* Setup node to cpumask map */
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e09938265..7ded57896c0 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
81#endif /* CONFIG_X86_LOCAL_APIC */ 81#endif /* CONFIG_X86_LOCAL_APIC */
82 82
83#ifdef CONFIG_X86_IO_APIC 83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85 84
86static int __init sfi_parse_ioapic(struct sfi_table_header *table) 85static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{ 86{
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
94 pentry = (struct sfi_apic_table_entry *)sb->pentry; 93 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95 94
96 for (i = 0; i < num; i++) { 95 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base); 96 mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++; 97 pentry++;
100 } 98 }
101 99
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763d815e27a..37462f1ddba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1215,9 +1215,17 @@ __init void prefill_possible_map(void)
1215 if (!num_processors) 1215 if (!num_processors)
1216 num_processors = 1; 1216 num_processors = 1;
1217 1217
1218 if (setup_possible_cpus == -1) 1218 i = setup_max_cpus ?: 1;
1219 possible = num_processors + disabled_cpus; 1219 if (setup_possible_cpus == -1) {
1220 else 1220 possible = num_processors;
1221#ifdef CONFIG_HOTPLUG_CPU
1222 if (setup_max_cpus)
1223 possible += disabled_cpus;
1224#else
1225 if (possible > i)
1226 possible = i;
1227#endif
1228 } else
1221 possible = setup_possible_cpus; 1229 possible = setup_possible_cpus;
1222 1230
1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1231 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
@@ -1230,11 +1238,23 @@ __init void prefill_possible_map(void)
1230 possible = nr_cpu_ids; 1238 possible = nr_cpu_ids;
1231 } 1239 }
1232 1240
1241#ifdef CONFIG_HOTPLUG_CPU
1242 if (!setup_max_cpus)
1243#endif
1244 if (possible > i) {
1245 printk(KERN_WARNING
1246 "%d Processors exceeds max_cpus limit of %u\n",
1247 possible, setup_max_cpus);
1248 possible = i;
1249 }
1250
1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1251 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
1234 possible, max_t(int, possible - num_processors, 0)); 1252 possible, max_t(int, possible - num_processors, 0));
1235 1253
1236 for (i = 0; i < possible; i++) 1254 for (i = 0; i < possible; i++)
1237 set_cpu_possible(i, true); 1255 set_cpu_possible(i, true);
1256 for (; i < NR_CPUS; i++)
1257 set_cpu_possible(i, false);
1238 1258
1239 nr_cpu_ids = possible; 1259 nr_cpu_ids = possible;
1240} 1260}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff10..58de45ee08b 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
158} 158}
159 159
160/* 160/*
161 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
162 */
163static void write_debugctlmsr(struct task_struct *child, unsigned long val)
164{
165 if (child->thread.debugctlmsr == val)
166 return;
167
168 child->thread.debugctlmsr = val;
169
170 if (child != current)
171 return;
172
173 update_debugctlmsr(val);
174}
175
176/*
177 * Enable single or block step. 161 * Enable single or block step.
178 */ 162 */
179static void enable_step(struct task_struct *child, bool block) 163static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
186 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
187 */ 171 */
188 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
189 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 173 unsigned long debugctl = get_debugctlmsr();
190 write_debugctlmsr(child, 174
191 child->thread.debugctlmsr | DEBUGCTLMSR_BTF); 175 debugctl |= DEBUGCTLMSR_BTF;
192 } else { 176 update_debugctlmsr(debugctl);
193 write_debugctlmsr(child, 177 set_tsk_thread_flag(child, TIF_BLOCKSTEP);
194 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 178 } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
195 179 unsigned long debugctl = get_debugctlmsr();
196 if (!child->thread.debugctlmsr) 180
197 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 181 debugctl &= ~DEBUGCTLMSR_BTF;
182 update_debugctlmsr(debugctl);
183 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
198 } 184 }
199} 185}
200 186
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
213 /* 199 /*
214 * Make sure block stepping (BTF) is disabled. 200 * Make sure block stepping (BTF) is disabled.
215 */ 201 */
216 write_debugctlmsr(child, 202 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
217 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 203 unsigned long debugctl = get_debugctlmsr();
218 204
219 if (!child->thread.debugctlmsr) 205 debugctl &= ~DEBUGCTLMSR_BTF;
220 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 206 update_debugctlmsr(debugctl);
207 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
208 }
221 209
222 /* Always clear TIF_SINGLESTEP... */ 210 /* Always clear TIF_SINGLESTEP... */
223 clear_tsk_thread_flag(child, TIF_SINGLESTEP); 211 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48a..c2f1b26141e 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
46 46
47/* Global pointer to shared data; NULL means no measured launch. */ 47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly; 48struct tboot *tboot __read_mostly;
49EXPORT_SYMBOL(tboot);
49 50
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ 51/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1 52#define AP_WAIT_TIMEOUT 1
@@ -175,6 +176,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
175 struct tboot_mac_region *mr; 176 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size; 177 phys_addr_t end = start + size;
177 178
179 if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
180 panic("tboot: Too many MAC regions\n");
181
178 if (start && size) { 182 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++]; 183 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE); 184 mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +188,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
184 188
185static int tboot_setup_sleep(void) 189static int tboot_setup_sleep(void)
186{ 190{
191 int i;
192
187 tboot->num_mac_regions = 0; 193 tboot->num_mac_regions = 0;
188 194
189 /* S3 resume code */ 195 for (i = 0; i < e820.nr_map; i++) {
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); 196 if ((e820.map[i].type != E820_RAM)
197 && (e820.map[i].type != E820_RESERVED_KERN))
198 continue;
191 199
192#ifdef CONFIG_X86_TRAMPOLINE 200 add_mac_region(e820.map[i].addr, e820.map[i].size);
193 /* AP trampoline code */ 201 }
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199 202
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; 203 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201 204
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 17b03dd3a6b..7fea555929e 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
@@ -20,42 +20,67 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/tsc.h> 21#include <asm/tsc.h>
22#include <asm/irq_vectors.h> 22#include <asm/irq_vectors.h>
23#include <asm/timer.h>
23 24
24static struct bau_control **uv_bau_table_bases __read_mostly; 25struct msg_desc {
25static int uv_bau_retry_limit __read_mostly; 26 struct bau_payload_queue_entry *msg;
27 int msg_slot;
28 int sw_ack_slot;
29 struct bau_payload_queue_entry *va_queue_first;
30 struct bau_payload_queue_entry *va_queue_last;
31};
26 32
27/* base pnode in this partition */ 33#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
28static int uv_partition_base_pnode __read_mostly; 34
35static int uv_bau_max_concurrent __read_mostly;
36
37static int nobau;
38static int __init setup_nobau(char *arg)
39{
40 nobau = 1;
41 return 0;
42}
43early_param("nobau", setup_nobau);
29 44
30static unsigned long uv_mmask __read_mostly; 45/* base pnode in this partition */
46static int uv_partition_base_pnode __read_mostly;
47/* position of pnode (which is nasid>>1): */
48static int uv_nshift __read_mostly;
49static unsigned long uv_mmask __read_mostly;
31 50
32static DEFINE_PER_CPU(struct ptc_stats, ptcstats); 51static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
33static DEFINE_PER_CPU(struct bau_control, bau_control); 52static DEFINE_PER_CPU(struct bau_control, bau_control);
53static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
54
55struct reset_args {
56 int sender;
57};
34 58
35/* 59/*
36 * Determine the first node on a blade. 60 * Determine the first node on a uvhub. 'Nodes' are used for kernel
61 * memory allocation.
37 */ 62 */
38static int __init blade_to_first_node(int blade) 63static int __init uvhub_to_first_node(int uvhub)
39{ 64{
40 int node, b; 65 int node, b;
41 66
42 for_each_online_node(node) { 67 for_each_online_node(node) {
43 b = uv_node_to_blade_id(node); 68 b = uv_node_to_blade_id(node);
44 if (blade == b) 69 if (uvhub == b)
45 return node; 70 return node;
46 } 71 }
47 return -1; /* shouldn't happen */ 72 return -1;
48} 73}
49 74
50/* 75/*
51 * Determine the apicid of the first cpu on a blade. 76 * Determine the apicid of the first cpu on a uvhub.
52 */ 77 */
53static int __init blade_to_first_apicid(int blade) 78static int __init uvhub_to_first_apicid(int uvhub)
54{ 79{
55 int cpu; 80 int cpu;
56 81
57 for_each_present_cpu(cpu) 82 for_each_present_cpu(cpu)
58 if (blade == uv_cpu_to_blade_id(cpu)) 83 if (uvhub == uv_cpu_to_blade_id(cpu))
59 return per_cpu(x86_cpu_to_apicid, cpu); 84 return per_cpu(x86_cpu_to_apicid, cpu);
60 return -1; 85 return -1;
61} 86}
@@ -68,195 +93,459 @@ static int __init blade_to_first_apicid(int blade)
68 * clear of the Timeout bit (as well) will free the resource. No reply will 93 * clear of the Timeout bit (as well) will free the resource. No reply will
69 * be sent (the hardware will only do one reply per message). 94 * be sent (the hardware will only do one reply per message).
70 */ 95 */
71static void uv_reply_to_message(int resource, 96static inline void uv_reply_to_message(struct msg_desc *mdp,
72 struct bau_payload_queue_entry *msg, 97 struct bau_control *bcp)
73 struct bau_msg_status *msp)
74{ 98{
75 unsigned long dw; 99 unsigned long dw;
100 struct bau_payload_queue_entry *msg;
76 101
77 dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); 102 msg = mdp->msg;
103 if (!msg->canceled) {
104 dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
105 msg->sw_ack_vector;
106 uv_write_local_mmr(
107 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
108 }
78 msg->replied_to = 1; 109 msg->replied_to = 1;
79 msg->sw_ack_vector = 0; 110 msg->sw_ack_vector = 0;
80 if (msp)
81 msp->seen_by.bits = 0;
82 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
83} 111}
84 112
85/* 113/*
86 * Do all the things a cpu should do for a TLB shootdown message. 114 * Process the receipt of a RETRY message
87 * Other cpu's may come here at the same time for this message.
88 */ 115 */
89static void uv_bau_process_message(struct bau_payload_queue_entry *msg, 116static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
90 int msg_slot, int sw_ack_slot) 117 struct bau_control *bcp)
91{ 118{
92 unsigned long this_cpu_mask; 119 int i;
93 struct bau_msg_status *msp; 120 int cancel_count = 0;
94 int cpu; 121 int slot2;
122 unsigned long msg_res;
123 unsigned long mmr = 0;
124 struct bau_payload_queue_entry *msg;
125 struct bau_payload_queue_entry *msg2;
126 struct ptc_stats *stat;
95 127
96 msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; 128 msg = mdp->msg;
97 cpu = uv_blade_processor_id(); 129 stat = &per_cpu(ptcstats, bcp->cpu);
98 msg->number_of_cpus = 130 stat->d_retries++;
99 uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); 131 /*
100 this_cpu_mask = 1UL << cpu; 132 * cancel any message from msg+1 to the retry itself
101 if (msp->seen_by.bits & this_cpu_mask) 133 */
102 return; 134 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
103 atomic_or_long(&msp->seen_by.bits, this_cpu_mask); 135 if (msg2 > mdp->va_queue_last)
136 msg2 = mdp->va_queue_first;
137 if (msg2 == msg)
138 break;
139
140 /* same conditions for cancellation as uv_do_reset */
141 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
142 (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
143 msg->sw_ack_vector) == 0) &&
144 (msg2->sending_cpu == msg->sending_cpu) &&
145 (msg2->msg_type != MSG_NOOP)) {
146 slot2 = msg2 - mdp->va_queue_first;
147 mmr = uv_read_local_mmr
148 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
149 msg_res = ((msg2->sw_ack_vector << 8) |
150 msg2->sw_ack_vector);
151 /*
152 * This is a message retry; clear the resources held
153 * by the previous message only if they timed out.
154 * If it has not timed out we have an unexpected
155 * situation to report.
156 */
157 if (mmr & (msg_res << 8)) {
158 /*
159 * is the resource timed out?
160 * make everyone ignore the cancelled message.
161 */
162 msg2->canceled = 1;
163 stat->d_canceled++;
164 cancel_count++;
165 uv_write_local_mmr(
166 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
167 (msg_res << 8) | msg_res);
168 } else
169 printk(KERN_INFO "note bau retry: no effect\n");
170 }
171 }
172 if (!cancel_count)
173 stat->d_nocanceled++;
174}
104 175
105 if (msg->replied_to == 1) 176/*
106 return; 177 * Do all the things a cpu should do for a TLB shootdown message.
178 * Other cpu's may come here at the same time for this message.
179 */
180static void uv_bau_process_message(struct msg_desc *mdp,
181 struct bau_control *bcp)
182{
183 int msg_ack_count;
184 short socket_ack_count = 0;
185 struct ptc_stats *stat;
186 struct bau_payload_queue_entry *msg;
187 struct bau_control *smaster = bcp->socket_master;
107 188
189 /*
190 * This must be a normal message, or retry of a normal message
191 */
192 msg = mdp->msg;
193 stat = &per_cpu(ptcstats, bcp->cpu);
108 if (msg->address == TLB_FLUSH_ALL) { 194 if (msg->address == TLB_FLUSH_ALL) {
109 local_flush_tlb(); 195 local_flush_tlb();
110 __get_cpu_var(ptcstats).alltlb++; 196 stat->d_alltlb++;
111 } else { 197 } else {
112 __flush_tlb_one(msg->address); 198 __flush_tlb_one(msg->address);
113 __get_cpu_var(ptcstats).onetlb++; 199 stat->d_onetlb++;
114 } 200 }
201 stat->d_requestee++;
202
203 /*
204 * One cpu on each uvhub has the additional job on a RETRY
205 * of releasing the resource held by the message that is
206 * being retried. That message is identified by sending
207 * cpu number.
208 */
209 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
210 uv_bau_process_retry_msg(mdp, bcp);
115 211
116 __get_cpu_var(ptcstats).requestee++; 212 /*
213 * This is a sw_ack message, so we have to reply to it.
214 * Count each responding cpu on the socket. This avoids
215 * pinging the count's cache line back and forth between
216 * the sockets.
217 */
218 socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
219 &smaster->socket_acknowledge_count[mdp->msg_slot]);
220 if (socket_ack_count == bcp->cpus_in_socket) {
221 /*
222 * Both sockets dump their completed count total into
223 * the message's count.
224 */
225 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
226 msg_ack_count = atomic_add_short_return(socket_ack_count,
227 (struct atomic_short *)&msg->acknowledge_count);
228
229 if (msg_ack_count == bcp->cpus_in_uvhub) {
230 /*
231 * All cpus in uvhub saw it; reply
232 */
233 uv_reply_to_message(mdp, bcp);
234 }
235 }
117 236
118 atomic_inc_short(&msg->acknowledge_count); 237 return;
119 if (msg->number_of_cpus == msg->acknowledge_count)
120 uv_reply_to_message(sw_ack_slot, msg, msp);
121} 238}
122 239
123/* 240/*
124 * Examine the payload queue on one distribution node to see 241 * Determine the first cpu on a uvhub.
125 * which messages have not been seen, and which cpu(s) have not seen them. 242 */
243static int uvhub_to_first_cpu(int uvhub)
244{
245 int cpu;
246 for_each_present_cpu(cpu)
247 if (uvhub == uv_cpu_to_blade_id(cpu))
248 return cpu;
249 return -1;
250}
251
252/*
253 * Last resort when we get a large number of destination timeouts is
254 * to clear resources held by a given cpu.
255 * Do this with IPI so that all messages in the BAU message queue
256 * can be identified by their nonzero sw_ack_vector field.
126 * 257 *
127 * Returns the number of cpu's that have not responded. 258 * This is entered for a single cpu on the uvhub.
259 * The sender want's this uvhub to free a specific message's
260 * sw_ack resources.
128 */ 261 */
129static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) 262static void
263uv_do_reset(void *ptr)
130{ 264{
131 struct bau_payload_queue_entry *msg;
132 struct bau_msg_status *msp;
133 int count = 0;
134 int i; 265 int i;
135 int j; 266 int slot;
267 int count = 0;
268 unsigned long mmr;
269 unsigned long msg_res;
270 struct bau_control *bcp;
271 struct reset_args *rap;
272 struct bau_payload_queue_entry *msg;
273 struct ptc_stats *stat;
136 274
137 for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; 275 bcp = &per_cpu(bau_control, smp_processor_id());
138 msg++, i++) { 276 rap = (struct reset_args *)ptr;
139 if ((msg->sending_cpu == sender) && (!msg->replied_to)) { 277 stat = &per_cpu(ptcstats, bcp->cpu);
140 msp = bau_tablesp->msg_statuses + i; 278 stat->d_resets++;
141 printk(KERN_DEBUG 279
142 "blade %d: address:%#lx %d of %d, not cpu(s): ", 280 /*
143 i, msg->address, msg->acknowledge_count, 281 * We're looking for the given sender, and
144 msg->number_of_cpus); 282 * will free its sw_ack resource.
145 for (j = 0; j < msg->number_of_cpus; j++) { 283 * If all cpu's finally responded after the timeout, its
146 if (!((1L << j) & msp->seen_by.bits)) { 284 * message 'replied_to' was set.
147 count++; 285 */
148 printk("%d ", j); 286 for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
149 } 287 /* uv_do_reset: same conditions for cancellation as
288 uv_bau_process_retry_msg() */
289 if ((msg->replied_to == 0) &&
290 (msg->canceled == 0) &&
291 (msg->sending_cpu == rap->sender) &&
292 (msg->sw_ack_vector) &&
293 (msg->msg_type != MSG_NOOP)) {
294 /*
295 * make everyone else ignore this message
296 */
297 msg->canceled = 1;
298 slot = msg - bcp->va_queue_first;
299 count++;
300 /*
301 * only reset the resource if it is still pending
302 */
303 mmr = uv_read_local_mmr
304 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
305 msg_res = ((msg->sw_ack_vector << 8) |
306 msg->sw_ack_vector);
307 if (mmr & msg_res) {
308 stat->d_rcanceled++;
309 uv_write_local_mmr(
310 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
311 msg_res);
150 } 312 }
151 printk("\n");
152 } 313 }
153 } 314 }
154 return count; 315 return;
155} 316}
156 317
157/* 318/*
158 * Examine the payload queue on all the distribution nodes to see 319 * Use IPI to get all target uvhubs to release resources held by
159 * which messages have not been seen, and which cpu(s) have not seen them. 320 * a given sending cpu number.
160 *
161 * Returns the number of cpu's that have not responded.
162 */ 321 */
163static int uv_examine_destinations(struct bau_target_nodemask *distribution) 322static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
323 int sender)
164{ 324{
165 int sender; 325 int uvhub;
166 int i; 326 int cpu;
167 int count = 0; 327 cpumask_t mask;
328 struct reset_args reset_args;
329
330 reset_args.sender = sender;
168 331
169 sender = smp_processor_id(); 332 cpus_clear(mask);
170 for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { 333 /* find a single cpu for each uvhub in this distribution mask */
171 if (!bau_node_isset(i, distribution)) 334 for (uvhub = 0;
335 uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
336 uvhub++) {
337 if (!bau_uvhub_isset(uvhub, distribution))
172 continue; 338 continue;
173 count += uv_examine_destination(uv_bau_table_bases[i], sender); 339 /* find a cpu for this uvhub */
340 cpu = uvhub_to_first_cpu(uvhub);
341 cpu_set(cpu, mask);
174 } 342 }
175 return count; 343 /* IPI all cpus; Preemption is already disabled */
344 smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
345 return;
346}
347
348static inline unsigned long
349cycles_2_us(unsigned long long cyc)
350{
351 unsigned long long ns;
352 unsigned long us;
353 ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
354 >> CYC2NS_SCALE_FACTOR;
355 us = ns / 1000;
356 return us;
176} 357}
177 358
178/* 359/*
179 * wait for completion of a broadcast message 360 * wait for all cpus on this hub to finish their sends and go quiet
180 * 361 * leaves uvhub_quiesce set so that no new broadcasts are started by
181 * return COMPLETE, RETRY or GIVEUP 362 * bau_flush_send_and_wait()
363 */
364static inline void
365quiesce_local_uvhub(struct bau_control *hmaster)
366{
367 atomic_add_short_return(1, (struct atomic_short *)
368 &hmaster->uvhub_quiesce);
369}
370
371/*
372 * mark this quiet-requestor as done
373 */
374static inline void
375end_uvhub_quiesce(struct bau_control *hmaster)
376{
377 atomic_add_short_return(-1, (struct atomic_short *)
378 &hmaster->uvhub_quiesce);
379}
380
381/*
382 * Wait for completion of a broadcast software ack message
383 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
182 */ 384 */
183static int uv_wait_completion(struct bau_desc *bau_desc, 385static int uv_wait_completion(struct bau_desc *bau_desc,
184 unsigned long mmr_offset, int right_shift) 386 unsigned long mmr_offset, int right_shift, int this_cpu,
387 struct bau_control *bcp, struct bau_control *smaster, long try)
185{ 388{
186 int exams = 0; 389 int relaxes = 0;
187 long destination_timeouts = 0;
188 long source_timeouts = 0;
189 unsigned long descriptor_status; 390 unsigned long descriptor_status;
391 unsigned long mmr;
392 unsigned long mask;
393 cycles_t ttime;
394 cycles_t timeout_time;
395 struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
396 struct bau_control *hmaster;
397
398 hmaster = bcp->uvhub_master;
399 timeout_time = get_cycles() + bcp->timeout_interval;
190 400
401 /* spin on the status MMR, waiting for it to go idle */
191 while ((descriptor_status = (((unsigned long) 402 while ((descriptor_status = (((unsigned long)
192 uv_read_local_mmr(mmr_offset) >> 403 uv_read_local_mmr(mmr_offset) >>
193 right_shift) & UV_ACT_STATUS_MASK)) != 404 right_shift) & UV_ACT_STATUS_MASK)) !=
194 DESC_STATUS_IDLE) { 405 DESC_STATUS_IDLE) {
195 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
196 source_timeouts++;
197 if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
198 source_timeouts = 0;
199 __get_cpu_var(ptcstats).s_retry++;
200 return FLUSH_RETRY;
201 }
202 /* 406 /*
203 * spin here looking for progress at the destinations 407 * Our software ack messages may be blocked because there are
408 * no swack resources available. As long as none of them
409 * has timed out hardware will NACK our message and its
410 * state will stay IDLE.
204 */ 411 */
205 if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { 412 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
206 destination_timeouts++; 413 stat->s_stimeout++;
207 if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { 414 return FLUSH_GIVEUP;
208 /* 415 } else if (descriptor_status ==
209 * returns number of cpus not responding 416 DESC_STATUS_DESTINATION_TIMEOUT) {
210 */ 417 stat->s_dtimeout++;
211 if (uv_examine_destinations 418 ttime = get_cycles();
212 (&bau_desc->distribution) == 0) { 419
213 __get_cpu_var(ptcstats).d_retry++; 420 /*
214 return FLUSH_RETRY; 421 * Our retries may be blocked by all destination
215 } 422 * swack resources being consumed, and a timeout
216 exams++; 423 * pending. In that case hardware returns the
217 if (exams >= uv_bau_retry_limit) { 424 * ERROR that looks like a destination timeout.
218 printk(KERN_DEBUG 425 */
219 "uv_flush_tlb_others"); 426 if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
220 printk("giving up on cpu %d\n", 427 bcp->conseccompletes = 0;
221 smp_processor_id()); 428 return FLUSH_RETRY_PLUGGED;
429 }
430
431 bcp->conseccompletes = 0;
432 return FLUSH_RETRY_TIMEOUT;
433 } else {
434 /*
435 * descriptor_status is still BUSY
436 */
437 cpu_relax();
438 relaxes++;
439 if (relaxes >= 10000) {
440 relaxes = 0;
441 if (get_cycles() > timeout_time) {
442 quiesce_local_uvhub(hmaster);
443
444 /* single-thread the register change */
445 spin_lock(&hmaster->masks_lock);
446 mmr = uv_read_local_mmr(mmr_offset);
447 mask = 0UL;
448 mask |= (3UL < right_shift);
449 mask = ~mask;
450 mmr &= mask;
451 uv_write_local_mmr(mmr_offset, mmr);
452 spin_unlock(&hmaster->masks_lock);
453 end_uvhub_quiesce(hmaster);
454 stat->s_busy++;
222 return FLUSH_GIVEUP; 455 return FLUSH_GIVEUP;
223 } 456 }
224 /*
225 * delays can hang the simulator
226 udelay(1000);
227 */
228 destination_timeouts = 0;
229 } 457 }
230 } 458 }
231 cpu_relax();
232 } 459 }
460 bcp->conseccompletes++;
233 return FLUSH_COMPLETE; 461 return FLUSH_COMPLETE;
234} 462}
235 463
464static inline cycles_t
465sec_2_cycles(unsigned long sec)
466{
467 unsigned long ns;
468 cycles_t cyc;
469
470 ns = sec * 1000000000;
471 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
472 return cyc;
473}
474
475/*
476 * conditionally add 1 to *v, unless *v is >= u
477 * return 0 if we cannot add 1 to *v because it is >= u
478 * return 1 if we can add 1 to *v because it is < u
479 * the add is atomic
480 *
481 * This is close to atomic_add_unless(), but this allows the 'u' value
482 * to be lowered below the current 'v'. atomic_add_unless can only stop
483 * on equal.
484 */
485static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
486{
487 spin_lock(lock);
488 if (atomic_read(v) >= u) {
489 spin_unlock(lock);
490 return 0;
491 }
492 atomic_inc(v);
493 spin_unlock(lock);
494 return 1;
495}
496
236/** 497/**
237 * uv_flush_send_and_wait 498 * uv_flush_send_and_wait
238 * 499 *
239 * Send a broadcast and wait for a broadcast message to complete. 500 * Send a broadcast and wait for it to complete.
240 * 501 *
241 * The flush_mask contains the cpus the broadcast was sent to. 502 * The flush_mask contains the cpus the broadcast is to be sent to, plus
503 * cpus that are on the local uvhub.
242 * 504 *
243 * Returns NULL if all remote flushing was done. The mask is zeroed. 505 * Returns NULL if all flushing represented in the mask was done. The mask
506 * is zeroed.
244 * Returns @flush_mask if some remote flushing remains to be done. The 507 * Returns @flush_mask if some remote flushing remains to be done. The
245 * mask will have some bits still set. 508 * mask will have some bits still set, representing any cpus on the local
509 * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
246 */ 510 */
247const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, 511const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
248 struct bau_desc *bau_desc, 512 struct cpumask *flush_mask,
249 struct cpumask *flush_mask) 513 struct bau_control *bcp)
250{ 514{
251 int completion_status = 0;
252 int right_shift; 515 int right_shift;
253 int tries = 0; 516 int uvhub;
254 int pnode;
255 int bit; 517 int bit;
518 int completion_status = 0;
519 int seq_number = 0;
520 long try = 0;
521 int cpu = bcp->uvhub_cpu;
522 int this_cpu = bcp->cpu;
523 int this_uvhub = bcp->uvhub;
256 unsigned long mmr_offset; 524 unsigned long mmr_offset;
257 unsigned long index; 525 unsigned long index;
258 cycles_t time1; 526 cycles_t time1;
259 cycles_t time2; 527 cycles_t time2;
528 struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
529 struct bau_control *smaster = bcp->socket_master;
530 struct bau_control *hmaster = bcp->uvhub_master;
531
532 /*
533 * Spin here while there are hmaster->max_concurrent or more active
534 * descriptors. This is the per-uvhub 'throttle'.
535 */
536 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
537 &hmaster->active_descriptor_count,
538 hmaster->max_concurrent)) {
539 stat->s_throttles++;
540 do {
541 cpu_relax();
542 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
543 &hmaster->active_descriptor_count,
544 hmaster->max_concurrent));
545 }
546
547 while (hmaster->uvhub_quiesce)
548 cpu_relax();
260 549
261 if (cpu < UV_CPUS_PER_ACT_STATUS) { 550 if (cpu < UV_CPUS_PER_ACT_STATUS) {
262 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 551 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -268,24 +557,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
268 } 557 }
269 time1 = get_cycles(); 558 time1 = get_cycles();
270 do { 559 do {
271 tries++; 560 /*
561 * Every message from any given cpu gets a unique message
562 * sequence number. But retries use that same number.
563 * Our message may have timed out at the destination because
564 * all sw-ack resources are in use and there is a timeout
565 * pending there. In that case, our last send never got
566 * placed into the queue and we need to persist until it
567 * does.
568 *
569 * Make any retry a type MSG_RETRY so that the destination will
570 * free any resource held by a previous message from this cpu.
571 */
572 if (try == 0) {
573 /* use message type set by the caller the first time */
574 seq_number = bcp->message_number++;
575 } else {
576 /* use RETRY type on all the rest; same sequence */
577 bau_desc->header.msg_type = MSG_RETRY;
578 stat->s_retry_messages++;
579 }
580 bau_desc->header.sequence = seq_number;
272 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 581 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
273 cpu; 582 bcp->uvhub_cpu;
583 bcp->send_message = get_cycles();
584
274 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 585 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
586
587 try++;
275 completion_status = uv_wait_completion(bau_desc, mmr_offset, 588 completion_status = uv_wait_completion(bau_desc, mmr_offset,
276 right_shift); 589 right_shift, this_cpu, bcp, smaster, try);
277 } while (completion_status == FLUSH_RETRY); 590
591 if (completion_status == FLUSH_RETRY_PLUGGED) {
592 /*
593 * Our retries may be blocked by all destination swack
594 * resources being consumed, and a timeout pending. In
595 * that case hardware immediately returns the ERROR
596 * that looks like a destination timeout.
597 */
598 udelay(TIMEOUT_DELAY);
599 bcp->plugged_tries++;
600 if (bcp->plugged_tries >= PLUGSB4RESET) {
601 bcp->plugged_tries = 0;
602 quiesce_local_uvhub(hmaster);
603 spin_lock(&hmaster->queue_lock);
604 uv_reset_with_ipi(&bau_desc->distribution,
605 this_cpu);
606 spin_unlock(&hmaster->queue_lock);
607 end_uvhub_quiesce(hmaster);
608 bcp->ipi_attempts++;
609 stat->s_resets_plug++;
610 }
611 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
612 hmaster->max_concurrent = 1;
613 bcp->timeout_tries++;
614 udelay(TIMEOUT_DELAY);
615 if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
616 bcp->timeout_tries = 0;
617 quiesce_local_uvhub(hmaster);
618 spin_lock(&hmaster->queue_lock);
619 uv_reset_with_ipi(&bau_desc->distribution,
620 this_cpu);
621 spin_unlock(&hmaster->queue_lock);
622 end_uvhub_quiesce(hmaster);
623 bcp->ipi_attempts++;
624 stat->s_resets_timeout++;
625 }
626 }
627 if (bcp->ipi_attempts >= 3) {
628 bcp->ipi_attempts = 0;
629 completion_status = FLUSH_GIVEUP;
630 break;
631 }
632 cpu_relax();
633 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
634 (completion_status == FLUSH_RETRY_TIMEOUT));
278 time2 = get_cycles(); 635 time2 = get_cycles();
279 __get_cpu_var(ptcstats).sflush += (time2 - time1);
280 if (tries > 1)
281 __get_cpu_var(ptcstats).retriesok++;
282 636
283 if (completion_status == FLUSH_GIVEUP) { 637 if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
638 && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
639 hmaster->max_concurrent++;
640
641 /*
642 * hold any cpu not timing out here; no other cpu currently held by
643 * the 'throttle' should enter the activation code
644 */
645 while (hmaster->uvhub_quiesce)
646 cpu_relax();
647 atomic_dec(&hmaster->active_descriptor_count);
648
649 /* guard against cycles wrap */
650 if (time2 > time1)
651 stat->s_time += (time2 - time1);
652 else
653 stat->s_requestor--; /* don't count this one */
654 if (completion_status == FLUSH_COMPLETE && try > 1)
655 stat->s_retriesok++;
656 else if (completion_status == FLUSH_GIVEUP) {
284 /* 657 /*
285 * Cause the caller to do an IPI-style TLB shootdown on 658 * Cause the caller to do an IPI-style TLB shootdown on
286 * the cpu's, all of which are still in the mask. 659 * the target cpu's, all of which are still in the mask.
287 */ 660 */
288 __get_cpu_var(ptcstats).ptc_i++; 661 stat->s_giveup++;
289 return flush_mask; 662 return flush_mask;
290 } 663 }
291 664
@@ -294,18 +667,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
294 * use the IPI method of shootdown on them. 667 * use the IPI method of shootdown on them.
295 */ 668 */
296 for_each_cpu(bit, flush_mask) { 669 for_each_cpu(bit, flush_mask) {
297 pnode = uv_cpu_to_pnode(bit); 670 uvhub = uv_cpu_to_blade_id(bit);
298 if (pnode == this_pnode) 671 if (uvhub == this_uvhub)
299 continue; 672 continue;
300 cpumask_clear_cpu(bit, flush_mask); 673 cpumask_clear_cpu(bit, flush_mask);
301 } 674 }
302 if (!cpumask_empty(flush_mask)) 675 if (!cpumask_empty(flush_mask))
303 return flush_mask; 676 return flush_mask;
677
304 return NULL; 678 return NULL;
305} 679}
306 680
307static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
308
309/** 681/**
310 * uv_flush_tlb_others - globally purge translation cache of a virtual 682 * uv_flush_tlb_others - globally purge translation cache of a virtual
311 * address or all TLB's 683 * address or all TLB's
@@ -322,8 +694,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
322 * The caller has derived the cpumask from the mm_struct. This function 694 * The caller has derived the cpumask from the mm_struct. This function
323 * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) 695 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
324 * 696 *
325 * The cpumask is converted into a nodemask of the nodes containing 697 * The cpumask is converted into a uvhubmask of the uvhubs containing
326 * the cpus. 698 * those cpus.
327 * 699 *
328 * Note that this function should be called with preemption disabled. 700 * Note that this function should be called with preemption disabled.
329 * 701 *
@@ -335,52 +707,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
335 struct mm_struct *mm, 707 struct mm_struct *mm,
336 unsigned long va, unsigned int cpu) 708 unsigned long va, unsigned int cpu)
337{ 709{
338 struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); 710 int remotes;
339 int i; 711 int tcpu;
340 int bit; 712 int uvhub;
341 int pnode;
342 int uv_cpu;
343 int this_pnode;
344 int locals = 0; 713 int locals = 0;
345 struct bau_desc *bau_desc; 714 struct bau_desc *bau_desc;
715 struct cpumask *flush_mask;
716 struct ptc_stats *stat;
717 struct bau_control *bcp;
346 718
347 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 719 if (nobau)
720 return cpumask;
348 721
349 uv_cpu = uv_blade_processor_id(); 722 bcp = &per_cpu(bau_control, cpu);
350 this_pnode = uv_hub_info->pnode; 723 /*
351 bau_desc = __get_cpu_var(bau_control).descriptor_base; 724 * Each sending cpu has a per-cpu mask which it fills from the caller's
352 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; 725 * cpu mask. Only remote cpus are converted to uvhubs and copied.
726 */
727 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
728 /*
729 * copy cpumask to flush_mask, removing current cpu
730 * (current cpu should already have been flushed by the caller and
731 * should never be returned if we return flush_mask)
732 */
733 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
734 if (cpu_isset(cpu, *cpumask))
735 locals++; /* current cpu was targeted */
353 736
354 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 737 bau_desc = bcp->descriptor_base;
738 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
355 739
356 i = 0; 740 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
357 for_each_cpu(bit, flush_mask) { 741 remotes = 0;
358 pnode = uv_cpu_to_pnode(bit); 742 for_each_cpu(tcpu, flush_mask) {
359 BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); 743 uvhub = uv_cpu_to_blade_id(tcpu);
360 if (pnode == this_pnode) { 744 if (uvhub == bcp->uvhub) {
361 locals++; 745 locals++;
362 continue; 746 continue;
363 } 747 }
364 bau_node_set(pnode - uv_partition_base_pnode, 748 bau_uvhub_set(uvhub, &bau_desc->distribution);
365 &bau_desc->distribution); 749 remotes++;
366 i++;
367 } 750 }
368 if (i == 0) { 751 if (remotes == 0) {
369 /* 752 /*
370 * no off_node flushing; return status for local node 753 * No off_hub flushing; return status for local hub.
754 * Return the caller's mask if all were local (the current
755 * cpu may be in that mask).
371 */ 756 */
372 if (locals) 757 if (locals)
373 return flush_mask; 758 return cpumask;
374 else 759 else
375 return NULL; 760 return NULL;
376 } 761 }
377 __get_cpu_var(ptcstats).requestor++; 762 stat = &per_cpu(ptcstats, cpu);
378 __get_cpu_var(ptcstats).ntargeted += i; 763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes;
765 remotes = bau_uvhub_weight(&bau_desc->distribution);
766 stat->s_ntarguvhub += remotes;
767 if (remotes >= 16)
768 stat->s_ntarguvhub16++;
769 else if (remotes >= 8)
770 stat->s_ntarguvhub8++;
771 else if (remotes >= 4)
772 stat->s_ntarguvhub4++;
773 else if (remotes >= 2)
774 stat->s_ntarguvhub2++;
775 else
776 stat->s_ntarguvhub1++;
379 777
380 bau_desc->payload.address = va; 778 bau_desc->payload.address = va;
381 bau_desc->payload.sending_cpu = cpu; 779 bau_desc->payload.sending_cpu = cpu;
382 780
383 return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); 781 /*
782 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
783 * the adjusted flush_mask if any cpu's were not messaged.
784 */
785 return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
384} 786}
385 787
386/* 788/*
@@ -389,87 +791,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
389 * 791 *
390 * We received a broadcast assist message. 792 * We received a broadcast assist message.
391 * 793 *
392 * Interrupts may have been disabled; this interrupt could represent 794 * Interrupts are disabled; this interrupt could represent
393 * the receipt of several messages. 795 * the receipt of several messages.
394 * 796 *
395 * All cores/threads on this node get this interrupt. 797 * All cores/threads on this hub get this interrupt.
396 * The last one to see it does the s/w ack. 798 * The last one to see it does the software ack.
397 * (the resource will not be freed until noninterruptable cpus see this 799 * (the resource will not be freed until noninterruptable cpus see this
398 * interrupt; hardware will timeout the s/w ack and reply ERROR) 800 * interrupt; hardware may timeout the s/w ack and reply ERROR)
399 */ 801 */
400void uv_bau_message_interrupt(struct pt_regs *regs) 802void uv_bau_message_interrupt(struct pt_regs *regs)
401{ 803{
402 struct bau_payload_queue_entry *va_queue_first;
403 struct bau_payload_queue_entry *va_queue_last;
404 struct bau_payload_queue_entry *msg;
405 struct pt_regs *old_regs = set_irq_regs(regs);
406 cycles_t time1;
407 cycles_t time2;
408 int msg_slot;
409 int sw_ack_slot;
410 int fw;
411 int count = 0; 804 int count = 0;
412 unsigned long local_pnode; 805 cycles_t time_start;
413 806 struct bau_payload_queue_entry *msg;
414 ack_APIC_irq(); 807 struct bau_control *bcp;
415 exit_idle(); 808 struct ptc_stats *stat;
416 irq_enter(); 809 struct msg_desc msgdesc;
417 810
418 time1 = get_cycles(); 811 time_start = get_cycles();
419 812 bcp = &per_cpu(bau_control, smp_processor_id());
420 local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); 813 stat = &per_cpu(ptcstats, smp_processor_id());
421 814 msgdesc.va_queue_first = bcp->va_queue_first;
422 va_queue_first = __get_cpu_var(bau_control).va_queue_first; 815 msgdesc.va_queue_last = bcp->va_queue_last;
423 va_queue_last = __get_cpu_var(bau_control).va_queue_last; 816 msg = bcp->bau_msg_head;
424
425 msg = __get_cpu_var(bau_control).bau_msg_head;
426 while (msg->sw_ack_vector) { 817 while (msg->sw_ack_vector) {
427 count++; 818 count++;
428 fw = msg->sw_ack_vector; 819 msgdesc.msg_slot = msg - msgdesc.va_queue_first;
429 msg_slot = msg - va_queue_first; 820 msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
430 sw_ack_slot = ffs(fw) - 1; 821 msgdesc.msg = msg;
431 822 uv_bau_process_message(&msgdesc, bcp);
432 uv_bau_process_message(msg, msg_slot, sw_ack_slot);
433
434 msg++; 823 msg++;
435 if (msg > va_queue_last) 824 if (msg > msgdesc.va_queue_last)
436 msg = va_queue_first; 825 msg = msgdesc.va_queue_first;
437 __get_cpu_var(bau_control).bau_msg_head = msg; 826 bcp->bau_msg_head = msg;
438 } 827 }
828 stat->d_time += (get_cycles() - time_start);
439 if (!count) 829 if (!count)
440 __get_cpu_var(ptcstats).nomsg++; 830 stat->d_nomsg++;
441 else if (count > 1) 831 else if (count > 1)
442 __get_cpu_var(ptcstats).multmsg++; 832 stat->d_multmsg++;
443 833 ack_APIC_irq();
444 time2 = get_cycles();
445 __get_cpu_var(ptcstats).dflush += (time2 - time1);
446
447 irq_exit();
448 set_irq_regs(old_regs);
449} 834}
450 835
451/* 836/*
452 * uv_enable_timeouts 837 * uv_enable_timeouts
453 * 838 *
454 * Each target blade (i.e. blades that have cpu's) needs to have 839 * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
455 * shootdown message timeouts enabled. The timeout does not cause 840 * shootdown message timeouts enabled. The timeout does not cause
456 * an interrupt, but causes an error message to be returned to 841 * an interrupt, but causes an error message to be returned to
457 * the sender. 842 * the sender.
458 */ 843 */
459static void uv_enable_timeouts(void) 844static void uv_enable_timeouts(void)
460{ 845{
461 int blade; 846 int uvhub;
462 int nblades; 847 int nuvhubs;
463 int pnode; 848 int pnode;
464 unsigned long mmr_image; 849 unsigned long mmr_image;
465 850
466 nblades = uv_num_possible_blades(); 851 nuvhubs = uv_num_possible_blades();
467 852
468 for (blade = 0; blade < nblades; blade++) { 853 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
469 if (!uv_blade_nr_possible_cpus(blade)) 854 if (!uv_blade_nr_possible_cpus(uvhub))
470 continue; 855 continue;
471 856
472 pnode = uv_blade_to_pnode(blade); 857 pnode = uv_blade_to_pnode(uvhub);
473 mmr_image = 858 mmr_image =
474 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); 859 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
475 /* 860 /*
@@ -479,16 +864,16 @@ static void uv_enable_timeouts(void)
479 * To program the period, the SOFT_ACK_MODE must be off. 864 * To program the period, the SOFT_ACK_MODE must be off.
480 */ 865 */
481 mmr_image &= ~((unsigned long)1 << 866 mmr_image &= ~((unsigned long)1 <<
482 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 867 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
483 uv_write_global_mmr64 868 uv_write_global_mmr64
484 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 869 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
485 /* 870 /*
486 * Set the 4-bit period. 871 * Set the 4-bit period.
487 */ 872 */
488 mmr_image &= ~((unsigned long)0xf << 873 mmr_image &= ~((unsigned long)0xf <<
489 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 874 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
490 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << 875 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
491 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 876 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
492 uv_write_global_mmr64 877 uv_write_global_mmr64
493 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 878 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
494 /* 879 /*
@@ -497,7 +882,7 @@ static void uv_enable_timeouts(void)
497 * indicated in bits 2:0 (7 causes all of them to timeout). 882 * indicated in bits 2:0 (7 causes all of them to timeout).
498 */ 883 */
499 mmr_image |= ((unsigned long)1 << 884 mmr_image |= ((unsigned long)1 <<
500 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 885 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
501 uv_write_global_mmr64 886 uv_write_global_mmr64
502 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 887 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
503 } 888 }
@@ -522,9 +907,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
522{ 907{
523} 908}
524 909
910static inline unsigned long long
911millisec_2_cycles(unsigned long millisec)
912{
913 unsigned long ns;
914 unsigned long long cyc;
915
916 ns = millisec * 1000;
917 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
918 return cyc;
919}
920
525/* 921/*
526 * Display the statistics thru /proc 922 * Display the statistics thru /proc.
527 * data points to the cpu number 923 * 'data' points to the cpu number
528 */ 924 */
529static int uv_ptc_seq_show(struct seq_file *file, void *data) 925static int uv_ptc_seq_show(struct seq_file *file, void *data)
530{ 926{
@@ -535,78 +931,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
535 931
536 if (!cpu) { 932 if (!cpu) {
537 seq_printf(file, 933 seq_printf(file,
538 "# cpu requestor requestee one all sretry dretry ptc_i "); 934 "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
539 seq_printf(file, 935 seq_printf(file,
540 "sw_ack sflush dflush sok dnomsg dmult starget\n"); 936 "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
937 seq_printf(file,
938 "retries rok resetp resett giveup sto bz throt ");
939 seq_printf(file,
940 "sw_ack recv rtime all ");
941 seq_printf(file,
942 "one mult none retry canc nocan reset rcan\n");
541 } 943 }
542 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 944 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
543 stat = &per_cpu(ptcstats, cpu); 945 stat = &per_cpu(ptcstats, cpu);
544 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", 946 /* source side statistics */
545 cpu, stat->requestor, 947 seq_printf(file,
546 stat->requestee, stat->onetlb, stat->alltlb, 948 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
547 stat->s_retry, stat->d_retry, stat->ptc_i); 949 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
548 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", 950 stat->s_ntarguvhub, stat->s_ntarguvhub16,
951 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
952 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
953 stat->s_ntargcpu, stat->s_dtimeout);
954 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
955 stat->s_retry_messages, stat->s_retriesok,
956 stat->s_resets_plug, stat->s_resets_timeout,
957 stat->s_giveup, stat->s_stimeout,
958 stat->s_busy, stat->s_throttles);
959 /* destination side statistics */
960 seq_printf(file,
961 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
549 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 962 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
550 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 963 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
551 stat->sflush, stat->dflush, 964 stat->d_requestee, cycles_2_us(stat->d_time),
552 stat->retriesok, stat->nomsg, 965 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
553 stat->multmsg, stat->ntargeted); 966 stat->d_nomsg, stat->d_retries, stat->d_canceled,
967 stat->d_nocanceled, stat->d_resets,
968 stat->d_rcanceled);
554 } 969 }
555 970
556 return 0; 971 return 0;
557} 972}
558 973
559/* 974/*
975 * -1: resetf the statistics
560 * 0: display meaning of the statistics 976 * 0: display meaning of the statistics
561 * >0: retry limit 977 * >0: maximum concurrent active descriptors per uvhub (throttle)
562 */ 978 */
563static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 979static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
564 size_t count, loff_t *data) 980 size_t count, loff_t *data)
565{ 981{
566 long newmode; 982 int cpu;
983 long input_arg;
567 char optstr[64]; 984 char optstr[64];
985 struct ptc_stats *stat;
986 struct bau_control *bcp;
568 987
569 if (count == 0 || count > sizeof(optstr)) 988 if (count == 0 || count > sizeof(optstr))
570 return -EINVAL; 989 return -EINVAL;
571 if (copy_from_user(optstr, user, count)) 990 if (copy_from_user(optstr, user, count))
572 return -EFAULT; 991 return -EFAULT;
573 optstr[count - 1] = '\0'; 992 optstr[count - 1] = '\0';
574 if (strict_strtoul(optstr, 10, &newmode) < 0) { 993 if (strict_strtol(optstr, 10, &input_arg) < 0) {
575 printk(KERN_DEBUG "%s is invalid\n", optstr); 994 printk(KERN_DEBUG "%s is invalid\n", optstr);
576 return -EINVAL; 995 return -EINVAL;
577 } 996 }
578 997
579 if (newmode == 0) { 998 if (input_arg == 0) {
580 printk(KERN_DEBUG "# cpu: cpu number\n"); 999 printk(KERN_DEBUG "# cpu: cpu number\n");
1000 printk(KERN_DEBUG "Sender statistics:\n");
1001 printk(KERN_DEBUG
1002 "sent: number of shootdown messages sent\n");
1003 printk(KERN_DEBUG
1004 "stime: time spent sending messages\n");
1005 printk(KERN_DEBUG
1006 "numuvhubs: number of hubs targeted with shootdown\n");
1007 printk(KERN_DEBUG
1008 "numuvhubs16: number times 16 or more hubs targeted\n");
1009 printk(KERN_DEBUG
1010 "numuvhubs8: number times 8 or more hubs targeted\n");
1011 printk(KERN_DEBUG
1012 "numuvhubs4: number times 4 or more hubs targeted\n");
1013 printk(KERN_DEBUG
1014 "numuvhubs2: number times 2 or more hubs targeted\n");
1015 printk(KERN_DEBUG
1016 "numuvhubs1: number times 1 hub targeted\n");
1017 printk(KERN_DEBUG
1018 "numcpus: number of cpus targeted with shootdown\n");
1019 printk(KERN_DEBUG
1020 "dto: number of destination timeouts\n");
1021 printk(KERN_DEBUG
1022 "retries: destination timeout retries sent\n");
1023 printk(KERN_DEBUG
1024 "rok: : destination timeouts successfully retried\n");
1025 printk(KERN_DEBUG
1026 "resetp: ipi-style resource resets for plugs\n");
1027 printk(KERN_DEBUG
1028 "resett: ipi-style resource resets for timeouts\n");
1029 printk(KERN_DEBUG
1030 "giveup: fall-backs to ipi-style shootdowns\n");
1031 printk(KERN_DEBUG
1032 "sto: number of source timeouts\n");
1033 printk(KERN_DEBUG
1034 "bz: number of stay-busy's\n");
1035 printk(KERN_DEBUG
1036 "throt: number times spun in throttle\n");
1037 printk(KERN_DEBUG "Destination side statistics:\n");
581 printk(KERN_DEBUG 1038 printk(KERN_DEBUG
582 "requestor: times this cpu was the flush requestor\n"); 1039 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
583 printk(KERN_DEBUG 1040 printk(KERN_DEBUG
584 "requestee: times this cpu was requested to flush its TLBs\n"); 1041 "recv: shootdown messages received\n");
585 printk(KERN_DEBUG 1042 printk(KERN_DEBUG
586 "one: times requested to flush a single address\n"); 1043 "rtime: time spent processing messages\n");
587 printk(KERN_DEBUG 1044 printk(KERN_DEBUG
588 "all: times requested to flush all TLB's\n"); 1045 "all: shootdown all-tlb messages\n");
589 printk(KERN_DEBUG 1046 printk(KERN_DEBUG
590 "sretry: number of retries of source-side timeouts\n"); 1047 "one: shootdown one-tlb messages\n");
591 printk(KERN_DEBUG 1048 printk(KERN_DEBUG
592 "dretry: number of retries of destination-side timeouts\n"); 1049 "mult: interrupts that found multiple messages\n");
593 printk(KERN_DEBUG 1050 printk(KERN_DEBUG
594 "ptc_i: times UV fell through to IPI-style flushes\n"); 1051 "none: interrupts that found no messages\n");
595 printk(KERN_DEBUG 1052 printk(KERN_DEBUG
596 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); 1053 "retry: number of retry messages processed\n");
597 printk(KERN_DEBUG 1054 printk(KERN_DEBUG
598 "sflush_us: cycles spent in uv_flush_tlb_others()\n"); 1055 "canc: number messages canceled by retries\n");
599 printk(KERN_DEBUG 1056 printk(KERN_DEBUG
600 "dflush_us: cycles spent in handling flush requests\n"); 1057 "nocan: number retries that found nothing to cancel\n");
601 printk(KERN_DEBUG "sok: successes on retry\n");
602 printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
603 printk(KERN_DEBUG 1058 printk(KERN_DEBUG
604 "dmult: interrupts with multiple messages\n"); 1059 "reset: number of ipi-style reset requests processed\n");
605 printk(KERN_DEBUG "starget: nodes targeted\n"); 1060 printk(KERN_DEBUG
1061 "rcan: number messages canceled by reset requests\n");
1062 } else if (input_arg == -1) {
1063 for_each_present_cpu(cpu) {
1064 stat = &per_cpu(ptcstats, cpu);
1065 memset(stat, 0, sizeof(struct ptc_stats));
1066 }
606 } else { 1067 } else {
607 uv_bau_retry_limit = newmode; 1068 uv_bau_max_concurrent = input_arg;
608 printk(KERN_DEBUG "timeout retry limit:%d\n", 1069 bcp = &per_cpu(bau_control, smp_processor_id());
609 uv_bau_retry_limit); 1070 if (uv_bau_max_concurrent < 1 ||
1071 uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
1072 printk(KERN_DEBUG
1073 "Error: BAU max concurrent %d; %d is invalid\n",
1074 bcp->max_concurrent, uv_bau_max_concurrent);
1075 return -EINVAL;
1076 }
1077 printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
1078 uv_bau_max_concurrent);
1079 for_each_present_cpu(cpu) {
1080 bcp = &per_cpu(bau_control, cpu);
1081 bcp->max_concurrent = uv_bau_max_concurrent;
1082 }
610 } 1083 }
611 1084
612 return count; 1085 return count;
@@ -650,79 +1123,30 @@ static int __init uv_ptc_init(void)
650} 1123}
651 1124
652/* 1125/*
653 * begin the initialization of the per-blade control structures
654 */
655static struct bau_control * __init uv_table_bases_init(int blade, int node)
656{
657 int i;
658 struct bau_msg_status *msp;
659 struct bau_control *bau_tabp;
660
661 bau_tabp =
662 kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
663 BUG_ON(!bau_tabp);
664
665 bau_tabp->msg_statuses =
666 kmalloc_node(sizeof(struct bau_msg_status) *
667 DEST_Q_SIZE, GFP_KERNEL, node);
668 BUG_ON(!bau_tabp->msg_statuses);
669
670 for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
671 bau_cpubits_clear(&msp->seen_by, (int)
672 uv_blade_nr_possible_cpus(blade));
673
674 uv_bau_table_bases[blade] = bau_tabp;
675
676 return bau_tabp;
677}
678
679/*
680 * finish the initialization of the per-blade control structures
681 */
682static void __init
683uv_table_bases_finish(int blade,
684 struct bau_control *bau_tablesp,
685 struct bau_desc *adp)
686{
687 struct bau_control *bcp;
688 int cpu;
689
690 for_each_present_cpu(cpu) {
691 if (blade != uv_cpu_to_blade_id(cpu))
692 continue;
693
694 bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
695 bcp->bau_msg_head = bau_tablesp->va_queue_first;
696 bcp->va_queue_first = bau_tablesp->va_queue_first;
697 bcp->va_queue_last = bau_tablesp->va_queue_last;
698 bcp->msg_statuses = bau_tablesp->msg_statuses;
699 bcp->descriptor_base = adp;
700 }
701}
702
703/*
704 * initialize the sending side's sending buffers 1126 * initialize the sending side's sending buffers
705 */ 1127 */
706static struct bau_desc * __init 1128static void
707uv_activation_descriptor_init(int node, int pnode) 1129uv_activation_descriptor_init(int node, int pnode)
708{ 1130{
709 int i; 1131 int i;
1132 int cpu;
710 unsigned long pa; 1133 unsigned long pa;
711 unsigned long m; 1134 unsigned long m;
712 unsigned long n; 1135 unsigned long n;
713 struct bau_desc *adp; 1136 struct bau_desc *bau_desc;
714 struct bau_desc *ad2; 1137 struct bau_desc *bd2;
1138 struct bau_control *bcp;
715 1139
716 /* 1140 /*
717 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1141 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
718 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade 1142 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
719 */ 1143 */
720 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* 1144 bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
721 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1145 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
722 BUG_ON(!adp); 1146 BUG_ON(!bau_desc);
723 1147
724 pa = uv_gpa(adp); /* need the real nasid*/ 1148 pa = uv_gpa(bau_desc); /* need the real nasid*/
725 n = uv_gpa_to_pnode(pa); 1149 n = pa >> uv_nshift;
726 m = pa & uv_mmask; 1150 m = pa & uv_mmask;
727 1151
728 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 1152 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -731,96 +1155,188 @@ uv_activation_descriptor_init(int node, int pnode)
731 /* 1155 /*
732 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 1156 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
733 * cpu even though we only use the first one; one descriptor can 1157 * cpu even though we only use the first one; one descriptor can
734 * describe a broadcast to 256 nodes. 1158 * describe a broadcast to 256 uv hubs.
735 */ 1159 */
736 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); 1160 for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
737 i++, ad2++) { 1161 i++, bd2++) {
738 memset(ad2, 0, sizeof(struct bau_desc)); 1162 memset(bd2, 0, sizeof(struct bau_desc));
739 ad2->header.sw_ack_flag = 1; 1163 bd2->header.sw_ack_flag = 1;
740 /* 1164 /*
741 * base_dest_nodeid is the first node in the partition, so 1165 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
742 * the bit map will indicate partition-relative node numbers. 1166 * in the partition. The bit map will indicate uvhub numbers,
743 * note that base_dest_nodeid is actually a nasid. 1167 * which are 0-N in a partition. Pnodes are unique system-wide.
744 */ 1168 */
745 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 1169 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
746 ad2->header.dest_subnodeid = 0x10; /* the LB */ 1170 bd2->header.dest_subnodeid = 0x10; /* the LB */
747 ad2->header.command = UV_NET_ENDPOINT_INTD; 1171 bd2->header.command = UV_NET_ENDPOINT_INTD;
748 ad2->header.int_both = 1; 1172 bd2->header.int_both = 1;
749 /* 1173 /*
750 * all others need to be set to zero: 1174 * all others need to be set to zero:
751 * fairness chaining multilevel count replied_to 1175 * fairness chaining multilevel count replied_to
752 */ 1176 */
753 } 1177 }
754 return adp; 1178 for_each_present_cpu(cpu) {
1179 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1180 continue;
1181 bcp = &per_cpu(bau_control, cpu);
1182 bcp->descriptor_base = bau_desc;
1183 }
755} 1184}
756 1185
757/* 1186/*
758 * initialize the destination side's receiving buffers 1187 * initialize the destination side's receiving buffers
1188 * entered for each uvhub in the partition
1189 * - node is first node (kernel memory notion) on the uvhub
1190 * - pnode is the uvhub's physical identifier
759 */ 1191 */
760static struct bau_payload_queue_entry * __init 1192static void
761uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) 1193uv_payload_queue_init(int node, int pnode)
762{ 1194{
763 struct bau_payload_queue_entry *pqp;
764 unsigned long pa;
765 int pn; 1195 int pn;
1196 int cpu;
766 char *cp; 1197 char *cp;
1198 unsigned long pa;
1199 struct bau_payload_queue_entry *pqp;
1200 struct bau_payload_queue_entry *pqp_malloc;
1201 struct bau_control *bcp;
767 1202
768 pqp = (struct bau_payload_queue_entry *) kmalloc_node( 1203 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
769 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), 1204 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
770 GFP_KERNEL, node); 1205 GFP_KERNEL, node);
771 BUG_ON(!pqp); 1206 BUG_ON(!pqp);
1207 pqp_malloc = pqp;
772 1208
773 cp = (char *)pqp + 31; 1209 cp = (char *)pqp + 31;
774 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); 1210 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
775 bau_tablesp->va_queue_first = pqp; 1211
1212 for_each_present_cpu(cpu) {
1213 if (pnode != uv_cpu_to_pnode(cpu))
1214 continue;
1215 /* for every cpu on this pnode: */
1216 bcp = &per_cpu(bau_control, cpu);
1217 bcp->va_queue_first = pqp;
1218 bcp->bau_msg_head = pqp;
1219 bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1220 }
776 /* 1221 /*
777 * need the pnode of where the memory was really allocated 1222 * need the pnode of where the memory was really allocated
778 */ 1223 */
779 pa = uv_gpa(pqp); 1224 pa = uv_gpa(pqp);
780 pn = uv_gpa_to_pnode(pa); 1225 pn = pa >> uv_nshift;
781 uv_write_global_mmr64(pnode, 1226 uv_write_global_mmr64(pnode,
782 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 1227 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
783 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 1228 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
784 uv_physnodeaddr(pqp)); 1229 uv_physnodeaddr(pqp));
785 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, 1230 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
786 uv_physnodeaddr(pqp)); 1231 uv_physnodeaddr(pqp));
787 bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
788 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, 1232 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
789 (unsigned long) 1233 (unsigned long)
790 uv_physnodeaddr(bau_tablesp->va_queue_last)); 1234 uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1235 /* in effect, all msg_type's are set to MSG_NOOP */
791 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); 1236 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
792
793 return pqp;
794} 1237}
795 1238
796/* 1239/*
797 * Initialization of each UV blade's structures 1240 * Initialization of each UV hub's structures
798 */ 1241 */
799static int __init uv_init_blade(int blade) 1242static void __init uv_init_uvhub(int uvhub, int vector)
800{ 1243{
801 int node; 1244 int node;
802 int pnode; 1245 int pnode;
803 unsigned long pa;
804 unsigned long apicid; 1246 unsigned long apicid;
805 struct bau_desc *adp; 1247
806 struct bau_payload_queue_entry *pqp; 1248 node = uvhub_to_first_node(uvhub);
807 struct bau_control *bau_tablesp; 1249 pnode = uv_blade_to_pnode(uvhub);
808 1250 uv_activation_descriptor_init(node, pnode);
809 node = blade_to_first_node(blade); 1251 uv_payload_queue_init(node, pnode);
810 bau_tablesp = uv_table_bases_init(blade, node);
811 pnode = uv_blade_to_pnode(blade);
812 adp = uv_activation_descriptor_init(node, pnode);
813 pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
814 uv_table_bases_finish(blade, bau_tablesp, adp);
815 /* 1252 /*
816 * the below initialization can't be in firmware because the 1253 * the below initialization can't be in firmware because the
817 * messaging IRQ will be determined by the OS 1254 * messaging IRQ will be determined by the OS
818 */ 1255 */
819 apicid = blade_to_first_apicid(blade); 1256 apicid = uvhub_to_first_apicid(uvhub);
820 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1257 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 1258 ((apicid << 32) | vector));
823 return 0; 1259}
1260
1261/*
1262 * initialize the bau_control structure for each cpu
1263 */
1264static void uv_init_per_cpu(int nuvhubs)
1265{
1266 int i, j, k;
1267 int cpu;
1268 int pnode;
1269 int uvhub;
1270 short socket = 0;
1271 struct bau_control *bcp;
1272 struct uvhub_desc *bdp;
1273 struct socket_desc *sdp;
1274 struct bau_control *hmaster = NULL;
1275 struct bau_control *smaster = NULL;
1276 struct socket_desc {
1277 short num_cpus;
1278 short cpu_number[16];
1279 };
1280 struct uvhub_desc {
1281 short num_sockets;
1282 short num_cpus;
1283 short uvhub;
1284 short pnode;
1285 struct socket_desc socket[2];
1286 };
1287 struct uvhub_desc *uvhub_descs;
1288
1289 uvhub_descs = (struct uvhub_desc *)
1290 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1291 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1292 for_each_present_cpu(cpu) {
1293 bcp = &per_cpu(bau_control, cpu);
1294 memset(bcp, 0, sizeof(struct bau_control));
1295 spin_lock_init(&bcp->masks_lock);
1296 bcp->max_concurrent = uv_bau_max_concurrent;
1297 pnode = uv_cpu_hub_info(cpu)->pnode;
1298 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1299 bdp = &uvhub_descs[uvhub];
1300 bdp->num_cpus++;
1301 bdp->uvhub = uvhub;
1302 bdp->pnode = pnode;
1303 /* time interval to catch a hardware stay-busy bug */
1304 bcp->timeout_interval = millisec_2_cycles(3);
1305 /* kludge: assume uv_hub.h is constant */
1306 socket = (cpu_physical_id(cpu)>>5)&1;
1307 if (socket >= bdp->num_sockets)
1308 bdp->num_sockets = socket+1;
1309 sdp = &bdp->socket[socket];
1310 sdp->cpu_number[sdp->num_cpus] = cpu;
1311 sdp->num_cpus++;
1312 }
1313 socket = 0;
1314 for_each_possible_blade(uvhub) {
1315 bdp = &uvhub_descs[uvhub];
1316 for (i = 0; i < bdp->num_sockets; i++) {
1317 sdp = &bdp->socket[i];
1318 for (j = 0; j < sdp->num_cpus; j++) {
1319 cpu = sdp->cpu_number[j];
1320 bcp = &per_cpu(bau_control, cpu);
1321 bcp->cpu = cpu;
1322 if (j == 0) {
1323 smaster = bcp;
1324 if (i == 0)
1325 hmaster = bcp;
1326 }
1327 bcp->cpus_in_uvhub = bdp->num_cpus;
1328 bcp->cpus_in_socket = sdp->num_cpus;
1329 bcp->socket_master = smaster;
1330 bcp->uvhub_master = hmaster;
1331 for (k = 0; k < DEST_Q_SIZE; k++)
1332 bcp->socket_acknowledge_count[k] = 0;
1333 bcp->uvhub_cpu =
1334 uv_cpu_hub_info(cpu)->blade_processor_id;
1335 }
1336 socket++;
1337 }
1338 }
1339 kfree(uvhub_descs);
824} 1340}
825 1341
826/* 1342/*
@@ -828,38 +1344,54 @@ static int __init uv_init_blade(int blade)
828 */ 1344 */
829static int __init uv_bau_init(void) 1345static int __init uv_bau_init(void)
830{ 1346{
831 int blade; 1347 int uvhub;
832 int nblades; 1348 int pnode;
1349 int nuvhubs;
833 int cur_cpu; 1350 int cur_cpu;
1351 int vector;
1352 unsigned long mmr;
834 1353
835 if (!is_uv_system()) 1354 if (!is_uv_system())
836 return 0; 1355 return 0;
837 1356
1357 if (nobau)
1358 return 0;
1359
838 for_each_possible_cpu(cur_cpu) 1360 for_each_possible_cpu(cur_cpu)
839 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1361 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
840 GFP_KERNEL, cpu_to_node(cur_cpu)); 1362 GFP_KERNEL, cpu_to_node(cur_cpu));
841 1363
842 uv_bau_retry_limit = 1; 1364 uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
1365 uv_nshift = uv_hub_info->m_val;
843 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1366 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
844 nblades = uv_num_possible_blades(); 1367 nuvhubs = uv_num_possible_blades();
845 1368
846 uv_bau_table_bases = (struct bau_control **) 1369 uv_init_per_cpu(nuvhubs);
847 kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
848 BUG_ON(!uv_bau_table_bases);
849 1370
850 uv_partition_base_pnode = 0x7fffffff; 1371 uv_partition_base_pnode = 0x7fffffff;
851 for (blade = 0; blade < nblades; blade++) 1372 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
852 if (uv_blade_nr_possible_cpus(blade) && 1373 if (uv_blade_nr_possible_cpus(uvhub) &&
853 (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) 1374 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
854 uv_partition_base_pnode = uv_blade_to_pnode(blade); 1375 uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
855 for (blade = 0; blade < nblades; blade++) 1376
856 if (uv_blade_nr_possible_cpus(blade)) 1377 vector = UV_BAU_MESSAGE;
857 uv_init_blade(blade); 1378 for_each_possible_blade(uvhub)
858 1379 if (uv_blade_nr_possible_cpus(uvhub))
859 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 1380 uv_init_uvhub(uvhub, vector);
1381
860 uv_enable_timeouts(); 1382 uv_enable_timeouts();
1383 alloc_intr_gate(vector, uv_bau_message_intr1);
1384
1385 for_each_possible_blade(uvhub) {
1386 pnode = uv_blade_to_pnode(uvhub);
1387 /* INIT the bau */
1388 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1389 ((unsigned long)1 << 63));
1390 mmr = 1; /* should be 1 to broadcast to both sockets */
1391 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
1392 }
861 1393
862 return 0; 1394 return 0;
863} 1395}
864__initcall(uv_bau_init); 1396core_initcall(uv_bau_init);
865__initcall(uv_ptc_init); 1397core_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e445418..142d70c74b0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
15#include <linux/kprobes.h> 15#include <linux/kprobes.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/kdebug.h> 17#include <linux/kdebug.h>
18#include <linux/kgdb.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/ptrace.h> 21#include <linux/ptrace.h>
@@ -108,15 +109,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 dec_preempt_count(); 109 dec_preempt_count();
109} 110}
110 111
111#ifdef CONFIG_X86_32
112static inline void
113die_if_kernel(const char *str, struct pt_regs *regs, long err)
114{
115 if (!user_mode_vm(regs))
116 die(str, regs, err);
117}
118#endif
119
120static void __kprobes 112static void __kprobes
121do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, 113do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
122 long error_code, siginfo_t *info) 114 long error_code, siginfo_t *info)
@@ -460,6 +452,11 @@ void restart_nmi(void)
460/* May run on IST stack. */ 452/* May run on IST stack. */
461dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) 453dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
462{ 454{
455#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
456 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
457 == NOTIFY_STOP)
458 return;
459#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
463#ifdef CONFIG_KPROBES 460#ifdef CONFIG_KPROBES
464 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 461 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
465 == NOTIFY_STOP) 462 == NOTIFY_STOP)
@@ -543,11 +540,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
543 540
544 /* DR6 may or may not be cleared by the CPU */ 541 /* DR6 may or may not be cleared by the CPU */
545 set_debugreg(0, 6); 542 set_debugreg(0, 6);
543
546 /* 544 /*
547 * The processor cleared BTF, so don't mark that we need it set. 545 * The processor cleared BTF, so don't mark that we need it set.
548 */ 546 */
549 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 547 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
550 tsk->thread.debugctlmsr = 0;
551 548
552 /* Store the virtualized DR6 value */ 549 /* Store the virtualized DR6 value */
553 tsk->thread.debugreg6 = dr6; 550 tsk->thread.debugreg6 = dr6;
@@ -585,55 +582,67 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
585 return; 582 return;
586} 583}
587 584
588#ifdef CONFIG_X86_64
589static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
590{
591 if (fixup_exception(regs))
592 return 1;
593
594 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
595 /* Illegal floating point operation in the kernel */
596 current->thread.trap_no = trapnr;
597 die(str, regs, 0);
598 return 0;
599}
600#endif
601
602/* 585/*
603 * Note that we play around with the 'TS' bit in an attempt to get 586 * Note that we play around with the 'TS' bit in an attempt to get
604 * the correct behaviour even in the presence of the asynchronous 587 * the correct behaviour even in the presence of the asynchronous
605 * IRQ13 behaviour 588 * IRQ13 behaviour
606 */ 589 */
607void math_error(void __user *ip) 590void math_error(struct pt_regs *regs, int error_code, int trapnr)
608{ 591{
609 struct task_struct *task; 592 struct task_struct *task = current;
610 siginfo_t info; 593 siginfo_t info;
611 unsigned short cwd, swd, err; 594 unsigned short err;
595 char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
596
597 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
598 return;
599 conditional_sti(regs);
600
601 if (!user_mode_vm(regs))
602 {
603 if (!fixup_exception(regs)) {
604 task->thread.error_code = error_code;
605 task->thread.trap_no = trapnr;
606 die(str, regs, error_code);
607 }
608 return;
609 }
612 610
613 /* 611 /*
614 * Save the info for the exception handler and clear the error. 612 * Save the info for the exception handler and clear the error.
615 */ 613 */
616 task = current;
617 save_init_fpu(task); 614 save_init_fpu(task);
618 task->thread.trap_no = 16; 615 task->thread.trap_no = trapnr;
619 task->thread.error_code = 0; 616 task->thread.error_code = error_code;
620 info.si_signo = SIGFPE; 617 info.si_signo = SIGFPE;
621 info.si_errno = 0; 618 info.si_errno = 0;
622 info.si_addr = ip; 619 info.si_addr = (void __user *)regs->ip;
623 /* 620 if (trapnr == 16) {
624 * (~cwd & swd) will mask out exceptions that are not set to unmasked 621 unsigned short cwd, swd;
625 * status. 0x3f is the exception bits in these regs, 0x200 is the 622 /*
626 * C1 reg you need in case of a stack fault, 0x040 is the stack 623 * (~cwd & swd) will mask out exceptions that are not set to unmasked
627 * fault bit. We should only be taking one exception at a time, 624 * status. 0x3f is the exception bits in these regs, 0x200 is the
628 * so if this combination doesn't produce any single exception, 625 * C1 reg you need in case of a stack fault, 0x040 is the stack
629 * then we have a bad program that isn't synchronizing its FPU usage 626 * fault bit. We should only be taking one exception at a time,
630 * and it will suffer the consequences since we won't be able to 627 * so if this combination doesn't produce any single exception,
631 * fully reproduce the context of the exception 628 * then we have a bad program that isn't synchronizing its FPU usage
632 */ 629 * and it will suffer the consequences since we won't be able to
633 cwd = get_fpu_cwd(task); 630 * fully reproduce the context of the exception
634 swd = get_fpu_swd(task); 631 */
632 cwd = get_fpu_cwd(task);
633 swd = get_fpu_swd(task);
635 634
636 err = swd & ~cwd; 635 err = swd & ~cwd;
636 } else {
637 /*
638 * The SIMD FPU exceptions are handled a little differently, as there
639 * is only a single status/control register. Thus, to determine which
640 * unmasked exception was caught we must mask the exception mask bits
641 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
642 */
643 unsigned short mxcsr = get_fpu_mxcsr(task);
644 err = ~(mxcsr >> 7) & mxcsr;
645 }
637 646
638 if (err & 0x001) { /* Invalid op */ 647 if (err & 0x001) { /* Invalid op */
639 /* 648 /*
@@ -662,97 +671,17 @@ void math_error(void __user *ip)
662 671
663dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 672dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
664{ 673{
665 conditional_sti(regs);
666
667#ifdef CONFIG_X86_32 674#ifdef CONFIG_X86_32
668 ignore_fpu_irq = 1; 675 ignore_fpu_irq = 1;
669#else
670 if (!user_mode(regs) &&
671 kernel_math_error(regs, "kernel x87 math error", 16))
672 return;
673#endif 676#endif
674 677
675 math_error((void __user *)regs->ip); 678 math_error(regs, error_code, 16);
676}
677
678static void simd_math_error(void __user *ip)
679{
680 struct task_struct *task;
681 siginfo_t info;
682 unsigned short mxcsr;
683
684 /*
685 * Save the info for the exception handler and clear the error.
686 */
687 task = current;
688 save_init_fpu(task);
689 task->thread.trap_no = 19;
690 task->thread.error_code = 0;
691 info.si_signo = SIGFPE;
692 info.si_errno = 0;
693 info.si_code = __SI_FAULT;
694 info.si_addr = ip;
695 /*
696 * The SIMD FPU exceptions are handled a little differently, as there
697 * is only a single status/control register. Thus, to determine which
698 * unmasked exception was caught we must mask the exception mask bits
699 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
700 */
701 mxcsr = get_fpu_mxcsr(task);
702 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
703 case 0x000:
704 default:
705 break;
706 case 0x001: /* Invalid Op */
707 info.si_code = FPE_FLTINV;
708 break;
709 case 0x002: /* Denormalize */
710 case 0x010: /* Underflow */
711 info.si_code = FPE_FLTUND;
712 break;
713 case 0x004: /* Zero Divide */
714 info.si_code = FPE_FLTDIV;
715 break;
716 case 0x008: /* Overflow */
717 info.si_code = FPE_FLTOVF;
718 break;
719 case 0x020: /* Precision */
720 info.si_code = FPE_FLTRES;
721 break;
722 }
723 force_sig_info(SIGFPE, &info, task);
724} 679}
725 680
726dotraplinkage void 681dotraplinkage void
727do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 682do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
728{ 683{
729 conditional_sti(regs); 684 math_error(regs, error_code, 19);
730
731#ifdef CONFIG_X86_32
732 if (cpu_has_xmm) {
733 /* Handle SIMD FPU exceptions on PIII+ processors. */
734 ignore_fpu_irq = 1;
735 simd_math_error((void __user *)regs->ip);
736 return;
737 }
738 /*
739 * Handle strange cache flush from user space exception
740 * in all other cases. This is undocumented behaviour.
741 */
742 if (regs->flags & X86_VM_MASK) {
743 handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
744 return;
745 }
746 current->thread.trap_no = 19;
747 current->thread.error_code = error_code;
748 die_if_kernel("cache flush denied", regs, error_code);
749 force_sig(SIGSEGV, current);
750#else
751 if (!user_mode(regs) &&
752 kernel_math_error(regs, "kernel simd math error", 19))
753 return;
754 simd_math_error((void __user *)regs->ip);
755#endif
756} 685}
757 686
758dotraplinkage void 687dotraplinkage void
@@ -879,6 +808,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
879} 808}
880#endif 809#endif
881 810
811/* Set of traps needed for early debugging. */
812void __init early_trap_init(void)
813{
814 set_intr_gate_ist(1, &debug, DEBUG_STACK);
815 /* int3 can be called from all */
816 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
817 set_intr_gate(14, &page_fault);
818 load_idt(&idt_descr);
819}
820
882void __init trap_init(void) 821void __init trap_init(void)
883{ 822{
884 int i; 823 int i;
@@ -892,10 +831,7 @@ void __init trap_init(void)
892#endif 831#endif
893 832
894 set_intr_gate(0, &divide_error); 833 set_intr_gate(0, &divide_error);
895 set_intr_gate_ist(1, &debug, DEBUG_STACK);
896 set_intr_gate_ist(2, &nmi, NMI_STACK); 834 set_intr_gate_ist(2, &nmi, NMI_STACK);
897 /* int3 can be called from all */
898 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
899 /* int4 can be called from all */ 835 /* int4 can be called from all */
900 set_system_intr_gate(4, &overflow); 836 set_system_intr_gate(4, &overflow);
901 set_intr_gate(5, &bounds); 837 set_intr_gate(5, &bounds);
@@ -911,7 +847,6 @@ void __init trap_init(void)
911 set_intr_gate(11, &segment_not_present); 847 set_intr_gate(11, &segment_not_present);
912 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); 848 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
913 set_intr_gate(13, &general_protection); 849 set_intr_gate(13, &general_protection);
914 set_intr_gate(14, &page_fault);
915 set_intr_gate(15, &spurious_interrupt_bug); 850 set_intr_gate(15, &spurious_interrupt_bug);
916 set_intr_gate(16, &coprocessor_error); 851 set_intr_gate(16, &coprocessor_error);
917 set_intr_gate(17, &alignment_check); 852 set_intr_gate(17, &alignment_check);
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 1d40336b030..1132129db79 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -44,7 +44,7 @@ static void uv_ack_apic(unsigned int irq)
44 ack_APIC_irq(); 44 ack_APIC_irq();
45} 45}
46 46
47struct irq_chip uv_irq_chip = { 47static struct irq_chip uv_irq_chip = {
48 .name = "UV-CORE", 48 .name = "UV-CORE",
49 .startup = uv_noop_ret, 49 .startup = uv_noop_ret,
50 .shutdown = uv_noop, 50 .shutdown = uv_noop,
@@ -141,7 +141,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
141 */ 141 */
142static int 142static int
143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, 143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int restrict) 144 unsigned long mmr_offset, int limit)
145{ 145{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu); 146 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq); 147 struct irq_desc *desc = irq_to_desc(irq);
@@ -160,7 +160,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
160 if (err != 0) 160 if (err != 0)
161 return err; 161 return err;
162 162
163 if (restrict == UV_AFFINITY_CPU) 163 if (limit == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING; 164 desc->status |= IRQ_NO_BALANCING;
165 else 165 else
166 desc->status |= IRQ_MOVE_PCNTXT; 166 desc->status |= IRQ_MOVE_PCNTXT;
@@ -214,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
214 unsigned long mmr_value; 214 unsigned long mmr_value;
215 struct uv_IO_APIC_route_entry *entry; 215 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset; 216 unsigned long mmr_offset;
217 unsigned mmr_pnode; 217 int mmr_pnode;
218 218
219 if (set_desc_affinity(desc, mask, &dest)) 219 if (set_desc_affinity(desc, mask, &dest))
220 return -1; 220 return -1;
@@ -248,7 +248,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
248 * interrupt is raised. 248 * interrupt is raised.
249 */ 249 */
250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
251 unsigned long mmr_offset, int restrict) 251 unsigned long mmr_offset, int limit)
252{ 252{
253 int irq, ret; 253 int irq, ret;
254 254
@@ -258,7 +258,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
258 return -EBUSY; 258 return -EBUSY;
259 259
260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, 260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
261 restrict); 261 limit);
262 if (ret == irq) 262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); 263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else 264 else
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b2249..1b950d151e5 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55 55
56EXPORT_SYMBOL(empty_zero_page); 56EXPORT_SYMBOL(empty_zero_page);
57EXPORT_SYMBOL(init_level4_pgt);
58#ifndef CONFIG_PARAVIRT 57#ifndef CONFIG_PARAVIRT
59EXPORT_SYMBOL(native_load_gs_index); 58EXPORT_SYMBOL(native_load_gs_index);
60#endif 59#endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec..37e68fc5e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
99 if (err) 99 if (err)
100 return err; 100 return err;
101 101
102 if (task_thread_info(tsk)->status & TS_XSAVE) 102 if (use_xsave())
103 err = xsave_user(buf); 103 err = xsave_user(buf);
104 else 104 else
105 err = fxsave_user(buf); 105 err = fxsave_user(buf);
@@ -109,14 +109,14 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 110 stts();
111 } else { 111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, 112 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 113 xstate_size))
114 return -1; 114 return -1;
115 } 115 }
116 116
117 clear_used_math(); /* trigger finit */ 117 clear_used_math(); /* trigger finit */
118 118
119 if (task_thread_info(tsk)->status & TS_XSAVE) { 119 if (use_xsave()) {
120 struct _fpstate __user *fx = buf; 120 struct _fpstate __user *fx = buf;
121 struct _xstate __user *x = buf; 121 struct _xstate __user *x = buf;
122 u64 xstate_bv; 122 u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
225 clts(); 225 clts();
226 task_thread_info(current)->status |= TS_USEDFPU; 226 task_thread_info(current)->status |= TS_USEDFPU;
227 } 227 }
228 if (task_thread_info(tsk)->status & TS_XSAVE) 228 if (use_xsave())
229 err = restore_user_xstate(buf); 229 err = restore_user_xstate(buf);
230 else 230 else
231 err = fxrstor_checking((__force struct i387_fxsave_struct *) 231 err = fxrstor_checking((__force struct i387_fxsave_struct *)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac082..5ac0bb465ed 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "x86.h" 35#include "x86.h"
36#include "tss.h"
36 37
37/* 38/*
38 * Opcode effective-address decode tables. 39 * Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
50#define DstReg (2<<1) /* Register operand. */ 51#define DstReg (2<<1) /* Register operand. */
51#define DstMem (3<<1) /* Memory operand. */ 52#define DstMem (3<<1) /* Memory operand. */
52#define DstAcc (4<<1) /* Destination Accumulator */ 53#define DstAcc (4<<1) /* Destination Accumulator */
54#define DstDI (5<<1) /* Destination is in ES:(E)DI */
55#define DstMem64 (6<<1) /* 64bit memory operand */
53#define DstMask (7<<1) 56#define DstMask (7<<1)
54/* Source operand type. */ 57/* Source operand type. */
55#define SrcNone (0<<4) /* No source operand. */ 58#define SrcNone (0<<4) /* No source operand. */
@@ -63,6 +66,7 @@
63#define SrcOne (7<<4) /* Implied '1' */ 66#define SrcOne (7<<4) /* Implied '1' */
64#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
65#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 68#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
66#define SrcMask (0xf<<4) 70#define SrcMask (0xf<<4)
67/* Generic ModRM decode. */ 71/* Generic ModRM decode. */
68#define ModRM (1<<8) 72#define ModRM (1<<8)
@@ -85,6 +89,9 @@
85#define Src2ImmByte (2<<29) 89#define Src2ImmByte (2<<29)
86#define Src2One (3<<29) 90#define Src2One (3<<29)
87#define Src2Imm16 (4<<29) 91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
88#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
89 96
90enum { 97enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
147 0, 0, 0, 0, 154 0, 0, 0, 0,
148 /* 0x68 - 0x6F */ 155 /* 0x68 - 0x6F */
149 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
150 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
151 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
152 /* 0x70 - 0x77 */ 159 /* 0x70 - 0x77 */
153 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
154 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
173 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
174 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
175 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
176 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
177 ByteOp | ImplicitOps | String, ImplicitOps | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
178 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
179 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
180 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
181 ByteOp | ImplicitOps | String, ImplicitOps | String, 188 ByteOp | DstDI | String, DstDI | String,
182 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
183 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
184 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
204 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 0xE0 - 0xE7 */ 212 /* 0xE0 - 0xE7 */
206 0, 0, 0, 0, 213 0, 0, 0, 0,
207 ByteOp | SrcImmUByte, SrcImmUByte, 214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
208 ByteOp | SrcImmUByte, SrcImmUByte, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
209 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
210 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
211 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
212 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
214 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
215 0, 0, 0, 0, 222 0, 0, 0, 0,
216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, 223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
343 [Group5*8] = 350 [Group5*8] =
344 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
345 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
347 [Group7*8] = 355 [Group7*8] =
348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
349 SrcNone | ModRM | DstMem | Mov, 0, 357 SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, 361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, 362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] = 363 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, 364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
357}; 365};
358 366
359static u32 group2_table[] = { 367static u32 group2_table[] = {
360 [Group7*8] = 368 [Group7*8] =
361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, 369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
362 SrcNone | ModRM | DstMem | Mov, 0, 370 SrcNone | ModRM | DstMem | Mov, 0,
363 SrcMem16 | ModRM | Mov, 0, 371 SrcMem16 | ModRM | Mov | Priv, 0,
364 [Group9*8] = 372 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0, 373 0, 0, 0, 0, 0, 0, 0, 0,
366}; 374};
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
562#define insn_fetch(_type, _size, _eip) \ 570#define insn_fetch(_type, _size, _eip) \
563({ unsigned long _x; \ 571({ unsigned long _x; \
564 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ 572 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
565 if (rc != 0) \ 573 if (rc != X86EMUL_CONTINUE) \
566 goto done; \ 574 goto done; \
567 (_eip) += (_size); \ 575 (_eip) += (_size); \
568 (_type)_x; \ 576 (_type)_x; \
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
638 646
639static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
640 struct x86_emulate_ops *ops, 648 struct x86_emulate_ops *ops,
641 unsigned long linear, u8 *dest) 649 unsigned long eip, u8 *dest)
642{ 650{
643 struct fetch_cache *fc = &ctxt->decode.fetch; 651 struct fetch_cache *fc = &ctxt->decode.fetch;
644 int rc; 652 int rc;
645 int size; 653 int size, cur_size;
646 654
647 if (linear < fc->start || linear >= fc->end) { 655 if (eip == fc->end) {
648 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 656 cur_size = fc->end - fc->start;
649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); 657 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
650 if (rc) 658 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
659 size, ctxt->vcpu, NULL);
660 if (rc != X86EMUL_CONTINUE)
651 return rc; 661 return rc;
652 fc->start = linear; 662 fc->end += size;
653 fc->end = linear + size;
654 } 663 }
655 *dest = fc->data[linear - fc->start]; 664 *dest = fc->data[eip - fc->start];
656 return 0; 665 return X86EMUL_CONTINUE;
657} 666}
658 667
659static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 668static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
660 struct x86_emulate_ops *ops, 669 struct x86_emulate_ops *ops,
661 unsigned long eip, void *dest, unsigned size) 670 unsigned long eip, void *dest, unsigned size)
662{ 671{
663 int rc = 0; 672 int rc;
664 673
665 /* x86 instructions are limited to 15 bytes. */ 674 /* x86 instructions are limited to 15 bytes. */
666 if (eip + size - ctxt->decode.eip_orig > 15) 675 if (eip + size - ctxt->eip > 15)
667 return X86EMUL_UNHANDLEABLE; 676 return X86EMUL_UNHANDLEABLE;
668 eip += ctxt->cs_base;
669 while (size--) { 677 while (size--) {
670 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 678 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
671 if (rc) 679 if (rc != X86EMUL_CONTINUE)
672 return rc; 680 return rc;
673 } 681 }
674 return 0; 682 return X86EMUL_CONTINUE;
675} 683}
676 684
677/* 685/*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
702 *address = 0; 710 *address = 0;
703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 711 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
704 ctxt->vcpu, NULL); 712 ctxt->vcpu, NULL);
705 if (rc) 713 if (rc != X86EMUL_CONTINUE)
706 return rc; 714 return rc;
707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 715 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
708 ctxt->vcpu, NULL); 716 ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
782 struct decode_cache *c = &ctxt->decode; 790 struct decode_cache *c = &ctxt->decode;
783 u8 sib; 791 u8 sib;
784 int index_reg = 0, base_reg = 0, scale; 792 int index_reg = 0, base_reg = 0, scale;
785 int rc = 0; 793 int rc = X86EMUL_CONTINUE;
786 794
787 if (c->rex_prefix) { 795 if (c->rex_prefix) {
788 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 796 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
895 struct x86_emulate_ops *ops) 903 struct x86_emulate_ops *ops)
896{ 904{
897 struct decode_cache *c = &ctxt->decode; 905 struct decode_cache *c = &ctxt->decode;
898 int rc = 0; 906 int rc = X86EMUL_CONTINUE;
899 907
900 switch (c->ad_bytes) { 908 switch (c->ad_bytes) {
901 case 2: 909 case 2:
@@ -916,14 +924,18 @@ int
916x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 924x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
917{ 925{
918 struct decode_cache *c = &ctxt->decode; 926 struct decode_cache *c = &ctxt->decode;
919 int rc = 0; 927 int rc = X86EMUL_CONTINUE;
920 int mode = ctxt->mode; 928 int mode = ctxt->mode;
921 int def_op_bytes, def_ad_bytes, group; 929 int def_op_bytes, def_ad_bytes, group;
922 930
923 /* Shadow copy of register state. Committed on successful emulation. */
924 931
932 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart);
934
935 /* Shadow copy of register state. Committed on successful emulation. */
925 memset(c, 0, sizeof(struct decode_cache)); 936 memset(c, 0, sizeof(struct decode_cache));
926 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); 937 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip;
927 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
928 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
929 941
@@ -1015,11 +1027,6 @@ done_prefixes:
1015 } 1027 }
1016 } 1028 }
1017 1029
1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
1020 return -1;
1021 }
1022
1023 if (c->d & Group) { 1030 if (c->d & Group) {
1024 group = c->d & GroupMask; 1031 group = c->d & GroupMask;
1025 c->modrm = insn_fetch(u8, 1, c->eip); 1032 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
1046 rc = decode_modrm(ctxt, ops); 1053 rc = decode_modrm(ctxt, ops);
1047 else if (c->d & MemAbs) 1054 else if (c->d & MemAbs)
1048 rc = decode_abs(ctxt, ops); 1055 rc = decode_abs(ctxt, ops);
1049 if (rc) 1056 if (rc != X86EMUL_CONTINUE)
1050 goto done; 1057 goto done;
1051 1058
1052 if (!c->has_seg_override) 1059 if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
1057 1064
1058 if (c->ad_bytes != 8) 1065 if (c->ad_bytes != 8)
1059 c->modrm_ea = (u32)c->modrm_ea; 1066 c->modrm_ea = (u32)c->modrm_ea;
1067
1068 if (c->rip_relative)
1069 c->modrm_ea += c->eip;
1070
1060 /* 1071 /*
1061 * Decode and fetch the source operand: register, memory 1072 * Decode and fetch the source operand: register, memory
1062 * or immediate. 1073 * or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
1091 break; 1102 break;
1092 } 1103 }
1093 c->src.type = OP_MEM; 1104 c->src.type = OP_MEM;
1105 c->src.ptr = (unsigned long *)c->modrm_ea;
1106 c->src.val = 0;
1094 break; 1107 break;
1095 case SrcImm: 1108 case SrcImm:
1096 case SrcImmU: 1109 case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
1139 c->src.bytes = 1; 1152 c->src.bytes = 1;
1140 c->src.val = 1; 1153 c->src.val = 1;
1141 break; 1154 break;
1155 case SrcSI:
1156 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c),
1160 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0;
1162 break;
1142 } 1163 }
1143 1164
1144 /* 1165 /*
@@ -1168,6 +1189,12 @@ done_prefixes:
1168 c->src2.bytes = 1; 1189 c->src2.bytes = 1;
1169 c->src2.val = 1; 1190 c->src2.val = 1;
1170 break; 1191 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1171 } 1198 }
1172 1199
1173 /* Decode and fetch the destination operand: register or memory. */ 1200 /* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
1180 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 1207 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1181 break; 1208 break;
1182 case DstMem: 1209 case DstMem:
1210 case DstMem64:
1183 if ((c->d & ModRM) && c->modrm_mod == 3) { 1211 if ((c->d & ModRM) && c->modrm_mod == 3) {
1184 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1212 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1185 c->dst.type = OP_REG; 1213 c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
1188 break; 1216 break;
1189 } 1217 }
1190 c->dst.type = OP_MEM; 1218 c->dst.type = OP_MEM;
1219 c->dst.ptr = (unsigned long *)c->modrm_ea;
1220 if ((c->d & DstMask) == DstMem64)
1221 c->dst.bytes = 8;
1222 else
1223 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1224 c->dst.val = 0;
1225 if (c->d & BitOp) {
1226 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1227
1228 c->dst.ptr = (void *)c->dst.ptr +
1229 (c->src.val & mask) / 8;
1230 }
1191 break; 1231 break;
1192 case DstAcc: 1232 case DstAcc:
1193 c->dst.type = OP_REG; 1233 c->dst.type = OP_REG;
1194 c->dst.bytes = c->op_bytes; 1234 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1195 c->dst.ptr = &c->regs[VCPU_REGS_RAX]; 1235 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1196 switch (c->op_bytes) { 1236 switch (c->dst.bytes) {
1197 case 1: 1237 case 1:
1198 c->dst.val = *(u8 *)c->dst.ptr; 1238 c->dst.val = *(u8 *)c->dst.ptr;
1199 break; 1239 break;
@@ -1203,18 +1243,248 @@ done_prefixes:
1203 case 4: 1243 case 4:
1204 c->dst.val = *(u32 *)c->dst.ptr; 1244 c->dst.val = *(u32 *)c->dst.ptr;
1205 break; 1245 break;
1246 case 8:
1247 c->dst.val = *(u64 *)c->dst.ptr;
1248 break;
1206 } 1249 }
1207 c->dst.orig_val = c->dst.val; 1250 c->dst.orig_val = c->dst.val;
1208 break; 1251 break;
1252 case DstDI:
1253 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt),
1257 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0;
1259 break;
1209 } 1260 }
1210 1261
1211 if (c->rip_relative)
1212 c->modrm_ea += c->eip;
1213
1214done: 1262done:
1215 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1216} 1264}
1217 1265
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port,
1269 void *dest)
1270{
1271 struct read_cache *rc = &ctxt->decode.io_read;
1272
1273 if (rc->pos == rc->end) { /* refill pio read ahead */
1274 struct decode_cache *c = &ctxt->decode;
1275 unsigned int in_page, n;
1276 unsigned int count = c->rep_prefix ?
1277 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
1278 in_page = (ctxt->eflags & EFLG_DF) ?
1279 offset_in_page(c->regs[VCPU_REGS_RDI]) :
1280 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
1281 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1282 count);
1283 if (n == 0)
1284 n = 1;
1285 rc->pos = rc->end = 0;
1286 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
1287 return 0;
1288 rc->end = n * size;
1289 }
1290
1291 memcpy(dest, rc->data + rc->pos, size);
1292 rc->pos += size;
1293 return 1;
1294}
1295
1296static u32 desc_limit_scaled(struct desc_struct *desc)
1297{
1298 u32 limit = get_desc_limit(desc);
1299
1300 return desc->g ? (limit << 12) | 0xfff : limit;
1301}
1302
1303static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1304 struct x86_emulate_ops *ops,
1305 u16 selector, struct desc_ptr *dt)
1306{
1307 if (selector & 1 << 2) {
1308 struct desc_struct desc;
1309 memset (dt, 0, sizeof *dt);
1310 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
1311 return;
1312
1313 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
1314 dt->address = get_desc_base(&desc);
1315 } else
1316 ops->get_gdt(dt, ctxt->vcpu);
1317}
1318
1319/* allowed just for 8 bytes segments */
1320static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1321 struct x86_emulate_ops *ops,
1322 u16 selector, struct desc_struct *desc)
1323{
1324 struct desc_ptr dt;
1325 u16 index = selector >> 3;
1326 int ret;
1327 u32 err;
1328 ulong addr;
1329
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331
1332 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT;
1335 }
1336 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1340
1341 return ret;
1342}
1343
1344/* allowed just for 8 bytes segments */
1345static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1346 struct x86_emulate_ops *ops,
1347 u16 selector, struct desc_struct *desc)
1348{
1349 struct desc_ptr dt;
1350 u16 index = selector >> 3;
1351 u32 err;
1352 ulong addr;
1353 int ret;
1354
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356
1357 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT;
1360 }
1361
1362 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1366
1367 return ret;
1368}
1369
1370static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1371 struct x86_emulate_ops *ops,
1372 u16 selector, int seg)
1373{
1374 struct desc_struct seg_desc;
1375 u8 dpl, rpl, cpl;
1376 unsigned err_vec = GP_VECTOR;
1377 u32 err_code = 0;
1378 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1379 int ret;
1380
1381 memset(&seg_desc, 0, sizeof seg_desc);
1382
1383 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
1384 || ctxt->mode == X86EMUL_MODE_REAL) {
1385 /* set real mode segment descriptor */
1386 set_desc_base(&seg_desc, selector << 4);
1387 set_desc_limit(&seg_desc, 0xffff);
1388 seg_desc.type = 3;
1389 seg_desc.p = 1;
1390 seg_desc.s = 1;
1391 goto load;
1392 }
1393
1394 /* NULL selector is not valid for TR, CS and SS */
1395 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
1396 && null_selector)
1397 goto exception;
1398
1399 /* TR should be in GDT only */
1400 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
1401 goto exception;
1402
1403 if (null_selector) /* for NULL selector skip all following checks */
1404 goto load;
1405
1406 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
1407 if (ret != X86EMUL_CONTINUE)
1408 return ret;
1409
1410 err_code = selector & 0xfffc;
1411 err_vec = GP_VECTOR;
1412
1413 /* can't load system descriptor into segment selecor */
1414 if (seg <= VCPU_SREG_GS && !seg_desc.s)
1415 goto exception;
1416
1417 if (!seg_desc.p) {
1418 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
1419 goto exception;
1420 }
1421
1422 rpl = selector & 3;
1423 dpl = seg_desc.dpl;
1424 cpl = ops->cpl(ctxt->vcpu);
1425
1426 switch (seg) {
1427 case VCPU_SREG_SS:
1428 /*
1429 * segment is not a writable data segment or segment
1430 * selector's RPL != CPL or segment selector's RPL != CPL
1431 */
1432 if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
1433 goto exception;
1434 break;
1435 case VCPU_SREG_CS:
1436 if (!(seg_desc.type & 8))
1437 goto exception;
1438
1439 if (seg_desc.type & 4) {
1440 /* conforming */
1441 if (dpl > cpl)
1442 goto exception;
1443 } else {
1444 /* nonconforming */
1445 if (rpl > cpl || dpl != cpl)
1446 goto exception;
1447 }
1448 /* CS(RPL) <- CPL */
1449 selector = (selector & 0xfffc) | cpl;
1450 break;
1451 case VCPU_SREG_TR:
1452 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1453 goto exception;
1454 break;
1455 case VCPU_SREG_LDTR:
1456 if (seg_desc.s || seg_desc.type != 2)
1457 goto exception;
1458 break;
1459 default: /* DS, ES, FS, or GS */
1460 /*
1461 * segment is not a data or readable code segment or
1462 * ((segment is a data or nonconforming code segment)
1463 * and (both RPL and CPL > DPL))
1464 */
1465 if ((seg_desc.type & 0xa) == 0x8 ||
1466 (((seg_desc.type & 0xc) != 0xc) &&
1467 (rpl > dpl && cpl > dpl)))
1468 goto exception;
1469 break;
1470 }
1471
1472 if (seg_desc.s) {
1473 /* mark segment as accessed */
1474 seg_desc.type |= 1;
1475 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
1476 if (ret != X86EMUL_CONTINUE)
1477 return ret;
1478 }
1479load:
1480 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE;
1483exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
1485 return X86EMUL_PROPAGATE_FAULT;
1486}
1487
1218static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1219{ 1489{
1220 struct decode_cache *c = &ctxt->decode; 1490 struct decode_cache *c = &ctxt->decode;
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1251 int rc; 1521 int rc;
1252 unsigned long val, change_mask; 1522 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1523 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); 1524 int cpl = ops->cpl(ctxt->vcpu);
1255 1525
1256 rc = emulate_pop(ctxt, ops, &val, len); 1526 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE) 1527 if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1306 int rc; 1576 int rc;
1307 1577
1308 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1578 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1309 if (rc != 0) 1579 if (rc != X86EMUL_CONTINUE)
1310 return rc; 1580 return rc;
1311 1581
1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); 1582 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
1313 return rc; 1583 return rc;
1314} 1584}
1315 1585
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1332 struct x86_emulate_ops *ops) 1602 struct x86_emulate_ops *ops)
1333{ 1603{
1334 struct decode_cache *c = &ctxt->decode; 1604 struct decode_cache *c = &ctxt->decode;
1335 int rc = 0; 1605 int rc = X86EMUL_CONTINUE;
1336 int reg = VCPU_REGS_RDI; 1606 int reg = VCPU_REGS_RDI;
1337 1607
1338 while (reg >= VCPU_REGS_RAX) { 1608 while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1343 } 1613 }
1344 1614
1345 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1615 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1346 if (rc != 0) 1616 if (rc != X86EMUL_CONTINUE)
1347 break; 1617 break;
1348 --reg; 1618 --reg;
1349 } 1619 }
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops) 1624 struct x86_emulate_ops *ops)
1355{ 1625{
1356 struct decode_cache *c = &ctxt->decode; 1626 struct decode_cache *c = &ctxt->decode;
1357 int rc;
1358 1627
1359 rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1628 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
1360 if (rc != 0)
1361 return rc;
1362 return 0;
1363} 1629}
1364 1630
1365static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1631static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1395 struct x86_emulate_ops *ops) 1661 struct x86_emulate_ops *ops)
1396{ 1662{
1397 struct decode_cache *c = &ctxt->decode; 1663 struct decode_cache *c = &ctxt->decode;
1398 int rc = 0;
1399 1664
1400 switch (c->modrm_reg) { 1665 switch (c->modrm_reg) {
1401 case 0 ... 1: /* test */ 1666 case 0 ... 1: /* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1408 emulate_1op("neg", c->dst, ctxt->eflags); 1673 emulate_1op("neg", c->dst, ctxt->eflags);
1409 break; 1674 break;
1410 default: 1675 default:
1411 DPRINTF("Cannot emulate %02x\n", c->b); 1676 return 0;
1412 rc = X86EMUL_UNHANDLEABLE;
1413 break;
1414 } 1677 }
1415 return rc; 1678 return 1;
1416} 1679}
1417 1680
1418static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1681static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1442 emulate_push(ctxt); 1705 emulate_push(ctxt);
1443 break; 1706 break;
1444 } 1707 }
1445 return 0; 1708 return X86EMUL_CONTINUE;
1446} 1709}
1447 1710
1448static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1711static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1449 struct x86_emulate_ops *ops, 1712 struct x86_emulate_ops *ops)
1450 unsigned long memop)
1451{ 1713{
1452 struct decode_cache *c = &ctxt->decode; 1714 struct decode_cache *c = &ctxt->decode;
1453 u64 old, new; 1715 u64 old = c->dst.orig_val;
1454 int rc;
1455
1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1457 if (rc != X86EMUL_CONTINUE)
1458 return rc;
1459 1716
1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1717 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1461 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1718 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1463 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1720 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1464 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1721 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1465 ctxt->eflags &= ~EFLG_ZF; 1722 ctxt->eflags &= ~EFLG_ZF;
1466
1467 } else { 1723 } else {
1468 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1724 c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1469 (u32) c->regs[VCPU_REGS_RBX]; 1725 (u32) c->regs[VCPU_REGS_RBX];
1470 1726
1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1472 if (rc != X86EMUL_CONTINUE)
1473 return rc;
1474 ctxt->eflags |= EFLG_ZF; 1727 ctxt->eflags |= EFLG_ZF;
1475 } 1728 }
1476 return 0; 1729 return X86EMUL_CONTINUE;
1477} 1730}
1478 1731
1479static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1732static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1484 unsigned long cs; 1737 unsigned long cs;
1485 1738
1486 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1739 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
1487 if (rc) 1740 if (rc != X86EMUL_CONTINUE)
1488 return rc; 1741 return rc;
1489 if (c->op_bytes == 4) 1742 if (c->op_bytes == 4)
1490 c->eip = (u32)c->eip; 1743 c->eip = (u32)c->eip;
1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1744 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1492 if (rc) 1745 if (rc != X86EMUL_CONTINUE)
1493 return rc; 1746 return rc;
1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); 1747 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1495 return rc; 1748 return rc;
1496} 1749}
1497 1750
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1544 default: 1797 default:
1545 break; 1798 break;
1546 } 1799 }
1547 return 0; 1800 return X86EMUL_CONTINUE;
1548} 1801}
1549 1802
1550static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1598 u64 msr_data; 1851 u64 msr_data;
1599 1852
1600 /* syscall is not available in real mode */ 1853 /* syscall is not available in real mode */
1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) 1854 if (ctxt->mode == X86EMUL_MODE_REAL ||
1602 return X86EMUL_UNHANDLEABLE; 1855 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1857 return X86EMUL_PROPAGATE_FAULT;
1858 }
1603 1859
1604 setup_syscalls_segments(ctxt, &cs, &ss); 1860 setup_syscalls_segments(ctxt, &cs, &ss);
1605 1861
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1649 /* inject #GP if in real mode */ 1905 /* inject #GP if in real mode */
1650 if (ctxt->mode == X86EMUL_MODE_REAL) { 1906 if (ctxt->mode == X86EMUL_MODE_REAL) {
1651 kvm_inject_gp(ctxt->vcpu, 0); 1907 kvm_inject_gp(ctxt->vcpu, 0);
1652 return X86EMUL_UNHANDLEABLE; 1908 return X86EMUL_PROPAGATE_FAULT;
1653 } 1909 }
1654 1910
1655 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1911 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1656 * Therefore, we inject an #UD. 1912 * Therefore, we inject an #UD.
1657 */ 1913 */
1658 if (ctxt->mode == X86EMUL_MODE_PROT64) 1914 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1659 return X86EMUL_UNHANDLEABLE; 1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1916 return X86EMUL_PROPAGATE_FAULT;
1917 }
1660 1918
1661 setup_syscalls_segments(ctxt, &cs, &ss); 1919 setup_syscalls_segments(ctxt, &cs, &ss);
1662 1920
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1711 if (ctxt->mode == X86EMUL_MODE_REAL || 1969 if (ctxt->mode == X86EMUL_MODE_REAL ||
1712 ctxt->mode == X86EMUL_MODE_VM86) { 1970 ctxt->mode == X86EMUL_MODE_VM86) {
1713 kvm_inject_gp(ctxt->vcpu, 0); 1971 kvm_inject_gp(ctxt->vcpu, 0);
1714 return X86EMUL_UNHANDLEABLE; 1972 return X86EMUL_PROPAGATE_FAULT;
1715 } 1973 }
1716 1974
1717 setup_syscalls_segments(ctxt, &cs, &ss); 1975 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1756 return X86EMUL_CONTINUE; 2014 return X86EMUL_CONTINUE;
1757} 2015}
1758 2016
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) 2017static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2018 struct x86_emulate_ops *ops)
1760{ 2019{
1761 int iopl; 2020 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1764 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true; 2024 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; 2026 return ops->cpl(ctxt->vcpu) > iopl;
1768} 2027}
1769 2028
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops, 2060 struct x86_emulate_ops *ops,
1802 u16 port, u16 len) 2061 u16 port, u16 len)
1803{ 2062{
1804 if (emulator_bad_iopl(ctxt)) 2063 if (emulator_bad_iopl(ctxt, ops))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2064 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false; 2065 return false;
1807 return true; 2066 return true;
1808} 2067}
1809 2068
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss)
2083{
2084 struct decode_cache *c = &ctxt->decode;
2085
2086 tss->ip = c->eip;
2087 tss->flag = ctxt->eflags;
2088 tss->ax = c->regs[VCPU_REGS_RAX];
2089 tss->cx = c->regs[VCPU_REGS_RCX];
2090 tss->dx = c->regs[VCPU_REGS_RDX];
2091 tss->bx = c->regs[VCPU_REGS_RBX];
2092 tss->sp = c->regs[VCPU_REGS_RSP];
2093 tss->bp = c->regs[VCPU_REGS_RBP];
2094 tss->si = c->regs[VCPU_REGS_RSI];
2095 tss->di = c->regs[VCPU_REGS_RDI];
2096
2097 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2098 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2099 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2100 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2101 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2102}
2103
2104static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2105 struct x86_emulate_ops *ops,
2106 struct tss_segment_16 *tss)
2107{
2108 struct decode_cache *c = &ctxt->decode;
2109 int ret;
2110
2111 c->eip = tss->ip;
2112 ctxt->eflags = tss->flag | 2;
2113 c->regs[VCPU_REGS_RAX] = tss->ax;
2114 c->regs[VCPU_REGS_RCX] = tss->cx;
2115 c->regs[VCPU_REGS_RDX] = tss->dx;
2116 c->regs[VCPU_REGS_RBX] = tss->bx;
2117 c->regs[VCPU_REGS_RSP] = tss->sp;
2118 c->regs[VCPU_REGS_RBP] = tss->bp;
2119 c->regs[VCPU_REGS_RSI] = tss->si;
2120 c->regs[VCPU_REGS_RDI] = tss->di;
2121
2122 /*
2123 * SDM says that segment selectors are loaded before segment
2124 * descriptors
2125 */
2126 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
2127 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2128 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2129 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2130 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2131
2132 /*
2133 * Now load segment descriptors. If fault happenes at this stage
2134 * it is handled in a context of new task
2135 */
2136 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
2137 if (ret != X86EMUL_CONTINUE)
2138 return ret;
2139 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2140 if (ret != X86EMUL_CONTINUE)
2141 return ret;
2142 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2143 if (ret != X86EMUL_CONTINUE)
2144 return ret;
2145 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2146 if (ret != X86EMUL_CONTINUE)
2147 return ret;
2148 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2149 if (ret != X86EMUL_CONTINUE)
2150 return ret;
2151
2152 return X86EMUL_CONTINUE;
2153}
2154
2155static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2156 struct x86_emulate_ops *ops,
2157 u16 tss_selector, u16 old_tss_sel,
2158 ulong old_tss_base, struct desc_struct *new_desc)
2159{
2160 struct tss_segment_16 tss_seg;
2161 int ret;
2162 u32 err, new_tss_base = get_desc_base(new_desc);
2163
2164 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2165 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2169 return ret;
2170 }
2171
2172 save_state_to_tss16(ctxt, ops, &tss_seg);
2173
2174 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2175 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2179 return ret;
2180 }
2181
2182 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2183 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2187 return ret;
2188 }
2189
2190 if (old_tss_sel != 0xffff) {
2191 tss_seg.prev_task_link = old_tss_sel;
2192
2193 ret = ops->write_std(new_tss_base,
2194 &tss_seg.prev_task_link,
2195 sizeof tss_seg.prev_task_link,
2196 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2200 return ret;
2201 }
2202 }
2203
2204 return load_state_from_tss16(ctxt, ops, &tss_seg);
2205}
2206
2207static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2208 struct x86_emulate_ops *ops,
2209 struct tss_segment_32 *tss)
2210{
2211 struct decode_cache *c = &ctxt->decode;
2212
2213 tss->cr3 = ops->get_cr(3, ctxt->vcpu);
2214 tss->eip = c->eip;
2215 tss->eflags = ctxt->eflags;
2216 tss->eax = c->regs[VCPU_REGS_RAX];
2217 tss->ecx = c->regs[VCPU_REGS_RCX];
2218 tss->edx = c->regs[VCPU_REGS_RDX];
2219 tss->ebx = c->regs[VCPU_REGS_RBX];
2220 tss->esp = c->regs[VCPU_REGS_RSP];
2221 tss->ebp = c->regs[VCPU_REGS_RBP];
2222 tss->esi = c->regs[VCPU_REGS_RSI];
2223 tss->edi = c->regs[VCPU_REGS_RDI];
2224
2225 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2226 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2227 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2228 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2229 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
2230 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
2231 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2232}
2233
2234static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2235 struct x86_emulate_ops *ops,
2236 struct tss_segment_32 *tss)
2237{
2238 struct decode_cache *c = &ctxt->decode;
2239 int ret;
2240
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu);
2242 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax;
2245 c->regs[VCPU_REGS_RCX] = tss->ecx;
2246 c->regs[VCPU_REGS_RDX] = tss->edx;
2247 c->regs[VCPU_REGS_RBX] = tss->ebx;
2248 c->regs[VCPU_REGS_RSP] = tss->esp;
2249 c->regs[VCPU_REGS_RBP] = tss->ebp;
2250 c->regs[VCPU_REGS_RSI] = tss->esi;
2251 c->regs[VCPU_REGS_RDI] = tss->edi;
2252
2253 /*
2254 * SDM says that segment selectors are loaded before segment
2255 * descriptors
2256 */
2257 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
2258 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2259 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2260 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2261 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2262 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
2263 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
2264
2265 /*
2266 * Now load segment descriptors. If fault happenes at this stage
2267 * it is handled in a context of new task
2268 */
2269 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
2270 if (ret != X86EMUL_CONTINUE)
2271 return ret;
2272 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2273 if (ret != X86EMUL_CONTINUE)
2274 return ret;
2275 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2276 if (ret != X86EMUL_CONTINUE)
2277 return ret;
2278 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2279 if (ret != X86EMUL_CONTINUE)
2280 return ret;
2281 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2282 if (ret != X86EMUL_CONTINUE)
2283 return ret;
2284 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
2285 if (ret != X86EMUL_CONTINUE)
2286 return ret;
2287 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
2288 if (ret != X86EMUL_CONTINUE)
2289 return ret;
2290
2291 return X86EMUL_CONTINUE;
2292}
2293
2294static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2295 struct x86_emulate_ops *ops,
2296 u16 tss_selector, u16 old_tss_sel,
2297 ulong old_tss_base, struct desc_struct *new_desc)
2298{
2299 struct tss_segment_32 tss_seg;
2300 int ret;
2301 u32 err, new_tss_base = get_desc_base(new_desc);
2302
2303 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2304 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2308 return ret;
2309 }
2310
2311 save_state_to_tss32(ctxt, ops, &tss_seg);
2312
2313 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2314 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2318 return ret;
2319 }
2320
2321 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2322 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2326 return ret;
2327 }
2328
2329 if (old_tss_sel != 0xffff) {
2330 tss_seg.prev_task_link = old_tss_sel;
2331
2332 ret = ops->write_std(new_tss_base,
2333 &tss_seg.prev_task_link,
2334 sizeof tss_seg.prev_task_link,
2335 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2339 return ret;
2340 }
2341 }
2342
2343 return load_state_from_tss32(ctxt, ops, &tss_seg);
2344}
2345
2346static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2347 struct x86_emulate_ops *ops,
2348 u16 tss_selector, int reason,
2349 bool has_error_code, u32 error_code)
2350{
2351 struct desc_struct curr_tss_desc, next_tss_desc;
2352 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
2356 u32 desc_limit;
2357
2358 /* FIXME: old_tss_base == ~0 ? */
2359
2360 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
2361 if (ret != X86EMUL_CONTINUE)
2362 return ret;
2363 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
2364 if (ret != X86EMUL_CONTINUE)
2365 return ret;
2366
2367 /* FIXME: check that next_tss_desc is tss */
2368
2369 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0);
2373 return X86EMUL_PROPAGATE_FAULT;
2374 }
2375 }
2376
2377 desc_limit = desc_limit_scaled(&next_tss_desc);
2378 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT;
2384 }
2385
2386 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2387 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2388 write_segment_descriptor(ctxt, ops, old_tss_sel,
2389 &curr_tss_desc);
2390 }
2391
2392 if (reason == TASK_SWITCH_IRET)
2393 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
2394
2395 /* set back link to prev task only if NT bit is set in eflags
2396 note that old_tss_sel is not used afetr this point */
2397 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
2398 old_tss_sel = 0xffff;
2399
2400 if (next_tss_desc.type & 8)
2401 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
2402 old_tss_base, &next_tss_desc);
2403 else
2404 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
2405 old_tss_base, &next_tss_desc);
2406 if (ret != X86EMUL_CONTINUE)
2407 return ret;
2408
2409 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
2410 ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
2411
2412 if (reason != TASK_SWITCH_IRET) {
2413 next_tss_desc.type |= (1 << 1); /* set busy flag */
2414 write_segment_descriptor(ctxt, ops, tss_selector,
2415 &next_tss_desc);
2416 }
2417
2418 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2419 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
2420 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2421
2422 if (has_error_code) {
2423 struct decode_cache *c = &ctxt->decode;
2424
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt);
2429 }
2430
2431 return ret;
2432}
2433
2434int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2435 struct x86_emulate_ops *ops,
2436 u16 tss_selector, int reason,
2437 bool has_error_code, u32 error_code)
2438{
2439 struct decode_cache *c = &ctxt->decode;
2440 int rc;
2441
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE;
2446
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code);
2449
2450 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops);
2454 }
2455
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2457}
2458
2459static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2460 int reg, struct operand *op)
2461{
2462 struct decode_cache *c = &ctxt->decode;
2463 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2464
2465 register_address_increment(c, &c->regs[reg], df * op->bytes);
2466 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
2467}
2468
1810int 2469int
1811x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2470x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1812{ 2471{
1813 unsigned long memop = 0;
1814 u64 msr_data; 2472 u64 msr_data;
1815 unsigned long saved_eip = 0;
1816 struct decode_cache *c = &ctxt->decode; 2473 struct decode_cache *c = &ctxt->decode;
1817 unsigned int port; 2474 int rc = X86EMUL_CONTINUE;
1818 int io_dir_in; 2475 int saved_dst_type = c->dst.type;
1819 int rc = 0;
1820 2476
1821 ctxt->interruptibility = 0; 2477 ctxt->interruptibility = 0;
1822 2478
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1826 */ 2482 */
1827 2483
1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1829 saved_eip = c->eip; 2485
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2488 goto done;
2489 }
1830 2490
1831 /* LOCK prefix is allowed only with some instructions */ 2491 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) { 2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done; 2494 goto done;
1835 } 2495 }
1836 2496
1837 /* Privileged instruction can be executed only in CPL=0 */ 2497 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { 2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0); 2499 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done; 2500 goto done;
1841 } 2501 }
1842 2502
1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1844 memop = c->modrm_ea;
1845
1846 if (c->rep_prefix && (c->d & String)) { 2503 if (c->rep_prefix && (c->d & String)) {
2504 ctxt->restart = true;
1847 /* All REP prefixes have the same first termination condition */ 2505 /* All REP prefixes have the same first termination condition */
1848 if (c->regs[VCPU_REGS_RCX] == 0) { 2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done:
2508 ctxt->restart = false;
1849 kvm_rip_write(ctxt->vcpu, c->eip); 2509 kvm_rip_write(ctxt->vcpu, c->eip);
1850 goto done; 2510 goto done;
1851 } 2511 }
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1857 * - if REPNE/REPNZ and ZF = 1 then done 2517 * - if REPNE/REPNZ and ZF = 1 then done
1858 */ 2518 */
1859 if ((c->b == 0xa6) || (c->b == 0xa7) || 2519 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1860 (c->b == 0xae) || (c->b == 0xaf)) { 2520 (c->b == 0xae) || (c->b == 0xaf)) {
1861 if ((c->rep_prefix == REPE_PREFIX) && 2521 if ((c->rep_prefix == REPE_PREFIX) &&
1862 ((ctxt->eflags & EFLG_ZF) == 0)) { 2522 ((ctxt->eflags & EFLG_ZF) == 0))
1863 kvm_rip_write(ctxt->vcpu, c->eip); 2523 goto string_done;
1864 goto done;
1865 }
1866 if ((c->rep_prefix == REPNE_PREFIX) && 2524 if ((c->rep_prefix == REPNE_PREFIX) &&
1867 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 2525 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
1868 kvm_rip_write(ctxt->vcpu, c->eip); 2526 goto string_done;
1869 goto done;
1870 }
1871 } 2527 }
1872 c->regs[VCPU_REGS_RCX]--; 2528 c->eip = ctxt->eip;
1873 c->eip = kvm_rip_read(ctxt->vcpu);
1874 } 2529 }
1875 2530
1876 if (c->src.type == OP_MEM) { 2531 if (c->src.type == OP_MEM) {
1877 c->src.ptr = (unsigned long *)memop;
1878 c->src.val = 0;
1879 rc = ops->read_emulated((unsigned long)c->src.ptr, 2532 rc = ops->read_emulated((unsigned long)c->src.ptr,
1880 &c->src.val, 2533 &c->src.val,
1881 c->src.bytes, 2534 c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1885 c->src.orig_val = c->src.val; 2538 c->src.orig_val = c->src.val;
1886 } 2539 }
1887 2540
2541 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr,
2543 &c->src2.val,
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE)
2547 goto done;
2548 }
2549
1888 if ((c->d & DstMask) == ImplicitOps) 2550 if ((c->d & DstMask) == ImplicitOps)
1889 goto special_insn; 2551 goto special_insn;
1890 2552
1891 2553
1892 if (c->dst.type == OP_MEM) { 2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
1893 c->dst.ptr = (unsigned long *)memop; 2555 /* optimisation - avoid slow emulated read if Mov */
1894 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
1895 c->dst.val = 0; 2557 c->dst.bytes, ctxt->vcpu);
1896 if (c->d & BitOp) { 2558 if (rc != X86EMUL_CONTINUE)
1897 unsigned long mask = ~(c->dst.bytes * 8 - 1); 2559 goto done;
1898
1899 c->dst.ptr = (void *)c->dst.ptr +
1900 (c->src.val & mask) / 8;
1901 }
1902 if (!(c->d & Mov)) {
1903 /* optimisation - avoid slow emulated read */
1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1905 &c->dst.val,
1906 c->dst.bytes,
1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1911 } 2560 }
1912 c->dst.orig_val = c->dst.val; 2561 c->dst.orig_val = c->dst.val;
1913 2562
@@ -1926,7 +2575,7 @@ special_insn:
1926 break; 2575 break;
1927 case 0x07: /* pop es */ 2576 case 0x07: /* pop es */
1928 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1929 if (rc != 0) 2578 if (rc != X86EMUL_CONTINUE)
1930 goto done; 2579 goto done;
1931 break; 2580 break;
1932 case 0x08 ... 0x0d: 2581 case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
1945 break; 2594 break;
1946 case 0x17: /* pop ss */ 2595 case 0x17: /* pop ss */
1947 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1948 if (rc != 0) 2597 if (rc != X86EMUL_CONTINUE)
1949 goto done; 2598 goto done;
1950 break; 2599 break;
1951 case 0x18 ... 0x1d: 2600 case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
1957 break; 2606 break;
1958 case 0x1f: /* pop ds */ 2607 case 0x1f: /* pop ds */
1959 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1960 if (rc != 0) 2609 if (rc != X86EMUL_CONTINUE)
1961 goto done; 2610 goto done;
1962 break; 2611 break;
1963 case 0x20 ... 0x25: 2612 case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
1988 case 0x58 ... 0x5f: /* pop reg */ 2637 case 0x58 ... 0x5f: /* pop reg */
1989 pop_instruction: 2638 pop_instruction:
1990 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 2639 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
1991 if (rc != 0) 2640 if (rc != X86EMUL_CONTINUE)
1992 goto done; 2641 goto done;
1993 break; 2642 break;
1994 case 0x60: /* pusha */ 2643 case 0x60: /* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
1996 break; 2645 break;
1997 case 0x61: /* popa */ 2646 case 0x61: /* popa */
1998 rc = emulate_popa(ctxt, ops); 2647 rc = emulate_popa(ctxt, ops);
1999 if (rc != 0) 2648 if (rc != X86EMUL_CONTINUE)
2000 goto done; 2649 goto done;
2001 break; 2650 break;
2002 case 0x63: /* movsxd */ 2651 case 0x63: /* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
2010 break; 2659 break;
2011 case 0x6c: /* insb */ 2660 case 0x6c: /* insb */
2012 case 0x6d: /* insw/insd */ 2661 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u);
2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2664 c->dst.bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0); 2665 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done; 2666 goto done;
2017 } 2667 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu, 2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2019 1, 2669 c->regs[VCPU_REGS_RDX], &c->dst.val))
2020 (c->d & ByteOp) ? 1 : c->op_bytes, 2670 goto done; /* IO is needed, skip writeback */
2021 c->rep_prefix ? 2671 break;
2022 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
2023 (ctxt->eflags & EFLG_DF),
2024 register_address(c, es_base(ctxt),
2025 c->regs[VCPU_REGS_RDI]),
2026 c->rep_prefix,
2027 c->regs[VCPU_REGS_RDX]) == 0) {
2028 c->eip = saved_eip;
2029 return -1;
2030 }
2031 return 0;
2032 case 0x6e: /* outsb */ 2672 case 0x6e: /* outsb */
2033 case 0x6f: /* outsw/outsd */ 2673 case 0x6f: /* outsw/outsd */
2674 c->src.bytes = min(c->src.bytes, 4u);
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2676 c->src.bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0); 2677 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done; 2678 goto done;
2038 } 2679 }
2039 if (kvm_emulate_pio_string(ctxt->vcpu, 2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2040 0, 2681 &c->src.val, 1, ctxt->vcpu);
2041 (c->d & ByteOp) ? 1 : c->op_bytes, 2682
2042 c->rep_prefix ? 2683 c->dst.type = OP_NONE; /* nothing to writeback */
2043 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 2684 break;
2044 (ctxt->eflags & EFLG_DF),
2045 register_address(c,
2046 seg_override_base(ctxt, c),
2047 c->regs[VCPU_REGS_RSI]),
2048 c->rep_prefix,
2049 c->regs[VCPU_REGS_RDX]) == 0) {
2050 c->eip = saved_eip;
2051 return -1;
2052 }
2053 return 0;
2054 case 0x70 ... 0x7f: /* jcc (short) */ 2685 case 0x70 ... 0x7f: /* jcc (short) */
2055 if (test_cc(c->b, ctxt->eflags)) 2686 if (test_cc(c->b, ctxt->eflags))
2056 jmp_rel(c, c->src.val); 2687 jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
2107 case 0x8c: { /* mov r/m, sreg */ 2738 case 0x8c: { /* mov r/m, sreg */
2108 struct kvm_segment segreg; 2739 struct kvm_segment segreg;
2109 2740
2110 if (c->modrm_reg <= 5) 2741 if (c->modrm_reg <= VCPU_SREG_GS)
2111 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); 2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2112 else { 2743 else {
2113 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", 2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2114 c->modrm); 2745 goto done;
2115 goto cannot_emulate;
2116 } 2746 }
2117 c->dst.val = segreg.selector; 2747 c->dst.val = segreg.selector;
2118 break; 2748 break;
@@ -2132,16 +2762,16 @@ special_insn:
2132 } 2762 }
2133 2763
2134 if (c->modrm_reg == VCPU_SREG_SS) 2764 if (c->modrm_reg == VCPU_SREG_SS)
2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); 2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
2136 2766
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); 2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2138 2768
2139 c->dst.type = OP_NONE; /* Disable writeback. */ 2769 c->dst.type = OP_NONE; /* Disable writeback. */
2140 break; 2770 break;
2141 } 2771 }
2142 case 0x8f: /* pop (sole member of Grp1a) */ 2772 case 0x8f: /* pop (sole member of Grp1a) */
2143 rc = emulate_grp1a(ctxt, ops); 2773 rc = emulate_grp1a(ctxt, ops);
2144 if (rc != 0) 2774 if (rc != X86EMUL_CONTINUE)
2145 goto done; 2775 goto done;
2146 break; 2776 break;
2147 case 0x90: /* nop / xchg r8,rax */ 2777 case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
2175 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; 2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2176 break; 2806 break;
2177 case 0xa4 ... 0xa5: /* movs */ 2807 case 0xa4 ... 0xa5: /* movs */
2178 c->dst.type = OP_MEM; 2808 goto mov;
2179 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2180 c->dst.ptr = (unsigned long *)register_address(c,
2181 es_base(ctxt),
2182 c->regs[VCPU_REGS_RDI]);
2183 rc = ops->read_emulated(register_address(c,
2184 seg_override_base(ctxt, c),
2185 c->regs[VCPU_REGS_RSI]),
2186 &c->dst.val,
2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2189 goto done;
2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2192 : c->dst.bytes);
2193 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2194 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2195 : c->dst.bytes);
2196 break;
2197 case 0xa6 ... 0xa7: /* cmps */ 2809 case 0xa6 ... 0xa7: /* cmps */
2198 c->src.type = OP_NONE; /* Disable writeback. */
2199 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2200 c->src.ptr = (unsigned long *)register_address(c,
2201 seg_override_base(ctxt, c),
2202 c->regs[VCPU_REGS_RSI]);
2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2204 &c->src.val,
2205 c->src.bytes,
2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2208 goto done;
2209
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2810 c->dst.type = OP_NONE; /* Disable writeback. */
2211 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2212 c->dst.ptr = (unsigned long *)register_address(c,
2213 es_base(ctxt),
2214 c->regs[VCPU_REGS_RDI]);
2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2216 &c->dst.val,
2217 c->dst.bytes,
2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2220 goto done;
2221
2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2223 2812 goto cmp;
2224 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2225
2226 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2227 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
2228 : c->src.bytes);
2229 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2230 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2231 : c->dst.bytes);
2232
2233 break;
2234 case 0xaa ... 0xab: /* stos */ 2813 case 0xaa ... 0xab: /* stos */
2235 c->dst.type = OP_MEM;
2236 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2237 c->dst.ptr = (unsigned long *)register_address(c,
2238 es_base(ctxt),
2239 c->regs[VCPU_REGS_RDI]);
2240 c->dst.val = c->regs[VCPU_REGS_RAX]; 2814 c->dst.val = c->regs[VCPU_REGS_RAX];
2241 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2242 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2243 : c->dst.bytes);
2244 break; 2815 break;
2245 case 0xac ... 0xad: /* lods */ 2816 case 0xac ... 0xad: /* lods */
2246 c->dst.type = OP_REG; 2817 goto mov;
2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2249 rc = ops->read_emulated(register_address(c,
2250 seg_override_base(ctxt, c),
2251 c->regs[VCPU_REGS_RSI]),
2252 &c->dst.val,
2253 c->dst.bytes,
2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2256 goto done;
2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2259 : c->dst.bytes);
2260 break;
2261 case 0xae ... 0xaf: /* scas */ 2818 case 0xae ... 0xaf: /* scas */
2262 DPRINTF("Urk! I don't handle SCAS.\n"); 2819 DPRINTF("Urk! I don't handle SCAS.\n");
2263 goto cannot_emulate; 2820 goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
2277 break; 2834 break;
2278 case 0xcb: /* ret far */ 2835 case 0xcb: /* ret far */
2279 rc = emulate_ret_far(ctxt, ops); 2836 rc = emulate_ret_far(ctxt, ops);
2280 if (rc) 2837 if (rc != X86EMUL_CONTINUE)
2281 goto done; 2838 goto done;
2282 break; 2839 break;
2283 case 0xd0 ... 0xd1: /* Grp2 */ 2840 case 0xd0 ... 0xd1: /* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
2290 break; 2847 break;
2291 case 0xe4: /* inb */ 2848 case 0xe4: /* inb */
2292 case 0xe5: /* in */ 2849 case 0xe5: /* in */
2293 port = c->src.val; 2850 goto do_io_in;
2294 io_dir_in = 1;
2295 goto do_io;
2296 case 0xe6: /* outb */ 2851 case 0xe6: /* outb */
2297 case 0xe7: /* out */ 2852 case 0xe7: /* out */
2298 port = c->src.val; 2853 goto do_io_out;
2299 io_dir_in = 0;
2300 goto do_io;
2301 case 0xe8: /* call (near) */ { 2854 case 0xe8: /* call (near) */ {
2302 long int rel = c->src.val; 2855 long int rel = c->src.val;
2303 c->src.val = (unsigned long) c->eip; 2856 c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
2308 case 0xe9: /* jmp rel */ 2861 case 0xe9: /* jmp rel */
2309 goto jmp; 2862 goto jmp;
2310 case 0xea: /* jmp far */ 2863 case 0xea: /* jmp far */
2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 2864 jump_far:
2312 VCPU_SREG_CS)) 2865 if (load_segment_descriptor(ctxt, ops, c->src2.val,
2866 VCPU_SREG_CS))
2313 goto done; 2867 goto done;
2314 2868
2315 c->eip = c->src.val; 2869 c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
2321 break; 2875 break;
2322 case 0xec: /* in al,dx */ 2876 case 0xec: /* in al,dx */
2323 case 0xed: /* in (e/r)ax,dx */ 2877 case 0xed: /* in (e/r)ax,dx */
2324 port = c->regs[VCPU_REGS_RDX]; 2878 c->src.val = c->regs[VCPU_REGS_RDX];
2325 io_dir_in = 1; 2879 do_io_in:
2326 goto do_io; 2880 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0);
2883 goto done;
2884 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val))
2887 goto done; /* IO is needed */
2888 break;
2327 case 0xee: /* out al,dx */ 2889 case 0xee: /* out al,dx */
2328 case 0xef: /* out (e/r)ax,dx */ 2890 case 0xef: /* out (e/r)ax,dx */
2329 port = c->regs[VCPU_REGS_RDX]; 2891 c->src.val = c->regs[VCPU_REGS_RDX];
2330 io_dir_in = 0; 2892 do_io_out:
2331 do_io: 2893 c->dst.bytes = min(c->dst.bytes, 4u);
2332 if (!emulator_io_permited(ctxt, ops, port, 2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0); 2895 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done; 2896 goto done;
2336 } 2897 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
2338 (c->d & ByteOp) ? 1 : c->op_bytes, 2899 ctxt->vcpu);
2339 port) != 0) { 2900 c->dst.type = OP_NONE; /* Disable writeback. */
2340 c->eip = saved_eip;
2341 goto cannot_emulate;
2342 }
2343 break; 2901 break;
2344 case 0xf4: /* hlt */ 2902 case 0xf4: /* hlt */
2345 ctxt->vcpu->arch.halt_request = 1; 2903 ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
2350 c->dst.type = OP_NONE; /* Disable writeback. */ 2908 c->dst.type = OP_NONE; /* Disable writeback. */
2351 break; 2909 break;
2352 case 0xf6 ... 0xf7: /* Grp3 */ 2910 case 0xf6 ... 0xf7: /* Grp3 */
2353 rc = emulate_grp3(ctxt, ops); 2911 if (!emulate_grp3(ctxt, ops))
2354 if (rc != 0) 2912 goto cannot_emulate;
2355 goto done;
2356 break; 2913 break;
2357 case 0xf8: /* clc */ 2914 case 0xf8: /* clc */
2358 ctxt->eflags &= ~EFLG_CF; 2915 ctxt->eflags &= ~EFLG_CF;
2359 c->dst.type = OP_NONE; /* Disable writeback. */ 2916 c->dst.type = OP_NONE; /* Disable writeback. */
2360 break; 2917 break;
2361 case 0xfa: /* cli */ 2918 case 0xfa: /* cli */
2362 if (emulator_bad_iopl(ctxt)) 2919 if (emulator_bad_iopl(ctxt, ops))
2363 kvm_inject_gp(ctxt->vcpu, 0); 2920 kvm_inject_gp(ctxt->vcpu, 0);
2364 else { 2921 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF; 2922 ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
2367 } 2924 }
2368 break; 2925 break;
2369 case 0xfb: /* sti */ 2926 case 0xfb: /* sti */
2370 if (emulator_bad_iopl(ctxt)) 2927 if (emulator_bad_iopl(ctxt, ops))
2371 kvm_inject_gp(ctxt->vcpu, 0); 2928 kvm_inject_gp(ctxt->vcpu, 0);
2372 else { 2929 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF; 2931 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */ 2932 c->dst.type = OP_NONE; /* Disable writeback. */
2376 } 2933 }
@@ -2383,28 +2940,55 @@ special_insn:
2383 ctxt->eflags |= EFLG_DF; 2940 ctxt->eflags |= EFLG_DF;
2384 c->dst.type = OP_NONE; /* Disable writeback. */ 2941 c->dst.type = OP_NONE; /* Disable writeback. */
2385 break; 2942 break;
2386 case 0xfe ... 0xff: /* Grp4/Grp5 */ 2943 case 0xfe: /* Grp4 */
2944 grp45:
2387 rc = emulate_grp45(ctxt, ops); 2945 rc = emulate_grp45(ctxt, ops);
2388 if (rc != 0) 2946 if (rc != X86EMUL_CONTINUE)
2389 goto done; 2947 goto done;
2390 break; 2948 break;
2949 case 0xff: /* Grp5 */
2950 if (c->modrm_reg == 5)
2951 goto jump_far;
2952 goto grp45;
2391 } 2953 }
2392 2954
2393writeback: 2955writeback:
2394 rc = writeback(ctxt, ops); 2956 rc = writeback(ctxt, ops);
2395 if (rc != 0) 2957 if (rc != X86EMUL_CONTINUE)
2396 goto done; 2958 goto done;
2397 2959
2960 /*
2961 * restore dst type in case the decoding will be reused
2962 * (happens for string instruction )
2963 */
2964 c->dst.type = saved_dst_type;
2965
2966 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
2968 &c->src);
2969
2970 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
2972
2973 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read;
2975 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
2976 /*
2977 * Re-enter guest when pio read ahead buffer is empty or,
2978 * if it is not used, after each 1024 iteration.
2979 */
2980 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
2981 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false;
2983 }
2984
2398 /* Commit shadow register state. */ 2985 /* Commit shadow register state. */
2399 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2400 kvm_rip_write(ctxt->vcpu, c->eip); 2987 kvm_rip_write(ctxt->vcpu, c->eip);
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags);
2401 2989
2402done: 2990done:
2403 if (rc == X86EMUL_UNHANDLEABLE) { 2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2404 c->eip = saved_eip;
2405 return -1;
2406 }
2407 return 0;
2408 2992
2409twobyte_insn: 2993twobyte_insn:
2410 switch (c->b) { 2994 switch (c->b) {
@@ -2418,18 +3002,18 @@ twobyte_insn:
2418 goto cannot_emulate; 3002 goto cannot_emulate;
2419 3003
2420 rc = kvm_fix_hypercall(ctxt->vcpu); 3004 rc = kvm_fix_hypercall(ctxt->vcpu);
2421 if (rc) 3005 if (rc != X86EMUL_CONTINUE)
2422 goto done; 3006 goto done;
2423 3007
2424 /* Let the processor re-execute the fixed hypercall */ 3008 /* Let the processor re-execute the fixed hypercall */
2425 c->eip = kvm_rip_read(ctxt->vcpu); 3009 c->eip = ctxt->eip;
2426 /* Disable writeback. */ 3010 /* Disable writeback. */
2427 c->dst.type = OP_NONE; 3011 c->dst.type = OP_NONE;
2428 break; 3012 break;
2429 case 2: /* lgdt */ 3013 case 2: /* lgdt */
2430 rc = read_descriptor(ctxt, ops, c->src.ptr, 3014 rc = read_descriptor(ctxt, ops, c->src.ptr,
2431 &size, &address, c->op_bytes); 3015 &size, &address, c->op_bytes);
2432 if (rc) 3016 if (rc != X86EMUL_CONTINUE)
2433 goto done; 3017 goto done;
2434 realmode_lgdt(ctxt->vcpu, size, address); 3018 realmode_lgdt(ctxt->vcpu, size, address);
2435 /* Disable writeback. */ 3019 /* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
2440 switch (c->modrm_rm) { 3024 switch (c->modrm_rm) {
2441 case 1: 3025 case 1:
2442 rc = kvm_fix_hypercall(ctxt->vcpu); 3026 rc = kvm_fix_hypercall(ctxt->vcpu);
2443 if (rc) 3027 if (rc != X86EMUL_CONTINUE)
2444 goto done; 3028 goto done;
2445 break; 3029 break;
2446 default: 3030 default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
2450 rc = read_descriptor(ctxt, ops, c->src.ptr, 3034 rc = read_descriptor(ctxt, ops, c->src.ptr,
2451 &size, &address, 3035 &size, &address,
2452 c->op_bytes); 3036 c->op_bytes);
2453 if (rc) 3037 if (rc != X86EMUL_CONTINUE)
2454 goto done; 3038 goto done;
2455 realmode_lidt(ctxt->vcpu, size, address); 3039 realmode_lidt(ctxt->vcpu, size, address);
2456 } 3040 }
@@ -2459,15 +3043,18 @@ twobyte_insn:
2459 break; 3043 break;
2460 case 4: /* smsw */ 3044 case 4: /* smsw */
2461 c->dst.bytes = 2; 3045 c->dst.bytes = 2;
2462 c->dst.val = realmode_get_cr(ctxt->vcpu, 0); 3046 c->dst.val = ops->get_cr(0, ctxt->vcpu);
2463 break; 3047 break;
2464 case 6: /* lmsw */ 3048 case 6: /* lmsw */
2465 realmode_lmsw(ctxt->vcpu, (u16)c->src.val, 3049 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
2466 &ctxt->eflags); 3050 (c->src.val & 0x0f), ctxt->vcpu);
2467 c->dst.type = OP_NONE; 3051 c->dst.type = OP_NONE;
2468 break; 3052 break;
3053 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3055 goto done;
2469 case 7: /* invlpg*/ 3056 case 7: /* invlpg*/
2470 emulate_invlpg(ctxt->vcpu, memop); 3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
2471 /* Disable writeback. */ 3058 /* Disable writeback. */
2472 c->dst.type = OP_NONE; 3059 c->dst.type = OP_NONE;
2473 break; 3060 break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
2493 c->dst.type = OP_NONE; 3080 c->dst.type = OP_NONE;
2494 break; 3081 break;
2495 case 0x20: /* mov cr, reg */ 3082 case 0x20: /* mov cr, reg */
2496 if (c->modrm_mod != 3) 3083 switch (c->modrm_reg) {
2497 goto cannot_emulate; 3084 case 1:
2498 c->regs[c->modrm_rm] = 3085 case 5 ... 7:
2499 realmode_get_cr(ctxt->vcpu, c->modrm_reg); 3086 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3088 goto done;
3089 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
2500 c->dst.type = OP_NONE; /* no writeback */ 3091 c->dst.type = OP_NONE; /* no writeback */
2501 break; 3092 break;
2502 case 0x21: /* mov from dr to reg */ 3093 case 0x21: /* mov from dr to reg */
2503 if (c->modrm_mod != 3) 3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2504 goto cannot_emulate; 3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2505 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2506 if (rc) 3097 goto done;
2507 goto cannot_emulate; 3098 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
2508 c->dst.type = OP_NONE; /* no writeback */ 3100 c->dst.type = OP_NONE; /* no writeback */
2509 break; 3101 break;
2510 case 0x22: /* mov reg, cr */ 3102 case 0x22: /* mov reg, cr */
2511 if (c->modrm_mod != 3) 3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
2512 goto cannot_emulate;
2513 realmode_set_cr(ctxt->vcpu,
2514 c->modrm_reg, c->modrm_val, &ctxt->eflags);
2515 c->dst.type = OP_NONE; 3104 c->dst.type = OP_NONE;
2516 break; 3105 break;
2517 case 0x23: /* mov from reg to dr */ 3106 case 0x23: /* mov from reg to dr */
2518 if (c->modrm_mod != 3) 3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2519 goto cannot_emulate; 3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2520 rc = emulator_set_dr(ctxt, c->modrm_reg, 3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2521 c->regs[c->modrm_rm]); 3110 goto done;
2522 if (rc) 3111 }
2523 goto cannot_emulate; 3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
2524 c->dst.type = OP_NONE; /* no writeback */ 3113 c->dst.type = OP_NONE; /* no writeback */
2525 break; 3114 break;
2526 case 0x30: 3115 case 0x30:
2527 /* wrmsr */ 3116 /* wrmsr */
2528 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3117 msr_data = (u32)c->regs[VCPU_REGS_RAX]
2529 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
2530 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
2531 if (rc) {
2532 kvm_inject_gp(ctxt->vcpu, 0); 3120 kvm_inject_gp(ctxt->vcpu, 0);
2533 c->eip = kvm_rip_read(ctxt->vcpu); 3121 goto done;
2534 } 3122 }
2535 rc = X86EMUL_CONTINUE; 3123 rc = X86EMUL_CONTINUE;
2536 c->dst.type = OP_NONE; 3124 c->dst.type = OP_NONE;
2537 break; 3125 break;
2538 case 0x32: 3126 case 0x32:
2539 /* rdmsr */ 3127 /* rdmsr */
2540 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
2541 if (rc) {
2542 kvm_inject_gp(ctxt->vcpu, 0); 3129 kvm_inject_gp(ctxt->vcpu, 0);
2543 c->eip = kvm_rip_read(ctxt->vcpu); 3130 goto done;
2544 } else { 3131 } else {
2545 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
2546 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3133 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -2577,7 +3164,7 @@ twobyte_insn:
2577 break; 3164 break;
2578 case 0xa1: /* pop fs */ 3165 case 0xa1: /* pop fs */
2579 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2580 if (rc != 0) 3167 if (rc != X86EMUL_CONTINUE)
2581 goto done; 3168 goto done;
2582 break; 3169 break;
2583 case 0xa3: 3170 case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
2596 break; 3183 break;
2597 case 0xa9: /* pop gs */ 3184 case 0xa9: /* pop gs */
2598 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2599 if (rc != 0) 3186 if (rc != X86EMUL_CONTINUE)
2600 goto done; 3187 goto done;
2601 break; 3188 break;
2602 case 0xab: 3189 case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
2668 (u64) c->src.val; 3255 (u64) c->src.val;
2669 break; 3256 break;
2670 case 0xc7: /* Grp9 (cmpxchg8b) */ 3257 case 0xc7: /* Grp9 (cmpxchg8b) */
2671 rc = emulate_grp9(ctxt, ops, memop); 3258 rc = emulate_grp9(ctxt, ops);
2672 if (rc != 0) 3259 if (rc != X86EMUL_CONTINUE)
2673 goto done; 3260 goto done;
2674 c->dst.type = OP_NONE;
2675 break; 3261 break;
2676 } 3262 }
2677 goto writeback; 3263 goto writeback;
2678 3264
2679cannot_emulate: 3265cannot_emulate:
2680 DPRINTF("Cannot emulate %02x\n", c->b); 3266 DPRINTF("Cannot emulate %02x\n", c->b);
2681 c->eip = saved_eip;
2682 return -1; 3267 return -1;
2683} 3268}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9..93825ff3338 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
33#include <linux/kvm_host.h> 33#include <linux/kvm_host.h>
34#include "trace.h" 34#include "trace.h"
35 35
36static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock)
38{
39 raw_spin_lock(&s->lock);
40}
41
42static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock)
44{
45 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu;
47
48 s->wakeup_needed = false;
49
50 raw_spin_unlock(&s->lock);
51
52 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu;
54 if (vcpu)
55 kvm_vcpu_kick(vcpu);
56 }
57}
58
36static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 59static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
37{ 60{
38 s->isr &= ~(1 << irq); 61 s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
45 * Other interrupt may be delivered to PIC while lock is dropped but 68 * Other interrupt may be delivered to PIC while lock is dropped but
46 * it should be safe since PIC state is already updated at this stage. 69 * it should be safe since PIC state is already updated at this stage.
47 */ 70 */
48 raw_spin_unlock(&s->pics_state->lock); 71 pic_unlock(s->pics_state);
49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 72 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
50 raw_spin_lock(&s->pics_state->lock); 73 pic_lock(s->pics_state);
51} 74}
52 75
53void kvm_pic_clear_isr_ack(struct kvm *kvm) 76void kvm_pic_clear_isr_ack(struct kvm *kvm)
54{ 77{
55 struct kvm_pic *s = pic_irqchip(kvm); 78 struct kvm_pic *s = pic_irqchip(kvm);
56 79
57 raw_spin_lock(&s->lock); 80 pic_lock(s);
58 s->pics[0].isr_ack = 0xff; 81 s->pics[0].isr_ack = 0xff;
59 s->pics[1].isr_ack = 0xff; 82 s->pics[1].isr_ack = 0xff;
60 raw_spin_unlock(&s->lock); 83 pic_unlock(s);
61} 84}
62 85
63/* 86/*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
158 181
159void kvm_pic_update_irq(struct kvm_pic *s) 182void kvm_pic_update_irq(struct kvm_pic *s)
160{ 183{
161 raw_spin_lock(&s->lock); 184 pic_lock(s);
162 pic_update_irq(s); 185 pic_update_irq(s);
163 raw_spin_unlock(&s->lock); 186 pic_unlock(s);
164} 187}
165 188
166int kvm_pic_set_irq(void *opaque, int irq, int level) 189int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
168 struct kvm_pic *s = opaque; 191 struct kvm_pic *s = opaque;
169 int ret = -1; 192 int ret = -1;
170 193
171 raw_spin_lock(&s->lock); 194 pic_lock(s);
172 if (irq >= 0 && irq < PIC_NUM_PINS) { 195 if (irq >= 0 && irq < PIC_NUM_PINS) {
173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 196 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
174 pic_update_irq(s); 197 pic_update_irq(s);
175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 198 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
176 s->pics[irq >> 3].imr, ret == 0); 199 s->pics[irq >> 3].imr, ret == 0);
177 } 200 }
178 raw_spin_unlock(&s->lock); 201 pic_unlock(s);
179 202
180 return ret; 203 return ret;
181} 204}
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
205 int irq, irq2, intno; 228 int irq, irq2, intno;
206 struct kvm_pic *s = pic_irqchip(kvm); 229 struct kvm_pic *s = pic_irqchip(kvm);
207 230
208 raw_spin_lock(&s->lock); 231 pic_lock(s);
209 irq = pic_get_irq(&s->pics[0]); 232 irq = pic_get_irq(&s->pics[0]);
210 if (irq >= 0) { 233 if (irq >= 0) {
211 pic_intack(&s->pics[0], irq); 234 pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
230 intno = s->pics[0].irq_base + irq; 253 intno = s->pics[0].irq_base + irq;
231 } 254 }
232 pic_update_irq(s); 255 pic_update_irq(s);
233 raw_spin_unlock(&s->lock); 256 pic_unlock(s);
234 257
235 return intno; 258 return intno;
236} 259}
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
444 printk(KERN_ERR "PIC: non byte write\n"); 467 printk(KERN_ERR "PIC: non byte write\n");
445 return 0; 468 return 0;
446 } 469 }
447 raw_spin_lock(&s->lock); 470 pic_lock(s);
448 switch (addr) { 471 switch (addr) {
449 case 0x20: 472 case 0x20:
450 case 0x21: 473 case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
457 elcr_ioport_write(&s->pics[addr & 1], addr, data); 480 elcr_ioport_write(&s->pics[addr & 1], addr, data);
458 break; 481 break;
459 } 482 }
460 raw_spin_unlock(&s->lock); 483 pic_unlock(s);
461 return 0; 484 return 0;
462} 485}
463 486
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
474 printk(KERN_ERR "PIC: non byte read\n"); 497 printk(KERN_ERR "PIC: non byte read\n");
475 return 0; 498 return 0;
476 } 499 }
477 raw_spin_lock(&s->lock); 500 pic_lock(s);
478 switch (addr) { 501 switch (addr) {
479 case 0x20: 502 case 0x20:
480 case 0x21: 503 case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
488 break; 511 break;
489 } 512 }
490 *(unsigned char *)val = data; 513 *(unsigned char *)val = data;
491 raw_spin_unlock(&s->lock); 514 pic_unlock(s);
492 return 0; 515 return 0;
493} 516}
494 517
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
505 s->output = level; 528 s->output = level;
506 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 529 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
507 s->pics[0].isr_ack &= ~(1 << irq); 530 s->pics[0].isr_ack &= ~(1 << irq);
508 kvm_vcpu_kick(vcpu); 531 s->wakeup_needed = true;
509 } 532 }
510} 533}
511 534
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754..cd1f362f413 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
63 63
64struct kvm_pic { 64struct kvm_pic {
65 raw_spinlock_t lock; 65 raw_spinlock_t lock;
66 bool wakeup_needed;
66 unsigned pending_acks; 67 unsigned pending_acks;
67 struct kvm *kvm; 68 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda5..64bc6ea78d9 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
10}; 10};
11 11
12struct kvm_timer_ops { 12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *); 13 bool (*is_periodic)(struct kvm_timer *);
14}; 14};
15 15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); 16enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906bcaa..81563e76e28 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
148 148
149#include <trace/events/kvm.h> 149#include <trace/events/kvm.h>
150 150
151#undef TRACE_INCLUDE_FILE
152#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
153#include "mmutrace.h" 152#include "mmutrace.h"
154 153
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
174 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
175 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
176 175
177 176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
178struct kvm_unsync_walk {
179 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
180};
181
182typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
183 177
184static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
185static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -223,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223} 217}
224EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 218EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
225 219
226static int is_write_protection(struct kvm_vcpu *vcpu) 220static bool is_write_protection(struct kvm_vcpu *vcpu)
227{ 221{
228 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 222 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
229} 223}
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
327 page = alloc_page(GFP_KERNEL); 321 page = alloc_page(GFP_KERNEL);
328 if (!page) 322 if (!page)
329 return -ENOMEM; 323 return -ENOMEM;
330 set_page_private(page, 0);
331 cache->objects[cache->nobjs++] = page_address(page); 324 cache->objects[cache->nobjs++] = page_address(page);
332 } 325 }
333 return 0; 326 return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
438 int i; 431 int i;
439 432
440 gfn = unalias_gfn(kvm, gfn); 433 gfn = unalias_gfn(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
441 for (i = PT_DIRECTORY_LEVEL; 435 for (i = PT_DIRECTORY_LEVEL;
442 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
443 slot = gfn_to_memslot_unaliased(kvm, gfn);
444 write_count = slot_largepage_idx(gfn, slot, i); 437 write_count = slot_largepage_idx(gfn, slot, i);
445 *write_count -= 1; 438 *write_count -= 1;
446 WARN_ON(*write_count < 0); 439 WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
654static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
655{ 648{
656 struct kvm_rmap_desc *desc; 649 struct kvm_rmap_desc *desc;
657 struct kvm_rmap_desc *prev_desc;
658 u64 *prev_spte; 650 u64 *prev_spte;
659 int i; 651 int i;
660 652
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
666 return NULL; 658 return NULL;
667 } 659 }
668 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
669 prev_desc = NULL;
670 prev_spte = NULL; 661 prev_spte = NULL;
671 while (desc) { 662 while (desc) {
672 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 663 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
794 int retval = 0; 785 int retval = 0;
795 struct kvm_memslots *slots; 786 struct kvm_memslots *slots;
796 787
797 slots = rcu_dereference(kvm->memslots); 788 slots = kvm_memslots(kvm);
798 789
799 for (i = 0; i < slots->nmemslots; i++) { 790 for (i = 0; i < slots->nmemslots; i++) {
800 struct kvm_memory_slot *memslot = &slots->memslots[i]; 791 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
925 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
926 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
927 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
928 INIT_LIST_HEAD(&sp->oos_link);
929 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
930 sp->multimapped = 0; 920 sp->multimapped = 0;
931 sp->parent_pte = parent_pte; 921 sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1009} 999}
1010 1000
1011 1001
1012static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1013 mmu_parent_walk_fn fn)
1014{ 1003{
1015 struct kvm_pte_chain *pte_chain; 1004 struct kvm_pte_chain *pte_chain;
1016 struct hlist_node *node; 1005 struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1019 1008
1020 if (!sp->multimapped && sp->parent_pte) { 1009 if (!sp->multimapped && sp->parent_pte) {
1021 parent_sp = page_header(__pa(sp->parent_pte)); 1010 parent_sp = page_header(__pa(sp->parent_pte));
1022 fn(vcpu, parent_sp); 1011 fn(parent_sp);
1023 mmu_parent_walk(vcpu, parent_sp, fn); 1012 mmu_parent_walk(parent_sp, fn);
1024 return; 1013 return;
1025 } 1014 }
1026 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1028 if (!pte_chain->parent_ptes[i]) 1017 if (!pte_chain->parent_ptes[i])
1029 break; 1018 break;
1030 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
1031 fn(vcpu, parent_sp); 1020 fn(parent_sp);
1032 mmu_parent_walk(vcpu, parent_sp, fn); 1021 mmu_parent_walk(parent_sp, fn);
1033 } 1022 }
1034} 1023}
1035 1024
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
1066 } 1055 }
1067} 1056}
1068 1057
1069static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1070{ 1059{
1071 kvm_mmu_update_parents_unsync(sp); 1060 kvm_mmu_update_parents_unsync(sp);
1072 return 1; 1061 return 1;
1073} 1062}
1074 1063
1075static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, 1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1076 struct kvm_mmu_page *sp)
1077{ 1065{
1078 mmu_parent_walk(vcpu, sp, unsync_walk_fn); 1066 mmu_parent_walk(sp, unsync_walk_fn);
1079 kvm_mmu_update_parents_unsync(sp); 1067 kvm_mmu_update_parents_unsync(sp);
1080} 1068}
1081 1069
@@ -1201,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1201static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1202{ 1190{
1203 WARN_ON(!sp->unsync); 1191 WARN_ON(!sp->unsync);
1192 trace_kvm_mmu_sync_page(sp);
1204 sp->unsync = 0; 1193 sp->unsync = 0;
1205 --kvm->stat.mmu_unsync; 1194 --kvm->stat.mmu_unsync;
1206} 1195}
@@ -1209,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1209 1198
1210static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1211{ 1200{
1212 if (sp->role.glevels != vcpu->arch.mmu.root_level) { 1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1213 kvm_mmu_zap_page(vcpu->kvm, sp); 1202 kvm_mmu_zap_page(vcpu->kvm, sp);
1214 return 1; 1203 return 1;
1215 } 1204 }
1216 1205
1217 trace_kvm_mmu_sync_page(sp);
1218 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1206 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1219 kvm_flush_remote_tlbs(vcpu->kvm); 1207 kvm_flush_remote_tlbs(vcpu->kvm);
1220 kvm_unlink_unsync_page(vcpu->kvm, sp); 1208 kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1331 role = vcpu->arch.mmu.base_role; 1319 role = vcpu->arch.mmu.base_role;
1332 role.level = level; 1320 role.level = level;
1333 role.direct = direct; 1321 role.direct = direct;
1322 if (role.direct)
1323 role.cr4_pae = 0;
1334 role.access = access; 1324 role.access = access;
1335 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1336 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1351 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1352 if (sp->unsync_children) { 1342 if (sp->unsync_children) {
1353 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1354 kvm_mmu_mark_parents_unsync(vcpu, sp); 1344 kvm_mmu_mark_parents_unsync(sp);
1355 } 1345 }
1356 trace_kvm_mmu_get_page(sp, false); 1346 trace_kvm_mmu_get_page(sp, false);
1357 return sp; 1347 return sp;
@@ -1573,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1573 r = 0; 1563 r = 0;
1574 index = kvm_page_table_hashfn(gfn); 1564 index = kvm_page_table_hashfn(gfn);
1575 bucket = &kvm->arch.mmu_page_hash[index]; 1565 bucket = &kvm->arch.mmu_page_hash[index];
1566restart:
1576 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1577 if (sp->gfn == gfn && !sp->role.direct) { 1568 if (sp->gfn == gfn && !sp->role.direct) {
1578 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1579 sp->role.word); 1570 sp->role.word);
1580 r = 1; 1571 r = 1;
1581 if (kvm_mmu_zap_page(kvm, sp)) 1572 if (kvm_mmu_zap_page(kvm, sp))
1582 n = bucket->first; 1573 goto restart;
1583 } 1574 }
1584 return r; 1575 return r;
1585} 1576}
@@ -1593,13 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1593 1584
1594 index = kvm_page_table_hashfn(gfn); 1585 index = kvm_page_table_hashfn(gfn);
1595 bucket = &kvm->arch.mmu_page_hash[index]; 1586 bucket = &kvm->arch.mmu_page_hash[index];
1587restart:
1596 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
1597 if (sp->gfn == gfn && !sp->role.direct 1589 if (sp->gfn == gfn && !sp->role.direct
1598 && !sp->role.invalid) { 1590 && !sp->role.invalid) {
1599 pgprintk("%s: zap %lx %x\n", 1591 pgprintk("%s: zap %lx %x\n",
1600 __func__, gfn, sp->role.word); 1592 __func__, gfn, sp->role.word);
1601 if (kvm_mmu_zap_page(kvm, sp)) 1593 if (kvm_mmu_zap_page(kvm, sp))
1602 nn = bucket->first; 1594 goto restart;
1603 } 1595 }
1604 } 1596 }
1605} 1597}
@@ -1626,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1626 } 1618 }
1627} 1619}
1628 1620
1629struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1630{
1631 struct page *page;
1632
1633 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1634
1635 if (gpa == UNMAPPED_GVA)
1636 return NULL;
1637
1638 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1639
1640 return page;
1641}
1642
1643/* 1621/*
1644 * The function is based on mtrr_type_lookup() in 1622 * The function is based on mtrr_type_lookup() in
1645 * arch/x86/kernel/cpu/mtrr/generic.c 1623 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1752,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1752 struct kvm_mmu_page *s; 1730 struct kvm_mmu_page *s;
1753 struct hlist_node *node, *n; 1731 struct hlist_node *node, *n;
1754 1732
1755 trace_kvm_mmu_unsync_page(sp);
1756 index = kvm_page_table_hashfn(sp->gfn); 1733 index = kvm_page_table_hashfn(sp->gfn);
1757 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1758 /* don't unsync if pagetable is shadowed with multiple roles */ 1735 /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1762,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1762 if (s->role.word != sp->role.word) 1739 if (s->role.word != sp->role.word)
1763 return 1; 1740 return 1;
1764 } 1741 }
1742 trace_kvm_mmu_unsync_page(sp);
1765 ++vcpu->kvm->stat.mmu_unsync; 1743 ++vcpu->kvm->stat.mmu_unsync;
1766 sp->unsync = 1; 1744 sp->unsync = 1;
1767 1745
1768 kvm_mmu_mark_parents_unsync(vcpu, sp); 1746 kvm_mmu_mark_parents_unsync(sp);
1769 1747
1770 mmu_convert_notrap(sp); 1748 mmu_convert_notrap(sp);
1771 return 0; 1749 return 0;
@@ -2081,21 +2059,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2081 hpa_t root = vcpu->arch.mmu.root_hpa; 2059 hpa_t root = vcpu->arch.mmu.root_hpa;
2082 2060
2083 ASSERT(!VALID_PAGE(root)); 2061 ASSERT(!VALID_PAGE(root));
2084 if (tdp_enabled)
2085 direct = 1;
2086 if (mmu_check_root(vcpu, root_gfn)) 2062 if (mmu_check_root(vcpu, root_gfn))
2087 return 1; 2063 return 1;
2064 if (tdp_enabled) {
2065 direct = 1;
2066 root_gfn = 0;
2067 }
2068 spin_lock(&vcpu->kvm->mmu_lock);
2088 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2069 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2089 PT64_ROOT_LEVEL, direct, 2070 PT64_ROOT_LEVEL, direct,
2090 ACC_ALL, NULL); 2071 ACC_ALL, NULL);
2091 root = __pa(sp->spt); 2072 root = __pa(sp->spt);
2092 ++sp->root_count; 2073 ++sp->root_count;
2074 spin_unlock(&vcpu->kvm->mmu_lock);
2093 vcpu->arch.mmu.root_hpa = root; 2075 vcpu->arch.mmu.root_hpa = root;
2094 return 0; 2076 return 0;
2095 } 2077 }
2096 direct = !is_paging(vcpu); 2078 direct = !is_paging(vcpu);
2097 if (tdp_enabled)
2098 direct = 1;
2099 for (i = 0; i < 4; ++i) { 2079 for (i = 0; i < 4; ++i) {
2100 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2080 hpa_t root = vcpu->arch.mmu.pae_root[i];
2101 2081
@@ -2111,11 +2091,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2111 root_gfn = 0; 2091 root_gfn = 0;
2112 if (mmu_check_root(vcpu, root_gfn)) 2092 if (mmu_check_root(vcpu, root_gfn))
2113 return 1; 2093 return 1;
2094 if (tdp_enabled) {
2095 direct = 1;
2096 root_gfn = i << 30;
2097 }
2098 spin_lock(&vcpu->kvm->mmu_lock);
2114 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2099 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2115 PT32_ROOT_LEVEL, direct, 2100 PT32_ROOT_LEVEL, direct,
2116 ACC_ALL, NULL); 2101 ACC_ALL, NULL);
2117 root = __pa(sp->spt); 2102 root = __pa(sp->spt);
2118 ++sp->root_count; 2103 ++sp->root_count;
2104 spin_unlock(&vcpu->kvm->mmu_lock);
2105
2119 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2106 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2120 } 2107 }
2121 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2108 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2299,13 +2286,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2299 /* no rsvd bits for 2 level 4K page table entries */ 2286 /* no rsvd bits for 2 level 4K page table entries */
2300 context->rsvd_bits_mask[0][1] = 0; 2287 context->rsvd_bits_mask[0][1] = 0;
2301 context->rsvd_bits_mask[0][0] = 0; 2288 context->rsvd_bits_mask[0][0] = 0;
2289 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2290
2291 if (!is_pse(vcpu)) {
2292 context->rsvd_bits_mask[1][1] = 0;
2293 break;
2294 }
2295
2302 if (is_cpuid_PSE36()) 2296 if (is_cpuid_PSE36())
2303 /* 36bits PSE 4MB page */ 2297 /* 36bits PSE 4MB page */
2304 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2298 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2305 else 2299 else
2306 /* 32 bits PSE 4MB page */ 2300 /* 32 bits PSE 4MB page */
2307 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2301 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2308 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2309 break; 2302 break;
2310 case PT32E_ROOT_LEVEL: 2303 case PT32E_ROOT_LEVEL:
2311 context->rsvd_bits_mask[0][2] = 2304 context->rsvd_bits_mask[0][2] =
@@ -2318,7 +2311,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2318 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2311 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2319 rsvd_bits(maxphyaddr, 62) | 2312 rsvd_bits(maxphyaddr, 62) |
2320 rsvd_bits(13, 20); /* large page */ 2313 rsvd_bits(13, 20); /* large page */
2321 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2314 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2322 break; 2315 break;
2323 case PT64_ROOT_LEVEL: 2316 case PT64_ROOT_LEVEL:
2324 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2317 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2336,7 +2329,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2336 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2329 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2337 rsvd_bits(maxphyaddr, 51) | 2330 rsvd_bits(maxphyaddr, 51) |
2338 rsvd_bits(13, 20); /* large page */ 2331 rsvd_bits(13, 20); /* large page */
2339 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2332 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2340 break; 2333 break;
2341 } 2334 }
2342} 2335}
@@ -2438,7 +2431,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2438 else 2431 else
2439 r = paging32_init_context(vcpu); 2432 r = paging32_init_context(vcpu);
2440 2433
2441 vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; 2434 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2435 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2442 2436
2443 return r; 2437 return r;
2444} 2438}
@@ -2478,7 +2472,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2478 goto out; 2472 goto out;
2479 spin_lock(&vcpu->kvm->mmu_lock); 2473 spin_lock(&vcpu->kvm->mmu_lock);
2480 kvm_mmu_free_some_pages(vcpu); 2474 kvm_mmu_free_some_pages(vcpu);
2475 spin_unlock(&vcpu->kvm->mmu_lock);
2481 r = mmu_alloc_roots(vcpu); 2476 r = mmu_alloc_roots(vcpu);
2477 spin_lock(&vcpu->kvm->mmu_lock);
2482 mmu_sync_roots(vcpu); 2478 mmu_sync_roots(vcpu);
2483 spin_unlock(&vcpu->kvm->mmu_lock); 2479 spin_unlock(&vcpu->kvm->mmu_lock);
2484 if (r) 2480 if (r)
@@ -2527,7 +2523,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2527 } 2523 }
2528 2524
2529 ++vcpu->kvm->stat.mmu_pte_updated; 2525 ++vcpu->kvm->stat.mmu_pte_updated;
2530 if (sp->role.glevels == PT32_ROOT_LEVEL) 2526 if (!sp->role.cr4_pae)
2531 paging32_update_pte(vcpu, sp, spte, new); 2527 paging32_update_pte(vcpu, sp, spte, new);
2532 else 2528 else
2533 paging64_update_pte(vcpu, sp, spte, new); 2529 paging64_update_pte(vcpu, sp, spte, new);
@@ -2562,36 +2558,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2562} 2558}
2563 2559
2564static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2560static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2565 const u8 *new, int bytes) 2561 u64 gpte)
2566{ 2562{
2567 gfn_t gfn; 2563 gfn_t gfn;
2568 int r;
2569 u64 gpte = 0;
2570 pfn_t pfn; 2564 pfn_t pfn;
2571 2565
2572 if (bytes != 4 && bytes != 8)
2573 return;
2574
2575 /*
2576 * Assume that the pte write on a page table of the same type
2577 * as the current vcpu paging mode. This is nearly always true
2578 * (might be false while changing modes). Note it is verified later
2579 * by update_pte().
2580 */
2581 if (is_pae(vcpu)) {
2582 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2583 if ((bytes == 4) && (gpa % 4 == 0)) {
2584 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2585 if (r)
2586 return;
2587 memcpy((void *)&gpte + (gpa % 8), new, 4);
2588 } else if ((bytes == 8) && (gpa % 8 == 0)) {
2589 memcpy((void *)&gpte, new, 8);
2590 }
2591 } else {
2592 if ((bytes == 4) && (gpa % 4 == 0))
2593 memcpy((void *)&gpte, new, 4);
2594 }
2595 if (!is_present_gpte(gpte)) 2566 if (!is_present_gpte(gpte))
2596 return; 2567 return;
2597 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2568 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,10 +2611,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2640 int flooded = 0; 2611 int flooded = 0;
2641 int npte; 2612 int npte;
2642 int r; 2613 int r;
2614 int invlpg_counter;
2643 2615
2644 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2616 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2645 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 2617
2618 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2619
2620 /*
2621 * Assume that the pte write on a page table of the same type
2622 * as the current vcpu paging mode. This is nearly always true
2623 * (might be false while changing modes). Note it is verified later
2624 * by update_pte().
2625 */
2626 if ((is_pae(vcpu) && bytes == 4) || !new) {
2627 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2628 if (is_pae(vcpu)) {
2629 gpa &= ~(gpa_t)7;
2630 bytes = 8;
2631 }
2632 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2633 if (r)
2634 gentry = 0;
2635 new = (const u8 *)&gentry;
2636 }
2637
2638 switch (bytes) {
2639 case 4:
2640 gentry = *(const u32 *)new;
2641 break;
2642 case 8:
2643 gentry = *(const u64 *)new;
2644 break;
2645 default:
2646 gentry = 0;
2647 break;
2648 }
2649
2650 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2646 spin_lock(&vcpu->kvm->mmu_lock); 2651 spin_lock(&vcpu->kvm->mmu_lock);
2652 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2653 gentry = 0;
2647 kvm_mmu_access_page(vcpu, gfn); 2654 kvm_mmu_access_page(vcpu, gfn);
2648 kvm_mmu_free_some_pages(vcpu); 2655 kvm_mmu_free_some_pages(vcpu);
2649 ++vcpu->kvm->stat.mmu_pte_write; 2656 ++vcpu->kvm->stat.mmu_pte_write;
@@ -2662,10 +2669,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2662 } 2669 }
2663 index = kvm_page_table_hashfn(gfn); 2670 index = kvm_page_table_hashfn(gfn);
2664 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2671 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2672
2673restart:
2665 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2674 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2666 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) 2675 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2667 continue; 2676 continue;
2668 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2677 pte_size = sp->role.cr4_pae ? 8 : 4;
2669 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2678 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2670 misaligned |= bytes < 4; 2679 misaligned |= bytes < 4;
2671 if (misaligned || flooded) { 2680 if (misaligned || flooded) {
@@ -2682,14 +2691,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2682 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2691 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2683 gpa, bytes, sp->role.word); 2692 gpa, bytes, sp->role.word);
2684 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2693 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2685 n = bucket->first; 2694 goto restart;
2686 ++vcpu->kvm->stat.mmu_flooded; 2695 ++vcpu->kvm->stat.mmu_flooded;
2687 continue; 2696 continue;
2688 } 2697 }
2689 page_offset = offset; 2698 page_offset = offset;
2690 level = sp->role.level; 2699 level = sp->role.level;
2691 npte = 1; 2700 npte = 1;
2692 if (sp->role.glevels == PT32_ROOT_LEVEL) { 2701 if (!sp->role.cr4_pae) {
2693 page_offset <<= 1; /* 32->64 */ 2702 page_offset <<= 1; /* 32->64 */
2694 /* 2703 /*
2695 * A 32-bit pde maps 4MB while the shadow pdes map 2704 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -2707,20 +2716,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2707 continue; 2716 continue;
2708 } 2717 }
2709 spte = &sp->spt[page_offset / sizeof(*spte)]; 2718 spte = &sp->spt[page_offset / sizeof(*spte)];
2710 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2711 gentry = 0;
2712 r = kvm_read_guest_atomic(vcpu->kvm,
2713 gpa & ~(u64)(pte_size - 1),
2714 &gentry, pte_size);
2715 new = (const void *)&gentry;
2716 if (r < 0)
2717 new = NULL;
2718 }
2719 while (npte--) { 2719 while (npte--) {
2720 entry = *spte; 2720 entry = *spte;
2721 mmu_pte_write_zap_pte(vcpu, sp, spte); 2721 mmu_pte_write_zap_pte(vcpu, sp, spte);
2722 if (new) 2722 if (gentry)
2723 mmu_pte_write_new_pte(vcpu, sp, spte, new); 2723 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2724 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2724 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2725 ++spte; 2725 ++spte;
2726 } 2726 }
@@ -2900,22 +2900,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2900 struct kvm_mmu_page *sp, *node; 2900 struct kvm_mmu_page *sp, *node;
2901 2901
2902 spin_lock(&kvm->mmu_lock); 2902 spin_lock(&kvm->mmu_lock);
2903restart:
2903 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2904 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2904 if (kvm_mmu_zap_page(kvm, sp)) 2905 if (kvm_mmu_zap_page(kvm, sp))
2905 node = container_of(kvm->arch.active_mmu_pages.next, 2906 goto restart;
2906 struct kvm_mmu_page, link); 2907
2907 spin_unlock(&kvm->mmu_lock); 2908 spin_unlock(&kvm->mmu_lock);
2908 2909
2909 kvm_flush_remote_tlbs(kvm); 2910 kvm_flush_remote_tlbs(kvm);
2910} 2911}
2911 2912
2912static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 2913static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
2913{ 2914{
2914 struct kvm_mmu_page *page; 2915 struct kvm_mmu_page *page;
2915 2916
2916 page = container_of(kvm->arch.active_mmu_pages.prev, 2917 page = container_of(kvm->arch.active_mmu_pages.prev,
2917 struct kvm_mmu_page, link); 2918 struct kvm_mmu_page, link);
2918 kvm_mmu_zap_page(kvm, page); 2919 return kvm_mmu_zap_page(kvm, page) + 1;
2919} 2920}
2920 2921
2921static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) 2922static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2927,7 +2928,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2927 spin_lock(&kvm_lock); 2928 spin_lock(&kvm_lock);
2928 2929
2929 list_for_each_entry(kvm, &vm_list, vm_list) { 2930 list_for_each_entry(kvm, &vm_list, vm_list) {
2930 int npages, idx; 2931 int npages, idx, freed_pages;
2931 2932
2932 idx = srcu_read_lock(&kvm->srcu); 2933 idx = srcu_read_lock(&kvm->srcu);
2933 spin_lock(&kvm->mmu_lock); 2934 spin_lock(&kvm->mmu_lock);
@@ -2935,8 +2936,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2935 kvm->arch.n_free_mmu_pages; 2936 kvm->arch.n_free_mmu_pages;
2936 cache_count += npages; 2937 cache_count += npages;
2937 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 2938 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2938 kvm_mmu_remove_one_alloc_mmu_page(kvm); 2939 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
2939 cache_count--; 2940 cache_count -= freed_pages;
2940 kvm_freed = kvm; 2941 kvm_freed = kvm;
2941 } 2942 }
2942 nr_to_scan--; 2943 nr_to_scan--;
@@ -3011,7 +3012,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3011 unsigned int nr_pages = 0; 3012 unsigned int nr_pages = 0;
3012 struct kvm_memslots *slots; 3013 struct kvm_memslots *slots;
3013 3014
3014 slots = rcu_dereference(kvm->memslots); 3015 slots = kvm_memslots(kvm);
3016
3015 for (i = 0; i < slots->nmemslots; i++) 3017 for (i = 0; i < slots->nmemslots; i++)
3016 nr_pages += slots->memslots[i].npages; 3018 nr_pages += slots->memslots[i].npages;
3017 3019
@@ -3174,8 +3176,7 @@ static gva_t canonicalize(gva_t gva)
3174} 3176}
3175 3177
3176 3178
3177typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, 3179typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3178 u64 *sptep);
3179 3180
3180static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3181static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3181 inspect_spte_fn fn) 3182 inspect_spte_fn fn)
@@ -3191,7 +3192,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3191 child = page_header(ent & PT64_BASE_ADDR_MASK); 3192 child = page_header(ent & PT64_BASE_ADDR_MASK);
3192 __mmu_spte_walk(kvm, child, fn); 3193 __mmu_spte_walk(kvm, child, fn);
3193 } else 3194 } else
3194 fn(kvm, sp, &sp->spt[i]); 3195 fn(kvm, &sp->spt[i]);
3195 } 3196 }
3196 } 3197 }
3197} 3198}
@@ -3282,11 +3283,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3282 3283
3283static int count_rmaps(struct kvm_vcpu *vcpu) 3284static int count_rmaps(struct kvm_vcpu *vcpu)
3284{ 3285{
3286 struct kvm *kvm = vcpu->kvm;
3287 struct kvm_memslots *slots;
3285 int nmaps = 0; 3288 int nmaps = 0;
3286 int i, j, k, idx; 3289 int i, j, k, idx;
3287 3290
3288 idx = srcu_read_lock(&kvm->srcu); 3291 idx = srcu_read_lock(&kvm->srcu);
3289 slots = rcu_dereference(kvm->memslots); 3292 slots = kvm_memslots(kvm);
3290 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3293 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3291 struct kvm_memory_slot *m = &slots->memslots[i]; 3294 struct kvm_memory_slot *m = &slots->memslots[i];
3292 struct kvm_rmap_desc *d; 3295 struct kvm_rmap_desc *d;
@@ -3315,7 +3318,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3315 return nmaps; 3318 return nmaps;
3316} 3319}
3317 3320
3318void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) 3321void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3319{ 3322{
3320 unsigned long *rmapp; 3323 unsigned long *rmapp;
3321 struct kvm_mmu_page *rev_sp; 3324 struct kvm_mmu_page *rev_sp;
@@ -3331,14 +3334,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3331 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3334 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3332 audit_msg, gfn); 3335 audit_msg, gfn);
3333 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3336 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3334 audit_msg, sptep - rev_sp->spt, 3337 audit_msg, (long int)(sptep - rev_sp->spt),
3335 rev_sp->gfn); 3338 rev_sp->gfn);
3336 dump_stack(); 3339 dump_stack();
3337 return; 3340 return;
3338 } 3341 }
3339 3342
3340 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3343 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3341 is_large_pte(*sptep)); 3344 rev_sp->role.level);
3342 if (!*rmapp) { 3345 if (!*rmapp) {
3343 if (!printk_ratelimit()) 3346 if (!printk_ratelimit())
3344 return; 3347 return;
@@ -3373,7 +3376,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3373 continue; 3376 continue;
3374 if (!(ent & PT_WRITABLE_MASK)) 3377 if (!(ent & PT_WRITABLE_MASK))
3375 continue; 3378 continue;
3376 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); 3379 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3377 } 3380 }
3378 } 3381 }
3379 return; 3382 return;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a..42f07b1bfbc 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,14 +6,12 @@
6 6
7#undef TRACE_SYSTEM 7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu 8#define TRACE_SYSTEM kvmmmu
9#define TRACE_INCLUDE_PATH .
10#define TRACE_INCLUDE_FILE mmutrace
11 9
12#define KVM_MMU_PAGE_FIELDS \ 10#define KVM_MMU_PAGE_FIELDS \
13 __field(__u64, gfn) \ 11 __field(__u64, gfn) \
14 __field(__u32, role) \ 12 __field(__u32, role) \
15 __field(__u32, root_count) \ 13 __field(__u32, root_count) \
16 __field(__u32, unsync) 14 __field(bool, unsync)
17 15
18#define KVM_MMU_PAGE_ASSIGN(sp) \ 16#define KVM_MMU_PAGE_ASSIGN(sp) \
19 __entry->gfn = sp->gfn; \ 17 __entry->gfn = sp->gfn; \
@@ -30,14 +28,14 @@
30 \ 28 \
31 role.word = __entry->role; \ 29 role.word = __entry->role; \
32 \ 30 \
33 trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ 31 trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
34 " %snxe root %u %s%c", \ 32 " %snxe root %u %s%c", \
35 __entry->gfn, role.level, role.glevels, \ 33 __entry->gfn, role.level, \
34 role.cr4_pae ? " pae" : "", \
36 role.quadrant, \ 35 role.quadrant, \
37 role.direct ? " direct" : "", \ 36 role.direct ? " direct" : "", \
38 access_str[role.access], \ 37 access_str[role.access], \
39 role.invalid ? " invalid" : "", \ 38 role.invalid ? " invalid" : "", \
40 role.cr4_pge ? "" : "!", \
41 role.nxe ? "" : "!", \ 39 role.nxe ? "" : "!", \
42 __entry->root_count, \ 40 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \ 41 __entry->unsync ? "unsync" : "sync", 0); \
@@ -94,15 +92,15 @@ TRACE_EVENT(
94 TP_printk("pte %llx level %u", __entry->pte, __entry->level) 92 TP_printk("pte %llx level %u", __entry->pte, __entry->level)
95); 93);
96 94
97/* We set a pte accessed bit */ 95DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
98TRACE_EVENT( 96
99 kvm_mmu_set_accessed_bit,
100 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), 97 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
98
101 TP_ARGS(table_gfn, index, size), 99 TP_ARGS(table_gfn, index, size),
102 100
103 TP_STRUCT__entry( 101 TP_STRUCT__entry(
104 __field(__u64, gpa) 102 __field(__u64, gpa)
105 ), 103 ),
106 104
107 TP_fast_assign( 105 TP_fast_assign(
108 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) 106 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -112,22 +110,20 @@ TRACE_EVENT(
112 TP_printk("gpa %llx", __entry->gpa) 110 TP_printk("gpa %llx", __entry->gpa)
113); 111);
114 112
115/* We set a pte dirty bit */ 113/* We set a pte accessed bit */
116TRACE_EVENT( 114DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
117 kvm_mmu_set_dirty_bit, 115
118 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), 116 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
119 TP_ARGS(table_gfn, index, size),
120 117
121 TP_STRUCT__entry( 118 TP_ARGS(table_gfn, index, size)
122 __field(__u64, gpa) 119);
123 ),
124 120
125 TP_fast_assign( 121/* We set a pte dirty bit */
126 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) 122DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
127 + index * size;
128 ),
129 123
130 TP_printk("gpa %llx", __entry->gpa) 124 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
125
126 TP_ARGS(table_gfn, index, size)
131); 127);
132 128
133TRACE_EVENT( 129TRACE_EVENT(
@@ -166,55 +162,45 @@ TRACE_EVENT(
166 __entry->created ? "new" : "existing") 162 __entry->created ? "new" : "existing")
167); 163);
168 164
169TRACE_EVENT( 165DECLARE_EVENT_CLASS(kvm_mmu_page_class,
170 kvm_mmu_sync_page, 166
171 TP_PROTO(struct kvm_mmu_page *sp), 167 TP_PROTO(struct kvm_mmu_page *sp),
172 TP_ARGS(sp), 168 TP_ARGS(sp),
173 169
174 TP_STRUCT__entry( 170 TP_STRUCT__entry(
175 KVM_MMU_PAGE_FIELDS 171 KVM_MMU_PAGE_FIELDS
176 ), 172 ),
177 173
178 TP_fast_assign( 174 TP_fast_assign(
179 KVM_MMU_PAGE_ASSIGN(sp) 175 KVM_MMU_PAGE_ASSIGN(sp)
180 ), 176 ),
181 177
182 TP_printk("%s", KVM_MMU_PAGE_PRINTK()) 178 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
183); 179);
184 180
185TRACE_EVENT( 181DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
186 kvm_mmu_unsync_page,
187 TP_PROTO(struct kvm_mmu_page *sp), 182 TP_PROTO(struct kvm_mmu_page *sp),
188 TP_ARGS(sp),
189
190 TP_STRUCT__entry(
191 KVM_MMU_PAGE_FIELDS
192 ),
193 183
194 TP_fast_assign( 184 TP_ARGS(sp)
195 KVM_MMU_PAGE_ASSIGN(sp)
196 ),
197
198 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
199); 185);
200 186
201TRACE_EVENT( 187DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
202 kvm_mmu_zap_page,
203 TP_PROTO(struct kvm_mmu_page *sp), 188 TP_PROTO(struct kvm_mmu_page *sp),
204 TP_ARGS(sp),
205 189
206 TP_STRUCT__entry( 190 TP_ARGS(sp)
207 KVM_MMU_PAGE_FIELDS 191);
208 ),
209 192
210 TP_fast_assign( 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
211 KVM_MMU_PAGE_ASSIGN(sp) 194 TP_PROTO(struct kvm_mmu_page *sp),
212 ),
213 195
214 TP_printk("%s", KVM_MMU_PAGE_PRINTK()) 196 TP_ARGS(sp)
215); 197);
216
217#endif /* _TRACE_KVMMMU_H */ 198#endif /* _TRACE_KVMMMU_H */
218 199
200#undef TRACE_INCLUDE_PATH
201#define TRACE_INCLUDE_PATH .
202#undef TRACE_INCLUDE_FILE
203#define TRACE_INCLUDE_FILE mmutrace
204
219/* This part must be outside protection */ 205/* This part must be outside protection */
220#include <trace/define_trace.h> 206#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6..89d66ca4d87 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
170 goto access_error; 170 goto access_error;
171 171
172#if PTTYPE == 64 172#if PTTYPE == 64
173 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) 173 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 174 goto access_error;
175#endif 175#endif
176 176
@@ -190,10 +190,10 @@ walk:
190 190
191 if ((walker->level == PT_PAGE_TABLE_LEVEL) || 191 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
192 ((walker->level == PT_DIRECTORY_LEVEL) && 192 ((walker->level == PT_DIRECTORY_LEVEL) &&
193 (pte & PT_PAGE_SIZE_MASK) && 193 is_large_pte(pte) &&
194 (PTTYPE == 64 || is_pse(vcpu))) || 194 (PTTYPE == 64 || is_pse(vcpu))) ||
195 ((walker->level == PT_PDPE_LEVEL) && 195 ((walker->level == PT_PDPE_LEVEL) &&
196 (pte & PT_PAGE_SIZE_MASK) && 196 is_large_pte(pte) &&
197 is_long_mode(vcpu))) { 197 is_long_mode(vcpu))) {
198 int lvl = walker->level; 198 int lvl = walker->level;
199 199
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
258 pt_element_t gpte; 258 pt_element_t gpte;
259 unsigned pte_access; 259 unsigned pte_access;
260 pfn_t pfn; 260 pfn_t pfn;
261 u64 new_spte;
261 262
262 gpte = *(const pt_element_t *)pte; 263 gpte = *(const pt_element_t *)pte;
263 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
264 if (!is_present_gpte(gpte)) 265 if (!is_present_gpte(gpte)) {
265 __set_spte(spte, shadow_notrap_nonpresent_pte); 266 if (page->unsync)
267 new_spte = shadow_trap_nonpresent_pte;
268 else
269 new_spte = shadow_notrap_nonpresent_pte;
270 __set_spte(spte, new_spte);
271 }
266 return; 272 return;
267 } 273 }
268 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 463static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
458{ 464{
459 struct kvm_shadow_walk_iterator iterator; 465 struct kvm_shadow_walk_iterator iterator;
466 gpa_t pte_gpa = -1;
460 int level; 467 int level;
461 u64 *sptep; 468 u64 *sptep;
462 int need_flush = 0; 469 int need_flush = 0;
@@ -467,9 +474,16 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 474 level = iterator.level;
468 sptep = iterator.sptep; 475 sptep = iterator.sptep;
469 476
470 if (level == PT_PAGE_TABLE_LEVEL || 477 if (is_last_spte(*sptep, level)) {
471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 479 int offset, shift;
480
481 shift = PAGE_SHIFT -
482 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
483 offset = sp->role.quadrant << shift;
484
485 pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
486 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
473 487
474 if (is_shadow_present_pte(*sptep)) { 488 if (is_shadow_present_pte(*sptep)) {
475 rmap_remove(vcpu->kvm, sptep); 489 rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +501,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 501
488 if (need_flush) 502 if (need_flush)
489 kvm_flush_remote_tlbs(vcpu->kvm); 503 kvm_flush_remote_tlbs(vcpu->kvm);
504
505 atomic_inc(&vcpu->kvm->arch.invlpg_counter);
506
490 spin_unlock(&vcpu->kvm->mmu_lock); 507 spin_unlock(&vcpu->kvm->mmu_lock);
508
509 if (pte_gpa == -1)
510 return;
511
512 if (mmu_topup_memory_caches(vcpu))
513 return;
514 kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
491} 515}
492 516
493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 517static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -551,12 +575,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
551{ 575{
552 int i, offset, nr_present; 576 int i, offset, nr_present;
553 bool reset_host_protection; 577 bool reset_host_protection;
578 gpa_t first_pte_gpa;
554 579
555 offset = nr_present = 0; 580 offset = nr_present = 0;
556 581
557 if (PTTYPE == 32) 582 if (PTTYPE == 32)
558 offset = sp->role.quadrant << PT64_LEVEL_BITS; 583 offset = sp->role.quadrant << PT64_LEVEL_BITS;
559 584
585 first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
586
560 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 587 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
561 unsigned pte_access; 588 unsigned pte_access;
562 pt_element_t gpte; 589 pt_element_t gpte;
@@ -566,8 +593,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
566 if (!is_shadow_present_pte(sp->spt[i])) 593 if (!is_shadow_present_pte(sp->spt[i]))
567 continue; 594 continue;
568 595
569 pte_gpa = gfn_to_gpa(sp->gfn); 596 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
570 pte_gpa += (i+offset) * sizeof(pt_element_t);
571 597
572 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 598 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
573 sizeof(pt_element_t))) 599 sizeof(pt_element_t)))
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ba58206812..96dc232bfc5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,10 +44,11 @@ MODULE_LICENSE("GPL");
44#define SEG_TYPE_LDT 2 44#define SEG_TYPE_LDT 2
45#define SEG_TYPE_BUSY_TSS16 3 45#define SEG_TYPE_BUSY_TSS16 3
46 46
47#define SVM_FEATURE_NPT (1 << 0) 47#define SVM_FEATURE_NPT (1 << 0)
48#define SVM_FEATURE_LBRV (1 << 1) 48#define SVM_FEATURE_LBRV (1 << 1)
49#define SVM_FEATURE_SVML (1 << 2) 49#define SVM_FEATURE_SVML (1 << 2)
50#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 50#define SVM_FEATURE_NRIP (1 << 3)
51#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
51 52
52#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 53#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
53#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 54#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -70,6 +71,7 @@ struct kvm_vcpu;
70struct nested_state { 71struct nested_state {
71 struct vmcb *hsave; 72 struct vmcb *hsave;
72 u64 hsave_msr; 73 u64 hsave_msr;
74 u64 vm_cr_msr;
73 u64 vmcb; 75 u64 vmcb;
74 76
75 /* These are the merged vectors */ 77 /* These are the merged vectors */
@@ -77,6 +79,7 @@ struct nested_state {
77 79
78 /* gpa pointers to the real vectors */ 80 /* gpa pointers to the real vectors */
79 u64 vmcb_msrpm; 81 u64 vmcb_msrpm;
82 u64 vmcb_iopm;
80 83
81 /* A VMEXIT is required but not yet emulated */ 84 /* A VMEXIT is required but not yet emulated */
82 bool exit_required; 85 bool exit_required;
@@ -91,6 +94,9 @@ struct nested_state {
91 94
92}; 95};
93 96
97#define MSRPM_OFFSETS 16
98static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
99
94struct vcpu_svm { 100struct vcpu_svm {
95 struct kvm_vcpu vcpu; 101 struct kvm_vcpu vcpu;
96 struct vmcb *vmcb; 102 struct vmcb *vmcb;
@@ -110,13 +116,39 @@ struct vcpu_svm {
110 struct nested_state nested; 116 struct nested_state nested;
111 117
112 bool nmi_singlestep; 118 bool nmi_singlestep;
119
120 unsigned int3_injected;
121 unsigned long int3_rip;
122};
123
124#define MSR_INVALID 0xffffffffU
125
126static struct svm_direct_access_msrs {
127 u32 index; /* Index of the MSR */
128 bool always; /* True if intercept is always on */
129} direct_access_msrs[] = {
130 { .index = MSR_K6_STAR, .always = true },
131 { .index = MSR_IA32_SYSENTER_CS, .always = true },
132#ifdef CONFIG_X86_64
133 { .index = MSR_GS_BASE, .always = true },
134 { .index = MSR_FS_BASE, .always = true },
135 { .index = MSR_KERNEL_GS_BASE, .always = true },
136 { .index = MSR_LSTAR, .always = true },
137 { .index = MSR_CSTAR, .always = true },
138 { .index = MSR_SYSCALL_MASK, .always = true },
139#endif
140 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
141 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
142 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
143 { .index = MSR_IA32_LASTINTTOIP, .always = false },
144 { .index = MSR_INVALID, .always = false },
113}; 145};
114 146
115/* enable NPT for AMD64 and X86 with PAE */ 147/* enable NPT for AMD64 and X86 with PAE */
116#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 148#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
117static bool npt_enabled = true; 149static bool npt_enabled = true;
118#else 150#else
119static bool npt_enabled = false; 151static bool npt_enabled;
120#endif 152#endif
121static int npt = 1; 153static int npt = 1;
122 154
@@ -129,6 +161,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
129static void svm_complete_interrupts(struct vcpu_svm *svm); 161static void svm_complete_interrupts(struct vcpu_svm *svm);
130 162
131static int nested_svm_exit_handled(struct vcpu_svm *svm); 163static int nested_svm_exit_handled(struct vcpu_svm *svm);
164static int nested_svm_intercept(struct vcpu_svm *svm);
132static int nested_svm_vmexit(struct vcpu_svm *svm); 165static int nested_svm_vmexit(struct vcpu_svm *svm);
133static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 166static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
134 bool has_error_code, u32 error_code); 167 bool has_error_code, u32 error_code);
@@ -163,8 +196,8 @@ static unsigned long iopm_base;
163struct kvm_ldttss_desc { 196struct kvm_ldttss_desc {
164 u16 limit0; 197 u16 limit0;
165 u16 base0; 198 u16 base0;
166 unsigned base1 : 8, type : 5, dpl : 2, p : 1; 199 unsigned base1:8, type:5, dpl:2, p:1;
167 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; 200 unsigned limit1:4, zero0:3, g:1, base2:8;
168 u32 base3; 201 u32 base3;
169 u32 zero1; 202 u32 zero1;
170} __attribute__((packed)); 203} __attribute__((packed));
@@ -194,6 +227,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
194#define MSRS_RANGE_SIZE 2048 227#define MSRS_RANGE_SIZE 2048
195#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) 228#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
196 229
230static u32 svm_msrpm_offset(u32 msr)
231{
232 u32 offset;
233 int i;
234
235 for (i = 0; i < NUM_MSR_MAPS; i++) {
236 if (msr < msrpm_ranges[i] ||
237 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
238 continue;
239
240 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
241 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
242
243 /* Now we have the u8 offset - but need the u32 offset */
244 return offset / 4;
245 }
246
247 /* MSR not in any range */
248 return MSR_INVALID;
249}
250
197#define MAX_INST_SIZE 15 251#define MAX_INST_SIZE 15
198 252
199static inline u32 svm_has(u32 feat) 253static inline u32 svm_has(u32 feat)
@@ -213,7 +267,7 @@ static inline void stgi(void)
213 267
214static inline void invlpga(unsigned long addr, u32 asid) 268static inline void invlpga(unsigned long addr, u32 asid)
215{ 269{
216 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); 270 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
217} 271}
218 272
219static inline void force_new_asid(struct kvm_vcpu *vcpu) 273static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +289,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
235 vcpu->arch.efer = efer; 289 vcpu->arch.efer = efer;
236} 290}
237 291
238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
239 bool has_error_code, u32 error_code)
240{
241 struct vcpu_svm *svm = to_svm(vcpu);
242
243 /* If we are within a nested VM we'd better #VMEXIT and let the
244 guest handle the exception */
245 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
246 return;
247
248 svm->vmcb->control.event_inj = nr
249 | SVM_EVTINJ_VALID
250 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
251 | SVM_EVTINJ_TYPE_EXEPT;
252 svm->vmcb->control.event_inj_err = error_code;
253}
254
255static int is_external_interrupt(u32 info) 292static int is_external_interrupt(u32 info)
256{ 293{
257 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 294 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +301,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
264 u32 ret = 0; 301 u32 ret = 0;
265 302
266 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 303 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
267 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; 304 ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
268 return ret & mask; 305 return ret & mask;
269} 306}
270 307
@@ -283,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
283{ 320{
284 struct vcpu_svm *svm = to_svm(vcpu); 321 struct vcpu_svm *svm = to_svm(vcpu);
285 322
323 if (svm->vmcb->control.next_rip != 0)
324 svm->next_rip = svm->vmcb->control.next_rip;
325
286 if (!svm->next_rip) { 326 if (!svm->next_rip) {
287 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 327 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
288 EMULATE_DONE) 328 EMULATE_DONE)
@@ -297,6 +337,43 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
297 svm_set_interrupt_shadow(vcpu, 0); 337 svm_set_interrupt_shadow(vcpu, 0);
298} 338}
299 339
340static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
341 bool has_error_code, u32 error_code,
342 bool reinject)
343{
344 struct vcpu_svm *svm = to_svm(vcpu);
345
346 /*
347 * If we are within a nested VM we'd better #VMEXIT and let the guest
348 * handle the exception
349 */
350 if (!reinject &&
351 nested_svm_check_exception(svm, nr, has_error_code, error_code))
352 return;
353
354 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
355 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
356
357 /*
358 * For guest debugging where we have to reinject #BP if some
359 * INT3 is guest-owned:
360 * Emulate nRIP by moving RIP forward. Will fail if injection
361 * raises a fault that is not intercepted. Still better than
362 * failing in all cases.
363 */
364 skip_emulated_instruction(&svm->vcpu);
365 rip = kvm_rip_read(&svm->vcpu);
366 svm->int3_rip = rip + svm->vmcb->save.cs.base;
367 svm->int3_injected = rip - old_rip;
368 }
369
370 svm->vmcb->control.event_inj = nr
371 | SVM_EVTINJ_VALID
372 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
373 | SVM_EVTINJ_TYPE_EXEPT;
374 svm->vmcb->control.event_inj_err = error_code;
375}
376
300static int has_svm(void) 377static int has_svm(void)
301{ 378{
302 const char *msg; 379 const char *msg;
@@ -319,7 +396,7 @@ static int svm_hardware_enable(void *garbage)
319 396
320 struct svm_cpu_data *sd; 397 struct svm_cpu_data *sd;
321 uint64_t efer; 398 uint64_t efer;
322 struct descriptor_table gdt_descr; 399 struct desc_ptr gdt_descr;
323 struct desc_struct *gdt; 400 struct desc_struct *gdt;
324 int me = raw_smp_processor_id(); 401 int me = raw_smp_processor_id();
325 402
@@ -344,8 +421,8 @@ static int svm_hardware_enable(void *garbage)
344 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 421 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
345 sd->next_asid = sd->max_asid + 1; 422 sd->next_asid = sd->max_asid + 1;
346 423
347 kvm_get_gdt(&gdt_descr); 424 native_store_gdt(&gdt_descr);
348 gdt = (struct desc_struct *)gdt_descr.base; 425 gdt = (struct desc_struct *)gdt_descr.address;
349 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 426 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
350 427
351 wrmsrl(MSR_EFER, efer | EFER_SVME); 428 wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -391,42 +468,98 @@ err_1:
391 468
392} 469}
393 470
471static bool valid_msr_intercept(u32 index)
472{
473 int i;
474
475 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
476 if (direct_access_msrs[i].index == index)
477 return true;
478
479 return false;
480}
481
394static void set_msr_interception(u32 *msrpm, unsigned msr, 482static void set_msr_interception(u32 *msrpm, unsigned msr,
395 int read, int write) 483 int read, int write)
396{ 484{
485 u8 bit_read, bit_write;
486 unsigned long tmp;
487 u32 offset;
488
489 /*
490 * If this warning triggers extend the direct_access_msrs list at the
491 * beginning of the file
492 */
493 WARN_ON(!valid_msr_intercept(msr));
494
495 offset = svm_msrpm_offset(msr);
496 bit_read = 2 * (msr & 0x0f);
497 bit_write = 2 * (msr & 0x0f) + 1;
498 tmp = msrpm[offset];
499
500 BUG_ON(offset == MSR_INVALID);
501
502 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
503 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
504
505 msrpm[offset] = tmp;
506}
507
508static void svm_vcpu_init_msrpm(u32 *msrpm)
509{
397 int i; 510 int i;
398 511
399 for (i = 0; i < NUM_MSR_MAPS; i++) { 512 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
400 if (msr >= msrpm_ranges[i] && 513
401 msr < msrpm_ranges[i] + MSRS_IN_RANGE) { 514 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
402 u32 msr_offset = (i * MSRS_IN_RANGE + msr - 515 if (!direct_access_msrs[i].always)
403 msrpm_ranges[i]) * 2; 516 continue;
404 517
405 u32 *base = msrpm + (msr_offset / 32); 518 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
406 u32 msr_shift = msr_offset % 32; 519 }
407 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); 520}
408 *base = (*base & ~(0x3 << msr_shift)) | 521
409 (mask << msr_shift); 522static void add_msr_offset(u32 offset)
523{
524 int i;
525
526 for (i = 0; i < MSRPM_OFFSETS; ++i) {
527
528 /* Offset already in list? */
529 if (msrpm_offsets[i] == offset)
410 return; 530 return;
411 } 531
532 /* Slot used by another offset? */
533 if (msrpm_offsets[i] != MSR_INVALID)
534 continue;
535
536 /* Add offset to list */
537 msrpm_offsets[i] = offset;
538
539 return;
412 } 540 }
541
542 /*
543 * If this BUG triggers the msrpm_offsets table has an overflow. Just
544 * increase MSRPM_OFFSETS in this case.
545 */
413 BUG(); 546 BUG();
414} 547}
415 548
416static void svm_vcpu_init_msrpm(u32 *msrpm) 549static void init_msrpm_offsets(void)
417{ 550{
418 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); 551 int i;
419 552
420#ifdef CONFIG_X86_64 553 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
421 set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); 554
422 set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); 555 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
423 set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); 556 u32 offset;
424 set_msr_interception(msrpm, MSR_LSTAR, 1, 1); 557
425 set_msr_interception(msrpm, MSR_CSTAR, 1, 1); 558 offset = svm_msrpm_offset(direct_access_msrs[i].index);
426 set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); 559 BUG_ON(offset == MSR_INVALID);
427#endif 560
428 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); 561 add_msr_offset(offset);
429 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); 562 }
430} 563}
431 564
432static void svm_enable_lbrv(struct vcpu_svm *svm) 565static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -467,6 +600,8 @@ static __init int svm_hardware_setup(void)
467 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); 600 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
468 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 601 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
469 602
603 init_msrpm_offsets();
604
470 if (boot_cpu_has(X86_FEATURE_NX)) 605 if (boot_cpu_has(X86_FEATURE_NX))
471 kvm_enable_efer_bits(EFER_NX); 606 kvm_enable_efer_bits(EFER_NX);
472 607
@@ -523,7 +658,7 @@ static void init_seg(struct vmcb_seg *seg)
523{ 658{
524 seg->selector = 0; 659 seg->selector = 0;
525 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 660 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
526 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 661 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
527 seg->limit = 0xffff; 662 seg->limit = 0xffff;
528 seg->base = 0; 663 seg->base = 0;
529} 664}
@@ -543,16 +678,16 @@ static void init_vmcb(struct vcpu_svm *svm)
543 678
544 svm->vcpu.fpu_active = 1; 679 svm->vcpu.fpu_active = 1;
545 680
546 control->intercept_cr_read = INTERCEPT_CR0_MASK | 681 control->intercept_cr_read = INTERCEPT_CR0_MASK |
547 INTERCEPT_CR3_MASK | 682 INTERCEPT_CR3_MASK |
548 INTERCEPT_CR4_MASK; 683 INTERCEPT_CR4_MASK;
549 684
550 control->intercept_cr_write = INTERCEPT_CR0_MASK | 685 control->intercept_cr_write = INTERCEPT_CR0_MASK |
551 INTERCEPT_CR3_MASK | 686 INTERCEPT_CR3_MASK |
552 INTERCEPT_CR4_MASK | 687 INTERCEPT_CR4_MASK |
553 INTERCEPT_CR8_MASK; 688 INTERCEPT_CR8_MASK;
554 689
555 control->intercept_dr_read = INTERCEPT_DR0_MASK | 690 control->intercept_dr_read = INTERCEPT_DR0_MASK |
556 INTERCEPT_DR1_MASK | 691 INTERCEPT_DR1_MASK |
557 INTERCEPT_DR2_MASK | 692 INTERCEPT_DR2_MASK |
558 INTERCEPT_DR3_MASK | 693 INTERCEPT_DR3_MASK |
@@ -561,7 +696,7 @@ static void init_vmcb(struct vcpu_svm *svm)
561 INTERCEPT_DR6_MASK | 696 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 697 INTERCEPT_DR7_MASK;
563 698
564 control->intercept_dr_write = INTERCEPT_DR0_MASK | 699 control->intercept_dr_write = INTERCEPT_DR0_MASK |
565 INTERCEPT_DR1_MASK | 700 INTERCEPT_DR1_MASK |
566 INTERCEPT_DR2_MASK | 701 INTERCEPT_DR2_MASK |
567 INTERCEPT_DR3_MASK | 702 INTERCEPT_DR3_MASK |
@@ -575,7 +710,7 @@ static void init_vmcb(struct vcpu_svm *svm)
575 (1 << MC_VECTOR); 710 (1 << MC_VECTOR);
576 711
577 712
578 control->intercept = (1ULL << INTERCEPT_INTR) | 713 control->intercept = (1ULL << INTERCEPT_INTR) |
579 (1ULL << INTERCEPT_NMI) | 714 (1ULL << INTERCEPT_NMI) |
580 (1ULL << INTERCEPT_SMI) | 715 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) | 716 (1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +771,8 @@ static void init_vmcb(struct vcpu_svm *svm)
636 save->rip = 0x0000fff0; 771 save->rip = 0x0000fff0;
637 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 772 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
638 773
639 /* This is the guest-visible cr0 value. 774 /*
775 * This is the guest-visible cr0 value.
640 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 776 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
641 */ 777 */
642 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 778 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -729,6 +865,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
729 svm_vcpu_init_msrpm(svm->msrpm); 865 svm_vcpu_init_msrpm(svm->msrpm);
730 866
731 svm->nested.msrpm = page_address(nested_msrpm_pages); 867 svm->nested.msrpm = page_address(nested_msrpm_pages);
868 svm_vcpu_init_msrpm(svm->nested.msrpm);
732 869
733 svm->vmcb = page_address(page); 870 svm->vmcb = page_address(page);
734 clear_page(svm->vmcb); 871 clear_page(svm->vmcb);
@@ -882,7 +1019,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
882 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1019 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
883 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 1020 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
884 1021
885 /* AMD's VMCB does not have an explicit unusable field, so emulate it 1022 /*
1023 * AMD's VMCB does not have an explicit unusable field, so emulate it
886 * for cross vendor migration purposes by "not present" 1024 * for cross vendor migration purposes by "not present"
887 */ 1025 */
888 var->unusable = !var->present || (var->type == 0); 1026 var->unusable = !var->present || (var->type == 0);
@@ -918,7 +1056,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
918 var->type |= 0x1; 1056 var->type |= 0x1;
919 break; 1057 break;
920 case VCPU_SREG_SS: 1058 case VCPU_SREG_SS:
921 /* On AMD CPUs sometimes the DB bit in the segment 1059 /*
1060 * On AMD CPUs sometimes the DB bit in the segment
922 * descriptor is left as 1, although the whole segment has 1061 * descriptor is left as 1, although the whole segment has
923 * been made unusable. Clear it here to pass an Intel VMX 1062 * been made unusable. Clear it here to pass an Intel VMX
924 * entry check when cross vendor migrating. 1063 * entry check when cross vendor migrating.
@@ -936,36 +1075,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
936 return save->cpl; 1075 return save->cpl;
937} 1076}
938 1077
939static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1078static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
940{ 1079{
941 struct vcpu_svm *svm = to_svm(vcpu); 1080 struct vcpu_svm *svm = to_svm(vcpu);
942 1081
943 dt->limit = svm->vmcb->save.idtr.limit; 1082 dt->size = svm->vmcb->save.idtr.limit;
944 dt->base = svm->vmcb->save.idtr.base; 1083 dt->address = svm->vmcb->save.idtr.base;
945} 1084}
946 1085
947static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1086static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
948{ 1087{
949 struct vcpu_svm *svm = to_svm(vcpu); 1088 struct vcpu_svm *svm = to_svm(vcpu);
950 1089
951 svm->vmcb->save.idtr.limit = dt->limit; 1090 svm->vmcb->save.idtr.limit = dt->size;
952 svm->vmcb->save.idtr.base = dt->base ; 1091 svm->vmcb->save.idtr.base = dt->address ;
953} 1092}
954 1093
955static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1094static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
956{ 1095{
957 struct vcpu_svm *svm = to_svm(vcpu); 1096 struct vcpu_svm *svm = to_svm(vcpu);
958 1097
959 dt->limit = svm->vmcb->save.gdtr.limit; 1098 dt->size = svm->vmcb->save.gdtr.limit;
960 dt->base = svm->vmcb->save.gdtr.base; 1099 dt->address = svm->vmcb->save.gdtr.base;
961} 1100}
962 1101
963static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1102static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
964{ 1103{
965 struct vcpu_svm *svm = to_svm(vcpu); 1104 struct vcpu_svm *svm = to_svm(vcpu);
966 1105
967 svm->vmcb->save.gdtr.limit = dt->limit; 1106 svm->vmcb->save.gdtr.limit = dt->size;
968 svm->vmcb->save.gdtr.base = dt->base ; 1107 svm->vmcb->save.gdtr.base = dt->address ;
969} 1108}
970 1109
971static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1110static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -978,6 +1117,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
978 1117
979static void update_cr0_intercept(struct vcpu_svm *svm) 1118static void update_cr0_intercept(struct vcpu_svm *svm)
980{ 1119{
1120 struct vmcb *vmcb = svm->vmcb;
981 ulong gcr0 = svm->vcpu.arch.cr0; 1121 ulong gcr0 = svm->vcpu.arch.cr0;
982 u64 *hcr0 = &svm->vmcb->save.cr0; 1122 u64 *hcr0 = &svm->vmcb->save.cr0;
983 1123
@@ -989,11 +1129,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
989 1129
990 1130
991 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1131 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
992 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1132 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
993 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1133 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1134 if (is_nested(svm)) {
1135 struct vmcb *hsave = svm->nested.hsave;
1136
1137 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1138 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1139 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1140 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1141 }
994 } else { 1142 } else {
995 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1143 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
996 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1144 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1145 if (is_nested(svm)) {
1146 struct vmcb *hsave = svm->nested.hsave;
1147
1148 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1149 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1150 }
997 } 1151 }
998} 1152}
999 1153
@@ -1001,6 +1155,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1001{ 1155{
1002 struct vcpu_svm *svm = to_svm(vcpu); 1156 struct vcpu_svm *svm = to_svm(vcpu);
1003 1157
1158 if (is_nested(svm)) {
1159 /*
1160 * We are here because we run in nested mode, the host kvm
1161 * intercepts cr0 writes but the l1 hypervisor does not.
1162 * But the L1 hypervisor may intercept selective cr0 writes.
1163 * This needs to be checked here.
1164 */
1165 unsigned long old, new;
1166
1167 /* Remove bits that would trigger a real cr0 write intercept */
1168 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1169 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1170
1171 if (old == new) {
1172 /* cr0 write with ts and mp unchanged */
1173 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1174 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
1175 return;
1176 }
1177 }
1178
1004#ifdef CONFIG_X86_64 1179#ifdef CONFIG_X86_64
1005 if (vcpu->arch.efer & EFER_LME) { 1180 if (vcpu->arch.efer & EFER_LME) {
1006 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1181 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1134,70 +1309,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1134 svm->vmcb->control.asid = sd->next_asid++; 1309 svm->vmcb->control.asid = sd->next_asid++;
1135} 1310}
1136 1311
1137static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) 1312static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1138{ 1313{
1139 struct vcpu_svm *svm = to_svm(vcpu); 1314 struct vcpu_svm *svm = to_svm(vcpu);
1140 1315
1141 switch (dr) { 1316 svm->vmcb->save.dr7 = value;
1142 case 0 ... 3:
1143 *dest = vcpu->arch.db[dr];
1144 break;
1145 case 4:
1146 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1147 return EMULATE_FAIL; /* will re-inject UD */
1148 /* fall through */
1149 case 6:
1150 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1151 *dest = vcpu->arch.dr6;
1152 else
1153 *dest = svm->vmcb->save.dr6;
1154 break;
1155 case 5:
1156 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1157 return EMULATE_FAIL; /* will re-inject UD */
1158 /* fall through */
1159 case 7:
1160 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1161 *dest = vcpu->arch.dr7;
1162 else
1163 *dest = svm->vmcb->save.dr7;
1164 break;
1165 }
1166
1167 return EMULATE_DONE;
1168}
1169
1170static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1171{
1172 struct vcpu_svm *svm = to_svm(vcpu);
1173
1174 switch (dr) {
1175 case 0 ... 3:
1176 vcpu->arch.db[dr] = value;
1177 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1178 vcpu->arch.eff_db[dr] = value;
1179 break;
1180 case 4:
1181 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1182 return EMULATE_FAIL; /* will re-inject UD */
1183 /* fall through */
1184 case 6:
1185 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1186 break;
1187 case 5:
1188 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1189 return EMULATE_FAIL; /* will re-inject UD */
1190 /* fall through */
1191 case 7:
1192 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1193 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1194 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1195 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1196 }
1197 break;
1198 }
1199
1200 return EMULATE_DONE;
1201} 1317}
1202 1318
1203static int pf_interception(struct vcpu_svm *svm) 1319static int pf_interception(struct vcpu_svm *svm)
@@ -1234,7 +1350,7 @@ static int db_interception(struct vcpu_svm *svm)
1234 } 1350 }
1235 1351
1236 if (svm->vcpu.guest_debug & 1352 if (svm->vcpu.guest_debug &
1237 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ 1353 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1238 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1354 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1239 kvm_run->debug.arch.pc = 1355 kvm_run->debug.arch.pc =
1240 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1356 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1268,7 +1384,22 @@ static int ud_interception(struct vcpu_svm *svm)
1268static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1384static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1269{ 1385{
1270 struct vcpu_svm *svm = to_svm(vcpu); 1386 struct vcpu_svm *svm = to_svm(vcpu);
1271 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1387 u32 excp;
1388
1389 if (is_nested(svm)) {
1390 u32 h_excp, n_excp;
1391
1392 h_excp = svm->nested.hsave->control.intercept_exceptions;
1393 n_excp = svm->nested.intercept_exceptions;
1394 h_excp &= ~(1 << NM_VECTOR);
1395 excp = h_excp | n_excp;
1396 } else {
1397 excp = svm->vmcb->control.intercept_exceptions;
1398 excp &= ~(1 << NM_VECTOR);
1399 }
1400
1401 svm->vmcb->control.intercept_exceptions = excp;
1402
1272 svm->vcpu.fpu_active = 1; 1403 svm->vcpu.fpu_active = 1;
1273 update_cr0_intercept(svm); 1404 update_cr0_intercept(svm);
1274} 1405}
@@ -1309,29 +1440,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
1309 1440
1310static int io_interception(struct vcpu_svm *svm) 1441static int io_interception(struct vcpu_svm *svm)
1311{ 1442{
1443 struct kvm_vcpu *vcpu = &svm->vcpu;
1312 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1444 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1313 int size, in, string; 1445 int size, in, string;
1314 unsigned port; 1446 unsigned port;
1315 1447
1316 ++svm->vcpu.stat.io_exits; 1448 ++svm->vcpu.stat.io_exits;
1317
1318 svm->next_rip = svm->vmcb->control.exit_info_2;
1319
1320 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1449 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1321
1322 if (string) {
1323 if (emulate_instruction(&svm->vcpu,
1324 0, 0, 0) == EMULATE_DO_MMIO)
1325 return 0;
1326 return 1;
1327 }
1328
1329 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1450 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1451 if (string || in)
1452 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
1453
1330 port = io_info >> 16; 1454 port = io_info >> 16;
1331 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1455 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1332 1456 svm->next_rip = svm->vmcb->control.exit_info_2;
1333 skip_emulated_instruction(&svm->vcpu); 1457 skip_emulated_instruction(&svm->vcpu);
1334 return kvm_emulate_pio(&svm->vcpu, in, size, port); 1458
1459 return kvm_fast_pio_out(vcpu, size, port);
1335} 1460}
1336 1461
1337static int nmi_interception(struct vcpu_svm *svm) 1462static int nmi_interception(struct vcpu_svm *svm)
@@ -1384,6 +1509,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
1384static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 1509static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1385 bool has_error_code, u32 error_code) 1510 bool has_error_code, u32 error_code)
1386{ 1511{
1512 int vmexit;
1513
1387 if (!is_nested(svm)) 1514 if (!is_nested(svm))
1388 return 0; 1515 return 0;
1389 1516
@@ -1392,21 +1519,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1392 svm->vmcb->control.exit_info_1 = error_code; 1519 svm->vmcb->control.exit_info_1 = error_code;
1393 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 1520 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1394 1521
1395 return nested_svm_exit_handled(svm); 1522 vmexit = nested_svm_intercept(svm);
1523 if (vmexit == NESTED_EXIT_DONE)
1524 svm->nested.exit_required = true;
1525
1526 return vmexit;
1396} 1527}
1397 1528
1398static inline int nested_svm_intr(struct vcpu_svm *svm) 1529/* This function returns true if it is save to enable the irq window */
1530static inline bool nested_svm_intr(struct vcpu_svm *svm)
1399{ 1531{
1400 if (!is_nested(svm)) 1532 if (!is_nested(svm))
1401 return 0; 1533 return true;
1402 1534
1403 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1535 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1404 return 0; 1536 return true;
1405 1537
1406 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1538 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1407 return 0; 1539 return false;
1408 1540
1409 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1541 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1542 svm->vmcb->control.exit_info_1 = 0;
1543 svm->vmcb->control.exit_info_2 = 0;
1410 1544
1411 if (svm->nested.intercept & 1ULL) { 1545 if (svm->nested.intercept & 1ULL) {
1412 /* 1546 /*
@@ -1417,21 +1551,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1417 */ 1551 */
1418 svm->nested.exit_required = true; 1552 svm->nested.exit_required = true;
1419 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); 1553 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1420 return 1; 1554 return false;
1421 } 1555 }
1422 1556
1423 return 0; 1557 return true;
1558}
1559
1560/* This function returns true if it is save to enable the nmi window */
1561static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1562{
1563 if (!is_nested(svm))
1564 return true;
1565
1566 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
1567 return true;
1568
1569 svm->vmcb->control.exit_code = SVM_EXIT_NMI;
1570 svm->nested.exit_required = true;
1571
1572 return false;
1424} 1573}
1425 1574
1426static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) 1575static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
1427{ 1576{
1428 struct page *page; 1577 struct page *page;
1429 1578
1579 might_sleep();
1580
1430 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1581 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1431 if (is_error_page(page)) 1582 if (is_error_page(page))
1432 goto error; 1583 goto error;
1433 1584
1434 return kmap_atomic(page, idx); 1585 *_page = page;
1586
1587 return kmap(page);
1435 1588
1436error: 1589error:
1437 kvm_release_page_clean(page); 1590 kvm_release_page_clean(page);
@@ -1440,61 +1593,55 @@ error:
1440 return NULL; 1593 return NULL;
1441} 1594}
1442 1595
1443static void nested_svm_unmap(void *addr, enum km_type idx) 1596static void nested_svm_unmap(struct page *page)
1444{ 1597{
1445 struct page *page; 1598 kunmap(page);
1599 kvm_release_page_dirty(page);
1600}
1446 1601
1447 if (!addr) 1602static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1448 return; 1603{
1604 unsigned port;
1605 u8 val, bit;
1606 u64 gpa;
1449 1607
1450 page = kmap_atomic_to_page(addr); 1608 if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
1609 return NESTED_EXIT_HOST;
1451 1610
1452 kunmap_atomic(addr, idx); 1611 port = svm->vmcb->control.exit_info_1 >> 16;
1453 kvm_release_page_dirty(page); 1612 gpa = svm->nested.vmcb_iopm + (port / 8);
1613 bit = port % 8;
1614 val = 0;
1615
1616 if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
1617 val &= (1 << bit);
1618
1619 return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1454} 1620}
1455 1621
1456static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) 1622static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1457{ 1623{
1458 u32 param = svm->vmcb->control.exit_info_1 & 1; 1624 u32 offset, msr, value;
1459 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1625 int write, mask;
1460 bool ret = false;
1461 u32 t0, t1;
1462 u8 *msrpm;
1463 1626
1464 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 1627 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1465 return false; 1628 return NESTED_EXIT_HOST;
1466 1629
1467 msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1630 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1631 offset = svm_msrpm_offset(msr);
1632 write = svm->vmcb->control.exit_info_1 & 1;
1633 mask = 1 << ((2 * (msr & 0xf)) + write);
1468 1634
1469 if (!msrpm) 1635 if (offset == MSR_INVALID)
1470 goto out; 1636 return NESTED_EXIT_DONE;
1471 1637
1472 switch (msr) { 1638 /* Offset is in 32 bit units but need in 8 bit units */
1473 case 0 ... 0x1fff: 1639 offset *= 4;
1474 t0 = (msr * 2) % 8;
1475 t1 = msr / 8;
1476 break;
1477 case 0xc0000000 ... 0xc0001fff:
1478 t0 = (8192 + msr - 0xc0000000) * 2;
1479 t1 = (t0 / 8);
1480 t0 %= 8;
1481 break;
1482 case 0xc0010000 ... 0xc0011fff:
1483 t0 = (16384 + msr - 0xc0010000) * 2;
1484 t1 = (t0 / 8);
1485 t0 %= 8;
1486 break;
1487 default:
1488 ret = true;
1489 goto out;
1490 }
1491 1640
1492 ret = msrpm[t1] & ((1 << param) << t0); 1641 if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
1493 1642 return NESTED_EXIT_DONE;
1494out:
1495 nested_svm_unmap(msrpm, KM_USER0);
1496 1643
1497 return ret; 1644 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1498} 1645}
1499 1646
1500static int nested_svm_exit_special(struct vcpu_svm *svm) 1647static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -1504,17 +1651,21 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1504 switch (exit_code) { 1651 switch (exit_code) {
1505 case SVM_EXIT_INTR: 1652 case SVM_EXIT_INTR:
1506 case SVM_EXIT_NMI: 1653 case SVM_EXIT_NMI:
1654 case SVM_EXIT_EXCP_BASE + MC_VECTOR:
1507 return NESTED_EXIT_HOST; 1655 return NESTED_EXIT_HOST;
1508 /* For now we are always handling NPFs when using them */
1509 case SVM_EXIT_NPF: 1656 case SVM_EXIT_NPF:
1657 /* For now we are always handling NPFs when using them */
1510 if (npt_enabled) 1658 if (npt_enabled)
1511 return NESTED_EXIT_HOST; 1659 return NESTED_EXIT_HOST;
1512 break; 1660 break;
1513 /* When we're shadowing, trap PFs */
1514 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1661 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1662 /* When we're shadowing, trap PFs */
1515 if (!npt_enabled) 1663 if (!npt_enabled)
1516 return NESTED_EXIT_HOST; 1664 return NESTED_EXIT_HOST;
1517 break; 1665 break;
1666 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
1667 nm_interception(svm);
1668 break;
1518 default: 1669 default:
1519 break; 1670 break;
1520 } 1671 }
@@ -1525,7 +1676,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1525/* 1676/*
1526 * If this function returns true, this #vmexit was already handled 1677 * If this function returns true, this #vmexit was already handled
1527 */ 1678 */
1528static int nested_svm_exit_handled(struct vcpu_svm *svm) 1679static int nested_svm_intercept(struct vcpu_svm *svm)
1529{ 1680{
1530 u32 exit_code = svm->vmcb->control.exit_code; 1681 u32 exit_code = svm->vmcb->control.exit_code;
1531 int vmexit = NESTED_EXIT_HOST; 1682 int vmexit = NESTED_EXIT_HOST;
@@ -1534,6 +1685,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1534 case SVM_EXIT_MSR: 1685 case SVM_EXIT_MSR:
1535 vmexit = nested_svm_exit_handled_msr(svm); 1686 vmexit = nested_svm_exit_handled_msr(svm);
1536 break; 1687 break;
1688 case SVM_EXIT_IOIO:
1689 vmexit = nested_svm_intercept_ioio(svm);
1690 break;
1537 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 1691 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1538 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 1692 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1539 if (svm->nested.intercept_cr_read & cr_bits) 1693 if (svm->nested.intercept_cr_read & cr_bits)
@@ -1564,6 +1718,10 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1564 vmexit = NESTED_EXIT_DONE; 1718 vmexit = NESTED_EXIT_DONE;
1565 break; 1719 break;
1566 } 1720 }
1721 case SVM_EXIT_ERR: {
1722 vmexit = NESTED_EXIT_DONE;
1723 break;
1724 }
1567 default: { 1725 default: {
1568 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1726 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1569 if (svm->nested.intercept & exit_bits) 1727 if (svm->nested.intercept & exit_bits)
@@ -1571,9 +1729,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1571 } 1729 }
1572 } 1730 }
1573 1731
1574 if (vmexit == NESTED_EXIT_DONE) { 1732 return vmexit;
1733}
1734
1735static int nested_svm_exit_handled(struct vcpu_svm *svm)
1736{
1737 int vmexit;
1738
1739 vmexit = nested_svm_intercept(svm);
1740
1741 if (vmexit == NESTED_EXIT_DONE)
1575 nested_svm_vmexit(svm); 1742 nested_svm_vmexit(svm);
1576 }
1577 1743
1578 return vmexit; 1744 return vmexit;
1579} 1745}
@@ -1615,6 +1781,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1615 struct vmcb *nested_vmcb; 1781 struct vmcb *nested_vmcb;
1616 struct vmcb *hsave = svm->nested.hsave; 1782 struct vmcb *hsave = svm->nested.hsave;
1617 struct vmcb *vmcb = svm->vmcb; 1783 struct vmcb *vmcb = svm->vmcb;
1784 struct page *page;
1618 1785
1619 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 1786 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1620 vmcb->control.exit_info_1, 1787 vmcb->control.exit_info_1,
@@ -1622,10 +1789,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1622 vmcb->control.exit_int_info, 1789 vmcb->control.exit_int_info,
1623 vmcb->control.exit_int_info_err); 1790 vmcb->control.exit_int_info_err);
1624 1791
1625 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1792 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
1626 if (!nested_vmcb) 1793 if (!nested_vmcb)
1627 return 1; 1794 return 1;
1628 1795
1796 /* Exit nested SVM mode */
1797 svm->nested.vmcb = 0;
1798
1629 /* Give the current vmcb to the guest */ 1799 /* Give the current vmcb to the guest */
1630 disable_gif(svm); 1800 disable_gif(svm);
1631 1801
@@ -1635,9 +1805,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1635 nested_vmcb->save.ds = vmcb->save.ds; 1805 nested_vmcb->save.ds = vmcb->save.ds;
1636 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1806 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1637 nested_vmcb->save.idtr = vmcb->save.idtr; 1807 nested_vmcb->save.idtr = vmcb->save.idtr;
1638 if (npt_enabled) 1808 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1639 nested_vmcb->save.cr3 = vmcb->save.cr3; 1809 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1640 nested_vmcb->save.cr2 = vmcb->save.cr2; 1810 nested_vmcb->save.cr2 = vmcb->save.cr2;
1811 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1641 nested_vmcb->save.rflags = vmcb->save.rflags; 1812 nested_vmcb->save.rflags = vmcb->save.rflags;
1642 nested_vmcb->save.rip = vmcb->save.rip; 1813 nested_vmcb->save.rip = vmcb->save.rip;
1643 nested_vmcb->save.rsp = vmcb->save.rsp; 1814 nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -1709,10 +1880,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1709 svm->vmcb->save.cpl = 0; 1880 svm->vmcb->save.cpl = 0;
1710 svm->vmcb->control.exit_int_info = 0; 1881 svm->vmcb->control.exit_int_info = 0;
1711 1882
1712 /* Exit nested SVM mode */ 1883 nested_svm_unmap(page);
1713 svm->nested.vmcb = 0;
1714
1715 nested_svm_unmap(nested_vmcb, KM_USER0);
1716 1884
1717 kvm_mmu_reset_context(&svm->vcpu); 1885 kvm_mmu_reset_context(&svm->vcpu);
1718 kvm_mmu_load(&svm->vcpu); 1886 kvm_mmu_load(&svm->vcpu);
@@ -1722,19 +1890,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1722 1890
1723static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 1891static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
1724{ 1892{
1725 u32 *nested_msrpm; 1893 /*
1894 * This function merges the msr permission bitmaps of kvm and the
1895 * nested vmcb. It is omptimized in that it only merges the parts where
1896 * the kvm msr permission bitmap may contain zero bits
1897 */
1726 int i; 1898 int i;
1727 1899
1728 nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1900 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1729 if (!nested_msrpm) 1901 return true;
1730 return false;
1731 1902
1732 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) 1903 for (i = 0; i < MSRPM_OFFSETS; i++) {
1733 svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; 1904 u32 value, p;
1905 u64 offset;
1734 1906
1735 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 1907 if (msrpm_offsets[i] == 0xffffffff)
1908 break;
1909
1910 p = msrpm_offsets[i];
1911 offset = svm->nested.vmcb_msrpm + (p * 4);
1912
1913 if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
1914 return false;
1915
1916 svm->nested.msrpm[p] = svm->msrpm[p] | value;
1917 }
1736 1918
1737 nested_svm_unmap(nested_msrpm, KM_USER0); 1919 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
1738 1920
1739 return true; 1921 return true;
1740} 1922}
@@ -1744,26 +1926,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1744 struct vmcb *nested_vmcb; 1926 struct vmcb *nested_vmcb;
1745 struct vmcb *hsave = svm->nested.hsave; 1927 struct vmcb *hsave = svm->nested.hsave;
1746 struct vmcb *vmcb = svm->vmcb; 1928 struct vmcb *vmcb = svm->vmcb;
1929 struct page *page;
1930 u64 vmcb_gpa;
1747 1931
1748 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 1932 vmcb_gpa = svm->vmcb->save.rax;
1933
1934 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1749 if (!nested_vmcb) 1935 if (!nested_vmcb)
1750 return false; 1936 return false;
1751 1937
1752 /* nested_vmcb is our indicator if nested SVM is activated */ 1938 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
1753 svm->nested.vmcb = svm->vmcb->save.rax;
1754
1755 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1756 nested_vmcb->save.rip, 1939 nested_vmcb->save.rip,
1757 nested_vmcb->control.int_ctl, 1940 nested_vmcb->control.int_ctl,
1758 nested_vmcb->control.event_inj, 1941 nested_vmcb->control.event_inj,
1759 nested_vmcb->control.nested_ctl); 1942 nested_vmcb->control.nested_ctl);
1760 1943
1944 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
1945 nested_vmcb->control.intercept_cr_write,
1946 nested_vmcb->control.intercept_exceptions,
1947 nested_vmcb->control.intercept);
1948
1761 /* Clear internal status */ 1949 /* Clear internal status */
1762 kvm_clear_exception_queue(&svm->vcpu); 1950 kvm_clear_exception_queue(&svm->vcpu);
1763 kvm_clear_interrupt_queue(&svm->vcpu); 1951 kvm_clear_interrupt_queue(&svm->vcpu);
1764 1952
1765 /* Save the old vmcb, so we don't need to pick what we save, but 1953 /*
1766 can restore everything when a VMEXIT occurs */ 1954 * Save the old vmcb, so we don't need to pick what we save, but can
1955 * restore everything when a VMEXIT occurs
1956 */
1767 hsave->save.es = vmcb->save.es; 1957 hsave->save.es = vmcb->save.es;
1768 hsave->save.cs = vmcb->save.cs; 1958 hsave->save.cs = vmcb->save.cs;
1769 hsave->save.ss = vmcb->save.ss; 1959 hsave->save.ss = vmcb->save.ss;
@@ -1803,14 +1993,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1803 if (npt_enabled) { 1993 if (npt_enabled) {
1804 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 1994 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1805 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 1995 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1806 } else { 1996 } else
1807 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 1997 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1808 kvm_mmu_reset_context(&svm->vcpu); 1998
1809 } 1999 /* Guest paging mode is active - reset mmu */
2000 kvm_mmu_reset_context(&svm->vcpu);
2001
1810 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 2002 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
1811 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 2003 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1812 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 2004 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1813 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 2005 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2006
1814 /* In case we don't even reach vcpu_run, the fields are not updated */ 2007 /* In case we don't even reach vcpu_run, the fields are not updated */
1815 svm->vmcb->save.rax = nested_vmcb->save.rax; 2008 svm->vmcb->save.rax = nested_vmcb->save.rax;
1816 svm->vmcb->save.rsp = nested_vmcb->save.rsp; 2009 svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1819,22 +2012,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1819 svm->vmcb->save.dr6 = nested_vmcb->save.dr6; 2012 svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1820 svm->vmcb->save.cpl = nested_vmcb->save.cpl; 2013 svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1821 2014
1822 /* We don't want a nested guest to be more powerful than the guest, 2015 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
1823 so all intercepts are ORed */ 2016 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
1824 svm->vmcb->control.intercept_cr_read |=
1825 nested_vmcb->control.intercept_cr_read;
1826 svm->vmcb->control.intercept_cr_write |=
1827 nested_vmcb->control.intercept_cr_write;
1828 svm->vmcb->control.intercept_dr_read |=
1829 nested_vmcb->control.intercept_dr_read;
1830 svm->vmcb->control.intercept_dr_write |=
1831 nested_vmcb->control.intercept_dr_write;
1832 svm->vmcb->control.intercept_exceptions |=
1833 nested_vmcb->control.intercept_exceptions;
1834
1835 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1836
1837 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1838 2017
1839 /* cache intercepts */ 2018 /* cache intercepts */
1840 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2019 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
@@ -1851,13 +2030,43 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1851 else 2030 else
1852 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 2031 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1853 2032
2033 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2034 /* We only want the cr8 intercept bits of the guest */
2035 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
2036 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2037 }
2038
2039 /* We don't want to see VMMCALLs from a nested guest */
2040 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
2041
2042 /*
2043 * We don't want a nested guest to be more powerful than the guest, so
2044 * all intercepts are ORed
2045 */
2046 svm->vmcb->control.intercept_cr_read |=
2047 nested_vmcb->control.intercept_cr_read;
2048 svm->vmcb->control.intercept_cr_write |=
2049 nested_vmcb->control.intercept_cr_write;
2050 svm->vmcb->control.intercept_dr_read |=
2051 nested_vmcb->control.intercept_dr_read;
2052 svm->vmcb->control.intercept_dr_write |=
2053 nested_vmcb->control.intercept_dr_write;
2054 svm->vmcb->control.intercept_exceptions |=
2055 nested_vmcb->control.intercept_exceptions;
2056
2057 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2058
2059 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
1854 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2060 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1855 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 2061 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1856 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 2062 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1857 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 2063 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1858 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 2064 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1859 2065
1860 nested_svm_unmap(nested_vmcb, KM_USER0); 2066 nested_svm_unmap(page);
2067
2068 /* nested_vmcb is our indicator if nested SVM is activated */
2069 svm->nested.vmcb = vmcb_gpa;
1861 2070
1862 enable_gif(svm); 2071 enable_gif(svm);
1863 2072
@@ -1883,6 +2092,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1883static int vmload_interception(struct vcpu_svm *svm) 2092static int vmload_interception(struct vcpu_svm *svm)
1884{ 2093{
1885 struct vmcb *nested_vmcb; 2094 struct vmcb *nested_vmcb;
2095 struct page *page;
1886 2096
1887 if (nested_svm_check_permissions(svm)) 2097 if (nested_svm_check_permissions(svm))
1888 return 1; 2098 return 1;
@@ -1890,12 +2100,12 @@ static int vmload_interception(struct vcpu_svm *svm)
1890 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2100 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1891 skip_emulated_instruction(&svm->vcpu); 2101 skip_emulated_instruction(&svm->vcpu);
1892 2102
1893 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2103 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1894 if (!nested_vmcb) 2104 if (!nested_vmcb)
1895 return 1; 2105 return 1;
1896 2106
1897 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2107 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
1898 nested_svm_unmap(nested_vmcb, KM_USER0); 2108 nested_svm_unmap(page);
1899 2109
1900 return 1; 2110 return 1;
1901} 2111}
@@ -1903,6 +2113,7 @@ static int vmload_interception(struct vcpu_svm *svm)
1903static int vmsave_interception(struct vcpu_svm *svm) 2113static int vmsave_interception(struct vcpu_svm *svm)
1904{ 2114{
1905 struct vmcb *nested_vmcb; 2115 struct vmcb *nested_vmcb;
2116 struct page *page;
1906 2117
1907 if (nested_svm_check_permissions(svm)) 2118 if (nested_svm_check_permissions(svm))
1908 return 1; 2119 return 1;
@@ -1910,12 +2121,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
1910 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2121 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1911 skip_emulated_instruction(&svm->vcpu); 2122 skip_emulated_instruction(&svm->vcpu);
1912 2123
1913 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2124 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1914 if (!nested_vmcb) 2125 if (!nested_vmcb)
1915 return 1; 2126 return 1;
1916 2127
1917 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2128 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
1918 nested_svm_unmap(nested_vmcb, KM_USER0); 2129 nested_svm_unmap(page);
1919 2130
1920 return 1; 2131 return 1;
1921} 2132}
@@ -2018,6 +2229,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
2018 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2229 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2019 uint32_t idt_v = 2230 uint32_t idt_v =
2020 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2231 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2232 bool has_error_code = false;
2233 u32 error_code = 0;
2021 2234
2022 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2235 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2023 2236
@@ -2038,6 +2251,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
2038 svm->vcpu.arch.nmi_injected = false; 2251 svm->vcpu.arch.nmi_injected = false;
2039 break; 2252 break;
2040 case SVM_EXITINTINFO_TYPE_EXEPT: 2253 case SVM_EXITINTINFO_TYPE_EXEPT:
2254 if (svm->vmcb->control.exit_info_2 &
2255 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2256 has_error_code = true;
2257 error_code =
2258 (u32)svm->vmcb->control.exit_info_2;
2259 }
2041 kvm_clear_exception_queue(&svm->vcpu); 2260 kvm_clear_exception_queue(&svm->vcpu);
2042 break; 2261 break;
2043 case SVM_EXITINTINFO_TYPE_INTR: 2262 case SVM_EXITINTINFO_TYPE_INTR:
@@ -2054,7 +2273,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
2054 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 2273 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2055 skip_emulated_instruction(&svm->vcpu); 2274 skip_emulated_instruction(&svm->vcpu);
2056 2275
2057 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2276 if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
2277 has_error_code, error_code) == EMULATE_FAIL) {
2278 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2279 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2280 svm->vcpu.run->internal.ndata = 0;
2281 return 0;
2282 }
2283 return 1;
2058} 2284}
2059 2285
2060static int cpuid_interception(struct vcpu_svm *svm) 2286static int cpuid_interception(struct vcpu_svm *svm)
@@ -2067,7 +2293,7 @@ static int cpuid_interception(struct vcpu_svm *svm)
2067static int iret_interception(struct vcpu_svm *svm) 2293static int iret_interception(struct vcpu_svm *svm)
2068{ 2294{
2069 ++svm->vcpu.stat.nmi_window_exits; 2295 ++svm->vcpu.stat.nmi_window_exits;
2070 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2296 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2071 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2297 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2072 return 1; 2298 return 1;
2073} 2299}
@@ -2145,9 +2371,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2145 case MSR_IA32_SYSENTER_ESP: 2371 case MSR_IA32_SYSENTER_ESP:
2146 *data = svm->sysenter_esp; 2372 *data = svm->sysenter_esp;
2147 break; 2373 break;
2148 /* Nobody will change the following 5 values in the VMCB so 2374 /*
2149 we can safely return them on rdmsr. They will always be 0 2375 * Nobody will change the following 5 values in the VMCB so we can
2150 until LBRV is implemented. */ 2376 * safely return them on rdmsr. They will always be 0 until LBRV is
2377 * implemented.
2378 */
2151 case MSR_IA32_DEBUGCTLMSR: 2379 case MSR_IA32_DEBUGCTLMSR:
2152 *data = svm->vmcb->save.dbgctl; 2380 *data = svm->vmcb->save.dbgctl;
2153 break; 2381 break;
@@ -2167,7 +2395,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2167 *data = svm->nested.hsave_msr; 2395 *data = svm->nested.hsave_msr;
2168 break; 2396 break;
2169 case MSR_VM_CR: 2397 case MSR_VM_CR:
2170 *data = 0; 2398 *data = svm->nested.vm_cr_msr;
2171 break; 2399 break;
2172 case MSR_IA32_UCODE_REV: 2400 case MSR_IA32_UCODE_REV:
2173 *data = 0x01000065; 2401 *data = 0x01000065;
@@ -2197,6 +2425,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2197 return 1; 2425 return 1;
2198} 2426}
2199 2427
2428static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2429{
2430 struct vcpu_svm *svm = to_svm(vcpu);
2431 int svm_dis, chg_mask;
2432
2433 if (data & ~SVM_VM_CR_VALID_MASK)
2434 return 1;
2435
2436 chg_mask = SVM_VM_CR_VALID_MASK;
2437
2438 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2439 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2440
2441 svm->nested.vm_cr_msr &= ~chg_mask;
2442 svm->nested.vm_cr_msr |= (data & chg_mask);
2443
2444 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2445
2446 /* check for svm_disable while efer.svme is set */
2447 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2448 return 1;
2449
2450 return 0;
2451}
2452
2200static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 2453static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2201{ 2454{
2202 struct vcpu_svm *svm = to_svm(vcpu); 2455 struct vcpu_svm *svm = to_svm(vcpu);
@@ -2263,6 +2516,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2263 svm->nested.hsave_msr = data; 2516 svm->nested.hsave_msr = data;
2264 break; 2517 break;
2265 case MSR_VM_CR: 2518 case MSR_VM_CR:
2519 return svm_set_vm_cr(vcpu, data);
2266 case MSR_VM_IGNNE: 2520 case MSR_VM_IGNNE:
2267 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 2521 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2268 break; 2522 break;
@@ -2326,16 +2580,16 @@ static int pause_interception(struct vcpu_svm *svm)
2326} 2580}
2327 2581
2328static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 2582static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2329 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2583 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2330 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2584 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2331 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2585 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2332 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2586 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2333 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2587 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2334 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2588 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2335 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2589 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2336 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2590 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2337 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2591 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2338 [SVM_EXIT_READ_DR0] = emulate_on_interception, 2592 [SVM_EXIT_READ_DR0] = emulate_on_interception,
2339 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2593 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2340 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2594 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2341 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2595 [SVM_EXIT_READ_DR3] = emulate_on_interception,
@@ -2354,15 +2608,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2354 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2608 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2355 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2609 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2356 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 2610 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
2357 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 2611 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
2358 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 2612 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
2359 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 2613 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
2360 [SVM_EXIT_INTR] = intr_interception, 2614 [SVM_EXIT_INTR] = intr_interception,
2361 [SVM_EXIT_NMI] = nmi_interception, 2615 [SVM_EXIT_NMI] = nmi_interception,
2362 [SVM_EXIT_SMI] = nop_on_interception, 2616 [SVM_EXIT_SMI] = nop_on_interception,
2363 [SVM_EXIT_INIT] = nop_on_interception, 2617 [SVM_EXIT_INIT] = nop_on_interception,
2364 [SVM_EXIT_VINTR] = interrupt_window_interception, 2618 [SVM_EXIT_VINTR] = interrupt_window_interception,
2365 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2366 [SVM_EXIT_CPUID] = cpuid_interception, 2619 [SVM_EXIT_CPUID] = cpuid_interception,
2367 [SVM_EXIT_IRET] = iret_interception, 2620 [SVM_EXIT_IRET] = iret_interception,
2368 [SVM_EXIT_INVD] = emulate_on_interception, 2621 [SVM_EXIT_INVD] = emulate_on_interception,
@@ -2370,7 +2623,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2370 [SVM_EXIT_HLT] = halt_interception, 2623 [SVM_EXIT_HLT] = halt_interception,
2371 [SVM_EXIT_INVLPG] = invlpg_interception, 2624 [SVM_EXIT_INVLPG] = invlpg_interception,
2372 [SVM_EXIT_INVLPGA] = invlpga_interception, 2625 [SVM_EXIT_INVLPGA] = invlpga_interception,
2373 [SVM_EXIT_IOIO] = io_interception, 2626 [SVM_EXIT_IOIO] = io_interception,
2374 [SVM_EXIT_MSR] = msr_interception, 2627 [SVM_EXIT_MSR] = msr_interception,
2375 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2628 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
2376 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 2629 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -2393,7 +2646,12 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2393 struct kvm_run *kvm_run = vcpu->run; 2646 struct kvm_run *kvm_run = vcpu->run;
2394 u32 exit_code = svm->vmcb->control.exit_code; 2647 u32 exit_code = svm->vmcb->control.exit_code;
2395 2648
2396 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2649 trace_kvm_exit(exit_code, vcpu);
2650
2651 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2652 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2653 if (npt_enabled)
2654 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2397 2655
2398 if (unlikely(svm->nested.exit_required)) { 2656 if (unlikely(svm->nested.exit_required)) {
2399 nested_svm_vmexit(svm); 2657 nested_svm_vmexit(svm);
@@ -2422,11 +2680,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2422 2680
2423 svm_complete_interrupts(svm); 2681 svm_complete_interrupts(svm);
2424 2682
2425 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2426 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2427 if (npt_enabled)
2428 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2429
2430 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2683 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2431 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2684 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2432 kvm_run->fail_entry.hardware_entry_failure_reason 2685 kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2479,7 +2732,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2479 2732
2480 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 2733 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2481 vcpu->arch.hflags |= HF_NMI_MASK; 2734 vcpu->arch.hflags |= HF_NMI_MASK;
2482 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); 2735 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2483 ++vcpu->stat.nmi_injections; 2736 ++vcpu->stat.nmi_injections;
2484} 2737}
2485 2738
@@ -2511,6 +2764,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2511{ 2764{
2512 struct vcpu_svm *svm = to_svm(vcpu); 2765 struct vcpu_svm *svm = to_svm(vcpu);
2513 2766
2767 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2768 return;
2769
2514 if (irr == -1) 2770 if (irr == -1)
2515 return; 2771 return;
2516 2772
@@ -2522,8 +2778,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2522{ 2778{
2523 struct vcpu_svm *svm = to_svm(vcpu); 2779 struct vcpu_svm *svm = to_svm(vcpu);
2524 struct vmcb *vmcb = svm->vmcb; 2780 struct vmcb *vmcb = svm->vmcb;
2525 return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2781 int ret;
2526 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2782 ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2783 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2784 ret = ret && gif_set(svm) && nested_svm_nmi(svm);
2785
2786 return ret;
2527} 2787}
2528 2788
2529static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 2789static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2539,10 +2799,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2539 2799
2540 if (masked) { 2800 if (masked) {
2541 svm->vcpu.arch.hflags |= HF_NMI_MASK; 2801 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2542 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); 2802 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2543 } else { 2803 } else {
2544 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 2804 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2545 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2805 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2546 } 2806 }
2547} 2807}
2548 2808
@@ -2568,13 +2828,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2568{ 2828{
2569 struct vcpu_svm *svm = to_svm(vcpu); 2829 struct vcpu_svm *svm = to_svm(vcpu);
2570 2830
2571 nested_svm_intr(svm); 2831 /*
2572 2832 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
2573 /* In case GIF=0 we can't rely on the CPU to tell us when 2833 * 1, because that's a separate STGI/VMRUN intercept. The next time we
2574 * GIF becomes 1, because that's a separate STGI/VMRUN intercept. 2834 * get that intercept, this function will be called again though and
2575 * The next time we get that intercept, this function will be 2835 * we'll get the vintr intercept.
2576 * called again though and we'll get the vintr intercept. */ 2836 */
2577 if (gif_set(svm)) { 2837 if (gif_set(svm) && nested_svm_intr(svm)) {
2578 svm_set_vintr(svm); 2838 svm_set_vintr(svm);
2579 svm_inject_irq(svm, 0x0); 2839 svm_inject_irq(svm, 0x0);
2580 } 2840 }
@@ -2588,9 +2848,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2588 == HF_NMI_MASK) 2848 == HF_NMI_MASK)
2589 return; /* IRET will cause a vm exit */ 2849 return; /* IRET will cause a vm exit */
2590 2850
2591 /* Something prevents NMI from been injected. Single step over 2851 /*
2592 possible problem (IRET or exception injection or interrupt 2852 * Something prevents NMI from been injected. Single step over possible
2593 shadow) */ 2853 * problem (IRET or exception injection or interrupt shadow)
2854 */
2594 svm->nmi_singlestep = true; 2855 svm->nmi_singlestep = true;
2595 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2856 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2596 update_db_intercept(vcpu); 2857 update_db_intercept(vcpu);
@@ -2614,6 +2875,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2614{ 2875{
2615 struct vcpu_svm *svm = to_svm(vcpu); 2876 struct vcpu_svm *svm = to_svm(vcpu);
2616 2877
2878 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2879 return;
2880
2617 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2881 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2618 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2882 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2619 kvm_set_cr8(vcpu, cr8); 2883 kvm_set_cr8(vcpu, cr8);
@@ -2625,6 +2889,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2625 struct vcpu_svm *svm = to_svm(vcpu); 2889 struct vcpu_svm *svm = to_svm(vcpu);
2626 u64 cr8; 2890 u64 cr8;
2627 2891
2892 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2893 return;
2894
2628 cr8 = kvm_get_cr8(vcpu); 2895 cr8 = kvm_get_cr8(vcpu);
2629 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2896 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2630 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2897 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2635,6 +2902,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2635 u8 vector; 2902 u8 vector;
2636 int type; 2903 int type;
2637 u32 exitintinfo = svm->vmcb->control.exit_int_info; 2904 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2905 unsigned int3_injected = svm->int3_injected;
2906
2907 svm->int3_injected = 0;
2638 2908
2639 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 2909 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2640 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 2910 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2654,18 +2924,25 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2654 svm->vcpu.arch.nmi_injected = true; 2924 svm->vcpu.arch.nmi_injected = true;
2655 break; 2925 break;
2656 case SVM_EXITINTINFO_TYPE_EXEPT: 2926 case SVM_EXITINTINFO_TYPE_EXEPT:
2657 /* In case of software exception do not reinject an exception 2927 /*
2658 vector, but re-execute and instruction instead */ 2928 * In case of software exceptions, do not reinject the vector,
2659 if (is_nested(svm)) 2929 * but re-execute the instruction instead. Rewind RIP first
2660 break; 2930 * if we emulated INT3 before.
2661 if (kvm_exception_is_soft(vector)) 2931 */
2932 if (kvm_exception_is_soft(vector)) {
2933 if (vector == BP_VECTOR && int3_injected &&
2934 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
2935 kvm_rip_write(&svm->vcpu,
2936 kvm_rip_read(&svm->vcpu) -
2937 int3_injected);
2662 break; 2938 break;
2939 }
2663 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 2940 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2664 u32 err = svm->vmcb->control.exit_int_info_err; 2941 u32 err = svm->vmcb->control.exit_int_info_err;
2665 kvm_queue_exception_e(&svm->vcpu, vector, err); 2942 kvm_requeue_exception_e(&svm->vcpu, vector, err);
2666 2943
2667 } else 2944 } else
2668 kvm_queue_exception(&svm->vcpu, vector); 2945 kvm_requeue_exception(&svm->vcpu, vector);
2669 break; 2946 break;
2670 case SVM_EXITINTINFO_TYPE_INTR: 2947 case SVM_EXITINTINFO_TYPE_INTR:
2671 kvm_queue_interrupt(&svm->vcpu, vector, false); 2948 kvm_queue_interrupt(&svm->vcpu, vector, false);
@@ -2688,6 +2965,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2688 u16 gs_selector; 2965 u16 gs_selector;
2689 u16 ldt_selector; 2966 u16 ldt_selector;
2690 2967
2968 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2969 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2970 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
2971
2691 /* 2972 /*
2692 * A vmexit emulation is required before the vcpu can be executed 2973 * A vmexit emulation is required before the vcpu can be executed
2693 * again. 2974 * again.
@@ -2695,10 +2976,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2695 if (unlikely(svm->nested.exit_required)) 2976 if (unlikely(svm->nested.exit_required))
2696 return; 2977 return;
2697 2978
2698 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2699 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2700 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
2701
2702 pre_svm_run(svm); 2979 pre_svm_run(svm);
2703 2980
2704 sync_lapic_to_cr8(vcpu); 2981 sync_lapic_to_cr8(vcpu);
@@ -2879,25 +3156,39 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2879{ 3156{
2880} 3157}
2881 3158
3159static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3160{
3161 switch (func) {
3162 case 0x8000000A:
3163 entry->eax = 1; /* SVM revision 1 */
3164 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
3165 ASID emulation to nested SVM */
3166 entry->ecx = 0; /* Reserved */
3167 entry->edx = 0; /* Do not support any additional features */
3168
3169 break;
3170 }
3171}
3172
2882static const struct trace_print_flags svm_exit_reasons_str[] = { 3173static const struct trace_print_flags svm_exit_reasons_str[] = {
2883 { SVM_EXIT_READ_CR0, "read_cr0" }, 3174 { SVM_EXIT_READ_CR0, "read_cr0" },
2884 { SVM_EXIT_READ_CR3, "read_cr3" }, 3175 { SVM_EXIT_READ_CR3, "read_cr3" },
2885 { SVM_EXIT_READ_CR4, "read_cr4" }, 3176 { SVM_EXIT_READ_CR4, "read_cr4" },
2886 { SVM_EXIT_READ_CR8, "read_cr8" }, 3177 { SVM_EXIT_READ_CR8, "read_cr8" },
2887 { SVM_EXIT_WRITE_CR0, "write_cr0" }, 3178 { SVM_EXIT_WRITE_CR0, "write_cr0" },
2888 { SVM_EXIT_WRITE_CR3, "write_cr3" }, 3179 { SVM_EXIT_WRITE_CR3, "write_cr3" },
2889 { SVM_EXIT_WRITE_CR4, "write_cr4" }, 3180 { SVM_EXIT_WRITE_CR4, "write_cr4" },
2890 { SVM_EXIT_WRITE_CR8, "write_cr8" }, 3181 { SVM_EXIT_WRITE_CR8, "write_cr8" },
2891 { SVM_EXIT_READ_DR0, "read_dr0" }, 3182 { SVM_EXIT_READ_DR0, "read_dr0" },
2892 { SVM_EXIT_READ_DR1, "read_dr1" }, 3183 { SVM_EXIT_READ_DR1, "read_dr1" },
2893 { SVM_EXIT_READ_DR2, "read_dr2" }, 3184 { SVM_EXIT_READ_DR2, "read_dr2" },
2894 { SVM_EXIT_READ_DR3, "read_dr3" }, 3185 { SVM_EXIT_READ_DR3, "read_dr3" },
2895 { SVM_EXIT_WRITE_DR0, "write_dr0" }, 3186 { SVM_EXIT_WRITE_DR0, "write_dr0" },
2896 { SVM_EXIT_WRITE_DR1, "write_dr1" }, 3187 { SVM_EXIT_WRITE_DR1, "write_dr1" },
2897 { SVM_EXIT_WRITE_DR2, "write_dr2" }, 3188 { SVM_EXIT_WRITE_DR2, "write_dr2" },
2898 { SVM_EXIT_WRITE_DR3, "write_dr3" }, 3189 { SVM_EXIT_WRITE_DR3, "write_dr3" },
2899 { SVM_EXIT_WRITE_DR5, "write_dr5" }, 3190 { SVM_EXIT_WRITE_DR5, "write_dr5" },
2900 { SVM_EXIT_WRITE_DR7, "write_dr7" }, 3191 { SVM_EXIT_WRITE_DR7, "write_dr7" },
2901 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, 3192 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
2902 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, 3193 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
2903 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, 3194 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
@@ -2946,8 +3237,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2946{ 3237{
2947 struct vcpu_svm *svm = to_svm(vcpu); 3238 struct vcpu_svm *svm = to_svm(vcpu);
2948 3239
2949 update_cr0_intercept(svm);
2950 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3240 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
3241 if (is_nested(svm))
3242 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3243 update_cr0_intercept(svm);
2951} 3244}
2952 3245
2953static struct kvm_x86_ops svm_x86_ops = { 3246static struct kvm_x86_ops svm_x86_ops = {
@@ -2986,8 +3279,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2986 .set_idt = svm_set_idt, 3279 .set_idt = svm_set_idt,
2987 .get_gdt = svm_get_gdt, 3280 .get_gdt = svm_get_gdt,
2988 .set_gdt = svm_set_gdt, 3281 .set_gdt = svm_set_gdt,
2989 .get_dr = svm_get_dr, 3282 .set_dr7 = svm_set_dr7,
2990 .set_dr = svm_set_dr,
2991 .cache_reg = svm_cache_reg, 3283 .cache_reg = svm_cache_reg,
2992 .get_rflags = svm_get_rflags, 3284 .get_rflags = svm_get_rflags,
2993 .set_rflags = svm_set_rflags, 3285 .set_rflags = svm_set_rflags,
@@ -3023,12 +3315,14 @@ static struct kvm_x86_ops svm_x86_ops = {
3023 .cpuid_update = svm_cpuid_update, 3315 .cpuid_update = svm_cpuid_update,
3024 3316
3025 .rdtscp_supported = svm_rdtscp_supported, 3317 .rdtscp_supported = svm_rdtscp_supported,
3318
3319 .set_supported_cpuid = svm_set_supported_cpuid,
3026}; 3320};
3027 3321
3028static int __init svm_init(void) 3322static int __init svm_init(void)
3029{ 3323{
3030 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), 3324 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
3031 THIS_MODULE); 3325 __alignof__(struct vcpu_svm), THIS_MODULE);
3032} 3326}
3033 3327
3034static void __exit svm_exit(void) 3328static void __exit svm_exit(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066..4ddadb1a5ff 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
12 /* 12 /*
13 * There is a race window between reading and incrementing, but we do 13 * There is a race window between reading and incrementing, but we do
14 * not care about potentially loosing timer events in the !reinject 14 * not care about potentially loosing timer events in the !reinject
15 * case anyway. 15 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
16 * in vcpu_enter_guest.
16 */ 17 */
17 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
18 atomic_inc(&ktimer->pending); 19 atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f04..a6544b8e7c0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
5 5
6#undef TRACE_SYSTEM 6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH arch/x86/kvm
9#define TRACE_INCLUDE_FILE trace
10 8
11/* 9/*
12 * Tracepoint for guest mode entry. 10 * Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
184 * Tracepoint for kvm guest exit: 182 * Tracepoint for kvm guest exit:
185 */ 183 */
186TRACE_EVENT(kvm_exit, 184TRACE_EVENT(kvm_exit,
187 TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), 185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
188 TP_ARGS(exit_reason, guest_rip), 186 TP_ARGS(exit_reason, vcpu),
189 187
190 TP_STRUCT__entry( 188 TP_STRUCT__entry(
191 __field( unsigned int, exit_reason ) 189 __field( unsigned int, exit_reason )
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
194 192
195 TP_fast_assign( 193 TP_fast_assign(
196 __entry->exit_reason = exit_reason; 194 __entry->exit_reason = exit_reason;
197 __entry->guest_rip = guest_rip; 195 __entry->guest_rip = kvm_rip_read(vcpu);
198 ), 196 ),
199 197
200 TP_printk("reason %s rip 0x%lx", 198 TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
221 TP_printk("irq %u", __entry->irq) 219 TP_printk("irq %u", __entry->irq)
222); 220);
223 221
222#define EXS(x) { x##_VECTOR, "#" #x }
223
224#define kvm_trace_sym_exc \
225 EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
226 EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
227 EXS(MF), EXS(MC)
228
229/*
230 * Tracepoint for kvm interrupt injection:
231 */
232TRACE_EVENT(kvm_inj_exception,
233 TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
234 TP_ARGS(exception, has_error, error_code),
235
236 TP_STRUCT__entry(
237 __field( u8, exception )
238 __field( u8, has_error )
239 __field( u32, error_code )
240 ),
241
242 TP_fast_assign(
243 __entry->exception = exception;
244 __entry->has_error = has_error;
245 __entry->error_code = error_code;
246 ),
247
248 TP_printk("%s (0x%x)",
249 __print_symbolic(__entry->exception, kvm_trace_sym_exc),
250 /* FIXME: don't print error_code if not present */
251 __entry->has_error ? __entry->error_code : 0)
252);
253
224/* 254/*
225 * Tracepoint for page fault. 255 * Tracepoint for page fault.
226 */ 256 */
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
413 ), 443 ),
414 444
415 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " 445 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
416 "event_inj: 0x%08x npt: %s\n", 446 "event_inj: 0x%08x npt: %s",
417 __entry->rip, __entry->vmcb, __entry->nested_rip, 447 __entry->rip, __entry->vmcb, __entry->nested_rip,
418 __entry->int_ctl, __entry->event_inj, 448 __entry->int_ctl, __entry->event_inj,
419 __entry->npt ? "on" : "off") 449 __entry->npt ? "on" : "off")
420); 450);
421 451
452TRACE_EVENT(kvm_nested_intercepts,
453 TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
454 TP_ARGS(cr_read, cr_write, exceptions, intercept),
455
456 TP_STRUCT__entry(
457 __field( __u16, cr_read )
458 __field( __u16, cr_write )
459 __field( __u32, exceptions )
460 __field( __u64, intercept )
461 ),
462
463 TP_fast_assign(
464 __entry->cr_read = cr_read;
465 __entry->cr_write = cr_write;
466 __entry->exceptions = exceptions;
467 __entry->intercept = intercept;
468 ),
469
470 TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
471 __entry->cr_read, __entry->cr_write, __entry->exceptions,
472 __entry->intercept)
473);
422/* 474/*
423 * Tracepoint for #VMEXIT while nested 475 * Tracepoint for #VMEXIT while nested
424 */ 476 */
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
447 __entry->exit_int_info_err = exit_int_info_err; 499 __entry->exit_int_info_err = exit_int_info_err;
448 ), 500 ),
449 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " 501 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
450 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 502 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
451 __entry->rip, 503 __entry->rip,
452 ftrace_print_symbols_seq(p, __entry->exit_code, 504 ftrace_print_symbols_seq(p, __entry->exit_code,
453 kvm_x86_ops->exit_reasons_str), 505 kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
482 ), 534 ),
483 535
484 TP_printk("reason: %s ext_inf1: 0x%016llx " 536 TP_printk("reason: %s ext_inf1: 0x%016llx "
485 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 537 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
486 ftrace_print_symbols_seq(p, __entry->exit_code, 538 ftrace_print_symbols_seq(p, __entry->exit_code,
487 kvm_x86_ops->exit_reasons_str), 539 kvm_x86_ops->exit_reasons_str),
488 __entry->exit_info1, __entry->exit_info2, 540 __entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
504 __entry->rip = rip 556 __entry->rip = rip
505 ), 557 ),
506 558
507 TP_printk("rip: 0x%016llx\n", __entry->rip) 559 TP_printk("rip: 0x%016llx", __entry->rip)
508); 560);
509 561
510/* 562/*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
526 __entry->address = address; 578 __entry->address = address;
527 ), 579 ),
528 580
529 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", 581 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
530 __entry->rip, __entry->asid, __entry->address) 582 __entry->rip, __entry->asid, __entry->address)
531); 583);
532 584
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
547 __entry->slb = slb; 599 __entry->slb = slb;
548 ), 600 ),
549 601
550 TP_printk("rip: 0x%016llx slb: 0x%08x\n", 602 TP_printk("rip: 0x%016llx slb: 0x%08x",
551 __entry->rip, __entry->slb) 603 __entry->rip, __entry->slb)
552); 604);
553 605
606#define __print_insn(insn, ilen) ({ \
607 int i; \
608 const char *ret = p->buffer + p->len; \
609 \
610 for (i = 0; i < ilen; ++i) \
611 trace_seq_printf(p, " %02x", insn[i]); \
612 trace_seq_printf(p, "%c", 0); \
613 ret; \
614 })
615
616#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
617#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
618#define KVM_EMUL_INSN_F_CS_D (1 << 2)
619#define KVM_EMUL_INSN_F_CS_L (1 << 3)
620
621#define kvm_trace_symbol_emul_flags \
622 { 0, "real" }, \
623 { KVM_EMUL_INSN_F_CR0_PE \
624 | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
625 { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
626 { KVM_EMUL_INSN_F_CR0_PE \
627 | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
628 { KVM_EMUL_INSN_F_CR0_PE \
629 | KVM_EMUL_INSN_F_CS_L, "prot64" }
630
631#define kei_decode_mode(mode) ({ \
632 u8 flags = 0xff; \
633 switch (mode) { \
634 case X86EMUL_MODE_REAL: \
635 flags = 0; \
636 break; \
637 case X86EMUL_MODE_VM86: \
638 flags = KVM_EMUL_INSN_F_EFL_VM; \
639 break; \
640 case X86EMUL_MODE_PROT16: \
641 flags = KVM_EMUL_INSN_F_CR0_PE; \
642 break; \
643 case X86EMUL_MODE_PROT32: \
644 flags = KVM_EMUL_INSN_F_CR0_PE \
645 | KVM_EMUL_INSN_F_CS_D; \
646 break; \
647 case X86EMUL_MODE_PROT64: \
648 flags = KVM_EMUL_INSN_F_CR0_PE \
649 | KVM_EMUL_INSN_F_CS_L; \
650 break; \
651 } \
652 flags; \
653 })
654
655TRACE_EVENT(kvm_emulate_insn,
656 TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
657 TP_ARGS(vcpu, failed),
658
659 TP_STRUCT__entry(
660 __field( __u64, rip )
661 __field( __u32, csbase )
662 __field( __u8, len )
663 __array( __u8, insn, 15 )
664 __field( __u8, flags )
665 __field( __u8, failed )
666 ),
667
668 TP_fast_assign(
669 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
670 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
671 __entry->len = vcpu->arch.emulate_ctxt.decode.eip
672 - vcpu->arch.emulate_ctxt.decode.fetch.start;
673 memcpy(__entry->insn,
674 vcpu->arch.emulate_ctxt.decode.fetch.data,
675 15);
676 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
677 __entry->failed = failed;
678 ),
679
680 TP_printk("%x:%llx:%s (%s)%s",
681 __entry->csbase, __entry->rip,
682 __print_insn(__entry->insn, __entry->len),
683 __print_symbolic(__entry->flags,
684 kvm_trace_symbol_emul_flags),
685 __entry->failed ? " failed" : ""
686 )
687 );
688
689#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
690#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
691
554#endif /* _TRACE_KVM_H */ 692#endif /* _TRACE_KVM_H */
555 693
694#undef TRACE_INCLUDE_PATH
695#define TRACE_INCLUDE_PATH arch/x86/kvm
696#undef TRACE_INCLUDE_FILE
697#define TRACE_INCLUDE_FILE trace
698
556/* This part must be outside protection */ 699/* This part must be outside protection */
557#include <trace/define_trace.h> 700#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc933cfb4e6..859a01a07db 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/tboot.h>
30#include "kvm_cache_regs.h" 31#include "kvm_cache_regs.h"
31#include "x86.h" 32#include "x86.h"
32 33
@@ -98,6 +99,8 @@ module_param(ple_gap, int, S_IRUGO);
98static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 99static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
99module_param(ple_window, int, S_IRUGO); 100module_param(ple_window, int, S_IRUGO);
100 101
102#define NR_AUTOLOAD_MSRS 1
103
101struct vmcs { 104struct vmcs {
102 u32 revision_id; 105 u32 revision_id;
103 u32 abort; 106 u32 abort;
@@ -125,6 +128,11 @@ struct vcpu_vmx {
125 u64 msr_guest_kernel_gs_base; 128 u64 msr_guest_kernel_gs_base;
126#endif 129#endif
127 struct vmcs *vmcs; 130 struct vmcs *vmcs;
131 struct msr_autoload {
132 unsigned nr;
133 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
134 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
135 } msr_autoload;
128 struct { 136 struct {
129 int loaded; 137 int loaded;
130 u16 fs_sel, gs_sel, ldt_sel; 138 u16 fs_sel, gs_sel, ldt_sel;
@@ -234,56 +242,56 @@ static const u32 vmx_msr_index[] = {
234}; 242};
235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 243#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
236 244
237static inline int is_page_fault(u32 intr_info) 245static inline bool is_page_fault(u32 intr_info)
238{ 246{
239 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 247 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
240 INTR_INFO_VALID_MASK)) == 248 INTR_INFO_VALID_MASK)) ==
241 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 249 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
242} 250}
243 251
244static inline int is_no_device(u32 intr_info) 252static inline bool is_no_device(u32 intr_info)
245{ 253{
246 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 254 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
247 INTR_INFO_VALID_MASK)) == 255 INTR_INFO_VALID_MASK)) ==
248 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 256 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
249} 257}
250 258
251static inline int is_invalid_opcode(u32 intr_info) 259static inline bool is_invalid_opcode(u32 intr_info)
252{ 260{
253 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 261 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
254 INTR_INFO_VALID_MASK)) == 262 INTR_INFO_VALID_MASK)) ==
255 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 263 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
256} 264}
257 265
258static inline int is_external_interrupt(u32 intr_info) 266static inline bool is_external_interrupt(u32 intr_info)
259{ 267{
260 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 268 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
261 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 269 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
262} 270}
263 271
264static inline int is_machine_check(u32 intr_info) 272static inline bool is_machine_check(u32 intr_info)
265{ 273{
266 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 274 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
267 INTR_INFO_VALID_MASK)) == 275 INTR_INFO_VALID_MASK)) ==
268 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); 276 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
269} 277}
270 278
271static inline int cpu_has_vmx_msr_bitmap(void) 279static inline bool cpu_has_vmx_msr_bitmap(void)
272{ 280{
273 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; 281 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
274} 282}
275 283
276static inline int cpu_has_vmx_tpr_shadow(void) 284static inline bool cpu_has_vmx_tpr_shadow(void)
277{ 285{
278 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 286 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
279} 287}
280 288
281static inline int vm_need_tpr_shadow(struct kvm *kvm) 289static inline bool vm_need_tpr_shadow(struct kvm *kvm)
282{ 290{
283 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 291 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
284} 292}
285 293
286static inline int cpu_has_secondary_exec_ctrls(void) 294static inline bool cpu_has_secondary_exec_ctrls(void)
287{ 295{
288 return vmcs_config.cpu_based_exec_ctrl & 296 return vmcs_config.cpu_based_exec_ctrl &
289 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 297 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -303,80 +311,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
303 311
304static inline bool cpu_has_vmx_ept_execute_only(void) 312static inline bool cpu_has_vmx_ept_execute_only(void)
305{ 313{
306 return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); 314 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
307} 315}
308 316
309static inline bool cpu_has_vmx_eptp_uncacheable(void) 317static inline bool cpu_has_vmx_eptp_uncacheable(void)
310{ 318{
311 return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); 319 return vmx_capability.ept & VMX_EPTP_UC_BIT;
312} 320}
313 321
314static inline bool cpu_has_vmx_eptp_writeback(void) 322static inline bool cpu_has_vmx_eptp_writeback(void)
315{ 323{
316 return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); 324 return vmx_capability.ept & VMX_EPTP_WB_BIT;
317} 325}
318 326
319static inline bool cpu_has_vmx_ept_2m_page(void) 327static inline bool cpu_has_vmx_ept_2m_page(void)
320{ 328{
321 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 329 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
322} 330}
323 331
324static inline bool cpu_has_vmx_ept_1g_page(void) 332static inline bool cpu_has_vmx_ept_1g_page(void)
325{ 333{
326 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); 334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
327} 335}
328 336
329static inline int cpu_has_vmx_invept_individual_addr(void) 337static inline bool cpu_has_vmx_invept_individual_addr(void)
330{ 338{
331 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
332} 340}
333 341
334static inline int cpu_has_vmx_invept_context(void) 342static inline bool cpu_has_vmx_invept_context(void)
335{ 343{
336 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); 344 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
337} 345}
338 346
339static inline int cpu_has_vmx_invept_global(void) 347static inline bool cpu_has_vmx_invept_global(void)
340{ 348{
341 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); 349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
342} 350}
343 351
344static inline int cpu_has_vmx_ept(void) 352static inline bool cpu_has_vmx_ept(void)
345{ 353{
346 return vmcs_config.cpu_based_2nd_exec_ctrl & 354 return vmcs_config.cpu_based_2nd_exec_ctrl &
347 SECONDARY_EXEC_ENABLE_EPT; 355 SECONDARY_EXEC_ENABLE_EPT;
348} 356}
349 357
350static inline int cpu_has_vmx_unrestricted_guest(void) 358static inline bool cpu_has_vmx_unrestricted_guest(void)
351{ 359{
352 return vmcs_config.cpu_based_2nd_exec_ctrl & 360 return vmcs_config.cpu_based_2nd_exec_ctrl &
353 SECONDARY_EXEC_UNRESTRICTED_GUEST; 361 SECONDARY_EXEC_UNRESTRICTED_GUEST;
354} 362}
355 363
356static inline int cpu_has_vmx_ple(void) 364static inline bool cpu_has_vmx_ple(void)
357{ 365{
358 return vmcs_config.cpu_based_2nd_exec_ctrl & 366 return vmcs_config.cpu_based_2nd_exec_ctrl &
359 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 367 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
360} 368}
361 369
362static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 370static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
363{ 371{
364 return flexpriority_enabled && irqchip_in_kernel(kvm); 372 return flexpriority_enabled && irqchip_in_kernel(kvm);
365} 373}
366 374
367static inline int cpu_has_vmx_vpid(void) 375static inline bool cpu_has_vmx_vpid(void)
368{ 376{
369 return vmcs_config.cpu_based_2nd_exec_ctrl & 377 return vmcs_config.cpu_based_2nd_exec_ctrl &
370 SECONDARY_EXEC_ENABLE_VPID; 378 SECONDARY_EXEC_ENABLE_VPID;
371} 379}
372 380
373static inline int cpu_has_vmx_rdtscp(void) 381static inline bool cpu_has_vmx_rdtscp(void)
374{ 382{
375 return vmcs_config.cpu_based_2nd_exec_ctrl & 383 return vmcs_config.cpu_based_2nd_exec_ctrl &
376 SECONDARY_EXEC_RDTSCP; 384 SECONDARY_EXEC_RDTSCP;
377} 385}
378 386
379static inline int cpu_has_virtual_nmis(void) 387static inline bool cpu_has_virtual_nmis(void)
380{ 388{
381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
382} 390}
@@ -595,16 +603,56 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
595 vmcs_write32(EXCEPTION_BITMAP, eb); 603 vmcs_write32(EXCEPTION_BITMAP, eb);
596} 604}
597 605
606static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
607{
608 unsigned i;
609 struct msr_autoload *m = &vmx->msr_autoload;
610
611 for (i = 0; i < m->nr; ++i)
612 if (m->guest[i].index == msr)
613 break;
614
615 if (i == m->nr)
616 return;
617 --m->nr;
618 m->guest[i] = m->guest[m->nr];
619 m->host[i] = m->host[m->nr];
620 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
621 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
622}
623
624static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
625 u64 guest_val, u64 host_val)
626{
627 unsigned i;
628 struct msr_autoload *m = &vmx->msr_autoload;
629
630 for (i = 0; i < m->nr; ++i)
631 if (m->guest[i].index == msr)
632 break;
633
634 if (i == m->nr) {
635 ++m->nr;
636 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
637 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
638 }
639
640 m->guest[i].index = msr;
641 m->guest[i].value = guest_val;
642 m->host[i].index = msr;
643 m->host[i].value = host_val;
644}
645
598static void reload_tss(void) 646static void reload_tss(void)
599{ 647{
600 /* 648 /*
601 * VT restores TR but not its size. Useless. 649 * VT restores TR but not its size. Useless.
602 */ 650 */
603 struct descriptor_table gdt; 651 struct desc_ptr gdt;
604 struct desc_struct *descs; 652 struct desc_struct *descs;
605 653
606 kvm_get_gdt(&gdt); 654 native_store_gdt(&gdt);
607 descs = (void *)gdt.base; 655 descs = (void *)gdt.address;
608 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 656 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
609 load_TR_desc(); 657 load_TR_desc();
610} 658}
@@ -631,9 +679,57 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
631 guest_efer |= host_efer & ignore_bits; 679 guest_efer |= host_efer & ignore_bits;
632 vmx->guest_msrs[efer_offset].data = guest_efer; 680 vmx->guest_msrs[efer_offset].data = guest_efer;
633 vmx->guest_msrs[efer_offset].mask = ~ignore_bits; 681 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
682
683 clear_atomic_switch_msr(vmx, MSR_EFER);
684 /* On ept, can't emulate nx, and must switch nx atomically */
685 if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
686 guest_efer = vmx->vcpu.arch.efer;
687 if (!(guest_efer & EFER_LMA))
688 guest_efer &= ~EFER_LME;
689 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
690 return false;
691 }
692
634 return true; 693 return true;
635} 694}
636 695
696static unsigned long segment_base(u16 selector)
697{
698 struct desc_ptr gdt;
699 struct desc_struct *d;
700 unsigned long table_base;
701 unsigned long v;
702
703 if (!(selector & ~3))
704 return 0;
705
706 native_store_gdt(&gdt);
707 table_base = gdt.address;
708
709 if (selector & 4) { /* from ldt */
710 u16 ldt_selector = kvm_read_ldt();
711
712 if (!(ldt_selector & ~3))
713 return 0;
714
715 table_base = segment_base(ldt_selector);
716 }
717 d = (struct desc_struct *)(table_base + (selector & ~7));
718 v = get_desc_base(d);
719#ifdef CONFIG_X86_64
720 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
721 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
722#endif
723 return v;
724}
725
726static inline unsigned long kvm_read_tr_base(void)
727{
728 u16 tr;
729 asm("str %0" : "=g"(tr));
730 return segment_base(tr);
731}
732
637static void vmx_save_host_state(struct kvm_vcpu *vcpu) 733static void vmx_save_host_state(struct kvm_vcpu *vcpu)
638{ 734{
639 struct vcpu_vmx *vmx = to_vmx(vcpu); 735 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -758,7 +854,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 } 854 }
759 855
760 if (vcpu->cpu != cpu) { 856 if (vcpu->cpu != cpu) {
761 struct descriptor_table dt; 857 struct desc_ptr dt;
762 unsigned long sysenter_esp; 858 unsigned long sysenter_esp;
763 859
764 vcpu->cpu = cpu; 860 vcpu->cpu = cpu;
@@ -767,8 +863,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
767 * processors. 863 * processors.
768 */ 864 */
769 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 865 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
770 kvm_get_gdt(&dt); 866 native_store_gdt(&dt);
771 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 867 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
772 868
773 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 869 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
774 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 870 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -846,9 +942,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
846 int ret = 0; 942 int ret = 0;
847 943
848 if (interruptibility & GUEST_INTR_STATE_STI) 944 if (interruptibility & GUEST_INTR_STATE_STI)
849 ret |= X86_SHADOW_INT_STI; 945 ret |= KVM_X86_SHADOW_INT_STI;
850 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 946 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
851 ret |= X86_SHADOW_INT_MOV_SS; 947 ret |= KVM_X86_SHADOW_INT_MOV_SS;
852 948
853 return ret & mask; 949 return ret & mask;
854} 950}
@@ -860,9 +956,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
860 956
861 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 957 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
862 958
863 if (mask & X86_SHADOW_INT_MOV_SS) 959 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
864 interruptibility |= GUEST_INTR_STATE_MOV_SS; 960 interruptibility |= GUEST_INTR_STATE_MOV_SS;
865 if (mask & X86_SHADOW_INT_STI) 961 else if (mask & KVM_X86_SHADOW_INT_STI)
866 interruptibility |= GUEST_INTR_STATE_STI; 962 interruptibility |= GUEST_INTR_STATE_STI;
867 963
868 if ((interruptibility != interruptibility_old)) 964 if ((interruptibility != interruptibility_old))
@@ -882,7 +978,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
882} 978}
883 979
884static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 980static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
885 bool has_error_code, u32 error_code) 981 bool has_error_code, u32 error_code,
982 bool reinject)
886{ 983{
887 struct vcpu_vmx *vmx = to_vmx(vcpu); 984 struct vcpu_vmx *vmx = to_vmx(vcpu);
888 u32 intr_info = nr | INTR_INFO_VALID_MASK; 985 u32 intr_info = nr | INTR_INFO_VALID_MASK;
@@ -1176,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
1176 u64 msr; 1273 u64 msr;
1177 1274
1178 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1275 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1179 return (msr & (FEATURE_CONTROL_LOCKED | 1276 if (msr & FEATURE_CONTROL_LOCKED) {
1180 FEATURE_CONTROL_VMXON_ENABLED)) 1277 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1181 == FEATURE_CONTROL_LOCKED; 1278 && tboot_enabled())
1279 return 1;
1280 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1281 && !tboot_enabled())
1282 return 1;
1283 }
1284
1285 return 0;
1182 /* locked but not enabled */ 1286 /* locked but not enabled */
1183} 1287}
1184 1288
@@ -1186,21 +1290,23 @@ static int hardware_enable(void *garbage)
1186{ 1290{
1187 int cpu = raw_smp_processor_id(); 1291 int cpu = raw_smp_processor_id();
1188 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1292 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1189 u64 old; 1293 u64 old, test_bits;
1190 1294
1191 if (read_cr4() & X86_CR4_VMXE) 1295 if (read_cr4() & X86_CR4_VMXE)
1192 return -EBUSY; 1296 return -EBUSY;
1193 1297
1194 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1298 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1195 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1299 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1196 if ((old & (FEATURE_CONTROL_LOCKED | 1300
1197 FEATURE_CONTROL_VMXON_ENABLED)) 1301 test_bits = FEATURE_CONTROL_LOCKED;
1198 != (FEATURE_CONTROL_LOCKED | 1302 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1199 FEATURE_CONTROL_VMXON_ENABLED)) 1303 if (tboot_enabled())
1304 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1305
1306 if ((old & test_bits) != test_bits) {
1200 /* enable and lock */ 1307 /* enable and lock */
1201 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1202 FEATURE_CONTROL_LOCKED | 1309 }
1203 FEATURE_CONTROL_VMXON_ENABLED);
1204 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1205 asm volatile (ASM_VMX_VMXON_RAX 1311 asm volatile (ASM_VMX_VMXON_RAX
1206 : : "a"(&phys_addr), "m"(phys_addr) 1312 : : "a"(&phys_addr), "m"(phys_addr)
@@ -1521,7 +1627,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1521 struct kvm_memslots *slots; 1627 struct kvm_memslots *slots;
1522 gfn_t base_gfn; 1628 gfn_t base_gfn;
1523 1629
1524 slots = rcu_dereference(kvm->memslots); 1630 slots = kvm_memslots(kvm);
1525 base_gfn = kvm->memslots->memslots[0].base_gfn + 1631 base_gfn = kvm->memslots->memslots[0].base_gfn +
1526 kvm->memslots->memslots[0].npages - 3; 1632 kvm->memslots->memslots[0].npages - 3;
1527 return base_gfn << PAGE_SHIFT; 1633 return base_gfn << PAGE_SHIFT;
@@ -1649,6 +1755,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1649 vmcs_write32(VM_ENTRY_CONTROLS, 1755 vmcs_write32(VM_ENTRY_CONTROLS,
1650 vmcs_read32(VM_ENTRY_CONTROLS) 1756 vmcs_read32(VM_ENTRY_CONTROLS)
1651 & ~VM_ENTRY_IA32E_MODE); 1757 & ~VM_ENTRY_IA32E_MODE);
1758 vmx_set_efer(vcpu, vcpu->arch.efer);
1652} 1759}
1653 1760
1654#endif 1761#endif
@@ -1934,28 +2041,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1934 *l = (ar >> 13) & 1; 2041 *l = (ar >> 13) & 1;
1935} 2042}
1936 2043
1937static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2044static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1938{ 2045{
1939 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); 2046 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
1940 dt->base = vmcs_readl(GUEST_IDTR_BASE); 2047 dt->address = vmcs_readl(GUEST_IDTR_BASE);
1941} 2048}
1942 2049
1943static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2050static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1944{ 2051{
1945 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); 2052 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
1946 vmcs_writel(GUEST_IDTR_BASE, dt->base); 2053 vmcs_writel(GUEST_IDTR_BASE, dt->address);
1947} 2054}
1948 2055
1949static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2056static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1950{ 2057{
1951 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); 2058 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
1952 dt->base = vmcs_readl(GUEST_GDTR_BASE); 2059 dt->address = vmcs_readl(GUEST_GDTR_BASE);
1953} 2060}
1954 2061
1955static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2062static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1956{ 2063{
1957 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); 2064 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
1958 vmcs_writel(GUEST_GDTR_BASE, dt->base); 2065 vmcs_writel(GUEST_GDTR_BASE, dt->address);
1959} 2066}
1960 2067
1961static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 2068static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2296,6 +2403,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2296 spin_unlock(&vmx_vpid_lock); 2403 spin_unlock(&vmx_vpid_lock);
2297} 2404}
2298 2405
2406static void free_vpid(struct vcpu_vmx *vmx)
2407{
2408 if (!enable_vpid)
2409 return;
2410 spin_lock(&vmx_vpid_lock);
2411 if (vmx->vpid != 0)
2412 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2413 spin_unlock(&vmx_vpid_lock);
2414}
2415
2299static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 2416static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2300{ 2417{
2301 int f = sizeof(unsigned long); 2418 int f = sizeof(unsigned long);
@@ -2334,7 +2451,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2334 u32 junk; 2451 u32 junk;
2335 u64 host_pat, tsc_this, tsc_base; 2452 u64 host_pat, tsc_this, tsc_base;
2336 unsigned long a; 2453 unsigned long a;
2337 struct descriptor_table dt; 2454 struct desc_ptr dt;
2338 int i; 2455 int i;
2339 unsigned long kvm_vmx_return; 2456 unsigned long kvm_vmx_return;
2340 u32 exec_control; 2457 u32 exec_control;
@@ -2415,14 +2532,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2415 2532
2416 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 2533 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2417 2534
2418 kvm_get_idt(&dt); 2535 native_store_idt(&dt);
2419 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 2536 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2420 2537
2421 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 2538 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2422 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 2539 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2423 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 2540 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2424 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 2541 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2542 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2425 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 2543 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2544 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2426 2545
2427 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); 2546 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2428 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); 2547 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -2703,8 +2822,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2703 return 0; 2822 return 0;
2704 2823
2705 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2824 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2706 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | 2825 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
2707 GUEST_INTR_STATE_NMI));
2708} 2826}
2709 2827
2710static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 2828static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2948,22 +3066,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
2948 int size, in, string; 3066 int size, in, string;
2949 unsigned port; 3067 unsigned port;
2950 3068
2951 ++vcpu->stat.io_exits;
2952 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3069 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2953 string = (exit_qualification & 16) != 0; 3070 string = (exit_qualification & 16) != 0;
3071 in = (exit_qualification & 8) != 0;
2954 3072
2955 if (string) { 3073 ++vcpu->stat.io_exits;
2956 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2957 return 0;
2958 return 1;
2959 }
2960 3074
2961 size = (exit_qualification & 7) + 1; 3075 if (string || in)
2962 in = (exit_qualification & 8) != 0; 3076 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
2963 port = exit_qualification >> 16;
2964 3077
3078 port = exit_qualification >> 16;
3079 size = (exit_qualification & 7) + 1;
2965 skip_emulated_instruction(vcpu); 3080 skip_emulated_instruction(vcpu);
2966 return kvm_emulate_pio(vcpu, in, size, port); 3081
3082 return kvm_fast_pio_out(vcpu, size, port);
2967} 3083}
2968 3084
2969static void 3085static void
@@ -3054,19 +3170,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3054 return 0; 3170 return 0;
3055} 3171}
3056 3172
3057static int check_dr_alias(struct kvm_vcpu *vcpu)
3058{
3059 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3060 kvm_queue_exception(vcpu, UD_VECTOR);
3061 return -1;
3062 }
3063 return 0;
3064}
3065
3066static int handle_dr(struct kvm_vcpu *vcpu) 3173static int handle_dr(struct kvm_vcpu *vcpu)
3067{ 3174{
3068 unsigned long exit_qualification; 3175 unsigned long exit_qualification;
3069 unsigned long val;
3070 int dr, reg; 3176 int dr, reg;
3071 3177
3072 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 3178 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3101,67 +3207,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3101 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 3207 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3102 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 3208 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3103 if (exit_qualification & TYPE_MOV_FROM_DR) { 3209 if (exit_qualification & TYPE_MOV_FROM_DR) {
3104 switch (dr) { 3210 unsigned long val;
3105 case 0 ... 3: 3211 if (!kvm_get_dr(vcpu, dr, &val))
3106 val = vcpu->arch.db[dr]; 3212 kvm_register_write(vcpu, reg, val);
3107 break; 3213 } else
3108 case 4: 3214 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
3109 if (check_dr_alias(vcpu) < 0)
3110 return 1;
3111 /* fall through */
3112 case 6:
3113 val = vcpu->arch.dr6;
3114 break;
3115 case 5:
3116 if (check_dr_alias(vcpu) < 0)
3117 return 1;
3118 /* fall through */
3119 default: /* 7 */
3120 val = vcpu->arch.dr7;
3121 break;
3122 }
3123 kvm_register_write(vcpu, reg, val);
3124 } else {
3125 val = vcpu->arch.regs[reg];
3126 switch (dr) {
3127 case 0 ... 3:
3128 vcpu->arch.db[dr] = val;
3129 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3130 vcpu->arch.eff_db[dr] = val;
3131 break;
3132 case 4:
3133 if (check_dr_alias(vcpu) < 0)
3134 return 1;
3135 /* fall through */
3136 case 6:
3137 if (val & 0xffffffff00000000ULL) {
3138 kvm_inject_gp(vcpu, 0);
3139 return 1;
3140 }
3141 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3142 break;
3143 case 5:
3144 if (check_dr_alias(vcpu) < 0)
3145 return 1;
3146 /* fall through */
3147 default: /* 7 */
3148 if (val & 0xffffffff00000000ULL) {
3149 kvm_inject_gp(vcpu, 0);
3150 return 1;
3151 }
3152 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3153 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
3154 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3155 vcpu->arch.switch_db_regs =
3156 (val & DR7_BP_EN_MASK);
3157 }
3158 break;
3159 }
3160 }
3161 skip_emulated_instruction(vcpu); 3215 skip_emulated_instruction(vcpu);
3162 return 1; 3216 return 1;
3163} 3217}
3164 3218
3219static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3220{
3221 vmcs_writel(GUEST_DR7, val);
3222}
3223
3165static int handle_cpuid(struct kvm_vcpu *vcpu) 3224static int handle_cpuid(struct kvm_vcpu *vcpu)
3166{ 3225{
3167 kvm_emulate_cpuid(vcpu); 3226 kvm_emulate_cpuid(vcpu);
@@ -3293,6 +3352,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3293{ 3352{
3294 struct vcpu_vmx *vmx = to_vmx(vcpu); 3353 struct vcpu_vmx *vmx = to_vmx(vcpu);
3295 unsigned long exit_qualification; 3354 unsigned long exit_qualification;
3355 bool has_error_code = false;
3356 u32 error_code = 0;
3296 u16 tss_selector; 3357 u16 tss_selector;
3297 int reason, type, idt_v; 3358 int reason, type, idt_v;
3298 3359
@@ -3315,6 +3376,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3315 kvm_clear_interrupt_queue(vcpu); 3376 kvm_clear_interrupt_queue(vcpu);
3316 break; 3377 break;
3317 case INTR_TYPE_HARD_EXCEPTION: 3378 case INTR_TYPE_HARD_EXCEPTION:
3379 if (vmx->idt_vectoring_info &
3380 VECTORING_INFO_DELIVER_CODE_MASK) {
3381 has_error_code = true;
3382 error_code =
3383 vmcs_read32(IDT_VECTORING_ERROR_CODE);
3384 }
3385 /* fall through */
3318 case INTR_TYPE_SOFT_EXCEPTION: 3386 case INTR_TYPE_SOFT_EXCEPTION:
3319 kvm_clear_exception_queue(vcpu); 3387 kvm_clear_exception_queue(vcpu);
3320 break; 3388 break;
@@ -3329,8 +3397,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3329 type != INTR_TYPE_NMI_INTR)) 3397 type != INTR_TYPE_NMI_INTR))
3330 skip_emulated_instruction(vcpu); 3398 skip_emulated_instruction(vcpu);
3331 3399
3332 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3400 if (kvm_task_switch(vcpu, tss_selector, reason,
3401 has_error_code, error_code) == EMULATE_FAIL) {
3402 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3403 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3404 vcpu->run->internal.ndata = 0;
3333 return 0; 3405 return 0;
3406 }
3334 3407
3335 /* clear all local breakpoint enable flags */ 3408 /* clear all local breakpoint enable flags */
3336 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); 3409 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3575,7 +3648,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3575 u32 exit_reason = vmx->exit_reason; 3648 u32 exit_reason = vmx->exit_reason;
3576 u32 vectoring_info = vmx->idt_vectoring_info; 3649 u32 vectoring_info = vmx->idt_vectoring_info;
3577 3650
3578 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3651 trace_kvm_exit(exit_reason, vcpu);
3579 3652
3580 /* If guest state is invalid, start emulating */ 3653 /* If guest state is invalid, start emulating */
3581 if (vmx->emulation_required && emulate_invalid_guest_state) 3654 if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -3660,8 +3733,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3660 3733
3661 /* We need to handle NMIs before interrupts are enabled */ 3734 /* We need to handle NMIs before interrupts are enabled */
3662 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 3735 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3663 (exit_intr_info & INTR_INFO_VALID_MASK)) 3736 (exit_intr_info & INTR_INFO_VALID_MASK)) {
3737 kvm_before_handle_nmi(&vmx->vcpu);
3664 asm("int $2"); 3738 asm("int $2");
3739 kvm_after_handle_nmi(&vmx->vcpu);
3740 }
3665 3741
3666 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3742 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3667 3743
@@ -3921,10 +3997,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3921{ 3997{
3922 struct vcpu_vmx *vmx = to_vmx(vcpu); 3998 struct vcpu_vmx *vmx = to_vmx(vcpu);
3923 3999
3924 spin_lock(&vmx_vpid_lock); 4000 free_vpid(vmx);
3925 if (vmx->vpid != 0)
3926 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3927 spin_unlock(&vmx_vpid_lock);
3928 vmx_free_vmcs(vcpu); 4001 vmx_free_vmcs(vcpu);
3929 kfree(vmx->guest_msrs); 4002 kfree(vmx->guest_msrs);
3930 kvm_vcpu_uninit(vcpu); 4003 kvm_vcpu_uninit(vcpu);
@@ -3986,6 +4059,7 @@ free_msrs:
3986uninit_vcpu: 4059uninit_vcpu:
3987 kvm_vcpu_uninit(&vmx->vcpu); 4060 kvm_vcpu_uninit(&vmx->vcpu);
3988free_vcpu: 4061free_vcpu:
4062 free_vpid(vmx);
3989 kmem_cache_free(kvm_vcpu_cache, vmx); 4063 kmem_cache_free(kvm_vcpu_cache, vmx);
3990 return ERR_PTR(err); 4064 return ERR_PTR(err);
3991} 4065}
@@ -4116,6 +4190,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4116 } 4190 }
4117} 4191}
4118 4192
4193static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4194{
4195}
4196
4119static struct kvm_x86_ops vmx_x86_ops = { 4197static struct kvm_x86_ops vmx_x86_ops = {
4120 .cpu_has_kvm_support = cpu_has_kvm_support, 4198 .cpu_has_kvm_support = cpu_has_kvm_support,
4121 .disabled_by_bios = vmx_disabled_by_bios, 4199 .disabled_by_bios = vmx_disabled_by_bios,
@@ -4152,6 +4230,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4152 .set_idt = vmx_set_idt, 4230 .set_idt = vmx_set_idt,
4153 .get_gdt = vmx_get_gdt, 4231 .get_gdt = vmx_get_gdt,
4154 .set_gdt = vmx_set_gdt, 4232 .set_gdt = vmx_set_gdt,
4233 .set_dr7 = vmx_set_dr7,
4155 .cache_reg = vmx_cache_reg, 4234 .cache_reg = vmx_cache_reg,
4156 .get_rflags = vmx_get_rflags, 4235 .get_rflags = vmx_get_rflags,
4157 .set_rflags = vmx_set_rflags, 4236 .set_rflags = vmx_set_rflags,
@@ -4187,6 +4266,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4187 .cpuid_update = vmx_cpuid_update, 4266 .cpuid_update = vmx_cpuid_update,
4188 4267
4189 .rdtscp_supported = vmx_rdtscp_supported, 4268 .rdtscp_supported = vmx_rdtscp_supported,
4269
4270 .set_supported_cpuid = vmx_set_supported_cpuid,
4190}; 4271};
4191 4272
4192static int __init vmx_init(void) 4273static int __init vmx_init(void)
@@ -4234,7 +4315,8 @@ static int __init vmx_init(void)
4234 4315
4235 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 4316 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
4236 4317
4237 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 4318 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
4319 __alignof__(struct vcpu_vmx), THIS_MODULE);
4238 if (r) 4320 if (r)
4239 goto out3; 4321 goto out3;
4240 4322
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4ca98ad27..05d571f6f19 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -40,8 +40,9 @@
40#include <linux/user-return-notifier.h> 40#include <linux/user-return-notifier.h>
41#include <linux/srcu.h> 41#include <linux/srcu.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/perf_event.h>
43#include <trace/events/kvm.h> 44#include <trace/events/kvm.h>
44#undef TRACE_INCLUDE_FILE 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include "trace.h" 47#include "trace.h"
47 48
@@ -223,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
223 kvm_on_user_return(&smsr->urn); 224 kvm_on_user_return(&smsr->urn);
224} 225}
225 226
226unsigned long segment_base(u16 selector)
227{
228 struct descriptor_table gdt;
229 struct desc_struct *d;
230 unsigned long table_base;
231 unsigned long v;
232
233 if (selector == 0)
234 return 0;
235
236 kvm_get_gdt(&gdt);
237 table_base = gdt.base;
238
239 if (selector & 4) { /* from ldt */
240 u16 ldt_selector = kvm_read_ldt();
241
242 table_base = segment_base(ldt_selector);
243 }
244 d = (struct desc_struct *)(table_base + (selector & ~7));
245 v = get_desc_base(d);
246#ifdef CONFIG_X86_64
247 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
248 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
249#endif
250 return v;
251}
252EXPORT_SYMBOL_GPL(segment_base);
253
254u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 227u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
255{ 228{
256 if (irqchip_in_kernel(vcpu->kvm)) 229 if (irqchip_in_kernel(vcpu->kvm))
@@ -292,7 +265,8 @@ static int exception_class(int vector)
292} 265}
293 266
294static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 267static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
295 unsigned nr, bool has_error, u32 error_code) 268 unsigned nr, bool has_error, u32 error_code,
269 bool reinject)
296{ 270{
297 u32 prev_nr; 271 u32 prev_nr;
298 int class1, class2; 272 int class1, class2;
@@ -303,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
303 vcpu->arch.exception.has_error_code = has_error; 277 vcpu->arch.exception.has_error_code = has_error;
304 vcpu->arch.exception.nr = nr; 278 vcpu->arch.exception.nr = nr;
305 vcpu->arch.exception.error_code = error_code; 279 vcpu->arch.exception.error_code = error_code;
280 vcpu->arch.exception.reinject = reinject;
306 return; 281 return;
307 } 282 }
308 283
@@ -331,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
331 306
332void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 307void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
333{ 308{
334 kvm_multiple_exception(vcpu, nr, false, 0); 309 kvm_multiple_exception(vcpu, nr, false, 0, false);
335} 310}
336EXPORT_SYMBOL_GPL(kvm_queue_exception); 311EXPORT_SYMBOL_GPL(kvm_queue_exception);
337 312
313void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
314{
315 kvm_multiple_exception(vcpu, nr, false, 0, true);
316}
317EXPORT_SYMBOL_GPL(kvm_requeue_exception);
318
338void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 319void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
339 u32 error_code) 320 u32 error_code)
340{ 321{
@@ -351,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
351 332
352void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 333void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
353{ 334{
354 kvm_multiple_exception(vcpu, nr, true, error_code); 335 kvm_multiple_exception(vcpu, nr, true, error_code, false);
355} 336}
356EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 337EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
357 338
339void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
340{
341 kvm_multiple_exception(vcpu, nr, true, error_code, true);
342}
343EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
344
358/* 345/*
359 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 346 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
360 * a #GP and return false. 347 * a #GP and return false.
@@ -475,7 +462,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
475 } 462 }
476 463
477 kvm_x86_ops->set_cr0(vcpu, cr0); 464 kvm_x86_ops->set_cr0(vcpu, cr0);
478 vcpu->arch.cr0 = cr0;
479 465
480 kvm_mmu_reset_context(vcpu); 466 kvm_mmu_reset_context(vcpu);
481 return; 467 return;
@@ -484,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
484 470
485void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
486{ 472{
487 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); 473 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
488} 474}
489EXPORT_SYMBOL_GPL(kvm_lmsw); 475EXPORT_SYMBOL_GPL(kvm_lmsw);
490 476
@@ -516,7 +502,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
516 } 502 }
517 kvm_x86_ops->set_cr4(vcpu, cr4); 503 kvm_x86_ops->set_cr4(vcpu, cr4);
518 vcpu->arch.cr4 = cr4; 504 vcpu->arch.cr4 = cr4;
519 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
520 kvm_mmu_reset_context(vcpu); 505 kvm_mmu_reset_context(vcpu);
521} 506}
522EXPORT_SYMBOL_GPL(kvm_set_cr4); 507EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -591,6 +576,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
591} 576}
592EXPORT_SYMBOL_GPL(kvm_get_cr8); 577EXPORT_SYMBOL_GPL(kvm_get_cr8);
593 578
579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
580{
581 switch (dr) {
582 case 0 ... 3:
583 vcpu->arch.db[dr] = val;
584 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
585 vcpu->arch.eff_db[dr] = val;
586 break;
587 case 4:
588 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
589 kvm_queue_exception(vcpu, UD_VECTOR);
590 return 1;
591 }
592 /* fall through */
593 case 6:
594 if (val & 0xffffffff00000000ULL) {
595 kvm_inject_gp(vcpu, 0);
596 return 1;
597 }
598 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
599 break;
600 case 5:
601 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
602 kvm_queue_exception(vcpu, UD_VECTOR);
603 return 1;
604 }
605 /* fall through */
606 default: /* 7 */
607 if (val & 0xffffffff00000000ULL) {
608 kvm_inject_gp(vcpu, 0);
609 return 1;
610 }
611 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
612 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
613 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
614 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
615 }
616 break;
617 }
618
619 return 0;
620}
621EXPORT_SYMBOL_GPL(kvm_set_dr);
622
623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
624{
625 switch (dr) {
626 case 0 ... 3:
627 *val = vcpu->arch.db[dr];
628 break;
629 case 4:
630 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
631 kvm_queue_exception(vcpu, UD_VECTOR);
632 return 1;
633 }
634 /* fall through */
635 case 6:
636 *val = vcpu->arch.dr6;
637 break;
638 case 5:
639 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
640 kvm_queue_exception(vcpu, UD_VECTOR);
641 return 1;
642 }
643 /* fall through */
644 default: /* 7 */
645 *val = vcpu->arch.dr7;
646 break;
647 }
648
649 return 0;
650}
651EXPORT_SYMBOL_GPL(kvm_get_dr);
652
594static inline u32 bit(int bitno) 653static inline u32 bit(int bitno)
595{ 654{
596 return 1 << (bitno & 31); 655 return 1 << (bitno & 31);
@@ -605,9 +664,10 @@ static inline u32 bit(int bitno)
605 * kvm-specific. Those are put in the beginning of the list. 664 * kvm-specific. Those are put in the beginning of the list.
606 */ 665 */
607 666
608#define KVM_SAVE_MSRS_BEGIN 5 667#define KVM_SAVE_MSRS_BEGIN 7
609static u32 msrs_to_save[] = { 668static u32 msrs_to_save[] = {
610 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 669 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
670 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
611 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 671 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
612 HV_X64_MSR_APIC_ASSIST_PAGE, 672 HV_X64_MSR_APIC_ASSIST_PAGE,
613 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 673 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -624,48 +684,42 @@ static u32 emulated_msrs[] = {
624 MSR_IA32_MISC_ENABLE, 684 MSR_IA32_MISC_ENABLE,
625}; 685};
626 686
627static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 687static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
628{ 688{
629 if (efer & efer_reserved_bits) { 689 if (efer & efer_reserved_bits)
630 kvm_inject_gp(vcpu, 0); 690 return 1;
631 return;
632 }
633 691
634 if (is_paging(vcpu) 692 if (is_paging(vcpu)
635 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { 693 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
636 kvm_inject_gp(vcpu, 0); 694 return 1;
637 return;
638 }
639 695
640 if (efer & EFER_FFXSR) { 696 if (efer & EFER_FFXSR) {
641 struct kvm_cpuid_entry2 *feat; 697 struct kvm_cpuid_entry2 *feat;
642 698
643 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 699 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
644 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 700 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
645 kvm_inject_gp(vcpu, 0); 701 return 1;
646 return;
647 }
648 } 702 }
649 703
650 if (efer & EFER_SVME) { 704 if (efer & EFER_SVME) {
651 struct kvm_cpuid_entry2 *feat; 705 struct kvm_cpuid_entry2 *feat;
652 706
653 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 707 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
654 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 708 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
655 kvm_inject_gp(vcpu, 0); 709 return 1;
656 return;
657 }
658 } 710 }
659 711
660 kvm_x86_ops->set_efer(vcpu, efer);
661
662 efer &= ~EFER_LMA; 712 efer &= ~EFER_LMA;
663 efer |= vcpu->arch.efer & EFER_LMA; 713 efer |= vcpu->arch.efer & EFER_LMA;
664 714
715 kvm_x86_ops->set_efer(vcpu, efer);
716
665 vcpu->arch.efer = efer; 717 vcpu->arch.efer = efer;
666 718
667 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 719 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
668 kvm_mmu_reset_context(vcpu); 720 kvm_mmu_reset_context(vcpu);
721
722 return 0;
669} 723}
670 724
671void kvm_enable_efer_bits(u64 mask) 725void kvm_enable_efer_bits(u64 mask)
@@ -695,14 +749,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
695 749
696static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 750static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
697{ 751{
698 static int version; 752 int version;
753 int r;
699 struct pvclock_wall_clock wc; 754 struct pvclock_wall_clock wc;
700 struct timespec boot; 755 struct timespec boot;
701 756
702 if (!wall_clock) 757 if (!wall_clock)
703 return; 758 return;
704 759
705 version++; 760 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
761 if (r)
762 return;
763
764 if (version & 1)
765 ++version; /* first time write, random junk */
766
767 ++version;
706 768
707 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 769 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
708 770
@@ -795,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
795 vcpu->hv_clock.system_time = ts.tv_nsec + 857 vcpu->hv_clock.system_time = ts.tv_nsec +
796 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 858 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
797 859
860 vcpu->hv_clock.flags = 0;
861
798 /* 862 /*
799 * The interface expects us to write an even number signaling that the 863 * The interface expects us to write an even number signaling that the
800 * update is finished. Since the guest won't see the intermediate 864 * update is finished. Since the guest won't see the intermediate
@@ -1086,10 +1150,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1086{ 1150{
1087 switch (msr) { 1151 switch (msr) {
1088 case MSR_EFER: 1152 case MSR_EFER:
1089 set_efer(vcpu, data); 1153 return set_efer(vcpu, data);
1090 break;
1091 case MSR_K7_HWCR: 1154 case MSR_K7_HWCR:
1092 data &= ~(u64)0x40; /* ignore flush filter disable */ 1155 data &= ~(u64)0x40; /* ignore flush filter disable */
1156 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1093 if (data != 0) { 1157 if (data != 0) {
1094 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1158 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1095 data); 1159 data);
@@ -1132,10 +1196,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1132 case MSR_IA32_MISC_ENABLE: 1196 case MSR_IA32_MISC_ENABLE:
1133 vcpu->arch.ia32_misc_enable_msr = data; 1197 vcpu->arch.ia32_misc_enable_msr = data;
1134 break; 1198 break;
1199 case MSR_KVM_WALL_CLOCK_NEW:
1135 case MSR_KVM_WALL_CLOCK: 1200 case MSR_KVM_WALL_CLOCK:
1136 vcpu->kvm->arch.wall_clock = data; 1201 vcpu->kvm->arch.wall_clock = data;
1137 kvm_write_wall_clock(vcpu->kvm, data); 1202 kvm_write_wall_clock(vcpu->kvm, data);
1138 break; 1203 break;
1204 case MSR_KVM_SYSTEM_TIME_NEW:
1139 case MSR_KVM_SYSTEM_TIME: { 1205 case MSR_KVM_SYSTEM_TIME: {
1140 if (vcpu->arch.time_page) { 1206 if (vcpu->arch.time_page) {
1141 kvm_release_page_dirty(vcpu->arch.time_page); 1207 kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1407,9 +1473,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1407 data = vcpu->arch.efer; 1473 data = vcpu->arch.efer;
1408 break; 1474 break;
1409 case MSR_KVM_WALL_CLOCK: 1475 case MSR_KVM_WALL_CLOCK:
1476 case MSR_KVM_WALL_CLOCK_NEW:
1410 data = vcpu->kvm->arch.wall_clock; 1477 data = vcpu->kvm->arch.wall_clock;
1411 break; 1478 break;
1412 case MSR_KVM_SYSTEM_TIME: 1479 case MSR_KVM_SYSTEM_TIME:
1480 case MSR_KVM_SYSTEM_TIME_NEW:
1413 data = vcpu->arch.time; 1481 data = vcpu->arch.time;
1414 break; 1482 break;
1415 case MSR_IA32_P5_MC_ADDR: 1483 case MSR_IA32_P5_MC_ADDR:
@@ -1548,6 +1616,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1548 case KVM_CAP_HYPERV_VAPIC: 1616 case KVM_CAP_HYPERV_VAPIC:
1549 case KVM_CAP_HYPERV_SPIN: 1617 case KVM_CAP_HYPERV_SPIN:
1550 case KVM_CAP_PCI_SEGMENT: 1618 case KVM_CAP_PCI_SEGMENT:
1619 case KVM_CAP_DEBUGREGS:
1551 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1620 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1552 r = 1; 1621 r = 1;
1553 break; 1622 break;
@@ -1712,6 +1781,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1712 if (copy_from_user(cpuid_entries, entries, 1781 if (copy_from_user(cpuid_entries, entries,
1713 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1782 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1714 goto out_free; 1783 goto out_free;
1784 vcpu_load(vcpu);
1715 for (i = 0; i < cpuid->nent; i++) { 1785 for (i = 0; i < cpuid->nent; i++) {
1716 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1786 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1717 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1787 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1729,6 +1799,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1729 r = 0; 1799 r = 0;
1730 kvm_apic_set_version(vcpu); 1800 kvm_apic_set_version(vcpu);
1731 kvm_x86_ops->cpuid_update(vcpu); 1801 kvm_x86_ops->cpuid_update(vcpu);
1802 vcpu_put(vcpu);
1732 1803
1733out_free: 1804out_free:
1734 vfree(cpuid_entries); 1805 vfree(cpuid_entries);
@@ -1749,9 +1820,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1749 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1820 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1750 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1821 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1751 goto out; 1822 goto out;
1823 vcpu_load(vcpu);
1752 vcpu->arch.cpuid_nent = cpuid->nent; 1824 vcpu->arch.cpuid_nent = cpuid->nent;
1753 kvm_apic_set_version(vcpu); 1825 kvm_apic_set_version(vcpu);
1754 kvm_x86_ops->cpuid_update(vcpu); 1826 kvm_x86_ops->cpuid_update(vcpu);
1827 vcpu_put(vcpu);
1755 return 0; 1828 return 0;
1756 1829
1757out: 1830out:
@@ -1764,6 +1837,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1764{ 1837{
1765 int r; 1838 int r;
1766 1839
1840 vcpu_load(vcpu);
1767 r = -E2BIG; 1841 r = -E2BIG;
1768 if (cpuid->nent < vcpu->arch.cpuid_nent) 1842 if (cpuid->nent < vcpu->arch.cpuid_nent)
1769 goto out; 1843 goto out;
@@ -1775,6 +1849,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1775 1849
1776out: 1850out:
1777 cpuid->nent = vcpu->arch.cpuid_nent; 1851 cpuid->nent = vcpu->arch.cpuid_nent;
1852 vcpu_put(vcpu);
1778 return r; 1853 return r;
1779} 1854}
1780 1855
@@ -1905,6 +1980,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1905 } 1980 }
1906 break; 1981 break;
1907 } 1982 }
1983 case KVM_CPUID_SIGNATURE: {
1984 char signature[12] = "KVMKVMKVM\0\0";
1985 u32 *sigptr = (u32 *)signature;
1986 entry->eax = 0;
1987 entry->ebx = sigptr[0];
1988 entry->ecx = sigptr[1];
1989 entry->edx = sigptr[2];
1990 break;
1991 }
1992 case KVM_CPUID_FEATURES:
1993 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
1994 (1 << KVM_FEATURE_NOP_IO_DELAY) |
1995 (1 << KVM_FEATURE_CLOCKSOURCE2) |
1996 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
1997 entry->ebx = 0;
1998 entry->ecx = 0;
1999 entry->edx = 0;
2000 break;
1908 case 0x80000000: 2001 case 0x80000000:
1909 entry->eax = min(entry->eax, 0x8000001a); 2002 entry->eax = min(entry->eax, 0x8000001a);
1910 break; 2003 break;
@@ -1913,6 +2006,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1913 entry->ecx &= kvm_supported_word6_x86_features; 2006 entry->ecx &= kvm_supported_word6_x86_features;
1914 break; 2007 break;
1915 } 2008 }
2009
2010 kvm_x86_ops->set_supported_cpuid(function, entry);
2011
1916 put_cpu(); 2012 put_cpu();
1917} 2013}
1918 2014
@@ -1948,6 +2044,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1948 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 2044 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1949 do_cpuid_ent(&cpuid_entries[nent], func, 0, 2045 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1950 &nent, cpuid->nent); 2046 &nent, cpuid->nent);
2047
2048
2049
2050 r = -E2BIG;
2051 if (nent >= cpuid->nent)
2052 goto out_free;
2053
2054 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2055 cpuid->nent);
2056
2057 r = -E2BIG;
2058 if (nent >= cpuid->nent)
2059 goto out_free;
2060
2061 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
2062 cpuid->nent);
2063
1951 r = -E2BIG; 2064 r = -E2BIG;
1952 if (nent >= cpuid->nent) 2065 if (nent >= cpuid->nent)
1953 goto out_free; 2066 goto out_free;
@@ -2027,6 +2140,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2027 int r; 2140 int r;
2028 unsigned bank_num = mcg_cap & 0xff, bank; 2141 unsigned bank_num = mcg_cap & 0xff, bank;
2029 2142
2143 vcpu_load(vcpu);
2030 r = -EINVAL; 2144 r = -EINVAL;
2031 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2145 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2032 goto out; 2146 goto out;
@@ -2041,6 +2155,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2041 for (bank = 0; bank < bank_num; bank++) 2155 for (bank = 0; bank < bank_num; bank++)
2042 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2043out: 2157out:
2158 vcpu_put(vcpu);
2044 return r; 2159 return r;
2045} 2160}
2046 2161
@@ -2100,14 +2215,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2100{ 2215{
2101 vcpu_load(vcpu); 2216 vcpu_load(vcpu);
2102 2217
2103 events->exception.injected = vcpu->arch.exception.pending; 2218 events->exception.injected =
2219 vcpu->arch.exception.pending &&
2220 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2104 events->exception.nr = vcpu->arch.exception.nr; 2221 events->exception.nr = vcpu->arch.exception.nr;
2105 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2222 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2106 events->exception.error_code = vcpu->arch.exception.error_code; 2223 events->exception.error_code = vcpu->arch.exception.error_code;
2107 2224
2108 events->interrupt.injected = vcpu->arch.interrupt.pending; 2225 events->interrupt.injected =
2226 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2109 events->interrupt.nr = vcpu->arch.interrupt.nr; 2227 events->interrupt.nr = vcpu->arch.interrupt.nr;
2110 events->interrupt.soft = vcpu->arch.interrupt.soft; 2228 events->interrupt.soft = 0;
2229 events->interrupt.shadow =
2230 kvm_x86_ops->get_interrupt_shadow(vcpu,
2231 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2111 2232
2112 events->nmi.injected = vcpu->arch.nmi_injected; 2233 events->nmi.injected = vcpu->arch.nmi_injected;
2113 events->nmi.pending = vcpu->arch.nmi_pending; 2234 events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2116,7 +2237,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2116 events->sipi_vector = vcpu->arch.sipi_vector; 2237 events->sipi_vector = vcpu->arch.sipi_vector;
2117 2238
2118 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2119 | KVM_VCPUEVENT_VALID_SIPI_VECTOR); 2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241 | KVM_VCPUEVENT_VALID_SHADOW);
2120 2242
2121 vcpu_put(vcpu); 2243 vcpu_put(vcpu);
2122} 2244}
@@ -2125,7 +2247,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2125 struct kvm_vcpu_events *events) 2247 struct kvm_vcpu_events *events)
2126{ 2248{
2127 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2249 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2128 | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) 2250 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2251 | KVM_VCPUEVENT_VALID_SHADOW))
2129 return -EINVAL; 2252 return -EINVAL;
2130 2253
2131 vcpu_load(vcpu); 2254 vcpu_load(vcpu);
@@ -2140,6 +2263,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2140 vcpu->arch.interrupt.soft = events->interrupt.soft; 2263 vcpu->arch.interrupt.soft = events->interrupt.soft;
2141 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 2264 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2142 kvm_pic_clear_isr_ack(vcpu->kvm); 2265 kvm_pic_clear_isr_ack(vcpu->kvm);
2266 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2267 kvm_x86_ops->set_interrupt_shadow(vcpu,
2268 events->interrupt.shadow);
2143 2269
2144 vcpu->arch.nmi_injected = events->nmi.injected; 2270 vcpu->arch.nmi_injected = events->nmi.injected;
2145 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2271 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2154,6 +2280,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2154 return 0; 2280 return 0;
2155} 2281}
2156 2282
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284 struct kvm_debugregs *dbgregs)
2285{
2286 vcpu_load(vcpu);
2287
2288 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289 dbgregs->dr6 = vcpu->arch.dr6;
2290 dbgregs->dr7 = vcpu->arch.dr7;
2291 dbgregs->flags = 0;
2292
2293 vcpu_put(vcpu);
2294}
2295
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2297 struct kvm_debugregs *dbgregs)
2298{
2299 if (dbgregs->flags)
2300 return -EINVAL;
2301
2302 vcpu_load(vcpu);
2303
2304 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305 vcpu->arch.dr6 = dbgregs->dr6;
2306 vcpu->arch.dr7 = dbgregs->dr7;
2307
2308 vcpu_put(vcpu);
2309
2310 return 0;
2311}
2312
2157long kvm_arch_vcpu_ioctl(struct file *filp, 2313long kvm_arch_vcpu_ioctl(struct file *filp,
2158 unsigned int ioctl, unsigned long arg) 2314 unsigned int ioctl, unsigned long arg)
2159{ 2315{
@@ -2308,7 +2464,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2308 r = -EFAULT; 2464 r = -EFAULT;
2309 if (copy_from_user(&mce, argp, sizeof mce)) 2465 if (copy_from_user(&mce, argp, sizeof mce))
2310 goto out; 2466 goto out;
2467 vcpu_load(vcpu);
2311 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2468 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469 vcpu_put(vcpu);
2312 break; 2470 break;
2313 } 2471 }
2314 case KVM_GET_VCPU_EVENTS: { 2472 case KVM_GET_VCPU_EVENTS: {
@@ -2332,6 +2490,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2332 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2490 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2333 break; 2491 break;
2334 } 2492 }
2493 case KVM_GET_DEBUGREGS: {
2494 struct kvm_debugregs dbgregs;
2495
2496 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
2497
2498 r = -EFAULT;
2499 if (copy_to_user(argp, &dbgregs,
2500 sizeof(struct kvm_debugregs)))
2501 break;
2502 r = 0;
2503 break;
2504 }
2505 case KVM_SET_DEBUGREGS: {
2506 struct kvm_debugregs dbgregs;
2507
2508 r = -EFAULT;
2509 if (copy_from_user(&dbgregs, argp,
2510 sizeof(struct kvm_debugregs)))
2511 break;
2512
2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514 break;
2515 }
2335 default: 2516 default:
2336 r = -EINVAL; 2517 r = -EINVAL;
2337 } 2518 }
@@ -2385,7 +2566,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2385 struct kvm_mem_alias *alias; 2566 struct kvm_mem_alias *alias;
2386 struct kvm_mem_aliases *aliases; 2567 struct kvm_mem_aliases *aliases;
2387 2568
2388 aliases = rcu_dereference(kvm->arch.aliases); 2569 aliases = kvm_aliases(kvm);
2389 2570
2390 for (i = 0; i < aliases->naliases; ++i) { 2571 for (i = 0; i < aliases->naliases; ++i) {
2391 alias = &aliases->aliases[i]; 2572 alias = &aliases->aliases[i];
@@ -2404,7 +2585,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2404 struct kvm_mem_alias *alias; 2585 struct kvm_mem_alias *alias;
2405 struct kvm_mem_aliases *aliases; 2586 struct kvm_mem_aliases *aliases;
2406 2587
2407 aliases = rcu_dereference(kvm->arch.aliases); 2588 aliases = kvm_aliases(kvm);
2408 2589
2409 for (i = 0; i < aliases->naliases; ++i) { 2590 for (i = 0; i < aliases->naliases; ++i) {
2410 alias = &aliases->aliases[i]; 2591 alias = &aliases->aliases[i];
@@ -2799,11 +2980,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
2799 r = -EFAULT; 2980 r = -EFAULT;
2800 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2981 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2801 goto out; 2982 goto out;
2983 r = -ENXIO;
2802 if (irqchip_in_kernel(kvm)) { 2984 if (irqchip_in_kernel(kvm)) {
2803 __s32 status; 2985 __s32 status;
2804 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2986 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2805 irq_event.irq, irq_event.level); 2987 irq_event.irq, irq_event.level);
2806 if (ioctl == KVM_IRQ_LINE_STATUS) { 2988 if (ioctl == KVM_IRQ_LINE_STATUS) {
2989 r = -EFAULT;
2807 irq_event.status = status; 2990 irq_event.status = status;
2808 if (copy_to_user(argp, &irq_event, 2991 if (copy_to_user(argp, &irq_event,
2809 sizeof irq_event)) 2992 sizeof irq_event))
@@ -3019,6 +3202,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3019 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3202 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
3020} 3203}
3021 3204
3205static void kvm_set_segment(struct kvm_vcpu *vcpu,
3206 struct kvm_segment *var, int seg)
3207{
3208 kvm_x86_ops->set_segment(vcpu, var, seg);
3209}
3210
3211void kvm_get_segment(struct kvm_vcpu *vcpu,
3212 struct kvm_segment *var, int seg)
3213{
3214 kvm_x86_ops->get_segment(vcpu, var, seg);
3215}
3216
3022gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3217gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3023{ 3218{
3024 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3219 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3099,14 +3294,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3099 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3294 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3100} 3295}
3101 3296
3102static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3297static int kvm_write_guest_virt_system(gva_t addr, void *val,
3103 struct kvm_vcpu *vcpu, u32 *error) 3298 unsigned int bytes,
3299 struct kvm_vcpu *vcpu,
3300 u32 *error)
3104{ 3301{
3105 void *data = val; 3302 void *data = val;
3106 int r = X86EMUL_CONTINUE; 3303 int r = X86EMUL_CONTINUE;
3107 3304
3108 while (bytes) { 3305 while (bytes) {
3109 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); 3306 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
3307 PFERR_WRITE_MASK, error);
3110 unsigned offset = addr & (PAGE_SIZE-1); 3308 unsigned offset = addr & (PAGE_SIZE-1);
3111 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3309 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3112 int ret; 3310 int ret;
@@ -3129,7 +3327,6 @@ out:
3129 return r; 3327 return r;
3130} 3328}
3131 3329
3132
3133static int emulator_read_emulated(unsigned long addr, 3330static int emulator_read_emulated(unsigned long addr,
3134 void *val, 3331 void *val,
3135 unsigned int bytes, 3332 unsigned int bytes,
@@ -3232,9 +3429,9 @@ mmio:
3232} 3429}
3233 3430
3234int emulator_write_emulated(unsigned long addr, 3431int emulator_write_emulated(unsigned long addr,
3235 const void *val, 3432 const void *val,
3236 unsigned int bytes, 3433 unsigned int bytes,
3237 struct kvm_vcpu *vcpu) 3434 struct kvm_vcpu *vcpu)
3238{ 3435{
3239 /* Crossing a page boundary? */ 3436 /* Crossing a page boundary? */
3240 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3437 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3252,45 +3449,150 @@ int emulator_write_emulated(unsigned long addr,
3252} 3449}
3253EXPORT_SYMBOL_GPL(emulator_write_emulated); 3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3254 3451
3452#define CMPXCHG_TYPE(t, ptr, old, new) \
3453 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
3454
3455#ifdef CONFIG_X86_64
3456# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
3457#else
3458# define CMPXCHG64(ptr, old, new) \
3459 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3460#endif
3461
3255static int emulator_cmpxchg_emulated(unsigned long addr, 3462static int emulator_cmpxchg_emulated(unsigned long addr,
3256 const void *old, 3463 const void *old,
3257 const void *new, 3464 const void *new,
3258 unsigned int bytes, 3465 unsigned int bytes,
3259 struct kvm_vcpu *vcpu) 3466 struct kvm_vcpu *vcpu)
3260{ 3467{
3261 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3468 gpa_t gpa;
3262#ifndef CONFIG_X86_64 3469 struct page *page;
3263 /* guests cmpxchg8b have to be emulated atomically */ 3470 char *kaddr;
3264 if (bytes == 8) { 3471 bool exchanged;
3265 gpa_t gpa;
3266 struct page *page;
3267 char *kaddr;
3268 u64 val;
3269 3472
3270 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 3473 /* guests cmpxchg8b have to be emulated atomically */
3474 if (bytes > 8 || (bytes & (bytes - 1)))
3475 goto emul_write;
3271 3476
3272 if (gpa == UNMAPPED_GVA || 3477 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
3273 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3274 goto emul_write;
3275 3478
3276 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 3479 if (gpa == UNMAPPED_GVA ||
3277 goto emul_write; 3480 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3481 goto emul_write;
3278 3482
3279 val = *(u64 *)new; 3483 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
3484 goto emul_write;
3280 3485
3281 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3282 3487
3283 kaddr = kmap_atomic(page, KM_USER0); 3488 kaddr = kmap_atomic(page, KM_USER0);
3284 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 3489 kaddr += offset_in_page(gpa);
3285 kunmap_atomic(kaddr, KM_USER0); 3490 switch (bytes) {
3286 kvm_release_page_dirty(page); 3491 case 1:
3492 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
3493 break;
3494 case 2:
3495 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
3496 break;
3497 case 4:
3498 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
3499 break;
3500 case 8:
3501 exchanged = CMPXCHG64(kaddr, old, new);
3502 break;
3503 default:
3504 BUG();
3287 } 3505 }
3506 kunmap_atomic(kaddr, KM_USER0);
3507 kvm_release_page_dirty(page);
3508
3509 if (!exchanged)
3510 return X86EMUL_CMPXCHG_FAILED;
3511
3512 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
3513
3514 return X86EMUL_CONTINUE;
3515
3288emul_write: 3516emul_write:
3289#endif 3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3290 3518
3291 return emulator_write_emulated(addr, new, bytes, vcpu); 3519 return emulator_write_emulated(addr, new, bytes, vcpu);
3292} 3520}
3293 3521
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3523{
3524 /* TODO: String I/O for in kernel device */
3525 int r;
3526
3527 if (vcpu->arch.pio.in)
3528 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3529 vcpu->arch.pio.size, pd);
3530 else
3531 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3532 vcpu->arch.pio.port, vcpu->arch.pio.size,
3533 pd);
3534 return r;
3535}
3536
3537
3538static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3539 unsigned int count, struct kvm_vcpu *vcpu)
3540{
3541 if (vcpu->arch.pio.count)
3542 goto data_avail;
3543
3544 trace_kvm_pio(1, port, size, 1);
3545
3546 vcpu->arch.pio.port = port;
3547 vcpu->arch.pio.in = 1;
3548 vcpu->arch.pio.count = count;
3549 vcpu->arch.pio.size = size;
3550
3551 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3552 data_avail:
3553 memcpy(val, vcpu->arch.pio_data, size * count);
3554 vcpu->arch.pio.count = 0;
3555 return 1;
3556 }
3557
3558 vcpu->run->exit_reason = KVM_EXIT_IO;
3559 vcpu->run->io.direction = KVM_EXIT_IO_IN;
3560 vcpu->run->io.size = size;
3561 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3562 vcpu->run->io.count = count;
3563 vcpu->run->io.port = port;
3564
3565 return 0;
3566}
3567
3568static int emulator_pio_out_emulated(int size, unsigned short port,
3569 const void *val, unsigned int count,
3570 struct kvm_vcpu *vcpu)
3571{
3572 trace_kvm_pio(0, port, size, 1);
3573
3574 vcpu->arch.pio.port = port;
3575 vcpu->arch.pio.in = 0;
3576 vcpu->arch.pio.count = count;
3577 vcpu->arch.pio.size = size;
3578
3579 memcpy(vcpu->arch.pio_data, val, size * count);
3580
3581 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3582 vcpu->arch.pio.count = 0;
3583 return 1;
3584 }
3585
3586 vcpu->run->exit_reason = KVM_EXIT_IO;
3587 vcpu->run->io.direction = KVM_EXIT_IO_OUT;
3588 vcpu->run->io.size = size;
3589 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3590 vcpu->run->io.count = count;
3591 vcpu->run->io.port = port;
3592
3593 return 0;
3594}
3595
3294static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3596static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3295{ 3597{
3296 return kvm_x86_ops->get_segment_base(vcpu, seg); 3598 return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3311,14 +3613,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
3311 3613
3312int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3313{ 3615{
3314 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); 3616 return kvm_get_dr(ctxt->vcpu, dr, dest);
3315} 3617}
3316 3618
3317int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3318{ 3620{
3319 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3621 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
3320 3622
3321 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); 3623 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3322} 3624}
3323 3625
3324void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3339,12 +3641,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3339} 3641}
3340EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3341 3643
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{
3646 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3647}
3648
3649static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3650{
3651 unsigned long value;
3652
3653 switch (cr) {
3654 case 0:
3655 value = kvm_read_cr0(vcpu);
3656 break;
3657 case 2:
3658 value = vcpu->arch.cr2;
3659 break;
3660 case 3:
3661 value = vcpu->arch.cr3;
3662 break;
3663 case 4:
3664 value = kvm_read_cr4(vcpu);
3665 break;
3666 case 8:
3667 value = kvm_get_cr8(vcpu);
3668 break;
3669 default:
3670 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3671 return 0;
3672 }
3673
3674 return value;
3675}
3676
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{
3679 switch (cr) {
3680 case 0:
3681 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682 break;
3683 case 2:
3684 vcpu->arch.cr2 = val;
3685 break;
3686 case 3:
3687 kvm_set_cr3(vcpu, val);
3688 break;
3689 case 4:
3690 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691 break;
3692 case 8:
3693 kvm_set_cr8(vcpu, val & 0xfUL);
3694 break;
3695 default:
3696 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3697 }
3698}
3699
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu)
3701{
3702 return kvm_x86_ops->get_cpl(vcpu);
3703}
3704
3705static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3706{
3707 kvm_x86_ops->get_gdt(vcpu, dt);
3708}
3709
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu)
3712{
3713 struct kvm_segment var;
3714
3715 kvm_get_segment(vcpu, &var, seg);
3716
3717 if (var.unusable)
3718 return false;
3719
3720 if (var.g)
3721 var.limit >>= 12;
3722 set_desc_limit(desc, var.limit);
3723 set_desc_base(desc, (unsigned long)var.base);
3724 desc->type = var.type;
3725 desc->s = var.s;
3726 desc->dpl = var.dpl;
3727 desc->p = var.present;
3728 desc->avl = var.avl;
3729 desc->l = var.l;
3730 desc->d = var.db;
3731 desc->g = var.g;
3732
3733 return true;
3734}
3735
3736static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3737 struct kvm_vcpu *vcpu)
3738{
3739 struct kvm_segment var;
3740
3741 /* needed to preserve selector */
3742 kvm_get_segment(vcpu, &var, seg);
3743
3744 var.base = get_desc_base(desc);
3745 var.limit = get_desc_limit(desc);
3746 if (desc->g)
3747 var.limit = (var.limit << 12) | 0xfff;
3748 var.type = desc->type;
3749 var.present = desc->p;
3750 var.dpl = desc->dpl;
3751 var.db = desc->d;
3752 var.s = desc->s;
3753 var.l = desc->l;
3754 var.g = desc->g;
3755 var.avl = desc->avl;
3756 var.present = desc->p;
3757 var.unusable = !var.present;
3758 var.padding = 0;
3759
3760 kvm_set_segment(vcpu, &var, seg);
3761 return;
3762}
3763
3764static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
3765{
3766 struct kvm_segment kvm_seg;
3767
3768 kvm_get_segment(vcpu, &kvm_seg, seg);
3769 return kvm_seg.selector;
3770}
3771
3772static void emulator_set_segment_selector(u16 sel, int seg,
3773 struct kvm_vcpu *vcpu)
3774{
3775 struct kvm_segment kvm_seg;
3776
3777 kvm_get_segment(vcpu, &kvm_seg, seg);
3778 kvm_seg.selector = sel;
3779 kvm_set_segment(vcpu, &kvm_seg, seg);
3780}
3781
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3342static struct x86_emulate_ops emulate_ops = { 3787static struct x86_emulate_ops emulate_ops = {
3343 .read_std = kvm_read_guest_virt_system, 3788 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system,
3344 .fetch = kvm_fetch_guest_virt, 3790 .fetch = kvm_fetch_guest_virt,
3345 .read_emulated = emulator_read_emulated, 3791 .read_emulated = emulator_read_emulated,
3346 .write_emulated = emulator_write_emulated, 3792 .write_emulated = emulator_write_emulated,
3347 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3793 .cmpxchg_emulated = emulator_cmpxchg_emulated,
3794 .pio_in_emulated = emulator_pio_in_emulated,
3795 .pio_out_emulated = emulator_pio_out_emulated,
3796 .get_cached_descriptor = emulator_get_cached_descriptor,
3797 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector,
3800 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags,
3348}; 3805};
3349 3806
3350static void cache_all_regs(struct kvm_vcpu *vcpu) 3807static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3375,14 +3832,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3375 cache_all_regs(vcpu); 3832 cache_all_regs(vcpu);
3376 3833
3377 vcpu->mmio_is_write = 0; 3834 vcpu->mmio_is_write = 0;
3378 vcpu->arch.pio.string = 0;
3379 3835
3380 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3381 int cs_db, cs_l; 3837 int cs_db, cs_l;
3382 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3383 3839
3384 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3840 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3385 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3841 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3842 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3386 vcpu->arch.emulate_ctxt.mode = 3843 vcpu->arch.emulate_ctxt.mode =
3387 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 3844 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3388 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3845 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3391,6 +3848,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3391 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3392 3849
3393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu);
3394 3852
3395 /* Only allow emulation of specific instructions on #UD 3853 /* Only allow emulation of specific instructions on #UD
3396 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 3854 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3423,6 +3881,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3423 ++vcpu->stat.insn_emulation; 3881 ++vcpu->stat.insn_emulation;
3424 if (r) { 3882 if (r) {
3425 ++vcpu->stat.insn_emulation_fail; 3883 ++vcpu->stat.insn_emulation_fail;
3884 trace_kvm_emulate_insn_failed(vcpu);
3426 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3427 return EMULATE_DONE; 3886 return EMULATE_DONE;
3428 return EMULATE_FAIL; 3887 return EMULATE_FAIL;
@@ -3434,16 +3893,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3434 return EMULATE_DONE; 3893 return EMULATE_DONE;
3435 } 3894 }
3436 3895
3896restart:
3437 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3438 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3439 3899
3440 if (r == 0) 3900 if (r == 0)
3441 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
3442 3902
3443 if (vcpu->arch.pio.string) 3903 if (vcpu->arch.pio.count) {
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3444 return EMULATE_DO_MMIO; 3906 return EMULATE_DO_MMIO;
3907 }
3445 3908
3446 if ((r || vcpu->mmio_is_write) && run) { 3909 if (r || vcpu->mmio_is_write) {
3447 run->exit_reason = KVM_EXIT_MMIO; 3910 run->exit_reason = KVM_EXIT_MMIO;
3448 run->mmio.phys_addr = vcpu->mmio_phys_addr; 3911 run->mmio.phys_addr = vcpu->mmio_phys_addr;
3449 memcpy(run->mmio.data, vcpu->mmio_data, 8); 3912 memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3453,222 +3916,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3453 3916
3454 if (r) { 3917 if (r) {
3455 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3456 return EMULATE_DONE; 3919 goto done;
3457 if (!vcpu->mmio_needed) { 3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3458 kvm_report_emulation_failure(vcpu, "mmio"); 3923 kvm_report_emulation_failure(vcpu, "mmio");
3459 return EMULATE_FAIL; 3924 return EMULATE_FAIL;
3460 } 3925 }
3461 return EMULATE_DO_MMIO; 3926 return EMULATE_DO_MMIO;
3462 } 3927 }
3463 3928
3464 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3465
3466 if (vcpu->mmio_is_write) { 3929 if (vcpu->mmio_is_write) {
3467 vcpu->mmio_needed = 0; 3930 vcpu->mmio_needed = 0;
3468 return EMULATE_DO_MMIO; 3931 return EMULATE_DO_MMIO;
3469 } 3932 }
3470 3933
3471 return EMULATE_DONE; 3934done:
3472} 3935 if (vcpu->arch.exception.pending)
3473EXPORT_SYMBOL_GPL(emulate_instruction); 3936 vcpu->arch.emulate_ctxt.restart = false;
3474
3475static int pio_copy_data(struct kvm_vcpu *vcpu)
3476{
3477 void *p = vcpu->arch.pio_data;
3478 gva_t q = vcpu->arch.pio.guest_gva;
3479 unsigned bytes;
3480 int ret;
3481 u32 error_code;
3482
3483 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3484 if (vcpu->arch.pio.in)
3485 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3486 else
3487 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3488
3489 if (ret == X86EMUL_PROPAGATE_FAULT)
3490 kvm_inject_page_fault(vcpu, q, error_code);
3491
3492 return ret;
3493}
3494
3495int complete_pio(struct kvm_vcpu *vcpu)
3496{
3497 struct kvm_pio_request *io = &vcpu->arch.pio;
3498 long delta;
3499 int r;
3500 unsigned long val;
3501
3502 if (!io->string) {
3503 if (io->in) {
3504 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3505 memcpy(&val, vcpu->arch.pio_data, io->size);
3506 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
3507 }
3508 } else {
3509 if (io->in) {
3510 r = pio_copy_data(vcpu);
3511 if (r)
3512 goto out;
3513 }
3514
3515 delta = 1;
3516 if (io->rep) {
3517 delta *= io->cur_count;
3518 /*
3519 * The size of the register should really depend on
3520 * current address size.
3521 */
3522 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
3523 val -= delta;
3524 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
3525 }
3526 if (io->down)
3527 delta = -delta;
3528 delta *= io->size;
3529 if (io->in) {
3530 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
3531 val += delta;
3532 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
3533 } else {
3534 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
3535 val += delta;
3536 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3537 }
3538 }
3539out:
3540 io->count -= io->cur_count;
3541 io->cur_count = 0;
3542
3543 return 0;
3544}
3545
3546static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3547{
3548 /* TODO: String I/O for in kernel device */
3549 int r;
3550
3551 if (vcpu->arch.pio.in)
3552 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3553 vcpu->arch.pio.size, pd);
3554 else
3555 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3556 vcpu->arch.pio.port, vcpu->arch.pio.size,
3557 pd);
3558 return r;
3559}
3560
3561static int pio_string_write(struct kvm_vcpu *vcpu)
3562{
3563 struct kvm_pio_request *io = &vcpu->arch.pio;
3564 void *pd = vcpu->arch.pio_data;
3565 int i, r = 0;
3566
3567 for (i = 0; i < io->cur_count; i++) {
3568 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3569 io->port, io->size, pd)) {
3570 r = -EOPNOTSUPP;
3571 break;
3572 }
3573 pd += io->size;
3574 }
3575 return r;
3576}
3577
3578int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3579{
3580 unsigned long val;
3581 3937
3582 trace_kvm_pio(!in, port, size, 1); 3938 if (vcpu->arch.emulate_ctxt.restart)
3939 goto restart;
3583 3940
3584 vcpu->run->exit_reason = KVM_EXIT_IO; 3941 return EMULATE_DONE;
3585 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3586 vcpu->run->io.size = vcpu->arch.pio.size = size;
3587 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3588 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3589 vcpu->run->io.port = vcpu->arch.pio.port = port;
3590 vcpu->arch.pio.in = in;
3591 vcpu->arch.pio.string = 0;
3592 vcpu->arch.pio.down = 0;
3593 vcpu->arch.pio.rep = 0;
3594
3595 if (!vcpu->arch.pio.in) {
3596 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3597 memcpy(vcpu->arch.pio_data, &val, 4);
3598 }
3599
3600 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3601 complete_pio(vcpu);
3602 return 1;
3603 }
3604 return 0;
3605} 3942}
3606EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3943EXPORT_SYMBOL_GPL(emulate_instruction);
3607 3944
3608int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3945int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
3609 int size, unsigned long count, int down,
3610 gva_t address, int rep, unsigned port)
3611{ 3946{
3612 unsigned now, in_page; 3947 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3613 int ret = 0; 3948 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
3614 3949 /* do not return to emulator after return from userspace */
3615 trace_kvm_pio(!in, port, size, count); 3950 vcpu->arch.pio.count = 0;
3616
3617 vcpu->run->exit_reason = KVM_EXIT_IO;
3618 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3619 vcpu->run->io.size = vcpu->arch.pio.size = size;
3620 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3621 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
3622 vcpu->run->io.port = vcpu->arch.pio.port = port;
3623 vcpu->arch.pio.in = in;
3624 vcpu->arch.pio.string = 1;
3625 vcpu->arch.pio.down = down;
3626 vcpu->arch.pio.rep = rep;
3627
3628 if (!count) {
3629 kvm_x86_ops->skip_emulated_instruction(vcpu);
3630 return 1;
3631 }
3632
3633 if (!down)
3634 in_page = PAGE_SIZE - offset_in_page(address);
3635 else
3636 in_page = offset_in_page(address) + size;
3637 now = min(count, (unsigned long)in_page / size);
3638 if (!now)
3639 now = 1;
3640 if (down) {
3641 /*
3642 * String I/O in reverse. Yuck. Kill the guest, fix later.
3643 */
3644 pr_unimpl(vcpu, "guest string pio down\n");
3645 kvm_inject_gp(vcpu, 0);
3646 return 1;
3647 }
3648 vcpu->run->io.count = now;
3649 vcpu->arch.pio.cur_count = now;
3650
3651 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3652 kvm_x86_ops->skip_emulated_instruction(vcpu);
3653
3654 vcpu->arch.pio.guest_gva = address;
3655
3656 if (!vcpu->arch.pio.in) {
3657 /* string PIO write */
3658 ret = pio_copy_data(vcpu);
3659 if (ret == X86EMUL_PROPAGATE_FAULT)
3660 return 1;
3661 if (ret == 0 && !pio_string_write(vcpu)) {
3662 complete_pio(vcpu);
3663 if (vcpu->arch.pio.count == 0)
3664 ret = 1;
3665 }
3666 }
3667 /* no string PIO read support yet */
3668
3669 return ret; 3951 return ret;
3670} 3952}
3671EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 3953EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
3672 3954
3673static void bounce_off(void *info) 3955static void bounce_off(void *info)
3674{ 3956{
@@ -3743,6 +4025,51 @@ static void kvm_timer_init(void)
3743 } 4025 }
3744} 4026}
3745 4027
4028static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
4029
4030static int kvm_is_in_guest(void)
4031{
4032 return percpu_read(current_vcpu) != NULL;
4033}
4034
4035static int kvm_is_user_mode(void)
4036{
4037 int user_mode = 3;
4038
4039 if (percpu_read(current_vcpu))
4040 user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
4041
4042 return user_mode != 0;
4043}
4044
4045static unsigned long kvm_get_guest_ip(void)
4046{
4047 unsigned long ip = 0;
4048
4049 if (percpu_read(current_vcpu))
4050 ip = kvm_rip_read(percpu_read(current_vcpu));
4051
4052 return ip;
4053}
4054
4055static struct perf_guest_info_callbacks kvm_guest_cbs = {
4056 .is_in_guest = kvm_is_in_guest,
4057 .is_user_mode = kvm_is_user_mode,
4058 .get_guest_ip = kvm_get_guest_ip,
4059};
4060
4061void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
4062{
4063 percpu_write(current_vcpu, vcpu);
4064}
4065EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
4066
4067void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
4068{
4069 percpu_write(current_vcpu, NULL);
4070}
4071EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
4072
3746int kvm_arch_init(void *opaque) 4073int kvm_arch_init(void *opaque)
3747{ 4074{
3748 int r; 4075 int r;
@@ -3779,6 +4106,8 @@ int kvm_arch_init(void *opaque)
3779 4106
3780 kvm_timer_init(); 4107 kvm_timer_init();
3781 4108
4109 perf_register_guest_info_callbacks(&kvm_guest_cbs);
4110
3782 return 0; 4111 return 0;
3783 4112
3784out: 4113out:
@@ -3787,6 +4116,8 @@ out:
3787 4116
3788void kvm_arch_exit(void) 4117void kvm_arch_exit(void)
3789{ 4118{
4119 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
4120
3790 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4121 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
3791 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4122 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
3792 CPUFREQ_TRANSITION_NOTIFIER); 4123 CPUFREQ_TRANSITION_NOTIFIER);
@@ -3942,85 +4273,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3942 return emulator_write_emulated(rip, instruction, 3, vcpu); 4273 return emulator_write_emulated(rip, instruction, 3, vcpu);
3943} 4274}
3944 4275
3945static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3946{
3947 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3948}
3949
3950void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3951{ 4277{
3952 struct descriptor_table dt = { limit, base }; 4278 struct desc_ptr dt = { limit, base };
3953 4279
3954 kvm_x86_ops->set_gdt(vcpu, &dt); 4280 kvm_x86_ops->set_gdt(vcpu, &dt);
3955} 4281}
3956 4282
3957void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4283void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3958{ 4284{
3959 struct descriptor_table dt = { limit, base }; 4285 struct desc_ptr dt = { limit, base };
3960 4286
3961 kvm_x86_ops->set_idt(vcpu, &dt); 4287 kvm_x86_ops->set_idt(vcpu, &dt);
3962} 4288}
3963 4289
3964void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3965 unsigned long *rflags)
3966{
3967 kvm_lmsw(vcpu, msw);
3968 *rflags = kvm_get_rflags(vcpu);
3969}
3970
3971unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3972{
3973 unsigned long value;
3974
3975 switch (cr) {
3976 case 0:
3977 value = kvm_read_cr0(vcpu);
3978 break;
3979 case 2:
3980 value = vcpu->arch.cr2;
3981 break;
3982 case 3:
3983 value = vcpu->arch.cr3;
3984 break;
3985 case 4:
3986 value = kvm_read_cr4(vcpu);
3987 break;
3988 case 8:
3989 value = kvm_get_cr8(vcpu);
3990 break;
3991 default:
3992 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3993 return 0;
3994 }
3995
3996 return value;
3997}
3998
3999void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
4000 unsigned long *rflags)
4001{
4002 switch (cr) {
4003 case 0:
4004 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4005 *rflags = kvm_get_rflags(vcpu);
4006 break;
4007 case 2:
4008 vcpu->arch.cr2 = val;
4009 break;
4010 case 3:
4011 kvm_set_cr3(vcpu, val);
4012 break;
4013 case 4:
4014 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4015 break;
4016 case 8:
4017 kvm_set_cr8(vcpu, val & 0xfUL);
4018 break;
4019 default:
4020 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4021 }
4022}
4023
4024static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 4290static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
4025{ 4291{
4026 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 4292 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4084,9 +4350,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
4084{ 4350{
4085 struct kvm_cpuid_entry2 *best; 4351 struct kvm_cpuid_entry2 *best;
4086 4352
4353 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
4354 if (!best || best->eax < 0x80000008)
4355 goto not_found;
4087 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 4356 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
4088 if (best) 4357 if (best)
4089 return best->eax & 0xff; 4358 return best->eax & 0xff;
4359not_found:
4090 return 36; 4360 return 36;
4091} 4361}
4092 4362
@@ -4200,9 +4470,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4200{ 4470{
4201 /* try to reinject previous events if any */ 4471 /* try to reinject previous events if any */
4202 if (vcpu->arch.exception.pending) { 4472 if (vcpu->arch.exception.pending) {
4473 trace_kvm_inj_exception(vcpu->arch.exception.nr,
4474 vcpu->arch.exception.has_error_code,
4475 vcpu->arch.exception.error_code);
4203 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 4476 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
4204 vcpu->arch.exception.has_error_code, 4477 vcpu->arch.exception.has_error_code,
4205 vcpu->arch.exception.error_code); 4478 vcpu->arch.exception.error_code,
4479 vcpu->arch.exception.reinject);
4206 return; 4480 return;
4207 } 4481 }
4208 4482
@@ -4432,7 +4706,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4432 } 4706 }
4433 4707
4434 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4708 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4435 post_kvm_run_save(vcpu);
4436 4709
4437 vapic_exit(vcpu); 4710 vapic_exit(vcpu);
4438 4711
@@ -4460,26 +4733,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4460 if (!irqchip_in_kernel(vcpu->kvm)) 4733 if (!irqchip_in_kernel(vcpu->kvm))
4461 kvm_set_cr8(vcpu, kvm_run->cr8); 4734 kvm_set_cr8(vcpu, kvm_run->cr8);
4462 4735
4463 if (vcpu->arch.pio.cur_count) { 4736 if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4464 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4737 vcpu->arch.emulate_ctxt.restart) {
4465 r = complete_pio(vcpu); 4738 if (vcpu->mmio_needed) {
4466 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4739 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4467 if (r) 4740 vcpu->mmio_read_completed = 1;
4468 goto out; 4741 vcpu->mmio_needed = 0;
4469 } 4742 }
4470 if (vcpu->mmio_needed) {
4471 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4472 vcpu->mmio_read_completed = 1;
4473 vcpu->mmio_needed = 0;
4474
4475 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4476 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4477 EMULTYPE_NO_DECODE);
4478 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4479 if (r == EMULATE_DO_MMIO) { 4746 if (r == EMULATE_DO_MMIO) {
4480 /*
4481 * Read-modify-write. Back to userspace.
4482 */
4483 r = 0; 4747 r = 0;
4484 goto out; 4748 goto out;
4485 } 4749 }
@@ -4491,6 +4755,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4491 r = __vcpu_run(vcpu); 4755 r = __vcpu_run(vcpu);
4492 4756
4493out: 4757out:
4758 post_kvm_run_save(vcpu);
4494 if (vcpu->sigset_active) 4759 if (vcpu->sigset_active)
4495 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4496 4761
@@ -4562,12 +4827,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4562 return 0; 4827 return 0;
4563} 4828}
4564 4829
4565void kvm_get_segment(struct kvm_vcpu *vcpu,
4566 struct kvm_segment *var, int seg)
4567{
4568 kvm_x86_ops->get_segment(vcpu, var, seg);
4569}
4570
4571void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4830void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4572{ 4831{
4573 struct kvm_segment cs; 4832 struct kvm_segment cs;
@@ -4581,7 +4840,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4581int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4840int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4582 struct kvm_sregs *sregs) 4841 struct kvm_sregs *sregs)
4583{ 4842{
4584 struct descriptor_table dt; 4843 struct desc_ptr dt;
4585 4844
4586 vcpu_load(vcpu); 4845 vcpu_load(vcpu);
4587 4846
@@ -4596,11 +4855,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4596 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4855 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4597 4856
4598 kvm_x86_ops->get_idt(vcpu, &dt); 4857 kvm_x86_ops->get_idt(vcpu, &dt);
4599 sregs->idt.limit = dt.limit; 4858 sregs->idt.limit = dt.size;
4600 sregs->idt.base = dt.base; 4859 sregs->idt.base = dt.address;
4601 kvm_x86_ops->get_gdt(vcpu, &dt); 4860 kvm_x86_ops->get_gdt(vcpu, &dt);
4602 sregs->gdt.limit = dt.limit; 4861 sregs->gdt.limit = dt.size;
4603 sregs->gdt.base = dt.base; 4862 sregs->gdt.base = dt.address;
4604 4863
4605 sregs->cr0 = kvm_read_cr0(vcpu); 4864 sregs->cr0 = kvm_read_cr0(vcpu);
4606 sregs->cr2 = vcpu->arch.cr2; 4865 sregs->cr2 = vcpu->arch.cr2;
@@ -4639,563 +4898,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4639 return 0; 4898 return 0;
4640} 4899}
4641 4900
4642static void kvm_set_segment(struct kvm_vcpu *vcpu, 4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4643 struct kvm_segment *var, int seg) 4902 bool has_error_code, u32 error_code)
4644{
4645 kvm_x86_ops->set_segment(vcpu, var, seg);
4646}
4647
4648static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4649 struct kvm_segment *kvm_desct)
4650{
4651 kvm_desct->base = get_desc_base(seg_desc);
4652 kvm_desct->limit = get_desc_limit(seg_desc);
4653 if (seg_desc->g) {
4654 kvm_desct->limit <<= 12;
4655 kvm_desct->limit |= 0xfff;
4656 }
4657 kvm_desct->selector = selector;
4658 kvm_desct->type = seg_desc->type;
4659 kvm_desct->present = seg_desc->p;
4660 kvm_desct->dpl = seg_desc->dpl;
4661 kvm_desct->db = seg_desc->d;
4662 kvm_desct->s = seg_desc->s;
4663 kvm_desct->l = seg_desc->l;
4664 kvm_desct->g = seg_desc->g;
4665 kvm_desct->avl = seg_desc->avl;
4666 if (!selector)
4667 kvm_desct->unusable = 1;
4668 else
4669 kvm_desct->unusable = 0;
4670 kvm_desct->padding = 0;
4671}
4672
4673static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4674 u16 selector,
4675 struct descriptor_table *dtable)
4676{
4677 if (selector & 1 << 2) {
4678 struct kvm_segment kvm_seg;
4679
4680 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4681
4682 if (kvm_seg.unusable)
4683 dtable->limit = 0;
4684 else
4685 dtable->limit = kvm_seg.limit;
4686 dtable->base = kvm_seg.base;
4687 }
4688 else
4689 kvm_x86_ops->get_gdt(vcpu, dtable);
4690}
4691
4692/* allowed just for 8 bytes segments */
4693static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4694 struct desc_struct *seg_desc)
4695{
4696 struct descriptor_table dtable;
4697 u16 index = selector >> 3;
4698 int ret;
4699 u32 err;
4700 gva_t addr;
4701
4702 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4703
4704 if (dtable.limit < index * 8 + 7) {
4705 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4706 return X86EMUL_PROPAGATE_FAULT;
4707 }
4708 addr = dtable.base + index * 8;
4709 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4710 vcpu, &err);
4711 if (ret == X86EMUL_PROPAGATE_FAULT)
4712 kvm_inject_page_fault(vcpu, addr, err);
4713
4714 return ret;
4715}
4716
4717/* allowed just for 8 bytes segments */
4718static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4719 struct desc_struct *seg_desc)
4720{
4721 struct descriptor_table dtable;
4722 u16 index = selector >> 3;
4723
4724 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4725
4726 if (dtable.limit < index * 8 + 7)
4727 return 1;
4728 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4729}
4730
4731static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4732 struct desc_struct *seg_desc)
4733{
4734 u32 base_addr = get_desc_base(seg_desc);
4735
4736 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4737}
4738
4739static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4740 struct desc_struct *seg_desc)
4741{
4742 u32 base_addr = get_desc_base(seg_desc);
4743
4744 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4745}
4746
4747static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4748{
4749 struct kvm_segment kvm_seg;
4750
4751 kvm_get_segment(vcpu, &kvm_seg, seg);
4752 return kvm_seg.selector;
4753}
4754
4755static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4756{
4757 struct kvm_segment segvar = {
4758 .base = selector << 4,
4759 .limit = 0xffff,
4760 .selector = selector,
4761 .type = 3,
4762 .present = 1,
4763 .dpl = 3,
4764 .db = 0,
4765 .s = 1,
4766 .l = 0,
4767 .g = 0,
4768 .avl = 0,
4769 .unusable = 0,
4770 };
4771 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4772 return X86EMUL_CONTINUE;
4773}
4774
4775static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4776{ 4903{
4777 return (seg != VCPU_SREG_LDTR) && 4904 int cs_db, cs_l, ret;
4778 (seg != VCPU_SREG_TR) && 4905 cache_all_regs(vcpu);
4779 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4780}
4781
4782int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4783{
4784 struct kvm_segment kvm_seg;
4785 struct desc_struct seg_desc;
4786 u8 dpl, rpl, cpl;
4787 unsigned err_vec = GP_VECTOR;
4788 u32 err_code = 0;
4789 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4790 int ret;
4791 4906
4792 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) 4907 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4793 return kvm_load_realmode_segment(vcpu, selector, seg);
4794 4908
4795 /* NULL selector is not valid for TR, CS and SS */ 4909 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4796 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 4910 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4797 && null_selector) 4911 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4798 goto exception; 4912 vcpu->arch.emulate_ctxt.mode =
4913 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4914 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4915 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4799 4918
4800 /* TR should be in GDT only */ 4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4801 if (seg == VCPU_SREG_TR && (selector & (1 << 2))) 4920 tss_selector, reason, has_error_code,
4802 goto exception; 4921 error_code);
4803 4922
4804 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4805 if (ret) 4923 if (ret)
4806 return ret; 4924 return EMULATE_FAIL;
4807
4808 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4809
4810 if (null_selector) { /* for NULL selector skip all following checks */
4811 kvm_seg.unusable = 1;
4812 goto load;
4813 }
4814
4815 err_code = selector & 0xfffc;
4816 err_vec = GP_VECTOR;
4817
4818 /* can't load system descriptor into segment selecor */
4819 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4820 goto exception;
4821
4822 if (!kvm_seg.present) {
4823 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4824 goto exception;
4825 }
4826
4827 rpl = selector & 3;
4828 dpl = kvm_seg.dpl;
4829 cpl = kvm_x86_ops->get_cpl(vcpu);
4830
4831 switch (seg) {
4832 case VCPU_SREG_SS:
4833 /*
4834 * segment is not a writable data segment or segment
4835 * selector's RPL != CPL or segment selector's RPL != CPL
4836 */
4837 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4838 goto exception;
4839 break;
4840 case VCPU_SREG_CS:
4841 if (!(kvm_seg.type & 8))
4842 goto exception;
4843
4844 if (kvm_seg.type & 4) {
4845 /* conforming */
4846 if (dpl > cpl)
4847 goto exception;
4848 } else {
4849 /* nonconforming */
4850 if (rpl > cpl || dpl != cpl)
4851 goto exception;
4852 }
4853 /* CS(RPL) <- CPL */
4854 selector = (selector & 0xfffc) | cpl;
4855 break;
4856 case VCPU_SREG_TR:
4857 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4858 goto exception;
4859 break;
4860 case VCPU_SREG_LDTR:
4861 if (kvm_seg.s || kvm_seg.type != 2)
4862 goto exception;
4863 break;
4864 default: /* DS, ES, FS, or GS */
4865 /*
4866 * segment is not a data or readable code segment or
4867 * ((segment is a data or nonconforming code segment)
4868 * and (both RPL and CPL > DPL))
4869 */
4870 if ((kvm_seg.type & 0xa) == 0x8 ||
4871 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4872 goto exception;
4873 break;
4874 }
4875
4876 if (!kvm_seg.unusable && kvm_seg.s) {
4877 /* mark segment as accessed */
4878 kvm_seg.type |= 1;
4879 seg_desc.type |= 1;
4880 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4881 }
4882load:
4883 kvm_set_segment(vcpu, &kvm_seg, seg);
4884 return X86EMUL_CONTINUE;
4885exception:
4886 kvm_queue_exception_e(vcpu, err_vec, err_code);
4887 return X86EMUL_PROPAGATE_FAULT;
4888}
4889
4890static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4891 struct tss_segment_32 *tss)
4892{
4893 tss->cr3 = vcpu->arch.cr3;
4894 tss->eip = kvm_rip_read(vcpu);
4895 tss->eflags = kvm_get_rflags(vcpu);
4896 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4897 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4898 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4899 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4900 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4901 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4902 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4903 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4904 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4905 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4906 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4907 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4908 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4909 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4910 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4911}
4912
4913static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4914{
4915 struct kvm_segment kvm_seg;
4916 kvm_get_segment(vcpu, &kvm_seg, seg);
4917 kvm_seg.selector = sel;
4918 kvm_set_segment(vcpu, &kvm_seg, seg);
4919}
4920
4921static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4922 struct tss_segment_32 *tss)
4923{
4924 kvm_set_cr3(vcpu, tss->cr3);
4925
4926 kvm_rip_write(vcpu, tss->eip);
4927 kvm_set_rflags(vcpu, tss->eflags | 2);
4928
4929 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4930 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4931 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4932 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4933 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4934 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4935 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4936 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4937
4938 /*
4939 * SDM says that segment selectors are loaded before segment
4940 * descriptors
4941 */
4942 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4943 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4944 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4945 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
4946 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
4947 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
4948 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
4949
4950 /*
4951 * Now load segment descriptors. If fault happenes at this stage
4952 * it is handled in a context of new task
4953 */
4954 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
4955 return 1;
4956
4957 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4958 return 1;
4959
4960 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4961 return 1;
4962
4963 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4964 return 1;
4965
4966 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4967 return 1;
4968
4969 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
4970 return 1;
4971
4972 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
4973 return 1;
4974 return 0;
4975}
4976
4977static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4978 struct tss_segment_16 *tss)
4979{
4980 tss->ip = kvm_rip_read(vcpu);
4981 tss->flag = kvm_get_rflags(vcpu);
4982 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4983 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4984 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4985 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4986 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4987 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4988 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
4989 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
4990
4991 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4992 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4993 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4994 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4995 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4996}
4997
4998static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4999 struct tss_segment_16 *tss)
5000{
5001 kvm_rip_write(vcpu, tss->ip);
5002 kvm_set_rflags(vcpu, tss->flag | 2);
5003 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
5004 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
5005 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
5006 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
5007 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
5008 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
5009 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
5010 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
5011
5012 /*
5013 * SDM says that segment selectors are loaded before segment
5014 * descriptors
5015 */
5016 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5017 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5018 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5019 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5020 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5021
5022 /*
5023 * Now load segment descriptors. If fault happenes at this stage
5024 * it is handled in a context of new task
5025 */
5026 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
5027 return 1;
5028
5029 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5030 return 1;
5031
5032 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
5033 return 1;
5034
5035 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5036 return 1;
5037
5038 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5039 return 1;
5040 return 0;
5041}
5042
5043static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
5044 u16 old_tss_sel, u32 old_tss_base,
5045 struct desc_struct *nseg_desc)
5046{
5047 struct tss_segment_16 tss_segment_16;
5048 int ret = 0;
5049
5050 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5051 sizeof tss_segment_16))
5052 goto out;
5053
5054 save_state_to_tss16(vcpu, &tss_segment_16);
5055
5056 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5057 sizeof tss_segment_16))
5058 goto out;
5059
5060 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5061 &tss_segment_16, sizeof tss_segment_16))
5062 goto out;
5063
5064 if (old_tss_sel != 0xffff) {
5065 tss_segment_16.prev_task_link = old_tss_sel;
5066 4925
5067 if (kvm_write_guest(vcpu->kvm, 4926 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5068 get_tss_base_addr_write(vcpu, nseg_desc), 4927 return EMULATE_DONE;
5069 &tss_segment_16.prev_task_link,
5070 sizeof tss_segment_16.prev_task_link))
5071 goto out;
5072 }
5073
5074 if (load_state_from_tss16(vcpu, &tss_segment_16))
5075 goto out;
5076
5077 ret = 1;
5078out:
5079 return ret;
5080}
5081
5082static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
5083 u16 old_tss_sel, u32 old_tss_base,
5084 struct desc_struct *nseg_desc)
5085{
5086 struct tss_segment_32 tss_segment_32;
5087 int ret = 0;
5088
5089 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5090 sizeof tss_segment_32))
5091 goto out;
5092
5093 save_state_to_tss32(vcpu, &tss_segment_32);
5094
5095 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5096 sizeof tss_segment_32))
5097 goto out;
5098
5099 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5100 &tss_segment_32, sizeof tss_segment_32))
5101 goto out;
5102
5103 if (old_tss_sel != 0xffff) {
5104 tss_segment_32.prev_task_link = old_tss_sel;
5105
5106 if (kvm_write_guest(vcpu->kvm,
5107 get_tss_base_addr_write(vcpu, nseg_desc),
5108 &tss_segment_32.prev_task_link,
5109 sizeof tss_segment_32.prev_task_link))
5110 goto out;
5111 }
5112
5113 if (load_state_from_tss32(vcpu, &tss_segment_32))
5114 goto out;
5115
5116 ret = 1;
5117out:
5118 return ret;
5119}
5120
5121int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
5122{
5123 struct kvm_segment tr_seg;
5124 struct desc_struct cseg_desc;
5125 struct desc_struct nseg_desc;
5126 int ret = 0;
5127 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
5128 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5129 u32 desc_limit;
5130
5131 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
5132
5133 /* FIXME: Handle errors. Failure to read either TSS or their
5134 * descriptors should generate a pagefault.
5135 */
5136 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
5137 goto out;
5138
5139 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
5140 goto out;
5141
5142 if (reason != TASK_SWITCH_IRET) {
5143 int cpl;
5144
5145 cpl = kvm_x86_ops->get_cpl(vcpu);
5146 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
5147 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5148 return 1;
5149 }
5150 }
5151
5152 desc_limit = get_desc_limit(&nseg_desc);
5153 if (!nseg_desc.p ||
5154 ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
5155 desc_limit < 0x2b)) {
5156 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
5157 return 1;
5158 }
5159
5160 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
5161 cseg_desc.type &= ~(1 << 1); //clear the B flag
5162 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
5163 }
5164
5165 if (reason == TASK_SWITCH_IRET) {
5166 u32 eflags = kvm_get_rflags(vcpu);
5167 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
5168 }
5169
5170 /* set back link to prev task only if NT bit is set in eflags
5171 note that old_tss_sel is not used afetr this point */
5172 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
5173 old_tss_sel = 0xffff;
5174
5175 if (nseg_desc.type & 8)
5176 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
5177 old_tss_base, &nseg_desc);
5178 else
5179 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
5180 old_tss_base, &nseg_desc);
5181
5182 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
5183 u32 eflags = kvm_get_rflags(vcpu);
5184 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
5185 }
5186
5187 if (reason != TASK_SWITCH_IRET) {
5188 nseg_desc.type |= (1 << 1);
5189 save_guest_segment_descriptor(vcpu, tss_selector,
5190 &nseg_desc);
5191 }
5192
5193 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
5194 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
5195 tr_seg.type = 11;
5196 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
5197out:
5198 return ret;
5199} 4928}
5200EXPORT_SYMBOL_GPL(kvm_task_switch); 4929EXPORT_SYMBOL_GPL(kvm_task_switch);
5201 4930
@@ -5204,15 +4933,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5204{ 4933{
5205 int mmu_reset_needed = 0; 4934 int mmu_reset_needed = 0;
5206 int pending_vec, max_bits; 4935 int pending_vec, max_bits;
5207 struct descriptor_table dt; 4936 struct desc_ptr dt;
5208 4937
5209 vcpu_load(vcpu); 4938 vcpu_load(vcpu);
5210 4939
5211 dt.limit = sregs->idt.limit; 4940 dt.size = sregs->idt.limit;
5212 dt.base = sregs->idt.base; 4941 dt.address = sregs->idt.base;
5213 kvm_x86_ops->set_idt(vcpu, &dt); 4942 kvm_x86_ops->set_idt(vcpu, &dt);
5214 dt.limit = sregs->gdt.limit; 4943 dt.size = sregs->gdt.limit;
5215 dt.base = sregs->gdt.base; 4944 dt.address = sregs->gdt.base;
5216 kvm_x86_ops->set_gdt(vcpu, &dt); 4945 kvm_x86_ops->set_gdt(vcpu, &dt);
5217 4946
5218 vcpu->arch.cr2 = sregs->cr2; 4947 vcpu->arch.cr2 = sregs->cr2;
@@ -5311,11 +5040,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5311 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 5040 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5312 } 5041 }
5313 5042
5314 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 5043 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5315 vcpu->arch.singlestep_cs = 5044 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
5316 get_segment_selector(vcpu, VCPU_SREG_CS); 5045 get_segment_base(vcpu, VCPU_SREG_CS);
5317 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
5318 }
5319 5046
5320 /* 5047 /*
5321 * Trigger an rflags update that will inject or remove the trace 5048 * Trigger an rflags update that will inject or remove the trace
@@ -5806,13 +5533,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5806 return kvm_x86_ops->interrupt_allowed(vcpu); 5533 return kvm_x86_ops->interrupt_allowed(vcpu);
5807} 5534}
5808 5535
5536bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
5537{
5538 unsigned long current_rip = kvm_rip_read(vcpu) +
5539 get_segment_base(vcpu, VCPU_SREG_CS);
5540
5541 return current_rip == linear_rip;
5542}
5543EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
5544
5809unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 5545unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5810{ 5546{
5811 unsigned long rflags; 5547 unsigned long rflags;
5812 5548
5813 rflags = kvm_x86_ops->get_rflags(vcpu); 5549 rflags = kvm_x86_ops->get_rflags(vcpu);
5814 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5550 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5815 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); 5551 rflags &= ~X86_EFLAGS_TF;
5816 return rflags; 5552 return rflags;
5817} 5553}
5818EXPORT_SYMBOL_GPL(kvm_get_rflags); 5554EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5820,10 +5556,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
5820void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5556void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5821{ 5557{
5822 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5558 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5823 vcpu->arch.singlestep_cs == 5559 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5824 get_segment_selector(vcpu, VCPU_SREG_CS) && 5560 rflags |= X86_EFLAGS_TF;
5825 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5826 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5827 kvm_x86_ops->set_rflags(vcpu, rflags); 5561 kvm_x86_ops->set_rflags(vcpu, rflags);
5828} 5562}
5829EXPORT_SYMBOL_GPL(kvm_set_rflags); 5563EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5839,3 +5573,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5839EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5573EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5840EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5574EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5841EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5575EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
5576EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d101639bd8..f4b54458285 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,4 +65,14 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
77
68#endif 78#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 2bdf628066b..9257510b483 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1390,7 +1390,6 @@ __init void lguest_init(void)
1390#endif 1390#endif
1391#ifdef CONFIG_ACPI 1391#ifdef CONFIG_ACPI
1392 acpi_disabled = 1; 1392 acpi_disabled = 1;
1393 acpi_ht = 0;
1394#endif 1393#endif
1395 1394
1396 /* 1395 /*
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 419386c24b8..f871e04b696 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -20,17 +20,18 @@ lib-y := delay.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
21lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o getuser.o putuser.o
22lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-$(CONFIG_KPROBES) += insn.o inat.o 23lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
24 24
25obj-y += msr.o msr-reg.o msr-reg-export.o 25obj-y += msr.o msr-reg.o msr-reg-export.o
26 26
27ifeq ($(CONFIG_X86_32),y) 27ifeq ($(CONFIG_X86_32),y)
28 obj-y += atomic64_32.o 28 obj-y += atomic64_32.o
29 lib-y += atomic64_cx8_32.o
29 lib-y += checksum_32.o 30 lib-y += checksum_32.o
30 lib-y += strstr_32.o 31 lib-y += strstr_32.o
31 lib-y += semaphore_32.o string_32.o 32 lib-y += semaphore_32.o string_32.o
32ifneq ($(CONFIG_X86_CMPXCHG64),y) 33ifneq ($(CONFIG_X86_CMPXCHG64),y)
33 lib-y += cmpxchg8b_emu.o 34 lib-y += cmpxchg8b_emu.o atomic64_386_32.o
34endif 35endif
35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 36 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
36else 37else
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0be55a..540179e8e9f 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -6,225 +6,54 @@
6#include <asm/cmpxchg.h> 6#include <asm/cmpxchg.h>
7#include <asm/atomic.h> 7#include <asm/atomic.h>
8 8
9static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) 9long long atomic64_read_cx8(long long, const atomic64_t *v);
10{ 10EXPORT_SYMBOL(atomic64_read_cx8);
11 u32 low = new; 11long long atomic64_set_cx8(long long, const atomic64_t *v);
12 u32 high = new >> 32; 12EXPORT_SYMBOL(atomic64_set_cx8);
13 13long long atomic64_xchg_cx8(long long, unsigned high);
14 asm volatile( 14EXPORT_SYMBOL(atomic64_xchg_cx8);
15 LOCK_PREFIX "cmpxchg8b %1\n" 15long long atomic64_add_return_cx8(long long a, atomic64_t *v);
16 : "+A" (old), "+m" (*ptr) 16EXPORT_SYMBOL(atomic64_add_return_cx8);
17 : "b" (low), "c" (high) 17long long atomic64_sub_return_cx8(long long a, atomic64_t *v);
18 ); 18EXPORT_SYMBOL(atomic64_sub_return_cx8);
19 return old; 19long long atomic64_inc_return_cx8(long long a, atomic64_t *v);
20} 20EXPORT_SYMBOL(atomic64_inc_return_cx8);
21 21long long atomic64_dec_return_cx8(long long a, atomic64_t *v);
22u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) 22EXPORT_SYMBOL(atomic64_dec_return_cx8);
23{ 23long long atomic64_dec_if_positive_cx8(atomic64_t *v);
24 return cmpxchg8b(&ptr->counter, old_val, new_val); 24EXPORT_SYMBOL(atomic64_dec_if_positive_cx8);
25} 25int atomic64_inc_not_zero_cx8(atomic64_t *v);
26EXPORT_SYMBOL(atomic64_cmpxchg); 26EXPORT_SYMBOL(atomic64_inc_not_zero_cx8);
27 27int atomic64_add_unless_cx8(atomic64_t *v, long long a, long long u);
28/** 28EXPORT_SYMBOL(atomic64_add_unless_cx8);
29 * atomic64_xchg - xchg atomic64 variable 29
30 * @ptr: pointer to type atomic64_t 30#ifndef CONFIG_X86_CMPXCHG64
31 * @new_val: value to assign 31long long atomic64_read_386(long long, const atomic64_t *v);
32 * 32EXPORT_SYMBOL(atomic64_read_386);
33 * Atomically xchgs the value of @ptr to @new_val and returns 33long long atomic64_set_386(long long, const atomic64_t *v);
34 * the old value. 34EXPORT_SYMBOL(atomic64_set_386);
35 */ 35long long atomic64_xchg_386(long long, unsigned high);
36u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) 36EXPORT_SYMBOL(atomic64_xchg_386);
37{ 37long long atomic64_add_return_386(long long a, atomic64_t *v);
38 /* 38EXPORT_SYMBOL(atomic64_add_return_386);
39 * Try first with a (possibly incorrect) assumption about 39long long atomic64_sub_return_386(long long a, atomic64_t *v);
40 * what we have there. We'll do two loops most likely, 40EXPORT_SYMBOL(atomic64_sub_return_386);
41 * but we'll get an ownership MESI transaction straight away 41long long atomic64_inc_return_386(long long a, atomic64_t *v);
42 * instead of a read transaction followed by a 42EXPORT_SYMBOL(atomic64_inc_return_386);
43 * flush-for-ownership transaction: 43long long atomic64_dec_return_386(long long a, atomic64_t *v);
44 */ 44EXPORT_SYMBOL(atomic64_dec_return_386);
45 u64 old_val, real_val = 0; 45long long atomic64_add_386(long long a, atomic64_t *v);
46 46EXPORT_SYMBOL(atomic64_add_386);
47 do { 47long long atomic64_sub_386(long long a, atomic64_t *v);
48 old_val = real_val; 48EXPORT_SYMBOL(atomic64_sub_386);
49 49long long atomic64_inc_386(long long a, atomic64_t *v);
50 real_val = atomic64_cmpxchg(ptr, old_val, new_val); 50EXPORT_SYMBOL(atomic64_inc_386);
51 51long long atomic64_dec_386(long long a, atomic64_t *v);
52 } while (real_val != old_val); 52EXPORT_SYMBOL(atomic64_dec_386);
53 53long long atomic64_dec_if_positive_386(atomic64_t *v);
54 return old_val; 54EXPORT_SYMBOL(atomic64_dec_if_positive_386);
55} 55int atomic64_inc_not_zero_386(atomic64_t *v);
56EXPORT_SYMBOL(atomic64_xchg); 56EXPORT_SYMBOL(atomic64_inc_not_zero_386);
57 57int atomic64_add_unless_386(atomic64_t *v, long long a, long long u);
58/** 58EXPORT_SYMBOL(atomic64_add_unless_386);
59 * atomic64_set - set atomic64 variable 59#endif
60 * @ptr: pointer to type atomic64_t
61 * @new_val: value to assign
62 *
63 * Atomically sets the value of @ptr to @new_val.
64 */
65void atomic64_set(atomic64_t *ptr, u64 new_val)
66{
67 atomic64_xchg(ptr, new_val);
68}
69EXPORT_SYMBOL(atomic64_set);
70
71/**
72EXPORT_SYMBOL(atomic64_read);
73 * atomic64_add_return - add and return
74 * @delta: integer value to add
75 * @ptr: pointer to type atomic64_t
76 *
77 * Atomically adds @delta to @ptr and returns @delta + *@ptr
78 */
79noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
80{
81 /*
82 * Try first with a (possibly incorrect) assumption about
83 * what we have there. We'll do two loops most likely,
84 * but we'll get an ownership MESI transaction straight away
85 * instead of a read transaction followed by a
86 * flush-for-ownership transaction:
87 */
88 u64 old_val, new_val, real_val = 0;
89
90 do {
91 old_val = real_val;
92 new_val = old_val + delta;
93
94 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
95
96 } while (real_val != old_val);
97
98 return new_val;
99}
100EXPORT_SYMBOL(atomic64_add_return);
101
102u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
103{
104 return atomic64_add_return(-delta, ptr);
105}
106EXPORT_SYMBOL(atomic64_sub_return);
107
108u64 atomic64_inc_return(atomic64_t *ptr)
109{
110 return atomic64_add_return(1, ptr);
111}
112EXPORT_SYMBOL(atomic64_inc_return);
113
114u64 atomic64_dec_return(atomic64_t *ptr)
115{
116 return atomic64_sub_return(1, ptr);
117}
118EXPORT_SYMBOL(atomic64_dec_return);
119
120/**
121 * atomic64_add - add integer to atomic64 variable
122 * @delta: integer value to add
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically adds @delta to @ptr.
126 */
127void atomic64_add(u64 delta, atomic64_t *ptr)
128{
129 atomic64_add_return(delta, ptr);
130}
131EXPORT_SYMBOL(atomic64_add);
132
133/**
134 * atomic64_sub - subtract the atomic64 variable
135 * @delta: integer value to subtract
136 * @ptr: pointer to type atomic64_t
137 *
138 * Atomically subtracts @delta from @ptr.
139 */
140void atomic64_sub(u64 delta, atomic64_t *ptr)
141{
142 atomic64_add(-delta, ptr);
143}
144EXPORT_SYMBOL(atomic64_sub);
145
146/**
147 * atomic64_sub_and_test - subtract value from variable and test result
148 * @delta: integer value to subtract
149 * @ptr: pointer to type atomic64_t
150 *
151 * Atomically subtracts @delta from @ptr and returns
152 * true if the result is zero, or false for all
153 * other cases.
154 */
155int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
156{
157 u64 new_val = atomic64_sub_return(delta, ptr);
158
159 return new_val == 0;
160}
161EXPORT_SYMBOL(atomic64_sub_and_test);
162
163/**
164 * atomic64_inc - increment atomic64 variable
165 * @ptr: pointer to type atomic64_t
166 *
167 * Atomically increments @ptr by 1.
168 */
169void atomic64_inc(atomic64_t *ptr)
170{
171 atomic64_add(1, ptr);
172}
173EXPORT_SYMBOL(atomic64_inc);
174
175/**
176 * atomic64_dec - decrement atomic64 variable
177 * @ptr: pointer to type atomic64_t
178 *
179 * Atomically decrements @ptr by 1.
180 */
181void atomic64_dec(atomic64_t *ptr)
182{
183 atomic64_sub(1, ptr);
184}
185EXPORT_SYMBOL(atomic64_dec);
186
187/**
188 * atomic64_dec_and_test - decrement and test
189 * @ptr: pointer to type atomic64_t
190 *
191 * Atomically decrements @ptr by 1 and
192 * returns true if the result is 0, or false for all other
193 * cases.
194 */
195int atomic64_dec_and_test(atomic64_t *ptr)
196{
197 return atomic64_sub_and_test(1, ptr);
198}
199EXPORT_SYMBOL(atomic64_dec_and_test);
200
201/**
202 * atomic64_inc_and_test - increment and test
203 * @ptr: pointer to type atomic64_t
204 *
205 * Atomically increments @ptr by 1
206 * and returns true if the result is zero, or false for all
207 * other cases.
208 */
209int atomic64_inc_and_test(atomic64_t *ptr)
210{
211 return atomic64_sub_and_test(-1, ptr);
212}
213EXPORT_SYMBOL(atomic64_inc_and_test);
214
215/**
216 * atomic64_add_negative - add and test if negative
217 * @delta: integer value to add
218 * @ptr: pointer to type atomic64_t
219 *
220 * Atomically adds @delta to @ptr and returns true
221 * if the result is negative, or false when
222 * result is greater than or equal to zero.
223 */
224int atomic64_add_negative(u64 delta, atomic64_t *ptr)
225{
226 s64 new_val = atomic64_add_return(delta, ptr);
227
228 return new_val < 0;
229}
230EXPORT_SYMBOL(atomic64_add_negative);
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 00000000000..4a5979aa688
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,174 @@
1/*
2 * atomic64_t for 386/486
3 *
4 * Copyright © 2010 Luca Barbieri
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/linkage.h>
13#include <asm/alternative-asm.h>
14#include <asm/dwarf2.h>
15
16/* if you want SMP support, implement these with real spinlocks */
17.macro LOCK reg
18 pushfl
19 CFI_ADJUST_CFA_OFFSET 4
20 cli
21.endm
22
23.macro UNLOCK reg
24 popfl
25 CFI_ADJUST_CFA_OFFSET -4
26.endm
27
28.macro BEGIN func reg
29$v = \reg
30
31ENTRY(atomic64_\func\()_386)
32 CFI_STARTPROC
33 LOCK $v
34
35.macro RETURN
36 UNLOCK $v
37 ret
38.endm
39
40.macro END_
41 CFI_ENDPROC
42ENDPROC(atomic64_\func\()_386)
43.purgem RETURN
44.purgem END_
45.purgem END
46.endm
47
48.macro END
49RETURN
50END_
51.endm
52.endm
53
54BEGIN read %ecx
55 movl ($v), %eax
56 movl 4($v), %edx
57END
58
59BEGIN set %esi
60 movl %ebx, ($v)
61 movl %ecx, 4($v)
62END
63
64BEGIN xchg %esi
65 movl ($v), %eax
66 movl 4($v), %edx
67 movl %ebx, ($v)
68 movl %ecx, 4($v)
69END
70
71BEGIN add %ecx
72 addl %eax, ($v)
73 adcl %edx, 4($v)
74END
75
76BEGIN add_return %ecx
77 addl ($v), %eax
78 adcl 4($v), %edx
79 movl %eax, ($v)
80 movl %edx, 4($v)
81END
82
83BEGIN sub %ecx
84 subl %eax, ($v)
85 sbbl %edx, 4($v)
86END
87
88BEGIN sub_return %ecx
89 negl %edx
90 negl %eax
91 sbbl $0, %edx
92 addl ($v), %eax
93 adcl 4($v), %edx
94 movl %eax, ($v)
95 movl %edx, 4($v)
96END
97
98BEGIN inc %esi
99 addl $1, ($v)
100 adcl $0, 4($v)
101END
102
103BEGIN inc_return %esi
104 movl ($v), %eax
105 movl 4($v), %edx
106 addl $1, %eax
107 adcl $0, %edx
108 movl %eax, ($v)
109 movl %edx, 4($v)
110END
111
112BEGIN dec %esi
113 subl $1, ($v)
114 sbbl $0, 4($v)
115END
116
117BEGIN dec_return %esi
118 movl ($v), %eax
119 movl 4($v), %edx
120 subl $1, %eax
121 sbbl $0, %edx
122 movl %eax, ($v)
123 movl %edx, 4($v)
124END
125
126BEGIN add_unless %ecx
127 addl %eax, %esi
128 adcl %edx, %edi
129 addl ($v), %eax
130 adcl 4($v), %edx
131 cmpl %eax, %esi
132 je 3f
1331:
134 movl %eax, ($v)
135 movl %edx, 4($v)
136 movl $1, %eax
1372:
138RETURN
1393:
140 cmpl %edx, %edi
141 jne 1b
142 xorl %eax, %eax
143 jmp 2b
144END_
145
146BEGIN inc_not_zero %esi
147 movl ($v), %eax
148 movl 4($v), %edx
149 testl %eax, %eax
150 je 3f
1511:
152 addl $1, %eax
153 adcl $0, %edx
154 movl %eax, ($v)
155 movl %edx, 4($v)
156 movl $1, %eax
1572:
158RETURN
1593:
160 testl %edx, %edx
161 jne 1b
162 jmp 2b
163END_
164
165BEGIN dec_if_positive %esi
166 movl ($v), %eax
167 movl 4($v), %edx
168 subl $1, %eax
169 sbbl $0, %edx
170 js 1f
171 movl %eax, ($v)
172 movl %edx, 4($v)
1731:
174END
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 00000000000..71e080de335
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,224 @@
1/*
2 * atomic64_t for 586+
3 *
4 * Copyright © 2010 Luca Barbieri
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/linkage.h>
13#include <asm/alternative-asm.h>
14#include <asm/dwarf2.h>
15
16.macro SAVE reg
17 pushl %\reg
18 CFI_ADJUST_CFA_OFFSET 4
19 CFI_REL_OFFSET \reg, 0
20.endm
21
22.macro RESTORE reg
23 popl %\reg
24 CFI_ADJUST_CFA_OFFSET -4
25 CFI_RESTORE \reg
26.endm
27
28.macro read64 reg
29 movl %ebx, %eax
30 movl %ecx, %edx
31/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
32 LOCK_PREFIX
33 cmpxchg8b (\reg)
34.endm
35
36ENTRY(atomic64_read_cx8)
37 CFI_STARTPROC
38
39 read64 %ecx
40 ret
41 CFI_ENDPROC
42ENDPROC(atomic64_read_cx8)
43
44ENTRY(atomic64_set_cx8)
45 CFI_STARTPROC
46
471:
48/* we don't need LOCK_PREFIX since aligned 64-bit writes
49 * are atomic on 586 and newer */
50 cmpxchg8b (%esi)
51 jne 1b
52
53 ret
54 CFI_ENDPROC
55ENDPROC(atomic64_set_cx8)
56
57ENTRY(atomic64_xchg_cx8)
58 CFI_STARTPROC
59
60 movl %ebx, %eax
61 movl %ecx, %edx
621:
63 LOCK_PREFIX
64 cmpxchg8b (%esi)
65 jne 1b
66
67 ret
68 CFI_ENDPROC
69ENDPROC(atomic64_xchg_cx8)
70
71.macro addsub_return func ins insc
72ENTRY(atomic64_\func\()_return_cx8)
73 CFI_STARTPROC
74 SAVE ebp
75 SAVE ebx
76 SAVE esi
77 SAVE edi
78
79 movl %eax, %esi
80 movl %edx, %edi
81 movl %ecx, %ebp
82
83 read64 %ebp
841:
85 movl %eax, %ebx
86 movl %edx, %ecx
87 \ins\()l %esi, %ebx
88 \insc\()l %edi, %ecx
89 LOCK_PREFIX
90 cmpxchg8b (%ebp)
91 jne 1b
92
9310:
94 movl %ebx, %eax
95 movl %ecx, %edx
96 RESTORE edi
97 RESTORE esi
98 RESTORE ebx
99 RESTORE ebp
100 ret
101 CFI_ENDPROC
102ENDPROC(atomic64_\func\()_return_cx8)
103.endm
104
105addsub_return add add adc
106addsub_return sub sub sbb
107
108.macro incdec_return func ins insc
109ENTRY(atomic64_\func\()_return_cx8)
110 CFI_STARTPROC
111 SAVE ebx
112
113 read64 %esi
1141:
115 movl %eax, %ebx
116 movl %edx, %ecx
117 \ins\()l $1, %ebx
118 \insc\()l $0, %ecx
119 LOCK_PREFIX
120 cmpxchg8b (%esi)
121 jne 1b
122
12310:
124 movl %ebx, %eax
125 movl %ecx, %edx
126 RESTORE ebx
127 ret
128 CFI_ENDPROC
129ENDPROC(atomic64_\func\()_return_cx8)
130.endm
131
132incdec_return inc add adc
133incdec_return dec sub sbb
134
135ENTRY(atomic64_dec_if_positive_cx8)
136 CFI_STARTPROC
137 SAVE ebx
138
139 read64 %esi
1401:
141 movl %eax, %ebx
142 movl %edx, %ecx
143 subl $1, %ebx
144 sbb $0, %ecx
145 js 2f
146 LOCK_PREFIX
147 cmpxchg8b (%esi)
148 jne 1b
149
1502:
151 movl %ebx, %eax
152 movl %ecx, %edx
153 RESTORE ebx
154 ret
155 CFI_ENDPROC
156ENDPROC(atomic64_dec_if_positive_cx8)
157
158ENTRY(atomic64_add_unless_cx8)
159 CFI_STARTPROC
160 SAVE ebp
161 SAVE ebx
162/* these just push these two parameters on the stack */
163 SAVE edi
164 SAVE esi
165
166 movl %ecx, %ebp
167 movl %eax, %esi
168 movl %edx, %edi
169
170 read64 %ebp
1711:
172 cmpl %eax, 0(%esp)
173 je 4f
1742:
175 movl %eax, %ebx
176 movl %edx, %ecx
177 addl %esi, %ebx
178 adcl %edi, %ecx
179 LOCK_PREFIX
180 cmpxchg8b (%ebp)
181 jne 1b
182
183 movl $1, %eax
1843:
185 addl $8, %esp
186 CFI_ADJUST_CFA_OFFSET -8
187 RESTORE ebx
188 RESTORE ebp
189 ret
1904:
191 cmpl %edx, 4(%esp)
192 jne 2b
193 xorl %eax, %eax
194 jmp 3b
195 CFI_ENDPROC
196ENDPROC(atomic64_add_unless_cx8)
197
198ENTRY(atomic64_inc_not_zero_cx8)
199 CFI_STARTPROC
200 SAVE ebx
201
202 read64 %esi
2031:
204 testl %eax, %eax
205 je 4f
2062:
207 movl %eax, %ebx
208 movl %edx, %ecx
209 addl $1, %ebx
210 adcl $0, %ecx
211 LOCK_PREFIX
212 cmpxchg8b (%esi)
213 jne 1b
214
215 movl $1, %eax
2163:
217 RESTORE ebx
218 ret
2194:
220 testl %edx, %edx
221 jne 2b
222 jmp 3b
223 CFI_ENDPROC
224ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index aa098708877..dc8adad10a2 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,10 +30,10 @@ static void fclex(void)
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit_task(struct task_struct *tsk) 33void finit_soft_fpu(struct i387_soft_struct *soft)
34{ 34{
35 struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
36 struct address *oaddr, *iaddr; 35 struct address *oaddr, *iaddr;
36 memset(soft, 0, sizeof(*soft));
37 soft->cwd = 0x037f; 37 soft->cwd = 0x037f;
38 soft->swd = 0; 38 soft->swd = 0;
39 soft->ftop = 0; /* We don't keep top in the status word internally. */ 39 soft->ftop = 0; /* We don't keep top in the status word internally. */
@@ -52,7 +52,7 @@ void finit_task(struct task_struct *tsk)
52 52
53void finit(void) 53void finit(void)
54{ 54{
55 finit_task(current); 55 finit_soft_fpu(&current->thread.fpu.state->soft);
56} 56}
57 57
58/* 58/*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 5d87f586f8d..7718541541d 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -681,7 +681,7 @@ int fpregs_soft_set(struct task_struct *target,
681 unsigned int pos, unsigned int count, 681 unsigned int pos, unsigned int count,
682 const void *kbuf, const void __user *ubuf) 682 const void *kbuf, const void __user *ubuf)
683{ 683{
684 struct i387_soft_struct *s387 = &target->thread.xstate->soft; 684 struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
685 void *space = s387->st_space; 685 void *space = s387->st_space;
686 int ret; 686 int ret;
687 int offset, other, i, tags, regnr, tag, newtop; 687 int offset, other, i, tags, regnr, tag, newtop;
@@ -733,7 +733,7 @@ int fpregs_soft_get(struct task_struct *target,
733 unsigned int pos, unsigned int count, 733 unsigned int pos, unsigned int count,
734 void *kbuf, void __user *ubuf) 734 void *kbuf, void __user *ubuf)
735{ 735{
736 struct i387_soft_struct *s387 = &target->thread.xstate->soft; 736 struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
737 const void *space = s387->st_space; 737 const void *space = s387->st_space;
738 int ret; 738 int ret;
739 int offset = (S387->ftop & 7) * 10, other = 80 - offset; 739 int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 50fa0ec2c8a..2c614410a5f 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -31,7 +31,7 @@
31#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ 31#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
32 == (1 << 10)) 32 == (1 << 10))
33 33
34#define I387 (current->thread.xstate) 34#define I387 (current->thread.fpu.state)
35#define FPU_info (I387->soft.info) 35#define FPU_info (I387->soft.info)
36 36
37#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) 37#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs))
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 06630d26e56..a4c768397ba 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
6CFLAGS_physaddr.o := $(nostackp) 6CFLAGS_physaddr.o := $(nostackp)
7CFLAGS_setup_nx.o := $(nostackp) 7CFLAGS_setup_nx.o := $(nostackp)
8 8
9obj-$(CONFIG_X86_PAT) += pat_rbtree.o
9obj-$(CONFIG_SMP) += tlb.o 10obj-$(CONFIG_SMP) += tlb.o
10 11
11obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o 12obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 8948f47fde0..a7bcc23ef96 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -33,9 +33,6 @@ int numa_off __initdata;
33static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
35 35
36DEFINE_PER_CPU(int, node_number) = 0;
37EXPORT_PER_CPU_SYMBOL(node_number);
38
39/* 36/*
40 * Map cpu index to node index 37 * Map cpu index to node index
41 */ 38 */
@@ -809,7 +806,7 @@ void __cpuinit numa_set_node(int cpu, int node)
809 per_cpu(x86_cpu_to_node_map, cpu) = node; 806 per_cpu(x86_cpu_to_node_map, cpu) = node;
810 807
811 if (node != NUMA_NO_NODE) 808 if (node != NUMA_NO_NODE)
812 per_cpu(node_number, cpu) = node; 809 set_cpu_numa_node(cpu, node);
813} 810}
814 811
815void __cpuinit numa_clear_node(int cpu) 812void __cpuinit numa_clear_node(int cpu)
@@ -867,7 +864,7 @@ void __cpuinit numa_remove_cpu(int cpu)
867 numa_set_cpumask(cpu, 0); 864 numa_set_cpumask(cpu, 0);
868} 865}
869 866
870int cpu_to_node(int cpu) 867int __cpu_to_node(int cpu)
871{ 868{
872 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 869 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
873 printk(KERN_WARNING 870 printk(KERN_WARNING
@@ -877,7 +874,7 @@ int cpu_to_node(int cpu)
877 } 874 }
878 return per_cpu(x86_cpu_to_node_map, cpu); 875 return per_cpu(x86_cpu_to_node_map, cpu);
879} 876}
880EXPORT_SYMBOL(cpu_to_node); 877EXPORT_SYMBOL(__cpu_to_node);
881 878
882/* 879/*
883 * Same function as cpu_to_node() but used if called before the 880 * Same function as cpu_to_node() but used if called before the
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 28195c350b9..532e7933d60 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -997,7 +997,8 @@ out_err:
997} 997}
998EXPORT_SYMBOL(set_memory_uc); 998EXPORT_SYMBOL(set_memory_uc);
999 999
1000int set_memory_array_uc(unsigned long *addr, int addrinarray) 1000int _set_memory_array(unsigned long *addr, int addrinarray,
1001 unsigned long new_type)
1001{ 1002{
1002 int i, j; 1003 int i, j;
1003 int ret; 1004 int ret;
@@ -1007,13 +1008,19 @@ int set_memory_array_uc(unsigned long *addr, int addrinarray)
1007 */ 1008 */
1008 for (i = 0; i < addrinarray; i++) { 1009 for (i = 0; i < addrinarray; i++) {
1009 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, 1010 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1010 _PAGE_CACHE_UC_MINUS, NULL); 1011 new_type, NULL);
1011 if (ret) 1012 if (ret)
1012 goto out_free; 1013 goto out_free;
1013 } 1014 }
1014 1015
1015 ret = change_page_attr_set(addr, addrinarray, 1016 ret = change_page_attr_set(addr, addrinarray,
1016 __pgprot(_PAGE_CACHE_UC_MINUS), 1); 1017 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
1018
1019 if (!ret && new_type == _PAGE_CACHE_WC)
1020 ret = change_page_attr_set_clr(addr, addrinarray,
1021 __pgprot(_PAGE_CACHE_WC),
1022 __pgprot(_PAGE_CACHE_MASK),
1023 0, CPA_ARRAY, NULL);
1017 if (ret) 1024 if (ret)
1018 goto out_free; 1025 goto out_free;
1019 1026
@@ -1025,8 +1032,19 @@ out_free:
1025 1032
1026 return ret; 1033 return ret;
1027} 1034}
1035
1036int set_memory_array_uc(unsigned long *addr, int addrinarray)
1037{
1038 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS);
1039}
1028EXPORT_SYMBOL(set_memory_array_uc); 1040EXPORT_SYMBOL(set_memory_array_uc);
1029 1041
1042int set_memory_array_wc(unsigned long *addr, int addrinarray)
1043{
1044 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC);
1045}
1046EXPORT_SYMBOL(set_memory_array_wc);
1047
1030int _set_memory_wc(unsigned long addr, int numpages) 1048int _set_memory_wc(unsigned long addr, int numpages)
1031{ 1049{
1032 int ret; 1050 int ret;
@@ -1153,26 +1171,34 @@ int set_pages_uc(struct page *page, int numpages)
1153} 1171}
1154EXPORT_SYMBOL(set_pages_uc); 1172EXPORT_SYMBOL(set_pages_uc);
1155 1173
1156int set_pages_array_uc(struct page **pages, int addrinarray) 1174static int _set_pages_array(struct page **pages, int addrinarray,
1175 unsigned long new_type)
1157{ 1176{
1158 unsigned long start; 1177 unsigned long start;
1159 unsigned long end; 1178 unsigned long end;
1160 int i; 1179 int i;
1161 int free_idx; 1180 int free_idx;
1181 int ret;
1162 1182
1163 for (i = 0; i < addrinarray; i++) { 1183 for (i = 0; i < addrinarray; i++) {
1164 if (PageHighMem(pages[i])) 1184 if (PageHighMem(pages[i]))
1165 continue; 1185 continue;
1166 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1186 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1167 end = start + PAGE_SIZE; 1187 end = start + PAGE_SIZE;
1168 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) 1188 if (reserve_memtype(start, end, new_type, NULL))
1169 goto err_out; 1189 goto err_out;
1170 } 1190 }
1171 1191
1172 if (cpa_set_pages_array(pages, addrinarray, 1192 ret = cpa_set_pages_array(pages, addrinarray,
1173 __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) { 1193 __pgprot(_PAGE_CACHE_UC_MINUS));
1174 return 0; /* Success */ 1194 if (!ret && new_type == _PAGE_CACHE_WC)
1175 } 1195 ret = change_page_attr_set_clr(NULL, addrinarray,
1196 __pgprot(_PAGE_CACHE_WC),
1197 __pgprot(_PAGE_CACHE_MASK),
1198 0, CPA_PAGES_ARRAY, pages);
1199 if (ret)
1200 goto err_out;
1201 return 0; /* Success */
1176err_out: 1202err_out:
1177 free_idx = i; 1203 free_idx = i;
1178 for (i = 0; i < free_idx; i++) { 1204 for (i = 0; i < free_idx; i++) {
@@ -1184,8 +1210,19 @@ err_out:
1184 } 1210 }
1185 return -EINVAL; 1211 return -EINVAL;
1186} 1212}
1213
1214int set_pages_array_uc(struct page **pages, int addrinarray)
1215{
1216 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS);
1217}
1187EXPORT_SYMBOL(set_pages_array_uc); 1218EXPORT_SYMBOL(set_pages_array_uc);
1188 1219
1220int set_pages_array_wc(struct page **pages, int addrinarray)
1221{
1222 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC);
1223}
1224EXPORT_SYMBOL(set_pages_array_wc);
1225
1189int set_pages_wb(struct page *page, int numpages) 1226int set_pages_wb(struct page *page, int numpages)
1190{ 1227{
1191 unsigned long addr = (unsigned long)page_address(page); 1228 unsigned long addr = (unsigned long)page_address(page);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index edc8b95afc1..acc15b23b74 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,6 +30,8 @@
30#include <asm/pat.h> 30#include <asm/pat.h>
31#include <asm/io.h> 31#include <asm/io.h>
32 32
33#include "pat_internal.h"
34
33#ifdef CONFIG_X86_PAT 35#ifdef CONFIG_X86_PAT
34int __read_mostly pat_enabled = 1; 36int __read_mostly pat_enabled = 1;
35 37
@@ -53,19 +55,15 @@ static inline void pat_disable(const char *reason)
53#endif 55#endif
54 56
55 57
56static int debug_enable; 58int pat_debug_enable;
57 59
58static int __init pat_debug_setup(char *str) 60static int __init pat_debug_setup(char *str)
59{ 61{
60 debug_enable = 1; 62 pat_debug_enable = 1;
61 return 0; 63 return 0;
62} 64}
63__setup("debugpat", pat_debug_setup); 65__setup("debugpat", pat_debug_setup);
64 66
65#define dprintk(fmt, arg...) \
66 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
67
68
69static u64 __read_mostly boot_pat_state; 67static u64 __read_mostly boot_pat_state;
70 68
71enum { 69enum {
@@ -132,84 +130,7 @@ void pat_init(void)
132 130
133#undef PAT 131#undef PAT
134 132
135static char *cattr_name(unsigned long flags) 133static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
136{
137 switch (flags & _PAGE_CACHE_MASK) {
138 case _PAGE_CACHE_UC: return "uncached";
139 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
140 case _PAGE_CACHE_WB: return "write-back";
141 case _PAGE_CACHE_WC: return "write-combining";
142 default: return "broken";
143 }
144}
145
146/*
147 * The global memtype list keeps track of memory type for specific
148 * physical memory areas. Conflicting memory types in different
149 * mappings can cause CPU cache corruption. To avoid this we keep track.
150 *
151 * The list is sorted based on starting address and can contain multiple
152 * entries for each address (this allows reference counting for overlapping
153 * areas). All the aliases have the same cache attributes of course.
154 * Zero attributes are represented as holes.
155 *
156 * The data structure is a list that is also organized as an rbtree
157 * sorted on the start address of memtype range.
158 *
159 * memtype_lock protects both the linear list and rbtree.
160 */
161
162struct memtype {
163 u64 start;
164 u64 end;
165 unsigned long type;
166 struct list_head nd;
167 struct rb_node rb;
168};
169
170static struct rb_root memtype_rbroot = RB_ROOT;
171static LIST_HEAD(memtype_list);
172static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
173
174static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
175{
176 struct rb_node *node = root->rb_node;
177 struct memtype *last_lower = NULL;
178
179 while (node) {
180 struct memtype *data = container_of(node, struct memtype, rb);
181
182 if (data->start < start) {
183 last_lower = data;
184 node = node->rb_right;
185 } else if (data->start > start) {
186 node = node->rb_left;
187 } else
188 return data;
189 }
190
191 /* Will return NULL if there is no entry with its start <= start */
192 return last_lower;
193}
194
195static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
196{
197 struct rb_node **new = &(root->rb_node);
198 struct rb_node *parent = NULL;
199
200 while (*new) {
201 struct memtype *this = container_of(*new, struct memtype, rb);
202
203 parent = *new;
204 if (data->start <= this->start)
205 new = &((*new)->rb_left);
206 else if (data->start > this->start)
207 new = &((*new)->rb_right);
208 }
209
210 rb_link_node(&data->rb, parent, new);
211 rb_insert_color(&data->rb, root);
212}
213 134
214/* 135/*
215 * Does intersection of PAT memory type and MTRR memory type and returns 136 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -237,33 +158,6 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
237 return req_type; 158 return req_type;
238} 159}
239 160
240static int
241chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
242{
243 if (new->type != entry->type) {
244 if (type) {
245 new->type = entry->type;
246 *type = entry->type;
247 } else
248 goto conflict;
249 }
250
251 /* check overlaps with more than one entry in the list */
252 list_for_each_entry_continue(entry, &memtype_list, nd) {
253 if (new->end <= entry->start)
254 break;
255 else if (new->type != entry->type)
256 goto conflict;
257 }
258 return 0;
259
260 conflict:
261 printk(KERN_INFO "%s:%d conflicting memory types "
262 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
263 new->end, cattr_name(new->type), cattr_name(entry->type));
264 return -EBUSY;
265}
266
267static int pat_pagerange_is_ram(unsigned long start, unsigned long end) 161static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
268{ 162{
269 int ram_page = 0, not_rampage = 0; 163 int ram_page = 0, not_rampage = 0;
@@ -296,8 +190,6 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
296 * Here we do two pass: 190 * Here we do two pass:
297 * - Find the memtype of all the pages in the range, look for any conflicts 191 * - Find the memtype of all the pages in the range, look for any conflicts
298 * - In case of no conflicts, set the new memtype for pages in the range 192 * - In case of no conflicts, set the new memtype for pages in the range
299 *
300 * Caller must hold memtype_lock for atomicity.
301 */ 193 */
302static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, 194static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
303 unsigned long *new_type) 195 unsigned long *new_type)
@@ -364,9 +256,8 @@ static int free_ram_pages_type(u64 start, u64 end)
364int reserve_memtype(u64 start, u64 end, unsigned long req_type, 256int reserve_memtype(u64 start, u64 end, unsigned long req_type,
365 unsigned long *new_type) 257 unsigned long *new_type)
366{ 258{
367 struct memtype *new, *entry; 259 struct memtype *new;
368 unsigned long actual_type; 260 unsigned long actual_type;
369 struct list_head *where;
370 int is_range_ram; 261 int is_range_ram;
371 int err = 0; 262 int err = 0;
372 263
@@ -404,9 +295,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
404 is_range_ram = pat_pagerange_is_ram(start, end); 295 is_range_ram = pat_pagerange_is_ram(start, end);
405 if (is_range_ram == 1) { 296 if (is_range_ram == 1) {
406 297
407 spin_lock(&memtype_lock);
408 err = reserve_ram_pages_type(start, end, req_type, new_type); 298 err = reserve_ram_pages_type(start, end, req_type, new_type);
409 spin_unlock(&memtype_lock);
410 299
411 return err; 300 return err;
412 } else if (is_range_ram < 0) { 301 } else if (is_range_ram < 0) {
@@ -423,42 +312,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
423 312
424 spin_lock(&memtype_lock); 313 spin_lock(&memtype_lock);
425 314
426 /* Search for existing mapping that overlaps the current range */ 315 err = rbt_memtype_check_insert(new, new_type);
427 where = NULL;
428 list_for_each_entry(entry, &memtype_list, nd) {
429 if (end <= entry->start) {
430 where = entry->nd.prev;
431 break;
432 } else if (start <= entry->start) { /* end > entry->start */
433 err = chk_conflict(new, entry, new_type);
434 if (!err) {
435 dprintk("Overlap at 0x%Lx-0x%Lx\n",
436 entry->start, entry->end);
437 where = entry->nd.prev;
438 }
439 break;
440 } else if (start < entry->end) { /* start > entry->start */
441 err = chk_conflict(new, entry, new_type);
442 if (!err) {
443 dprintk("Overlap at 0x%Lx-0x%Lx\n",
444 entry->start, entry->end);
445
446 /*
447 * Move to right position in the linked
448 * list to add this new entry
449 */
450 list_for_each_entry_continue(entry,
451 &memtype_list, nd) {
452 if (start <= entry->start) {
453 where = entry->nd.prev;
454 break;
455 }
456 }
457 }
458 break;
459 }
460 }
461
462 if (err) { 316 if (err) {
463 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " 317 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
464 "track %s, req %s\n", 318 "track %s, req %s\n",
@@ -469,13 +323,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
469 return err; 323 return err;
470 } 324 }
471 325
472 if (where)
473 list_add(&new->nd, where);
474 else
475 list_add_tail(&new->nd, &memtype_list);
476
477 memtype_rb_insert(&memtype_rbroot, new);
478
479 spin_unlock(&memtype_lock); 326 spin_unlock(&memtype_lock);
480 327
481 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 328 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -487,9 +334,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
487 334
488int free_memtype(u64 start, u64 end) 335int free_memtype(u64 start, u64 end)
489{ 336{
490 struct memtype *entry, *saved_entry;
491 int err = -EINVAL; 337 int err = -EINVAL;
492 int is_range_ram; 338 int is_range_ram;
339 struct memtype *entry;
493 340
494 if (!pat_enabled) 341 if (!pat_enabled)
495 return 0; 342 return 0;
@@ -501,9 +348,7 @@ int free_memtype(u64 start, u64 end)
501 is_range_ram = pat_pagerange_is_ram(start, end); 348 is_range_ram = pat_pagerange_is_ram(start, end);
502 if (is_range_ram == 1) { 349 if (is_range_ram == 1) {
503 350
504 spin_lock(&memtype_lock);
505 err = free_ram_pages_type(start, end); 351 err = free_ram_pages_type(start, end);
506 spin_unlock(&memtype_lock);
507 352
508 return err; 353 return err;
509 } else if (is_range_ram < 0) { 354 } else if (is_range_ram < 0) {
@@ -511,56 +356,20 @@ int free_memtype(u64 start, u64 end)
511 } 356 }
512 357
513 spin_lock(&memtype_lock); 358 spin_lock(&memtype_lock);
514 359 entry = rbt_memtype_erase(start, end);
515 entry = memtype_rb_search(&memtype_rbroot, start);
516 if (unlikely(entry == NULL))
517 goto unlock_ret;
518
519 /*
520 * Saved entry points to an entry with start same or less than what
521 * we searched for. Now go through the list in both directions to look
522 * for the entry that matches with both start and end, with list stored
523 * in sorted start address
524 */
525 saved_entry = entry;
526 list_for_each_entry_from(entry, &memtype_list, nd) {
527 if (entry->start == start && entry->end == end) {
528 rb_erase(&entry->rb, &memtype_rbroot);
529 list_del(&entry->nd);
530 kfree(entry);
531 err = 0;
532 break;
533 } else if (entry->start > start) {
534 break;
535 }
536 }
537
538 if (!err)
539 goto unlock_ret;
540
541 entry = saved_entry;
542 list_for_each_entry_reverse(entry, &memtype_list, nd) {
543 if (entry->start == start && entry->end == end) {
544 rb_erase(&entry->rb, &memtype_rbroot);
545 list_del(&entry->nd);
546 kfree(entry);
547 err = 0;
548 break;
549 } else if (entry->start < start) {
550 break;
551 }
552 }
553unlock_ret:
554 spin_unlock(&memtype_lock); 360 spin_unlock(&memtype_lock);
555 361
556 if (err) { 362 if (!entry) {
557 printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", 363 printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
558 current->comm, current->pid, start, end); 364 current->comm, current->pid, start, end);
365 return -EINVAL;
559 } 366 }
560 367
368 kfree(entry);
369
561 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 370 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
562 371
563 return err; 372 return 0;
564} 373}
565 374
566 375
@@ -583,10 +392,8 @@ static unsigned long lookup_memtype(u64 paddr)
583 392
584 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 393 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
585 struct page *page; 394 struct page *page;
586 spin_lock(&memtype_lock);
587 page = pfn_to_page(paddr >> PAGE_SHIFT); 395 page = pfn_to_page(paddr >> PAGE_SHIFT);
588 rettype = get_page_memtype(page); 396 rettype = get_page_memtype(page);
589 spin_unlock(&memtype_lock);
590 /* 397 /*
591 * -1 from get_page_memtype() implies RAM page is in its 398 * -1 from get_page_memtype() implies RAM page is in its
592 * default state and not reserved, and hence of type WB 399 * default state and not reserved, and hence of type WB
@@ -599,7 +406,7 @@ static unsigned long lookup_memtype(u64 paddr)
599 406
600 spin_lock(&memtype_lock); 407 spin_lock(&memtype_lock);
601 408
602 entry = memtype_rb_search(&memtype_rbroot, paddr); 409 entry = rbt_memtype_lookup(paddr);
603 if (entry != NULL) 410 if (entry != NULL)
604 rettype = entry->type; 411 rettype = entry->type;
605 else 412 else
@@ -936,29 +743,25 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine);
936 743
937#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) 744#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
938 745
939/* get Nth element of the linked list */
940static struct memtype *memtype_get_idx(loff_t pos) 746static struct memtype *memtype_get_idx(loff_t pos)
941{ 747{
942 struct memtype *list_node, *print_entry; 748 struct memtype *print_entry;
943 int i = 1; 749 int ret;
944 750
945 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); 751 print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
946 if (!print_entry) 752 if (!print_entry)
947 return NULL; 753 return NULL;
948 754
949 spin_lock(&memtype_lock); 755 spin_lock(&memtype_lock);
950 list_for_each_entry(list_node, &memtype_list, nd) { 756 ret = rbt_memtype_copy_nth_element(print_entry, pos);
951 if (pos == i) {
952 *print_entry = *list_node;
953 spin_unlock(&memtype_lock);
954 return print_entry;
955 }
956 ++i;
957 }
958 spin_unlock(&memtype_lock); 757 spin_unlock(&memtype_lock);
959 kfree(print_entry);
960 758
961 return NULL; 759 if (!ret) {
760 return print_entry;
761 } else {
762 kfree(print_entry);
763 return NULL;
764 }
962} 765}
963 766
964static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) 767static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
new file mode 100644
index 00000000000..77e5ba153fa
--- /dev/null
+++ b/arch/x86/mm/pat_internal.h
@@ -0,0 +1,46 @@
1#ifndef __PAT_INTERNAL_H_
2#define __PAT_INTERNAL_H_
3
4extern int pat_debug_enable;
5
6#define dprintk(fmt, arg...) \
7 do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
8
9struct memtype {
10 u64 start;
11 u64 end;
12 u64 subtree_max_end;
13 unsigned long type;
14 struct rb_node rb;
15};
16
17static inline char *cattr_name(unsigned long flags)
18{
19 switch (flags & _PAGE_CACHE_MASK) {
20 case _PAGE_CACHE_UC: return "uncached";
21 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
22 case _PAGE_CACHE_WB: return "write-back";
23 case _PAGE_CACHE_WC: return "write-combining";
24 default: return "broken";
25 }
26}
27
28#ifdef CONFIG_X86_PAT
29extern int rbt_memtype_check_insert(struct memtype *new,
30 unsigned long *new_type);
31extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
32extern struct memtype *rbt_memtype_lookup(u64 addr);
33extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
34#else
35static inline int rbt_memtype_check_insert(struct memtype *new,
36 unsigned long *new_type)
37{ return 0; }
38static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
39{ return NULL; }
40static inline struct memtype *rbt_memtype_lookup(u64 addr)
41{ return NULL; }
42static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
43{ return 0; }
44#endif
45
46#endif /* __PAT_INTERNAL_H_ */
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
new file mode 100644
index 00000000000..f537087bb74
--- /dev/null
+++ b/arch/x86/mm/pat_rbtree.c
@@ -0,0 +1,274 @@
1/*
2 * Handle caching attributes in page tables (PAT)
3 *
4 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
5 * Suresh B Siddha <suresh.b.siddha@intel.com>
6 *
7 * Interval tree (augmented rbtree) used to store the PAT memory type
8 * reservations.
9 */
10
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kernel.h>
14#include <linux/module.h>
15#include <linux/rbtree.h>
16#include <linux/sched.h>
17#include <linux/gfp.h>
18
19#include <asm/pgtable.h>
20#include <asm/pat.h>
21
22#include "pat_internal.h"
23
24/*
25 * The memtype tree keeps track of memory type for specific
26 * physical memory areas. Without proper tracking, conflicting memory
27 * types in different mappings can cause CPU cache corruption.
28 *
29 * The tree is an interval tree (augmented rbtree) with tree ordered
30 * on starting address. Tree can contain multiple entries for
31 * different regions which overlap. All the aliases have the same
32 * cache attributes of course.
33 *
34 * memtype_lock protects the rbtree.
35 */
36
37static void memtype_rb_augment_cb(struct rb_node *node);
38static struct rb_root memtype_rbroot = RB_AUGMENT_ROOT(&memtype_rb_augment_cb);
39
40static int is_node_overlap(struct memtype *node, u64 start, u64 end)
41{
42 if (node->start >= end || node->end <= start)
43 return 0;
44
45 return 1;
46}
47
48static u64 get_subtree_max_end(struct rb_node *node)
49{
50 u64 ret = 0;
51 if (node) {
52 struct memtype *data = container_of(node, struct memtype, rb);
53 ret = data->subtree_max_end;
54 }
55 return ret;
56}
57
58/* Update 'subtree_max_end' for a node, based on node and its children */
59static void update_node_max_end(struct rb_node *node)
60{
61 struct memtype *data;
62 u64 max_end, child_max_end;
63
64 if (!node)
65 return;
66
67 data = container_of(node, struct memtype, rb);
68 max_end = data->end;
69
70 child_max_end = get_subtree_max_end(node->rb_right);
71 if (child_max_end > max_end)
72 max_end = child_max_end;
73
74 child_max_end = get_subtree_max_end(node->rb_left);
75 if (child_max_end > max_end)
76 max_end = child_max_end;
77
78 data->subtree_max_end = max_end;
79}
80
81/* Update 'subtree_max_end' for a node and all its ancestors */
82static void update_path_max_end(struct rb_node *node)
83{
84 u64 old_max_end, new_max_end;
85
86 while (node) {
87 struct memtype *data = container_of(node, struct memtype, rb);
88
89 old_max_end = data->subtree_max_end;
90 update_node_max_end(node);
91 new_max_end = data->subtree_max_end;
92
93 if (new_max_end == old_max_end)
94 break;
95
96 node = rb_parent(node);
97 }
98}
99
100/* Find the first (lowest start addr) overlapping range from rb tree */
101static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
102 u64 start, u64 end)
103{
104 struct rb_node *node = root->rb_node;
105 struct memtype *last_lower = NULL;
106
107 while (node) {
108 struct memtype *data = container_of(node, struct memtype, rb);
109
110 if (get_subtree_max_end(node->rb_left) > start) {
111 /* Lowest overlap if any must be on left side */
112 node = node->rb_left;
113 } else if (is_node_overlap(data, start, end)) {
114 last_lower = data;
115 break;
116 } else if (start >= data->start) {
117 /* Lowest overlap if any must be on right side */
118 node = node->rb_right;
119 } else {
120 break;
121 }
122 }
123 return last_lower; /* Returns NULL if there is no overlap */
124}
125
126static struct memtype *memtype_rb_exact_match(struct rb_root *root,
127 u64 start, u64 end)
128{
129 struct memtype *match;
130
131 match = memtype_rb_lowest_match(root, start, end);
132 while (match != NULL && match->start < end) {
133 struct rb_node *node;
134
135 if (match->start == start && match->end == end)
136 return match;
137
138 node = rb_next(&match->rb);
139 if (node)
140 match = container_of(node, struct memtype, rb);
141 else
142 match = NULL;
143 }
144
145 return NULL; /* Returns NULL if there is no exact match */
146}
147
148static int memtype_rb_check_conflict(struct rb_root *root,
149 u64 start, u64 end,
150 unsigned long reqtype, unsigned long *newtype)
151{
152 struct rb_node *node;
153 struct memtype *match;
154 int found_type = reqtype;
155
156 match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
157 if (match == NULL)
158 goto success;
159
160 if (match->type != found_type && newtype == NULL)
161 goto failure;
162
163 dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
164 found_type = match->type;
165
166 node = rb_next(&match->rb);
167 while (node) {
168 match = container_of(node, struct memtype, rb);
169
170 if (match->start >= end) /* Checked all possible matches */
171 goto success;
172
173 if (is_node_overlap(match, start, end) &&
174 match->type != found_type) {
175 goto failure;
176 }
177
178 node = rb_next(&match->rb);
179 }
180success:
181 if (newtype)
182 *newtype = found_type;
183
184 return 0;
185
186failure:
187 printk(KERN_INFO "%s:%d conflicting memory types "
188 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
189 end, cattr_name(found_type), cattr_name(match->type));
190 return -EBUSY;
191}
192
193static void memtype_rb_augment_cb(struct rb_node *node)
194{
195 if (node)
196 update_path_max_end(node);
197}
198
199static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
200{
201 struct rb_node **node = &(root->rb_node);
202 struct rb_node *parent = NULL;
203
204 while (*node) {
205 struct memtype *data = container_of(*node, struct memtype, rb);
206
207 parent = *node;
208 if (newdata->start <= data->start)
209 node = &((*node)->rb_left);
210 else if (newdata->start > data->start)
211 node = &((*node)->rb_right);
212 }
213
214 rb_link_node(&newdata->rb, parent, node);
215 rb_insert_color(&newdata->rb, root);
216}
217
218int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
219{
220 int err = 0;
221
222 err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
223 new->type, ret_type);
224
225 if (!err) {
226 if (ret_type)
227 new->type = *ret_type;
228
229 memtype_rb_insert(&memtype_rbroot, new);
230 }
231 return err;
232}
233
234struct memtype *rbt_memtype_erase(u64 start, u64 end)
235{
236 struct memtype *data;
237
238 data = memtype_rb_exact_match(&memtype_rbroot, start, end);
239 if (!data)
240 goto out;
241
242 rb_erase(&data->rb, &memtype_rbroot);
243out:
244 return data;
245}
246
247struct memtype *rbt_memtype_lookup(u64 addr)
248{
249 struct memtype *data;
250 data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
251 return data;
252}
253
254#if defined(CONFIG_DEBUG_FS)
255int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
256{
257 struct rb_node *node;
258 int i = 1;
259
260 node = rb_first(&memtype_rbroot);
261 while (node && pos != i) {
262 node = rb_next(node);
263 i++;
264 }
265
266 if (node) { /* pos == i */
267 struct memtype *this = container_of(node, struct memtype, rb);
268 *out = *this;
269 return 0;
270 } else {
271 return 1;
272 }
273}
274#endif
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index df3d5c861cd..308e32570d8 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -34,7 +34,7 @@
34/* IA32 Manual 3, 2-1 */ 34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = { 35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, 36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67 37 0x65, 0x66, 0x67
38}; 38};
39/* IA32 Manual 3, 3-432*/ 39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = { 40static unsigned int reg_rop[] = {
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 792854003ed..cac71849925 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -9,7 +9,6 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/quicklist.h>
13 12
14#include <asm/system.h> 13#include <asm/system.h>
15#include <asm/pgtable.h> 14#include <asm/pgtable.h>
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648..f9897f7a9ef 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -363,6 +363,54 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
363 for (i = 0; i < MAX_NUMNODES; i++) 363 for (i = 0; i < MAX_NUMNODES; i++)
364 cutoff_node(i, start, end); 364 cutoff_node(i, start, end);
365 365
366 /*
367 * Join together blocks on the same node, holes between
368 * which don't overlap with memory on other nodes.
369 */
370 for (i = 0; i < num_node_memblks; ++i) {
371 int j, k;
372
373 for (j = i + 1; j < num_node_memblks; ++j) {
374 unsigned long start, end;
375
376 if (memblk_nodeid[i] != memblk_nodeid[j])
377 continue;
378 start = min(node_memblk_range[i].end,
379 node_memblk_range[j].end);
380 end = max(node_memblk_range[i].start,
381 node_memblk_range[j].start);
382 for (k = 0; k < num_node_memblks; ++k) {
383 if (memblk_nodeid[i] == memblk_nodeid[k])
384 continue;
385 if (start < node_memblk_range[k].end &&
386 end > node_memblk_range[k].start)
387 break;
388 }
389 if (k < num_node_memblks)
390 continue;
391 start = min(node_memblk_range[i].start,
392 node_memblk_range[j].start);
393 end = max(node_memblk_range[i].end,
394 node_memblk_range[j].end);
395 printk(KERN_INFO "SRAT: Node %d "
396 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
397 memblk_nodeid[i],
398 node_memblk_range[i].start,
399 node_memblk_range[i].end,
400 node_memblk_range[j].start,
401 node_memblk_range[j].end,
402 start, end);
403 node_memblk_range[i].start = start;
404 node_memblk_range[i].end = end;
405 k = --num_node_memblks - j;
406 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
407 k * sizeof(*memblk_nodeid));
408 memmove(node_memblk_range + j, node_memblk_range + j+1,
409 k * sizeof(*node_memblk_range));
410 --j;
411 }
412 }
413
366 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 414 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
367 memblk_nodeid); 415 memblk_nodeid);
368 if (memnode_shift < 0) { 416 if (memnode_shift < 0) {
@@ -461,7 +509,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
461 * node, it must now point to the fake node ID. 509 * node, it must now point to the fake node ID.
462 */ 510 */
463 for (j = 0; j < MAX_LOCAL_APIC; j++) 511 for (j = 0; j < MAX_LOCAL_APIC; j++)
464 if (apicid_to_node[j] == nid) 512 if (apicid_to_node[j] == nid &&
513 fake_apicid_to_node[j] == NUMA_NO_NODE)
465 fake_apicid_to_node[j] = i; 514 fake_apicid_to_node[j] = i;
466 } 515 }
467 for (i = 0; i < num_nodes; i++) 516 for (i = 0; i < num_nodes; i++)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2c505ee7101..b28d2f1253b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -31,8 +31,9 @@ static struct op_x86_model_spec *model;
31static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); 31static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
32static DEFINE_PER_CPU(unsigned long, saved_lvtpc); 32static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
33 33
34/* 0 == registered but off, 1 == registered and on */ 34/* must be protected with get_online_cpus()/put_online_cpus(): */
35static int nmi_enabled = 0; 35static int nmi_enabled;
36static int ctr_running;
36 37
37struct op_counter_config counter_config[OP_MAX_COUNTER]; 38struct op_counter_config counter_config[OP_MAX_COUNTER];
38 39
@@ -61,12 +62,16 @@ static int profile_exceptions_notify(struct notifier_block *self,
61{ 62{
62 struct die_args *args = (struct die_args *)data; 63 struct die_args *args = (struct die_args *)data;
63 int ret = NOTIFY_DONE; 64 int ret = NOTIFY_DONE;
64 int cpu = smp_processor_id();
65 65
66 switch (val) { 66 switch (val) {
67 case DIE_NMI: 67 case DIE_NMI:
68 case DIE_NMI_IPI: 68 case DIE_NMI_IPI:
69 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); 69 if (ctr_running)
70 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
71 else if (!nmi_enabled)
72 break;
73 else
74 model->stop(&__get_cpu_var(cpu_msrs));
70 ret = NOTIFY_STOP; 75 ret = NOTIFY_STOP;
71 break; 76 break;
72 default: 77 default:
@@ -95,24 +100,36 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
95static void nmi_cpu_start(void *dummy) 100static void nmi_cpu_start(void *dummy)
96{ 101{
97 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); 102 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
98 model->start(msrs); 103 if (!msrs->controls)
104 WARN_ON_ONCE(1);
105 else
106 model->start(msrs);
99} 107}
100 108
101static int nmi_start(void) 109static int nmi_start(void)
102{ 110{
111 get_online_cpus();
103 on_each_cpu(nmi_cpu_start, NULL, 1); 112 on_each_cpu(nmi_cpu_start, NULL, 1);
113 ctr_running = 1;
114 put_online_cpus();
104 return 0; 115 return 0;
105} 116}
106 117
107static void nmi_cpu_stop(void *dummy) 118static void nmi_cpu_stop(void *dummy)
108{ 119{
109 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); 120 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
110 model->stop(msrs); 121 if (!msrs->controls)
122 WARN_ON_ONCE(1);
123 else
124 model->stop(msrs);
111} 125}
112 126
113static void nmi_stop(void) 127static void nmi_stop(void)
114{ 128{
129 get_online_cpus();
115 on_each_cpu(nmi_cpu_stop, NULL, 1); 130 on_each_cpu(nmi_cpu_stop, NULL, 1);
131 ctr_running = 0;
132 put_online_cpus();
116} 133}
117 134
118#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 135#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
@@ -252,7 +269,10 @@ static int nmi_switch_event(void)
252 if (nmi_multiplex_on() < 0) 269 if (nmi_multiplex_on() < 0)
253 return -EINVAL; /* not necessary */ 270 return -EINVAL; /* not necessary */
254 271
255 on_each_cpu(nmi_cpu_switch, NULL, 1); 272 get_online_cpus();
273 if (ctr_running)
274 on_each_cpu(nmi_cpu_switch, NULL, 1);
275 put_online_cpus();
256 276
257 return 0; 277 return 0;
258} 278}
@@ -295,6 +315,7 @@ static void free_msrs(void)
295 kfree(per_cpu(cpu_msrs, i).controls); 315 kfree(per_cpu(cpu_msrs, i).controls);
296 per_cpu(cpu_msrs, i).controls = NULL; 316 per_cpu(cpu_msrs, i).controls = NULL;
297 } 317 }
318 nmi_shutdown_mux();
298} 319}
299 320
300static int allocate_msrs(void) 321static int allocate_msrs(void)
@@ -307,14 +328,21 @@ static int allocate_msrs(void)
307 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, 328 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
308 GFP_KERNEL); 329 GFP_KERNEL);
309 if (!per_cpu(cpu_msrs, i).counters) 330 if (!per_cpu(cpu_msrs, i).counters)
310 return 0; 331 goto fail;
311 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, 332 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
312 GFP_KERNEL); 333 GFP_KERNEL);
313 if (!per_cpu(cpu_msrs, i).controls) 334 if (!per_cpu(cpu_msrs, i).controls)
314 return 0; 335 goto fail;
315 } 336 }
316 337
338 if (!nmi_setup_mux())
339 goto fail;
340
317 return 1; 341 return 1;
342
343fail:
344 free_msrs();
345 return 0;
318} 346}
319 347
320static void nmi_cpu_setup(void *dummy) 348static void nmi_cpu_setup(void *dummy)
@@ -336,49 +364,6 @@ static struct notifier_block profile_exceptions_nb = {
336 .priority = 2 364 .priority = 2
337}; 365};
338 366
339static int nmi_setup(void)
340{
341 int err = 0;
342 int cpu;
343
344 if (!allocate_msrs())
345 err = -ENOMEM;
346 else if (!nmi_setup_mux())
347 err = -ENOMEM;
348 else
349 err = register_die_notifier(&profile_exceptions_nb);
350
351 if (err) {
352 free_msrs();
353 nmi_shutdown_mux();
354 return err;
355 }
356
357 /* We need to serialize save and setup for HT because the subset
358 * of msrs are distinct for save and setup operations
359 */
360
361 /* Assume saved/restored counters are the same on all CPUs */
362 model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
363 for_each_possible_cpu(cpu) {
364 if (!cpu)
365 continue;
366
367 memcpy(per_cpu(cpu_msrs, cpu).counters,
368 per_cpu(cpu_msrs, 0).counters,
369 sizeof(struct op_msr) * model->num_counters);
370
371 memcpy(per_cpu(cpu_msrs, cpu).controls,
372 per_cpu(cpu_msrs, 0).controls,
373 sizeof(struct op_msr) * model->num_controls);
374
375 mux_clone(cpu);
376 }
377 on_each_cpu(nmi_cpu_setup, NULL, 1);
378 nmi_enabled = 1;
379 return 0;
380}
381
382static void nmi_cpu_restore_registers(struct op_msrs *msrs) 367static void nmi_cpu_restore_registers(struct op_msrs *msrs)
383{ 368{
384 struct op_msr *counters = msrs->counters; 369 struct op_msr *counters = msrs->counters;
@@ -412,20 +397,24 @@ static void nmi_cpu_shutdown(void *dummy)
412 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); 397 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
413 apic_write(APIC_LVTERR, v); 398 apic_write(APIC_LVTERR, v);
414 nmi_cpu_restore_registers(msrs); 399 nmi_cpu_restore_registers(msrs);
400 if (model->cpu_down)
401 model->cpu_down();
415} 402}
416 403
417static void nmi_shutdown(void) 404static void nmi_cpu_up(void *dummy)
418{ 405{
419 struct op_msrs *msrs; 406 if (nmi_enabled)
407 nmi_cpu_setup(dummy);
408 if (ctr_running)
409 nmi_cpu_start(dummy);
410}
420 411
421 nmi_enabled = 0; 412static void nmi_cpu_down(void *dummy)
422 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 413{
423 unregister_die_notifier(&profile_exceptions_nb); 414 if (ctr_running)
424 nmi_shutdown_mux(); 415 nmi_cpu_stop(dummy);
425 msrs = &get_cpu_var(cpu_msrs); 416 if (nmi_enabled)
426 model->shutdown(msrs); 417 nmi_cpu_shutdown(dummy);
427 free_msrs();
428 put_cpu_var(cpu_msrs);
429} 418}
430 419
431static int nmi_create_files(struct super_block *sb, struct dentry *root) 420static int nmi_create_files(struct super_block *sb, struct dentry *root)
@@ -457,7 +446,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
457 return 0; 446 return 0;
458} 447}
459 448
460#ifdef CONFIG_SMP
461static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, 449static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
462 void *data) 450 void *data)
463{ 451{
@@ -465,10 +453,10 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
465 switch (action) { 453 switch (action) {
466 case CPU_DOWN_FAILED: 454 case CPU_DOWN_FAILED:
467 case CPU_ONLINE: 455 case CPU_ONLINE:
468 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); 456 smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
469 break; 457 break;
470 case CPU_DOWN_PREPARE: 458 case CPU_DOWN_PREPARE:
471 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); 459 smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
472 break; 460 break;
473 } 461 }
474 return NOTIFY_DONE; 462 return NOTIFY_DONE;
@@ -477,7 +465,75 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
477static struct notifier_block oprofile_cpu_nb = { 465static struct notifier_block oprofile_cpu_nb = {
478 .notifier_call = oprofile_cpu_notifier 466 .notifier_call = oprofile_cpu_notifier
479}; 467};
480#endif 468
469static int nmi_setup(void)
470{
471 int err = 0;
472 int cpu;
473
474 if (!allocate_msrs())
475 return -ENOMEM;
476
477 /* We need to serialize save and setup for HT because the subset
478 * of msrs are distinct for save and setup operations
479 */
480
481 /* Assume saved/restored counters are the same on all CPUs */
482 err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
483 if (err)
484 goto fail;
485
486 for_each_possible_cpu(cpu) {
487 if (!cpu)
488 continue;
489
490 memcpy(per_cpu(cpu_msrs, cpu).counters,
491 per_cpu(cpu_msrs, 0).counters,
492 sizeof(struct op_msr) * model->num_counters);
493
494 memcpy(per_cpu(cpu_msrs, cpu).controls,
495 per_cpu(cpu_msrs, 0).controls,
496 sizeof(struct op_msr) * model->num_controls);
497
498 mux_clone(cpu);
499 }
500
501 nmi_enabled = 0;
502 ctr_running = 0;
503 barrier();
504 err = register_die_notifier(&profile_exceptions_nb);
505 if (err)
506 goto fail;
507
508 get_online_cpus();
509 register_cpu_notifier(&oprofile_cpu_nb);
510 on_each_cpu(nmi_cpu_setup, NULL, 1);
511 nmi_enabled = 1;
512 put_online_cpus();
513
514 return 0;
515fail:
516 free_msrs();
517 return err;
518}
519
520static void nmi_shutdown(void)
521{
522 struct op_msrs *msrs;
523
524 get_online_cpus();
525 unregister_cpu_notifier(&oprofile_cpu_nb);
526 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
527 nmi_enabled = 0;
528 ctr_running = 0;
529 put_online_cpus();
530 barrier();
531 unregister_die_notifier(&profile_exceptions_nb);
532 msrs = &get_cpu_var(cpu_msrs);
533 model->shutdown(msrs);
534 free_msrs();
535 put_cpu_var(cpu_msrs);
536}
481 537
482#ifdef CONFIG_PM 538#ifdef CONFIG_PM
483 539
@@ -687,9 +743,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
687 return -ENODEV; 743 return -ENODEV;
688 } 744 }
689 745
690#ifdef CONFIG_SMP
691 register_cpu_notifier(&oprofile_cpu_nb);
692#endif
693 /* default values, can be overwritten by model */ 746 /* default values, can be overwritten by model */
694 ops->create_files = nmi_create_files; 747 ops->create_files = nmi_create_files;
695 ops->setup = nmi_setup; 748 ops->setup = nmi_setup;
@@ -716,12 +769,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
716 769
717void op_nmi_exit(void) 770void op_nmi_exit(void)
718{ 771{
719 if (using_nmi) { 772 if (using_nmi)
720 exit_sysfs(); 773 exit_sysfs();
721#ifdef CONFIG_SMP
722 unregister_cpu_notifier(&oprofile_cpu_nb);
723#endif
724 }
725 if (model->exit)
726 model->exit();
727} 774}
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 090cbbec7db..b67a6b5aa8d 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -30,13 +30,10 @@
30#include "op_counter.h" 30#include "op_counter.h"
31 31
32#define NUM_COUNTERS 4 32#define NUM_COUNTERS 4
33#define NUM_CONTROLS 4
34#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 33#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
35#define NUM_VIRT_COUNTERS 32 34#define NUM_VIRT_COUNTERS 32
36#define NUM_VIRT_CONTROLS 32
37#else 35#else
38#define NUM_VIRT_COUNTERS NUM_COUNTERS 36#define NUM_VIRT_COUNTERS NUM_COUNTERS
39#define NUM_VIRT_CONTROLS NUM_CONTROLS
40#endif 37#endif
41 38
42#define OP_EVENT_MASK 0x0FFF 39#define OP_EVENT_MASK 0x0FFF
@@ -105,102 +102,6 @@ static u32 get_ibs_caps(void)
105 return ibs_caps; 102 return ibs_caps;
106} 103}
107 104
108#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
109
110static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
111 struct op_msrs const * const msrs)
112{
113 u64 val;
114 int i;
115
116 /* enable active counters */
117 for (i = 0; i < NUM_COUNTERS; ++i) {
118 int virt = op_x86_phys_to_virt(i);
119 if (!reset_value[virt])
120 continue;
121 rdmsrl(msrs->controls[i].addr, val);
122 val &= model->reserved;
123 val |= op_x86_get_ctrl(model, &counter_config[virt]);
124 wrmsrl(msrs->controls[i].addr, val);
125 }
126}
127
128#endif
129
130/* functions for op_amd_spec */
131
132static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
133{
134 int i;
135
136 for (i = 0; i < NUM_COUNTERS; i++) {
137 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
138 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
139 }
140
141 for (i = 0; i < NUM_CONTROLS; i++) {
142 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
143 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
144 }
145}
146
147static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
148 struct op_msrs const * const msrs)
149{
150 u64 val;
151 int i;
152
153 /* setup reset_value */
154 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
155 if (counter_config[i].enabled
156 && msrs->counters[op_x86_virt_to_phys(i)].addr)
157 reset_value[i] = counter_config[i].count;
158 else
159 reset_value[i] = 0;
160 }
161
162 /* clear all counters */
163 for (i = 0; i < NUM_CONTROLS; ++i) {
164 if (unlikely(!msrs->controls[i].addr)) {
165 if (counter_config[i].enabled && !smp_processor_id())
166 /*
167 * counter is reserved, this is on all
168 * cpus, so report only for cpu #0
169 */
170 op_x86_warn_reserved(i);
171 continue;
172 }
173 rdmsrl(msrs->controls[i].addr, val);
174 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
175 op_x86_warn_in_use(i);
176 val &= model->reserved;
177 wrmsrl(msrs->controls[i].addr, val);
178 }
179
180 /* avoid a false detection of ctr overflows in NMI handler */
181 for (i = 0; i < NUM_COUNTERS; ++i) {
182 if (unlikely(!msrs->counters[i].addr))
183 continue;
184 wrmsrl(msrs->counters[i].addr, -1LL);
185 }
186
187 /* enable active counters */
188 for (i = 0; i < NUM_COUNTERS; ++i) {
189 int virt = op_x86_phys_to_virt(i);
190 if (!reset_value[virt])
191 continue;
192
193 /* setup counter registers */
194 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
195
196 /* setup control registers */
197 rdmsrl(msrs->controls[i].addr, val);
198 val &= model->reserved;
199 val |= op_x86_get_ctrl(model, &counter_config[virt]);
200 wrmsrl(msrs->controls[i].addr, val);
201 }
202}
203
204/* 105/*
205 * 16-bit Linear Feedback Shift Register (LFSR) 106 * 16-bit Linear Feedback Shift Register (LFSR)
206 * 107 *
@@ -365,6 +266,125 @@ static void op_amd_stop_ibs(void)
365 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 266 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
366} 267}
367 268
269#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
270
271static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
272 struct op_msrs const * const msrs)
273{
274 u64 val;
275 int i;
276
277 /* enable active counters */
278 for (i = 0; i < NUM_COUNTERS; ++i) {
279 int virt = op_x86_phys_to_virt(i);
280 if (!reset_value[virt])
281 continue;
282 rdmsrl(msrs->controls[i].addr, val);
283 val &= model->reserved;
284 val |= op_x86_get_ctrl(model, &counter_config[virt]);
285 wrmsrl(msrs->controls[i].addr, val);
286 }
287}
288
289#endif
290
291/* functions for op_amd_spec */
292
293static void op_amd_shutdown(struct op_msrs const * const msrs)
294{
295 int i;
296
297 for (i = 0; i < NUM_COUNTERS; ++i) {
298 if (!msrs->counters[i].addr)
299 continue;
300 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
301 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
302 }
303}
304
305static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
306{
307 int i;
308
309 for (i = 0; i < NUM_COUNTERS; i++) {
310 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
311 goto fail;
312 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
313 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
314 goto fail;
315 }
316 /* both registers must be reserved */
317 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
318 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
319 continue;
320 fail:
321 if (!counter_config[i].enabled)
322 continue;
323 op_x86_warn_reserved(i);
324 op_amd_shutdown(msrs);
325 return -EBUSY;
326 }
327
328 return 0;
329}
330
331static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
332 struct op_msrs const * const msrs)
333{
334 u64 val;
335 int i;
336
337 /* setup reset_value */
338 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
339 if (counter_config[i].enabled
340 && msrs->counters[op_x86_virt_to_phys(i)].addr)
341 reset_value[i] = counter_config[i].count;
342 else
343 reset_value[i] = 0;
344 }
345
346 /* clear all counters */
347 for (i = 0; i < NUM_COUNTERS; ++i) {
348 if (!msrs->controls[i].addr)
349 continue;
350 rdmsrl(msrs->controls[i].addr, val);
351 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
352 op_x86_warn_in_use(i);
353 val &= model->reserved;
354 wrmsrl(msrs->controls[i].addr, val);
355 /*
356 * avoid a false detection of ctr overflows in NMI
357 * handler
358 */
359 wrmsrl(msrs->counters[i].addr, -1LL);
360 }
361
362 /* enable active counters */
363 for (i = 0; i < NUM_COUNTERS; ++i) {
364 int virt = op_x86_phys_to_virt(i);
365 if (!reset_value[virt])
366 continue;
367
368 /* setup counter registers */
369 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
370
371 /* setup control registers */
372 rdmsrl(msrs->controls[i].addr, val);
373 val &= model->reserved;
374 val |= op_x86_get_ctrl(model, &counter_config[virt]);
375 wrmsrl(msrs->controls[i].addr, val);
376 }
377
378 if (ibs_caps)
379 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
380}
381
382static void op_amd_cpu_shutdown(void)
383{
384 if (ibs_caps)
385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
386}
387
368static int op_amd_check_ctrs(struct pt_regs * const regs, 388static int op_amd_check_ctrs(struct pt_regs * const regs,
369 struct op_msrs const * const msrs) 389 struct op_msrs const * const msrs)
370{ 390{
@@ -425,42 +445,16 @@ static void op_amd_stop(struct op_msrs const * const msrs)
425 op_amd_stop_ibs(); 445 op_amd_stop_ibs();
426} 446}
427 447
428static void op_amd_shutdown(struct op_msrs const * const msrs) 448static int __init_ibs_nmi(void)
429{
430 int i;
431
432 for (i = 0; i < NUM_COUNTERS; ++i) {
433 if (msrs->counters[i].addr)
434 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
435 }
436 for (i = 0; i < NUM_CONTROLS; ++i) {
437 if (msrs->controls[i].addr)
438 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
439 }
440}
441
442static u8 ibs_eilvt_off;
443
444static inline void apic_init_ibs_nmi_per_cpu(void *arg)
445{
446 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
447}
448
449static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
450{
451 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
452}
453
454static int init_ibs_nmi(void)
455{ 449{
456#define IBSCTL_LVTOFFSETVAL (1 << 8) 450#define IBSCTL_LVTOFFSETVAL (1 << 8)
457#define IBSCTL 0x1cc 451#define IBSCTL 0x1cc
458 struct pci_dev *cpu_cfg; 452 struct pci_dev *cpu_cfg;
459 int nodes; 453 int nodes;
460 u32 value = 0; 454 u32 value = 0;
455 u8 ibs_eilvt_off;
461 456
462 /* per CPU setup */ 457 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
463 on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
464 458
465 nodes = 0; 459 nodes = 0;
466 cpu_cfg = NULL; 460 cpu_cfg = NULL;
@@ -490,22 +484,15 @@ static int init_ibs_nmi(void)
490 return 0; 484 return 0;
491} 485}
492 486
493/* uninitialize the APIC for the IBS interrupts if needed */
494static void clear_ibs_nmi(void)
495{
496 if (ibs_caps)
497 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
498}
499
500/* initialize the APIC for the IBS interrupts if available */ 487/* initialize the APIC for the IBS interrupts if available */
501static void ibs_init(void) 488static void init_ibs(void)
502{ 489{
503 ibs_caps = get_ibs_caps(); 490 ibs_caps = get_ibs_caps();
504 491
505 if (!ibs_caps) 492 if (!ibs_caps)
506 return; 493 return;
507 494
508 if (init_ibs_nmi()) { 495 if (__init_ibs_nmi()) {
509 ibs_caps = 0; 496 ibs_caps = 0;
510 return; 497 return;
511 } 498 }
@@ -514,14 +501,6 @@ static void ibs_init(void)
514 (unsigned)ibs_caps); 501 (unsigned)ibs_caps);
515} 502}
516 503
517static void ibs_exit(void)
518{
519 if (!ibs_caps)
520 return;
521
522 clear_ibs_nmi();
523}
524
525static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 504static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
526 505
527static int setup_ibs_files(struct super_block *sb, struct dentry *root) 506static int setup_ibs_files(struct super_block *sb, struct dentry *root)
@@ -570,27 +549,22 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
570 549
571static int op_amd_init(struct oprofile_operations *ops) 550static int op_amd_init(struct oprofile_operations *ops)
572{ 551{
573 ibs_init(); 552 init_ibs();
574 create_arch_files = ops->create_files; 553 create_arch_files = ops->create_files;
575 ops->create_files = setup_ibs_files; 554 ops->create_files = setup_ibs_files;
576 return 0; 555 return 0;
577} 556}
578 557
579static void op_amd_exit(void)
580{
581 ibs_exit();
582}
583
584struct op_x86_model_spec op_amd_spec = { 558struct op_x86_model_spec op_amd_spec = {
585 .num_counters = NUM_COUNTERS, 559 .num_counters = NUM_COUNTERS,
586 .num_controls = NUM_CONTROLS, 560 .num_controls = NUM_COUNTERS,
587 .num_virt_counters = NUM_VIRT_COUNTERS, 561 .num_virt_counters = NUM_VIRT_COUNTERS,
588 .reserved = MSR_AMD_EVENTSEL_RESERVED, 562 .reserved = MSR_AMD_EVENTSEL_RESERVED,
589 .event_mask = OP_EVENT_MASK, 563 .event_mask = OP_EVENT_MASK,
590 .init = op_amd_init, 564 .init = op_amd_init,
591 .exit = op_amd_exit,
592 .fill_in_addresses = &op_amd_fill_in_addresses, 565 .fill_in_addresses = &op_amd_fill_in_addresses,
593 .setup_ctrs = &op_amd_setup_ctrs, 566 .setup_ctrs = &op_amd_setup_ctrs,
567 .cpu_down = &op_amd_cpu_shutdown,
594 .check_ctrs = &op_amd_check_ctrs, 568 .check_ctrs = &op_amd_check_ctrs,
595 .start = &op_amd_start, 569 .start = &op_amd_start,
596 .stop = &op_amd_stop, 570 .stop = &op_amd_stop,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index e6a160a4684..182558dd551 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -385,8 +385,26 @@ static unsigned int get_stagger(void)
385 385
386static unsigned long reset_value[NUM_COUNTERS_NON_HT]; 386static unsigned long reset_value[NUM_COUNTERS_NON_HT];
387 387
388static void p4_shutdown(struct op_msrs const * const msrs)
389{
390 int i;
388 391
389static void p4_fill_in_addresses(struct op_msrs * const msrs) 392 for (i = 0; i < num_counters; ++i) {
393 if (msrs->counters[i].addr)
394 release_perfctr_nmi(msrs->counters[i].addr);
395 }
396 /*
397 * some of the control registers are specially reserved in
398 * conjunction with the counter registers (hence the starting offset).
399 * This saves a few bits.
400 */
401 for (i = num_counters; i < num_controls; ++i) {
402 if (msrs->controls[i].addr)
403 release_evntsel_nmi(msrs->controls[i].addr);
404 }
405}
406
407static int p4_fill_in_addresses(struct op_msrs * const msrs)
390{ 408{
391 unsigned int i; 409 unsigned int i;
392 unsigned int addr, cccraddr, stag; 410 unsigned int addr, cccraddr, stag;
@@ -468,6 +486,18 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; 486 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
469 } 487 }
470 } 488 }
489
490 for (i = 0; i < num_counters; ++i) {
491 if (!counter_config[i].enabled)
492 continue;
493 if (msrs->controls[i].addr)
494 continue;
495 op_x86_warn_reserved(i);
496 p4_shutdown(msrs);
497 return -EBUSY;
498 }
499
500 return 0;
471} 501}
472 502
473 503
@@ -668,26 +698,6 @@ static void p4_stop(struct op_msrs const * const msrs)
668 } 698 }
669} 699}
670 700
671static void p4_shutdown(struct op_msrs const * const msrs)
672{
673 int i;
674
675 for (i = 0; i < num_counters; ++i) {
676 if (msrs->counters[i].addr)
677 release_perfctr_nmi(msrs->counters[i].addr);
678 }
679 /*
680 * some of the control registers are specially reserved in
681 * conjunction with the counter registers (hence the starting offset).
682 * This saves a few bits.
683 */
684 for (i = num_counters; i < num_controls; ++i) {
685 if (msrs->controls[i].addr)
686 release_evntsel_nmi(msrs->controls[i].addr);
687 }
688}
689
690
691#ifdef CONFIG_SMP 701#ifdef CONFIG_SMP
692struct op_x86_model_spec op_p4_ht2_spec = { 702struct op_x86_model_spec op_p4_ht2_spec = {
693 .num_counters = NUM_COUNTERS_HT2, 703 .num_counters = NUM_COUNTERS_HT2,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b..d769cda5408 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -30,19 +30,46 @@ static int counter_width = 32;
30 30
31static u64 *reset_value; 31static u64 *reset_value;
32 32
33static void ppro_fill_in_addresses(struct op_msrs * const msrs) 33static void ppro_shutdown(struct op_msrs const * const msrs)
34{ 34{
35 int i; 35 int i;
36 36
37 for (i = 0; i < num_counters; i++) { 37 for (i = 0; i < num_counters; ++i) {
38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 38 if (!msrs->counters[i].addr)
39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 39 continue;
40 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
41 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
42 }
43 if (reset_value) {
44 kfree(reset_value);
45 reset_value = NULL;
40 } 46 }
47}
48
49static int ppro_fill_in_addresses(struct op_msrs * const msrs)
50{
51 int i;
41 52
42 for (i = 0; i < num_counters; i++) { 53 for (i = 0; i < num_counters; i++) {
43 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 54 if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
44 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 55 goto fail;
56 if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
57 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
58 goto fail;
59 }
60 /* both registers must be reserved */
61 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
62 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
63 continue;
64 fail:
65 if (!counter_config[i].enabled)
66 continue;
67 op_x86_warn_reserved(i);
68 ppro_shutdown(msrs);
69 return -EBUSY;
45 } 70 }
71
72 return 0;
46} 73}
47 74
48 75
@@ -78,26 +105,17 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
78 105
79 /* clear all counters */ 106 /* clear all counters */
80 for (i = 0; i < num_counters; ++i) { 107 for (i = 0; i < num_counters; ++i) {
81 if (unlikely(!msrs->controls[i].addr)) { 108 if (!msrs->controls[i].addr)
82 if (counter_config[i].enabled && !smp_processor_id())
83 /*
84 * counter is reserved, this is on all
85 * cpus, so report only for cpu #0
86 */
87 op_x86_warn_reserved(i);
88 continue; 109 continue;
89 }
90 rdmsrl(msrs->controls[i].addr, val); 110 rdmsrl(msrs->controls[i].addr, val);
91 if (val & ARCH_PERFMON_EVENTSEL_ENABLE) 111 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
92 op_x86_warn_in_use(i); 112 op_x86_warn_in_use(i);
93 val &= model->reserved; 113 val &= model->reserved;
94 wrmsrl(msrs->controls[i].addr, val); 114 wrmsrl(msrs->controls[i].addr, val);
95 } 115 /*
96 116 * avoid a false detection of ctr overflows in NMI *
97 /* avoid a false detection of ctr overflows in NMI handler */ 117 * handler
98 for (i = 0; i < num_counters; ++i) { 118 */
99 if (unlikely(!msrs->counters[i].addr))
100 continue;
101 wrmsrl(msrs->counters[i].addr, -1LL); 119 wrmsrl(msrs->counters[i].addr, -1LL);
102 } 120 }
103 121
@@ -189,25 +207,6 @@ static void ppro_stop(struct op_msrs const * const msrs)
189 } 207 }
190} 208}
191 209
192static void ppro_shutdown(struct op_msrs const * const msrs)
193{
194 int i;
195
196 for (i = 0; i < num_counters; ++i) {
197 if (msrs->counters[i].addr)
198 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
199 }
200 for (i = 0; i < num_counters; ++i) {
201 if (msrs->controls[i].addr)
202 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
203 }
204 if (reset_value) {
205 kfree(reset_value);
206 reset_value = NULL;
207 }
208}
209
210
211struct op_x86_model_spec op_ppro_spec = { 210struct op_x86_model_spec op_ppro_spec = {
212 .num_counters = 2, 211 .num_counters = 2,
213 .num_controls = 2, 212 .num_controls = 2,
@@ -239,11 +238,11 @@ static void arch_perfmon_setup_counters(void)
239 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && 238 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
240 current_cpu_data.x86_model == 15) { 239 current_cpu_data.x86_model == 15) {
241 eax.split.version_id = 2; 240 eax.split.version_id = 2;
242 eax.split.num_events = 2; 241 eax.split.num_counters = 2;
243 eax.split.bit_width = 40; 242 eax.split.bit_width = 40;
244 } 243 }
245 244
246 num_counters = eax.split.num_events; 245 num_counters = eax.split.num_counters;
247 246
248 op_arch_perfmon_spec.num_counters = num_counters; 247 op_arch_perfmon_spec.num_counters = num_counters;
249 op_arch_perfmon_spec.num_controls = num_counters; 248 op_arch_perfmon_spec.num_controls = num_counters;
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index ff82a755edd..89017fa1fd6 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -40,10 +40,10 @@ struct op_x86_model_spec {
40 u64 reserved; 40 u64 reserved;
41 u16 event_mask; 41 u16 event_mask;
42 int (*init)(struct oprofile_operations *ops); 42 int (*init)(struct oprofile_operations *ops);
43 void (*exit)(void); 43 int (*fill_in_addresses)(struct op_msrs * const msrs);
44 void (*fill_in_addresses)(struct op_msrs * const msrs);
45 void (*setup_ctrs)(struct op_x86_model_spec const *model, 44 void (*setup_ctrs)(struct op_x86_model_spec const *model,
46 struct op_msrs const * const msrs); 45 struct op_msrs const * const msrs);
46 void (*cpu_down)(void);
47 int (*check_ctrs)(struct pt_regs * const regs, 47 int (*check_ctrs)(struct pt_regs * const regs,
48 struct op_msrs const * const msrs); 48 struct op_msrs const * const msrs);
49 void (*start)(struct op_msrs const * const msrs); 49 void (*start)(struct op_msrs const * const msrs);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index b110d97fb92..a0207a7fdf3 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,6 +18,8 @@ obj-$(CONFIG_X86_MRST) += mrst.o
18obj-y += common.o early.o 18obj-y += common.o early.o
19obj-y += amd_bus.o bus_numa.o 19obj-y += amd_bus.o bus_numa.o
20 20
21obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
22
21ifeq ($(CONFIG_PCI_DEBUG),y) 23ifeq ($(CONFIG_PCI_DEBUG),y)
22EXTRA_CFLAGS += -DDEBUG 24EXTRA_CFLAGS += -DDEBUG
23endif 25endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 31930fd30ea..2ec04c424a6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -207,10 +207,9 @@ get_current_resources(struct acpi_device *device, int busnum,
207 if (!info.res) 207 if (!info.res)
208 goto res_alloc_fail; 208 goto res_alloc_fail;
209 209
210 info.name = kmalloc(16, GFP_KERNEL); 210 info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
211 if (!info.name) 211 if (!info.name)
212 goto name_alloc_fail; 212 goto name_alloc_fail;
213 sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum);
214 213
215 info.res_num = 0; 214 info.res_num = 0;
216 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 215 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
@@ -224,8 +223,11 @@ res_alloc_fail:
224 return; 223 return;
225} 224}
226 225
227struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) 226struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
228{ 227{
228 struct acpi_device *device = root->device;
229 int domain = root->segment;
230 int busnum = root->secondary.start;
229 struct pci_bus *bus; 231 struct pci_bus *bus;
230 struct pci_sysdata *sd; 232 struct pci_sysdata *sd;
231 int node; 233 int node;
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
new file mode 100644
index 00000000000..0846a5bbbfb
--- /dev/null
+++ b/arch/x86/pci/broadcom_bus.c
@@ -0,0 +1,101 @@
1/*
2 * Read address ranges from a Broadcom CNB20LE Host Bridge
3 *
4 * Copyright (c) 2010 Ira W. Snyder <iws@ovro.caltech.edu>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2 of the License, or (at your
9 * option) any later version.
10 */
11
12#include <linux/delay.h>
13#include <linux/dmi.h>
14#include <linux/pci.h>
15#include <linux/init.h>
16#include <asm/pci_x86.h>
17
18#include "bus_numa.h"
19
20static void __devinit cnb20le_res(struct pci_dev *dev)
21{
22 struct pci_root_info *info;
23 struct resource res;
24 u16 word1, word2;
25 u8 fbus, lbus;
26 int i;
27
28 /*
29 * The x86_pci_root_bus_res_quirks() function already refuses to use
30 * this information if ACPI _CRS was used. Therefore, we don't bother
31 * checking if ACPI is enabled, and just generate the information
32 * for both the ACPI _CRS and no ACPI cases.
33 */
34
35 info = &pci_root_info[pci_root_num];
36 pci_root_num++;
37
38 /* read the PCI bus numbers */
39 pci_read_config_byte(dev, 0x44, &fbus);
40 pci_read_config_byte(dev, 0x45, &lbus);
41 info->bus_min = fbus;
42 info->bus_max = lbus;
43
44 /*
45 * Add the legacy IDE ports on bus 0
46 *
47 * These do not exist anywhere in the bridge registers, AFAICT. I do
48 * not have the datasheet, so this is the best I can do.
49 */
50 if (fbus == 0) {
51 update_res(info, 0x01f0, 0x01f7, IORESOURCE_IO, 0);
52 update_res(info, 0x03f6, 0x03f6, IORESOURCE_IO, 0);
53 update_res(info, 0x0170, 0x0177, IORESOURCE_IO, 0);
54 update_res(info, 0x0376, 0x0376, IORESOURCE_IO, 0);
55 update_res(info, 0xffa0, 0xffaf, IORESOURCE_IO, 0);
56 }
57
58 /* read the non-prefetchable memory window */
59 pci_read_config_word(dev, 0xc0, &word1);
60 pci_read_config_word(dev, 0xc2, &word2);
61 if (word1 != word2) {
62 res.start = (word1 << 16) | 0x0000;
63 res.end = (word2 << 16) | 0xffff;
64 res.flags = IORESOURCE_MEM;
65 update_res(info, res.start, res.end, res.flags, 0);
66 }
67
68 /* read the prefetchable memory window */
69 pci_read_config_word(dev, 0xc4, &word1);
70 pci_read_config_word(dev, 0xc6, &word2);
71 if (word1 != word2) {
72 res.start = (word1 << 16) | 0x0000;
73 res.end = (word2 << 16) | 0xffff;
74 res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
75 update_res(info, res.start, res.end, res.flags, 0);
76 }
77
78 /* read the IO port window */
79 pci_read_config_word(dev, 0xd0, &word1);
80 pci_read_config_word(dev, 0xd2, &word2);
81 if (word1 != word2) {
82 res.start = word1;
83 res.end = word2;
84 res.flags = IORESOURCE_IO;
85 update_res(info, res.start, res.end, res.flags, 0);
86 }
87
88 /* print information about this host bridge */
89 res.start = fbus;
90 res.end = lbus;
91 res.flags = IORESOURCE_BUS;
92 dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
93 pci_domain_nr(dev->bus), &res);
94
95 for (i = 0; i < info->res_num; i++)
96 dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
97}
98
99DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
100 cnb20le_res);
101
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index cf2e93869c4..215a27ae050 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -76,7 +76,7 @@ struct pci_ops pci_root_ops = {
76 * This interrupt-safe spinlock protects all accesses to PCI 76 * This interrupt-safe spinlock protects all accesses to PCI
77 * configuration space. 77 * configuration space.
78 */ 78 */
79DEFINE_SPINLOCK(pci_config_lock); 79DEFINE_RAW_SPINLOCK(pci_config_lock);
80 80
81static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) 81static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
82{ 82{
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 347d882b3bb..bd33620b007 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
27 return -EINVAL; 27 return -EINVAL;
28 } 28 }
29 29
30 spin_lock_irqsave(&pci_config_lock, flags); 30 raw_spin_lock_irqsave(&pci_config_lock, flags);
31 31
32 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); 32 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
33 33
@@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
43 break; 43 break;
44 } 44 }
45 45
46 spin_unlock_irqrestore(&pci_config_lock, flags); 46 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
47 47
48 return 0; 48 return 0;
49} 49}
@@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
56 if ((bus > 255) || (devfn > 255) || (reg > 4095)) 56 if ((bus > 255) || (devfn > 255) || (reg > 4095))
57 return -EINVAL; 57 return -EINVAL;
58 58
59 spin_lock_irqsave(&pci_config_lock, flags); 59 raw_spin_lock_irqsave(&pci_config_lock, flags);
60 60
61 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); 61 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
62 62
@@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
72 break; 72 break;
73 } 73 }
74 74
75 spin_unlock_irqrestore(&pci_config_lock, flags); 75 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
76 76
77 return 0; 77 return 0;
78} 78}
@@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
108 if (dev & 0x10) 108 if (dev & 0x10)
109 return PCIBIOS_DEVICE_NOT_FOUND; 109 return PCIBIOS_DEVICE_NOT_FOUND;
110 110
111 spin_lock_irqsave(&pci_config_lock, flags); 111 raw_spin_lock_irqsave(&pci_config_lock, flags);
112 112
113 outb((u8)(0xF0 | (fn << 1)), 0xCF8); 113 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
114 outb((u8)bus, 0xCFA); 114 outb((u8)bus, 0xCFA);
@@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
127 127
128 outb(0, 0xCF8); 128 outb(0, 0xCF8);
129 129
130 spin_unlock_irqrestore(&pci_config_lock, flags); 130 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
131 131
132 return 0; 132 return 0;
133} 133}
@@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
147 if (dev & 0x10) 147 if (dev & 0x10)
148 return PCIBIOS_DEVICE_NOT_FOUND; 148 return PCIBIOS_DEVICE_NOT_FOUND;
149 149
150 spin_lock_irqsave(&pci_config_lock, flags); 150 raw_spin_lock_irqsave(&pci_config_lock, flags);
151 151
152 outb((u8)(0xF0 | (fn << 1)), 0xCF8); 152 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
153 outb((u8)bus, 0xCFA); 153 outb((u8)bus, 0xCFA);
@@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
166 166
167 outb(0, 0xCF8); 167 outb(0, 0xCF8);
168 168
169 spin_unlock_irqrestore(&pci_config_lock, flags); 169 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
170 170
171 return 0; 171 return 0;
172} 172}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5d362b5ba06..9810a0f76c9 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,8 +589,6 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_CPT_LPC1:
593 case PCI_DEVICE_ID_INTEL_CPT_LPC2:
594 r->name = "PIIX/ICH"; 592 r->name = "PIIX/ICH";
595 r->get = pirq_piix_get; 593 r->get = pirq_piix_get;
596 r->set = pirq_piix_set; 594 r->set = pirq_piix_set;
@@ -605,6 +603,13 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
605 return 1; 603 return 1;
606 } 604 }
607 605
606 if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) &&
607 (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) {
608 r->name = "PIIX/ICH";
609 r->get = pirq_piix_get;
610 r->set = pirq_piix_set;
611 return 1;
612 }
608 return 0; 613 return 0;
609} 614}
610 615
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 39b9ebe8f88..a918553ebc7 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -483,16 +483,17 @@ static void __init pci_mmcfg_reject_broken(int early)
483 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 483 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
484 int valid = 0; 484 int valid = 0;
485 485
486 if (!early && !acpi_disabled) 486 if (!early && !acpi_disabled) {
487 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); 487 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
488 488
489 if (valid) 489 if (valid)
490 continue; 490 continue;
491 491 else
492 if (!early) 492 printk(KERN_ERR FW_BUG PREFIX
493 printk(KERN_ERR FW_BUG PREFIX 493 "MMCONFIG at %pR not reserved in "
494 "MMCONFIG at %pR not reserved in " 494 "ACPI motherboard resources\n",
495 "ACPI motherboard resources\n", &cfg->res); 495 &cfg->res);
496 }
496 497
497 /* Don't try to do this check unless configuration 498 /* Don't try to do this check unless configuration
498 type 1 is available. how about type 2 ?*/ 499 type 1 is available. how about type 2 ?*/
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 90d5fd476ed..a3d9c54792a 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -64,7 +64,7 @@ err: *value = -1;
64 if (!base) 64 if (!base)
65 goto err; 65 goto err;
66 66
67 spin_lock_irqsave(&pci_config_lock, flags); 67 raw_spin_lock_irqsave(&pci_config_lock, flags);
68 68
69 pci_exp_set_dev_base(base, bus, devfn); 69 pci_exp_set_dev_base(base, bus, devfn);
70 70
@@ -79,7 +79,7 @@ err: *value = -1;
79 *value = mmio_config_readl(mmcfg_virt_addr + reg); 79 *value = mmio_config_readl(mmcfg_virt_addr + reg);
80 break; 80 break;
81 } 81 }
82 spin_unlock_irqrestore(&pci_config_lock, flags); 82 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
83 83
84 return 0; 84 return 0;
85} 85}
@@ -97,7 +97,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
97 if (!base) 97 if (!base)
98 return -EINVAL; 98 return -EINVAL;
99 99
100 spin_lock_irqsave(&pci_config_lock, flags); 100 raw_spin_lock_irqsave(&pci_config_lock, flags);
101 101
102 pci_exp_set_dev_base(base, bus, devfn); 102 pci_exp_set_dev_base(base, bus, devfn);
103 103
@@ -112,7 +112,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
112 mmio_config_writel(mmcfg_virt_addr + reg, value); 112 mmio_config_writel(mmcfg_virt_addr + reg, value);
113 break; 113 break;
114 } 114 }
115 spin_unlock_irqrestore(&pci_config_lock, flags); 115 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
116 116
117 return 0; 117 return 0;
118} 118}
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 8bf2fcb88d0..7ef3a2735df 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -109,7 +109,7 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
109 decode++; 109 decode++;
110 decode = ~(decode - 1); 110 decode = ~(decode - 1);
111 } else { 111 } else {
112 decode = ~0; 112 decode = 0;
113 } 113 }
114 114
115 /* 115 /*
@@ -247,6 +247,10 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
247 u32 size; 247 u32 size;
248 int i; 248 int i;
249 249
250 /* Must have extended configuration space */
251 if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
252 return;
253
250 /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ 254 /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
251 offset = fixed_bar_cap(dev->bus, dev->devfn); 255 offset = fixed_bar_cap(dev->bus, dev->devfn);
252 if (!offset || PCI_DEVFN(2, 0) == dev->devfn || 256 if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8223738ad80..5c9e2458df4 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -37,7 +37,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
37 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 37 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
38 return -EINVAL; 38 return -EINVAL;
39 39
40 spin_lock_irqsave(&pci_config_lock, flags); 40 raw_spin_lock_irqsave(&pci_config_lock, flags);
41 41
42 write_cf8(bus, devfn, reg); 42 write_cf8(bus, devfn, reg);
43 43
@@ -62,7 +62,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
62 break; 62 break;
63 } 63 }
64 64
65 spin_unlock_irqrestore(&pci_config_lock, flags); 65 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
66 66
67 return 0; 67 return 0;
68} 68}
@@ -76,7 +76,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
76 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 76 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
77 return -EINVAL; 77 return -EINVAL;
78 78
79 spin_lock_irqsave(&pci_config_lock, flags); 79 raw_spin_lock_irqsave(&pci_config_lock, flags);
80 80
81 write_cf8(bus, devfn, reg); 81 write_cf8(bus, devfn, reg);
82 82
@@ -101,7 +101,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
101 break; 101 break;
102 } 102 }
103 103
104 spin_unlock_irqrestore(&pci_config_lock, flags); 104 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
105 105
106 return 0; 106 return 0;
107} 107}
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 59a225c17b8..2492d165096 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -162,7 +162,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
162 if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) 162 if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
163 return -EINVAL; 163 return -EINVAL;
164 164
165 spin_lock_irqsave(&pci_config_lock, flags); 165 raw_spin_lock_irqsave(&pci_config_lock, flags);
166 166
167 switch (len) { 167 switch (len) {
168 case 1: 168 case 1:
@@ -213,7 +213,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
213 break; 213 break;
214 } 214 }
215 215
216 spin_unlock_irqrestore(&pci_config_lock, flags); 216 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
217 217
218 return (int)((result & 0xff00) >> 8); 218 return (int)((result & 0xff00) >> 8);
219} 219}
@@ -228,7 +228,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
228 if ((bus > 255) || (devfn > 255) || (reg > 255)) 228 if ((bus > 255) || (devfn > 255) || (reg > 255))
229 return -EINVAL; 229 return -EINVAL;
230 230
231 spin_lock_irqsave(&pci_config_lock, flags); 231 raw_spin_lock_irqsave(&pci_config_lock, flags);
232 232
233 switch (len) { 233 switch (len) {
234 case 1: 234 case 1:
@@ -269,7 +269,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
269 break; 269 break;
270 } 270 }
271 271
272 spin_unlock_irqrestore(&pci_config_lock, flags); 272 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
273 273
274 return (int)((result & 0xff00) >> 8); 274 return (int)((result & 0xff00) >> 8);
275} 275}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 32764b8880b..b3c6c59ed30 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -476,6 +476,7 @@ void xen_timer_resume(void)
476__init void xen_time_init(void) 476__init void xen_time_init(void)
477{ 477{
478 int cpu = smp_processor_id(); 478 int cpu = smp_processor_id();
479 struct timespec tp;
479 480
480 clocksource_register(&xen_clocksource); 481 clocksource_register(&xen_clocksource);
481 482
@@ -487,9 +488,8 @@ __init void xen_time_init(void)
487 } 488 }
488 489
489 /* Set initial system time with full resolution */ 490 /* Set initial system time with full resolution */
490 xen_read_wallclock(&xtime); 491 xen_read_wallclock(&tp);
491 set_normalized_timespec(&wall_to_monotonic, 492 do_settimeofday(&tp);
492 -xtime.tv_sec, -xtime.tv_nsec);
493 493
494 setup_force_cpu_cap(X86_FEATURE_TSC); 494 setup_force_cpu_cap(X86_FEATURE_TSC);
495 495