Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 103
-rw-r--r--  arch/x86/Kconfig.cpu | 19
-rw-r--r--  arch/x86/Makefile | 12
-rw-r--r--  arch/x86/Makefile_32.cpu | 2
-rw-r--r--  arch/x86/boot/install.sh | 4
-rw-r--r--  arch/x86/boot/video-vesa.c | 7
-rw-r--r--  arch/x86/boot/video-vga.c | 10
-rw-r--r--  arch/x86/boot/video.c | 5
-rw-r--r--  arch/x86/boot/video.h | 20
-rw-r--r--  arch/x86/configs/i386_defconfig | 2
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 19
-rw-r--r--  arch/x86/ia32/ia32entry.S | 4
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 14
-rw-r--r--  arch/x86/include/asm/acpi.h | 1
-rw-r--r--  arch/x86/include/asm/agp.h | 4
-rw-r--r--  arch/x86/include/asm/alternative.h | 7
-rw-r--r--  arch/x86/include/asm/amd_iommu.h | 1
-rw-r--r--  arch/x86/include/asm/amd_iommu_types.h | 50
-rw-r--r--  arch/x86/include/asm/apic.h | 39
-rw-r--r--  arch/x86/include/asm/apicdef.h | 5
-rw-r--r--  arch/x86/include/asm/asm.h | 10
-rw-r--r--  arch/x86/include/asm/bootparam.h | 13
-rw-r--r--  arch/x86/include/asm/cache.h | 4
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 54
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 2
-rw-r--r--  arch/x86/include/asm/current.h | 2
-rw-r--r--  arch/x86/include/asm/desc.h | 13
-rw-r--r--  arch/x86/include/asm/desc_defs.h | 6
-rw-r--r--  arch/x86/include/asm/device.h | 3
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 18
-rw-r--r--  arch/x86/include/asm/do_timer.h | 16
-rw-r--r--  arch/x86/include/asm/dwarf2.h | 18
-rw-r--r--  arch/x86/include/asm/e820.h | 2
-rw-r--r--  arch/x86/include/asm/elf.h | 2
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 3
-rw-r--r--  arch/x86/include/asm/ftrace.h | 7
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 2
-rw-r--r--  arch/x86/include/asm/i387.h | 9
-rw-r--r--  arch/x86/include/asm/io_apic.h | 20
-rw-r--r--  arch/x86/include/asm/ioctls.h | 95
-rw-r--r--  arch/x86/include/asm/iomap.h | 9
-rw-r--r--  arch/x86/include/asm/ipcbuf.h | 29
-rw-r--r--  arch/x86/include/asm/irq.h | 3
-rw-r--r--  arch/x86/include/asm/irqflags.h | 9
-rw-r--r--  arch/x86/include/asm/kvm.h | 10
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h (renamed from arch/x86/include/asm/kvm_x86_emulate.h) | 0
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 60
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 2
-rw-r--r--  arch/x86/include/asm/lguest.h | 5
-rw-r--r--  arch/x86/include/asm/mce.h | 32
-rw-r--r--  arch/x86/include/asm/mman.h | 14
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 6
-rw-r--r--  arch/x86/include/asm/module.h | 15
-rw-r--r--  arch/x86/include/asm/mpspec.h | 47
-rw-r--r--  arch/x86/include/asm/msgbuf.h | 40
-rw-r--r--  arch/x86/include/asm/msr-index.h | 12
-rw-r--r--  arch/x86/include/asm/msr.h | 75
-rw-r--r--  arch/x86/include/asm/mtrr.h | 6
-rw-r--r--  arch/x86/include/asm/nmi.h | 7
-rw-r--r--  arch/x86/include/asm/nops.h | 2
-rw-r--r--  arch/x86/include/asm/param.h | 23
-rw-r--r--  arch/x86/include/asm/paravirt.h | 797
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 693
-rw-r--r--  arch/x86/include/asm/pat.h | 5
-rw-r--r--  arch/x86/include/asm/pci.h | 7
-rw-r--r--  arch/x86/include/asm/percpu.h | 35
-rw-r--r--  arch/x86/include/asm/perf_event.h (renamed from arch/x86/include/asm/perf_counter.h) | 36
-rw-r--r--  arch/x86/include/asm/pgtable.h | 26
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 4
-rw-r--r--  arch/x86/include/asm/processor.h | 60
-rw-r--r--  arch/x86/include/asm/scatterlist.h | 27
-rw-r--r--  arch/x86/include/asm/setup.h | 49
-rw-r--r--  arch/x86/include/asm/shmbuf.h | 52
-rw-r--r--  arch/x86/include/asm/smp.h | 1
-rw-r--r--  arch/x86/include/asm/socket.h | 61
-rw-r--r--  arch/x86/include/asm/sockios.h | 14
-rw-r--r--  arch/x86/include/asm/stackprotector.h | 10
-rw-r--r--  arch/x86/include/asm/string_32.h | 1
-rw-r--r--  arch/x86/include/asm/syscall.h | 14
-rw-r--r--  arch/x86/include/asm/system.h | 29
-rw-r--r--  arch/x86/include/asm/termbits.h | 199
-rw-r--r--  arch/x86/include/asm/termios.h | 115
-rw-r--r--  arch/x86/include/asm/thread_info.h | 15
-rw-r--r--  arch/x86/include/asm/time.h | 53
-rw-r--r--  arch/x86/include/asm/timer.h | 14
-rw-r--r--  arch/x86/include/asm/topology.h | 49
-rw-r--r--  arch/x86/include/asm/traps.h | 4
-rw-r--r--  arch/x86/include/asm/tsc.h | 3
-rw-r--r--  arch/x86/include/asm/types.h | 12
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 2
-rw-r--r--  arch/x86/include/asm/ucontext.h | 8
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 4
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 10
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 19
-rw-r--r--  arch/x86/include/asm/vgtod.h | 1
-rw-r--r--  arch/x86/include/asm/vmware.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 8
-rw-r--r--  arch/x86/include/asm/x86_init.h | 133
-rw-r--r--  arch/x86/kernel/Makefile | 7
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 105
-rw-r--r--  arch/x86/kernel/alternative.c | 58
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 489
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 42
-rw-r--r--  arch/x86/kernel/aperture_64.c | 6
-rw-r--r--  arch/x86/kernel/apic/apic.c | 150
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 387
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 2
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 26
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 57
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 21
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 11
-rw-r--r--  arch/x86/kernel/apm_32.c | 31
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 129
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 10
-rw-r--r--  arch/x86/kernel/cpu/bugs_64.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common.c | 73
-rw-r--r--  arch/x86/kernel/cpu/cpu_debug.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 116
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 44
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 19
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 19
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 17
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 148
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 116
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 158
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 324
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 10
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 94
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 163
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 127
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 13
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 97
-rw-r--r--  arch/x86/kernel/cpu/mtrr/centaur.c | 168
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 390
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 94
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 304
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 135
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 499
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 19
-rw-r--r--  arch/x86/kernel/cpu/mtrr/state.c | 68
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c (renamed from arch/x86/kernel/cpu/perf_counter.c) | 860
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 47
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 4
-rw-r--r--  arch/x86/kernel/cpu/sched.c | 55
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 41
-rw-r--r--  arch/x86/kernel/cpuid.c | 4
-rw-r--r--  arch/x86/kernel/doublefault_32.c | 4
-rw-r--r--  arch/x86/kernel/ds.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 1
-rw-r--r--  arch/x86/kernel/e820.c | 23
-rw-r--r--  arch/x86/kernel/early_printk.c | 780
-rw-r--r--  arch/x86/kernel/efi.c | 4
-rw-r--r--  arch/x86/kernel/entry_64.S | 30
-rw-r--r--  arch/x86/kernel/ftrace.c | 51
-rw-r--r--  arch/x86/kernel/head32.c | 26
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/head_32.S | 6
-rw-r--r--  arch/x86/kernel/head_64.S | 2
-rw-r--r--  arch/x86/kernel/i8253.c | 19
-rw-r--r--  arch/x86/kernel/init_task.c | 5
-rw-r--r--  arch/x86/kernel/irq.c | 4
-rw-r--r--  arch/x86/kernel/irq_32.c | 5
-rw-r--r--  arch/x86/kernel/irqinit.c | 40
-rw-r--r--  arch/x86/kernel/kvm.c | 7
-rw-r--r--  arch/x86/kernel/kvmclock.c | 15
-rw-r--r--  arch/x86/kernel/ldt.c | 4
-rw-r--r--  arch/x86/kernel/microcode_core.c | 6
-rw-r--r--  arch/x86/kernel/mpparse.c | 85
-rw-r--r--  arch/x86/kernel/mrst.c | 24
-rw-r--r--  arch/x86/kernel/msr.c | 65
-rw-r--r--  arch/x86/kernel/paravirt.c | 39
-rw-r--r--  arch/x86/kernel/pci-dma.c | 21
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 5
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 29
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 30
-rw-r--r--  arch/x86/kernel/process.c | 31
-rw-r--r--  arch/x86/kernel/process_32.c | 30
-rw-r--r--  arch/x86/kernel/process_64.c | 36
-rw-r--r--  arch/x86/kernel/ptrace.c | 34
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 7
-rw-r--r--  arch/x86/kernel/rtc.c | 17
-rw-r--r--  arch/x86/kernel/setup.c | 131
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 364
-rw-r--r--  arch/x86/kernel/sfi.c | 122
-rw-r--r--  arch/x86/kernel/signal.c | 4
-rw-r--r--  arch/x86/kernel/smpboot.c | 32
-rw-r--r--  arch/x86/kernel/step.c | 9
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 8
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 2
-rw-r--r--  arch/x86/kernel/tboot.c | 447
-rw-r--r--  arch/x86/kernel/time.c | 120
-rw-r--r--  arch/x86/kernel/time_32.c | 137
-rw-r--r--  arch/x86/kernel/time_64.c | 135
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 4
-rw-r--r--  arch/x86/kernel/trampoline.c | 4
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 8
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 5
-rw-r--r--  arch/x86/kernel/traps.c | 61
-rw-r--r--  arch/x86/kernel/tsc.c | 88
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 54
-rw-r--r--  arch/x86/kernel/vmi_32.c | 12
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 15
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 11
-rw-r--r--  arch/x86/kernel/x86_init.c | 75
-rw-r--r--  arch/x86/kvm/Kconfig | 21
-rw-r--r--  arch/x86/kvm/Makefile | 35
-rw-r--r--  arch/x86/kvm/emulate.c (renamed from arch/x86/kvm/x86_emulate.c) | 265
-rw-r--r--  arch/x86/kvm/i8254.c | 160
-rw-r--r--  arch/x86/kvm/i8254.h | 5
-rw-r--r--  arch/x86/kvm/i8259.c | 116
-rw-r--r--  arch/x86/kvm/irq.h | 1
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 9
-rw-r--r--  arch/x86/kvm/kvm_svm.h | 51
-rw-r--r--  arch/x86/kvm/kvm_timer.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 334
-rw-r--r--  arch/x86/kvm/lapic.h | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 587
-rw-r--r--  arch/x86/kvm/mmu.h | 4
-rw-r--r--  arch/x86/kvm/mmutrace.h | 220
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 141
-rw-r--r--  arch/x86/kvm/svm.c | 889
-rw-r--r--  arch/x86/kvm/timer.c | 16
-rw-r--r--  arch/x86/kvm/trace.h | 355
-rw-r--r--  arch/x86/kvm/vmx.c | 497
-rw-r--r--  arch/x86/kvm/x86.c | 822
-rw-r--r--  arch/x86/kvm/x86.h | 4
-rw-r--r--  arch/x86/lguest/boot.c | 21
-rw-r--r--  arch/x86/lib/Makefile | 2
-rw-r--r--  arch/x86/lib/msr-reg-export.c | 5
-rw-r--r--  arch/x86/lib/msr-reg.S | 102
-rw-r--r--  arch/x86/lib/msr.c | 49
-rw-r--r--  arch/x86/mm/Makefile | 6
-rw-r--r--  arch/x86/mm/fault.c | 78
-rw-r--r--  arch/x86/mm/highmem_32.c | 3
-rw-r--r--  arch/x86/mm/init_32.c | 12
-rw-r--r--  arch/x86/mm/init_64.c | 12
-rw-r--r--  arch/x86/mm/iomap_32.c | 27
-rw-r--r--  arch/x86/mm/ioremap.c | 90
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c | 17
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c | 1
-rw-r--r--  arch/x86/mm/mmap.c | 17
-rw-r--r--  arch/x86/mm/pageattr.c | 29
-rw-r--r--  arch/x86/mm/pat.c | 355
-rw-r--r--  arch/x86/mm/physaddr.c | 70
-rw-r--r--  arch/x86/mm/physaddr.h | 10
-rw-r--r--  arch/x86/mm/srat_32.c | 4
-rw-r--r--  arch/x86/mm/tlb.c | 15
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 404
-rw-r--r--  arch/x86/oprofile/op_counter.h | 2
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 372
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 72
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 105
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 59
-rw-r--r--  arch/x86/pci/amd_bus.c | 64
-rw-r--r--  arch/x86/pci/common.c | 69
-rw-r--r--  arch/x86/pci/direct.c | 5
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 8
-rw-r--r--  arch/x86/pci/mmconfig_32.c | 2
-rw-r--r--  arch/x86/power/cpu.c | 6
-rw-r--r--  arch/x86/vdso/Makefile | 2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 39
-rw-r--r--  arch/x86/xen/Makefile | 2
-rw-r--r--  arch/x86/xen/enlighten.c | 159
-rw-r--r--  arch/x86/xen/irq.c | 5
-rw-r--r--  arch/x86/xen/mmu.c | 20
-rw-r--r--  arch/x86/xen/mmu.h | 2
-rw-r--r--  arch/x86/xen/smp.c | 1
-rw-r--r--  arch/x86/xen/spinlock.c | 28
-rw-r--r--  arch/x86/xen/xen-ops.h | 2
284 files changed, 10826 insertions(+), 8836 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5df37d7..93698794aa3a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,7 +24,7 @@ config X86
24 select HAVE_UNSTABLE_SCHED_CLOCK 24 select HAVE_UNSTABLE_SCHED_CLOCK
25 select HAVE_IDE 25 select HAVE_IDE
26 select HAVE_OPROFILE 26 select HAVE_OPROFILE
27 select HAVE_PERF_COUNTERS if (!M386 && !M486) 27 select HAVE_PERF_EVENTS if (!M386 && !M486)
28 select HAVE_IOREMAP_PROT 28 select HAVE_IOREMAP_PROT
29 select HAVE_KPROBES 29 select HAVE_KPROBES
30 select ARCH_WANT_OPTIONAL_GPIOLIB 30 select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -38,7 +38,7 @@ config X86
38 select HAVE_FUNCTION_GRAPH_FP_TEST 38 select HAVE_FUNCTION_GRAPH_FP_TEST
39 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 39 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
40 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE 40 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
41 select HAVE_FTRACE_SYSCALLS 41 select HAVE_SYSCALL_TRACEPOINTS
42 select HAVE_KVM 42 select HAVE_KVM
43 select HAVE_ARCH_KGDB 43 select HAVE_ARCH_KGDB
44 select HAVE_ARCH_TRACEHOOK 44 select HAVE_ARCH_TRACEHOOK
@@ -150,7 +150,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
150config HAVE_SETUP_PER_CPU_AREA 150config HAVE_SETUP_PER_CPU_AREA
151 def_bool y 151 def_bool y
152 152
153config HAVE_DYNAMIC_PER_CPU_AREA 153config NEED_PER_CPU_EMBED_FIRST_CHUNK
154 def_bool y
155
156config NEED_PER_CPU_PAGE_FIRST_CHUNK
154 def_bool y 157 def_bool y
155 158
156config HAVE_CPUMASK_OF_CPU_MAP 159config HAVE_CPUMASK_OF_CPU_MAP
@@ -179,6 +182,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
179config ARCH_SUPPORTS_DEBUG_PAGEALLOC 182config ARCH_SUPPORTS_DEBUG_PAGEALLOC
180 def_bool y 183 def_bool y
181 184
185config HAVE_INTEL_TXT
186 def_bool y
187 depends on EXPERIMENTAL && DMAR && ACPI
188
182# Use the generic interrupt handling code in kernel/irq/: 189# Use the generic interrupt handling code in kernel/irq/:
183config GENERIC_HARDIRQS 190config GENERIC_HARDIRQS
184 bool 191 bool
@@ -318,6 +325,7 @@ config X86_EXTENDED_PLATFORM
318 SGI 320/540 (Visual Workstation) 325 SGI 320/540 (Visual Workstation)
319 Summit/EXA (IBM x440) 326 Summit/EXA (IBM x440)
320 Unisys ES7000 IA32 series 327 Unisys ES7000 IA32 series
328 Moorestown MID devices
321 329
322 If you have one of these systems, or if you want to build a 330 If you have one of these systems, or if you want to build a
323 generic distribution kernel, say Y here - otherwise say N. 331 generic distribution kernel, say Y here - otherwise say N.
@@ -377,6 +385,18 @@ config X86_ELAN
377 385
378 If unsure, choose "PC-compatible" instead. 386 If unsure, choose "PC-compatible" instead.
379 387
388config X86_MRST
389 bool "Moorestown MID platform"
390 depends on X86_32
391 depends on X86_EXTENDED_PLATFORM
392 ---help---
393 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
394 Internet Device(MID) platform. Moorestown consists of two chips:
395 Lincroft (CPU core, graphics, and memory controller) and Langwell IOH.
396 Unlike standard x86 PCs, Moorestown does not have many legacy devices
397 nor standard legacy replacement devices/features. e.g. Moorestown does
398 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
399
380config X86_RDC321X 400config X86_RDC321X
381 bool "RDC R-321x SoC" 401 bool "RDC R-321x SoC"
382 depends on X86_32 402 depends on X86_32
@@ -586,7 +606,6 @@ config GART_IOMMU
586 bool "GART IOMMU support" if EMBEDDED 606 bool "GART IOMMU support" if EMBEDDED
587 default y 607 default y
588 select SWIOTLB 608 select SWIOTLB
589 select AGP
590 depends on X86_64 && PCI 609 depends on X86_64 && PCI
591 ---help--- 610 ---help---
592 Support for full DMA access of devices with 32bit memory access only 611 Support for full DMA access of devices with 32bit memory access only
@@ -777,41 +796,17 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
777 increased on these systems. 796 increased on these systems.
778 797
779config X86_MCE 798config X86_MCE
780 bool "Machine Check Exception" 799 bool "Machine Check / overheating reporting"
781 ---help--- 800 ---help---
782 Machine Check Exception support allows the processor to notify the 801 Machine Check support allows the processor to notify the
783 kernel if it detects a problem (e.g. overheating, component failure). 802 kernel if it detects a problem (e.g. overheating, data corruption).
784 The action the kernel takes depends on the severity of the problem, 803 The action the kernel takes depends on the severity of the problem,
785 ranging from a warning message on the console, to halting the machine. 804 ranging from warning messages to halting the machine.
786 Your processor must be a Pentium or newer to support this - check the
787 flags in /proc/cpuinfo for mce. Note that some older Pentium systems
788 have a design flaw which leads to false MCE events - hence MCE is
789 disabled on all P5 processors, unless explicitly enabled with "mce"
790 as a boot argument. Similarly, if MCE is built in and creates a
791 problem on some new non-standard machine, you can boot with "nomce"
792 to disable it. MCE support simply ignores non-MCE processors like
793 the 386 and 486, so nearly everyone can say Y here.
794
795config X86_OLD_MCE
796 depends on X86_32 && X86_MCE
797 bool "Use legacy machine check code (will go away)"
798 default n
799 select X86_ANCIENT_MCE
800 ---help---
801 Use the old i386 machine check code. This is merely intended for
802 testing in a transition period. Try this if you run into any machine
803 check related software problems, but report the problem to
804 linux-kernel. When in doubt say no.
805
806config X86_NEW_MCE
807 depends on X86_MCE
808 bool
809 default y if (!X86_OLD_MCE && X86_32) || X86_64
810 805
811config X86_MCE_INTEL 806config X86_MCE_INTEL
812 def_bool y 807 def_bool y
813 prompt "Intel MCE features" 808 prompt "Intel MCE features"
814 depends on X86_NEW_MCE && X86_LOCAL_APIC 809 depends on X86_MCE && X86_LOCAL_APIC
815 ---help--- 810 ---help---
816 Additional support for intel specific MCE features such as 811 Additional support for intel specific MCE features such as
817 the thermal monitor. 812 the thermal monitor.
@@ -819,14 +814,14 @@ config X86_MCE_INTEL
819config X86_MCE_AMD 814config X86_MCE_AMD
820 def_bool y 815 def_bool y
821 prompt "AMD MCE features" 816 prompt "AMD MCE features"
822 depends on X86_NEW_MCE && X86_LOCAL_APIC 817 depends on X86_MCE && X86_LOCAL_APIC
823 ---help--- 818 ---help---
824 Additional support for AMD specific MCE features such as 819 Additional support for AMD specific MCE features such as
825 the DRAM Error Threshold. 820 the DRAM Error Threshold.
826 821
827config X86_ANCIENT_MCE 822config X86_ANCIENT_MCE
828 def_bool n 823 def_bool n
829 depends on X86_32 824 depends on X86_32 && X86_MCE
830 prompt "Support for old Pentium 5 / WinChip machine checks" 825 prompt "Support for old Pentium 5 / WinChip machine checks"
831 ---help--- 826 ---help---
832 Include support for machine check handling on old Pentium 5 or WinChip 827 Include support for machine check handling on old Pentium 5 or WinChip
@@ -839,36 +834,16 @@ config X86_MCE_THRESHOLD
839 default y 834 default y
840 835
841config X86_MCE_INJECT 836config X86_MCE_INJECT
842 depends on X86_NEW_MCE 837 depends on X86_MCE
843 tristate "Machine check injector support" 838 tristate "Machine check injector support"
844 ---help--- 839 ---help---
845 Provide support for injecting machine checks for testing purposes. 840 Provide support for injecting machine checks for testing purposes.
846 If you don't know what a machine check is and you don't do kernel 841 If you don't know what a machine check is and you don't do kernel
847 QA it is safe to say n. 842 QA it is safe to say n.
848 843
849config X86_MCE_NONFATAL
850 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
851 depends on X86_OLD_MCE
852 ---help---
853 Enabling this feature starts a timer that triggers every 5 seconds which
854 will look at the machine check registers to see if anything happened.
855 Non-fatal problems automatically get corrected (but still logged).
856 Disable this if you don't want to see these messages.
857 Seeing the messages this option prints out may be indicative of dying
858 or out-of-spec (ie, overclocked) hardware.
859 This option only does something on certain CPUs.
860 (AMD Athlon/Duron and Intel Pentium 4)
861
862config X86_MCE_P4THERMAL
863 bool "check for P4 thermal throttling interrupt."
864 depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
865 ---help---
866 Enabling this feature will cause a message to be printed when the P4
867 enters thermal throttling.
868
869config X86_THERMAL_VECTOR 844config X86_THERMAL_VECTOR
870 def_bool y 845 def_bool y
871 depends on X86_MCE_P4THERMAL || X86_MCE_INTEL 846 depends on X86_MCE_INTEL
872 847
873config VM86 848config VM86
874 bool "Enable VM86 support" if EMBEDDED 849 bool "Enable VM86 support" if EMBEDDED
@@ -1229,6 +1204,10 @@ config ARCH_DISCONTIGMEM_DEFAULT
1229 def_bool y 1204 def_bool y
1230 depends on NUMA && X86_32 1205 depends on NUMA && X86_32
1231 1206
1207config ARCH_PROC_KCORE_TEXT
1208 def_bool y
1209 depends on X86_64 && PROC_KCORE
1210
1232config ARCH_SPARSEMEM_DEFAULT 1211config ARCH_SPARSEMEM_DEFAULT
1233 def_bool y 1212 def_bool y
1234 depends on X86_64 1213 depends on X86_64
@@ -1414,6 +1393,10 @@ config X86_PAT
1414 1393
1415 If unsure, say Y. 1394 If unsure, say Y.
1416 1395
1396config ARCH_USES_PG_UNCACHED
1397 def_bool y
1398 depends on X86_PAT
1399
1417config EFI 1400config EFI
1418 bool "EFI runtime service support" 1401 bool "EFI runtime service support"
1419 depends on ACPI 1402 depends on ACPI
@@ -1683,6 +1666,8 @@ source "kernel/power/Kconfig"
1683 1666
1684source "drivers/acpi/Kconfig" 1667source "drivers/acpi/Kconfig"
1685 1668
1669source "drivers/sfi/Kconfig"
1670
1686config X86_APM_BOOT 1671config X86_APM_BOOT
1687 bool 1672 bool
1688 default y 1673 default y
@@ -1878,7 +1863,7 @@ config PCI_DIRECT
1878 1863
1879config PCI_MMCONFIG 1864config PCI_MMCONFIG
1880 def_bool y 1865 def_bool y
1881 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) 1866 depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY)
1882 1867
1883config PCI_OLPC 1868config PCI_OLPC
1884 def_bool y 1869 def_bool y
@@ -1916,7 +1901,7 @@ config DMAR_DEFAULT_ON
1916config DMAR_BROKEN_GFX_WA 1901config DMAR_BROKEN_GFX_WA
1917 def_bool n 1902 def_bool n
1918 prompt "Workaround broken graphics drivers (going away soon)" 1903 prompt "Workaround broken graphics drivers (going away soon)"
1919 depends on DMAR 1904 depends on DMAR && BROKEN
1920 ---help--- 1905 ---help---
1921 Current Graphics drivers tend to use physical address 1906 Current Graphics drivers tend to use physical address
1922 for DMA and avoid using DMA APIs. Setting this config 1907 for DMA and avoid using DMA APIs. Setting this config
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 8130334329c0..527519b8a9f9 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -262,6 +262,15 @@ config MCORE2
262 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 262 family in /proc/cpuinfo. Newer ones have 6 and older ones 15
263 (not a typo) 263 (not a typo)
264 264
265config MATOM
266 bool "Intel Atom"
267 ---help---
268
269 Select this for the Intel Atom platform. Intel Atom CPUs have an
270 in-order pipelining architecture and thus can benefit from
271 accordingly optimized code. Use a recent GCC with specific Atom
272 support in order to fully benefit from selecting this option.
273
265config GENERIC_CPU 274config GENERIC_CPU
266 bool "Generic-x86-64" 275 bool "Generic-x86-64"
267 depends on X86_64 276 depends on X86_64
@@ -295,7 +304,7 @@ config X86_CPU
295config X86_L1_CACHE_BYTES 304config X86_L1_CACHE_BYTES
296 int 305 int
297 default "128" if MPSC 306 default "128" if MPSC
298 default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32 307 default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32
299 308
300config X86_INTERNODE_CACHE_BYTES 309config X86_INTERNODE_CACHE_BYTES
301 int 310 int
@@ -310,7 +319,7 @@ config X86_L1_CACHE_SHIFT
310 default "7" if MPENTIUM4 || MPSC 319 default "7" if MPENTIUM4 || MPSC
311 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
312 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 321 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
313 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU 322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
314 323
315config X86_XADD 324config X86_XADD
316 def_bool y 325 def_bool y
@@ -359,7 +368,7 @@ config X86_INTEL_USERCOPY
359 368
360config X86_USE_PPRO_CHECKSUM 369config X86_USE_PPRO_CHECKSUM
361 def_bool y 370 def_bool y
362 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 371 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
363 372
364config X86_USE_3DNOW 373config X86_USE_3DNOW
365 def_bool y 374 def_bool y
@@ -387,7 +396,7 @@ config X86_P6_NOP
387 396
388config X86_TSC 397config X86_TSC
389 def_bool y 398 def_bool y
390 depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 399 depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64
391 400
392config X86_CMPXCHG64 401config X86_CMPXCHG64
393 def_bool y 402 def_bool y
@@ -397,7 +406,7 @@ config X86_CMPXCHG64
397# generates cmov. 406# generates cmov.
398config X86_CMOV 407config X86_CMOV
399 def_bool y 408 def_bool y
400 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) 409 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM)
401 410
402config X86_MINIMUM_CPU_FAMILY 411config X86_MINIMUM_CPU_FAMILY
403 int 412 int
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1b68659c41b4..a012ee8ef803 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -32,8 +32,8 @@ ifeq ($(CONFIG_X86_32),y)
32 32
33 # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use 33 # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
34 # a lot more stack due to the lack of sharing of stacklots: 34 # a lot more stack due to the lack of sharing of stacklots:
35 KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ 35 KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \
36 echo $(call cc-option,-fno-unit-at-a-time); fi ;) 36 $(call cc-option,-fno-unit-at-a-time))
37 37
38 # CPU-specific tuning. Anything which can be shared with UML should go here. 38 # CPU-specific tuning. Anything which can be shared with UML should go here.
39 include $(srctree)/arch/x86/Makefile_32.cpu 39 include $(srctree)/arch/x86/Makefile_32.cpu
@@ -55,6 +55,8 @@ else
55 55
56 cflags-$(CONFIG_MCORE2) += \ 56 cflags-$(CONFIG_MCORE2) += \
57 $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) 57 $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
58 cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
59 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
58 cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) 60 cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
59 KBUILD_CFLAGS += $(cflags-y) 61 KBUILD_CFLAGS += $(cflags-y)
60 62
@@ -72,7 +74,7 @@ endif
72 74
73ifdef CONFIG_CC_STACKPROTECTOR 75ifdef CONFIG_CC_STACKPROTECTOR
74 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh 76 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
75 ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y) 77 ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y)
76 stackp-y := -fstack-protector 78 stackp-y := -fstack-protector
77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all 79 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
78 KBUILD_CFLAGS += $(stackp-y) 80 KBUILD_CFLAGS += $(stackp-y)
@@ -177,8 +179,8 @@ archclean:
177define archhelp 179define archhelp
178 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' 180 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
179 echo ' install - Install kernel using' 181 echo ' install - Install kernel using'
180 echo ' (your) ~/bin/installkernel or' 182 echo ' (your) ~/bin/$(INSTALLKERNEL) or'
181 echo ' (distribution) /sbin/installkernel or' 183 echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
182 echo ' install to $$(INSTALL_PATH) and run lilo' 184 echo ' install to $$(INSTALL_PATH) and run lilo'
183 echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' 185 echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
184 echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' 186 echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 80177ec052f0..30e9a264f69d 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -33,6 +33,8 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-f
33cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 33cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
34cflags-$(CONFIG_MVIAC7) += -march=i686 34cflags-$(CONFIG_MVIAC7) += -march=i686
35cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) 35cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
36cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
37 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
36 38
37# AMD Elan support 39# AMD Elan support
38cflags-$(CONFIG_X86_ELAN) += -march=i486 40cflags-$(CONFIG_X86_ELAN) += -march=i486
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index 8d60ee15dfd9..d13ec1c38640 100644
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -33,8 +33,8 @@ verify "$3"
33 33
34# User may have a custom install script 34# User may have a custom install script
35 35
36if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi 36if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
37if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi 37if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
38 38
39# Default install - same as make zlilo 39# Default install - same as make zlilo
40 40
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 275dd177f198..11e8c6eb80a1 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -31,7 +31,6 @@ static inline void vesa_store_mode_params_graphics(void) {}
31 31
32static int vesa_probe(void) 32static int vesa_probe(void)
33{ 33{
34#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
35 struct biosregs ireg, oreg; 34 struct biosregs ireg, oreg;
36 u16 mode; 35 u16 mode;
37 addr_t mode_ptr; 36 addr_t mode_ptr;
@@ -49,8 +48,7 @@ static int vesa_probe(void)
49 vginfo.signature != VESA_MAGIC || 48 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 49 vginfo.version < 0x0102)
51 return 0; /* Not present */ 50 return 0; /* Not present */
52#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */ 51
53#ifdef CONFIG_VIDEO_VESA
54 set_fs(vginfo.video_mode_ptr.seg); 52 set_fs(vginfo.video_mode_ptr.seg);
55 mode_ptr = vginfo.video_mode_ptr.off; 53 mode_ptr = vginfo.video_mode_ptr.off;
56 54
@@ -102,9 +100,6 @@ static int vesa_probe(void)
102 } 100 }
103 101
104 return nmodes; 102 return nmodes;
105#else
106 return 0;
107#endif /* CONFIG_VIDEO_VESA */
108} 103}
109 104
110static int vesa_set_mode(struct mode_info *mode) 105static int vesa_set_mode(struct mode_info *mode)
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 8f8d827e254d..819caa1f2008 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -47,14 +47,6 @@ static u8 vga_set_basic_mode(void)
47 47
48 initregs(&ireg); 48 initregs(&ireg);
49 49
50#ifdef CONFIG_VIDEO_400_HACK
51 if (adapter >= ADAPTER_VGA) {
52 ireg.ax = 0x1202;
53 ireg.bx = 0x0030;
54 intcall(0x10, &ireg, NULL);
55 }
56#endif
57
58 ax = 0x0f00; 50 ax = 0x0f00;
59 intcall(0x10, &ireg, &oreg); 51 intcall(0x10, &ireg, &oreg);
60 mode = oreg.al; 52 mode = oreg.al;
@@ -62,11 +54,9 @@ static u8 vga_set_basic_mode(void)
62 set_fs(0); 54 set_fs(0);
63 rows = rdfs8(0x484); /* rows minus one */ 55 rows = rdfs8(0x484); /* rows minus one */
64 56
65#ifndef CONFIG_VIDEO_400_HACK
66 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && 57 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
67 (rows == 0 || rows == 24)) 58 (rows == 0 || rows == 24))
68 return mode; 59 return mode;
69#endif
70 60
71 if (mode != 3 && mode != 7) 61 if (mode != 3 && mode != 7)
72 mode = 3; 62 mode = 3;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index bad728b76fc2..d42da3802499 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -221,7 +221,6 @@ static unsigned int mode_menu(void)
221 } 221 }
222} 222}
223 223
224#ifdef CONFIG_VIDEO_RETAIN
225/* Save screen content to the heap */ 224/* Save screen content to the heap */
226static struct saved_screen { 225static struct saved_screen {
227 int x, y; 226 int x, y;
@@ -299,10 +298,6 @@ static void restore_screen(void)
299 ireg.dl = saved.curx; 298 ireg.dl = saved.curx;
300 intcall(0x10, &ireg, NULL); 299 intcall(0x10, &ireg, NULL);
301} 300}
302#else
303#define save_screen() ((void)0)
304#define restore_screen() ((void)0)
305#endif
306 301
307void set_video(void) 302void set_video(void)
308{ 303{
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index 5bb174a997fc..ff339c5db311 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -17,19 +17,8 @@
17 17
18#include <linux/types.h> 18#include <linux/types.h>
19 19
20/* Enable autodetection of SVGA adapters and modes. */ 20/*
21#undef CONFIG_VIDEO_SVGA 21 * This code uses an extended set of video mode numbers. These include:
22
23/* Enable autodetection of VESA modes */
24#define CONFIG_VIDEO_VESA
25
26/* Retain screen contents when switching modes */
27#define CONFIG_VIDEO_RETAIN
28
29/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */
30#undef CONFIG_VIDEO_400_HACK
31
32/* This code uses an extended set of video mode numbers. These include:
33 * Aliases for standard modes 22 * Aliases for standard modes
34 * NORMAL_VGA (-1) 23 * NORMAL_VGA (-1)
35 * EXTENDED_VGA (-2) 24 * EXTENDED_VGA (-2)
@@ -67,13 +56,8 @@
67/* The "recalculate timings" flag */ 56/* The "recalculate timings" flag */
68#define VIDEO_RECALC 0x8000 57#define VIDEO_RECALC 0x8000
69 58
70/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
71#ifdef CONFIG_VIDEO_RETAIN
72void store_screen(void); 59void store_screen(void);
73#define DO_STORE() store_screen() 60#define DO_STORE() store_screen()
74#else
75#define DO_STORE() ((void)0)
76#endif /* CONFIG_VIDEO_RETAIN */
77 61
78/* 62/*
79 * Mode table structures 63 * Mode table structures
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index edb992ebef92..d28fad19654a 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2355CONFIG_HAVE_DYNAMIC_FTRACE=y 2355CONFIG_HAVE_DYNAMIC_FTRACE=y
2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2357CONFIG_HAVE_HW_BRANCH_TRACER=y 2357CONFIG_HAVE_HW_BRANCH_TRACER=y
2358CONFIG_HAVE_FTRACE_SYSCALLS=y 2358CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
2359CONFIG_RING_BUFFER=y 2359CONFIG_RING_BUFFER=y
2360CONFIG_TRACING=y 2360CONFIG_TRACING=y
2361CONFIG_TRACING_SUPPORT=y 2361CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index cee1dd2e69b2..6c86acd847a4 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2329CONFIG_HAVE_DYNAMIC_FTRACE=y 2329CONFIG_HAVE_DYNAMIC_FTRACE=y
2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2331CONFIG_HAVE_HW_BRANCH_TRACER=y 2331CONFIG_HAVE_HW_BRANCH_TRACER=y
2332CONFIG_HAVE_FTRACE_SYSCALLS=y 2332CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
2333CONFIG_RING_BUFFER=y 2333CONFIG_RING_BUFFER=y
2334CONFIG_TRACING=y 2334CONFIG_TRACING=y
2335CONFIG_TRACING_SUPPORT=y 2335CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index c580c5ec1cad..585edebe12cf 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -59,13 +59,6 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
59asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 59asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
60 const u8 *in, unsigned int len, u8 *iv); 60 const u8 *in, unsigned int len, u8 *iv);
61 61
62static inline int kernel_fpu_using(void)
63{
64 if (in_interrupt() && !(read_cr0() & X86_CR0_TS))
65 return 1;
66 return 0;
67}
68
69static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 62static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
70{ 63{
71 unsigned long addr = (unsigned long)raw_ctx; 64 unsigned long addr = (unsigned long)raw_ctx;
@@ -89,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
89 return -EINVAL; 82 return -EINVAL;
90 } 83 }
91 84
92 if (kernel_fpu_using()) 85 if (irq_fpu_usable())
93 err = crypto_aes_expand_key(ctx, in_key, key_len); 86 err = crypto_aes_expand_key(ctx, in_key, key_len);
94 else { 87 else {
95 kernel_fpu_begin(); 88 kernel_fpu_begin();
@@ -110,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
110{ 103{
111 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); 104 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
112 105
113 if (kernel_fpu_using()) 106 if (irq_fpu_usable())
114 crypto_aes_encrypt_x86(ctx, dst, src); 107 crypto_aes_encrypt_x86(ctx, dst, src);
115 else { 108 else {
116 kernel_fpu_begin(); 109 kernel_fpu_begin();
@@ -123,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
123{ 116{
124 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); 117 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
125 118
126 if (kernel_fpu_using()) 119 if (irq_fpu_usable())
127 crypto_aes_decrypt_x86(ctx, dst, src); 120 crypto_aes_decrypt_x86(ctx, dst, src);
128 else { 121 else {
129 kernel_fpu_begin(); 122 kernel_fpu_begin();
@@ -349,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req)
349 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); 342 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
350 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 343 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
351 344
352 if (kernel_fpu_using()) { 345 if (irq_fpu_usable()) {
353 struct ablkcipher_request *cryptd_req = 346 struct ablkcipher_request *cryptd_req =
354 ablkcipher_request_ctx(req); 347 ablkcipher_request_ctx(req);
355 memcpy(cryptd_req, req, sizeof(*req)); 348 memcpy(cryptd_req, req, sizeof(*req));
@@ -370,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req)
370 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); 363 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
371 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 364 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
372 365
373 if (kernel_fpu_using()) { 366 if (irq_fpu_usable()) {
374 struct ablkcipher_request *cryptd_req = 367 struct ablkcipher_request *cryptd_req =
375 ablkcipher_request_ctx(req); 368 ablkcipher_request_ctx(req);
376 memcpy(cryptd_req, req, sizeof(*req)); 369 memcpy(cryptd_req, req, sizeof(*req));
@@ -636,7 +629,7 @@ static int __init aesni_init(void)
636 int err; 629 int err;
637 630
638 if (!cpu_has_aes) { 631 if (!cpu_has_aes) {
639 printk(KERN_ERR "Intel AES-NI instructions are not detected.\n"); 632 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
640 return -ENODEV; 633 return -ENODEV;
641 } 634 }
642 if ((err = crypto_register_alg(&aesni_alg))) 635 if ((err = crypto_register_alg(&aesni_alg)))
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e590261ba059..74619c4f9fda 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -537,7 +537,7 @@ ia32_sys_call_table:
537 .quad sys_mkdir 537 .quad sys_mkdir
538 .quad sys_rmdir /* 40 */ 538 .quad sys_rmdir /* 40 */
539 .quad sys_dup 539 .quad sys_dup
540 .quad sys32_pipe 540 .quad sys_pipe
541 .quad compat_sys_times 541 .quad compat_sys_times
542 .quad quiet_ni_syscall /* old prof syscall holder */ 542 .quad quiet_ni_syscall /* old prof syscall holder */
543 .quad sys_brk /* 45 */ 543 .quad sys_brk /* 45 */
@@ -831,5 +831,5 @@ ia32_sys_call_table:
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open 834 .quad sys_perf_event_open
835ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 085a8c35f149..9f5527198825 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -189,20 +189,6 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len,
189 return sys_mprotect(start, len, prot); 189 return sys_mprotect(start, len, prot);
190} 190}
191 191
192asmlinkage long sys32_pipe(int __user *fd)
193{
194 int retval;
195 int fds[2];
196
197 retval = do_pipe_flags(fds, 0);
198 if (retval)
199 goto out;
200 if (copy_to_user(fd, fds, sizeof(fds)))
201 retval = -EFAULT;
202out:
203 return retval;
204}
205
206asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, 192asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
207 struct sigaction32 __user *oact, 193 struct sigaction32 __user *oact,
208 unsigned int sigsetsize) 194 unsigned int sigsetsize)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 20d1465a2ab0..4518dc500903 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
144 144
145#else /* !CONFIG_ACPI */ 145#else /* !CONFIG_ACPI */
146 146
147#define acpi_disabled 1
148#define acpi_lapic 0 147#define acpi_lapic 0
149#define acpi_ioapic 0 148#define acpi_ioapic 0
150static inline void acpi_noirq_set(void) { } 149static inline void acpi_noirq_set(void) { }
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 9825cd64c9b6..eec2a70d4376 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -22,10 +22,6 @@
22 */ 22 */
23#define flush_agp_cache() wbinvd() 23#define flush_agp_cache() wbinvd()
24 24
25/* Convert a physical address to an address suitable for the GART. */
26#define phys_to_gart(x) (x)
27#define gart_to_phys(x) (x)
28
29/* GATT allocation. Returns/accepts GATT kernel virtual address. */ 25/* GATT allocation. Returns/accepts GATT kernel virtual address. */
30#define alloc_gatt_pages(order) \ 26#define alloc_gatt_pages(order) \
31 ((char *)__get_free_pages(GFP_KERNEL, (order))) 27 ((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1a37bcdc8606..c240efc74e00 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -73,8 +73,6 @@ static inline void alternatives_smp_module_del(struct module *mod) {}
73static inline void alternatives_smp_switch(int smp) {} 73static inline void alternatives_smp_switch(int smp) {}
74#endif /* CONFIG_SMP */ 74#endif /* CONFIG_SMP */
75 75
76const unsigned char *const *find_nop_table(void);
77
78/* alternative assembly primitive: */ 76/* alternative assembly primitive: */
79#define ALTERNATIVE(oldinstr, newinstr, feature) \ 77#define ALTERNATIVE(oldinstr, newinstr, feature) \
80 \ 78 \
@@ -144,8 +142,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
144#define __parainstructions_end NULL 142#define __parainstructions_end NULL
145#endif 143#endif
146 144
147extern void add_nops(void *insns, unsigned int len);
148
149/* 145/*
150 * Clear and restore the kernel write-protection flag on the local CPU. 146 * Clear and restore the kernel write-protection flag on the local CPU.
151 * Allows the kernel to edit read-only pages. 147 * Allows the kernel to edit read-only pages.
@@ -161,10 +157,7 @@ extern void add_nops(void *insns, unsigned int len);
161 * Intel's errata. 157 * Intel's errata.
162 * On the local CPU you need to be protected again NMI or MCE handlers seeing an 158 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
163 * inconsistent instruction while you patch. 159 * inconsistent instruction while you patch.
164 * The _early version expects the memory to already be RW.
165 */ 160 */
166
167extern void *text_poke(void *addr, const void *opcode, size_t len); 161extern void *text_poke(void *addr, const void *opcode, size_t len);
168extern void *text_poke_early(void *addr, const void *opcode, size_t len);
169 162
170#endif /* _ASM_X86_ALTERNATIVE_H */ 163#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index bdf96f119f06..ac95995b7bad 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -25,6 +25,7 @@
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void); 26extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void); 27extern int amd_iommu_init_dma_ops(void);
28extern int amd_iommu_init_passthrough(void);
28extern void amd_iommu_detect(void); 29extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 30extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30extern void amd_iommu_flush_all_domains(void); 31extern void amd_iommu_flush_all_domains(void);
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 0c878caaa0a2..2a2cc7a78a81 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -143,22 +143,29 @@
143#define EVT_BUFFER_SIZE 8192 /* 512 entries */ 143#define EVT_BUFFER_SIZE 8192 /* 512 entries */
144#define EVT_LEN_MASK (0x9ULL << 56) 144#define EVT_LEN_MASK (0x9ULL << 56)
145 145
146#define PAGE_MODE_NONE 0x00
146#define PAGE_MODE_1_LEVEL 0x01 147#define PAGE_MODE_1_LEVEL 0x01
147#define PAGE_MODE_2_LEVEL 0x02 148#define PAGE_MODE_2_LEVEL 0x02
148#define PAGE_MODE_3_LEVEL 0x03 149#define PAGE_MODE_3_LEVEL 0x03
149 150#define PAGE_MODE_4_LEVEL 0x04
150#define IOMMU_PDE_NL_0 0x000ULL 151#define PAGE_MODE_5_LEVEL 0x05
151#define IOMMU_PDE_NL_1 0x200ULL 152#define PAGE_MODE_6_LEVEL 0x06
152#define IOMMU_PDE_NL_2 0x400ULL 153
153#define IOMMU_PDE_NL_3 0x600ULL 154#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
154 155#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
155#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) 156 ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
156#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) 157 (0xffffffffffffffffULL))
157#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) 158#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
158 159#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
159#define IOMMU_MAP_SIZE_L1 (1ULL << 21) 160#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
160#define IOMMU_MAP_SIZE_L2 (1ULL << 30) 161 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
161#define IOMMU_MAP_SIZE_L3 (1ULL << 39) 162#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
163
164#define PM_MAP_4k 0
165#define PM_ADDR_MASK 0x000ffffffffff000ULL
166#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
167 (~((1ULL << (12 + ((lvl) * 9))) - 1)))
168#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
162 169
163#define IOMMU_PTE_P (1ULL << 0) 170#define IOMMU_PTE_P (1ULL << 0)
164#define IOMMU_PTE_TV (1ULL << 1) 171#define IOMMU_PTE_TV (1ULL << 1)
@@ -167,11 +174,6 @@
167#define IOMMU_PTE_IR (1ULL << 61) 174#define IOMMU_PTE_IR (1ULL << 61)
168#define IOMMU_PTE_IW (1ULL << 62) 175#define IOMMU_PTE_IW (1ULL << 62)
169 176
170#define IOMMU_L1_PDE(address) \
171 ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
172#define IOMMU_L2_PDE(address) \
173 ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
174
175#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 177#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
176#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 178#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
177#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 179#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -194,11 +196,14 @@
194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ 196#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops 197#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
196 domain for an IOMMU */ 198 domain for an IOMMU */
199#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
200 translation */
201
197extern bool amd_iommu_dump; 202extern bool amd_iommu_dump;
198#define DUMP_printk(format, arg...) \ 203#define DUMP_printk(format, arg...) \
199 do { \ 204 do { \
200 if (amd_iommu_dump) \ 205 if (amd_iommu_dump) \
201 printk(KERN_INFO "AMD IOMMU: " format, ## arg); \ 206 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
202 } while(0); 207 } while(0);
203 208
204/* 209/*
@@ -226,6 +231,7 @@ struct protection_domain {
226 int mode; /* paging mode (0-6 levels) */ 231 int mode; /* paging mode (0-6 levels) */
227 u64 *pt_root; /* page table root pointer */ 232 u64 *pt_root; /* page table root pointer */
228 unsigned long flags; /* flags to find out type of domain */ 233 unsigned long flags; /* flags to find out type of domain */
234 bool updated; /* complete domain flush required */
229 unsigned dev_cnt; /* devices assigned to this domain */ 235 unsigned dev_cnt; /* devices assigned to this domain */
230 void *priv; /* private data */ 236 void *priv; /* private data */
231}; 237};
@@ -337,6 +343,9 @@ struct amd_iommu {
337 /* if one, we need to send a completion wait command */ 343 /* if one, we need to send a completion wait command */
338 bool need_sync; 344 bool need_sync;
339 345
346 /* becomes true if a command buffer reset is running */
347 bool reset_in_progress;
348
340 /* default dma_ops domain for that IOMMU */ 349 /* default dma_ops domain for that IOMMU */
341 struct dma_ops_domain *default_dom; 350 struct dma_ops_domain *default_dom;
342}; 351};
@@ -457,4 +466,7 @@ static inline void amd_iommu_stats_init(void) { }
457 466
458#endif /* CONFIG_AMD_IOMMU_STATS */ 467#endif /* CONFIG_AMD_IOMMU_STATS */
459 468
469/* some function prototypes */
470extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
471
460#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ 472#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bb7d47925847..474d80d3e6cc 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -66,13 +66,23 @@ static inline void default_inquire_remote_apic(int apicid)
66} 66}
67 67
68/* 68/*
69 * With 82489DX we can't rely on apic feature bit
70 * retrieved via cpuid but still have to deal with
71 * such an apic chip so we assume that SMP configuration
72 * is found from MP table (64bit case uses ACPI mostly
73 * which set smp presence flag as well so we are safe
74 * to use this helper too).
75 */
76static inline bool apic_from_smp_config(void)
77{
78 return smp_found_config && !disable_apic;
79}
80
81/*
69 * Basic functions accessing APICs. 82 * Basic functions accessing APICs.
70 */ 83 */
71#ifdef CONFIG_PARAVIRT 84#ifdef CONFIG_PARAVIRT
72#include <asm/paravirt.h> 85#include <asm/paravirt.h>
73#else
74#define setup_boot_clock setup_boot_APIC_clock
75#define setup_secondary_clock setup_secondary_APIC_clock
76#endif 86#endif
77 87
78#ifdef CONFIG_X86_64 88#ifdef CONFIG_X86_64
@@ -183,6 +193,10 @@ static inline int x2apic_enabled(void)
183} 193}
184 194
185#define x2apic_supported() (cpu_has_x2apic) 195#define x2apic_supported() (cpu_has_x2apic)
196static inline void x2apic_force_phys(void)
197{
198 x2apic_phys = 1;
199}
186#else 200#else
187static inline void check_x2apic(void) 201static inline void check_x2apic(void)
188{ 202{
@@ -194,6 +208,9 @@ static inline int x2apic_enabled(void)
194{ 208{
195 return 0; 209 return 0;
196} 210}
211static inline void x2apic_force_phys(void)
212{
213}
197 214
198#define x2apic_preenabled 0 215#define x2apic_preenabled 0
199#define x2apic_supported() 0 216#define x2apic_supported() 0
@@ -245,6 +262,8 @@ static inline void lapic_shutdown(void) { }
245static inline void init_apic_mappings(void) { } 262static inline void init_apic_mappings(void) { }
246static inline void disable_local_APIC(void) { } 263static inline void disable_local_APIC(void) { }
247static inline void apic_disable(void) { } 264static inline void apic_disable(void) { }
265# define setup_boot_APIC_clock x86_init_noop
266# define setup_secondary_APIC_clock x86_init_noop
248#endif /* !CONFIG_X86_LOCAL_APIC */ 267#endif /* !CONFIG_X86_LOCAL_APIC */
249 268
250#ifdef CONFIG_X86_64 269#ifdef CONFIG_X86_64
@@ -293,7 +312,7 @@ struct apic {
293 int (*cpu_present_to_apicid)(int mps_cpu); 312 int (*cpu_present_to_apicid)(int mps_cpu);
294 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); 313 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
295 void (*setup_portio_remap)(void); 314 void (*setup_portio_remap)(void);
296 int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); 315 int (*check_phys_apicid_present)(int phys_apicid);
297 void (*enable_apic_mode)(void); 316 void (*enable_apic_mode)(void);
298 int (*phys_pkg_id)(int cpuid_apic, int index_msb); 317 int (*phys_pkg_id)(int cpuid_apic, int index_msb);
299 318
@@ -427,7 +446,7 @@ extern struct apic apic_x2apic_uv_x;
427DECLARE_PER_CPU(int, x2apic_extra_bits); 446DECLARE_PER_CPU(int, x2apic_extra_bits);
428 447
429extern int default_cpu_present_to_apicid(int mps_cpu); 448extern int default_cpu_present_to_apicid(int mps_cpu);
430extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); 449extern int default_check_phys_apicid_present(int phys_apicid);
431#endif 450#endif
432 451
433static inline void default_wait_for_init_deassert(atomic_t *deassert) 452static inline void default_wait_for_init_deassert(atomic_t *deassert)
@@ -543,9 +562,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu)
543} 562}
544 563
545static inline int 564static inline int
546__default_check_phys_apicid_present(int boot_cpu_physical_apicid) 565__default_check_phys_apicid_present(int phys_apicid)
547{ 566{
548 return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); 567 return physid_isset(phys_apicid, phys_cpu_present_map);
549} 568}
550 569
551#ifdef CONFIG_X86_32 570#ifdef CONFIG_X86_32
@@ -555,13 +574,13 @@ static inline int default_cpu_present_to_apicid(int mps_cpu)
555} 574}
556 575
557static inline int 576static inline int
558default_check_phys_apicid_present(int boot_cpu_physical_apicid) 577default_check_phys_apicid_present(int phys_apicid)
559{ 578{
560 return __default_check_phys_apicid_present(boot_cpu_physical_apicid); 579 return __default_check_phys_apicid_present(phys_apicid);
561} 580}
562#else 581#else
563extern int default_cpu_present_to_apicid(int mps_cpu); 582extern int default_cpu_present_to_apicid(int mps_cpu);
564extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); 583extern int default_check_phys_apicid_present(int phys_apicid);
565#endif 584#endif
566 585
567static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) 586static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7ddb36ab933b..3b62da926de9 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -8,12 +8,14 @@
8 * Ingo Molnar <mingo@redhat.com>, 1999, 2000 8 * Ingo Molnar <mingo@redhat.com>, 1999, 2000
9 */ 9 */
10 10
11#define APIC_DEFAULT_PHYS_BASE 0xfee00000 11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
12#define APIC_DEFAULT_PHYS_BASE 0xfee00000
12 13
13#define APIC_ID 0x20 14#define APIC_ID 0x20
14 15
15#define APIC_LVR 0x30 16#define APIC_LVR 0x30
16#define APIC_LVR_MASK 0xFF00FF 17#define APIC_LVR_MASK 0xFF00FF
18#define APIC_LVR_DIRECTED_EOI (1 << 24)
17#define GET_APIC_VERSION(x) ((x) & 0xFFu) 19#define GET_APIC_VERSION(x) ((x) & 0xFFu)
18#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) 20#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
19#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
@@ -40,6 +42,7 @@
40#define APIC_DFR_CLUSTER 0x0FFFFFFFul 42#define APIC_DFR_CLUSTER 0x0FFFFFFFul
41#define APIC_DFR_FLAT 0xFFFFFFFFul 43#define APIC_DFR_FLAT 0xFFFFFFFFul
42#define APIC_SPIV 0xF0 44#define APIC_SPIV 0xF0
45#define APIC_SPIV_DIRECTED_EOI (1 << 12)
43#define APIC_SPIV_FOCUS_DISABLED (1 << 9) 46#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
44#define APIC_SPIV_APIC_ENABLED (1 << 8) 47#define APIC_SPIV_APIC_ENABLED (1 << 8)
45#define APIC_ISR 0x100 48#define APIC_ISR 0x100
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 56be78f582f0..b3ed1e1460ff 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,7 +3,7 @@
3 3
4#ifdef __ASSEMBLY__ 4#ifdef __ASSEMBLY__
5# define __ASM_FORM(x) x 5# define __ASM_FORM(x) x
6# define __ASM_EX_SEC .section __ex_table 6# define __ASM_EX_SEC .section __ex_table, "a"
7#else 7#else
8# define __ASM_FORM(x) " " #x " " 8# define __ASM_FORM(x) " " #x " "
9# define __ASM_EX_SEC " .section __ex_table,\"a\"\n" 9# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
@@ -38,10 +38,18 @@
38#define _ASM_DI __ASM_REG(di) 38#define _ASM_DI __ASM_REG(di)
39 39
40/* Exception table entry */ 40/* Exception table entry */
41#ifdef __ASSEMBLY__
42# define _ASM_EXTABLE(from,to) \
43 __ASM_EX_SEC ; \
44 _ASM_ALIGN ; \
45 _ASM_PTR from , to ; \
46 .previous
47#else
41# define _ASM_EXTABLE(from,to) \ 48# define _ASM_EXTABLE(from,to) \
42 __ASM_EX_SEC \ 49 __ASM_EX_SEC \
43 _ASM_ALIGN "\n" \ 50 _ASM_ALIGN "\n" \
44 _ASM_PTR #from "," #to "\n" \ 51 _ASM_PTR #from "," #to "\n" \
45 " .previous\n" 52 " .previous\n"
53#endif
46 54
47#endif /* _ASM_X86_ASM_H */ 55#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 1724e8de317c..6be33d83c716 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -85,7 +85,8 @@ struct efi_info {
85struct boot_params { 85struct boot_params {
86 struct screen_info screen_info; /* 0x000 */ 86 struct screen_info screen_info; /* 0x000 */
87 struct apm_bios_info apm_bios_info; /* 0x040 */ 87 struct apm_bios_info apm_bios_info; /* 0x040 */
88 __u8 _pad2[12]; /* 0x054 */ 88 __u8 _pad2[4]; /* 0x054 */
89 __u64 tboot_addr; /* 0x058 */
89 struct ist_info ist_info; /* 0x060 */ 90 struct ist_info ist_info; /* 0x060 */
90 __u8 _pad3[16]; /* 0x070 */ 91 __u8 _pad3[16]; /* 0x070 */
91 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 92 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
@@ -109,4 +110,14 @@ struct boot_params {
109 __u8 _pad9[276]; /* 0xeec */ 110 __u8 _pad9[276]; /* 0xeec */
110} __attribute__((packed)); 111} __attribute__((packed));
111 112
113enum {
114 X86_SUBARCH_PC = 0,
115 X86_SUBARCH_LGUEST,
116 X86_SUBARCH_XEN,
117 X86_SUBARCH_MRST,
118 X86_NR_SUBARCHS,
119};
120
121
122
112#endif /* _ASM_X86_BOOTPARAM_H */ 123#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 5d367caa0e36..549860d3be8f 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_CACHE_H 1#ifndef _ASM_X86_CACHE_H
2#define _ASM_X86_CACHE_H 2#define _ASM_X86_CACHE_H
3 3
4#include <linux/linkage.h>
5
4/* L1 cache line size */ 6/* L1 cache line size */
5#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) 7#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
6#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) 8#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
@@ -13,7 +15,7 @@
13#ifdef CONFIG_SMP 15#ifdef CONFIG_SMP
14#define __cacheline_aligned_in_smp \ 16#define __cacheline_aligned_in_smp \
15 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ 17 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
16 __attribute__((__section__(".data.page_aligned"))) 18 __page_aligned_data
17#endif 19#endif
18#endif 20#endif
19 21
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index e55dfc1ad453..b54f6afe7ec4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
43 memcpy(dst, src, len); 43 memcpy(dst, src, len);
44} 44}
45 45
46#define PG_non_WB PG_arch_1 46#define PG_WC PG_arch_1
47PAGEFLAG(NonWB, non_WB) 47PAGEFLAG(WC, WC)
48
49#ifdef CONFIG_X86_PAT
50/*
51 * X86 PAT uses page flags WC and Uncached together to keep track of
52 * memory type of pages that have backing page struct. X86 PAT supports 3
53 * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and
54 * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
55 * been changed from its default (value of -1 used to denote this).
56 * Note we do not support _PAGE_CACHE_UC here.
57 *
58 * Caller must hold memtype_lock for atomicity.
59 */
60static inline unsigned long get_page_memtype(struct page *pg)
61{
62 if (!PageUncached(pg) && !PageWC(pg))
63 return -1;
64 else if (!PageUncached(pg) && PageWC(pg))
65 return _PAGE_CACHE_WC;
66 else if (PageUncached(pg) && !PageWC(pg))
67 return _PAGE_CACHE_UC_MINUS;
68 else
69 return _PAGE_CACHE_WB;
70}
71
72static inline void set_page_memtype(struct page *pg, unsigned long memtype)
73{
74 switch (memtype) {
75 case _PAGE_CACHE_WC:
76 ClearPageUncached(pg);
77 SetPageWC(pg);
78 break;
79 case _PAGE_CACHE_UC_MINUS:
80 SetPageUncached(pg);
81 ClearPageWC(pg);
82 break;
83 case _PAGE_CACHE_WB:
84 SetPageUncached(pg);
85 SetPageWC(pg);
86 break;
87 default:
88 case -1:
89 ClearPageUncached(pg);
90 ClearPageWC(pg);
91 break;
92 }
93}
94#else
95static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
96static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
97#endif
48 98
49/* 99/*
50 * The set_memory_* API can be used to change various attributes of a virtual 100 * The set_memory_* API can be used to change various attributes of a virtual
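
The cacheflush.h hunk above encodes a page's PAT memory type in two page flags, giving four states from two bits. A minimal userspace sketch of the same truth table, assuming made-up stand-in values for the _PAGE_CACHE_* constants and two plain booleans in place of the PageUncached/PageWC flag bits:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's _PAGE_CACHE_* values. */
enum memtype { CACHE_NONE = -1, CACHE_WB = 0, CACHE_WC = 1, CACHE_UC_MINUS = 2 };

/* Two booleans standing in for the PageUncached / PageWC flag bits. */
struct fake_page { bool uncached, wc; };

/* Same truth table as get_page_memtype() in the hunk above. */
static enum memtype get_memtype(const struct fake_page *pg)
{
	if (!pg->uncached && !pg->wc)
		return CACHE_NONE;		/* memtype not set yet */
	if (!pg->uncached && pg->wc)
		return CACHE_WC;
	if (pg->uncached && !pg->wc)
		return CACHE_UC_MINUS;
	return CACHE_WB;			/* both bits set */
}

/* Same switch as set_page_memtype() in the hunk above. */
static void set_memtype(struct fake_page *pg, enum memtype mt)
{
	switch (mt) {
	case CACHE_WC:       pg->uncached = false; pg->wc = true;  break;
	case CACHE_UC_MINUS: pg->uncached = true;  pg->wc = false; break;
	case CACHE_WB:       pg->uncached = true;  pg->wc = true;  break;
	default:             pg->uncached = false; pg->wc = false; break;
	}
}

int main(void)
{
	struct fake_page pg = { false, false };
	enum memtype all[] = { CACHE_NONE, CACHE_WB, CACHE_WC, CACHE_UC_MINUS };

	for (size_t i = 0; i < sizeof(all) / sizeof(all[0]); i++) {
		set_memtype(&pg, all[i]);
		assert(get_memtype(&pg) == all[i]);	/* encoding round-trips */
	}
	puts("all four states round-trip through the two flag bits");
	return 0;
}

The fourth state (both bits clear) is what lets -1 mean "memtype not set yet" without spending a third page flag.
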
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4a28d22d4793..9cfc88b97742 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -95,6 +95,8 @@
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ 96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ 97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
98#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */
99#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
98 100
99/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 101/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
100#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 102#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index c68c361697e1..4d447b732d82 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task);
11 11
12static __always_inline struct task_struct *get_current(void) 12static __always_inline struct task_struct *get_current(void)
13{ 13{
14 return percpu_read(current_task); 14 return percpu_read_stable(current_task);
15} 15}
16 16
17#define current get_current() 17#define current get_current()
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index c993e9e0fed4..e8de2f6f5ca5 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -291,11 +291,24 @@ static inline unsigned long get_desc_base(const struct desc_struct *desc)
291 return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); 291 return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
292} 292}
293 293
294static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
295{
296 desc->base0 = base & 0xffff;
297 desc->base1 = (base >> 16) & 0xff;
298 desc->base2 = (base >> 24) & 0xff;
299}
300
294static inline unsigned long get_desc_limit(const struct desc_struct *desc) 301static inline unsigned long get_desc_limit(const struct desc_struct *desc)
295{ 302{
296 return desc->limit0 | (desc->limit << 16); 303 return desc->limit0 | (desc->limit << 16);
297} 304}
298 305
306static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
307{
308 desc->limit0 = limit & 0xffff;
309 desc->limit = (limit >> 16) & 0xf;
310}
311
299static inline void _set_gate(int gate, unsigned type, void *addr, 312static inline void _set_gate(int gate, unsigned type, void *addr,
300 unsigned dpl, unsigned ist, unsigned seg) 313 unsigned dpl, unsigned ist, unsigned seg)
301{ 314{
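
set_desc_base() and set_desc_limit() above scatter a 32-bit base and a 20-bit limit across the split descriptor fields, mirroring the existing get_desc_base()/get_desc_limit(). A round-trip sketch of that packing, using a simplified struct rather than the kernel's real desc_struct layout:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the split fields of desc_struct. */
struct desc {
	uint16_t base0, limit0;
	uint8_t  base1, base2;
	uint8_t  limit;			/* only the low 4 bits are used */
};

static void set_base(struct desc *d, uint32_t base)
{
	d->base0 = base & 0xffff;
	d->base1 = (base >> 16) & 0xff;
	d->base2 = (base >> 24) & 0xff;
}

static uint32_t get_base(const struct desc *d)
{
	return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

static void set_limit(struct desc *d, uint32_t limit)
{
	d->limit0 = limit & 0xffff;
	d->limit  = (limit >> 16) & 0xf;
}

static uint32_t get_limit(const struct desc *d)
{
	return d->limit0 | ((uint32_t)d->limit << 16);
}

int main(void)
{
	struct desc d = { 0 };

	set_base(&d, 0xdeadbeef);
	set_limit(&d, 0xfffff);		/* 20-bit limit, 4 GiB with G=1 */
	assert(get_base(&d) == 0xdeadbeef);
	assert(get_limit(&d) == 0xfffff);
	printf("base=%#x limit=%#x\n",
	       (unsigned)get_base(&d), (unsigned)get_limit(&d));
	return 0;
}
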
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index a6adefa28b94..9d6684849fd9 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -34,6 +34,12 @@ struct desc_struct {
34 }; 34 };
35} __attribute__((packed)); 35} __attribute__((packed));
36 36
37#define GDT_ENTRY_INIT(flags, base, limit) { { { \
38 .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \
39 .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \
40 ((limit) & 0xf0000) | ((base) & 0xff000000), \
41 } } }
42
37enum { 43enum {
38 GATE_INTERRUPT = 0xE, 44 GATE_INTERRUPT = 0xE,
39 GATE_TRAP = 0xF, 45 GATE_TRAP = 0xF,
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 4994a20acbcb..cee34e9ca45b 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -13,4 +13,7 @@ struct dma_map_ops *dma_ops;
13#endif 13#endif
14}; 14};
15 15
16struct pdev_archdata {
17};
18
16#endif /* _ASM_X86_DEVICE_H */ 19#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1c3f9435f1c9..0ee770d23d0e 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -55,6 +55,24 @@ extern int dma_set_mask(struct device *dev, u64 mask);
55extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, 55extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
56 dma_addr_t *dma_addr, gfp_t flag); 56 dma_addr_t *dma_addr, gfp_t flag);
57 57
58static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
59{
60 if (!dev->dma_mask)
61 return 0;
62
63 return addr + size <= *dev->dma_mask;
64}
65
66static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
67{
68 return paddr;
69}
70
71static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
72{
73 return daddr;
74}
75
58static inline void 76static inline void
59dma_cache_sync(struct device *dev, void *vaddr, size_t size, 77dma_cache_sync(struct device *dev, void *vaddr, size_t size,
60 enum dma_data_direction dir) 78 enum dma_data_direction dir)
diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h
deleted file mode 100644
index 23ecda0b28a0..000000000000
--- a/arch/x86/include/asm/do_timer.h
+++ /dev/null
@@ -1,16 +0,0 @@
1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
4#include <asm/i8259.h>
5#include <asm/i8253.h>
6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 *
10 * Call the pit clock event handler. see asm/i8253.h
11 **/
12
13static inline void do_timer_interrupt_hook(void)
14{
15 global_clock_event->event_handler(global_clock_event);
16}
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 3afc5e87cfdd..ae6253ab9029 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -87,9 +87,25 @@
87 CFI_RESTORE \reg 87 CFI_RESTORE \reg
88 .endm 88 .endm
89#else /*!CONFIG_X86_64*/ 89#else /*!CONFIG_X86_64*/
90 .macro pushl_cfi reg
91 pushl \reg
92 CFI_ADJUST_CFA_OFFSET 4
93 .endm
90 94
91 /* 32bit defenitions are missed yet */ 95 .macro popl_cfi reg
96 popl \reg
97 CFI_ADJUST_CFA_OFFSET -4
98 .endm
92 99
100 .macro movl_cfi reg offset=0
101 movl %\reg, \offset(%esp)
102 CFI_REL_OFFSET \reg, \offset
103 .endm
104
105 .macro movl_cfi_restore offset reg
106 movl \offset(%esp), %\reg
107 CFI_RESTORE \reg
108 .endm
93#endif /*!CONFIG_X86_64*/ 109#endif /*!CONFIG_X86_64*/
94#endif /*__ASSEMBLY__*/ 110#endif /*__ASSEMBLY__*/
95 111
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 7ecba4d85089..40b4e614fe71 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -126,8 +126,6 @@ extern void e820_reserve_resources(void);
126extern void e820_reserve_resources_late(void); 126extern void e820_reserve_resources_late(void);
127extern void setup_memory_map(void); 127extern void setup_memory_map(void);
128extern char *default_machine_specific_memory_setup(void); 128extern char *default_machine_specific_memory_setup(void);
129extern char *machine_specific_memory_setup(void);
130extern char *memory_setup(void);
131#endif /* __KERNEL__ */ 129#endif /* __KERNEL__ */
132#endif /* __ASSEMBLY__ */ 130#endif /* __ASSEMBLY__ */
133 131
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 83c1bc8d2e8a..456a304b8172 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -299,6 +299,8 @@ do { \
299 299
300#ifdef CONFIG_X86_32 300#ifdef CONFIG_X86_32
301 301
302#define STACK_RND_MASK (0x7ff)
303
302#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) 304#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
303 305
304#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) 306#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index ff8cbfa07851..f5693c81a1db 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
49BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 49BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
51 51
52#ifdef CONFIG_PERF_COUNTERS 52#ifdef CONFIG_PERF_EVENTS
53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) 53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
54#endif 54#endif
55 55
@@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
61BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) 61BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
62#endif 62#endif
63 63
64#ifdef CONFIG_X86_NEW_MCE 64#ifdef CONFIG_X86_MCE
65BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) 65BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
66#endif 66#endif
67 67
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 7b2d71df39a6..14f9890eb495 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -132,6 +132,9 @@ enum fixed_addresses {
132#ifdef CONFIG_X86_32 132#ifdef CONFIG_X86_32
133 FIX_WP_TEST, 133 FIX_WP_TEST,
134#endif 134#endif
135#ifdef CONFIG_INTEL_TXT
136 FIX_TBOOT_BASE,
137#endif
135 __end_of_fixed_addresses 138 __end_of_fixed_addresses
136}; 139};
137 140
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index bd2c6511c887..db24c2278be0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,13 +28,6 @@
28 28
29#endif 29#endif
30 30
31/* FIXME: I don't want to stay hardcoded */
32#ifdef CONFIG_X86_64
33# define FTRACE_SYSCALL_MAX 296
34#else
35# define FTRACE_SYSCALL_MAX 333
36#endif
37
38#ifdef CONFIG_FUNCTION_TRACER 31#ifdef CONFIG_FUNCTION_TRACER
39#define MCOUNT_ADDR ((long)(mcount)) 32#define MCOUNT_ADDR ((long)(mcount))
40#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 369f5c5d09a1..b78c0941e422 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,7 +20,7 @@
20#ifndef ASM_X86__HYPERVISOR_H 20#ifndef ASM_X86__HYPERVISOR_H
21#define ASM_X86__HYPERVISOR_H 21#define ASM_X86__HYPERVISOR_H
22 22
23extern unsigned long get_hypervisor_tsc_freq(void);
24extern void init_hypervisor(struct cpuinfo_x86 *c); 23extern void init_hypervisor(struct cpuinfo_x86 *c);
24extern void init_hypervisor_platform(void);
25 25
26#endif 26#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 175adf58dd4f..0b20bbb758f2 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -26,6 +26,7 @@ extern void fpu_init(void);
26extern void mxcsr_feature_mask_init(void); 26extern void mxcsr_feature_mask_init(void);
27extern int init_fpu(struct task_struct *child); 27extern int init_fpu(struct task_struct *child);
28extern asmlinkage void math_state_restore(void); 28extern asmlinkage void math_state_restore(void);
29extern void __math_state_restore(void);
29extern void init_thread_xstate(void); 30extern void init_thread_xstate(void);
30extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 31extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
31 32
@@ -301,6 +302,14 @@ static inline void kernel_fpu_end(void)
301 preempt_enable(); 302 preempt_enable();
302} 303}
303 304
305static inline bool irq_fpu_usable(void)
306{
307 struct pt_regs *regs;
308
309 return !in_interrupt() || !(regs = get_irq_regs()) || \
310 user_mode(regs) || (read_cr0() & X86_CR0_TS);
311}
312
304/* 313/*
305 * Some instructions like VIA's padlock instructions generate a spurious 314 * Some instructions like VIA's padlock instructions generate a spurious
306 * DNA fault but don't modify SSE registers. And these instructions 315 * DNA fault but don't modify SSE registers. And these instructions
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 330ee807f89e..7c7c16cde1f8 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,6 +143,8 @@ extern int noioapicreroute;
143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
144extern int timer_through_8259; 144extern int timer_through_8259;
145 145
146extern void io_apic_disable_legacy(void);
147
146/* 148/*
147 * If we use the IO-APIC for IRQ routing, disable automatic 149 * If we use the IO-APIC for IRQ routing, disable automatic
148 * assignment of PCI IRQ's. 150 * assignment of PCI IRQ's.
@@ -150,11 +152,10 @@ extern int timer_through_8259;
150#define io_apic_assign_pci_irqs \ 152#define io_apic_assign_pci_irqs \
151 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) 153 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
152 154
153#ifdef CONFIG_ACPI 155extern u8 io_apic_unique_id(u8 id);
154extern int io_apic_get_unique_id(int ioapic, int apic_id); 156extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic); 157extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic); 158extern int io_apic_get_redir_entries(int ioapic);
157#endif /* CONFIG_ACPI */
158 159
159struct io_apic_irq_attr; 160struct io_apic_irq_attr;
160extern int io_apic_set_pci_routing(struct device *dev, int irq, 161extern int io_apic_set_pci_routing(struct device *dev, int irq,
@@ -177,13 +178,26 @@ extern int setup_ioapic_entry(int apic, int irq,
177 int polarity, int vector, int pin); 178 int polarity, int vector, int pin);
178extern void ioapic_write_entry(int apic, int pin, 179extern void ioapic_write_entry(int apic, int pin,
179 struct IO_APIC_route_entry e); 180 struct IO_APIC_route_entry e);
181extern void setup_ioapic_ids_from_mpc(void);
182
183struct mp_ioapic_gsi{
184 int gsi_base;
185 int gsi_end;
186};
187extern struct mp_ioapic_gsi mp_gsi_routing[];
188int mp_find_ioapic(int gsi);
189int mp_find_ioapic_pin(int ioapic, int gsi);
190void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
191
180#else /* !CONFIG_X86_IO_APIC */ 192#else /* !CONFIG_X86_IO_APIC */
193
181#define io_apic_assign_pci_irqs 0 194#define io_apic_assign_pci_irqs 0
195#define setup_ioapic_ids_from_mpc x86_init_noop
182static const int timer_through_8259 = 0; 196static const int timer_through_8259 = 0;
183static inline void ioapic_init_mappings(void) { } 197static inline void ioapic_init_mappings(void) { }
184static inline void ioapic_insert_resources(void) { } 198static inline void ioapic_insert_resources(void) { }
185
186static inline void probe_nr_irqs_gsi(void) { } 199static inline void probe_nr_irqs_gsi(void) { }
200
187#endif 201#endif
188 202
189#endif /* _ASM_X86_IO_APIC_H */ 203#endif /* _ASM_X86_IO_APIC_H */
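
The new struct mp_ioapic_gsi records the GSI range each IO-APIC serves, and mp_find_ioapic()/mp_find_ioapic_pin() resolve a GSI against that table. A hypothetical, self-contained sketch of that lookup; the table contents and the inclusive-range convention here are illustrative, not taken from the kernel:

#include <stdio.h>

struct mp_ioapic_gsi { int gsi_base; int gsi_end; };

/* Hypothetical routing table: two IO-APICs covering GSIs 0-23 and 24-55. */
static const struct mp_ioapic_gsi gsi_routing[] = {
	{ .gsi_base = 0,  .gsi_end = 23 },
	{ .gsi_base = 24, .gsi_end = 55 },
};

/* Return the index of the IO-APIC serving @gsi, or -1 if none does. */
static int find_ioapic(int gsi)
{
	int n = (int)(sizeof(gsi_routing) / sizeof(gsi_routing[0]));

	for (int i = 0; i < n; i++) {
		if (gsi >= gsi_routing[i].gsi_base && gsi <= gsi_routing[i].gsi_end)
			return i;
	}
	return -1;
}

/* The pin within that IO-APIC is the offset into its GSI range. */
static int find_ioapic_pin(int ioapic, int gsi)
{
	return gsi - gsi_routing[ioapic].gsi_base;
}

int main(void)
{
	int gsi = 40;
	int apic = find_ioapic(gsi);

	printf("GSI %d -> IO-APIC %d pin %d\n", gsi, apic, find_ioapic_pin(apic, gsi));
	return 0;
}
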
diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h
index 0d5b23b7b06e..ec34c760665e 100644
--- a/arch/x86/include/asm/ioctls.h
+++ b/arch/x86/include/asm/ioctls.h
@@ -1,94 +1 @@
1#ifndef _ASM_X86_IOCTLS_H #include <asm-generic/ioctls.h>
2#define _ASM_X86_IOCTLS_H
3
4#include <asm/ioctl.h>
5
6/* 0x54 is just a magic number to make these relatively unique ('T') */
7
8#define TCGETS 0x5401
9#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */
10#define TCSETSW 0x5403
11#define TCSETSF 0x5404
12#define TCGETA 0x5405
13#define TCSETA 0x5406
14#define TCSETAW 0x5407
15#define TCSETAF 0x5408
16#define TCSBRK 0x5409
17#define TCXONC 0x540A
18#define TCFLSH 0x540B
19#define TIOCEXCL 0x540C
20#define TIOCNXCL 0x540D
21#define TIOCSCTTY 0x540E
22#define TIOCGPGRP 0x540F
23#define TIOCSPGRP 0x5410
24#define TIOCOUTQ 0x5411
25#define TIOCSTI 0x5412
26#define TIOCGWINSZ 0x5413
27#define TIOCSWINSZ 0x5414
28#define TIOCMGET 0x5415
29#define TIOCMBIS 0x5416
30#define TIOCMBIC 0x5417
31#define TIOCMSET 0x5418
32#define TIOCGSOFTCAR 0x5419
33#define TIOCSSOFTCAR 0x541A
34#define FIONREAD 0x541B
35#define TIOCINQ FIONREAD
36#define TIOCLINUX 0x541C
37#define TIOCCONS 0x541D
38#define TIOCGSERIAL 0x541E
39#define TIOCSSERIAL 0x541F
40#define TIOCPKT 0x5420
41#define FIONBIO 0x5421
42#define TIOCNOTTY 0x5422
43#define TIOCSETD 0x5423
44#define TIOCGETD 0x5424
45#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */
46/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */
47#define TIOCSBRK 0x5427 /* BSD compatibility */
48#define TIOCCBRK 0x5428 /* BSD compatibility */
49#define TIOCGSID 0x5429 /* Return the session ID of FD */
50#define TCGETS2 _IOR('T', 0x2A, struct termios2)
51#define TCSETS2 _IOW('T', 0x2B, struct termios2)
52#define TCSETSW2 _IOW('T', 0x2C, struct termios2)
53#define TCSETSF2 _IOW('T', 0x2D, struct termios2)
54#define TIOCGRS485 0x542E
55#define TIOCSRS485 0x542F
56#define TIOCGPTN _IOR('T', 0x30, unsigned int)
57 /* Get Pty Number (of pty-mux device) */
58#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */
59#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */
60#define TCSETX 0x5433
61#define TCSETXF 0x5434
62#define TCSETXW 0x5435
63
64#define FIONCLEX 0x5450
65#define FIOCLEX 0x5451
66#define FIOASYNC 0x5452
67#define TIOCSERCONFIG 0x5453
68#define TIOCSERGWILD 0x5454
69#define TIOCSERSWILD 0x5455
70#define TIOCGLCKTRMIOS 0x5456
71#define TIOCSLCKTRMIOS 0x5457
72#define TIOCSERGSTRUCT 0x5458 /* For debugging only */
73#define TIOCSERGETLSR 0x5459 /* Get line status register */
74#define TIOCSERGETMULTI 0x545A /* Get multiport config */
75#define TIOCSERSETMULTI 0x545B /* Set multiport config */
76
77#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */
78#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */
79#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */
80#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */
81#define FIOQSIZE 0x5460
82
83/* Used for packet mode */
84#define TIOCPKT_DATA 0
85#define TIOCPKT_FLUSHREAD 1
86#define TIOCPKT_FLUSHWRITE 2
87#define TIOCPKT_STOP 4
88#define TIOCPKT_START 8
89#define TIOCPKT_NOSTOP 16
90#define TIOCPKT_DOSTOP 32
91
92#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */
93
94#endif /* _ASM_X86_IOCTLS_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 0e9fe1d9d971..f35eb45d6576 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -26,13 +26,16 @@
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
29int
30is_io_mapping_possible(resource_size_t base, unsigned long size);
31
32void * 29void *
33iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 30iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
34 31
35void 32void
36iounmap_atomic(void *kvaddr, enum km_type type); 33iounmap_atomic(void *kvaddr, enum km_type type);
37 34
35int
36iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
37
38void
39iomap_free(resource_size_t base, unsigned long size);
40
38#endif /* _ASM_X86_IOMAP_H */ 41#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h
index ee678fd51594..84c7e51cb6d0 100644
--- a/arch/x86/include/asm/ipcbuf.h
+++ b/arch/x86/include/asm/ipcbuf.h
@@ -1,28 +1 @@
1#ifndef _ASM_X86_IPCBUF_H #include <asm-generic/ipcbuf.h>
2#define _ASM_X86_IPCBUF_H
3
4/*
5 * The ipc64_perm structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space is left for:
10 * - 32-bit mode_t and seq
11 * - 2 miscellaneous 32-bit values
12 */
13
14struct ipc64_perm {
15 __kernel_key_t key;
16 __kernel_uid32_t uid;
17 __kernel_gid32_t gid;
18 __kernel_uid32_t cuid;
19 __kernel_gid32_t cgid;
20 __kernel_mode_t mode;
21 unsigned short __pad1;
22 unsigned short seq;
23 unsigned short __pad2;
24 unsigned long __unused1;
25 unsigned long __unused2;
26};
27
28#endif /* _ASM_X86_IPCBUF_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index f38481bcd455..ddda6cbed6f4 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -37,7 +37,6 @@ extern void fixup_irqs(void);
37#endif 37#endif
38 38
39extern void (*generic_interrupt_extension)(void); 39extern void (*generic_interrupt_extension)(void);
40extern void init_IRQ(void);
41extern void native_init_IRQ(void); 40extern void native_init_IRQ(void);
42extern bool handle_irq(unsigned irq, struct pt_regs *regs); 41extern bool handle_irq(unsigned irq, struct pt_regs *regs);
43 42
@@ -47,4 +46,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs);
47extern DECLARE_BITMAP(used_vectors, NR_VECTORS); 46extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
48extern int vector_used_by_percpu_irq(unsigned int vector); 47extern int vector_used_by_percpu_irq(unsigned int vector);
49 48
49extern void init_ISA_irqs(void);
50
50#endif /* _ASM_X86_IRQ_H */ 51#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c6ccbe7e81ad..9e2b952f810a 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -13,14 +13,13 @@ static inline unsigned long native_save_fl(void)
13 unsigned long flags; 13 unsigned long flags;
14 14
15 /* 15 /*
16 * Note: this needs to be "=r" not "=rm", because we have the 16 * "=rm" is safe here, because "pop" adjusts the stack before
17 * stack offset from what gcc expects at the time the "pop" is 17 * it evaluates its effective address -- this is part of the
18 * executed, and so a memory reference with respect to the stack 18 * documented behavior of the "pop" instruction.
19 * would end up using the wrong address.
20 */ 19 */
21 asm volatile("# __raw_save_flags\n\t" 20 asm volatile("# __raw_save_flags\n\t"
22 "pushf ; pop %0" 21 "pushf ; pop %0"
23 : "=r" (flags) 22 : "=rm" (flags)
24 : /* no input */ 23 : /* no input */
25 : "memory"); 24 : "memory");
26 25
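
The irqflags.h change relaxes the output constraint for the flags read from "=r" to "=rm", relying on pop adjusting the stack pointer before it evaluates a memory destination's address. A minimal userspace version of the same sequence, assuming an x86 target and GNU-style inline asm:

#include <stdio.h>

static unsigned long save_flags(void)
{
	unsigned long flags;

	/*
	 * pushf saves EFLAGS/RFLAGS on the stack; pop moves it into %0.
	 * "=rm" is fine here: even if the compiler picks a stack slot for
	 * %0, pop adjusts the stack pointer before it computes the
	 * destination's effective address.
	 */
	__asm__ __volatile__("pushf ; pop %0"
			     : "=rm" (flags)
			     : /* no input */
			     : "memory");
	return flags;
}

int main(void)
{
	printf("flags = %#lx\n", save_flags());
	return 0;
}
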
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 125be8b19568..4a5fe914dc59 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -17,6 +17,8 @@
17#define __KVM_HAVE_USER_NMI 17#define __KVM_HAVE_USER_NMI
18#define __KVM_HAVE_GUEST_DEBUG 18#define __KVM_HAVE_GUEST_DEBUG
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2
20 22
21/* Architectural interrupt line count. */ 23/* Architectural interrupt line count. */
22#define KVM_NR_INTERRUPTS 256 24#define KVM_NR_INTERRUPTS 256
@@ -236,6 +238,14 @@ struct kvm_pit_state {
236 struct kvm_pit_channel_state channels[3]; 238 struct kvm_pit_channel_state channels[3];
237}; 239};
238 240
241#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
242
243struct kvm_pit_state2 {
244 struct kvm_pit_channel_state channels[3];
245 __u32 flags;
246 __u32 reserved[9];
247};
248
239struct kvm_reinject_control { 249struct kvm_reinject_control {
240 __u8 pit_reinject; 250 __u8 pit_reinject;
241 __u8 reserved[31]; 251 __u8 reserved[31];
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..b7ed2c423116 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eabdc1cfab5c..3be000435fad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -14,6 +14,7 @@
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h>
17 18
18#include <linux/kvm.h> 19#include <linux/kvm.h>
19#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
@@ -37,12 +38,14 @@
37#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
38 0xFFFFFF0000000000ULL) 39 0xFFFFFF0000000000ULL)
39 40
40#define KVM_GUEST_CR0_MASK \ 41#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
41 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ 42 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
42 | X86_CR0_NW | X86_CR0_CD) 43#define KVM_GUEST_CR0_MASK \
44 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
46 (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
43#define KVM_VM_CR0_ALWAYS_ON \ 47#define KVM_VM_CR0_ALWAYS_ON \
44 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ 48 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45 | X86_CR0_MP)
46#define KVM_GUEST_CR4_MASK \ 49#define KVM_GUEST_CR4_MASK \
47 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) 50 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
48#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 51#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -51,12 +54,12 @@
51#define INVALID_PAGE (~(hpa_t)0) 54#define INVALID_PAGE (~(hpa_t)0)
52#define UNMAPPED_GVA (~(gpa_t)0) 55#define UNMAPPED_GVA (~(gpa_t)0)
53 56
54/* shadow tables are PAE even on non-PAE hosts */ 57/* KVM Hugepage definitions for x86 */
55#define KVM_HPAGE_SHIFT 21 58#define KVM_NR_PAGE_SIZES 3
56#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) 59#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
57#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) 60#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
58 61#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
59#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) 62#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
60 63
61#define DE_VECTOR 0 64#define DE_VECTOR 0
62#define DB_VECTOR 1 65#define DB_VECTOR 1
@@ -120,6 +123,10 @@ enum kvm_reg {
120 NR_VCPU_REGS 123 NR_VCPU_REGS
121}; 124};
122 125
126enum kvm_reg_ex {
127 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
128};
129
123enum { 130enum {
124 VCPU_SREG_ES, 131 VCPU_SREG_ES,
125 VCPU_SREG_CS, 132 VCPU_SREG_CS,
@@ -131,7 +138,7 @@ enum {
131 VCPU_SREG_LDTR, 138 VCPU_SREG_LDTR,
132}; 139};
133 140
134#include <asm/kvm_x86_emulate.h> 141#include <asm/kvm_emulate.h>
135 142
136#define KVM_NR_MEM_OBJS 40 143#define KVM_NR_MEM_OBJS 40
137 144
@@ -308,7 +315,6 @@ struct kvm_vcpu_arch {
308 struct { 315 struct {
309 gfn_t gfn; /* presumed gfn during guest pte update */ 316 gfn_t gfn; /* presumed gfn during guest pte update */
310 pfn_t pfn; /* pfn corresponding to that gfn */ 317 pfn_t pfn; /* pfn corresponding to that gfn */
311 int largepage;
312 unsigned long mmu_seq; 318 unsigned long mmu_seq;
313 } update_pte; 319 } update_pte;
314 320
@@ -334,16 +340,6 @@ struct kvm_vcpu_arch {
334 u8 nr; 340 u8 nr;
335 } interrupt; 341 } interrupt;
336 342
337 struct {
338 int vm86_active;
339 u8 save_iopl;
340 struct kvm_save_segment {
341 u16 selector;
342 unsigned long base;
343 u32 limit;
344 u32 ar;
345 } tr, es, ds, fs, gs;
346 } rmode;
347 int halt_request; /* real mode on Intel only */ 343 int halt_request; /* real mode on Intel only */
348 344
349 int cpuid_nent; 345 int cpuid_nent;
@@ -366,13 +362,15 @@ struct kvm_vcpu_arch {
366 u32 pat; 362 u32 pat;
367 363
368 int switch_db_regs; 364 int switch_db_regs;
369 unsigned long host_db[KVM_NR_DB_REGS];
370 unsigned long host_dr6;
371 unsigned long host_dr7;
372 unsigned long db[KVM_NR_DB_REGS]; 365 unsigned long db[KVM_NR_DB_REGS];
373 unsigned long dr6; 366 unsigned long dr6;
374 unsigned long dr7; 367 unsigned long dr7;
375 unsigned long eff_db[KVM_NR_DB_REGS]; 368 unsigned long eff_db[KVM_NR_DB_REGS];
369
370 u64 mcg_cap;
371 u64 mcg_status;
372 u64 mcg_ctl;
373 u64 *mce_banks;
376}; 374};
377 375
378struct kvm_mem_alias { 376struct kvm_mem_alias {
@@ -409,6 +407,7 @@ struct kvm_arch{
409 407
410 struct page *ept_identity_pagetable; 408 struct page *ept_identity_pagetable;
411 bool ept_identity_pagetable_done; 409 bool ept_identity_pagetable_done;
410 gpa_t ept_identity_map_addr;
412 411
413 unsigned long irq_sources_bitmap; 412 unsigned long irq_sources_bitmap;
414 unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; 413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
@@ -526,6 +525,9 @@ struct kvm_x86_ops {
526 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 525 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
527 int (*get_tdp_level)(void); 526 int (*get_tdp_level)(void);
528 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 527 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
528 bool (*gb_page_enable)(void);
529
530 const struct trace_print_flags *exit_reasons_str;
529}; 531};
530 532
531extern struct kvm_x86_ops *kvm_x86_ops; 533extern struct kvm_x86_ops *kvm_x86_ops;
@@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
618void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
619void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
620 u32 error_code); 622 u32 error_code);
623bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
621 624
622int kvm_pic_set_irq(void *opaque, int irq, int level); 625int kvm_pic_set_irq(void *opaque, int irq, int level);
623 626
@@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
752 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 755 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
753} 756}
754 757
755#define MSR_IA32_TIME_STAMP_COUNTER 0x010
756
757#define TSS_IOPB_BASE_OFFSET 0x66 758#define TSS_IOPB_BASE_OFFSET 0x66
758#define TSS_BASE_SIZE 0x68 759#define TSS_BASE_SIZE 0x68
759#define TSS_IOPB_SIZE (65536 / 8) 760#define TSS_IOPB_SIZE (65536 / 8)
@@ -796,5 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
796int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 797int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
797int kvm_age_hva(struct kvm *kvm, unsigned long hva); 798int kvm_age_hva(struct kvm *kvm, unsigned long hva);
798int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 799int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
800int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
801int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
802int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
799 803
800#endif /* _ASM_X86_KVM_HOST_H */ 804#endif /* _ASM_X86_KVM_HOST_H */
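
The new KVM_HPAGE_SHIFT(x) family replaces the single hard-coded 2 MB huge-page size with a per-level formula. With the x86 PAGE_SHIFT of 12, levels 1-3 come out to 4 KiB, 2 MiB and 1 GiB; a quick standalone check of that arithmetic using the same macros:

#include <stdio.h>

#define PAGE_SHIFT 12			/* x86 base page: 4 KiB */
#define PAGE_SIZE (1UL << PAGE_SHIFT)

/* Same formulas as the kvm_host.h hunk above. */
#define KVM_NR_PAGE_SIZES 3
#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)

int main(void)
{
	for (int level = 1; level <= KVM_NR_PAGE_SIZES; level++)
		printf("level %d: shift %d, size %lu KiB, %lu base pages\n",
		       level,
		       KVM_HPAGE_SHIFT(level),
		       KVM_HPAGE_SIZE(level) >> 10,
		       KVM_PAGES_PER_HPAGE(level));
	return 0;
}

This prints 1, 512 and 262144 base pages for levels 1 through 3, which is what KVM_PAGES_PER_HPAGE(x) now yields.
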
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305ae093..c584076a47f4 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_KVM_PARA_H 1#ifndef _ASM_X86_KVM_PARA_H
2#define _ASM_X86_KVM_PARA_H 2#define _ASM_X86_KVM_PARA_H
3 3
4#include <linux/types.h>
5
4/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It 6/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
5 * should be used to determine that a VM is running under KVM. 7 * should be used to determine that a VM is running under KVM.
6 */ 8 */
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 5136dad57cbb..0d97deba1e35 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -90,8 +90,9 @@ static inline void lguest_set_ts(void)
90} 90}
91 91
92/* Full 4G segment descriptors, suitable for CS and DS. */ 92/* Full 4G segment descriptors, suitable for CS and DS. */
93#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } }) 93#define FULL_EXEC_SEGMENT \
94#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } }) 94 ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
95#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
95 96
96#endif /* __ASSEMBLY__ */ 97#endif /* __ASSEMBLY__ */
97 98
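
GDT_ENTRY_INIT() (added in the desc_defs.h hunk earlier) packs (flags, base, limit) into the two 32-bit words of a descriptor, and the lguest.h change above relies on it reproducing the old literal initializers exactly. A standalone check of that equivalence; the packing function simply restates the macro:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct gdt_words { uint32_t a, b; };

/* Same packing as GDT_ENTRY_INIT in the desc_defs.h hunk above. */
static struct gdt_words gdt_entry(uint32_t flags, uint32_t base, uint32_t limit)
{
	struct gdt_words w;

	w.a = (limit & 0xffff) | ((base & 0xffff) << 16);
	w.b = ((base & 0xff0000) >> 16) | ((flags & 0xf0ff) << 8) |
	      (limit & 0xf0000) | (base & 0xff000000);
	return w;
}

int main(void)
{
	/* The full 4G code and data segments from the lguest.h hunk. */
	struct gdt_words exec = gdt_entry(0xc09b, 0, 0xfffff);
	struct gdt_words data = gdt_entry(0xc093, 0, 0xfffff);

	assert(exec.a == 0x0000ffff && exec.b == 0x00cf9b00);
	assert(data.a == 0x0000ffff && data.b == 0x00cf9300);
	puts("GDT_ENTRY_INIT matches the old literal initializers");
	return 0;
}
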
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 5cdd8d100ec9..b608a64c5814 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -9,7 +9,7 @@
9 */ 9 */
10 10
11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */ 11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
12#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ 12#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ 13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
14#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ 14#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
15#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ 15#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
@@ -38,6 +38,14 @@
38#define MCM_ADDR_MEM 3 /* memory address */ 38#define MCM_ADDR_MEM 3 /* memory address */
39#define MCM_ADDR_GENERIC 7 /* generic */ 39#define MCM_ADDR_GENERIC 7 /* generic */
40 40
41#define MCJ_CTX_MASK 3
42#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
43#define MCJ_CTX_RANDOM 0 /* inject context: random */
44#define MCJ_CTX_PROCESS 1 /* inject context: process */
45#define MCJ_CTX_IRQ 2 /* inject context: IRQ */
46#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */
47#define MCJ_EXCEPTION 8 /* raise as exception */
48
41/* Fields are zero when not available */ 49/* Fields are zero when not available */
42struct mce { 50struct mce {
43 __u64 status; 51 __u64 status;
@@ -48,8 +56,8 @@ struct mce {
48 __u64 tsc; /* cpu time stamp counter */ 56 __u64 tsc; /* cpu time stamp counter */
49 __u64 time; /* wall time_t when error was detected */ 57 __u64 time; /* wall time_t when error was detected */
50 __u8 cpuvendor; /* cpu vendor as encoded in system.h */ 58 __u8 cpuvendor; /* cpu vendor as encoded in system.h */
51 __u8 pad1; 59 __u8 inject_flags; /* software inject flags */
52 __u16 pad2; 60 __u16 pad;
53 __u32 cpuid; /* CPUID 1 EAX */ 61 __u32 cpuid; /* CPUID 1 EAX */
54 __u8 cs; /* code segment */ 62 __u8 cs; /* code segment */
55 __u8 bank; /* machine check bank */ 63 __u8 bank; /* machine check bank */
@@ -115,13 +123,6 @@ void mcheck_init(struct cpuinfo_x86 *c);
115static inline void mcheck_init(struct cpuinfo_x86 *c) {} 123static inline void mcheck_init(struct cpuinfo_x86 *c) {}
116#endif 124#endif
117 125
118#ifdef CONFIG_X86_OLD_MCE
119extern int nr_mce_banks;
120void amd_mcheck_init(struct cpuinfo_x86 *c);
121void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
122void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
123#endif
124
125#ifdef CONFIG_X86_ANCIENT_MCE 126#ifdef CONFIG_X86_ANCIENT_MCE
126void intel_p5_mcheck_init(struct cpuinfo_x86 *c); 127void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
127void winchip_mcheck_init(struct cpuinfo_x86 *c); 128void winchip_mcheck_init(struct cpuinfo_x86 *c);
@@ -137,10 +138,11 @@ void mce_log(struct mce *m);
137DECLARE_PER_CPU(struct sys_device, mce_dev); 138DECLARE_PER_CPU(struct sys_device, mce_dev);
138 139
139/* 140/*
140 * To support more than 128 would need to escape the predefined 141 * Maximum banks number.
141 * Linux defined extended banks first. 142 * This is the limit of the current register layout on
143 * Intel CPUs.
142 */ 144 */
143#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) 145#define MAX_NR_BANKS 32
144 146
145#ifdef CONFIG_X86_MCE_INTEL 147#ifdef CONFIG_X86_MCE_INTEL
146extern int mce_cmci_disabled; 148extern int mce_cmci_disabled;
@@ -208,11 +210,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
208 210
209void intel_init_thermal(struct cpuinfo_x86 *c); 211void intel_init_thermal(struct cpuinfo_x86 *c);
210 212
211#ifdef CONFIG_X86_NEW_MCE
212void mce_log_therm_throt_event(__u64 status); 213void mce_log_therm_throt_event(__u64 status);
213#else
214static inline void mce_log_therm_throt_event(__u64 status) {}
215#endif
216 214
217#endif /* __KERNEL__ */ 215#endif /* __KERNEL__ */
218#endif /* _ASM_X86_MCE_H */ 216#endif /* _ASM_X86_MCE_H */
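
The new MCJ_* constants pack an error-injection context into the low two bits of mce.inject_flags plus two independent option bits. A short decoding sketch using exactly the constants added above:

#include <stdio.h>

#define MCJ_CTX_MASK      3
#define MCJ_CTX(flags)    ((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM    0
#define MCJ_CTX_PROCESS   1
#define MCJ_CTX_IRQ       2
#define MCJ_NMI_BROADCAST 4
#define MCJ_EXCEPTION     8

static const char *ctx_name(unsigned int flags)
{
	switch (MCJ_CTX(flags)) {
	case MCJ_CTX_PROCESS: return "process";
	case MCJ_CTX_IRQ:     return "irq";
	default:              return "random";
	}
}

int main(void)
{
	unsigned int flags = MCJ_CTX_IRQ | MCJ_EXCEPTION;

	printf("context=%s nmi_broadcast=%d exception=%d\n",
	       ctx_name(flags),
	       !!(flags & MCJ_NMI_BROADCAST),
	       !!(flags & MCJ_EXCEPTION));
	return 0;
}
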
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 751af2550ed9..593e51d4643f 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -1,20 +1,8 @@
1#ifndef _ASM_X86_MMAN_H 1#ifndef _ASM_X86_MMAN_H
2#define _ASM_X86_MMAN_H 2#define _ASM_X86_MMAN_H
3 3
4#include <asm-generic/mman-common.h>
5
6#define MAP_32BIT 0x40 /* only give out 32bit addresses */ 4#define MAP_32BIT 0x40 /* only give out 32bit addresses */
7 5
8#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ 6#include <asm-generic/mman.h>
9#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
10#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
11#define MAP_LOCKED 0x2000 /* pages are locked */
12#define MAP_NORESERVE 0x4000 /* don't check for reservations */
13#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
14#define MAP_NONBLOCK 0x10000 /* do not block on IO */
15#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
16
17#define MCL_CURRENT 1 /* lock all current mappings */
18#define MCL_FUTURE 2 /* lock all future mappings */
19 7
20#endif /* _ASM_X86_MMAN_H */ 8#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index f923203dc39a..4a2d4e0c18d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
37 37
38 if (likely(prev != next)) { 38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */ 39 /* stop flush ipis for the previous mm */
40 cpu_clear(cpu, prev->cpu_vm_mask); 40 cpumask_clear_cpu(cpu, mm_cpumask(prev));
41#ifdef CONFIG_SMP 41#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next); 43 percpu_write(cpu_tlbstate.active_mm, next);
44#endif 44#endif
45 cpu_set(cpu, next->cpu_vm_mask); 45 cpumask_set_cpu(cpu, mm_cpumask(next));
46 46
47 /* Re-load page tables */ 47 /* Re-load page tables */
48 load_cr3(next->pgd); 48 load_cr3(next->pgd);
@@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
58 percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 58 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
59 BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); 59 BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
60 60
61 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { 61 if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
62 /* We were in lazy tlb mode and leave_mm disabled 62 /* We were in lazy tlb mode and leave_mm disabled
63 * tlb flush IPI delivery. We must reload CR3 63 * tlb flush IPI delivery. We must reload CR3
64 * to make sure to use no freed page tables. 64 * to make sure to use no freed page tables.
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 47d62743c4d5..3e2ce58a31a3 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -1,18 +1,7 @@
1#ifndef _ASM_X86_MODULE_H 1#ifndef _ASM_X86_MODULE_H
2#define _ASM_X86_MODULE_H 2#define _ASM_X86_MODULE_H
3 3
4/* x86_32/64 are simple */ 4#include <asm-generic/module.h>
5struct mod_arch_specific {};
6
7#ifdef CONFIG_X86_32
8# define Elf_Shdr Elf32_Shdr
9# define Elf_Sym Elf32_Sym
10# define Elf_Ehdr Elf32_Ehdr
11#else
12# define Elf_Shdr Elf64_Shdr
13# define Elf_Sym Elf64_Sym
14# define Elf_Ehdr Elf64_Ehdr
15#endif
16 5
17#ifdef CONFIG_X86_64 6#ifdef CONFIG_X86_64
18/* X86_64 does not define MODULE_PROC_FAMILY */ 7/* X86_64 does not define MODULE_PROC_FAMILY */
@@ -28,6 +17,8 @@ struct mod_arch_specific {};
28#define MODULE_PROC_FAMILY "586MMX " 17#define MODULE_PROC_FAMILY "586MMX "
29#elif defined CONFIG_MCORE2 18#elif defined CONFIG_MCORE2
30#define MODULE_PROC_FAMILY "CORE2 " 19#define MODULE_PROC_FAMILY "CORE2 "
20#elif defined CONFIG_MATOM
21#define MODULE_PROC_FAMILY "ATOM "
31#elif defined CONFIG_M686 22#elif defined CONFIG_M686
32#define MODULE_PROC_FAMILY "686 " 23#define MODULE_PROC_FAMILY "686 "
33#elif defined CONFIG_MPENTIUMII 24#elif defined CONFIG_MPENTIUMII
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index e2a1bb6d71ea..79c94500c0bb 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -4,6 +4,7 @@
4#include <linux/init.h> 4#include <linux/init.h>
5 5
6#include <asm/mpspec_def.h> 6#include <asm/mpspec_def.h>
7#include <asm/x86_init.h>
7 8
8extern int apic_version[MAX_APICS]; 9extern int apic_version[MAX_APICS];
9extern int pic_mode; 10extern int pic_mode;
@@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
41 42
42#endif /* CONFIG_X86_64 */ 43#endif /* CONFIG_X86_64 */
43 44
44extern void early_find_smp_config(void);
45extern void early_get_smp_config(void);
46
47#if defined(CONFIG_MCA) || defined(CONFIG_EISA) 45#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
48extern int mp_bus_id_to_type[MAX_MP_BUSSES]; 46extern int mp_bus_id_to_type[MAX_MP_BUSSES];
49#endif 47#endif
@@ -52,20 +50,55 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
52 50
53extern unsigned int boot_cpu_physical_apicid; 51extern unsigned int boot_cpu_physical_apicid;
54extern unsigned int max_physical_apicid; 52extern unsigned int max_physical_apicid;
55extern int smp_found_config;
56extern int mpc_default_type; 53extern int mpc_default_type;
57extern unsigned long mp_lapic_addr; 54extern unsigned long mp_lapic_addr;
58 55
59extern void get_smp_config(void); 56#ifdef CONFIG_X86_LOCAL_APIC
57extern int smp_found_config;
58#else
59# define smp_found_config 0
60#endif
61
62static inline void get_smp_config(void)
63{
64 x86_init.mpparse.get_smp_config(0);
65}
66
67static inline void early_get_smp_config(void)
68{
69 x86_init.mpparse.get_smp_config(1);
70}
71
72static inline void find_smp_config(void)
73{
74 x86_init.mpparse.find_smp_config(1);
75}
76
77static inline void early_find_smp_config(void)
78{
79 x86_init.mpparse.find_smp_config(0);
80}
60 81
61#ifdef CONFIG_X86_MPPARSE 82#ifdef CONFIG_X86_MPPARSE
62extern void find_smp_config(void);
63extern void early_reserve_e820_mpc_new(void); 83extern void early_reserve_e820_mpc_new(void);
64extern int enable_update_mptable; 84extern int enable_update_mptable;
85extern int default_mpc_apic_id(struct mpc_cpu *m);
86extern void default_smp_read_mpc_oem(struct mpc_table *mpc);
87# ifdef CONFIG_X86_IO_APIC
88extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
89# else
90# define default_mpc_oem_bus_info NULL
91# endif
92extern void default_find_smp_config(unsigned int reserve);
93extern void default_get_smp_config(unsigned int early);
65#else 94#else
66static inline void find_smp_config(void) { }
67static inline void early_reserve_e820_mpc_new(void) { } 95static inline void early_reserve_e820_mpc_new(void) { }
68#define enable_update_mptable 0 96#define enable_update_mptable 0
97#define default_mpc_apic_id NULL
98#define default_smp_read_mpc_oem NULL
99#define default_mpc_oem_bus_info NULL
100#define default_find_smp_config x86_init_uint_noop
101#define default_get_smp_config x86_init_uint_noop
69#endif 102#endif
70 103
71void __cpuinit generic_processor_info(int apicid, int version); 104void __cpuinit generic_processor_info(int apicid, int version);
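
get_smp_config() and friends above stop being plain externs and become thin wrappers around function pointers under x86_init.mpparse, so a platform can override MP-table discovery and parsing. A hypothetical, self-contained sketch of that indirection pattern; the ops struct here is invented for illustration and is not the real x86_init layout:

#include <stdio.h>

/* Hypothetical ops table standing in for x86_init.mpparse. */
struct mpparse_ops {
	void (*find_smp_config)(unsigned int reserve);
	void (*get_smp_config)(unsigned int early);
};

static void default_find(unsigned int reserve)
{
	printf("default find_smp_config(reserve=%u)\n", reserve);
}

static void default_get(unsigned int early)
{
	printf("default get_smp_config(early=%u)\n", early);
}

static struct mpparse_ops mpparse = {
	.find_smp_config = default_find,
	.get_smp_config  = default_get,
};

/* Wrappers shaped like the inlines in the mpspec.h hunk above. */
static void get_smp_config(void)       { mpparse.get_smp_config(0); }
static void early_get_smp_config(void) { mpparse.get_smp_config(1); }
static void find_smp_config(void)      { mpparse.find_smp_config(1); }

static void quirky_get(unsigned int early)
{
	printf("platform override, early=%u\n", early);
}

int main(void)
{
	find_smp_config();
	get_smp_config();

	/* A platform hook swaps in its own parser before setup runs. */
	mpparse.get_smp_config = quirky_get;
	early_get_smp_config();
	return 0;
}
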
diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h
index 7e4e9481f51c..809134c644a6 100644
--- a/arch/x86/include/asm/msgbuf.h
+++ b/arch/x86/include/asm/msgbuf.h
@@ -1,39 +1 @@
1#ifndef _ASM_X86_MSGBUF_H #include <asm-generic/msgbuf.h>
2#define _ASM_X86_MSGBUF_H
3
4/*
5 * The msqid64_ds structure for i386 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space on i386 is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 *
13 * Pad space on x8664 is left for:
14 * - 2 miscellaneous 64-bit values
15 */
16struct msqid64_ds {
17 struct ipc64_perm msg_perm;
18 __kernel_time_t msg_stime; /* last msgsnd time */
19#ifdef __i386__
20 unsigned long __unused1;
21#endif
22 __kernel_time_t msg_rtime; /* last msgrcv time */
23#ifdef __i386__
24 unsigned long __unused2;
25#endif
26 __kernel_time_t msg_ctime; /* last change time */
27#ifdef __i386__
28 unsigned long __unused3;
29#endif
30 unsigned long msg_cbytes; /* current number of bytes on queue */
31 unsigned long msg_qnum; /* number of messages in queue */
32 unsigned long msg_qbytes; /* max number of bytes on queue */
33 __kernel_pid_t msg_lspid; /* pid of last msgsnd */
34 __kernel_pid_t msg_lrpid; /* last receive pid */
35 unsigned long __unused4;
36 unsigned long __unused5;
37};
38
39#endif /* _ASM_X86_MSGBUF_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6be7fc254b59..4ffe09b2ad75 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,8 +81,15 @@
81#define MSR_IA32_MC0_ADDR 0x00000402 81#define MSR_IA32_MC0_ADDR 0x00000402
82#define MSR_IA32_MC0_MISC 0x00000403 82#define MSR_IA32_MC0_MISC 0x00000403
83 83
84#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
85#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
86#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
87#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
88
84/* These are consecutive and not in the normal 4er MCE bank block */ 89/* These are consecutive and not in the normal 4er MCE bank block */
85#define MSR_IA32_MC0_CTL2 0x00000280 90#define MSR_IA32_MC0_CTL2 0x00000280
91#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
92
86#define CMCI_EN (1ULL << 30) 93#define CMCI_EN (1ULL << 30)
87#define CMCI_THRESHOLD_MASK 0xffffULL 94#define CMCI_THRESHOLD_MASK 0xffffULL
88 95
@@ -215,6 +222,10 @@
215 222
216#define THERM_STATUS_PROCHOT (1 << 0) 223#define THERM_STATUS_PROCHOT (1 << 0)
217 224
225#define MSR_THERM2_CTL 0x0000019d
226
227#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16)
228
218#define MSR_IA32_MISC_ENABLE 0x000001a0 229#define MSR_IA32_MISC_ENABLE 0x000001a0
219 230
220/* MISC_ENABLE bits: architectural */ 231/* MISC_ENABLE bits: architectural */
@@ -374,6 +385,7 @@
374/* AMD-V MSRs */ 385/* AMD-V MSRs */
375 386
376#define MSR_VM_CR 0xc0010114 387#define MSR_VM_CR 0xc0010114
388#define MSR_VM_IGNNE 0xc0010115
377#define MSR_VM_HSAVE_PA 0xc0010117 389#define MSR_VM_HSAVE_PA 0xc0010117
378 390
379#endif /* _ASM_X86_MSR_INDEX_H */ 391#endif /* _ASM_X86_MSR_INDEX_H */
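
MSR_IA32_MCx_CTL(x) and friends above index the per-bank machine-check MSRs with a stride of 4, while the MCx_CTL2 registers are consecutive. A quick check of that arithmetic for bank 3, using the bank-0 values visible above plus the architectural MC0_CTL base of 0x400:

#include <stdio.h>

#define MSR_IA32_MC0_CTL      0x00000400
#define MSR_IA32_MC0_STATUS   0x00000401
#define MSR_IA32_MC0_ADDR     0x00000402
#define MSR_IA32_MC0_MISC     0x00000403
#define MSR_IA32_MC0_CTL2     0x00000280

/* Same indexing macros as the msr-index.h hunk above. */
#define MSR_IA32_MCx_CTL(x)    (MSR_IA32_MC0_CTL + 4*(x))
#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
#define MSR_IA32_MCx_ADDR(x)   (MSR_IA32_MC0_ADDR + 4*(x))
#define MSR_IA32_MCx_MISC(x)   (MSR_IA32_MC0_MISC + 4*(x))
#define MSR_IA32_MCx_CTL2(x)   (MSR_IA32_MC0_CTL2 + (x))

int main(void)
{
	int bank = 3;

	/* Each bank is 4 consecutive MSRs, so bank 3 starts at 0x40c. */
	printf("bank %d: CTL=%#x STATUS=%#x ADDR=%#x MISC=%#x CTL2=%#x\n",
	       bank,
	       MSR_IA32_MCx_CTL(bank),
	       MSR_IA32_MCx_STATUS(bank),
	       MSR_IA32_MCx_ADDR(bank),
	       MSR_IA32_MCx_MISC(bank),
	       MSR_IA32_MCx_CTL2(bank));
	return 0;
}
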
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 48ad9d29484a..7e2b6ba962ff 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -3,10 +3,16 @@
3 3
4#include <asm/msr-index.h> 4#include <asm/msr-index.h>
5 5
6#ifdef __KERNEL__
7#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
8 7
9#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/ioctl.h>
10
11#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8])
12#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8])
13
14#ifdef __KERNEL__
15
10#include <asm/asm.h> 16#include <asm/asm.h>
11#include <asm/errno.h> 17#include <asm/errno.h>
12#include <asm/cpumask.h> 18#include <asm/cpumask.h>
@@ -67,23 +73,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
67 ".previous\n\t" 73 ".previous\n\t"
68 _ASM_EXTABLE(2b, 3b) 74 _ASM_EXTABLE(2b, 3b)
69 : [err] "=r" (*err), EAX_EDX_RET(val, low, high) 75 : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
70 : "c" (msr), [fault] "i" (-EFAULT)); 76 : "c" (msr), [fault] "i" (-EIO));
71 return EAX_EDX_VAL(val, low, high);
72}
73
74static inline unsigned long long native_read_msr_amd_safe(unsigned int msr,
75 int *err)
76{
77 DECLARE_ARGS(val, low, high);
78
79 asm volatile("2: rdmsr ; xor %0,%0\n"
80 "1:\n\t"
81 ".section .fixup,\"ax\"\n\t"
82 "3: mov %3,%0 ; jmp 1b\n\t"
83 ".previous\n\t"
84 _ASM_EXTABLE(2b, 3b)
85 : "=r" (*err), EAX_EDX_RET(val, low, high)
86 : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT));
87 return EAX_EDX_VAL(val, low, high); 77 return EAX_EDX_VAL(val, low, high);
88} 78}
89 79
@@ -106,13 +96,16 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
106 _ASM_EXTABLE(2b, 3b) 96 _ASM_EXTABLE(2b, 3b)
107 : [err] "=a" (err) 97 : [err] "=a" (err)
108 : "c" (msr), "0" (low), "d" (high), 98 : "c" (msr), "0" (low), "d" (high),
109 [fault] "i" (-EFAULT) 99 [fault] "i" (-EIO)
110 : "memory"); 100 : "memory");
111 return err; 101 return err;
112} 102}
113 103
114extern unsigned long long native_read_tsc(void); 104extern unsigned long long native_read_tsc(void);
115 105
106extern int native_rdmsr_safe_regs(u32 regs[8]);
107extern int native_wrmsr_safe_regs(u32 regs[8]);
108
116static __always_inline unsigned long long __native_read_tsc(void) 109static __always_inline unsigned long long __native_read_tsc(void)
117{ 110{
118 DECLARE_ARGS(val, low, high); 111 DECLARE_ARGS(val, low, high);
@@ -181,14 +174,44 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
181 *p = native_read_msr_safe(msr, &err); 174 *p = native_read_msr_safe(msr, &err);
182 return err; 175 return err;
183} 176}
177
184static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) 178static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
185{ 179{
180 u32 gprs[8] = { 0 };
186 int err; 181 int err;
187 182
188 *p = native_read_msr_amd_safe(msr, &err); 183 gprs[1] = msr;
184 gprs[7] = 0x9c5a203a;
185
186 err = native_rdmsr_safe_regs(gprs);
187
188 *p = gprs[0] | ((u64)gprs[2] << 32);
189
189 return err; 190 return err;
190} 191}
191 192
193static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
194{
195 u32 gprs[8] = { 0 };
196
197 gprs[0] = (u32)val;
198 gprs[1] = msr;
199 gprs[2] = val >> 32;
200 gprs[7] = 0x9c5a203a;
201
202 return native_wrmsr_safe_regs(gprs);
203}
204
205static inline int rdmsr_safe_regs(u32 regs[8])
206{
207 return native_rdmsr_safe_regs(regs);
208}
209
210static inline int wrmsr_safe_regs(u32 regs[8])
211{
212 return native_wrmsr_safe_regs(regs);
213}
214
192#define rdtscl(low) \ 215#define rdtscl(low) \
193 ((low) = (u32)__native_read_tsc()) 216 ((low) = (u32)__native_read_tsc())
194 217
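rdmsrl_amd_safe() and the new wrmsrl_amd_safe() no longer open-code vendor-specific asm; they build a u32[8] register image and hand it to the generic *_safe_regs() helpers. From the indices used here, slot 0 is %eax (low half), slot 1 is %ecx (the MSR number), slot 2 is %edx (high half) and slot 7 is %edi, which carries 0x9c5a203a, the passcode AMD parts expect before exposing these vendor MSRs; the remaining slots are assumed to map onto the other general-purpose registers in order. A hedged sketch of driving the regs interface directly:

/* Illustrative sketch, not from this patch: read an AMD vendor MSR via
 * the register-image interface.  Slot usage follows the wrappers above. */
static int read_amd_vendor_msr(u32 msr, u64 *val)
{
	u32 regs[8] = { 0 };
	int err;

	regs[1] = msr;			/* %ecx: MSR number */
	regs[7] = 0x9c5a203a;		/* %edi: AMD passcode */

	err = rdmsr_safe_regs(regs);
	if (!err)
		*val = regs[0] | ((u64)regs[2] << 32);

	return err;
}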
@@ -228,6 +251,8 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
228void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); 251void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
229int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 252int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
230int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 253int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
254int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
255int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
231#else /* CONFIG_SMP */ 256#else /* CONFIG_SMP */
232static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 257static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
233{ 258{
@@ -258,7 +283,15 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
258{ 283{
259 return wrmsr_safe(msr_no, l, h); 284 return wrmsr_safe(msr_no, l, h);
260} 285}
286static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
287{
288 return rdmsr_safe_regs(regs);
289}
290static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
291{
292 return wrmsr_safe_regs(regs);
293}
261#endif /* CONFIG_SMP */ 294#endif /* CONFIG_SMP */
262#endif /* __ASSEMBLY__ */
263#endif /* __KERNEL__ */ 295#endif /* __KERNEL__ */
296#endif /* __ASSEMBLY__ */
264#endif /* _ASM_X86_MSR_H */ 297#endif /* _ASM_X86_MSR_H */
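The SMP block gains matching cross-CPU entry points, and the UP stubs below them simply collapse to the local rdmsr_safe_regs()/wrmsr_safe_regs(), so callers never need to test CONFIG_SMP themselves. A short sketch of a caller (illustrative only):

/* Sketch: read a register image on a particular CPU.  On UP kernels
 * this is just the local rdmsr_safe_regs() call. */
static int read_msr_regs_on(unsigned int cpu, u32 msr, u64 *val)
{
	u32 regs[8] = { 0 };
	int err;

	regs[1] = msr;			/* %ecx: MSR number */

	err = rdmsr_safe_regs_on_cpu(cpu, regs);
	if (!err)
		*val = regs[0] | ((u64)regs[2] << 32);

	return err;
}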
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index a51ada8467de..4365ffdb461f 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -121,6 +121,9 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
121extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); 121extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
122extern void mtrr_ap_init(void); 122extern void mtrr_ap_init(void);
123extern void mtrr_bp_init(void); 123extern void mtrr_bp_init(void);
124extern void set_mtrr_aps_delayed_init(void);
125extern void mtrr_aps_init(void);
126extern void mtrr_bp_restore(void);
124extern int mtrr_trim_uncached_memory(unsigned long end_pfn); 127extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
125extern int amd_special_default_mtrr(void); 128extern int amd_special_default_mtrr(void);
126# else 129# else
@@ -161,6 +164,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
161 164
162#define mtrr_ap_init() do {} while (0) 165#define mtrr_ap_init() do {} while (0)
163#define mtrr_bp_init() do {} while (0) 166#define mtrr_bp_init() do {} while (0)
167#define set_mtrr_aps_delayed_init() do {} while (0)
168#define mtrr_aps_init() do {} while (0)
169#define mtrr_bp_restore() do {} while (0)
164# endif 170# endif
165 171
166#ifdef CONFIG_COMPAT 172#ifdef CONFIG_COMPAT
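The three new MTRR hooks let the boot CPU batch MTRR programming of the application processors instead of synchronizing with each AP as it comes up: set_mtrr_aps_delayed_init() arms the deferral before the APs are started, mtrr_aps_init() then programs all of them in one pass, and mtrr_bp_restore() restores the boot processor's own MTRRs (used on resume). The do {} while (0) stubs keep callers unconditional when CONFIG_MTRR is off. The intended call order, sketched below (the real callers live in the smpboot and suspend/resume paths; enable_nonboot_cpus() merely stands in for AP bring-up):

/* Illustrative ordering only. */
static void bring_up_secondaries_sketch(void)
{
	set_mtrr_aps_delayed_init();	/* defer per-AP mtrr_ap_init() */

	enable_nonboot_cpus();		/* placeholder for AP bring-up */

	mtrr_aps_init();		/* program all APs in one batch */
}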
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c86e5ed4af51..139d4c1a33a7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -40,13 +40,12 @@ extern unsigned int nmi_watchdog;
40#define NMI_INVALID 3 40#define NMI_INVALID 3
41 41
42struct ctl_table; 42struct ctl_table;
43struct file; 43extern int proc_nmi_enabled(struct ctl_table *, int ,
44extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
45 void __user *, size_t *, loff_t *); 44 void __user *, size_t *, loff_t *);
46extern int unknown_nmi_panic; 45extern int unknown_nmi_panic;
47 46
48void __trigger_all_cpu_backtrace(void); 47void arch_trigger_all_cpu_backtrace(void);
49#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() 48#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
50 49
51static inline void localise_nmi_watchdog(void) 50static inline void localise_nmi_watchdog(void)
52{ 51{
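Two independent cleanups meet in nmi.h: the procfs ->proc_handler callbacks drop their unused struct file * argument (so the struct file forward declaration can go too), and the backtrace hook is renamed to the arch_-prefixed form that generic code now detects through the accompanying #define. For reference, a sysctl table entry wired to the new five-argument prototype looks roughly like this (the entry itself is an illustrative sketch, not part of this diff):

/* Sketch of a ctl_table entry using the new proc_handler signature:
 * int (*)(struct ctl_table *, int, void __user *, size_t *, loff_t *). */
static struct ctl_table nmi_table_sketch[] = {
	{
		.procname	= "nmi_watchdog",
		.data		= &nmi_watchdog,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_nmi_enabled,
	},
	{ }
};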
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index ad2668ee1aa7..6d8723a766cc 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -65,6 +65,8 @@
65 6: osp nopl 0x00(%eax,%eax,1) 65 6: osp nopl 0x00(%eax,%eax,1)
66 7: nopl 0x00000000(%eax) 66 7: nopl 0x00000000(%eax)
67 8: nopl 0x00000000(%eax,%eax,1) 67 8: nopl 0x00000000(%eax,%eax,1)
68 Note: All the above are assumed to be a single instruction.
69 There is kernel code that depends on this.
68*/ 70*/
69#define P6_NOP1 GENERIC_NOP1 71#define P6_NOP1 GENERIC_NOP1
70#define P6_NOP2 ".byte 0x66,0x90\n" 72#define P6_NOP2 ".byte 0x66,0x90\n"
diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h
index 6f0d0422f4ca..965d45427975 100644
--- a/arch/x86/include/asm/param.h
+++ b/arch/x86/include/asm/param.h
@@ -1,22 +1 @@
1#ifndef _ASM_X86_PARAM_H #include <asm-generic/param.h>
2#define _ASM_X86_PARAM_H
3
4#ifdef __KERNEL__
5# define HZ CONFIG_HZ /* Internal kernel timer frequency */
6# define USER_HZ 100 /* some user interfaces are */
7# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */
8#endif
9
10#ifndef HZ
11#define HZ 100
12#endif
13
14#define EXEC_PAGESIZE 4096
15
16#ifndef NOGROUP
17#define NOGROUP (-1)
18#endif
19
20#define MAXHOSTNAMELEN 64 /* max length of hostname */
21
22#endif /* _ASM_X86_PARAM_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 4fb37c8a0832..8aebcc41041d 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -7,689 +7,11 @@
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/asm.h> 8#include <asm/asm.h>
9 9
10/* Bitmask of what can be clobbered: usually at least eax. */ 10#include <asm/paravirt_types.h>
11#define CLBR_NONE 0
12#define CLBR_EAX (1 << 0)
13#define CLBR_ECX (1 << 1)
14#define CLBR_EDX (1 << 2)
15#define CLBR_EDI (1 << 3)
16
17#ifdef CONFIG_X86_32
18/* CLBR_ANY should match all regs platform has. For i386, that's just it */
19#define CLBR_ANY ((1 << 4) - 1)
20
21#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
22#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
23#define CLBR_SCRATCH (0)
24#else
25#define CLBR_RAX CLBR_EAX
26#define CLBR_RCX CLBR_ECX
27#define CLBR_RDX CLBR_EDX
28#define CLBR_RDI CLBR_EDI
29#define CLBR_RSI (1 << 4)
30#define CLBR_R8 (1 << 5)
31#define CLBR_R9 (1 << 6)
32#define CLBR_R10 (1 << 7)
33#define CLBR_R11 (1 << 8)
34
35#define CLBR_ANY ((1 << 9) - 1)
36
37#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
38 CLBR_RCX | CLBR_R8 | CLBR_R9)
39#define CLBR_RET_REG (CLBR_RAX)
40#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
41
42#include <asm/desc_defs.h>
43#endif /* X86_64 */
44
45#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
46 11
47#ifndef __ASSEMBLY__ 12#ifndef __ASSEMBLY__
48#include <linux/types.h> 13#include <linux/types.h>
49#include <linux/cpumask.h> 14#include <linux/cpumask.h>
50#include <asm/kmap_types.h>
51#include <asm/desc_defs.h>
52
53struct page;
54struct thread_struct;
55struct desc_ptr;
56struct tss_struct;
57struct mm_struct;
58struct desc_struct;
59struct task_struct;
60
61/*
62 * Wrapper type for pointers to code which uses the non-standard
63 * calling convention. See PV_CALL_SAVE_REGS_THUNK below.
64 */
65struct paravirt_callee_save {
66 void *func;
67};
68
69/* general info */
70struct pv_info {
71 unsigned int kernel_rpl;
72 int shared_kernel_pmd;
73 int paravirt_enabled;
74 const char *name;
75};
76
77struct pv_init_ops {
78 /*
79 * Patch may replace one of the defined code sequences with
80 * arbitrary code, subject to the same register constraints.
81 * This generally means the code is not free to clobber any
82 * registers other than EAX. The patch function should return
83 * the number of bytes of code generated, as we nop pad the
84 * rest in generic code.
85 */
86 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
87 unsigned long addr, unsigned len);
88
89 /* Basic arch-specific setup */
90 void (*arch_setup)(void);
91 char *(*memory_setup)(void);
92 void (*post_allocator_init)(void);
93
94 /* Print a banner to identify the environment */
95 void (*banner)(void);
96};
97
98
99struct pv_lazy_ops {
100 /* Set deferred update mode, used for batching operations. */
101 void (*enter)(void);
102 void (*leave)(void);
103};
104
105struct pv_time_ops {
106 void (*time_init)(void);
107
108 /* Set and set time of day */
109 unsigned long (*get_wallclock)(void);
110 int (*set_wallclock)(unsigned long);
111
112 unsigned long long (*sched_clock)(void);
113 unsigned long (*get_tsc_khz)(void);
114};
115
116struct pv_cpu_ops {
117 /* hooks for various privileged instructions */
118 unsigned long (*get_debugreg)(int regno);
119 void (*set_debugreg)(int regno, unsigned long value);
120
121 void (*clts)(void);
122
123 unsigned long (*read_cr0)(void);
124 void (*write_cr0)(unsigned long);
125
126 unsigned long (*read_cr4_safe)(void);
127 unsigned long (*read_cr4)(void);
128 void (*write_cr4)(unsigned long);
129
130#ifdef CONFIG_X86_64
131 unsigned long (*read_cr8)(void);
132 void (*write_cr8)(unsigned long);
133#endif
134
135 /* Segment descriptor handling */
136 void (*load_tr_desc)(void);
137 void (*load_gdt)(const struct desc_ptr *);
138 void (*load_idt)(const struct desc_ptr *);
139 void (*store_gdt)(struct desc_ptr *);
140 void (*store_idt)(struct desc_ptr *);
141 void (*set_ldt)(const void *desc, unsigned entries);
142 unsigned long (*store_tr)(void);
143 void (*load_tls)(struct thread_struct *t, unsigned int cpu);
144#ifdef CONFIG_X86_64
145 void (*load_gs_index)(unsigned int idx);
146#endif
147 void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
148 const void *desc);
149 void (*write_gdt_entry)(struct desc_struct *,
150 int entrynum, const void *desc, int size);
151 void (*write_idt_entry)(gate_desc *,
152 int entrynum, const gate_desc *gate);
153 void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
154 void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
155
156 void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
157
158 void (*set_iopl_mask)(unsigned mask);
159
160 void (*wbinvd)(void);
161 void (*io_delay)(void);
162
163 /* cpuid emulation, mostly so that caps bits can be disabled */
164 void (*cpuid)(unsigned int *eax, unsigned int *ebx,
165 unsigned int *ecx, unsigned int *edx);
166
167 /* MSR, PMC and TSR operations.
168 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
169 u64 (*read_msr_amd)(unsigned int msr, int *err);
170 u64 (*read_msr)(unsigned int msr, int *err);
171 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
172
173 u64 (*read_tsc)(void);
174 u64 (*read_pmc)(int counter);
175 unsigned long long (*read_tscp)(unsigned int *aux);
176
177 /*
178 * Atomically enable interrupts and return to userspace. This
179 * is only ever used to return to 32-bit processes; in a
180 * 64-bit kernel, it's used for 32-on-64 compat processes, but
181 * never native 64-bit processes. (Jump, not call.)
182 */
183 void (*irq_enable_sysexit)(void);
184
185 /*
186 * Switch to usermode gs and return to 64-bit usermode using
187 * sysret. Only used in 64-bit kernels to return to 64-bit
188 * processes. Usermode register state, including %rsp, must
189 * already be restored.
190 */
191 void (*usergs_sysret64)(void);
192
193 /*
194 * Switch to usermode gs and return to 32-bit usermode using
195 * sysret. Used to return to 32-on-64 compat processes.
196 * Other usermode register state, including %esp, must already
197 * be restored.
198 */
199 void (*usergs_sysret32)(void);
200
201 /* Normal iret. Jump to this with the standard iret stack
202 frame set up. */
203 void (*iret)(void);
204
205 void (*swapgs)(void);
206
207 void (*start_context_switch)(struct task_struct *prev);
208 void (*end_context_switch)(struct task_struct *next);
209};
210
211struct pv_irq_ops {
212 void (*init_IRQ)(void);
213
214 /*
215 * Get/set interrupt state. save_fl and restore_fl are only
216 * expected to use X86_EFLAGS_IF; all other bits
217 * returned from save_fl are undefined, and may be ignored by
218 * restore_fl.
219 *
220 * NOTE: These functions callers expect the callee to preserve
221 * more registers than the standard C calling convention.
222 */
223 struct paravirt_callee_save save_fl;
224 struct paravirt_callee_save restore_fl;
225 struct paravirt_callee_save irq_disable;
226 struct paravirt_callee_save irq_enable;
227
228 void (*safe_halt)(void);
229 void (*halt)(void);
230
231#ifdef CONFIG_X86_64
232 void (*adjust_exception_frame)(void);
233#endif
234};
235
236struct pv_apic_ops {
237#ifdef CONFIG_X86_LOCAL_APIC
238 void (*setup_boot_clock)(void);
239 void (*setup_secondary_clock)(void);
240
241 void (*startup_ipi_hook)(int phys_apicid,
242 unsigned long start_eip,
243 unsigned long start_esp);
244#endif
245};
246
247struct pv_mmu_ops {
248 /*
249 * Called before/after init_mm pagetable setup. setup_start
250 * may reset %cr3, and may pre-install parts of the pagetable;
251 * pagetable setup is expected to preserve any existing
252 * mapping.
253 */
254 void (*pagetable_setup_start)(pgd_t *pgd_base);
255 void (*pagetable_setup_done)(pgd_t *pgd_base);
256
257 unsigned long (*read_cr2)(void);
258 void (*write_cr2)(unsigned long);
259
260 unsigned long (*read_cr3)(void);
261 void (*write_cr3)(unsigned long);
262
263 /*
264 * Hooks for intercepting the creation/use/destruction of an
265 * mm_struct.
266 */
267 void (*activate_mm)(struct mm_struct *prev,
268 struct mm_struct *next);
269 void (*dup_mmap)(struct mm_struct *oldmm,
270 struct mm_struct *mm);
271 void (*exit_mmap)(struct mm_struct *mm);
272
273
274 /* TLB operations */
275 void (*flush_tlb_user)(void);
276 void (*flush_tlb_kernel)(void);
277 void (*flush_tlb_single)(unsigned long addr);
278 void (*flush_tlb_others)(const struct cpumask *cpus,
279 struct mm_struct *mm,
280 unsigned long va);
281
282 /* Hooks for allocating and freeing a pagetable top-level */
283 int (*pgd_alloc)(struct mm_struct *mm);
284 void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
285
286 /*
287 * Hooks for allocating/releasing pagetable pages when they're
288 * attached to a pagetable
289 */
290 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
291 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
292 void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
293 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
294 void (*release_pte)(unsigned long pfn);
295 void (*release_pmd)(unsigned long pfn);
296 void (*release_pud)(unsigned long pfn);
297
298 /* Pagetable manipulation functions */
299 void (*set_pte)(pte_t *ptep, pte_t pteval);
300 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
301 pte_t *ptep, pte_t pteval);
302 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
303 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
304 pte_t *ptep);
305 void (*pte_update_defer)(struct mm_struct *mm,
306 unsigned long addr, pte_t *ptep);
307
308 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
309 pte_t *ptep);
310 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
311 pte_t *ptep, pte_t pte);
312
313 struct paravirt_callee_save pte_val;
314 struct paravirt_callee_save make_pte;
315
316 struct paravirt_callee_save pgd_val;
317 struct paravirt_callee_save make_pgd;
318
319#if PAGETABLE_LEVELS >= 3
320#ifdef CONFIG_X86_PAE
321 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
322 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
323 pte_t *ptep);
324 void (*pmd_clear)(pmd_t *pmdp);
325
326#endif /* CONFIG_X86_PAE */
327
328 void (*set_pud)(pud_t *pudp, pud_t pudval);
329
330 struct paravirt_callee_save pmd_val;
331 struct paravirt_callee_save make_pmd;
332
333#if PAGETABLE_LEVELS == 4
334 struct paravirt_callee_save pud_val;
335 struct paravirt_callee_save make_pud;
336
337 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
338#endif /* PAGETABLE_LEVELS == 4 */
339#endif /* PAGETABLE_LEVELS >= 3 */
340
341#ifdef CONFIG_HIGHPTE
342 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
343#endif
344
345 struct pv_lazy_ops lazy_mode;
346
347 /* dom0 ops */
348
349 /* Sometimes the physical address is a pfn, and sometimes its
350 an mfn. We can tell which is which from the index. */
351 void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
352 phys_addr_t phys, pgprot_t flags);
353};
354
355struct raw_spinlock;
356struct pv_lock_ops {
357 int (*spin_is_locked)(struct raw_spinlock *lock);
358 int (*spin_is_contended)(struct raw_spinlock *lock);
359 void (*spin_lock)(struct raw_spinlock *lock);
360 void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
361 int (*spin_trylock)(struct raw_spinlock *lock);
362 void (*spin_unlock)(struct raw_spinlock *lock);
363};
364
365/* This contains all the paravirt structures: we get a convenient
366 * number for each function using the offset which we use to indicate
367 * what to patch. */
368struct paravirt_patch_template {
369 struct pv_init_ops pv_init_ops;
370 struct pv_time_ops pv_time_ops;
371 struct pv_cpu_ops pv_cpu_ops;
372 struct pv_irq_ops pv_irq_ops;
373 struct pv_apic_ops pv_apic_ops;
374 struct pv_mmu_ops pv_mmu_ops;
375 struct pv_lock_ops pv_lock_ops;
376};
377
378extern struct pv_info pv_info;
379extern struct pv_init_ops pv_init_ops;
380extern struct pv_time_ops pv_time_ops;
381extern struct pv_cpu_ops pv_cpu_ops;
382extern struct pv_irq_ops pv_irq_ops;
383extern struct pv_apic_ops pv_apic_ops;
384extern struct pv_mmu_ops pv_mmu_ops;
385extern struct pv_lock_ops pv_lock_ops;
386
387#define PARAVIRT_PATCH(x) \
388 (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
389
390#define paravirt_type(op) \
391 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
392 [paravirt_opptr] "i" (&(op))
393#define paravirt_clobber(clobber) \
394 [paravirt_clobber] "i" (clobber)
395
396/*
397 * Generate some code, and mark it as patchable by the
398 * apply_paravirt() alternate instruction patcher.
399 */
400#define _paravirt_alt(insn_string, type, clobber) \
401 "771:\n\t" insn_string "\n" "772:\n" \
402 ".pushsection .parainstructions,\"a\"\n" \
403 _ASM_ALIGN "\n" \
404 _ASM_PTR " 771b\n" \
405 " .byte " type "\n" \
406 " .byte 772b-771b\n" \
407 " .short " clobber "\n" \
408 ".popsection\n"
409
410/* Generate patchable code, with the default asm parameters. */
411#define paravirt_alt(insn_string) \
412 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
413
414/* Simple instruction patching code. */
415#define DEF_NATIVE(ops, name, code) \
416 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
417 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
418
419unsigned paravirt_patch_nop(void);
420unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
421unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
422unsigned paravirt_patch_ignore(unsigned len);
423unsigned paravirt_patch_call(void *insnbuf,
424 const void *target, u16 tgt_clobbers,
425 unsigned long addr, u16 site_clobbers,
426 unsigned len);
427unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
428 unsigned long addr, unsigned len);
429unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
430 unsigned long addr, unsigned len);
431
432unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
433 const char *start, const char *end);
434
435unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
436 unsigned long addr, unsigned len);
437
438int paravirt_disable_iospace(void);
439
440/*
441 * This generates an indirect call based on the operation type number.
442 * The type number, computed in PARAVIRT_PATCH, is derived from the
443 * offset into the paravirt_patch_template structure, and can therefore be
444 * freely converted back into a structure offset.
445 */
446#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
447
448/*
449 * These macros are intended to wrap calls through one of the paravirt
450 * ops structs, so that they can be later identified and patched at
451 * runtime.
452 *
453 * Normally, a call to a pv_op function is a simple indirect call:
454 * (pv_op_struct.operations)(args...).
455 *
456 * Unfortunately, this is a relatively slow operation for modern CPUs,
457 * because it cannot necessarily determine what the destination
458 * address is. In this case, the address is a runtime constant, so at
459 * the very least we can patch the call to e a simple direct call, or
460 * ideally, patch an inline implementation into the callsite. (Direct
461 * calls are essentially free, because the call and return addresses
462 * are completely predictable.)
463 *
464 * For i386, these macros rely on the standard gcc "regparm(3)" calling
465 * convention, in which the first three arguments are placed in %eax,
466 * %edx, %ecx (in that order), and the remaining arguments are placed
467 * on the stack. All caller-save registers (eax,edx,ecx) are expected
468 * to be modified (either clobbered or used for return values).
469 * X86_64, on the other hand, already specifies a register-based calling
470 * conventions, returning at %rax, with parameteres going on %rdi, %rsi,
471 * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
472 * special handling for dealing with 4 arguments, unlike i386.
473 * However, x86_64 also have to clobber all caller saved registers, which
474 * unfortunately, are quite a bit (r8 - r11)
475 *
476 * The call instruction itself is marked by placing its start address
477 * and size into the .parainstructions section, so that
478 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
479 * appropriate patching under the control of the backend pv_init_ops
480 * implementation.
481 *
482 * Unfortunately there's no way to get gcc to generate the args setup
483 * for the call, and then allow the call itself to be generated by an
484 * inline asm. Because of this, we must do the complete arg setup and
485 * return value handling from within these macros. This is fairly
486 * cumbersome.
487 *
488 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
489 * It could be extended to more arguments, but there would be little
490 * to be gained from that. For each number of arguments, there are
491 * the two VCALL and CALL variants for void and non-void functions.
492 *
493 * When there is a return value, the invoker of the macro must specify
494 * the return type. The macro then uses sizeof() on that type to
495 * determine whether its a 32 or 64 bit value, and places the return
496 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
497 * 64-bit). For x86_64 machines, it just returns at %rax regardless of
498 * the return value size.
499 *
500 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
501 * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
502 * in low,high order
503 *
504 * Small structures are passed and returned in registers. The macro
505 * calling convention can't directly deal with this, so the wrapper
506 * functions must do this.
507 *
508 * These PVOP_* macros are only defined within this header. This
509 * means that all uses must be wrapped in inline functions. This also
510 * makes sure the incoming and outgoing types are always correct.
511 */
512#ifdef CONFIG_X86_32
513#define PVOP_VCALL_ARGS \
514 unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
515#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
516
517#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
518#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
519#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
520
521#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
522 "=c" (__ecx)
523#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
524
525#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
526#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
527
528#define EXTRA_CLOBBERS
529#define VEXTRA_CLOBBERS
530#else /* CONFIG_X86_64 */
531#define PVOP_VCALL_ARGS \
532 unsigned long __edi = __edi, __esi = __esi, \
533 __edx = __edx, __ecx = __ecx
534#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
535
536#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
537#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
538#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
539#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
540
541#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
542 "=S" (__esi), "=d" (__edx), \
543 "=c" (__ecx)
544#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
545
546#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
547#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
548
549#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
550#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
551#endif /* CONFIG_X86_32 */
552
553#ifdef CONFIG_PARAVIRT_DEBUG
554#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
555#else
556#define PVOP_TEST_NULL(op) ((void)op)
557#endif
558
559#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
560 pre, post, ...) \
561 ({ \
562 rettype __ret; \
563 PVOP_CALL_ARGS; \
564 PVOP_TEST_NULL(op); \
565 /* This is 32-bit specific, but is okay in 64-bit */ \
566 /* since this condition will never hold */ \
567 if (sizeof(rettype) > sizeof(unsigned long)) { \
568 asm volatile(pre \
569 paravirt_alt(PARAVIRT_CALL) \
570 post \
571 : call_clbr \
572 : paravirt_type(op), \
573 paravirt_clobber(clbr), \
574 ##__VA_ARGS__ \
575 : "memory", "cc" extra_clbr); \
576 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
577 } else { \
578 asm volatile(pre \
579 paravirt_alt(PARAVIRT_CALL) \
580 post \
581 : call_clbr \
582 : paravirt_type(op), \
583 paravirt_clobber(clbr), \
584 ##__VA_ARGS__ \
585 : "memory", "cc" extra_clbr); \
586 __ret = (rettype)__eax; \
587 } \
588 __ret; \
589 })
590
591#define __PVOP_CALL(rettype, op, pre, post, ...) \
592 ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
593 EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
594
595#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
596 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
597 PVOP_CALLEE_CLOBBERS, , \
598 pre, post, ##__VA_ARGS__)
599
600
601#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
602 ({ \
603 PVOP_VCALL_ARGS; \
604 PVOP_TEST_NULL(op); \
605 asm volatile(pre \
606 paravirt_alt(PARAVIRT_CALL) \
607 post \
608 : call_clbr \
609 : paravirt_type(op), \
610 paravirt_clobber(clbr), \
611 ##__VA_ARGS__ \
612 : "memory", "cc" extra_clbr); \
613 })
614
615#define __PVOP_VCALL(op, pre, post, ...) \
616 ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
617 VEXTRA_CLOBBERS, \
618 pre, post, ##__VA_ARGS__)
619
620#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \
621 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
622 PVOP_VCALLEE_CLOBBERS, , \
623 pre, post, ##__VA_ARGS__)
624
625
626
627#define PVOP_CALL0(rettype, op) \
628 __PVOP_CALL(rettype, op, "", "")
629#define PVOP_VCALL0(op) \
630 __PVOP_VCALL(op, "", "")
631
632#define PVOP_CALLEE0(rettype, op) \
633 __PVOP_CALLEESAVE(rettype, op, "", "")
634#define PVOP_VCALLEE0(op) \
635 __PVOP_VCALLEESAVE(op, "", "")
636
637
638#define PVOP_CALL1(rettype, op, arg1) \
639 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
640#define PVOP_VCALL1(op, arg1) \
641 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
642
643#define PVOP_CALLEE1(rettype, op, arg1) \
644 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
645#define PVOP_VCALLEE1(op, arg1) \
646 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
647
648
649#define PVOP_CALL2(rettype, op, arg1, arg2) \
650 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
651 PVOP_CALL_ARG2(arg2))
652#define PVOP_VCALL2(op, arg1, arg2) \
653 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
654 PVOP_CALL_ARG2(arg2))
655
656#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
657 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
658 PVOP_CALL_ARG2(arg2))
659#define PVOP_VCALLEE2(op, arg1, arg2) \
660 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
661 PVOP_CALL_ARG2(arg2))
662
663
664#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
665 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
666 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
667#define PVOP_VCALL3(op, arg1, arg2, arg3) \
668 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
669 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
670
671/* This is the only difference in x86_64. We can make it much simpler */
672#ifdef CONFIG_X86_32
673#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
674 __PVOP_CALL(rettype, op, \
675 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
676 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
677 PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
678#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
679 __PVOP_VCALL(op, \
680 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
681 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
682 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
683#else
684#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
685 __PVOP_CALL(rettype, op, "", "", \
686 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
687 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
688#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
689 __PVOP_VCALL(op, "", "", \
690 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
691 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
692#endif
693 15
694static inline int paravirt_enabled(void) 16static inline int paravirt_enabled(void)
695{ 17{
@@ -702,22 +24,6 @@ static inline void load_sp0(struct tss_struct *tss,
702 PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); 24 PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
703} 25}
704 26
705#define ARCH_SETUP pv_init_ops.arch_setup();
706static inline unsigned long get_wallclock(void)
707{
708 return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
709}
710
711static inline int set_wallclock(unsigned long nowtime)
712{
713 return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
714}
715
716static inline void (*choose_time_init(void))(void)
717{
718 return pv_time_ops.time_init;
719}
720
721/* The paravirtualized CPUID instruction. */ 27/* The paravirtualized CPUID instruction. */
722static inline void __cpuid(unsigned int *eax, unsigned int *ebx, 28static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
723 unsigned int *ecx, unsigned int *edx) 29 unsigned int *ecx, unsigned int *edx)
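The wallclock accessors and the time/arch setup hooks vanish from the paravirt wrappers in the hunk above. Elsewhere in this series they are re-homed in the x86_init/x86_platform ops structures, so callers use plain indirect calls instead of patchable paravirt sites; the sketch below shows the shape of a caller after the move (the x86_platform names come from the companion patches, not from this diff, so treat them as an assumption here):

/* Illustrative only: persistent-clock access through x86_platform
 * rather than pv_time_ops. */
static unsigned long read_persistent_clock_sketch(void)
{
	return x86_platform.get_wallclock();
}

static int update_persistent_clock_sketch(unsigned long now)
{
	return x86_platform.set_wallclock(now);
}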
@@ -820,15 +126,22 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err)
820{ 126{
821 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); 127 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
822} 128}
823static inline u64 paravirt_read_msr_amd(unsigned msr, int *err) 129
130static inline int paravirt_rdmsr_regs(u32 *regs)
824{ 131{
825 return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err); 132 return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs);
826} 133}
134
827static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) 135static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
828{ 136{
829 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); 137 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
830} 138}
831 139
140static inline int paravirt_wrmsr_regs(u32 *regs)
141{
142 return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs);
143}
144
832/* These should all do BUG_ON(_err), but our headers are too tangled. */ 145/* These should all do BUG_ON(_err), but our headers are too tangled. */
833#define rdmsr(msr, val1, val2) \ 146#define rdmsr(msr, val1, val2) \
834do { \ 147do { \
@@ -862,6 +175,9 @@ do { \
862 _err; \ 175 _err; \
863}) 176})
864 177
178#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs)
179#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs)
180
865static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) 181static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
866{ 182{
867 int err; 183 int err;
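With the defines in the hunk above, a CONFIG_PARAVIRT kernel exposes rdmsr_safe_regs()/wrmsr_safe_regs() under the same names and with the same register-image semantics as the native msr.h earlier in this diff, but routed through pv_cpu_ops.rdmsr_regs/wrmsr_regs so a backend can intercept them. A backend with no special MSR handling can simply point the hooks at the native helpers, which is what the default native template does; sketched below (the setup function name is illustrative):

/* Sketch: wiring the new MSR register-image hooks in a pv backend. */
static void example_setup_msr_ops(void)
{
	pv_cpu_ops.rdmsr_regs = native_rdmsr_safe_regs;
	pv_cpu_ops.wrmsr_regs = native_wrmsr_safe_regs;
}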
@@ -871,12 +187,31 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
871} 187}
872static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) 188static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
873{ 189{
190 u32 gprs[8] = { 0 };
874 int err; 191 int err;
875 192
876 *p = paravirt_read_msr_amd(msr, &err); 193 gprs[1] = msr;
194 gprs[7] = 0x9c5a203a;
195
196 err = paravirt_rdmsr_regs(gprs);
197
198 *p = gprs[0] | ((u64)gprs[2] << 32);
199
877 return err; 200 return err;
878} 201}
879 202
203static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
204{
205 u32 gprs[8] = { 0 };
206
207 gprs[0] = (u32)val;
208 gprs[1] = msr;
209 gprs[2] = val >> 32;
210 gprs[7] = 0x9c5a203a;
211
212 return paravirt_wrmsr_regs(gprs);
213}
214
880static inline u64 paravirt_read_tsc(void) 215static inline u64 paravirt_read_tsc(void)
881{ 216{
882 return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); 217 return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
@@ -894,7 +229,6 @@ static inline unsigned long long paravirt_sched_clock(void)
894{ 229{
895 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); 230 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
896} 231}
897#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
898 232
899static inline unsigned long long paravirt_read_pmc(int counter) 233static inline unsigned long long paravirt_read_pmc(int counter)
900{ 234{
@@ -1012,34 +346,6 @@ static inline void slow_down_io(void)
1012#endif 346#endif
1013} 347}
1014 348
1015#ifdef CONFIG_X86_LOCAL_APIC
1016static inline void setup_boot_clock(void)
1017{
1018 PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
1019}
1020
1021static inline void setup_secondary_clock(void)
1022{
1023 PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
1024}
1025#endif
1026
1027static inline void paravirt_post_allocator_init(void)
1028{
1029 if (pv_init_ops.post_allocator_init)
1030 (*pv_init_ops.post_allocator_init)();
1031}
1032
1033static inline void paravirt_pagetable_setup_start(pgd_t *base)
1034{
1035 (*pv_mmu_ops.pagetable_setup_start)(base);
1036}
1037
1038static inline void paravirt_pagetable_setup_done(pgd_t *base)
1039{
1040 (*pv_mmu_ops.pagetable_setup_done)(base);
1041}
1042
1043#ifdef CONFIG_SMP 349#ifdef CONFIG_SMP
1044static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, 350static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
1045 unsigned long start_esp) 351 unsigned long start_esp)
@@ -1393,20 +699,6 @@ static inline void pmd_clear(pmd_t *pmdp)
1393} 699}
1394#endif /* CONFIG_X86_PAE */ 700#endif /* CONFIG_X86_PAE */
1395 701
1396/* Lazy mode for batching updates / context switch */
1397enum paravirt_lazy_mode {
1398 PARAVIRT_LAZY_NONE,
1399 PARAVIRT_LAZY_MMU,
1400 PARAVIRT_LAZY_CPU,
1401};
1402
1403enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1404void paravirt_start_context_switch(struct task_struct *prev);
1405void paravirt_end_context_switch(struct task_struct *next);
1406
1407void paravirt_enter_lazy_mmu(void);
1408void paravirt_leave_lazy_mmu(void);
1409
1410#define __HAVE_ARCH_START_CONTEXT_SWITCH 702#define __HAVE_ARCH_START_CONTEXT_SWITCH
1411static inline void arch_start_context_switch(struct task_struct *prev) 703static inline void arch_start_context_switch(struct task_struct *prev)
1412{ 704{
@@ -1437,12 +729,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
1437 pv_mmu_ops.set_fixmap(idx, phys, flags); 729 pv_mmu_ops.set_fixmap(idx, phys, flags);
1438} 730}
1439 731
1440void _paravirt_nop(void);
1441u32 _paravirt_ident_32(u32);
1442u64 _paravirt_ident_64(u64);
1443
1444#define paravirt_nop ((void *)_paravirt_nop)
1445
1446#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) 732#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
1447 733
1448static inline int __raw_spin_is_locked(struct raw_spinlock *lock) 734static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
@@ -1479,17 +765,6 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
1479 765
1480#endif 766#endif
1481 767
1482/* These all sit in the .parainstructions section to tell us what to patch. */
1483struct paravirt_patch_site {
1484 u8 *instr; /* original instructions */
1485 u8 instrtype; /* type of this instruction */
1486 u8 len; /* length of original instruction */
1487 u16 clobbers; /* what registers you may clobber */
1488};
1489
1490extern struct paravirt_patch_site __parainstructions[],
1491 __parainstructions_end[];
1492
1493#ifdef CONFIG_X86_32 768#ifdef CONFIG_X86_32
1494#define PV_SAVE_REGS "pushl %ecx; pushl %edx;" 769#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
1495#define PV_RESTORE_REGS "popl %edx; popl %ecx;" 770#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
@@ -1628,6 +903,8 @@ static inline unsigned long __raw_local_irq_save(void)
1628#undef PVOP_VCALL4 903#undef PVOP_VCALL4
1629#undef PVOP_CALL4 904#undef PVOP_CALL4
1630 905
906extern void default_banner(void);
907
1631#else /* __ASSEMBLY__ */ 908#else /* __ASSEMBLY__ */
1632 909
1633#define _PVSITE(ptype, clobbers, ops, word, algn) \ 910#define _PVSITE(ptype, clobbers, ops, word, algn) \
@@ -1768,5 +1045,7 @@ static inline unsigned long __raw_local_irq_save(void)
1768#endif /* CONFIG_X86_32 */ 1045#endif /* CONFIG_X86_32 */
1769 1046
1770#endif /* __ASSEMBLY__ */ 1047#endif /* __ASSEMBLY__ */
1771#endif /* CONFIG_PARAVIRT */ 1048#else /* CONFIG_PARAVIRT */
1049# define default_banner x86_init_noop
1050#endif /* !CONFIG_PARAVIRT */
1772#endif /* _ASM_X86_PARAVIRT_H */ 1051#endif /* _ASM_X86_PARAVIRT_H */
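The net effect of the paravirt.h shrinkage is a pure header split: the clobber masks, the pv_*_ops structure definitions and the PVOP_* call machinery move verbatim into the new <asm/paravirt_types.h> below, while paravirt.h keeps only the thin inline wrappers built on top of them (plus the new default_banner(), which falls back to x86_init_noop when CONFIG_PARAVIRT is off). The wrapper pattern left in paravirt.h is no more than this (a sketch modelled on the wbinvd wrapper; the real function is simply named wbinvd()):

/* Sketch of the wrapper style that stays in paravirt.h: an inline that
 * expands to a patchable indirect call via the PVOP_* macros, whose
 * definitions now come from <asm/paravirt_types.h>. */
static inline void wbinvd_sketch(void)
{
	PVOP_VCALL0(pv_cpu_ops.wbinvd);
}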
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
new file mode 100644
index 000000000000..dd0f5b32489d
--- /dev/null
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -0,0 +1,693 @@
1#ifndef _ASM_X86_PARAVIRT_TYPES_H
2#define _ASM_X86_PARAVIRT_TYPES_H
3
4/* Bitmask of what can be clobbered: usually at least eax. */
5#define CLBR_NONE 0
6#define CLBR_EAX (1 << 0)
7#define CLBR_ECX (1 << 1)
8#define CLBR_EDX (1 << 2)
9#define CLBR_EDI (1 << 3)
10
11#ifdef CONFIG_X86_32
12/* CLBR_ANY should match all regs platform has. For i386, that's just it */
13#define CLBR_ANY ((1 << 4) - 1)
14
15#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
16#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
17#define CLBR_SCRATCH (0)
18#else
19#define CLBR_RAX CLBR_EAX
20#define CLBR_RCX CLBR_ECX
21#define CLBR_RDX CLBR_EDX
22#define CLBR_RDI CLBR_EDI
23#define CLBR_RSI (1 << 4)
24#define CLBR_R8 (1 << 5)
25#define CLBR_R9 (1 << 6)
26#define CLBR_R10 (1 << 7)
27#define CLBR_R11 (1 << 8)
28
29#define CLBR_ANY ((1 << 9) - 1)
30
31#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
32 CLBR_RCX | CLBR_R8 | CLBR_R9)
33#define CLBR_RET_REG (CLBR_RAX)
34#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
35
36#endif /* X86_64 */
37
38#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
39
40#ifndef __ASSEMBLY__
41
42#include <asm/desc_defs.h>
43#include <asm/kmap_types.h>
44
45struct page;
46struct thread_struct;
47struct desc_ptr;
48struct tss_struct;
49struct mm_struct;
50struct desc_struct;
51struct task_struct;
52struct cpumask;
53
54/*
55 * Wrapper type for pointers to code which uses the non-standard
56 * calling convention.  See PV_CALLEE_SAVE_REGS_THUNK in <asm/paravirt.h>.
57 */
58struct paravirt_callee_save {
59 void *func;
60};
61
62/* general info */
63struct pv_info {
64 unsigned int kernel_rpl;
65 int shared_kernel_pmd;
66 int paravirt_enabled;
67 const char *name;
68};
69
70struct pv_init_ops {
71 /*
72 * Patch may replace one of the defined code sequences with
73 * arbitrary code, subject to the same register constraints.
74 * This generally means the code is not free to clobber any
75 * registers other than EAX. The patch function should return
76 * the number of bytes of code generated, as we nop pad the
77 * rest in generic code.
78 */
79 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
80 unsigned long addr, unsigned len);
81};
82
83
84struct pv_lazy_ops {
85 /* Set deferred update mode, used for batching operations. */
86 void (*enter)(void);
87 void (*leave)(void);
88};
89
90struct pv_time_ops {
91 unsigned long long (*sched_clock)(void);
92 unsigned long (*get_tsc_khz)(void);
93};
94
95struct pv_cpu_ops {
96 /* hooks for various privileged instructions */
97 unsigned long (*get_debugreg)(int regno);
98 void (*set_debugreg)(int regno, unsigned long value);
99
100 void (*clts)(void);
101
102 unsigned long (*read_cr0)(void);
103 void (*write_cr0)(unsigned long);
104
105 unsigned long (*read_cr4_safe)(void);
106 unsigned long (*read_cr4)(void);
107 void (*write_cr4)(unsigned long);
108
109#ifdef CONFIG_X86_64
110 unsigned long (*read_cr8)(void);
111 void (*write_cr8)(unsigned long);
112#endif
113
114 /* Segment descriptor handling */
115 void (*load_tr_desc)(void);
116 void (*load_gdt)(const struct desc_ptr *);
117 void (*load_idt)(const struct desc_ptr *);
118 void (*store_gdt)(struct desc_ptr *);
119 void (*store_idt)(struct desc_ptr *);
120 void (*set_ldt)(const void *desc, unsigned entries);
121 unsigned long (*store_tr)(void);
122 void (*load_tls)(struct thread_struct *t, unsigned int cpu);
123#ifdef CONFIG_X86_64
124 void (*load_gs_index)(unsigned int idx);
125#endif
126 void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
127 const void *desc);
128 void (*write_gdt_entry)(struct desc_struct *,
129 int entrynum, const void *desc, int size);
130 void (*write_idt_entry)(gate_desc *,
131 int entrynum, const gate_desc *gate);
132 void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
133 void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
134
135 void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
136
137 void (*set_iopl_mask)(unsigned mask);
138
139 void (*wbinvd)(void);
140 void (*io_delay)(void);
141
142 /* cpuid emulation, mostly so that caps bits can be disabled */
143 void (*cpuid)(unsigned int *eax, unsigned int *ebx,
144 unsigned int *ecx, unsigned int *edx);
145
146	/* MSR, PMC and TSC operations.
147	   err = 0/-EIO.  wrmsr returns 0/-EIO. */
148 u64 (*read_msr)(unsigned int msr, int *err);
149 int (*rdmsr_regs)(u32 *regs);
150 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
151 int (*wrmsr_regs)(u32 *regs);
152
153 u64 (*read_tsc)(void);
154 u64 (*read_pmc)(int counter);
155 unsigned long long (*read_tscp)(unsigned int *aux);
156
157 /*
158 * Atomically enable interrupts and return to userspace. This
159 * is only ever used to return to 32-bit processes; in a
160 * 64-bit kernel, it's used for 32-on-64 compat processes, but
161 * never native 64-bit processes. (Jump, not call.)
162 */
163 void (*irq_enable_sysexit)(void);
164
165 /*
166 * Switch to usermode gs and return to 64-bit usermode using
167 * sysret. Only used in 64-bit kernels to return to 64-bit
168 * processes. Usermode register state, including %rsp, must
169 * already be restored.
170 */
171 void (*usergs_sysret64)(void);
172
173 /*
174 * Switch to usermode gs and return to 32-bit usermode using
175 * sysret. Used to return to 32-on-64 compat processes.
176 * Other usermode register state, including %esp, must already
177 * be restored.
178 */
179 void (*usergs_sysret32)(void);
180
181 /* Normal iret. Jump to this with the standard iret stack
182 frame set up. */
183 void (*iret)(void);
184
185 void (*swapgs)(void);
186
187 void (*start_context_switch)(struct task_struct *prev);
188 void (*end_context_switch)(struct task_struct *next);
189};
190
191struct pv_irq_ops {
192 /*
193 * Get/set interrupt state. save_fl and restore_fl are only
194 * expected to use X86_EFLAGS_IF; all other bits
195 * returned from save_fl are undefined, and may be ignored by
196 * restore_fl.
197 *
198	 * NOTE: Callers of these functions expect the callee to preserve
199	 * more registers than the standard C calling convention.
200 */
201 struct paravirt_callee_save save_fl;
202 struct paravirt_callee_save restore_fl;
203 struct paravirt_callee_save irq_disable;
204 struct paravirt_callee_save irq_enable;
205
206 void (*safe_halt)(void);
207 void (*halt)(void);
208
209#ifdef CONFIG_X86_64
210 void (*adjust_exception_frame)(void);
211#endif
212};
213
214struct pv_apic_ops {
215#ifdef CONFIG_X86_LOCAL_APIC
216 void (*startup_ipi_hook)(int phys_apicid,
217 unsigned long start_eip,
218 unsigned long start_esp);
219#endif
220};
221
222struct pv_mmu_ops {
223 unsigned long (*read_cr2)(void);
224 void (*write_cr2)(unsigned long);
225
226 unsigned long (*read_cr3)(void);
227 void (*write_cr3)(unsigned long);
228
229 /*
230 * Hooks for intercepting the creation/use/destruction of an
231 * mm_struct.
232 */
233 void (*activate_mm)(struct mm_struct *prev,
234 struct mm_struct *next);
235 void (*dup_mmap)(struct mm_struct *oldmm,
236 struct mm_struct *mm);
237 void (*exit_mmap)(struct mm_struct *mm);
238
239
240 /* TLB operations */
241 void (*flush_tlb_user)(void);
242 void (*flush_tlb_kernel)(void);
243 void (*flush_tlb_single)(unsigned long addr);
244 void (*flush_tlb_others)(const struct cpumask *cpus,
245 struct mm_struct *mm,
246 unsigned long va);
247
248 /* Hooks for allocating and freeing a pagetable top-level */
249 int (*pgd_alloc)(struct mm_struct *mm);
250 void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
251
252 /*
253 * Hooks for allocating/releasing pagetable pages when they're
254 * attached to a pagetable
255 */
256 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
257 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
258 void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
259 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
260 void (*release_pte)(unsigned long pfn);
261 void (*release_pmd)(unsigned long pfn);
262 void (*release_pud)(unsigned long pfn);
263
264 /* Pagetable manipulation functions */
265 void (*set_pte)(pte_t *ptep, pte_t pteval);
266 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
267 pte_t *ptep, pte_t pteval);
268 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
269 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
270 pte_t *ptep);
271 void (*pte_update_defer)(struct mm_struct *mm,
272 unsigned long addr, pte_t *ptep);
273
274 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
275 pte_t *ptep);
276 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
277 pte_t *ptep, pte_t pte);
278
279 struct paravirt_callee_save pte_val;
280 struct paravirt_callee_save make_pte;
281
282 struct paravirt_callee_save pgd_val;
283 struct paravirt_callee_save make_pgd;
284
285#if PAGETABLE_LEVELS >= 3
286#ifdef CONFIG_X86_PAE
287 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
288 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
289 pte_t *ptep);
290 void (*pmd_clear)(pmd_t *pmdp);
291
292#endif /* CONFIG_X86_PAE */
293
294 void (*set_pud)(pud_t *pudp, pud_t pudval);
295
296 struct paravirt_callee_save pmd_val;
297 struct paravirt_callee_save make_pmd;
298
299#if PAGETABLE_LEVELS == 4
300 struct paravirt_callee_save pud_val;
301 struct paravirt_callee_save make_pud;
302
303 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
304#endif /* PAGETABLE_LEVELS == 4 */
305#endif /* PAGETABLE_LEVELS >= 3 */
306
307#ifdef CONFIG_HIGHPTE
308 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
309#endif
310
311 struct pv_lazy_ops lazy_mode;
312
313 /* dom0 ops */
314
315	/* Sometimes the physical address is a pfn, and sometimes it's
316	   an mfn.  We can tell which is which from the index. */
317 void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
318 phys_addr_t phys, pgprot_t flags);
319};
320
321struct raw_spinlock;
322struct pv_lock_ops {
323 int (*spin_is_locked)(struct raw_spinlock *lock);
324 int (*spin_is_contended)(struct raw_spinlock *lock);
325 void (*spin_lock)(struct raw_spinlock *lock);
326 void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
327 int (*spin_trylock)(struct raw_spinlock *lock);
328 void (*spin_unlock)(struct raw_spinlock *lock);
329};
330
331/* This contains all the paravirt structures: we get a convenient
332 * number for each function using the offset which we use to indicate
333 * what to patch. */
334struct paravirt_patch_template {
335 struct pv_init_ops pv_init_ops;
336 struct pv_time_ops pv_time_ops;
337 struct pv_cpu_ops pv_cpu_ops;
338 struct pv_irq_ops pv_irq_ops;
339 struct pv_apic_ops pv_apic_ops;
340 struct pv_mmu_ops pv_mmu_ops;
341 struct pv_lock_ops pv_lock_ops;
342};
343
344extern struct pv_info pv_info;
345extern struct pv_init_ops pv_init_ops;
346extern struct pv_time_ops pv_time_ops;
347extern struct pv_cpu_ops pv_cpu_ops;
348extern struct pv_irq_ops pv_irq_ops;
349extern struct pv_apic_ops pv_apic_ops;
350extern struct pv_mmu_ops pv_mmu_ops;
351extern struct pv_lock_ops pv_lock_ops;
352
353#define PARAVIRT_PATCH(x) \
354 (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
355
356#define paravirt_type(op) \
357 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
358 [paravirt_opptr] "i" (&(op))
359#define paravirt_clobber(clobber) \
360 [paravirt_clobber] "i" (clobber)
361
362/*
363 * Generate some code, and mark it as patchable by the
364 * apply_paravirt() alternate instruction patcher.
365 */
366#define _paravirt_alt(insn_string, type, clobber) \
367 "771:\n\t" insn_string "\n" "772:\n" \
368 ".pushsection .parainstructions,\"a\"\n" \
369 _ASM_ALIGN "\n" \
370 _ASM_PTR " 771b\n" \
371 " .byte " type "\n" \
372 " .byte 772b-771b\n" \
373 " .short " clobber "\n" \
374 ".popsection\n"
375
376/* Generate patchable code, with the default asm parameters. */
377#define paravirt_alt(insn_string) \
378 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
379
380/* Simple instruction patching code. */
381#define DEF_NATIVE(ops, name, code) \
382 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
383 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
384
385unsigned paravirt_patch_nop(void);
386unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
387unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
388unsigned paravirt_patch_ignore(unsigned len);
389unsigned paravirt_patch_call(void *insnbuf,
390 const void *target, u16 tgt_clobbers,
391 unsigned long addr, u16 site_clobbers,
392 unsigned len);
393unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
394 unsigned long addr, unsigned len);
395unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
396 unsigned long addr, unsigned len);
397
398unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
399 const char *start, const char *end);
400
401unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
402 unsigned long addr, unsigned len);
403
404int paravirt_disable_iospace(void);
405
406/*
407 * This generates an indirect call based on the operation type number.
408 * The type number, computed in PARAVIRT_PATCH, is derived from the
409 * offset into the paravirt_patch_template structure, and can therefore be
410 * freely converted back into a structure offset.
411 */
412#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
413
414/*
415 * These macros are intended to wrap calls through one of the paravirt
416 * ops structs, so that they can be later identified and patched at
417 * runtime.
418 *
419 * Normally, a call to a pv_op function is a simple indirect call:
420 * (pv_op_struct.operations)(args...).
421 *
422 * Unfortunately, this is a relatively slow operation for modern CPUs,
423 * because it cannot necessarily determine what the destination
424 * address is. In this case, the address is a runtime constant, so at
425 * the very least we can patch the call to be a simple direct call, or
426 * ideally, patch an inline implementation into the callsite. (Direct
427 * calls are essentially free, because the call and return addresses
428 * are completely predictable.)
429 *
430 * For i386, these macros rely on the standard gcc "regparm(3)" calling
431 * convention, in which the first three arguments are placed in %eax,
432 * %edx, %ecx (in that order), and the remaining arguments are placed
433 * on the stack. All caller-save registers (eax,edx,ecx) are expected
434 * to be modified (either clobbered or used for return values).
435 * X86_64, on the other hand, already specifies a register-based calling
436 * convention, returning in %rax, with parameters going in %rdi, %rsi,
437 * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
438 * special handling for dealing with 4 arguments, unlike i386.
439 * However, x86_64 also has to clobber all caller-saved registers, which
440 * unfortunately is quite a few of them (r8 - r11).
441 *
442 * The call instruction itself is marked by placing its start address
443 * and size into the .parainstructions section, so that
444 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
445 * appropriate patching under the control of the backend pv_init_ops
446 * implementation.
447 *
448 * Unfortunately there's no way to get gcc to generate the args setup
449 * for the call, and then allow the call itself to be generated by an
450 * inline asm. Because of this, we must do the complete arg setup and
451 * return value handling from within these macros. This is fairly
452 * cumbersome.
453 *
454 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
455 * It could be extended to more arguments, but there would be little
456 * to be gained from that. For each number of arguments, there are
457 * the two VCALL and CALL variants for void and non-void functions.
458 *
459 * When there is a return value, the invoker of the macro must specify
460 * the return type. The macro then uses sizeof() on that type to
461 * determine whether it's a 32- or 64-bit value, and places the return
462 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
463 * 64-bit). For x86_64 machines, it just returns at %rax regardless of
464 * the return value size.
465 *
466 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments;
467 * i386 likewise passes 64-bit arguments as two adjacent 32-bit values,
468 * in low,high order.
469 *
470 * Small structures are passed and returned in registers. The macro
471 * calling convention can't directly deal with this, so the wrapper
472 * functions must do this.
473 *
474 * These PVOP_* macros are only defined within this header. This
475 * means that all uses must be wrapped in inline functions. This also
476 * makes sure the incoming and outgoing types are always correct.
477 */
478#ifdef CONFIG_X86_32
479#define PVOP_VCALL_ARGS \
480 unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
481#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
482
483#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
484#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
485#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
486
487#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
488 "=c" (__ecx)
489#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
490
491#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
492#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
493
494#define EXTRA_CLOBBERS
495#define VEXTRA_CLOBBERS
496#else /* CONFIG_X86_64 */
497#define PVOP_VCALL_ARGS \
498 unsigned long __edi = __edi, __esi = __esi, \
499 __edx = __edx, __ecx = __ecx
500#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
501
502#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
503#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
504#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
505#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
506
507#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
508 "=S" (__esi), "=d" (__edx), \
509 "=c" (__ecx)
510#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
511
512#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
513#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
514
515#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
516#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
517#endif /* CONFIG_X86_32 */
518
519#ifdef CONFIG_PARAVIRT_DEBUG
520#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
521#else
522#define PVOP_TEST_NULL(op) ((void)op)
523#endif
524
525#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
526 pre, post, ...) \
527 ({ \
528 rettype __ret; \
529 PVOP_CALL_ARGS; \
530 PVOP_TEST_NULL(op); \
531 /* This is 32-bit specific, but is okay in 64-bit */ \
532 /* since this condition will never hold */ \
533 if (sizeof(rettype) > sizeof(unsigned long)) { \
534 asm volatile(pre \
535 paravirt_alt(PARAVIRT_CALL) \
536 post \
537 : call_clbr \
538 : paravirt_type(op), \
539 paravirt_clobber(clbr), \
540 ##__VA_ARGS__ \
541 : "memory", "cc" extra_clbr); \
542 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
543 } else { \
544 asm volatile(pre \
545 paravirt_alt(PARAVIRT_CALL) \
546 post \
547 : call_clbr \
548 : paravirt_type(op), \
549 paravirt_clobber(clbr), \
550 ##__VA_ARGS__ \
551 : "memory", "cc" extra_clbr); \
552 __ret = (rettype)__eax; \
553 } \
554 __ret; \
555 })
556
557#define __PVOP_CALL(rettype, op, pre, post, ...) \
558 ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
559 EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
560
561#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
562 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
563 PVOP_CALLEE_CLOBBERS, , \
564 pre, post, ##__VA_ARGS__)
565
566
567#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
568 ({ \
569 PVOP_VCALL_ARGS; \
570 PVOP_TEST_NULL(op); \
571 asm volatile(pre \
572 paravirt_alt(PARAVIRT_CALL) \
573 post \
574 : call_clbr \
575 : paravirt_type(op), \
576 paravirt_clobber(clbr), \
577 ##__VA_ARGS__ \
578 : "memory", "cc" extra_clbr); \
579 })
580
581#define __PVOP_VCALL(op, pre, post, ...) \
582 ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
583 VEXTRA_CLOBBERS, \
584 pre, post, ##__VA_ARGS__)
585
586#define __PVOP_VCALLEESAVE(op, pre, post, ...)			\
587	____PVOP_VCALL(op.func, CLBR_RET_REG,			\
588 PVOP_VCALLEE_CLOBBERS, , \
589 pre, post, ##__VA_ARGS__)
590
591
592
593#define PVOP_CALL0(rettype, op) \
594 __PVOP_CALL(rettype, op, "", "")
595#define PVOP_VCALL0(op) \
596 __PVOP_VCALL(op, "", "")
597
598#define PVOP_CALLEE0(rettype, op) \
599 __PVOP_CALLEESAVE(rettype, op, "", "")
600#define PVOP_VCALLEE0(op) \
601 __PVOP_VCALLEESAVE(op, "", "")
602
603
604#define PVOP_CALL1(rettype, op, arg1) \
605 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
606#define PVOP_VCALL1(op, arg1) \
607 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
608
609#define PVOP_CALLEE1(rettype, op, arg1) \
610 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
611#define PVOP_VCALLEE1(op, arg1) \
612 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
613
614
615#define PVOP_CALL2(rettype, op, arg1, arg2) \
616 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
617 PVOP_CALL_ARG2(arg2))
618#define PVOP_VCALL2(op, arg1, arg2) \
619 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
620 PVOP_CALL_ARG2(arg2))
621
622#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
623 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
624 PVOP_CALL_ARG2(arg2))
625#define PVOP_VCALLEE2(op, arg1, arg2) \
626 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
627 PVOP_CALL_ARG2(arg2))
628
629
630#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
631 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
632 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
633#define PVOP_VCALL3(op, arg1, arg2, arg3) \
634 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
635 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
636
637/* The 4-argument case is the only one that differs on x86_64, where the extra register argument makes it much simpler. */
638#ifdef CONFIG_X86_32
639#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
640 __PVOP_CALL(rettype, op, \
641 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
642 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
643 PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
644#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
645 __PVOP_VCALL(op, \
646 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
647 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
648 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
649#else
650#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
651 __PVOP_CALL(rettype, op, "", "", \
652 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
653 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
654#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
655 __PVOP_VCALL(op, "", "", \
656 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
657 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
658#endif
659
660/* Lazy mode for batching updates / context switch */
661enum paravirt_lazy_mode {
662 PARAVIRT_LAZY_NONE,
663 PARAVIRT_LAZY_MMU,
664 PARAVIRT_LAZY_CPU,
665};
666
667enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
668void paravirt_start_context_switch(struct task_struct *prev);
669void paravirt_end_context_switch(struct task_struct *next);
670
671void paravirt_enter_lazy_mmu(void);
672void paravirt_leave_lazy_mmu(void);
673
674void _paravirt_nop(void);
675u32 _paravirt_ident_32(u32);
676u64 _paravirt_ident_64(u64);
677
678#define paravirt_nop ((void *)_paravirt_nop)
679
680/* These all sit in the .parainstructions section to tell us what to patch. */
681struct paravirt_patch_site {
682 u8 *instr; /* original instructions */
683 u8 instrtype; /* type of this instruction */
684 u8 len; /* length of original instruction */
685 u16 clobbers; /* what registers you may clobber */
686};
687
688extern struct paravirt_patch_site __parainstructions[],
689 __parainstructions_end[];
690
691#endif /* __ASSEMBLY__ */
692
693#endif /* _ASM_X86_PARAVIRT_TYPES_H */
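
The calling-convention comment near the top of this new header describes a lot of machinery, but the shape of the thing is simple: an indirect call through a structure of function pointers, wrapped in a type-checked inline function, with the call site recorded for later patching. The sketch below is a plain user-space model of just the wrapper pattern and of the %edx:%eax reassembly that ____PVOP_CALL performs for 64-bit returns on i386. The op name, structure and values are made up for illustration, and none of the patching or register-constraint handling is modeled.

/* pvop_model.c - standalone user-space sketch, not kernel code. */
#include <stdio.h>
#include <stdint.h>

/* Stand-in for one pv_*_ops structure: just a table of function pointers. */
struct pv_demo_ops {
	uint64_t (*read_counter)(void);	/* hypothetical op, for illustration only */
};

static uint64_t native_read_counter(void)
{
	/* Pretend the native implementation hands back a 64-bit value split
	 * the way a 32-bit pvop would see it in %edx:%eax. */
	uint32_t eax = 0x89abcdefu, edx = 0x01234567u;

	return ((uint64_t)edx << 32) | eax;	/* same reassembly ____PVOP_CALL does */
}

static struct pv_demo_ops pv_demo_ops = {
	.read_counter = native_read_counter,
};

/* The header insists that every PVOP_* use sits inside an inline wrapper so
 * argument and return types stay checked; in the kernel this body would be
 * something like "return PVOP_CALL0(u64, pv_demo_ops.read_counter);". */
static inline uint64_t read_counter(void)
{
	return pv_demo_ops.read_counter();
}

int main(void)
{
	printf("counter = %#llx\n", (unsigned long long)read_counter());
	return 0;
}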
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 7af14e512f97..e2c1668dde7a 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end);
19extern int kernel_map_sync_memtype(u64 base, unsigned long size, 19extern int kernel_map_sync_memtype(u64 base, unsigned long size,
20 unsigned long flag); 20 unsigned long flag);
21 21
22int io_reserve_memtype(resource_size_t start, resource_size_t end,
23 unsigned long *type);
24
25void io_free_memtype(resource_size_t start, resource_size_t end);
26
22#endif /* _ASM_X86_PAT_H */ 27#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 1ff685ca221c..ada8c201d513 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void);
48#else 48#else
49#define pcibios_assign_all_busses() 0 49#define pcibios_assign_all_busses() 0
50#endif 50#endif
51#define pcibios_scan_all_fns(a, b) 0
52 51
53extern unsigned long pci_mem_start; 52extern unsigned long pci_mem_start;
54#define PCIBIOS_MIN_IO 0x1000 53#define PCIBIOS_MIN_IO 0x1000
@@ -144,7 +143,11 @@ static inline int __pcibus_to_node(const struct pci_bus *bus)
144static inline const struct cpumask * 143static inline const struct cpumask *
145cpumask_of_pcibus(const struct pci_bus *bus) 144cpumask_of_pcibus(const struct pci_bus *bus)
146{ 145{
147 return cpumask_of_node(__pcibus_to_node(bus)); 146 int node;
147
148 node = __pcibus_to_node(bus);
149 return (node == -1) ? cpu_online_mask :
150 cpumask_of_node(node);
148} 151}
149#endif 152#endif
150 153
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 103f1ddb0d85..b65a36defeb7 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -49,7 +49,7 @@
49#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 49#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
50#define __my_cpu_offset percpu_read(this_cpu_off) 50#define __my_cpu_offset percpu_read(this_cpu_off)
51#else 51#else
52#define __percpu_arg(x) "%" #x 52#define __percpu_arg(x) "%P" #x
53#endif 53#endif
54 54
55/* 55/*
@@ -104,36 +104,48 @@ do { \
104 } \ 104 } \
105} while (0) 105} while (0)
106 106
107#define percpu_from_op(op, var) \ 107#define percpu_from_op(op, var, constraint) \
108({ \ 108({ \
109 typeof(var) ret__; \ 109 typeof(var) ret__; \
110 switch (sizeof(var)) { \ 110 switch (sizeof(var)) { \
111 case 1: \ 111 case 1: \
112 asm(op "b "__percpu_arg(1)",%0" \ 112 asm(op "b "__percpu_arg(1)",%0" \
113 : "=q" (ret__) \ 113 : "=q" (ret__) \
114 : "m" (var)); \ 114 : constraint); \
115 break; \ 115 break; \
116 case 2: \ 116 case 2: \
117 asm(op "w "__percpu_arg(1)",%0" \ 117 asm(op "w "__percpu_arg(1)",%0" \
118 : "=r" (ret__) \ 118 : "=r" (ret__) \
119 : "m" (var)); \ 119 : constraint); \
120 break; \ 120 break; \
121 case 4: \ 121 case 4: \
122 asm(op "l "__percpu_arg(1)",%0" \ 122 asm(op "l "__percpu_arg(1)",%0" \
123 : "=r" (ret__) \ 123 : "=r" (ret__) \
124 : "m" (var)); \ 124 : constraint); \
125 break; \ 125 break; \
126 case 8: \ 126 case 8: \
127 asm(op "q "__percpu_arg(1)",%0" \ 127 asm(op "q "__percpu_arg(1)",%0" \
128 : "=r" (ret__) \ 128 : "=r" (ret__) \
129 : "m" (var)); \ 129 : constraint); \
130 break; \ 130 break; \
131 default: __bad_percpu_size(); \ 131 default: __bad_percpu_size(); \
132 } \ 132 } \
133 ret__; \ 133 ret__; \
134}) 134})
135 135
136#define percpu_read(var) percpu_from_op("mov", per_cpu__##var) 136/*
137 * percpu_read() makes gcc load the percpu variable every time it is
138 * accessed while percpu_read_stable() allows the value to be cached.
139 * percpu_read_stable() is more efficient and can be used if its value
140 * is guaranteed to be valid across cpus. The current users include
141 * get_current() and get_thread_info() both of which are actually
142 * per-thread variables implemented as per-cpu variables and thus
143 * stable for the duration of the respective task.
144 */
145#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \
146 "m" (per_cpu__##var))
147#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \
148 "p" (&per_cpu__##var))
137#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) 149#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
138#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) 150#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
139#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) 151#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
@@ -156,15 +168,6 @@ do { \
156/* We can use this directly for local CPU (faster). */ 168/* We can use this directly for local CPU (faster). */
157DECLARE_PER_CPU(unsigned long, this_cpu_off); 169DECLARE_PER_CPU(unsigned long, this_cpu_off);
158 170
159#ifdef CONFIG_NEED_MULTIPLE_NODES
160void *pcpu_lpage_remapped(void *kaddr);
161#else
162static inline void *pcpu_lpage_remapped(void *kaddr)
163{
164 return NULL;
165}
166#endif
167
168#endif /* !__ASSEMBLY__ */ 171#endif /* !__ASSEMBLY__ */
169 172
170#ifdef CONFIG_SMP 173#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_event.h
index fa64e401589d..ad7ce3fd5065 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -1,8 +1,8 @@
1#ifndef _ASM_X86_PERF_COUNTER_H 1#ifndef _ASM_X86_PERF_EVENT_H
2#define _ASM_X86_PERF_COUNTER_H 2#define _ASM_X86_PERF_EVENT_H
3 3
4/* 4/*
5 * Performance counter hw details: 5 * Performance event hw details:
6 */ 6 */
7 7
8#define X86_PMC_MAX_GENERIC 8 8#define X86_PMC_MAX_GENERIC 8
@@ -43,7 +43,7 @@
43union cpuid10_eax { 43union cpuid10_eax {
44 struct { 44 struct {
45 unsigned int version_id:8; 45 unsigned int version_id:8;
46 unsigned int num_counters:8; 46 unsigned int num_events:8;
47 unsigned int bit_width:8; 47 unsigned int bit_width:8;
48 unsigned int mask_length:8; 48 unsigned int mask_length:8;
49 } split; 49 } split;
@@ -52,7 +52,7 @@ union cpuid10_eax {
52 52
53union cpuid10_edx { 53union cpuid10_edx {
54 struct { 54 struct {
55 unsigned int num_counters_fixed:4; 55 unsigned int num_events_fixed:4;
56 unsigned int reserved:28; 56 unsigned int reserved:28;
57 } split; 57 } split;
58 unsigned int full; 58 unsigned int full;
@@ -60,7 +60,7 @@ union cpuid10_edx {
60 60
61 61
62/* 62/*
63 * Fixed-purpose performance counters: 63 * Fixed-purpose performance events:
64 */ 64 */
65 65
66/* 66/*
@@ -84,15 +84,25 @@ union cpuid10_edx {
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) 85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86 86
87#ifdef CONFIG_PERF_COUNTERS 87/*
88extern void init_hw_perf_counters(void); 88 * We model BTS tracing as another fixed-mode PMC.
89extern void perf_counters_lapic_init(void); 89 *
90 * We choose a value in the middle of the fixed event range, since lower
91 * values are used by actual fixed events and higher values are used
92 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
93 */
94#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
95
96
97#ifdef CONFIG_PERF_EVENTS
98extern void init_hw_perf_events(void);
99extern void perf_events_lapic_init(void);
90 100
91#define PERF_COUNTER_INDEX_OFFSET 0 101#define PERF_EVENT_INDEX_OFFSET 0
92 102
93#else 103#else
94static inline void init_hw_perf_counters(void) { } 104static inline void init_hw_perf_events(void) { }
95static inline void perf_counters_lapic_init(void) { } 105static inline void perf_events_lapic_init(void) { }
96#endif 106#endif
97 107
98#endif /* _ASM_X86_PERF_COUNTER_H */ 108#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 16748077559a..af6fd360ab35 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -56,16 +56,6 @@ extern struct list_head pgd_list;
56#define pte_update(mm, addr, ptep) do { } while (0) 56#define pte_update(mm, addr, ptep) do { } while (0)
57#define pte_update_defer(mm, addr, ptep) do { } while (0) 57#define pte_update_defer(mm, addr, ptep) do { } while (0)
58 58
59static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
60{
61 native_pagetable_setup_start(base);
62}
63
64static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
65{
66 native_pagetable_setup_done(base);
67}
68
69#define pgd_val(x) native_pgd_val(x) 59#define pgd_val(x) native_pgd_val(x)
70#define __pgd(x) native_make_pgd(x) 60#define __pgd(x) native_make_pgd(x)
71 61
@@ -135,6 +125,11 @@ static inline unsigned long pte_pfn(pte_t pte)
135 return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; 125 return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
136} 126}
137 127
128static inline unsigned long pmd_pfn(pmd_t pmd)
129{
130 return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
131}
132
138#define pte_page(pte) pfn_to_page(pte_pfn(pte)) 133#define pte_page(pte) pfn_to_page(pte_pfn(pte))
139 134
140static inline int pmd_large(pmd_t pte) 135static inline int pmd_large(pmd_t pte)
@@ -359,7 +354,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
359 * this macro returns the index of the entry in the pmd page which would 354 * this macro returns the index of the entry in the pmd page which would
360 * control the given virtual address 355 * control the given virtual address
361 */ 356 */
362static inline unsigned pmd_index(unsigned long address) 357static inline unsigned long pmd_index(unsigned long address)
363{ 358{
364 return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); 359 return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
365} 360}
@@ -379,7 +374,7 @@ static inline unsigned pmd_index(unsigned long address)
379 * this function returns the index of the entry in the pte page which would 374 * this function returns the index of the entry in the pte page which would
380 * control the given virtual address 375 * control the given virtual address
381 */ 376 */
382static inline unsigned pte_index(unsigned long address) 377static inline unsigned long pte_index(unsigned long address)
383{ 378{
384 return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 379 return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
385} 380}
@@ -430,11 +425,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
430 return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); 425 return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
431} 426}
432 427
433static inline unsigned long pmd_pfn(pmd_t pmd)
434{
435 return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
436}
437
438static inline int pud_large(pud_t pud) 428static inline int pud_large(pud_t pud)
439{ 429{
440 return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == 430 return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
@@ -470,7 +460,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
470#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) 460#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
471 461
472/* to find an entry in a page-table-directory. */ 462/* to find an entry in a page-table-directory. */
473static inline unsigned pud_index(unsigned long address) 463static inline unsigned long pud_index(unsigned long address)
474{ 464{
475 return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); 465 return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
476} 466}
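
The pmd_index()/pte_index()/pud_index() helpers touched above only change their return type, but a quick worked example may help: each one shifts the virtual address down to that level's granularity and masks with the table size. The constants below are the conventional x86-64 4-level values and an LP64 build is assumed; they are not part of the hunk shown here, so treat the program as an illustration only.

#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KiB pages */
#define PMD_SHIFT	21	/* 2 MiB per PMD entry */
#define PTRS_PER_PTE	512
#define PTRS_PER_PMD	512

static unsigned long pte_index(unsigned long address)
{
	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

static unsigned long pmd_index(unsigned long address)
{
	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}

int main(void)
{
	unsigned long addr = 0x00007f1234567abcUL;	/* arbitrary user address */

	/* Which PTE slot and which PMD slot would map this address. */
	printf("pte_index=%lu pmd_index=%lu\n", pte_index(addr), pmd_index(addr));
	return 0;
}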
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 54cb697f4900..7b467bf3c680 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,8 +299,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
299extern void native_pagetable_setup_start(pgd_t *base); 299extern void native_pagetable_setup_start(pgd_t *base);
300extern void native_pagetable_setup_done(pgd_t *base); 300extern void native_pagetable_setup_done(pgd_t *base);
301#else 301#else
302static inline void native_pagetable_setup_start(pgd_t *base) {} 302#define native_pagetable_setup_start x86_init_pgd_noop
303static inline void native_pagetable_setup_done(pgd_t *base) {} 303#define native_pagetable_setup_done x86_init_pgd_noop
304#endif 304#endif
305 305
306struct seq_file; 306struct seq_file;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c7768269b1cf..c3429e8b2424 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@ struct mm_struct;
27#include <linux/cpumask.h> 27#include <linux/cpumask.h>
28#include <linux/cache.h> 28#include <linux/cache.h>
29#include <linux/threads.h> 29#include <linux/threads.h>
30#include <linux/math64.h>
30#include <linux/init.h> 31#include <linux/init.h>
31 32
32/* 33/*
@@ -403,7 +404,17 @@ extern unsigned long kernel_eflags;
403extern asmlinkage void ignore_sysret(void); 404extern asmlinkage void ignore_sysret(void);
404#else /* X86_64 */ 405#else /* X86_64 */
405#ifdef CONFIG_CC_STACKPROTECTOR 406#ifdef CONFIG_CC_STACKPROTECTOR
406DECLARE_PER_CPU(unsigned long, stack_canary); 407/*
408 * Make sure the stack canary segment base is cache-line aligned:
409 * "For Intel Atom processors, avoid non zero segment base address
410 * that is not aligned to cache line boundary at all cost."
411 * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
412 */
413struct stack_canary {
414 char __pad[20]; /* canary at %gs:20 */
415 unsigned long canary;
416};
417DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
407#endif 418#endif
408#endif /* X86_64 */ 419#endif /* X86_64 */
409 420
@@ -703,13 +714,23 @@ static inline void cpu_relax(void)
703 rep_nop(); 714 rep_nop();
704} 715}
705 716
706/* Stop speculative execution: */ 717/* Stop speculative execution and prefetching of modified code. */
707static inline void sync_core(void) 718static inline void sync_core(void)
708{ 719{
709 int tmp; 720 int tmp;
710 721
711 asm volatile("cpuid" : "=a" (tmp) : "0" (1) 722#if defined(CONFIG_M386) || defined(CONFIG_M486)
712 : "ebx", "ecx", "edx", "memory"); 723 if (boot_cpu_data.x86 < 5)
724 /* There is no speculative execution.
725 * jmp is a barrier to prefetching. */
726 asm volatile("jmp 1f\n1:\n" ::: "memory");
727 else
728#endif
729 /* cpuid is a barrier to speculative execution.
730 * Prefetched instructions are automatically
731 * invalidated when modified. */
732 asm volatile("cpuid" : "=a" (tmp) : "0" (1)
733 : "ebx", "ecx", "edx", "memory");
713} 734}
714 735
715static inline void __monitor(const void *eax, unsigned long ecx, 736static inline void __monitor(const void *eax, unsigned long ecx,
@@ -1000,4 +1021,35 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
1000extern int get_tsc_mode(unsigned long adr); 1021extern int get_tsc_mode(unsigned long adr);
1001extern int set_tsc_mode(unsigned int val); 1022extern int set_tsc_mode(unsigned int val);
1002 1023
1024extern int amd_get_nb_id(int cpu);
1025
1026struct aperfmperf {
1027 u64 aperf, mperf;
1028};
1029
1030static inline void get_aperfmperf(struct aperfmperf *am)
1031{
1032 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
1033
1034 rdmsrl(MSR_IA32_APERF, am->aperf);
1035 rdmsrl(MSR_IA32_MPERF, am->mperf);
1036}
1037
1038#define APERFMPERF_SHIFT 10
1039
1040static inline
1041unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
1042 struct aperfmperf *new)
1043{
1044 u64 aperf = new->aperf - old->aperf;
1045 u64 mperf = new->mperf - old->mperf;
1046 unsigned long ratio = aperf;
1047
1048 mperf >>= APERFMPERF_SHIFT;
1049 if (mperf)
1050 ratio = div64_u64(aperf, mperf);
1051
1052 return ratio;
1053}
1054
1003#endif /* _ASM_X86_PROCESSOR_H */ 1055#endif /* _ASM_X86_PROCESSOR_H */
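
calc_aperfmperf_ratio() above is just a scaled division: the MPERF delta is shifted down by APERFMPERF_SHIFT, so the resulting ratio is effectively a fixed-point value with 10 fractional bits (a core running at its reference clock comes out near 1 << 10). The program below restates the arithmetic in user space with made-up deltas; div64_u64() is replaced by a plain 64-bit division.

#include <stdio.h>
#include <stdint.h>

#define APERFMPERF_SHIFT 10

static uint64_t calc_aperfmperf_ratio(uint64_t aperf, uint64_t mperf)
{
	uint64_t ratio = aperf;

	mperf >>= APERFMPERF_SHIFT;
	if (mperf)
		ratio = aperf / mperf;	/* div64_u64() in the kernel */

	return ratio;
}

int main(void)
{
	/* Hypothetical deltas: APERF advanced twice as fast as MPERF, so the
	 * ratio lands near 2 << APERFMPERF_SHIFT (about 2048). */
	printf("ratio = %llu\n",
	       (unsigned long long)calc_aperfmperf_ratio(2000000, 1000000));
	return 0;
}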
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index 263d397d2eef..75af592677ec 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -1,33 +1,8 @@
1#ifndef _ASM_X86_SCATTERLIST_H 1#ifndef _ASM_X86_SCATTERLIST_H
2#define _ASM_X86_SCATTERLIST_H 2#define _ASM_X86_SCATTERLIST_H
3 3
4#include <asm/types.h>
5
6struct scatterlist {
7#ifdef CONFIG_DEBUG_SG
8 unsigned long sg_magic;
9#endif
10 unsigned long page_link;
11 unsigned int offset;
12 unsigned int length;
13 dma_addr_t dma_address;
14 unsigned int dma_length;
15};
16
17#define ARCH_HAS_SG_CHAIN
18#define ISA_DMA_THRESHOLD (0x00ffffff) 4#define ISA_DMA_THRESHOLD (0x00ffffff)
19 5
20/* 6#include <asm-generic/scatterlist.h>
21 * These macros should be used after a pci_map_sg call has been done
22 * to get bus addresses of each of the SG entries and their lengths.
23 * You should only work with the number of sg entries pci_map_sg
24 * returns.
25 */
26#define sg_dma_address(sg) ((sg)->dma_address)
27#ifdef CONFIG_X86_32
28# define sg_dma_len(sg) ((sg)->length)
29#else
30# define sg_dma_len(sg) ((sg)->dma_length)
31#endif
32 7
33#endif /* _ASM_X86_SCATTERLIST_H */ 8#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 4093d1ed6db2..18e496c98ff0 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -5,43 +5,6 @@
5 5
6#define COMMAND_LINE_SIZE 2048 6#define COMMAND_LINE_SIZE 2048
7 7
8#ifndef __ASSEMBLY__
9
10/*
11 * Any setup quirks to be performed?
12 */
13struct mpc_cpu;
14struct mpc_bus;
15struct mpc_oemtable;
16
17struct x86_quirks {
18 int (*arch_pre_time_init)(void);
19 int (*arch_time_init)(void);
20 int (*arch_pre_intr_init)(void);
21 int (*arch_intr_init)(void);
22 int (*arch_trap_init)(void);
23 char * (*arch_memory_setup)(void);
24 int (*mach_get_smp_config)(unsigned int early);
25 int (*mach_find_smp_config)(unsigned int reserve);
26
27 int *mpc_record;
28 int (*mpc_apic_id)(struct mpc_cpu *m);
29 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
30 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
31 void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
32 unsigned short oemsize);
33 int (*setup_ioapic_ids)(void);
34};
35
36extern void x86_quirk_intr_init(void);
37
38extern void x86_quirk_trap_init(void);
39
40extern void x86_quirk_pre_time_init(void);
41extern void x86_quirk_time_init(void);
42
43#endif /* __ASSEMBLY__ */
44
45#ifdef __i386__ 8#ifdef __i386__
46 9
47#include <linux/pfn.h> 10#include <linux/pfn.h>
@@ -61,6 +24,7 @@ extern void x86_quirk_time_init(void);
61 24
62#ifndef __ASSEMBLY__ 25#ifndef __ASSEMBLY__
63#include <asm/bootparam.h> 26#include <asm/bootparam.h>
27#include <asm/x86_init.h>
64 28
65/* Interrupt control for vSMPowered x86_64 systems */ 29/* Interrupt control for vSMPowered x86_64 systems */
66#ifdef CONFIG_X86_64 30#ifdef CONFIG_X86_64
@@ -79,11 +43,16 @@ static inline void visws_early_detect(void) { }
79static inline int is_visws_box(void) { return 0; } 43static inline int is_visws_box(void) { return 0; }
80#endif 44#endif
81 45
82extern struct x86_quirks *x86_quirks;
83extern unsigned long saved_video_mode; 46extern unsigned long saved_video_mode;
84 47
85#ifndef CONFIG_PARAVIRT 48extern void reserve_standard_io_resources(void);
86#define paravirt_post_allocator_init() do {} while (0) 49extern void i386_reserve_resources(void);
50extern void setup_default_timer_irq(void);
51
52#ifdef CONFIG_X86_MRST
53extern void x86_mrst_early_setup(void);
54#else
55static inline void x86_mrst_early_setup(void) { }
87#endif 56#endif
88 57
89#ifndef _SETUP 58#ifndef _SETUP
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h
index b51413b74971..83c05fc2de38 100644
--- a/arch/x86/include/asm/shmbuf.h
+++ b/arch/x86/include/asm/shmbuf.h
@@ -1,51 +1 @@
1#ifndef _ASM_X86_SHMBUF_H #include <asm-generic/shmbuf.h>
2#define _ASM_X86_SHMBUF_H
3
4/*
5 * The shmid64_ds structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space on 32 bit is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 *
13 * Pad space on 64 bit is left for:
14 * - 2 miscellaneous 64-bit values
15 */
16
17struct shmid64_ds {
18 struct ipc64_perm shm_perm; /* operation perms */
19 size_t shm_segsz; /* size of segment (bytes) */
20 __kernel_time_t shm_atime; /* last attach time */
21#ifdef __i386__
22 unsigned long __unused1;
23#endif
24 __kernel_time_t shm_dtime; /* last detach time */
25#ifdef __i386__
26 unsigned long __unused2;
27#endif
28 __kernel_time_t shm_ctime; /* last change time */
29#ifdef __i386__
30 unsigned long __unused3;
31#endif
32 __kernel_pid_t shm_cpid; /* pid of creator */
33 __kernel_pid_t shm_lpid; /* pid of last operator */
34 unsigned long shm_nattch; /* no. of current attaches */
35 unsigned long __unused4;
36 unsigned long __unused5;
37};
38
39struct shminfo64 {
40 unsigned long shmmax;
41 unsigned long shmmin;
42 unsigned long shmmni;
43 unsigned long shmseg;
44 unsigned long shmall;
45 unsigned long __unused1;
46 unsigned long __unused2;
47 unsigned long __unused3;
48 unsigned long __unused4;
49};
50
51#endif /* _ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 6a84ed166aec..1e796782cd7b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu)
121 smp_ops.send_call_func_single_ipi(cpu); 121 smp_ops.send_call_func_single_ipi(cpu);
122} 122}
123 123
124#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
125static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) 124static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
126{ 125{
127 smp_ops.send_call_func_ipi(mask); 126 smp_ops.send_call_func_ipi(mask);
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
index ca8bf2cd0ba9..6b71384b9d8b 100644
--- a/arch/x86/include/asm/socket.h
+++ b/arch/x86/include/asm/socket.h
@@ -1,60 +1 @@
1#ifndef _ASM_X86_SOCKET_H #include <asm-generic/socket.h>
2#define _ASM_X86_SOCKET_H
3
4#include <asm/sockios.h>
5
6/* For setsockopt(2) */
7#define SOL_SOCKET 1
8
9#define SO_DEBUG 1
10#define SO_REUSEADDR 2
11#define SO_TYPE 3
12#define SO_ERROR 4
13#define SO_DONTROUTE 5
14#define SO_BROADCAST 6
15#define SO_SNDBUF 7
16#define SO_RCVBUF 8
17#define SO_SNDBUFFORCE 32
18#define SO_RCVBUFFORCE 33
19#define SO_KEEPALIVE 9
20#define SO_OOBINLINE 10
21#define SO_NO_CHECK 11
22#define SO_PRIORITY 12
23#define SO_LINGER 13
24#define SO_BSDCOMPAT 14
25/* To add :#define SO_REUSEPORT 15 */
26#define SO_PASSCRED 16
27#define SO_PEERCRED 17
28#define SO_RCVLOWAT 18
29#define SO_SNDLOWAT 19
30#define SO_RCVTIMEO 20
31#define SO_SNDTIMEO 21
32
33/* Security levels - as per NRL IPv6 - don't actually do anything */
34#define SO_SECURITY_AUTHENTICATION 22
35#define SO_SECURITY_ENCRYPTION_TRANSPORT 23
36#define SO_SECURITY_ENCRYPTION_NETWORK 24
37
38#define SO_BINDTODEVICE 25
39
40/* Socket filtering */
41#define SO_ATTACH_FILTER 26
42#define SO_DETACH_FILTER 27
43
44#define SO_PEERNAME 28
45#define SO_TIMESTAMP 29
46#define SCM_TIMESTAMP SO_TIMESTAMP
47
48#define SO_ACCEPTCONN 30
49
50#define SO_PEERSEC 31
51#define SO_PASSSEC 34
52#define SO_TIMESTAMPNS 35
53#define SCM_TIMESTAMPNS SO_TIMESTAMPNS
54
55#define SO_MARK 36
56
57#define SO_TIMESTAMPING 37
58#define SCM_TIMESTAMPING SO_TIMESTAMPING
59
60#endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h
index 49cc72b5d3c9..def6d4746ee7 100644
--- a/arch/x86/include/asm/sockios.h
+++ b/arch/x86/include/asm/sockios.h
@@ -1,13 +1 @@
1#ifndef _ASM_X86_SOCKIOS_H #include <asm-generic/sockios.h>
2#define _ASM_X86_SOCKIOS_H
3
4/* Socket-level I/O control calls. */
5#define FIOSETOWN 0x8901
6#define SIOCSPGRP 0x8902
7#define FIOGETOWN 0x8903
8#define SIOCGPGRP 0x8904
9#define SIOCATMARK 0x8905
10#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */
11#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */
12
13#endif /* _ASM_X86_SOCKIOS_H */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index c2d742c6e15f..157517763565 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -48,7 +48,7 @@
48 * head_32 for boot CPU and setup_per_cpu_areas() for others. 48 * head_32 for boot CPU and setup_per_cpu_areas() for others.
49 */ 49 */
50#define GDT_STACK_CANARY_INIT \ 50#define GDT_STACK_CANARY_INIT \
51 [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, 51 [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18),
52 52
53/* 53/*
54 * Initialize the stackprotector canary value. 54 * Initialize the stackprotector canary value.
@@ -78,21 +78,19 @@ static __always_inline void boot_init_stack_canary(void)
78#ifdef CONFIG_X86_64 78#ifdef CONFIG_X86_64
79 percpu_write(irq_stack_union.stack_canary, canary); 79 percpu_write(irq_stack_union.stack_canary, canary);
80#else 80#else
81 percpu_write(stack_canary, canary); 81 percpu_write(stack_canary.canary, canary);
82#endif 82#endif
83} 83}
84 84
85static inline void setup_stack_canary_segment(int cpu) 85static inline void setup_stack_canary_segment(int cpu)
86{ 86{
87#ifdef CONFIG_X86_32 87#ifdef CONFIG_X86_32
88 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20; 88 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
89 struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); 89 struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
90 struct desc_struct desc; 90 struct desc_struct desc;
91 91
92 desc = gdt_table[GDT_ENTRY_STACK_CANARY]; 92 desc = gdt_table[GDT_ENTRY_STACK_CANARY];
93 desc.base0 = canary & 0xffff; 93 set_desc_base(&desc, canary);
94 desc.base1 = (canary >> 16) & 0xff;
95 desc.base2 = (canary >> 24) & 0xff;
96 write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); 94 write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
97#endif 95#endif
98} 96}
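
The companion change in processor.h (earlier in this diff) wraps the 32-bit canary in struct stack_canary behind a 20-byte pad, which is why setup_stack_canary_segment() above can now point the segment base straight at the per-cpu variable instead of subtracting 20. The check below is a user-space illustration of that layout; it only prints 20 when built for a 32-bit ABI (e.g. gcc -m32), since an LP64 compiler pads the 8-byte unsigned long up to offset 24, which is exactly why the struct is only used on X86_32.

#include <stdio.h>
#include <stddef.h>

struct stack_canary {
	char __pad[20];		/* gcc's stack protector reads the canary at %gs:20 */
	unsigned long canary;
};

int main(void)
{
	printf("canary offset = %zu\n", offsetof(struct stack_canary, canary));
	return 0;
}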
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index c86f452256de..ae907e617181 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -65,7 +65,6 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
65 case 4: 65 case 4:
66 *(int *)to = *(int *)from; 66 *(int *)to = *(int *)from;
67 return to; 67 return to;
68
69 case 3: 68 case 3:
70 *(short *)to = *(short *)from; 69 *(short *)to = *(short *)from;
71 *((char *)to + 2) = *((char *)from + 2); 70 *((char *)to + 2) = *((char *)from + 2);
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d82f39bb7905..8d33bc5462d1 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Access to user system call parameters and results 2 * Access to user system call parameters and results
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
5 * 5 *
6 * This copyrighted material is made available to anyone wishing to use, 6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions 7 * modify, copy, or redistribute it subject to the terms and conditions
@@ -16,13 +16,13 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18 18
19static inline long syscall_get_nr(struct task_struct *task, 19/*
20 struct pt_regs *regs) 20 * Only the low 32 bits of orig_ax are meaningful, so we return int.
21 * This importantly ignores the high bits on 64-bit, so comparisons
22 * sign-extend the low 32 bits.
23 */
24static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
21{ 25{
22 /*
23 * We always sign-extend a -1 value being set here,
24 * so this is always either -1L or a syscall number.
25 */
26 return regs->orig_ax; 26 return regs->orig_ax;
27} 27}
28 28
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 643c59b4bc6e..f08f97374892 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
31 "movl %P[task_canary](%[next]), %%ebx\n\t" \ 31 "movl %P[task_canary](%[next]), %%ebx\n\t" \
32 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" 32 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
33#define __switch_canary_oparam \ 33#define __switch_canary_oparam \
34 , [stack_canary] "=m" (per_cpu_var(stack_canary)) 34 , [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
35#define __switch_canary_iparam \ 35#define __switch_canary_iparam \
36 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 36 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
37#else /* CC_STACKPROTECTOR */ 37#else /* CC_STACKPROTECTOR */
@@ -150,33 +150,6 @@ do { \
150#endif 150#endif
151 151
152#ifdef __KERNEL__ 152#ifdef __KERNEL__
153#define _set_base(addr, base) do { unsigned long __pr; \
154__asm__ __volatile__ ("movw %%dx,%1\n\t" \
155 "rorl $16,%%edx\n\t" \
156 "movb %%dl,%2\n\t" \
157 "movb %%dh,%3" \
158 :"=&d" (__pr) \
159 :"m" (*((addr)+2)), \
160 "m" (*((addr)+4)), \
161 "m" (*((addr)+7)), \
162 "0" (base) \
163 ); } while (0)
164
165#define _set_limit(addr, limit) do { unsigned long __lr; \
166__asm__ __volatile__ ("movw %%dx,%1\n\t" \
167 "rorl $16,%%edx\n\t" \
168 "movb %2,%%dh\n\t" \
169 "andb $0xf0,%%dh\n\t" \
170 "orb %%dh,%%dl\n\t" \
171 "movb %%dl,%2" \
172 :"=&d" (__lr) \
173 :"m" (*(addr)), \
174 "m" (*((addr)+6)), \
175 "0" (limit) \
176 ); } while (0)
177
178#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
179#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
180 153
181extern void native_load_gs_index(unsigned); 154extern void native_load_gs_index(unsigned);
182 155
diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h
index af1b70ea440f..3935b106de79 100644
--- a/arch/x86/include/asm/termbits.h
+++ b/arch/x86/include/asm/termbits.h
@@ -1,198 +1 @@
1#ifndef _ASM_X86_TERMBITS_H #include <asm-generic/termbits.h>
2#define _ASM_X86_TERMBITS_H
3
4#include <linux/posix_types.h>
5
6typedef unsigned char cc_t;
7typedef unsigned int speed_t;
8typedef unsigned int tcflag_t;
9
10#define NCCS 19
11struct termios {
12 tcflag_t c_iflag; /* input mode flags */
13 tcflag_t c_oflag; /* output mode flags */
14 tcflag_t c_cflag; /* control mode flags */
15 tcflag_t c_lflag; /* local mode flags */
16 cc_t c_line; /* line discipline */
17 cc_t c_cc[NCCS]; /* control characters */
18};
19
20struct termios2 {
21 tcflag_t c_iflag; /* input mode flags */
22 tcflag_t c_oflag; /* output mode flags */
23 tcflag_t c_cflag; /* control mode flags */
24 tcflag_t c_lflag; /* local mode flags */
25 cc_t c_line; /* line discipline */
26 cc_t c_cc[NCCS]; /* control characters */
27 speed_t c_ispeed; /* input speed */
28 speed_t c_ospeed; /* output speed */
29};
30
31struct ktermios {
32 tcflag_t c_iflag; /* input mode flags */
33 tcflag_t c_oflag; /* output mode flags */
34 tcflag_t c_cflag; /* control mode flags */
35 tcflag_t c_lflag; /* local mode flags */
36 cc_t c_line; /* line discipline */
37 cc_t c_cc[NCCS]; /* control characters */
38 speed_t c_ispeed; /* input speed */
39 speed_t c_ospeed; /* output speed */
40};
41
42/* c_cc characters */
43#define VINTR 0
44#define VQUIT 1
45#define VERASE 2
46#define VKILL 3
47#define VEOF 4
48#define VTIME 5
49#define VMIN 6
50#define VSWTC 7
51#define VSTART 8
52#define VSTOP 9
53#define VSUSP 10
54#define VEOL 11
55#define VREPRINT 12
56#define VDISCARD 13
57#define VWERASE 14
58#define VLNEXT 15
59#define VEOL2 16
60
61/* c_iflag bits */
62#define IGNBRK 0000001
63#define BRKINT 0000002
64#define IGNPAR 0000004
65#define PARMRK 0000010
66#define INPCK 0000020
67#define ISTRIP 0000040
68#define INLCR 0000100
69#define IGNCR 0000200
70#define ICRNL 0000400
71#define IUCLC 0001000
72#define IXON 0002000
73#define IXANY 0004000
74#define IXOFF 0010000
75#define IMAXBEL 0020000
76#define IUTF8 0040000
77
78/* c_oflag bits */
79#define OPOST 0000001
80#define OLCUC 0000002
81#define ONLCR 0000004
82#define OCRNL 0000010
83#define ONOCR 0000020
84#define ONLRET 0000040
85#define OFILL 0000100
86#define OFDEL 0000200
87#define NLDLY 0000400
88#define NL0 0000000
89#define NL1 0000400
90#define CRDLY 0003000
91#define CR0 0000000
92#define CR1 0001000
93#define CR2 0002000
94#define CR3 0003000
95#define TABDLY 0014000
96#define TAB0 0000000
97#define TAB1 0004000
98#define TAB2 0010000
99#define TAB3 0014000
100#define XTABS 0014000
101#define BSDLY 0020000
102#define BS0 0000000
103#define BS1 0020000
104#define VTDLY 0040000
105#define VT0 0000000
106#define VT1 0040000
107#define FFDLY 0100000
108#define FF0 0000000
109#define FF1 0100000
110
111/* c_cflag bit meaning */
112#define CBAUD 0010017
113#define B0 0000000 /* hang up */
114#define B50 0000001
115#define B75 0000002
116#define B110 0000003
117#define B134 0000004
118#define B150 0000005
119#define B200 0000006
120#define B300 0000007
121#define B600 0000010
122#define B1200 0000011
123#define B1800 0000012
124#define B2400 0000013
125#define B4800 0000014
126#define B9600 0000015
127#define B19200 0000016
128#define B38400 0000017
129#define EXTA B19200
130#define EXTB B38400
131#define CSIZE 0000060
132#define CS5 0000000
133#define CS6 0000020
134#define CS7 0000040
135#define CS8 0000060
136#define CSTOPB 0000100
137#define CREAD 0000200
138#define PARENB 0000400
139#define PARODD 0001000
140#define HUPCL 0002000
141#define CLOCAL 0004000
142#define CBAUDEX 0010000
143#define BOTHER 0010000 /* non standard rate */
144#define B57600 0010001
145#define B115200 0010002
146#define B230400 0010003
147#define B460800 0010004
148#define B500000 0010005
149#define B576000 0010006
150#define B921600 0010007
151#define B1000000 0010010
152#define B1152000 0010011
153#define B1500000 0010012
154#define B2000000 0010013
155#define B2500000 0010014
156#define B3000000 0010015
157#define B3500000 0010016
158#define B4000000 0010017
159#define CIBAUD 002003600000 /* input baud rate */
160#define CMSPAR 010000000000 /* mark or space (stick) parity */
161#define CRTSCTS 020000000000 /* flow control */
162
163#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */
164
165/* c_lflag bits */
166#define ISIG 0000001
167#define ICANON 0000002
168#define XCASE 0000004
169#define ECHO 0000010
170#define ECHOE 0000020
171#define ECHOK 0000040
172#define ECHONL 0000100
173#define NOFLSH 0000200
174#define TOSTOP 0000400
175#define ECHOCTL 0001000
176#define ECHOPRT 0002000
177#define ECHOKE 0004000
178#define FLUSHO 0010000
179#define PENDIN 0040000
180#define IEXTEN 0100000
181
182/* tcflow() and TCXONC use these */
183#define TCOOFF 0
184#define TCOON 1
185#define TCIOFF 2
186#define TCION 3
187
188/* tcflush() and TCFLSH use these */
189#define TCIFLUSH 0
190#define TCOFLUSH 1
191#define TCIOFLUSH 2
192
193/* tcsetattr uses these */
194#define TCSANOW 0
195#define TCSADRAIN 1
196#define TCSAFLUSH 2
197
198#endif /* _ASM_X86_TERMBITS_H */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index c4ee8056baca..280d78a9d966 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -1,114 +1 @@
1#ifndef _ASM_X86_TERMIOS_H #include <asm-generic/termios.h>
2#define _ASM_X86_TERMIOS_H
3
4#include <asm/termbits.h>
5#include <asm/ioctls.h>
6
7struct winsize {
8 unsigned short ws_row;
9 unsigned short ws_col;
10 unsigned short ws_xpixel;
11 unsigned short ws_ypixel;
12};
13
14#define NCC 8
15struct termio {
16 unsigned short c_iflag; /* input mode flags */
17 unsigned short c_oflag; /* output mode flags */
18 unsigned short c_cflag; /* control mode flags */
19 unsigned short c_lflag; /* local mode flags */
20 unsigned char c_line; /* line discipline */
21 unsigned char c_cc[NCC]; /* control characters */
22};
23
24/* modem lines */
25#define TIOCM_LE 0x001
26#define TIOCM_DTR 0x002
27#define TIOCM_RTS 0x004
28#define TIOCM_ST 0x008
29#define TIOCM_SR 0x010
30#define TIOCM_CTS 0x020
31#define TIOCM_CAR 0x040
32#define TIOCM_RNG 0x080
33#define TIOCM_DSR 0x100
34#define TIOCM_CD TIOCM_CAR
35#define TIOCM_RI TIOCM_RNG
36#define TIOCM_OUT1 0x2000
37#define TIOCM_OUT2 0x4000
38#define TIOCM_LOOP 0x8000
39
40/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
41
42#ifdef __KERNEL__
43
44#include <asm/uaccess.h>
45
46/* intr=^C quit=^\ erase=del kill=^U
47 eof=^D vtime=\0 vmin=\1 sxtc=\0
48 start=^Q stop=^S susp=^Z eol=\0
49 reprint=^R discard=^U werase=^W lnext=^V
50 eol2=\0
51*/
52#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
53
54/*
55 * Translate a "termio" structure into a "termios". Ugh.
56 */
57#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \
58 unsigned short __tmp; \
59 get_user(__tmp,&(termio)->x); \
60 *(unsigned short *) &(termios)->x = __tmp; \
61}
62
63static inline int user_termio_to_kernel_termios(struct ktermios *termios,
64 struct termio __user *termio)
65{
66 SET_LOW_TERMIOS_BITS(termios, termio, c_iflag);
67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
70 get_user(termios->c_line, &termio->c_line);
71 return copy_from_user(termios->c_cc, termio->c_cc, NCC);
72}
73
74/*
75 * Translate a "termios" structure into a "termio". Ugh.
76 */
77static inline int kernel_termios_to_user_termio(struct termio __user *termio,
78 struct ktermios *termios)
79{
80 put_user((termios)->c_iflag, &(termio)->c_iflag);
81 put_user((termios)->c_oflag, &(termio)->c_oflag);
82 put_user((termios)->c_cflag, &(termio)->c_cflag);
83 put_user((termios)->c_lflag, &(termio)->c_lflag);
84 put_user((termios)->c_line, &(termio)->c_line);
85 return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC);
86}
87
88static inline int user_termios_to_kernel_termios(struct ktermios *k,
89 struct termios2 __user *u)
90{
91 return copy_from_user(k, u, sizeof(struct termios2));
92}
93
94static inline int kernel_termios_to_user_termios(struct termios2 __user *u,
95 struct ktermios *k)
96{
97 return copy_to_user(u, k, sizeof(struct termios2));
98}
99
100static inline int user_termios_to_kernel_termios_1(struct ktermios *k,
101 struct termios __user *u)
102{
103 return copy_from_user(k, u, sizeof(struct termios));
104}
105
106static inline int kernel_termios_to_user_termios_1(struct termios __user *u,
107 struct ktermios *k)
108{
109 return copy_to_user(u, k, sizeof(struct termios));
110}
111
112#endif /* __KERNEL__ */
113
114#endif /* _ASM_X86_TERMIOS_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fad7d40b75f8..d27d0a2fec4c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,7 +95,7 @@ struct thread_info {
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ 97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ 98#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
99 99
100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -118,17 +118,17 @@ struct thread_info {
118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) 120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) 121#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
122 122
123/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
124#define _TIF_WORK_SYSCALL_ENTRY \ 124#define _TIF_WORK_SYSCALL_ENTRY \
125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ 125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
126 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) 126 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
127 127
128/* work to do in syscall_trace_leave() */ 128/* work to do in syscall_trace_leave() */
129#define _TIF_WORK_SYSCALL_EXIT \ 129#define _TIF_WORK_SYSCALL_EXIT \
130 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ 130 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
131 _TIF_SYSCALL_FTRACE) 131 _TIF_SYSCALL_TRACEPOINT)
132 132
133/* work to do on interrupt/exception return */ 133/* work to do on interrupt/exception return */
134#define _TIF_WORK_MASK \ 134#define _TIF_WORK_MASK \
@@ -137,7 +137,8 @@ struct thread_info {
137 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) 137 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
138 138
139/* work to do on any return to user space */ 139/* work to do on any return to user space */
140#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) 140#define _TIF_ALLWORK_MASK \
141 ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT)
141 142
142/* Only used for 64 bit */ 143/* Only used for 64 bit */
143#define _TIF_DO_NOTIFY_MASK \ 144#define _TIF_DO_NOTIFY_MASK \
@@ -213,7 +214,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
213static inline struct thread_info *current_thread_info(void) 214static inline struct thread_info *current_thread_info(void)
214{ 215{
215 struct thread_info *ti; 216 struct thread_info *ti;
216 ti = (void *)(percpu_read(kernel_stack) + 217 ti = (void *)(percpu_read_stable(kernel_stack) +
217 KERNEL_STACK_OFFSET - THREAD_SIZE); 218 KERNEL_STACK_OFFSET - THREAD_SIZE);
218 return ti; 219 return ti;
219} 220}
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 50c733aac421..7bdec4e9b739 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -4,60 +4,7 @@
4extern void hpet_time_init(void); 4extern void hpet_time_init(void);
5 5
6#include <asm/mc146818rtc.h> 6#include <asm/mc146818rtc.h>
7#ifdef CONFIG_X86_32
8#include <linux/efi.h>
9
10static inline unsigned long native_get_wallclock(void)
11{
12 unsigned long retval;
13
14 if (efi_enabled)
15 retval = efi_get_time();
16 else
17 retval = mach_get_cmos_time();
18
19 return retval;
20}
21
22static inline int native_set_wallclock(unsigned long nowtime)
23{
24 int retval;
25
26 if (efi_enabled)
27 retval = efi_set_rtc_mmss(nowtime);
28 else
29 retval = mach_set_rtc_mmss(nowtime);
30
31 return retval;
32}
33
34#else
35extern void native_time_init_hook(void);
36
37static inline unsigned long native_get_wallclock(void)
38{
39 return mach_get_cmos_time();
40}
41
42static inline int native_set_wallclock(unsigned long nowtime)
43{
44 return mach_set_rtc_mmss(nowtime);
45}
46
47#endif
48 7
49extern void time_init(void); 8extern void time_init(void);
50 9
51#ifdef CONFIG_PARAVIRT
52#include <asm/paravirt.h>
53#else /* !CONFIG_PARAVIRT */
54
55#define get_wallclock() native_get_wallclock()
56#define set_wallclock(x) native_set_wallclock(x)
57#define choose_time_init() hpet_time_init
58
59#endif /* CONFIG_PARAVIRT */
60
61extern unsigned long __init calibrate_cpu(void);
62
63#endif /* _ASM_X86_TIME_H */ 10#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 20ca9c4d4686..5469630b27f5 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -8,20 +8,16 @@
8#define TICK_SIZE (tick_nsec / 1000) 8#define TICK_SIZE (tick_nsec / 1000)
9 9
10unsigned long long native_sched_clock(void); 10unsigned long long native_sched_clock(void);
11unsigned long native_calibrate_tsc(void); 11extern int recalibrate_cpu_khz(void);
12 12
13#ifdef CONFIG_X86_32 13#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
14extern int timer_ack; 14extern int timer_ack;
15extern irqreturn_t timer_interrupt(int irq, void *dev_id); 15#else
16#endif /* CONFIG_X86_32 */ 16# define timer_ack (0)
17extern int recalibrate_cpu_khz(void); 17#endif
18 18
19extern int no_timer_check; 19extern int no_timer_check;
20 20
21#ifndef CONFIG_PARAVIRT
22#define calibrate_tsc() native_calibrate_tsc()
23#endif
24
25/* Accelerators for sched_clock() 21/* Accelerators for sched_clock()
26 * convert from cycles(64bits) => nanoseconds (64bits) 22 * convert from cycles(64bits) => nanoseconds (64bits)
27 * basic equation: 23 * basic equation:
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 066ef590d7e0..6f0695d744bf 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,38 +116,41 @@ extern unsigned long node_remap_size[];
116 116
117# define SD_CACHE_NICE_TRIES 1 117# define SD_CACHE_NICE_TRIES 1
118# define SD_IDLE_IDX 1 118# define SD_IDLE_IDX 1
119# define SD_NEWIDLE_IDX 2
120# define SD_FORKEXEC_IDX 0
121 119
122#else 120#else
123 121
124# define SD_CACHE_NICE_TRIES 2 122# define SD_CACHE_NICE_TRIES 2
125# define SD_IDLE_IDX 2 123# define SD_IDLE_IDX 2
126# define SD_NEWIDLE_IDX 2
127# define SD_FORKEXEC_IDX 1
128 124
129#endif 125#endif
130 126
131/* sched_domains SD_NODE_INIT for NUMA machines */ 127/* sched_domains SD_NODE_INIT for NUMA machines */
132#define SD_NODE_INIT (struct sched_domain) { \ 128#define SD_NODE_INIT (struct sched_domain) { \
133 .min_interval = 8, \ 129 .min_interval = 8, \
134 .max_interval = 32, \ 130 .max_interval = 32, \
135 .busy_factor = 32, \ 131 .busy_factor = 32, \
136 .imbalance_pct = 125, \ 132 .imbalance_pct = 125, \
137 .cache_nice_tries = SD_CACHE_NICE_TRIES, \ 133 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
138 .busy_idx = 3, \ 134 .busy_idx = 3, \
139 .idle_idx = SD_IDLE_IDX, \ 135 .idle_idx = SD_IDLE_IDX, \
140 .newidle_idx = SD_NEWIDLE_IDX, \ 136 .newidle_idx = 0, \
141 .wake_idx = 1, \ 137 .wake_idx = 0, \
142 .forkexec_idx = SD_FORKEXEC_IDX, \ 138 .forkexec_idx = 0, \
143 .flags = SD_LOAD_BALANCE \ 139 \
144 | SD_BALANCE_EXEC \ 140 .flags = 1*SD_LOAD_BALANCE \
145 | SD_BALANCE_FORK \ 141 | 1*SD_BALANCE_NEWIDLE \
146 | SD_WAKE_AFFINE \ 142 | 1*SD_BALANCE_EXEC \
147 | SD_WAKE_BALANCE \ 143 | 1*SD_BALANCE_FORK \
148 | SD_SERIALIZE, \ 144 | 0*SD_BALANCE_WAKE \
149 .last_balance = jiffies, \ 145 | 1*SD_WAKE_AFFINE \
150 .balance_interval = 1, \ 146 | 0*SD_SHARE_CPUPOWER \
147 | 0*SD_POWERSAVINGS_BALANCE \
148 | 0*SD_SHARE_PKG_RESOURCES \
149 | 1*SD_SERIALIZE \
150 | 0*SD_PREFER_SIBLING \
151 , \
152 .last_balance = jiffies, \
153 .balance_interval = 1, \
151} 154}
152 155
153#ifdef CONFIG_X86_64_ACPI_NUMA 156#ifdef CONFIG_X86_64_ACPI_NUMA
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index bfd74c032fca..4da91ad69e0d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -81,9 +81,7 @@ extern int panic_on_unrecovered_nmi;
81 81
82void math_error(void __user *); 82void math_error(void __user *);
83void math_emulate(struct math_emu_info *); 83void math_emulate(struct math_emu_info *);
84#ifdef CONFIG_X86_32 84#ifndef CONFIG_X86_32
85unsigned long patch_espfix_desc(unsigned long, unsigned long);
86#else
87asmlinkage void smp_thermal_interrupt(void); 85asmlinkage void smp_thermal_interrupt(void);
88asmlinkage void mce_threshold_interrupt(void); 86asmlinkage void mce_threshold_interrupt(void);
89#endif 87#endif
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 38ae163cc91b..c0427295e8f5 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void)
48extern void tsc_init(void); 48extern void tsc_init(void);
49extern void mark_tsc_unstable(char *reason); 49extern void mark_tsc_unstable(char *reason);
50extern int unsynchronized_tsc(void); 50extern int unsynchronized_tsc(void);
51int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void);
52 53
53/* 54/*
54 * Boot-time check whether the TSCs are synchronized across 55 * Boot-time check whether the TSCs are synchronized across
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index 09b97745772f..df1da20f4534 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -1,19 +1,11 @@
1#ifndef _ASM_X86_TYPES_H 1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H 2#define _ASM_X86_TYPES_H
3 3
4#include <asm-generic/int-ll64.h> 4#define dma_addr_t dma_addr_t
5 5
6#ifndef __ASSEMBLY__ 6#include <asm-generic/types.h>
7
8typedef unsigned short umode_t;
9 7
10#endif /* __ASSEMBLY__ */
11
12/*
13 * These aren't exported outside the kernel to avoid name space clashes
14 */
15#ifdef __KERNEL__ 8#ifdef __KERNEL__
16
17#ifndef __ASSEMBLY__ 9#ifndef __ASSEMBLY__
18 10
19typedef u64 dma64_addr_t; 11typedef u64 dma64_addr_t;
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 5e06259e90e5..632fb44b4cb5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -33,7 +33,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
33 * Copy data from kernel space to user space. Caller must check 33 * Copy data from kernel space to user space. Caller must check
34 * the specified block with access_ok() before calling this function. 34 * the specified block with access_ok() before calling this function.
35 * The caller should also make sure he pins the user space address 35 * The caller should also make sure he pins the user space address
36 * so that the we don't result in page fault and sleep. 36 * so that we don't result in page fault and sleep.
37 * 37 *
38 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault 38 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
39 * we return the initial request size (1, 2 or 4), as copy_*_user should do. 39 * we return the initial request size (1, 2 or 4), as copy_*_user should do.
diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h
index 87324cf439d9..b7c29c8017f2 100644
--- a/arch/x86/include/asm/ucontext.h
+++ b/arch/x86/include/asm/ucontext.h
@@ -7,12 +7,6 @@
7 * sigcontext struct (uc_mcontext). 7 * sigcontext struct (uc_mcontext).
8 */ 8 */
9 9
10struct ucontext { 10#include <asm-generic/ucontext.h>
11 unsigned long uc_flags;
12 struct ucontext *uc_link;
13 stack_t uc_stack;
14 struct sigcontext uc_mcontext;
15 sigset_t uc_sigmask; /* mask last for extensibility */
16};
17 11
18#endif /* _ASM_X86_UCONTEXT_H */ 12#endif /* _ASM_X86_UCONTEXT_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 732a30706153..6fb3c209a7e3 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -341,10 +341,12 @@
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336 344#define __NR_perf_event_open 336
345 345
346#ifdef __KERNEL__ 346#ifdef __KERNEL__
347 347
348#define NR_syscalls 337
349
348#define __ARCH_WANT_IPC_PARSE_VERSION 350#define __ARCH_WANT_IPC_PARSE_VERSION
349#define __ARCH_WANT_OLD_READDIR 351#define __ARCH_WANT_OLD_READDIR
350#define __ARCH_WANT_OLD_STAT 352#define __ARCH_WANT_OLD_STAT
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 900e1617e672..8d3ad0adbc68 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660#define __NR_rt_tgsigqueueinfo 297 660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_counter_open 298 662#define __NR_perf_event_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664 664
665#ifndef __NO_STUBS 665#ifndef __NO_STUBS
666#define __ARCH_WANT_OLD_READDIR 666#define __ARCH_WANT_OLD_READDIR
@@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
688#endif /* __NO_STUBS */ 688#endif /* __NO_STUBS */
689 689
690#ifdef __KERNEL__ 690#ifdef __KERNEL__
691
692#ifndef COMPILE_OFFSETS
693#include <asm/asm-offsets.h>
694#define NR_syscalls (__NR_syscall_max + 1)
695#endif
696
691/* 697/*
692 * "Conditional" syscalls 698 * "Conditional" syscalls
693 * 699 *
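Both syscall tables above only rename the entry; the calling convention is unchanged. As a hedged illustration (the perf_event_attr setup shown is the conventional minimum from <linux/perf_event.h> and is not part of this hunk), userspace reaches the renamed syscall by number like this:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* Open a CPU-cycle counter for the calling thread; illustrative only. */
static int open_cycle_counter(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type   = PERF_TYPE_HARDWARE;
	attr.size   = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	return syscall(__NR_perf_event_open, &attr,
		       0 /* this pid */, -1 /* any cpu */,
		       -1 /* no group */, 0 /* flags */);
}
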
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 77a68505419a..04eb6c958b9d 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -15,6 +15,7 @@
15#include <linux/numa.h> 15#include <linux/numa.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/timer.h> 17#include <linux/timer.h>
18#include <linux/io.h>
18#include <asm/types.h> 19#include <asm/types.h>
19#include <asm/percpu.h> 20#include <asm/percpu.h>
20#include <asm/uv/uv_mmrs.h> 21#include <asm/uv/uv_mmrs.h>
@@ -258,13 +259,13 @@ static inline unsigned long *uv_global_mmr32_address(int pnode,
258static inline void uv_write_global_mmr32(int pnode, unsigned long offset, 259static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
259 unsigned long val) 260 unsigned long val)
260{ 261{
261 *uv_global_mmr32_address(pnode, offset) = val; 262 writeq(val, uv_global_mmr32_address(pnode, offset));
262} 263}
263 264
264static inline unsigned long uv_read_global_mmr32(int pnode, 265static inline unsigned long uv_read_global_mmr32(int pnode,
265 unsigned long offset) 266 unsigned long offset)
266{ 267{
267 return *uv_global_mmr32_address(pnode, offset); 268 return readq(uv_global_mmr32_address(pnode, offset));
268} 269}
269 270
270/* 271/*
@@ -281,13 +282,13 @@ static inline unsigned long *uv_global_mmr64_address(int pnode,
281static inline void uv_write_global_mmr64(int pnode, unsigned long offset, 282static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
282 unsigned long val) 283 unsigned long val)
283{ 284{
284 *uv_global_mmr64_address(pnode, offset) = val; 285 writeq(val, uv_global_mmr64_address(pnode, offset));
285} 286}
286 287
287static inline unsigned long uv_read_global_mmr64(int pnode, 288static inline unsigned long uv_read_global_mmr64(int pnode,
288 unsigned long offset) 289 unsigned long offset)
289{ 290{
290 return *uv_global_mmr64_address(pnode, offset); 291 return readq(uv_global_mmr64_address(pnode, offset));
291} 292}
292 293
293/* 294/*
@@ -301,22 +302,22 @@ static inline unsigned long *uv_local_mmr_address(unsigned long offset)
301 302
302static inline unsigned long uv_read_local_mmr(unsigned long offset) 303static inline unsigned long uv_read_local_mmr(unsigned long offset)
303{ 304{
304 return *uv_local_mmr_address(offset); 305 return readq(uv_local_mmr_address(offset));
305} 306}
306 307
307static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) 308static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
308{ 309{
309 *uv_local_mmr_address(offset) = val; 310 writeq(val, uv_local_mmr_address(offset));
310} 311}
311 312
312static inline unsigned char uv_read_local_mmr8(unsigned long offset) 313static inline unsigned char uv_read_local_mmr8(unsigned long offset)
313{ 314{
314 return *((unsigned char *)uv_local_mmr_address(offset)); 315 return readb(uv_local_mmr_address(offset));
315} 316}
316 317
317static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) 318static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
318{ 319{
319 *((unsigned char *)uv_local_mmr_address(offset)) = val; 320 writeb(val, uv_local_mmr_address(offset));
320} 321}
321 322
322/* 323/*
@@ -422,7 +423,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
422 unsigned long val; 423 unsigned long val;
423 424
424 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 425 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
425 ((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) | 426 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
426 (vector << UVH_IPI_INT_VECTOR_SHFT); 427 (vector << UVH_IPI_INT_VECTOR_SHFT);
427 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 428 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
428} 429}
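The conversions above replace plain pointer dereferences with readq()/writeq()/readb()/writeb(), which is why <linux/io.h> is now included: the helpers fix the access width and ordering that a raw dereference does not guarantee. A minimal sketch of the same pattern, assuming a hypothetical device register window at phys_base (readq()/writeq() are only available on 64-bit builds):

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/io.h>

/* Illustration only: map a hypothetical 4K register window and touch a
 * 64-bit register with the same accessors used by the uv_*_mmr helpers. */
static int example_mmr_access(unsigned long phys_base)
{
	void __iomem *regs = ioremap(phys_base, 0x1000);

	if (!regs)
		return -ENOMEM;

	writeq(0x1UL, regs + 0x40);		/* 64-bit write */
	if (readq(regs + 0x40) != 0x1UL)	/* 64-bit read back */
		printk(KERN_WARNING "unexpected readback\n");

	iounmap(regs);
	return 0;
}
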
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index dc27a69e5d2a..3d61e204826f 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -21,6 +21,7 @@ struct vsyscall_gtod_data {
21 u32 shift; 21 u32 shift;
22 } clock; 22 } clock;
23 struct timespec wall_to_monotonic; 23 struct timespec wall_to_monotonic;
24 struct timespec wall_time_coarse;
24}; 25};
25extern struct vsyscall_gtod_data __vsyscall_gtod_data 26extern struct vsyscall_gtod_data __vsyscall_gtod_data
26__section_vsyscall_gtod_data; 27__section_vsyscall_gtod_data;
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
index c11b7e100d83..e49ed6d2fd4e 100644
--- a/arch/x86/include/asm/vmware.h
+++ b/arch/x86/include/asm/vmware.h
@@ -20,7 +20,7 @@
20#ifndef ASM_X86__VMWARE_H 20#ifndef ASM_X86__VMWARE_H
21#define ASM_X86__VMWARE_H 21#define ASM_X86__VMWARE_H
22 22
23extern unsigned long vmware_get_tsc_khz(void); 23extern void vmware_platform_setup(void);
24extern int vmware_platform(void); 24extern int vmware_platform(void);
25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); 25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
26 26
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 11be5ad2e0e9..272514c2d456 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -55,6 +55,7 @@
55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
58 59
59 60
60#define PIN_BASED_EXT_INTR_MASK 0x00000001 61#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -351,9 +352,16 @@ enum vmcs_field {
351#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 352#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
352#define VMX_EPT_EXTENT_CONTEXT 1 353#define VMX_EPT_EXTENT_CONTEXT 1
353#define VMX_EPT_EXTENT_GLOBAL 2 354#define VMX_EPT_EXTENT_GLOBAL 2
355
356#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
357#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
358#define VMX_EPTP_UC_BIT (1ull << 8)
359#define VMX_EPTP_WB_BIT (1ull << 14)
360#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
354#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 361#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
355#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 362#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
356#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 363#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
364
357#define VMX_EPT_DEFAULT_GAW 3 365#define VMX_EPT_DEFAULT_GAW 3
358#define VMX_EPT_MAX_GAW 0x4 366#define VMX_EPT_MAX_GAW 0x4
359#define VMX_EPT_MT_EPTE_SHIFT 3 367#define VMX_EPT_MT_EPTE_SHIFT 3
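The new EPT/EPTP bits correspond to capability bits reported in the IA32_VMX_EPT_VPID_CAP MSR, so a VMX host can feature-test before relying on, say, 2 MB EPT pages. A hedged sketch of such a probe (assuming only MSR_IA32_VMX_EPT_VPID_CAP from <asm/msr-index.h> and rdmsr_safe() from <asm/msr.h>, and a CPU that actually supports VMX/EPT):

#include <linux/kernel.h>
#include <asm/msr.h>
#include <asm/vmx.h>

/* Illustration only: probe the EPT capability MSR and test two of the
 * bits defined above. */
static void example_probe_ept_caps(void)
{
	u32 lo, hi;

	if (rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &lo, &hi))
		return;				/* MSR not present */

	if (lo & VMX_EPT_2MB_PAGE_BIT)
		printk(KERN_INFO "EPT: 2MB pages supported\n");
	if (lo & VMX_EPT_EXECUTE_ONLY_BIT)
		printk(KERN_INFO "EPT: execute-only mappings supported\n");
}
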
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
new file mode 100644
index 000000000000..2c756fd4ab0e
--- /dev/null
+++ b/arch/x86/include/asm/x86_init.h
@@ -0,0 +1,133 @@
1#ifndef _ASM_X86_PLATFORM_H
2#define _ASM_X86_PLATFORM_H
3
4#include <asm/pgtable_types.h>
5#include <asm/bootparam.h>
6
7struct mpc_bus;
8struct mpc_cpu;
9struct mpc_table;
10
11/**
12 * struct x86_init_mpparse - platform specific mpparse ops
13 * @mpc_record: platform specific mpc record accounting
14 * @setup_ioapic_ids: platform specific ioapic id override
15 * @mpc_apic_id: platform specific mpc apic id assignment
16 * @smp_read_mpc_oem: platform specific oem mpc table setup
17 * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL)
18 * @mpc_oem_bus_info: platform specific mpc bus info
19 * @find_smp_config: find the smp configuration
20 * @get_smp_config: get the smp configuration
21 */
22struct x86_init_mpparse {
23 void (*mpc_record)(unsigned int mode);
24 void (*setup_ioapic_ids)(void);
25 int (*mpc_apic_id)(struct mpc_cpu *m);
26 void (*smp_read_mpc_oem)(struct mpc_table *mpc);
27 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
29 void (*find_smp_config)(unsigned int reserve);
30 void (*get_smp_config)(unsigned int early);
31};
32
33/**
34 * struct x86_init_resources - platform specific resource related ops
35 * @probe_roms: probe BIOS roms
36 * @reserve_resources: reserve the standard resources for the
37 * platform
38 * @memory_setup: platform specific memory setup
39 *
40 */
41struct x86_init_resources {
42 void (*probe_roms)(void);
43 void (*reserve_resources)(void);
44 char *(*memory_setup)(void);
45};
46
47/**
48 * struct x86_init_irqs - platform specific interrupt setup
49 * @pre_vector_init: init code to run before interrupt vectors
50 * are set up.
51 * @intr_init: interrupt init code
52 * @trap_init: platform specific trap setup
53 */
54struct x86_init_irqs {
55 void (*pre_vector_init)(void);
56 void (*intr_init)(void);
57 void (*trap_init)(void);
58};
59
60/**
61 * struct x86_init_oem - oem platform specific customizing functions
62 * @arch_setup: platform specific architecure setup
63 * @banner: print a platform specific banner
64 */
65struct x86_init_oem {
66 void (*arch_setup)(void);
67 void (*banner)(void);
68};
69
70/**
71 * struct x86_init_paging - platform specific paging functions
72 * @pagetable_setup_start: platform specific pre paging_init() call
73 * @pagetable_setup_done: platform specific post paging_init() call
74 */
75struct x86_init_paging {
76 void (*pagetable_setup_start)(pgd_t *base);
77 void (*pagetable_setup_done)(pgd_t *base);
78};
79
80/**
81 * struct x86_init_timers - platform specific timer setup
82 * @setup_perpcu_clockev: set up the per cpu clock event device for the
83 * boot cpu
84 * @tsc_pre_init: platform function called before TSC init
85 * @timer_init: initialize the platform timer (default PIT/HPET)
86 */
87struct x86_init_timers {
88 void (*setup_percpu_clockev)(void);
89 void (*tsc_pre_init)(void);
90 void (*timer_init)(void);
91};
92
93/**
94 * struct x86_init_ops - functions for platform specific setup
95 *
96 */
97struct x86_init_ops {
98 struct x86_init_resources resources;
99 struct x86_init_mpparse mpparse;
100 struct x86_init_irqs irqs;
101 struct x86_init_oem oem;
102 struct x86_init_paging paging;
103 struct x86_init_timers timers;
104};
105
106/**
107 * struct x86_cpuinit_ops - platform specific cpu hotplug setups
108 * @setup_percpu_clockev: set up the per cpu clock event device
109 */
110struct x86_cpuinit_ops {
111 void (*setup_percpu_clockev)(void);
112};
113
114/**
115 * struct x86_platform_ops - platform specific runtime functions
116 * @calibrate_tsc: calibrate TSC
117 * @get_wallclock: get time from HW clock like RTC etc.
118 * @set_wallclock: set time back to HW clock
119 */
120struct x86_platform_ops {
121 unsigned long (*calibrate_tsc)(void);
122 unsigned long (*get_wallclock)(void);
123 int (*set_wallclock)(unsigned long nowtime);
124};
125
126extern struct x86_init_ops x86_init;
127extern struct x86_cpuinit_ops x86_cpuinit;
128extern struct x86_platform_ops x86_platform;
129
130extern void x86_init_noop(void);
131extern void x86_init_uint_noop(unsigned int unused);
132
133#endif
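Since the three structures are exported (x86_init, x86_cpuinit, x86_platform), a platform overrides individual hooks by simple assignment instead of patching generic code. The following sketch is illustrative only: the my_board_* functions are hypothetical, but the fields assigned are exactly the ones declared above.

#include <linux/init.h>
#include <asm/x86_init.h>

/* Hypothetical platform hooks -- not part of this patch. */
static unsigned long my_board_calibrate_tsc(void)
{
	return 1000000;		/* fixed 1 GHz TSC, expressed in kHz */
}

static void my_board_timer_init(void)
{
	/* program the board timer instead of the default PIT/HPET */
}

static void __init my_board_init(void)
{
	x86_platform.calibrate_tsc = my_board_calibrate_tsc;
	x86_init.timers.timer_init = my_board_timer_init;
	x86_init.oem.banner        = x86_init_noop;
}
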
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b24af7b..d8e5d0cdd678 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n
31 31
32obj-y := process_$(BITS).o signal.o entry_$(BITS).o 32obj-y := process_$(BITS).o signal.o entry_$(BITS).o
33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
34obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 34obj-y += time.o ioport.o ldt.o dumpstack.o
35obj-y += setup.o i8259.o irqinit.o 35obj-y += setup.o x86_init.o i8259.o irqinit.o
36obj-$(CONFIG_X86_VISWS) += visws_quirks.o 36obj-$(CONFIG_X86_VISWS) += visws_quirks.o
37obj-$(CONFIG_X86_32) += probe_roms_32.o 37obj-$(CONFIG_X86_32) += probe_roms_32.o
38obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 38obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 52obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 53obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 54obj-y += step.o
55obj-$(CONFIG_INTEL_TXT) += tboot.o
55obj-$(CONFIG_STACKTRACE) += stacktrace.o 56obj-$(CONFIG_STACKTRACE) += stacktrace.o
56obj-y += cpu/ 57obj-y += cpu/
57obj-y += acpi/ 58obj-y += acpi/
59obj-$(CONFIG_SFI) += sfi.o
58obj-y += reboot.o 60obj-y += reboot.o
59obj-$(CONFIG_MCA) += mca_32.o 61obj-$(CONFIG_MCA) += mca_32.o
60obj-$(CONFIG_X86_MSR) += msr.o 62obj-$(CONFIG_X86_MSR) += msr.o
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o
104scx200-y += scx200_32.o 106scx200-y += scx200_32.o
105 107
106obj-$(CONFIG_OLPC) += olpc.o 108obj-$(CONFIG_OLPC) += olpc.o
109obj-$(CONFIG_X86_MRST) += mrst.o
107 110
108microcode-y := microcode_core.o 111microcode-y := microcode_core.o
109microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o 112microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6b8ca3a0285d..67e929b89875 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -833,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
833extern int es7000_plat; 833extern int es7000_plat;
834#endif 834#endif
835 835
836static struct {
837 int gsi_base;
838 int gsi_end;
839} mp_ioapic_routing[MAX_IO_APICS];
840
841int mp_find_ioapic(int gsi)
842{
843 int i = 0;
844
845 /* Find the IOAPIC that manages this GSI. */
846 for (i = 0; i < nr_ioapics; i++) {
847 if ((gsi >= mp_ioapic_routing[i].gsi_base)
848 && (gsi <= mp_ioapic_routing[i].gsi_end))
849 return i;
850 }
851
852 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
853 return -1;
854}
855
856int mp_find_ioapic_pin(int ioapic, int gsi)
857{
858 if (WARN_ON(ioapic == -1))
859 return -1;
860 if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
861 return -1;
862
863 return gsi - mp_ioapic_routing[ioapic].gsi_base;
864}
865
866static u8 __init uniq_ioapic_id(u8 id)
867{
868#ifdef CONFIG_X86_32
869 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
870 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
871 return io_apic_get_unique_id(nr_ioapics, id);
872 else
873 return id;
874#else
875 int i;
876 DECLARE_BITMAP(used, 256);
877 bitmap_zero(used, 256);
878 for (i = 0; i < nr_ioapics; i++) {
879 struct mpc_ioapic *ia = &mp_ioapics[i];
880 __set_bit(ia->apicid, used);
881 }
882 if (!test_bit(id, used))
883 return id;
884 return find_first_zero_bit(used, 256);
885#endif
886}
887
888static int bad_ioapic(unsigned long address)
889{
890 if (nr_ioapics >= MAX_IO_APICS) {
891 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
892 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
893 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
894 }
895 if (!address) {
896 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
897 " found in table, skipping!\n");
898 return 1;
899 }
900 return 0;
901}
902
903void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
904{
905 int idx = 0;
906
907 if (bad_ioapic(address))
908 return;
909
910 idx = nr_ioapics;
911
912 mp_ioapics[idx].type = MP_IOAPIC;
913 mp_ioapics[idx].flags = MPC_APIC_USABLE;
914 mp_ioapics[idx].apicaddr = address;
915
916 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
917 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
918 mp_ioapics[idx].apicver = io_apic_get_version(idx);
919
920 /*
921 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
922 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
923 */
924 mp_ioapic_routing[idx].gsi_base = gsi_base;
925 mp_ioapic_routing[idx].gsi_end = gsi_base +
926 io_apic_get_redir_entries(idx);
927
928 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
929 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
930 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
931 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
932
933 nr_ioapics++;
934}
935
936int __init acpi_probe_gsi(void) 836int __init acpi_probe_gsi(void)
937{ 837{
938 int idx; 838 int idx;
@@ -947,7 +847,7 @@ int __init acpi_probe_gsi(void)
947 847
948 max_gsi = 0; 848 max_gsi = 0;
949 for (idx = 0; idx < nr_ioapics; idx++) { 849 for (idx = 0; idx < nr_ioapics; idx++) {
950 gsi = mp_ioapic_routing[idx].gsi_end; 850 gsi = mp_gsi_routing[idx].gsi_end;
951 851
952 if (gsi > max_gsi) 852 if (gsi > max_gsi)
953 max_gsi = gsi; 853 max_gsi = gsi;
@@ -1179,9 +1079,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1179 * If MPS is present, it will handle them, 1079 * If MPS is present, it will handle them,
1180 * otherwise the system will stay in PIC mode 1080 * otherwise the system will stay in PIC mode
1181 */ 1081 */
1182 if (acpi_disabled || acpi_noirq) { 1082 if (acpi_disabled || acpi_noirq)
1183 return -ENODEV; 1083 return -ENODEV;
1184 }
1185 1084
1186 if (!cpu_has_apic) 1085 if (!cpu_has_apic)
1187 return -ENODEV; 1086 return -ENODEV;
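The gsi_base/gsi_end bookkeeping that moves out of this file (note the read side now uses mp_gsi_routing[]) exists purely to answer "which IOAPIC, which pin?" for a given GSI. A worked example with made-up numbers: if IOAPIC 0 covers GSIs 0-23 and IOAPIC 1 covers GSIs 24-55, then GSI 40 falls in the second range, so mp_find_ioapic() returns 1 and mp_find_ioapic_pin() returns 40 - 24 = 16.
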
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f57658702571..de7353c0ce9c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/mutex.h> 3#include <linux/mutex.h>
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/stringify.h>
5#include <linux/kprobes.h> 6#include <linux/kprobes.h>
6#include <linux/mm.h> 7#include <linux/mm.h>
7#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
@@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly);
32#define smp_alt_once 1 33#define smp_alt_once 1
33#endif 34#endif
34 35
35static int debug_alternative; 36static int __initdata_or_module debug_alternative;
36 37
37static int __init debug_alt(char *str) 38static int __init debug_alt(char *str)
38{ 39{
@@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str)
51__setup("noreplace-smp", setup_noreplace_smp); 52__setup("noreplace-smp", setup_noreplace_smp);
52 53
53#ifdef CONFIG_PARAVIRT 54#ifdef CONFIG_PARAVIRT
54static int noreplace_paravirt = 0; 55static int __initdata_or_module noreplace_paravirt = 0;
55 56
56static int __init setup_noreplace_paravirt(char *str) 57static int __init setup_noreplace_paravirt(char *str)
57{ 58{
@@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
64#define DPRINTK(fmt, args...) if (debug_alternative) \ 65#define DPRINTK(fmt, args...) if (debug_alternative) \
65 printk(KERN_DEBUG fmt, args) 66 printk(KERN_DEBUG fmt, args)
66 67
67#ifdef GENERIC_NOP1 68#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
68/* Use inline assembly to define this because the nops are defined 69/* Use inline assembly to define this because the nops are defined
69 as inline assembly strings in the include files and we cannot 70 as inline assembly strings in the include files and we cannot
70 get them easily into strings. */ 71 get them easily into strings. */
71asm("\t.section .rodata, \"a\"\nintelnops: " 72asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
72 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 73 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
73 GENERIC_NOP7 GENERIC_NOP8 74 GENERIC_NOP7 GENERIC_NOP8
74 "\t.previous"); 75 "\t.previous");
75extern const unsigned char intelnops[]; 76extern const unsigned char intelnops[];
76static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { 77static const unsigned char *const __initconst_or_module
78intel_nops[ASM_NOP_MAX+1] = {
77 NULL, 79 NULL,
78 intelnops, 80 intelnops,
79 intelnops + 1, 81 intelnops + 1,
@@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
87#endif 89#endif
88 90
89#ifdef K8_NOP1 91#ifdef K8_NOP1
90asm("\t.section .rodata, \"a\"\nk8nops: " 92asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
91 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 93 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
92 K8_NOP7 K8_NOP8 94 K8_NOP7 K8_NOP8
93 "\t.previous"); 95 "\t.previous");
94extern const unsigned char k8nops[]; 96extern const unsigned char k8nops[];
95static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { 97static const unsigned char *const __initconst_or_module
98k8_nops[ASM_NOP_MAX+1] = {
96 NULL, 99 NULL,
97 k8nops, 100 k8nops,
98 k8nops + 1, 101 k8nops + 1,
@@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
105}; 108};
106#endif 109#endif
107 110
108#ifdef K7_NOP1 111#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
109asm("\t.section .rodata, \"a\"\nk7nops: " 112asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
110 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 113 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
111 K7_NOP7 K7_NOP8 114 K7_NOP7 K7_NOP8
112 "\t.previous"); 115 "\t.previous");
113extern const unsigned char k7nops[]; 116extern const unsigned char k7nops[];
114static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { 117static const unsigned char *const __initconst_or_module
118k7_nops[ASM_NOP_MAX+1] = {
115 NULL, 119 NULL,
116 k7nops, 120 k7nops,
117 k7nops + 1, 121 k7nops + 1,
@@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
125#endif 129#endif
126 130
127#ifdef P6_NOP1 131#ifdef P6_NOP1
128asm("\t.section .rodata, \"a\"\np6nops: " 132asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
129 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 133 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
130 P6_NOP7 P6_NOP8 134 P6_NOP7 P6_NOP8
131 "\t.previous"); 135 "\t.previous");
132extern const unsigned char p6nops[]; 136extern const unsigned char p6nops[];
133static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { 137static const unsigned char *const __initconst_or_module
138p6_nops[ASM_NOP_MAX+1] = {
134 NULL, 139 NULL,
135 p6nops, 140 p6nops,
136 p6nops + 1, 141 p6nops + 1,
@@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
146#ifdef CONFIG_X86_64 151#ifdef CONFIG_X86_64
147 152
148extern char __vsyscall_0; 153extern char __vsyscall_0;
149const unsigned char *const *find_nop_table(void) 154static const unsigned char *const *__init_or_module find_nop_table(void)
150{ 155{
151 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 156 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
152 boot_cpu_has(X86_FEATURE_NOPL)) 157 boot_cpu_has(X86_FEATURE_NOPL))
@@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void)
157 162
158#else /* CONFIG_X86_64 */ 163#else /* CONFIG_X86_64 */
159 164
160const unsigned char *const *find_nop_table(void) 165static const unsigned char *const *__init_or_module find_nop_table(void)
161{ 166{
162 if (boot_cpu_has(X86_FEATURE_K8)) 167 if (boot_cpu_has(X86_FEATURE_K8))
163 return k8_nops; 168 return k8_nops;
@@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void)
172#endif /* CONFIG_X86_64 */ 177#endif /* CONFIG_X86_64 */
173 178
174/* Use this to add nops to a buffer, then text_poke the whole buffer. */ 179/* Use this to add nops to a buffer, then text_poke the whole buffer. */
175void add_nops(void *insns, unsigned int len) 180static void __init_or_module add_nops(void *insns, unsigned int len)
176{ 181{
177 const unsigned char *const *noptable = find_nop_table(); 182 const unsigned char *const *noptable = find_nop_table();
178 183
@@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len)
185 len -= noplen; 190 len -= noplen;
186 } 191 }
187} 192}
188EXPORT_SYMBOL_GPL(add_nops);
189 193
190extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 194extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
191extern u8 *__smp_locks[], *__smp_locks_end[]; 195extern u8 *__smp_locks[], *__smp_locks_end[];
196static void *text_poke_early(void *addr, const void *opcode, size_t len);
192 197
193/* Replace instructions with better alternatives for this CPU type. 198/* Replace instructions with better alternatives for this CPU type.
194 This runs before SMP is initialized to avoid SMP problems with 199 This runs before SMP is initialized to avoid SMP problems with
@@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[];
196 APs have less capabilities than the boot processor are not handled. 201 APs have less capabilities than the boot processor are not handled.
197 Tough. Make sure you disable such features by hand. */ 202 Tough. Make sure you disable such features by hand. */
198 203
199void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 204void __init_or_module apply_alternatives(struct alt_instr *start,
205 struct alt_instr *end)
200{ 206{
201 struct alt_instr *a; 207 struct alt_instr *a;
202 char insnbuf[MAX_PATCH_LEN]; 208 char insnbuf[MAX_PATCH_LEN];
@@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules);
279static DEFINE_MUTEX(smp_alt); 285static DEFINE_MUTEX(smp_alt);
280static int smp_mode = 1; /* protected by smp_alt */ 286static int smp_mode = 1; /* protected by smp_alt */
281 287
282void alternatives_smp_module_add(struct module *mod, char *name, 288void __init_or_module alternatives_smp_module_add(struct module *mod,
283 void *locks, void *locks_end, 289 char *name,
284 void *text, void *text_end) 290 void *locks, void *locks_end,
291 void *text, void *text_end)
285{ 292{
286 struct smp_alt_module *smp; 293 struct smp_alt_module *smp;
287 294
@@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name,
317 mutex_unlock(&smp_alt); 324 mutex_unlock(&smp_alt);
318} 325}
319 326
320void alternatives_smp_module_del(struct module *mod) 327void __init_or_module alternatives_smp_module_del(struct module *mod)
321{ 328{
322 struct smp_alt_module *item; 329 struct smp_alt_module *item;
323 330
@@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp)
386#endif 393#endif
387 394
388#ifdef CONFIG_PARAVIRT 395#ifdef CONFIG_PARAVIRT
389void apply_paravirt(struct paravirt_patch_site *start, 396void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
390 struct paravirt_patch_site *end) 397 struct paravirt_patch_site *end)
391{ 398{
392 struct paravirt_patch_site *p; 399 struct paravirt_patch_site *p;
393 char insnbuf[MAX_PATCH_LEN]; 400 char insnbuf[MAX_PATCH_LEN];
@@ -485,13 +492,14 @@ void __init alternative_instructions(void)
485 * instructions. And on the local CPU you need to be protected again NMI or MCE 492 * instructions. And on the local CPU you need to be protected again NMI or MCE
486 * handlers seeing an inconsistent instruction while you patch. 493 * handlers seeing an inconsistent instruction while you patch.
487 */ 494 */
488void *text_poke_early(void *addr, const void *opcode, size_t len) 495static void *__init_or_module text_poke_early(void *addr, const void *opcode,
496 size_t len)
489{ 497{
490 unsigned long flags; 498 unsigned long flags;
491 local_irq_save(flags); 499 local_irq_save(flags);
492 memcpy(addr, opcode, len); 500 memcpy(addr, opcode, len);
493 local_irq_restore(flags);
494 sync_core(); 501 sync_core();
502 local_irq_restore(flags);
495 /* Could also do a CLFLUSH here to speed up CPU recovery; but 503 /* Could also do a CLFLUSH here to speed up CPU recovery; but
496 that causes hangs on some VIA CPUs. */ 504 that causes hangs on some VIA CPUs. */
497 return addr; 505 return addr;
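The recurring annotations in this file (__init_or_module, __initdata_or_module, __initconst_or_module, __INITRODATA_OR_MODULE) are defined in <linux/init.h>: with CONFIG_MODULES disabled they degrade to plain __init/__initdata/__initconst so the NOP tables and patching helpers are discarded after boot, while module-capable kernels keep them resident for alternatives patching at module load time. A small hedged sketch of the idiom (foo_nops and foo_patch are hypothetical):

#include <linux/init.h>
#include <linux/string.h>

/* Hypothetical patch data: discarded after boot unless CONFIG_MODULES=y. */
static const unsigned char foo_nops[] __initconst_or_module = {
	0x66, 0x90,			/* 2-byte NOP */
};

static void __init_or_module foo_patch(void *dest)
{
	memcpy(dest, foo_nops, sizeof(foo_nops));
}
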
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f5037801..98f230f6a28d 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
41static LIST_HEAD(iommu_pd_list); 41static LIST_HEAD(iommu_pd_list);
42static DEFINE_SPINLOCK(iommu_pd_list_lock); 42static DEFINE_SPINLOCK(iommu_pd_list_lock);
43 43
44#ifdef CONFIG_IOMMU_API 44/*
45 * Domain for untranslated devices - only allocated
46 * if iommu=pt passed on kernel cmd line.
47 */
48static struct protection_domain *pt_domain;
49
45static struct iommu_ops amd_iommu_ops; 50static struct iommu_ops amd_iommu_ops;
46#endif
47 51
48/* 52/*
49 * general struct to manage commands send to an IOMMU 53 * general struct to manage commands send to an IOMMU
@@ -55,16 +59,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 60 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 61static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom, 62static u64 *alloc_pte(struct protection_domain *domain,
59 unsigned long address, u64 63 unsigned long address, int end_lvl,
60 **pte_page, gfp_t gfp); 64 u64 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page, 66 unsigned long start_page,
63 unsigned int pages); 67 unsigned int pages);
64 68static void reset_iommu_command_buffer(struct amd_iommu *iommu);
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER 69static u64 *fetch_pte(struct protection_domain *domain,
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 70 unsigned long address, int map_size);
67#endif 71static void update_domain(struct protection_domain *domain);
68 72
69#ifdef CONFIG_AMD_IOMMU_STATS 73#ifdef CONFIG_AMD_IOMMU_STATS
70 74
@@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
138 * 142 *
139 ****************************************************************************/ 143 ****************************************************************************/
140 144
141static void iommu_print_event(void *__evt) 145static void dump_dte_entry(u16 devid)
146{
147 int i;
148
149 for (i = 0; i < 8; ++i)
150 pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
151 amd_iommu_dev_table[devid].data[i]);
152}
153
154static void dump_command(unsigned long phys_addr)
155{
156 struct iommu_cmd *cmd = phys_to_virt(phys_addr);
157 int i;
158
159 for (i = 0; i < 4; ++i)
160 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
161}
162
163static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
142{ 164{
143 u32 *event = __evt; 165 u32 *event = __evt;
144 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 166 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
@@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt)
147 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 169 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
148 u64 address = (u64)(((u64)event[3]) << 32) | event[2]; 170 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
149 171
150 printk(KERN_ERR "AMD IOMMU: Event logged ["); 172 printk(KERN_ERR "AMD-Vi: Event logged [");
151 173
152 switch (type) { 174 switch (type) {
153 case EVENT_TYPE_ILL_DEV: 175 case EVENT_TYPE_ILL_DEV:
@@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt)
155 "address=0x%016llx flags=0x%04x]\n", 177 "address=0x%016llx flags=0x%04x]\n",
156 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), 178 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
157 address, flags); 179 address, flags);
180 dump_dte_entry(devid);
158 break; 181 break;
159 case EVENT_TYPE_IO_FAULT: 182 case EVENT_TYPE_IO_FAULT:
160 printk("IO_PAGE_FAULT device=%02x:%02x.%x " 183 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
@@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt)
176 break; 199 break;
177 case EVENT_TYPE_ILL_CMD: 200 case EVENT_TYPE_ILL_CMD:
178 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
202 reset_iommu_command_buffer(iommu);
203 dump_command(address);
179 break; 204 break;
180 case EVENT_TYPE_CMD_HARD_ERR: 205 case EVENT_TYPE_CMD_HARD_ERR:
181 printk("COMMAND_HARDWARE_ERROR address=0x%016llx " 206 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
@@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
209 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); 234 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
210 235
211 while (head != tail) { 236 while (head != tail) {
212 iommu_print_event(iommu->evt_buf + head); 237 iommu_print_event(iommu, iommu->evt_buf + head);
213 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; 238 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
214 } 239 }
215 240
@@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
296 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
297 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
298 323
299 if (unlikely(i == EXIT_LOOP_COUNT)) 324 if (unlikely(i == EXIT_LOOP_COUNT)) {
300 panic("AMD IOMMU: Completion wait loop failed\n"); 325 spin_unlock(&iommu->lock);
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
301} 329}
302 330
303/* 331/*
@@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
445} 473}
446 474
447/* 475/*
476 * This function flushes one domain on one IOMMU
477 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
479{
480 struct iommu_cmd cmd;
481 unsigned long flags;
482
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
484 domid, 1, 1);
485
486 spin_lock_irqsave(&iommu->lock, flags);
487 __iommu_queue_command(iommu, &cmd);
488 __iommu_completion_wait(iommu);
489 __iommu_wait_for_completion(iommu);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491}
492
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
494{
495 int i;
496
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
499 continue;
500 flush_domain_on_iommu(iommu, i);
501 }
502
503}
504
505/*
448 * This function is used to flush the IO/TLB for a given protection domain 506 * This function is used to flush the IO/TLB for a given protection domain
449 * on every IOMMU in the system 507 * on every IOMMU in the system
450 */ 508 */
451static void iommu_flush_domain(u16 domid) 509static void iommu_flush_domain(u16 domid)
452{ 510{
453 unsigned long flags;
454 struct amd_iommu *iommu; 511 struct amd_iommu *iommu;
455 struct iommu_cmd cmd;
456 512
457 INC_STATS_COUNTER(domain_flush_all); 513 INC_STATS_COUNTER(domain_flush_all);
458 514
459 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 515 for_each_iommu(iommu)
460 domid, 1, 1); 516 flush_domain_on_iommu(iommu, domid);
461
462 for_each_iommu(iommu) {
463 spin_lock_irqsave(&iommu->lock, flags);
464 __iommu_queue_command(iommu, &cmd);
465 __iommu_completion_wait(iommu);
466 __iommu_wait_for_completion(iommu);
467 spin_unlock_irqrestore(&iommu->lock, flags);
468 }
469} 517}
470 518
471void amd_iommu_flush_all_domains(void) 519void amd_iommu_flush_all_domains(void)
472{ 520{
521 struct amd_iommu *iommu;
522
523 for_each_iommu(iommu)
524 flush_all_domains_on_iommu(iommu);
525}
526
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
528{
473 int i; 529 int i;
474 530
475 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 531 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
476 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 532 if (iommu != amd_iommu_rlookup_table[i])
477 continue; 533 continue;
478 iommu_flush_domain(i); 534
535 iommu_queue_inv_dev_entry(iommu, i);
536 iommu_completion_wait(iommu);
479 } 537 }
480} 538}
481 539
482void amd_iommu_flush_all_devices(void) 540static void flush_devices_by_domain(struct protection_domain *domain)
483{ 541{
484 struct amd_iommu *iommu; 542 struct amd_iommu *iommu;
485 int i; 543 int i;
486 544
487 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 545 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
488 if (amd_iommu_pd_table[i] == NULL) 546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
547 (amd_iommu_pd_table[i] != domain))
489 continue; 548 continue;
490 549
491 iommu = amd_iommu_rlookup_table[i]; 550 iommu = amd_iommu_rlookup_table[i];
@@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void)
497 } 556 }
498} 557}
499 558
559static void reset_iommu_command_buffer(struct amd_iommu *iommu)
560{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
562
563 if (iommu->reset_in_progress)
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
565
566 iommu->reset_in_progress = true;
567
568 amd_iommu_reset_cmd_buffer(iommu);
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571
572 iommu->reset_in_progress = false;
573}
574
575void amd_iommu_flush_all_devices(void)
576{
577 flush_devices_by_domain(NULL);
578}
579
500/**************************************************************************** 580/****************************************************************************
501 * 581 *
502 * The functions below are used the create the page table mappings for 582 * The functions below are used the create the page table mappings for
@@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void)
514static int iommu_map_page(struct protection_domain *dom, 594static int iommu_map_page(struct protection_domain *dom,
515 unsigned long bus_addr, 595 unsigned long bus_addr,
516 unsigned long phys_addr, 596 unsigned long phys_addr,
517 int prot) 597 int prot,
598 int map_size)
518{ 599{
519 u64 __pte, *pte; 600 u64 __pte, *pte;
520 601
521 bus_addr = PAGE_ALIGN(bus_addr); 602 bus_addr = PAGE_ALIGN(bus_addr);
522 phys_addr = PAGE_ALIGN(phys_addr); 603 phys_addr = PAGE_ALIGN(phys_addr);
523 604
524 /* only support 512GB address spaces for now */ 605 BUG_ON(!PM_ALIGNED(map_size, bus_addr));
525 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 606 BUG_ON(!PM_ALIGNED(map_size, phys_addr));
607
608 if (!(prot & IOMMU_PROT_MASK))
526 return -EINVAL; 609 return -EINVAL;
527 610
528 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); 611 pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
529 612
530 if (IOMMU_PTE_PRESENT(*pte)) 613 if (IOMMU_PTE_PRESENT(*pte))
531 return -EBUSY; 614 return -EBUSY;
@@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom,
538 621
539 *pte = __pte; 622 *pte = __pte;
540 623
624 update_domain(dom);
625
541 return 0; 626 return 0;
542} 627}
543 628
544static void iommu_unmap_page(struct protection_domain *dom, 629static void iommu_unmap_page(struct protection_domain *dom,
545 unsigned long bus_addr) 630 unsigned long bus_addr, int map_size)
546{ 631{
547 u64 *pte; 632 u64 *pte = fetch_pte(dom, bus_addr, map_size);
548
549 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
550
551 if (!IOMMU_PTE_PRESENT(*pte))
552 return;
553
554 pte = IOMMU_PTE_PAGE(*pte);
555 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
556 633
557 if (!IOMMU_PTE_PRESENT(*pte)) 634 if (pte)
558 return; 635 *pte = 0;
559
560 pte = IOMMU_PTE_PAGE(*pte);
561 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
562
563 *pte = 0;
564} 636}
565 637
566/* 638/*
@@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
615 687
616 for (addr = e->address_start; addr < e->address_end; 688 for (addr = e->address_start; addr < e->address_end;
617 addr += PAGE_SIZE) { 689 addr += PAGE_SIZE) {
618 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); 690 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
691 PM_MAP_4k);
619 if (ret) 692 if (ret)
620 return ret; 693 return ret;
621 /* 694 /*
@@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
670 * This function checks if there is a PTE for a given dma address. If 743 * This function checks if there is a PTE for a given dma address. If
671 * there is one, it returns the pointer to it. 744 * there is one, it returns the pointer to it.
672 */ 745 */
673static u64* fetch_pte(struct protection_domain *domain, 746static u64 *fetch_pte(struct protection_domain *domain,
674 unsigned long address) 747 unsigned long address, int map_size)
675{ 748{
749 int level;
676 u64 *pte; 750 u64 *pte;
677 751
678 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; 752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
679 754
680 if (!IOMMU_PTE_PRESENT(*pte)) 755 while (level > map_size) {
681 return NULL; 756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
682 758
683 pte = IOMMU_PTE_PAGE(*pte); 759 level -= 1;
684 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
685 760
686 if (!IOMMU_PTE_PRESENT(*pte)) 761 pte = IOMMU_PTE_PAGE(*pte);
687 return NULL; 762 pte = &pte[PM_LEVEL_INDEX(level, address)];
688 763
689 pte = IOMMU_PTE_PAGE(*pte); 764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
690 pte = &pte[IOMMU_PTE_L0_INDEX(address)]; 765 pte = NULL;
766 break;
767 }
768 }
691 769
692 return pte; 770 return pte;
693} 771}
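The reworked fetch_pte() walks from the top level (domain->mode - 1) down to map_size, consuming 9 address bits per level above the 12-bit page offset. A hedged illustration of that indexing, assuming PM_LEVEL_INDEX(lvl, addr) expands to ((addr >> (12 + 9*lvl)) & 0x1ff) as elsewhere in this series:

/* Illustration only -- mirrors the indexing assumed by fetch_pte(). */
static inline unsigned long example_pm_level_index(int level,
						   unsigned long address)
{
	return (address >> (12 + 9 * level)) & 0x1ffUL;
}

/*
 * For address 0x12345678 in a 3-level table (domain->mode == 3):
 *	level 2 index: (0x12345678 >> 30) & 0x1ff = 0x000
 *	level 1 index: (0x12345678 >> 21) & 0x1ff = 0x091
 *	level 0 index: (0x12345678 >> 12) & 0x1ff = 0x145
 */
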
@@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
727 u64 *pte, *pte_page; 805 u64 *pte, *pte_page;
728 806
729 for (i = 0; i < num_ptes; ++i) { 807 for (i = 0; i < num_ptes; ++i) {
730 pte = alloc_pte(&dma_dom->domain, address, 808 pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
731 &pte_page, gfp); 809 &pte_page, gfp);
732 if (!pte) 810 if (!pte)
733 goto out_free; 811 goto out_free;
@@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu,
760 for (i = dma_dom->aperture[index]->offset; 838 for (i = dma_dom->aperture[index]->offset;
761 i < dma_dom->aperture_size; 839 i < dma_dom->aperture_size;
762 i += PAGE_SIZE) { 840 i += PAGE_SIZE) {
763 u64 *pte = fetch_pte(&dma_dom->domain, i); 841 u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
764 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 842 if (!pte || !IOMMU_PTE_PRESENT(*pte))
765 continue; 843 continue;
766 844
767 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); 845 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
768 } 846 }
769 847
848 update_domain(&dma_dom->domain);
849
770 return 0; 850 return 0;
771 851
772out_free: 852out_free:
853 update_domain(&dma_dom->domain);
854
773 free_page((unsigned long)dma_dom->aperture[index]->bitmap); 855 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
774 856
775 kfree(dma_dom->aperture[index]); 857 kfree(dma_dom->aperture[index]);
@@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1009 dma_dom->domain.id = domain_id_alloc(); 1091 dma_dom->domain.id = domain_id_alloc();
1010 if (dma_dom->domain.id == 0) 1092 if (dma_dom->domain.id == 0)
1011 goto free_dma_dom; 1093 goto free_dma_dom;
1012 dma_dom->domain.mode = PAGE_MODE_3_LEVEL; 1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1013 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1014 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1096 dma_dom->domain.flags = PD_DMA_OPS_MASK;
1015 dma_dom->domain.priv = dma_dom; 1097 dma_dom->domain.priv = dma_dom;
@@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid)
1063 return dom; 1145 return dom;
1064} 1146}
1065 1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{
1150 u64 pte_root = virt_to_phys(domain->pt_root);
1151
1152 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1153 << DEV_ENTRY_MODE_SHIFT;
1154 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1155
1156 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1159
1160 amd_iommu_pd_table[devid] = domain;
1161}
1162
1163/*
1164 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware
1166 */
1167static void __attach_device(struct amd_iommu *iommu,
1168 struct protection_domain *domain,
1169 u16 devid)
1170{
1171 /* lock domain */
1172 spin_lock(&domain->lock);
1173
1174 /* update DTE entry */
1175 set_dte_entry(devid, domain);
1176
1177 domain->dev_cnt += 1;
1178
1179 /* ready */
1180 spin_unlock(&domain->lock);
1181}
1182
1066/* 1183/*
1067 * If a device is not yet associated with a domain, this function does 1184 * If a device is not yet associated with a domain, this function does
1068 * assigns it visible for the hardware 1185 * assigns it visible for the hardware
@@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu,
1072 u16 devid) 1189 u16 devid)
1073{ 1190{
1074 unsigned long flags; 1191 unsigned long flags;
1075 u64 pte_root = virt_to_phys(domain->pt_root);
1076
1077 domain->dev_cnt += 1;
1078
1079 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1080 << DEV_ENTRY_MODE_SHIFT;
1081 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1082 1192
1083 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1084 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1194 __attach_device(iommu, domain, devid);
1085 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1086 amd_iommu_dev_table[devid].data[2] = domain->id;
1087
1088 amd_iommu_pd_table[devid] = domain;
1089 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1090 1196
1091 /* 1197 /*
1092 * We might boot into a crash-kernel here. The crashed kernel 1198 * We might boot into a crash-kernel here. The crashed kernel
1093 * left the caches in the IOMMU dirty. So we have to flush 1199 * left the caches in the IOMMU dirty. So we have to flush
1094 * here to evict all dirty stuff. 1200 * here to evict all dirty stuff.
1095 */ 1201 */
1096 iommu_queue_inv_dev_entry(iommu, devid); 1202 iommu_queue_inv_dev_entry(iommu, devid);
1097 iommu_flush_tlb_pde(iommu, domain->id); 1203 iommu_flush_tlb_pde(iommu, domain->id);
1098} 1204}
@@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
1119 1225
1120 /* ready */ 1226 /* ready */
1121 spin_unlock(&domain->lock); 1227 spin_unlock(&domain->lock);
1228
1229 /*
1230 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain
1232 */
1233 if (iommu_pass_through) {
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
1235 __attach_device(iommu, pt_domain, devid);
1236 }
1122} 1237}
1123 1238
1124/* 1239/*
@@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb,
1164 case BUS_NOTIFY_UNBOUND_DRIVER: 1279 case BUS_NOTIFY_UNBOUND_DRIVER:
1165 if (!domain) 1280 if (!domain)
1166 goto out; 1281 goto out;
1282 if (iommu_pass_through)
1283 break;
1167 detach_device(domain, devid); 1284 detach_device(domain, devid);
1168 break; 1285 break;
1169 case BUS_NOTIFY_ADD_DEVICE: 1286 case BUS_NOTIFY_ADD_DEVICE:
@@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev,
1292 return 1; 1409 return 1;
1293} 1410}
1294 1411
1412static void update_device_table(struct protection_domain *domain)
1413{
1414 unsigned long flags;
1415 int i;
1416
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
1418 if (amd_iommu_pd_table[i] != domain)
1419 continue;
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 }
1424}
1425
1426static void update_domain(struct protection_domain *domain)
1427{
1428 if (!domain->updated)
1429 return;
1430
1431 update_device_table(domain);
1432 flush_devices_by_domain(domain);
1433 iommu_flush_domain(domain->id);
1434
1435 domain->updated = false;
1436}
1437
1295/* 1438/*
1296 * If the pte_page is not yet allocated this function is called 1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1297 */ 1442 */
1298static u64* alloc_pte(struct protection_domain *dom, 1443static bool increase_address_space(struct protection_domain *domain,
1299 unsigned long address, u64 **pte_page, gfp_t gfp) 1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
1449 /* address space already 64 bit large */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1300{ 1470{
1301 u64 *pte, *page; 1471 u64 *pte, *page;
1472 int level;
1302 1473
1303 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; 1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1304 1476
1305 if (!IOMMU_PTE_PRESENT(*pte)) { 1477 level = domain->mode - 1;
1306 page = (u64 *)get_zeroed_page(gfp); 1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1307 if (!page)
1308 return NULL;
1309 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1310 }
1311 1479
1312 pte = IOMMU_PTE_PAGE(*pte); 1480 while (level > end_lvl) {
1313 pte = &pte[IOMMU_PTE_L1_INDEX(address)]; 1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1314 1487
1315 if (!IOMMU_PTE_PRESENT(*pte)) { 1488 level -= 1;
1316 page = (u64 *)get_zeroed_page(gfp);
1317 if (!page)
1318 return NULL;
1319 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1320 }
1321 1489
1322 pte = IOMMU_PTE_PAGE(*pte); 1490 pte = IOMMU_PTE_PAGE(*pte);
1323 1491
1324 if (pte_page) 1492 if (pte_page && level == end_lvl)
1325 *pte_page = pte; 1493 *pte_page = pte;
1326 1494
1327 pte = &pte[IOMMU_PTE_L0_INDEX(address)]; 1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1328 1497
1329 return pte; 1498 return pte;
1330} 1499}
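The "9 bits per level" comment above translates into concrete sizes: a 3-level table covers 12 + 3*9 = 39 address bits (512 GB), and each call to increase_address_space() adds one level, i.e. another factor of 512, up to the 6-level cap. A small sketch of that arithmetic (illustrative only, not the kernel's PM_LEVEL_SIZE macro itself):

/* Illustration only: bytes addressable by an IO page table of the given
 * mode, matching the "9 bits per level" growth described above. */
static inline unsigned long long example_aperture_bytes(int mode)
{
	unsigned int bits = 12 + 9 * mode;	/* mode 3 -> 39 bits */

	return bits >= 64 ? ~0ULL : 1ULL << bits;
}
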
@@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1344 1513
1345 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 1514 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1346 if (!pte) { 1515 if (!pte) {
1347 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); 1516 pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
1517 GFP_ATOMIC);
1348 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; 1518 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1349 } else 1519 } else
1350 pte += IOMMU_PTE_L0_INDEX(address); 1520 pte += PM_LEVEL_INDEX(0, address);
1521
1522 update_domain(&dom->domain);
1351 1523
1352 return pte; 1524 return pte;
1353} 1525}
@@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1409 if (!pte) 1581 if (!pte)
1410 return; 1582 return;
1411 1583
1412 pte += IOMMU_PTE_L0_INDEX(address); 1584 pte += PM_LEVEL_INDEX(0, address);
1413 1585
1414 WARN_ON(!*pte); 1586 WARN_ON(!*pte);
1415 1587
@@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain)
1988 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1989} 2161}
1990 2162
1991static int amd_iommu_domain_init(struct iommu_domain *dom) 2163static void protection_domain_free(struct protection_domain *domain)
2164{
2165 if (!domain)
2166 return;
2167
2168 if (domain->id)
2169 domain_id_free(domain->id);
2170
2171 kfree(domain);
2172}
2173
2174static struct protection_domain *protection_domain_alloc(void)
1992{ 2175{
1993 struct protection_domain *domain; 2176 struct protection_domain *domain;
1994 2177
1995 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 2178 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1996 if (!domain) 2179 if (!domain)
1997 return -ENOMEM; 2180 return NULL;
1998 2181
1999 spin_lock_init(&domain->lock); 2182 spin_lock_init(&domain->lock);
2000 domain->mode = PAGE_MODE_3_LEVEL;
2001 domain->id = domain_id_alloc(); 2183 domain->id = domain_id_alloc();
2002 if (!domain->id) 2184 if (!domain->id)
2185 goto out_err;
2186
2187 return domain;
2188
2189out_err:
2190 kfree(domain);
2191
2192 return NULL;
2193}
2194
2195static int amd_iommu_domain_init(struct iommu_domain *dom)
2196{
2197 struct protection_domain *domain;
2198
2199 domain = protection_domain_alloc();
2200 if (!domain)
2003 goto out_free; 2201 goto out_free;
2202
2203 domain->mode = PAGE_MODE_3_LEVEL;
2004 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); 2204 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2005 if (!domain->pt_root) 2205 if (!domain->pt_root)
2006 goto out_free; 2206 goto out_free;
@@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
2010 return 0; 2210 return 0;
2011 2211
2012out_free: 2212out_free:
2013 kfree(domain); 2213 protection_domain_free(domain);
2014 2214
2015 return -ENOMEM; 2215 return -ENOMEM;
2016} 2216}
@@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2115 paddr &= PAGE_MASK; 2315 paddr &= PAGE_MASK;
2116 2316
2117 for (i = 0; i < npages; ++i) { 2317 for (i = 0; i < npages; ++i) {
2118 ret = iommu_map_page(domain, iova, paddr, prot); 2318 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2119 if (ret) 2319 if (ret)
2120 return ret; 2320 return ret;
2121 2321
@@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2136 iova &= PAGE_MASK; 2336 iova &= PAGE_MASK;
2137 2337
2138 for (i = 0; i < npages; ++i) { 2338 for (i = 0; i < npages; ++i) {
2139 iommu_unmap_page(domain, iova); 2339 iommu_unmap_page(domain, iova, PM_MAP_4k);
2140 iova += PAGE_SIZE; 2340 iova += PAGE_SIZE;
2141 } 2341 }
2142 2342
@@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2151 phys_addr_t paddr; 2351 phys_addr_t paddr;
2152 u64 *pte; 2352 u64 *pte;
2153 2353
2154 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; 2354 pte = fetch_pte(domain, iova, PM_MAP_4k);
2155
2156 if (!IOMMU_PTE_PRESENT(*pte))
2157 return 0;
2158
2159 pte = IOMMU_PTE_PAGE(*pte);
2160 pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
2161
2162 if (!IOMMU_PTE_PRESENT(*pte))
2163 return 0;
2164
2165 pte = IOMMU_PTE_PAGE(*pte);
2166 pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
2167 2355
2168 if (!IOMMU_PTE_PRESENT(*pte)) 2356 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2169 return 0; 2357 return 0;
2170 2358
2171 paddr = *pte & IOMMU_PAGE_MASK; 2359 paddr = *pte & IOMMU_PAGE_MASK;
@@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = {
2191 .domain_has_cap = amd_iommu_domain_has_cap, 2379 .domain_has_cap = amd_iommu_domain_has_cap,
2192}; 2380};
2193 2381
2382/*****************************************************************************
2383 *
2384 * The next functions do a basic initialization of IOMMU for pass through
2385 * mode
2386 *
2387 * In passthrough mode the IOMMU is initialized and enabled but not used for
2388 * DMA-API translation.
2389 *
2390 *****************************************************************************/
2391
2392int __init amd_iommu_init_passthrough(void)
2393{
2394 struct pci_dev *dev = NULL;
2395 u16 devid, devid2;
2396
2397 /* allocate passthroug domain */
2398 pt_domain = protection_domain_alloc();
2399 if (!pt_domain)
2400 return -ENOMEM;
2401
2402 pt_domain->mode |= PAGE_MODE_NONE;
2403
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406
2407 devid = calc_devid(dev->bus->number, dev->devfn);
2408 if (devid > amd_iommu_last_bdf)
2409 continue;
2410
2411 devid2 = amd_iommu_alias_table[devid];
2412
2413 iommu = amd_iommu_rlookup_table[devid2];
2414 if (!iommu)
2415 continue;
2416
2417 __attach_device(iommu, pt_domain, devid);
2418 __attach_device(iommu, pt_domain, devid2);
2419 }
2420
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
2422
2423 return 0;
2424}
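amd_iommu_init_passthrough() keys everything off the 16-bit device ID returned by calc_devid(), which is defined outside this hunk. Assuming the usual PCI bus/devfn packing used by this driver, the helper amounts to something like:

/* bus number in the high byte, devfn (slot/function) in the low byte */
static inline u16 calc_devid(u8 bus, u8 devfn)
{
	return ((u16)bus << 8) | devfn;
}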
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252e..b4b61d462dcc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
252/* Function to enable the hardware */ 252/* Function to enable the hardware */
253static void iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
254{ 254{
255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
256 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
257 257
258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
@@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
435} 435}
436 436
437/* 437/*
438 * This function resets the command buffer if the IOMMU stopped fetching
439 * commands from it.
440 */
441void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
442{
443 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
444
445 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
446 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
447
448 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
449}
450
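amd_iommu_reset_cmd_buffer() disables the command buffer, rewinds both ring pointers and re-enables it, so a wedged IOMMU can be re-armed at runtime. One plausible caller pattern, purely illustrative (the stall check below is hypothetical and not part of this patch):

	/* hypothetical recovery path: re-arm a command buffer the IOMMU has
	 * stopped fetching from (the stall check is illustrative only) */
	if (command_buffer_stalled(iommu))
		amd_iommu_reset_cmd_buffer(iommu);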
451/*
438 * This function writes the command buffer address to the hardware and 452 * This function writes the command buffer address to the hardware and
439 * enables it. 453 * enables it.
440 */ 454 */
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
450 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 464 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
451 &entry, sizeof(entry)); 465 &entry, sizeof(entry));
452 466
453 /* set head and tail to zero manually */ 467 amd_iommu_reset_cmd_buffer(iommu);
454 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
455 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
456
457 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
458} 468}
459 469
460static void __init free_command_buffer(struct amd_iommu *iommu) 470static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
858 switch (*p) { 868 switch (*p) {
859 case ACPI_IVHD_TYPE: 869 case ACPI_IVHD_TYPE:
860 870
861 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " 871 DUMP_printk("device: %02x:%02x.%01x cap: %04x "
862 "seg: %d flags: %01x info %04x\n", 872 "seg: %d flags: %01x info %04x\n",
863 PCI_BUS(h->devid), PCI_SLOT(h->devid), 873 PCI_BUS(h->devid), PCI_SLOT(h->devid),
864 PCI_FUNC(h->devid), h->cap_ptr, 874 PCI_FUNC(h->devid), h->cap_ptr,
@@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
902 912
903 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 913 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
904 IRQF_SAMPLE_RANDOM, 914 IRQF_SAMPLE_RANDOM,
905 "AMD IOMMU", 915 "AMD-Vi",
906 NULL); 916 NULL);
907 917
908 if (r) { 918 if (r) {
@@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void)
1150 1160
1151 1161
1152 if (no_iommu) { 1162 if (no_iommu) {
1153 printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); 1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1154 return 0; 1164 return 0;
1155 } 1165 }
1156 1166
@@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void)
1242 if (ret) 1252 if (ret)
1243 goto free; 1253 goto free;
1244 1254
1245 ret = amd_iommu_init_dma_ops(); 1255 if (iommu_pass_through)
1256 ret = amd_iommu_init_passthrough();
1257 else
1258 ret = amd_iommu_init_dma_ops();
1246 if (ret) 1259 if (ret)
1247 goto free; 1260 goto free;
1248 1261
1249 enable_iommus(); 1262 enable_iommus();
1250 1263
1251 printk(KERN_INFO "AMD IOMMU: device isolation "); 1264 if (iommu_pass_through)
1265 goto out;
1266
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1252 if (amd_iommu_isolate) 1268 if (amd_iommu_isolate)
1253 printk("enabled\n"); 1269 printk("enabled\n");
1254 else 1270 else
1255 printk("disabled\n"); 1271 printk("disabled\n");
1256 1272
1257 if (amd_iommu_unmap_flush) 1273 if (amd_iommu_unmap_flush)
1258 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); 1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1259 else 1275 else
1260 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); 1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1261 1277
1262out: 1278out:
1263 return ret; 1279 return ret;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 676debfc1702..128111d8ffe0 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,6 +20,7 @@
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/suspend.h> 22#include <linux/suspend.h>
23#include <linux/kmemleak.h>
23#include <asm/e820.h> 24#include <asm/e820.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include <asm/iommu.h> 26#include <asm/iommu.h>
@@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void)
94 * code for safe 95 * code for safe
95 */ 96 */
96 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); 97 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping.
101 */
102 kmemleak_ignore(p);
97 if (!p || __pa(p)+aper_size > 0xffffffff) { 103 if (!p || __pa(p)+aper_size > 0xffffffff) {
98 printk(KERN_ERR 104 printk(KERN_ERR
99 "Cannot allocate aperture memory hole (%p,%uK)\n", 105 "Cannot allocate aperture memory hole (%p,%uK)\n",
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c2830ec66..894aa97f0717 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,7 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
19#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
20#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -35,7 +35,8 @@
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37 37
38#include <asm/perf_counter.h> 38#include <asm/perf_event.h>
39#include <asm/x86_init.h>
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/atomic.h> 41#include <asm/atomic.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
@@ -49,6 +50,7 @@
49#include <asm/mtrr.h> 50#include <asm/mtrr.h>
50#include <asm/smp.h> 51#include <asm/smp.h>
51#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h>
52 54
53unsigned int num_processors; 55unsigned int num_processors;
54 56
@@ -60,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U;
60/* 62/*
61 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
62 * 64 *
63 * This determines the messaging protocol we can use: if all APIC IDs 65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
64 * are in the 0 ... 7 range, then we can use logical addressing which 66 * are in the 0 ... 7 range, then we can use logical addressing which
65 * has some performance advantages (better broadcasting). 67 * has some performance advantages (better broadcasting).
66 * 68 *
@@ -977,7 +979,7 @@ void lapic_shutdown(void)
977{ 979{
978 unsigned long flags; 980 unsigned long flags;
979 981
980 if (!cpu_has_apic) 982 if (!cpu_has_apic && !apic_from_smp_config())
981 return; 983 return;
982 984
983 local_irq_save(flags); 985 local_irq_save(flags);
@@ -1187,7 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1187 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1188 } 1190 }
1189#endif 1191#endif
1190 perf_counters_lapic_init(); 1192 perf_events_lapic_init();
1191 1193
1192 preempt_disable(); 1194 preempt_disable();
1193 1195
@@ -1195,8 +1197,7 @@ void __cpuinit setup_local_APIC(void)
1195 * Double-check whether this APIC is really registered. 1197 * Double-check whether this APIC is really registered.
1196 * This is meaningless in clustered apic mode, so we skip it. 1198 * This is meaningless in clustered apic mode, so we skip it.
1197 */ 1199 */
1198 if (!apic->apic_id_registered()) 1200 BUG_ON(!apic->apic_id_registered());
1199 BUG();
1200 1201
1201 /* 1202 /*
1202 * Intel recommends to set DFR, LDR and TPR before enabling 1203 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -1361,52 +1362,80 @@ void enable_x2apic(void)
1361} 1362}
1362#endif /* CONFIG_X86_X2APIC */ 1363#endif /* CONFIG_X86_X2APIC */
1363 1364
1364void __init enable_IR_x2apic(void) 1365int __init enable_IR(void)
1365{ 1366{
1366#ifdef CONFIG_INTR_REMAP 1367#ifdef CONFIG_INTR_REMAP
1367 int ret;
1368 unsigned long flags;
1369 struct IO_APIC_route_entry **ioapic_entries = NULL;
1370
1371 ret = dmar_table_init();
1372 if (ret) {
1373 pr_debug("dmar_table_init() failed with %d:\n", ret);
1374 goto ir_failed;
1375 }
1376
1377 if (!intr_remapping_supported()) { 1368 if (!intr_remapping_supported()) {
1378 pr_debug("intr-remapping not supported\n"); 1369 pr_debug("intr-remapping not supported\n");
1379 goto ir_failed; 1370 return 0;
1380 } 1371 }
1381 1372
1382
1383 if (!x2apic_preenabled && skip_ioapic_setup) { 1373 if (!x2apic_preenabled && skip_ioapic_setup) {
1384 pr_info("Skipped enabling intr-remap because of skipping " 1374 pr_info("Skipped enabling intr-remap because of skipping "
1385 "io-apic setup\n"); 1375 "io-apic setup\n");
1386 return; 1376 return 0;
1387 } 1377 }
1388 1378
1379 if (enable_intr_remapping(x2apic_supported()))
1380 return 0;
1381
1382 pr_info("Enabled Interrupt-remapping\n");
1383
1384 return 1;
1385
1386#endif
1387 return 0;
1388}
1389
1390void __init enable_IR_x2apic(void)
1391{
1392 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0;
1396
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret)
1400 pr_debug("dmar_table_init() failed with %d:\n",
1401 dmar_table_init_ret);
1402#endif
1403
1389 ioapic_entries = alloc_ioapic_entries(); 1404 ioapic_entries = alloc_ioapic_entries();
1390 if (!ioapic_entries) { 1405 if (!ioapic_entries) {
1391 pr_info("Allocate ioapic_entries failed: %d\n", ret); 1406 pr_err("Allocate ioapic_entries failed\n");
1392 goto end; 1407 goto out;
1393 } 1408 }
1394 1409
1395 ret = save_IO_APIC_setup(ioapic_entries); 1410 ret = save_IO_APIC_setup(ioapic_entries);
1396 if (ret) { 1411 if (ret) {
1397 pr_info("Saving IO-APIC state failed: %d\n", ret); 1412 pr_info("Saving IO-APIC state failed: %d\n", ret);
1398 goto end; 1413 goto out;
1399 } 1414 }
1400 1415
1401 local_irq_save(flags); 1416 local_irq_save(flags);
1402 mask_IO_APIC_setup(ioapic_entries);
1403 mask_8259A(); 1417 mask_8259A();
1418 mask_IO_APIC_setup(ioapic_entries);
1404 1419
1405 ret = enable_intr_remapping(x2apic_supported()); 1420 if (dmar_table_init_ret)
1406 if (ret) 1421 ret = 0;
1407 goto end_restore; 1422 else
1423 ret = enable_IR();
1408 1424
1409 pr_info("Enabled Interrupt-remapping\n"); 1425 if (!ret) {
1426 /* IR is required if there is APIC ID > 255 even when running
1427 * under KVM
1428 */
1429 if (max_physical_apicid > 255 || !kvm_para_available())
1430 goto nox2apic;
1431 /*
1432 * without IR all CPUs can be addressed by IOAPIC/MSI
1433 * only in physical mode
1434 */
1435 x2apic_force_phys();
1436 }
1437
1438 x2apic_enabled = 1;
1410 1439
1411 if (x2apic_supported() && !x2apic_mode) { 1440 if (x2apic_supported() && !x2apic_mode) {
1412 x2apic_mode = 1; 1441 x2apic_mode = 1;
@@ -1414,41 +1443,25 @@ void __init enable_IR_x2apic(void)
1414 pr_info("Enabled x2apic\n"); 1443 pr_info("Enabled x2apic\n");
1415 } 1444 }
1416 1445
1417end_restore: 1446nox2apic:
1418 if (ret) 1447 if (!ret) /* IR enabling failed */
1419 /*
1420 * IR enabling failed
1421 */
1422 restore_IO_APIC_setup(ioapic_entries); 1448 restore_IO_APIC_setup(ioapic_entries);
1423
1424 unmask_8259A(); 1449 unmask_8259A();
1425 local_irq_restore(flags); 1450 local_irq_restore(flags);
1426 1451
1427end: 1452out:
1428 if (ioapic_entries) 1453 if (ioapic_entries)
1429 free_ioapic_entries(ioapic_entries); 1454 free_ioapic_entries(ioapic_entries);
1430 1455
1431 if (!ret) 1456 if (x2apic_enabled)
1432 return; 1457 return;
1433 1458
1434ir_failed:
1435 if (x2apic_preenabled) 1459 if (x2apic_preenabled)
1436 panic("x2apic enabled by bios. But IR enabling failed"); 1460 panic("x2apic: enabled by BIOS but kernel init failed.");
1437 else if (cpu_has_x2apic) 1461 else if (cpu_has_x2apic)
1438 pr_info("Not enabling x2apic,Intr-remapping\n"); 1462 pr_info("Not enabling x2apic, Intr-remapping init failed.\n");
1439#else
1440 if (!cpu_has_x2apic)
1441 return;
1442
1443 if (x2apic_preenabled)
1444 panic("x2apic enabled prior OS handover,"
1445 " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
1446#endif
1447
1448 return;
1449} 1463}
1450 1464
1451
1452#ifdef CONFIG_X86_64 1465#ifdef CONFIG_X86_64
1453/* 1466/*
1454 * Detect and enable local APICs on non-SMP boards. 1467 * Detect and enable local APICs on non-SMP boards.
@@ -1549,8 +1562,6 @@ no_apic:
1549#ifdef CONFIG_X86_64 1562#ifdef CONFIG_X86_64
1550void __init early_init_lapic_mapping(void) 1563void __init early_init_lapic_mapping(void)
1551{ 1564{
1552 unsigned long phys_addr;
1553
1554 /* 1565 /*
1555 * If no local APIC can be found then go out 1566 * If no local APIC can be found then go out
1556 * : it means there is no mpatable and MADT 1567 * : it means there is no mpatable and MADT
@@ -1558,11 +1569,9 @@ void __init early_init_lapic_mapping(void)
1558 if (!smp_found_config) 1569 if (!smp_found_config)
1559 return; 1570 return;
1560 1571
1561 phys_addr = mp_lapic_addr; 1572 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
1562
1563 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
1564 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", 1573 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1565 APIC_BASE, phys_addr); 1574 APIC_BASE, mp_lapic_addr);
1566 1575
1567 /* 1576 /*
1568 * Fetch the APIC ID of the BSP in case we have a 1577 * Fetch the APIC ID of the BSP in case we have a
@@ -1651,7 +1660,6 @@ int __init APIC_init_uniprocessor(void)
1651 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1660 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1652 pr_err("BIOS bug, local APIC 0x%x not detected!...\n", 1661 pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
1653 boot_cpu_physical_apicid); 1662 boot_cpu_physical_apicid);
1654 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1655 return -1; 1663 return -1;
1656 } 1664 }
1657#endif 1665#endif
@@ -1701,7 +1709,7 @@ int __init APIC_init_uniprocessor(void)
1701 localise_nmi_watchdog(); 1709 localise_nmi_watchdog();
1702#endif 1710#endif
1703 1711
1704 setup_boot_clock(); 1712 x86_init.timers.setup_percpu_clockev();
1705#ifdef CONFIG_X86_64 1713#ifdef CONFIG_X86_64
1706 check_nmi_watchdog(); 1714 check_nmi_watchdog();
1707#endif 1715#endif
@@ -1908,24 +1916,14 @@ void __cpuinit generic_processor_info(int apicid, int version)
1908 max_physical_apicid = apicid; 1916 max_physical_apicid = apicid;
1909 1917
1910#ifdef CONFIG_X86_32 1918#ifdef CONFIG_X86_32
1911 /* 1919 switch (boot_cpu_data.x86_vendor) {
1912 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1920 case X86_VENDOR_INTEL:
1913 * but we need to work other dependencies like SMP_SUSPEND etc 1921 if (num_processors > 8)
1914 * before this can be done without some confusion. 1922 def_to_bigsmp = 1;
1915 * if (CPU_HOTPLUG_ENABLED || num_processors > 8) 1923 break;
1916 * - Ashok Raj <ashok.raj@intel.com> 1924 case X86_VENDOR_AMD:
1917 */ 1925 if (max_physical_apicid >= 8)
1918 if (max_physical_apicid >= 8) {
1919 switch (boot_cpu_data.x86_vendor) {
1920 case X86_VENDOR_INTEL:
1921 if (!APIC_XAPIC(version)) {
1922 def_to_bigsmp = 0;
1923 break;
1924 }
1925 /* If P4 and above fall through */
1926 case X86_VENDOR_AMD:
1927 def_to_bigsmp = 1; 1926 def_to_bigsmp = 1;
1928 }
1929 } 1927 }
1930#endif 1928#endif
1931 1929
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 676cdac385c0..77a06413b6b2 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
112 return physids_promote(0xFFL); 112 return physids_promote(0xFFL);
113} 113}
114 114
115static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) 115static int bigsmp_check_phys_apicid_present(int phys_apicid)
116{ 116{
117 return 1; 117 return 1;
118} 118}
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8952a5890281..89174f847b49 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void)
167{ 167{
168 /* MPENTIUMIII */ 168 /* MPENTIUMIII */
169 if (boot_cpu_data.x86 == 6 && 169 if (boot_cpu_data.x86 == 6 &&
170 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) 170 (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
171 return 1; 171 return 1;
172 172
173 return 0; 173 return 0;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d2ed6c5ddc80..dc69f28489f5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -66,6 +66,8 @@
66#include <asm/apic.h> 66#include <asm/apic.h>
67 67
68#define __apicdebuginit(type) static type __init 68#define __apicdebuginit(type) static type __init
69#define for_each_irq_pin(entry, head) \
70 for (entry = head; entry; entry = entry->next)
69 71
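The new for_each_irq_pin() helper replaces the open-coded "walk until entry->next is NULL" loops further down in this file. Following the definition above, a typical traversal reads:

	struct irq_pin_list *entry;

	for_each_irq_pin(entry, cfg->irq_2_pin)
		printk(KERN_DEBUG "IRQ routed via apic %d pin %d\n",
		       entry->apic, entry->pin);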
70/* 72/*
71 * Is the SiS APIC rmw bug present ? 73 * Is the SiS APIC rmw bug present ?
@@ -85,12 +87,20 @@ int nr_ioapic_registers[MAX_IO_APICS];
85struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; 87struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
86int nr_ioapics; 88int nr_ioapics;
87 89
90/* IO APIC gsi routing info */
91struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
92
88/* MP IRQ source entries */ 93/* MP IRQ source entries */
89struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; 94struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
90 95
91/* # of MP IRQ source entries */ 96/* # of MP IRQ source entries */
92int mp_irq_entries; 97int mp_irq_entries;
93 98
99/* Number of legacy interrupts */
100static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
101/* GSI interrupts */
102static int nr_irqs_gsi = NR_IRQS_LEGACY;
103
94#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 104#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
95int mp_bus_id_to_type[MAX_MP_BUSSES]; 105int mp_bus_id_to_type[MAX_MP_BUSSES];
96#endif 106#endif
@@ -116,15 +126,6 @@ static int __init parse_noapic(char *str)
116} 126}
117early_param("noapic", parse_noapic); 127early_param("noapic", parse_noapic);
118 128
119struct irq_pin_list;
120
121/*
122 * This is performance-critical, we want to do it O(1)
123 *
124 * the indexing order of this array favors 1:1 mappings
125 * between pins and IRQs.
126 */
127
128struct irq_pin_list { 129struct irq_pin_list {
129 int apic, pin; 130 int apic, pin;
130 struct irq_pin_list *next; 131 struct irq_pin_list *next;
@@ -139,6 +140,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
139 return pin; 140 return pin;
140} 141}
141 142
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
142struct irq_cfg { 148struct irq_cfg {
143 struct irq_pin_list *irq_2_pin; 149 struct irq_pin_list *irq_2_pin;
144 cpumask_var_t domain; 150 cpumask_var_t domain;
@@ -172,6 +178,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
172 [15] = { .vector = IRQ15_VECTOR, }, 178 [15] = { .vector = IRQ15_VECTOR, },
173}; 179};
174 180
181void __init io_apic_disable_legacy(void)
182{
183 nr_legacy_irqs = 0;
184 nr_irqs_gsi = 0;
185}
186
175int __init arch_early_irq_init(void) 187int __init arch_early_irq_init(void)
176{ 188{
177 struct irq_cfg *cfg; 189 struct irq_cfg *cfg;
@@ -189,7 +201,7 @@ int __init arch_early_irq_init(void)
189 desc->chip_data = &cfg[i]; 201 desc->chip_data = &cfg[i];
190 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 202 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
191 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 203 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
192 if (i < NR_IRQS_LEGACY) 204 if (i < nr_legacy_irqs)
193 cpumask_setall(cfg[i].domain); 205 cpumask_setall(cfg[i].domain);
194 } 206 }
195 207
@@ -215,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node)
215 227
216 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 228 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
217 if (cfg) { 229 if (cfg) {
218 if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { 230 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
219 kfree(cfg); 231 kfree(cfg);
220 cfg = NULL; 232 cfg = NULL;
221 } else if (!alloc_cpumask_var_node(&cfg->old_domain, 233 } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
222 GFP_ATOMIC, node)) { 234 GFP_ATOMIC, node)) {
223 free_cpumask_var(cfg->domain); 235 free_cpumask_var(cfg->domain);
224 kfree(cfg); 236 kfree(cfg);
225 cfg = NULL; 237 cfg = NULL;
226 } else {
227 cpumask_clear(cfg->domain);
228 cpumask_clear(cfg->old_domain);
229 } 238 }
230 } 239 }
231 240
@@ -414,13 +423,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
414 unsigned long flags; 423 unsigned long flags;
415 424
416 spin_lock_irqsave(&ioapic_lock, flags); 425 spin_lock_irqsave(&ioapic_lock, flags);
417 entry = cfg->irq_2_pin; 426 for_each_irq_pin(entry, cfg->irq_2_pin) {
418 for (;;) {
419 unsigned int reg; 427 unsigned int reg;
420 int pin; 428 int pin;
421 429
422 if (!entry)
423 break;
424 pin = entry->pin; 430 pin = entry->pin;
425 reg = io_apic_read(entry->apic, 0x10 + pin*2); 431 reg = io_apic_read(entry->apic, 0x10 + pin*2);
426 /* Is the remote IRR bit set? */ 432 /* Is the remote IRR bit set? */
@@ -428,9 +434,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
428 spin_unlock_irqrestore(&ioapic_lock, flags); 434 spin_unlock_irqrestore(&ioapic_lock, flags);
429 return true; 435 return true;
430 } 436 }
431 if (!entry->next)
432 break;
433 entry = entry->next;
434 } 437 }
435 spin_unlock_irqrestore(&ioapic_lock, flags); 438 spin_unlock_irqrestore(&ioapic_lock, flags);
436 439
@@ -498,72 +501,68 @@ static void ioapic_mask_entry(int apic, int pin)
498 * shared ISA-space IRQs, so we have to support them. We are super 501 * shared ISA-space IRQs, so we have to support them. We are super
499 * fast in the common case, and fast for shared ISA-space IRQs. 502 * fast in the common case, and fast for shared ISA-space IRQs.
500 */ 503 */
501static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) 504static int
505add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
502{ 506{
503 struct irq_pin_list *entry; 507 struct irq_pin_list **last, *entry;
504 508
505 entry = cfg->irq_2_pin; 509 /* don't allow duplicates */
506 if (!entry) { 510 last = &cfg->irq_2_pin;
507 entry = get_one_free_irq_2_pin(node); 511 for_each_irq_pin(entry, cfg->irq_2_pin) {
508 if (!entry) {
509 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
510 apic, pin);
511 return;
512 }
513 cfg->irq_2_pin = entry;
514 entry->apic = apic;
515 entry->pin = pin;
516 return;
517 }
518
519 while (entry->next) {
520 /* not again, please */
521 if (entry->apic == apic && entry->pin == pin) 512 if (entry->apic == apic && entry->pin == pin)
522 return; 513 return 0;
523 514 last = &entry->next;
524 entry = entry->next;
525 } 515 }
526 516
527 entry->next = get_one_free_irq_2_pin(node); 517 entry = get_one_free_irq_2_pin(node);
528 entry = entry->next; 518 if (!entry) {
519 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
520 node, apic, pin);
521 return -ENOMEM;
522 }
529 entry->apic = apic; 523 entry->apic = apic;
530 entry->pin = pin; 524 entry->pin = pin;
525
526 *last = entry;
527 return 0;
528}
529
530static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
531{
532 if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
533 panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
531} 534}
532 535
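The add_pin_to_irq_node_nopanic() rewrite above walks the list with a pointer-to-pointer tail, so appending to an empty list and to a populated one is the same store. The idiom in isolation, with the duplicate check dropped for brevity:

	struct irq_pin_list **last = &cfg->irq_2_pin;
	struct irq_pin_list *entry, *new_entry;

	for_each_irq_pin(entry, cfg->irq_2_pin)
		last = &entry->next;			/* ends at the NULL tail link */

	new_entry = get_one_free_irq_2_pin(node);	/* ->next assumed NULL */
	*last = new_entry;				/* one store, empty list or not */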
533/* 536/*
534 * Reroute an IRQ to a different pin. 537 * Reroute an IRQ to a different pin.
535 */ 538 */
536static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, 539static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
537 int oldapic, int oldpin, 540 int oldapic, int oldpin,
538 int newapic, int newpin) 541 int newapic, int newpin)
539{ 542{
540 struct irq_pin_list *entry = cfg->irq_2_pin; 543 struct irq_pin_list *entry;
541 int replaced = 0;
542 544
543 while (entry) { 545 for_each_irq_pin(entry, cfg->irq_2_pin) {
544 if (entry->apic == oldapic && entry->pin == oldpin) { 546 if (entry->apic == oldapic && entry->pin == oldpin) {
545 entry->apic = newapic; 547 entry->apic = newapic;
546 entry->pin = newpin; 548 entry->pin = newpin;
547 replaced = 1;
548 /* every one is different, right? */ 549 /* every one is different, right? */
549 break; 550 return;
550 } 551 }
551 entry = entry->next;
552 } 552 }
553 553
554 /* why? call replace before add? */ 554 /* old apic/pin didn't exist, so just add new ones */
555 if (!replaced) 555 add_pin_to_irq_node(cfg, node, newapic, newpin);
556 add_pin_to_irq_node(cfg, node, newapic, newpin);
557} 556}
558 557
559static inline void io_apic_modify_irq(struct irq_cfg *cfg, 558static void io_apic_modify_irq(struct irq_cfg *cfg,
560 int mask_and, int mask_or, 559 int mask_and, int mask_or,
561 void (*final)(struct irq_pin_list *entry)) 560 void (*final)(struct irq_pin_list *entry))
562{ 561{
563 int pin; 562 int pin;
564 struct irq_pin_list *entry; 563 struct irq_pin_list *entry;
565 564
566 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { 565 for_each_irq_pin(entry, cfg->irq_2_pin) {
567 unsigned int reg; 566 unsigned int reg;
568 pin = entry->pin; 567 pin = entry->pin;
569 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 568 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
@@ -580,7 +579,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
580 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); 579 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
581} 580}
582 581
583#ifdef CONFIG_X86_64
584static void io_apic_sync(struct irq_pin_list *entry) 582static void io_apic_sync(struct irq_pin_list *entry)
585{ 583{
586 /* 584 /*
@@ -596,11 +594,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
596{ 594{
597 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
598} 596}
599#else /* CONFIG_X86_32 */
600static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
601{
602 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
603}
604 597
605static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) 598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
606{ 599{
@@ -613,7 +606,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
613 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
614 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
615} 608}
616#endif /* CONFIG_X86_32 */
617 609
618static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 610static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
619{ 611{
@@ -883,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type)
883 */ 875 */
884static int EISA_ELCR(unsigned int irq) 876static int EISA_ELCR(unsigned int irq)
885{ 877{
886 if (irq < NR_IRQS_LEGACY) { 878 if (irq < nr_legacy_irqs) {
887 unsigned int port = 0x4d0 + (irq >> 3); 879 unsigned int port = 0x4d0 + (irq >> 3);
888 return (inb(port) >> (irq & 7)) & 1; 880 return (inb(port) >> (irq & 7)) & 1;
889 } 881 }
@@ -1480,7 +1472,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1480 } 1472 }
1481 1473
1482 ioapic_register_intr(irq, desc, trigger); 1474 ioapic_register_intr(irq, desc, trigger);
1483 if (irq < NR_IRQS_LEGACY) 1475 if (irq < nr_legacy_irqs)
1484 disable_8259A_irq(irq); 1476 disable_8259A_irq(irq);
1485 1477
1486 ioapic_write_entry(apic_id, pin, entry); 1478 ioapic_write_entry(apic_id, pin, entry);
@@ -1702,12 +1694,8 @@ __apicdebuginit(void) print_IO_APIC(void)
1702 if (!entry) 1694 if (!entry)
1703 continue; 1695 continue;
1704 printk(KERN_DEBUG "IRQ%d ", irq); 1696 printk(KERN_DEBUG "IRQ%d ", irq);
1705 for (;;) { 1697 for_each_irq_pin(entry, cfg->irq_2_pin)
1706 printk("-> %d:%d", entry->apic, entry->pin); 1698 printk("-> %d:%d", entry->apic, entry->pin);
1707 if (!entry->next)
1708 break;
1709 entry = entry->next;
1710 }
1711 printk("\n"); 1699 printk("\n");
1712 } 1700 }
1713 1701
@@ -1851,7 +1839,7 @@ __apicdebuginit(void) print_PIC(void)
1851 unsigned int v; 1839 unsigned int v;
1852 unsigned long flags; 1840 unsigned long flags;
1853 1841
1854 if (apic_verbosity == APIC_QUIET) 1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)
1855 return; 1843 return;
1856 1844
1857 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1845 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1883,7 +1871,7 @@ __apicdebuginit(int) print_all_ICs(void)
1883 print_PIC(); 1871 print_PIC();
1884 1872
1885 /* don't print out if apic is not there */ 1873 /* don't print out if apic is not there */
1886 if (!cpu_has_apic || disable_apic) 1874 if (!cpu_has_apic && !apic_from_smp_config())
1887 return 0; 1875 return 0;
1888 1876
1889 print_all_local_APICs(); 1877 print_all_local_APICs();
@@ -1914,6 +1902,10 @@ void __init enable_IO_APIC(void)
1914 spin_unlock_irqrestore(&ioapic_lock, flags); 1902 spin_unlock_irqrestore(&ioapic_lock, flags);
1915 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1903 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1916 } 1904 }
1905
1906 if (!nr_legacy_irqs)
1907 return;
1908
1917 for(apic = 0; apic < nr_ioapics; apic++) { 1909 for(apic = 0; apic < nr_ioapics; apic++) {
1918 int pin; 1910 int pin;
1919 /* See if any of the pins is in ExtINT mode */ 1911 /* See if any of the pins is in ExtINT mode */
@@ -1968,6 +1960,9 @@ void disable_IO_APIC(void)
1968 */ 1960 */
1969 clear_IO_APIC(); 1961 clear_IO_APIC();
1970 1962
1963 if (!nr_legacy_irqs)
1964 return;
1965
1971 /* 1966 /*
1972 * If the i8259 is routed through an IOAPIC 1967 * If the i8259 is routed through an IOAPIC
1973 * Put that IOAPIC in virtual wire mode 1968 * Put that IOAPIC in virtual wire mode
@@ -2001,7 +1996,7 @@ void disable_IO_APIC(void)
2001 /* 1996 /*
2002 * Use virtual wire A mode when interrupt remapping is enabled. 1997 * Use virtual wire A mode when interrupt remapping is enabled.
2003 */ 1998 */
2004 if (cpu_has_apic) 1999 if (cpu_has_apic || apic_from_smp_config())
2005 disconnect_bsp_APIC(!intr_remapping_enabled && 2000 disconnect_bsp_APIC(!intr_remapping_enabled &&
2006 ioapic_i8259.pin != -1); 2001 ioapic_i8259.pin != -1);
2007} 2002}
@@ -2014,7 +2009,7 @@ void disable_IO_APIC(void)
2014 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 2009 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
2015 */ 2010 */
2016 2011
2017static void __init setup_ioapic_ids_from_mpc(void) 2012void __init setup_ioapic_ids_from_mpc(void)
2018{ 2013{
2019 union IO_APIC_reg_00 reg_00; 2014 union IO_APIC_reg_00 reg_00;
2020 physid_mask_t phys_id_present_map; 2015 physid_mask_t phys_id_present_map;
@@ -2023,9 +2018,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
2023 unsigned char old_id; 2018 unsigned char old_id;
2024 unsigned long flags; 2019 unsigned long flags;
2025 2020
2026 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) 2021 if (acpi_ioapic)
2027 return; 2022 return;
2028
2029 /* 2023 /*
2030 * Don't check I/O APIC IDs for xAPIC systems. They have 2024 * Don't check I/O APIC IDs for xAPIC systems. They have
2031 * no meaning without the serial APIC bus. 2025 * no meaning without the serial APIC bus.
@@ -2199,7 +2193,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2199 struct irq_cfg *cfg; 2193 struct irq_cfg *cfg;
2200 2194
2201 spin_lock_irqsave(&ioapic_lock, flags); 2195 spin_lock_irqsave(&ioapic_lock, flags);
2202 if (irq < NR_IRQS_LEGACY) { 2196 if (irq < nr_legacy_irqs) {
2203 disable_8259A_irq(irq); 2197 disable_8259A_irq(irq);
2204 if (i8259A_irq_pending(irq)) 2198 if (i8259A_irq_pending(irq))
2205 was_pending = 1; 2199 was_pending = 1;
@@ -2211,7 +2205,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2211 return was_pending; 2205 return was_pending;
2212} 2206}
2213 2207
2214#ifdef CONFIG_X86_64
2215static int ioapic_retrigger_irq(unsigned int irq) 2208static int ioapic_retrigger_irq(unsigned int irq)
2216{ 2209{
2217 2210
@@ -2224,14 +2217,6 @@ static int ioapic_retrigger_irq(unsigned int irq)
2224 2217
2225 return 1; 2218 return 1;
2226} 2219}
2227#else
2228static int ioapic_retrigger_irq(unsigned int irq)
2229{
2230 apic->send_IPI_self(irq_cfg(irq)->vector);
2231
2232 return 1;
2233}
2234#endif
2235 2220
2236/* 2221/*
2237 * Level and edge triggered IO-APIC interrupts need different handling, 2222 * Level and edge triggered IO-APIC interrupts need different handling,
@@ -2269,13 +2254,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2269 struct irq_pin_list *entry; 2254 struct irq_pin_list *entry;
2270 u8 vector = cfg->vector; 2255 u8 vector = cfg->vector;
2271 2256
2272 entry = cfg->irq_2_pin; 2257 for_each_irq_pin(entry, cfg->irq_2_pin) {
2273 for (;;) {
2274 unsigned int reg; 2258 unsigned int reg;
2275 2259
2276 if (!entry)
2277 break;
2278
2279 apic = entry->apic; 2260 apic = entry->apic;
2280 pin = entry->pin; 2261 pin = entry->pin;
2281 /* 2262 /*
@@ -2288,9 +2269,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2288 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 2269 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2289 reg |= vector; 2270 reg |= vector;
2290 io_apic_modify(apic, 0x10 + pin*2, reg); 2271 io_apic_modify(apic, 0x10 + pin*2, reg);
2291 if (!entry->next)
2292 break;
2293 entry = entry->next;
2294 } 2272 }
2295} 2273}
2296 2274
@@ -2515,11 +2493,8 @@ atomic_t irq_mis_count;
2515static void ack_apic_level(unsigned int irq) 2493static void ack_apic_level(unsigned int irq)
2516{ 2494{
2517 struct irq_desc *desc = irq_to_desc(irq); 2495 struct irq_desc *desc = irq_to_desc(irq);
2518
2519#ifdef CONFIG_X86_32
2520 unsigned long v; 2496 unsigned long v;
2521 int i; 2497 int i;
2522#endif
2523 struct irq_cfg *cfg; 2498 struct irq_cfg *cfg;
2524 int do_unmask_irq = 0; 2499 int do_unmask_irq = 0;
2525 2500
@@ -2532,31 +2507,28 @@ static void ack_apic_level(unsigned int irq)
2532 } 2507 }
2533#endif 2508#endif
2534 2509
2535#ifdef CONFIG_X86_32
2536 /* 2510 /*
2537 * It appears there is an erratum which affects at least version 0x11 2511 * It appears there is an erratum which affects at least version 0x11
2538 * of I/O APIC (that's the 82093AA and cores integrated into various 2512 * of I/O APIC (that's the 82093AA and cores integrated into various
2539 * chipsets). Under certain conditions a level-triggered interrupt is 2513 * chipsets). Under certain conditions a level-triggered interrupt is
2540 * erroneously delivered as edge-triggered one but the respective IRR 2514 * erroneously delivered as edge-triggered one but the respective IRR
2541 * bit gets set nevertheless. As a result the I/O unit expects an EOI 2515 * bit gets set nevertheless. As a result the I/O unit expects an EOI
2542 * message but it will never arrive and further interrupts are blocked 2516 * message but it will never arrive and further interrupts are blocked
2543 * from the source. The exact reason is so far unknown, but the 2517 * from the source. The exact reason is so far unknown, but the
2544 * phenomenon was observed when two consecutive interrupt requests 2518 * phenomenon was observed when two consecutive interrupt requests
2545 * from a given source get delivered to the same CPU and the source is 2519 * from a given source get delivered to the same CPU and the source is
2546 * temporarily disabled in between. 2520 * temporarily disabled in between.
2547 * 2521 *
2548 * A workaround is to simulate an EOI message manually. We achieve it 2522 * A workaround is to simulate an EOI message manually. We achieve it
2549 * by setting the trigger mode to edge and then to level when the edge 2523 * by setting the trigger mode to edge and then to level when the edge
2550 * trigger mode gets detected in the TMR of a local APIC for a 2524 * trigger mode gets detected in the TMR of a local APIC for a
2551 * level-triggered interrupt. We mask the source for the time of the 2525 * level-triggered interrupt. We mask the source for the time of the
2552 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2526 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2553 * The idea is from Manfred Spraul. --macro 2527 * The idea is from Manfred Spraul. --macro
2554 */ 2528 */
2555 cfg = desc->chip_data; 2529 cfg = desc->chip_data;
2556 i = cfg->vector; 2530 i = cfg->vector;
2557
2558 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2531 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2559#endif
2560 2532
2561 /* 2533 /*
2562 * We must acknowledge the irq before we move it or the acknowledge will 2534 * We must acknowledge the irq before we move it or the acknowledge will
@@ -2598,7 +2570,7 @@ static void ack_apic_level(unsigned int irq)
2598 unmask_IO_APIC_irq_desc(desc); 2570 unmask_IO_APIC_irq_desc(desc);
2599 } 2571 }
2600 2572
2601#ifdef CONFIG_X86_32 2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2602 if (!(v & (1 << (i & 0x1f)))) { 2574 if (!(v & (1 << (i & 0x1f)))) {
2603 atomic_inc(&irq_mis_count); 2575 atomic_inc(&irq_mis_count);
2604 spin_lock(&ioapic_lock); 2576 spin_lock(&ioapic_lock);
@@ -2606,26 +2578,15 @@ static void ack_apic_level(unsigned int irq)
2606 __unmask_and_level_IO_APIC_irq(cfg); 2578 __unmask_and_level_IO_APIC_irq(cfg);
2607 spin_unlock(&ioapic_lock); 2579 spin_unlock(&ioapic_lock);
2608 } 2580 }
2609#endif
2610} 2581}
2611 2582
2612#ifdef CONFIG_INTR_REMAP 2583#ifdef CONFIG_INTR_REMAP
2613static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) 2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2614{ 2585{
2615 int apic, pin;
2616 struct irq_pin_list *entry; 2586 struct irq_pin_list *entry;
2617 2587
2618 entry = cfg->irq_2_pin; 2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2619 for (;;) { 2589 io_apic_eoi(entry->apic, entry->pin);
2620
2621 if (!entry)
2622 break;
2623
2624 apic = entry->apic;
2625 pin = entry->pin;
2626 io_apic_eoi(apic, pin);
2627 entry = entry->next;
2628 }
2629} 2590}
2630 2591
2631static void 2592static void
@@ -2710,7 +2671,7 @@ static inline void init_IO_APIC_traps(void)
2710 * so default to an old-fashioned 8259 2671 * so default to an old-fashioned 8259
2711 * interrupt if we can.. 2672 * interrupt if we can..
2712 */ 2673 */
2713 if (irq < NR_IRQS_LEGACY) 2674 if (irq < nr_legacy_irqs)
2714 make_8259A_irq(irq); 2675 make_8259A_irq(irq);
2715 else 2676 else
2716 /* Strange. Oh, well.. */ 2677 /* Strange. Oh, well.. */
@@ -3046,7 +3007,7 @@ out:
3046 * the I/O APIC in all cases now. No actual device should request 3007 * the I/O APIC in all cases now. No actual device should request
3047 * it anyway. --macro 3008 * it anyway. --macro
3048 */ 3009 */
3049#define PIC_IRQS (1 << PIC_CASCADE_IR) 3010#define PIC_IRQS (1UL << PIC_CASCADE_IR)
3050 3011
3051void __init setup_IO_APIC(void) 3012void __init setup_IO_APIC(void)
3052{ 3013{
@@ -3054,21 +3015,19 @@ void __init setup_IO_APIC(void)
3054 /* 3015 /*
3055 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3016 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3056 */ 3017 */
3057 3018 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3058 io_apic_irqs = ~PIC_IRQS;
3059 3019
3060 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3020 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3061 /* 3021 /*
3062 * Set up IO-APIC IRQ routing. 3022 * Set up IO-APIC IRQ routing.
3063 */ 3023 */
3064#ifdef CONFIG_X86_32 3024 x86_init.mpparse.setup_ioapic_ids();
3065 if (!acpi_ioapic) 3025
3066 setup_ioapic_ids_from_mpc();
3067#endif
3068 sync_Arb_IDs(); 3026 sync_Arb_IDs();
3069 setup_IO_APIC_irqs(); 3027 setup_IO_APIC_irqs();
3070 init_IO_APIC_traps(); 3028 init_IO_APIC_traps();
3071 check_timer(); 3029 if (nr_legacy_irqs)
3030 check_timer();
3072} 3031}
3073 3032
3074/* 3033/*
@@ -3169,7 +3128,6 @@ static int __init ioapic_init_sysfs(void)
3169 3128
3170device_initcall(ioapic_init_sysfs); 3129device_initcall(ioapic_init_sysfs);
3171 3130
3172static int nr_irqs_gsi = NR_IRQS_LEGACY;
3173/* 3131/*
3174 * Dynamic irq allocate and deallocation 3132 * Dynamic irq allocate and deallocation
3175 */ 3133 */
@@ -3241,8 +3199,7 @@ void destroy_irq(unsigned int irq)
3241 cfg = desc->chip_data; 3199 cfg = desc->chip_data;
3242 dynamic_irq_cleanup(irq); 3200 dynamic_irq_cleanup(irq);
3243 /* connect back irq_cfg */ 3201 /* connect back irq_cfg */
3244 if (desc) 3202 desc->chip_data = cfg;
3245 desc->chip_data = cfg;
3246 3203
3247 free_irte(irq); 3204 free_irte(irq);
3248 spin_lock_irqsave(&vector_lock, flags); 3205 spin_lock_irqsave(&vector_lock, flags);
@@ -3910,9 +3867,13 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3910 /* 3867 /*
3911 * IRQs < 16 are already in the irq_2_pin[] map 3868 * IRQs < 16 are already in the irq_2_pin[] map
3912 */ 3869 */
3913 if (irq >= NR_IRQS_LEGACY) { 3870 if (irq >= nr_legacy_irqs) {
3914 cfg = desc->chip_data; 3871 cfg = desc->chip_data;
3915 add_pin_to_irq_node(cfg, node, ioapic, pin); 3872 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3873 printk(KERN_INFO "can not add pin %d for irq %d\n",
3874 pin, irq);
3875 return 0;
3876 }
3916 } 3877 }
3917 3878
3918 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); 3879 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
@@ -3941,11 +3902,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq,
3941 return __io_apic_set_pci_routing(dev, irq, irq_attr); 3902 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3942} 3903}
3943 3904
3944/* -------------------------------------------------------------------------- 3905u8 __init io_apic_unique_id(u8 id)
3945 ACPI-based IOAPIC Configuration 3906{
3946 -------------------------------------------------------------------------- */ 3907#ifdef CONFIG_X86_32
3908 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3909 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3910 return io_apic_get_unique_id(nr_ioapics, id);
3911 else
3912 return id;
3913#else
3914 int i;
3915 DECLARE_BITMAP(used, 256);
3947 3916
3948#ifdef CONFIG_ACPI 3917 bitmap_zero(used, 256);
3918 for (i = 0; i < nr_ioapics; i++) {
3919 struct mpc_ioapic *ia = &mp_ioapics[i];
3920 __set_bit(ia->apicid, used);
3921 }
3922 if (!test_bit(id, used))
3923 return id;
3924 return find_first_zero_bit(used, 256);
3925#endif
3926}
3949 3927
3950#ifdef CONFIG_X86_32 3928#ifdef CONFIG_X86_32
3951int __init io_apic_get_unique_id(int ioapic, int apic_id) 3929int __init io_apic_get_unique_id(int ioapic, int apic_id)
@@ -4054,8 +4032,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4054 return 0; 4032 return 0;
4055} 4033}
4056 4034
4057#endif /* CONFIG_ACPI */
4058
4059/* 4035/*
4060 * This function currently is only a helper for the i386 smp boot process where 4036 * This function currently is only a helper for the i386 smp boot process where
4061 * we need to reprogram the ioredtbls to cater for the cpus which have come online 4037 * we need to reprogram the ioredtbls to cater for the cpus which have come online
@@ -4109,7 +4085,7 @@ void __init setup_ioapic_dest(void)
4109 4085
4110static struct resource *ioapic_resources; 4086static struct resource *ioapic_resources;
4111 4087
4112static struct resource * __init ioapic_setup_resources(void) 4088static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4113{ 4089{
4114 unsigned long n; 4090 unsigned long n;
4115 struct resource *res; 4091 struct resource *res;
@@ -4125,15 +4101,13 @@ static struct resource * __init ioapic_setup_resources(void)
4125 mem = alloc_bootmem(n); 4101 mem = alloc_bootmem(n);
4126 res = (void *)mem; 4102 res = (void *)mem;
4127 4103
4128 if (mem != NULL) { 4104 mem += sizeof(struct resource) * nr_ioapics;
4129 mem += sizeof(struct resource) * nr_ioapics;
4130 4105
4131 for (i = 0; i < nr_ioapics; i++) { 4106 for (i = 0; i < nr_ioapics; i++) {
4132 res[i].name = mem; 4107 res[i].name = mem;
4133 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4134 sprintf(mem, "IOAPIC %u", i); 4109 sprintf(mem, "IOAPIC %u", i);
4135 mem += IOAPIC_RESOURCE_NAME_SIZE; 4110 mem += IOAPIC_RESOURCE_NAME_SIZE;
4136 }
4137 } 4111 }
4138 4112
4139 ioapic_resources = res; 4113 ioapic_resources = res;
@@ -4147,7 +4121,7 @@ void __init ioapic_init_mappings(void)
4147 struct resource *ioapic_res; 4121 struct resource *ioapic_res;
4148 int i; 4122 int i;
4149 4123
4150 ioapic_res = ioapic_setup_resources(); 4124 ioapic_res = ioapic_setup_resources(nr_ioapics);
4151 for (i = 0; i < nr_ioapics; i++) { 4125 for (i = 0; i < nr_ioapics; i++) {
4152 if (smp_found_config) { 4126 if (smp_found_config) {
4153 ioapic_phys = mp_ioapics[i].apicaddr; 4127 ioapic_phys = mp_ioapics[i].apicaddr;
@@ -4176,11 +4150,9 @@ fake_ioapic_page:
4176 __fix_to_virt(idx), ioapic_phys); 4150 __fix_to_virt(idx), ioapic_phys);
4177 idx++; 4151 idx++;
4178 4152
4179 if (ioapic_res != NULL) { 4153 ioapic_res->start = ioapic_phys;
4180 ioapic_res->start = ioapic_phys; 4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
4181 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4155 ioapic_res++;
4182 ioapic_res++;
4183 }
4184 } 4156 }
4185} 4157}
4186 4158
@@ -4201,3 +4173,76 @@ void __init ioapic_insert_resources(void)
4201 r++; 4173 r++;
4202 } 4174 }
4203} 4175}
4176
4177int mp_find_ioapic(int gsi)
4178{
4179 int i = 0;
4180
4181 /* Find the IOAPIC that manages this GSI. */
4182 for (i = 0; i < nr_ioapics; i++) {
4183 if ((gsi >= mp_gsi_routing[i].gsi_base)
4184 && (gsi <= mp_gsi_routing[i].gsi_end))
4185 return i;
4186 }
4187
4188 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
4189 return -1;
4190}
4191
4192int mp_find_ioapic_pin(int ioapic, int gsi)
4193{
4194 if (WARN_ON(ioapic == -1))
4195 return -1;
4196 if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
4197 return -1;
4198
4199 return gsi - mp_gsi_routing[ioapic].gsi_base;
4200}
4201
4202static int bad_ioapic(unsigned long address)
4203{
4204 if (nr_ioapics >= MAX_IO_APICS) {
 4205	if (nr_ioapics >= MAX_IO_APICS) {
 4206		printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
 4207		       "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4207 return 1;
4208 }
4209 if (!address) {
4210 printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
4211 " found in table, skipping!\n");
4212 return 1;
4213 }
4214 return 0;
4215}
4216
4217void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4218{
4219 int idx = 0;
4220
4221 if (bad_ioapic(address))
4222 return;
4223
4224 idx = nr_ioapics;
4225
4226 mp_ioapics[idx].type = MP_IOAPIC;
4227 mp_ioapics[idx].flags = MPC_APIC_USABLE;
4228 mp_ioapics[idx].apicaddr = address;
4229
4230 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4231 mp_ioapics[idx].apicid = io_apic_unique_id(id);
4232 mp_ioapics[idx].apicver = io_apic_get_version(idx);
4233
4234 /*
4235 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4236 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4237 */
4238 mp_gsi_routing[idx].gsi_base = gsi_base;
4239 mp_gsi_routing[idx].gsi_end = gsi_base +
4240 io_apic_get_redir_entries(idx);
4241
4242 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4243 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
4244 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
4245 mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
4246
4247 nr_ioapics++;
4248}
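mp_register_ioapic() records a [gsi_base, gsi_end] window per IO-APIC, and the two lookup helpers above resolve against it. Put together, translating a GSI into an (apic, pin) pair looks roughly like this:

	int ioapic = mp_find_ioapic(gsi);

	if (ioapic >= 0) {
		int pin = mp_find_ioapic_pin(ioapic, gsi);

		/* mp_ioapics[ioapic], redirection entry 'pin', is the
		 * line that delivers this GSI */
	}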
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 6ef00ba4c886..08385e090a6f 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -153,7 +153,7 @@ int safe_smp_processor_id(void)
153{ 153{
154 int apicid, cpuid; 154 int apicid, cpuid;
155 155
156 if (!boot_cpu_has(X86_FEATURE_APIC)) 156 if (!cpu_has_apic)
157 return 0; 157 return 0;
158 158
159 apicid = hard_smp_processor_id(); 159 apicid = hard_smp_processor_id();
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index b3025b43b63a..7ff61d6a188a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_var_t backtrace_mask; 42static cpumask_t backtrace_mask __read_mostly;
43 43
44/* nmi_active: 44/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 45 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_NEW_MCE) 69#if defined(CONFIG_X86_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void)
138 if (!prev_nmi_count) 138 if (!prev_nmi_count)
139 goto error; 139 goto error;
140 140
141 alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
142 printk(KERN_INFO "Testing NMI watchdog ... "); 141 printk(KERN_INFO "Testing NMI watchdog ... ");
143 142
144#ifdef CONFIG_SMP 143#ifdef CONFIG_SMP
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
415 } 414 }
416 415
417 /* We can be called before check_nmi_watchdog, hence NULL check. */ 416 /* We can be called before check_nmi_watchdog, hence NULL check. */
418 if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { 417 if (cpumask_test_cpu(cpu, &backtrace_mask)) {
419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
420 419
421 spin_lock(&lock); 420 spin_lock(&lock);
422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 421 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
422 show_regs(regs);
423 dump_stack(); 423 dump_stack();
424 spin_unlock(&lock); 424 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, backtrace_mask); 425 cpumask_clear_cpu(cpu, &backtrace_mask);
426
427 rc = 1;
426 } 428 }
427 429
428 /* Could check oops_in_progress here too, but it's safer not to */ 430 /* Could check oops_in_progress here too, but it's safer not to */
@@ -506,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
506/* 508/*
507 * proc handler for /proc/sys/kernel/nmi 509 * proc handler for /proc/sys/kernel/nmi
508 */ 510 */
509int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, 511int proc_nmi_enabled(struct ctl_table *table, int write,
510 void __user *buffer, size_t *length, loff_t *ppos) 512 void __user *buffer, size_t *length, loff_t *ppos)
511{ 513{
512 int old_state; 514 int old_state;
513 515
514 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; 516 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
515 old_state = nmi_watchdog_enabled; 517 old_state = nmi_watchdog_enabled;
516 proc_dointvec(table, write, file, buffer, length, ppos); 518 proc_dointvec(table, write, buffer, length, ppos);
517 if (!!old_state == !!nmi_watchdog_enabled) 519 if (!!old_state == !!nmi_watchdog_enabled)
518 return 0; 520 return 0;
519 521
@@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
552 return 0; 554 return 0;
553} 555}
554 556
555void __trigger_all_cpu_backtrace(void) 557void arch_trigger_all_cpu_backtrace(void)
556{ 558{
557 int i; 559 int i;
558 560
559 cpumask_copy(backtrace_mask, cpu_online_mask); 561 cpumask_copy(&backtrace_mask, cpu_online_mask);
562
563 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR);
565
560 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
561 for (i = 0; i < 10 * 1000; i++) { 567 for (i = 0; i < 10 * 1000; i++) {
562 if (cpumask_empty(backtrace_mask)) 568 if (cpumask_empty(&backtrace_mask))
563 break; 569 break;
564 mdelay(1); 570 mdelay(1);
565 } 571 }
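With backtrace_mask now a static cpumask_t, the NMI path can test it even before check_nmi_watchdog() runs. The initiator/receiver handshake, condensed from this file (the generic trigger_all_cpu_backtrace() entry point is assumed, it is not shown in this patch):

	/* initiator: mark every online CPU, kick them with an NMI, then poll */
	cpumask_copy(&backtrace_mask, cpu_online_mask);
	apic->send_IPI_all(NMI_VECTOR);
	for (i = 0; i < 10 * 1000 && !cpumask_empty(&backtrace_mask); i++)
		mdelay(1);

	/* receiver, in nmi_watchdog_tick(): dump state, then clear our bit */
	if (cpumask_test_cpu(cpu, &backtrace_mask)) {
		show_regs(regs);
		dump_stack();
		cpumask_clear_cpu(cpu, &backtrace_mask);
	}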
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index ca96e68f0d23..efa00e2b8505 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -66,7 +66,6 @@ struct mpc_trans {
66 unsigned short trans_reserved; 66 unsigned short trans_reserved;
67}; 67};
68 68
69/* x86_quirks member */
70static int mpc_record; 69static int mpc_record;
71 70
72static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; 71static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
@@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void)
130 } 129 }
131} 130}
132 131
133static int __init numaq_pre_time_init(void) 132static void __init numaq_tsc_init(void)
134{ 133{
135 numaq_tsc_disable(); 134 numaq_tsc_disable();
136 return 0;
137} 135}
138 136
139static inline int generate_logical_apicid(int quad, int phys_apicid) 137static inline int generate_logical_apicid(int quad, int phys_apicid)
@@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m)
177 quad_local_to_mp_bus_id[quad][local] = m->busid; 175 quad_local_to_mp_bus_id[quad][local] = m->busid;
178} 176}
179 177
178/*
179 * Called from mpparse code.
180 * mode = 0: prescan
181 * mode = 1: one mpc entry scanned
182 */
183static void numaq_mpc_record(unsigned int mode)
184{
185 if (!mode)
186 mpc_record = 0;
187 else
188 mpc_record++;
189}
190
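numaq_mpc_record() backs the x86_init.mpparse.mpc_record hook; per the comment, the generic mpparse code calls it with 0 before the scan and with 1 after each MPC entry, so mpc_record tracks the entry currently being parsed. A hedged reconstruction of that caller (it lives in mpparse.c and is not shown here):

	x86_init.mpparse.mpc_record(0);		/* prescan: reset the counter */
	while (more_mpc_entries) {		/* placeholder loop condition */
		/* ... parse one MPC entry (mpc_apic_id, bus info, ...) ... */
		x86_init.mpparse.mpc_record(1);	/* one entry scanned */
	}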
180static void __init MP_translation_info(struct mpc_trans *m) 191static void __init MP_translation_info(struct mpc_trans *m)
181{ 192{
182 printk(KERN_INFO 193 printk(KERN_INFO
@@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len)
206/* 217/*
207 * Read/parse the MPC oem tables 218 * Read/parse the MPC oem tables
208 */ 219 */
209static void __init 220static void __init smp_read_mpc_oem(struct mpc_table *mpc)
210 smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
211{ 221{
222 struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
212 int count = sizeof(*oemtable); /* the header size */ 223 int count = sizeof(*oemtable); /* the header size */
213 unsigned char *oemptr = ((unsigned char *)oemtable) + count; 224 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
214 225
@@ -250,29 +261,6 @@ static void __init
250 } 261 }
251} 262}
252 263
253static int __init numaq_setup_ioapic_ids(void)
254{
255 /* so can skip it */
256 return 1;
257}
258
259static struct x86_quirks numaq_x86_quirks __initdata = {
260 .arch_pre_time_init = numaq_pre_time_init,
261 .arch_time_init = NULL,
262 .arch_pre_intr_init = NULL,
263 .arch_memory_setup = NULL,
264 .arch_intr_init = NULL,
265 .arch_trap_init = NULL,
266 .mach_get_smp_config = NULL,
267 .mach_find_smp_config = NULL,
268 .mpc_record = &mpc_record,
269 .mpc_apic_id = mpc_apic_id,
270 .mpc_oem_bus_info = mpc_oem_bus_info,
271 .mpc_oem_pci_bus = mpc_oem_pci_bus,
272 .smp_read_mpc_oem = smp_read_mpc_oem,
273 .setup_ioapic_ids = numaq_setup_ioapic_ids,
274};
275
276static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
277{ 265{
278 /* 266 /*
@@ -286,8 +274,15 @@ static __init void early_check_numaq(void)
286 if (smp_found_config) 274 if (smp_found_config)
287 early_get_smp_config(); 275 early_get_smp_config();
288 276
289 if (found_numaq) 277 if (found_numaq) {
290 x86_quirks = &numaq_x86_quirks; 278 x86_init.mpparse.mpc_record = numaq_mpc_record;
279 x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
280 x86_init.mpparse.mpc_apic_id = mpc_apic_id;
281 x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
282 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
283 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
284 x86_init.timers.tsc_pre_init = numaq_tsc_init;
285 }
291} 286}
292 287
293int __init get_memcfg_numaq(void) 288int __init get_memcfg_numaq(void)
@@ -418,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
418/* Where the IO area was mapped on multiquad, always 0 otherwise */ 413/* Where the IO area was mapped on multiquad, always 0 otherwise */
419void *xquad_portio; 414void *xquad_portio;
420 415
421static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) 416static inline int numaq_check_phys_apicid_present(int phys_apicid)
422{ 417{
423 return 1; 418 return 1;
424} 419}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index fcec2f1d34a1..c4cbd3080c1c 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -55,25 +55,32 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
55void __init default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
56{ 56{
57#ifdef CONFIG_X86_X2APIC 57#ifdef CONFIG_X86_X2APIC
58 if (x2apic_mode && (apic != &apic_x2apic_phys && 58 if (x2apic_mode
59#ifdef CONFIG_X86_UV 59#ifdef CONFIG_X86_UV
60 apic != &apic_x2apic_uv_x && 60 && apic != &apic_x2apic_uv_x
61#endif 61#endif
62 apic != &apic_x2apic_cluster)) { 62 ) {
63 if (x2apic_phys) 63 if (x2apic_phys)
64 apic = &apic_x2apic_phys; 64 apic = &apic_x2apic_phys;
65 else 65 else
66 apic = &apic_x2apic_cluster; 66 apic = &apic_x2apic_cluster;
67 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
68 } 67 }
69#endif 68#endif
70 69
71 if (apic == &apic_flat) { 70 if (apic == &apic_flat) {
72 if (max_physical_apicid >= 8) 71 switch (boot_cpu_data.x86_vendor) {
73 apic = &apic_physflat; 72 case X86_VENDOR_INTEL:
74 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
75 } 80 }
76 81
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83
77 if (is_vsmp_box()) { 84 if (is_vsmp_box()) {
78 /* need to update phys_pkg_id */ 85 /* need to update phys_pkg_id */
79 apic->phys_pkg_id = apicid_phys_pkg_id; 86 apic->phys_pkg_id = apicid_phys_pkg_id;
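
The probe_64.c change makes the flat-to-physflat decision vendor specific: Intel switches once more than 8 processors are enumerated, while AMD keeps keying off the highest physical APIC ID. A small standalone model of just that decision; the enum and helper are illustrative, not kernel API.

#include <stdio.h>
#include <stdbool.h>

enum vendor { VENDOR_INTEL, VENDOR_AMD, VENDOR_OTHER };

/*
 * Decide whether flat logical APIC routing must give way to
 * physical-flat routing, mirroring the checks in the patch:
 * Intel: more than 8 enumerated processors; AMD: any APIC ID >= 8.
 */
static bool need_physflat(enum vendor v, int num_processors,
			  int max_physical_apicid)
{
	switch (v) {
	case VENDOR_INTEL:
		return num_processors > 8;
	case VENDOR_AMD:
		return max_physical_apicid >= 8;
	default:
		return false;
	}
}

int main(void)
{
	printf("intel, 8 cpus:  %d\n", need_physflat(VENDOR_INTEL, 8, 7));
	printf("intel, 16 cpus: %d\n", need_physflat(VENDOR_INTEL, 16, 15));
	printf("amd, apicid 8:  %d\n", need_physflat(VENDOR_AMD, 4, 8));
	return 0;
}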
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index eafdfbd1ea95..645ecc4ff0be 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid)
272 return physid_mask_of_physid(0); 272 return physid_mask_of_physid(0);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
276{ 276{
277 return 1; 277 return 1;
278} 278}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 601159374e87..f5f5886a6b53 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode)
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
390} 390}
391 391
392static __init void map_mmr_high(int max_pnode)
393{
394 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
395 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
396
397 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
398 if (mmr.s.enable)
399 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
400}
401
392static __init void map_mmioh_high(int max_pnode) 402static __init void map_mmioh_high(int max_pnode)
393{ 403{
394 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 404 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -643,6 +653,7 @@ void __init uv_system_init(void)
643 } 653 }
644 654
645 map_gru_high(max_pnode); 655 map_gru_high(max_pnode);
656 map_mmr_high(max_pnode);
646 map_mmioh_high(max_pnode); 657 map_mmioh_high(max_pnode);
647 658
648 uv_cpu_init(); 659 uv_cpu_init();
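
map_mmr_high() follows the same shape as the existing GRU and MMIOH helpers: read the overlay-config MMR, and only map the region when its enable bit is set, with the base shifted into a physical address. A hedged userspace sketch of that enable-plus-base decode follows; the bit positions are illustrative and not the real UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR layout.

#include <stdio.h>
#include <stdint.h>

/* illustrative layout: bit 63 = enable, bits 26..45 = base field */
#define OVERLAY_ENABLE		(1ULL << 63)
#define OVERLAY_BASE_SHIFT	26
#define OVERLAY_BASE_MASK	0xfffffULL

static void map_overlay(const char *name, uint64_t mmr)
{
	uint64_t base;

	if (!(mmr & OVERLAY_ENABLE)) {
		printf("%s: overlay disabled, nothing to map\n", name);
		return;
	}
	base = (mmr >> OVERLAY_BASE_SHIFT) & OVERLAY_BASE_MASK;
	printf("%s: map base field 0x%llx (phys 0x%llx)\n", name,
	       (unsigned long long)base,
	       (unsigned long long)(base << OVERLAY_BASE_SHIFT));
}

int main(void)
{
	map_overlay("MMR", 0);				/* disabled case */
	map_overlay("MMR", OVERLAY_ENABLE | (0x40ULL << OVERLAY_BASE_SHIFT));
	return 0;
}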
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 442b5508893f..151ace69a5aa 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 404static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 405static DEFINE_SPINLOCK(user_list_lock);
406static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; 406
407/*
408 * Set up a segment that references the real mode segment 0x40
409 * that extends up to the end of page zero (that we have reserved).
410 * This is for buggy BIOS's that refer to (real mode) segment 0x40
411 * even though they are called in protected mode.
412 */
413static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
414 (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
407 415
408static const char driver_version[] = "1.16ac"; /* no spaces */ 416static const char driver_version[] = "1.16ac"; /* no spaces */
409 417
@@ -2332,15 +2340,6 @@ static int __init apm_init(void)
2332 pm_flags |= PM_APM; 2340 pm_flags |= PM_APM;
2333 2341
2334 /* 2342 /*
2335 * Set up a segment that references the real mode segment 0x40
2336 * that extends up to the end of page zero (that we have reserved).
2337 * This is for buggy BIOS's that refer to (real mode) segment 0x40
2338 * even though they are called in protected mode.
2339 */
2340 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
2341 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
2342
2343 /*
2344 * Set up the long jump entry point to the APM BIOS, which is called 2343 * Set up the long jump entry point to the APM BIOS, which is called
2345 * from inline assembly. 2344 * from inline assembly.
2346 */ 2345 */
@@ -2358,12 +2357,12 @@ static int __init apm_init(void)
2358 * code to that CPU. 2357 * code to that CPU.
2359 */ 2358 */
2360 gdt = get_cpu_gdt_table(0); 2359 gdt = get_cpu_gdt_table(0);
2361 set_base(gdt[APM_CS >> 3], 2360 set_desc_base(&gdt[APM_CS >> 3],
2362 __va((unsigned long)apm_info.bios.cseg << 4)); 2361 (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
2363 set_base(gdt[APM_CS_16 >> 3], 2362 set_desc_base(&gdt[APM_CS_16 >> 3],
2364 __va((unsigned long)apm_info.bios.cseg_16 << 4)); 2363 (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4));
2365 set_base(gdt[APM_DS >> 3], 2364 set_desc_base(&gdt[APM_DS >> 3],
2366 __va((unsigned long)apm_info.bios.dseg << 4)); 2365 (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
2367 2366
2368 proc_create("apm", 0, NULL, &apm_file_ops); 2367 proc_create("apm", 0, NULL, &apm_file_ops);
2369 2368
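
The APM change moves the buggy-BIOS segment setup out of apm_init() and into the initializer itself. The new base and limit arguments are the same values the old set_base()/_set_limit() calls computed at runtime: real-mode segment 0x40 shifted into linear address 0x400, with a limit ending at the top of page zero. A tiny check of that arithmetic, assuming PAGE_SIZE is 4096 and ignoring __va() since only the offsets matter here:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long old_base  = 0x40UL << 4;		/* real-mode seg 0x40 */
	unsigned long old_limit = 4095UL - (0x40UL << 4);

	unsigned long new_base  = 0x400UL;
	unsigned long new_limit = PAGE_SIZE - 0x400UL - 1;

	assert(old_base == new_base);			/* 0x400 */
	assert(old_limit == new_limit);			/* 0xbff */
	printf("base 0x%lx, limit 0x%lx\n", new_base, new_limit);
	return 0;
}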
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e129..4a6aeedcd965 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -3,6 +3,7 @@
3 * This code generates raw asm output which is post-processed to extract 3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data. 4 * and format the required data.
5 */ 5 */
6#define COMPILE_OFFSETS
6 7
7#include <linux/crypto.h> 8#include <linux/crypto.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac155..68537e957a9b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
13 13
14obj-y := intel_cacheinfo.o addon_cpuid_features.o 14obj-y := intel_cacheinfo.o addon_cpuid_features.o
15obj-y += proc.o capflags.o powerflags.o common.o 15obj-y += proc.o capflags.o powerflags.o common.o
16obj-y += vmware.o hypervisor.o 16obj-y += vmware.o hypervisor.o sched.o
17 17
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 19obj-$(CONFIG_X86_64) += bugs_64.o
@@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
27obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 27obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
28obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 28obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
29 29
30obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 30obj-$(CONFIG_PERF_EVENTS) += perf_event.o
31 31
32obj-$(CONFIG_X86_MCE) += mcheck/ 32obj-$(CONFIG_X86_MCE) += mcheck/
33obj-$(CONFIG_MTRR) += mtrr/ 33obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 63fddcd082cd..c910a716a71c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -2,7 +2,7 @@
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4 4
5#include <asm/io.h> 5#include <linux/io.h>
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
@@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
45#define CBAR_ENB (0x80000000) 45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB) 46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) { 47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB) 48 if (inl(CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR); 49 outl(0 | CBAR_KEY, CBAR);
50 } 50 }
51} 51}
52 52
@@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
87 d = d2-d; 87 d = d2-d;
88 88
89 if (d > 20*K6_BUG_LOOP) 89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n"); 90 printk(KERN_CONT
91 "system stability may be impaired when more than 32 MB are used.\n");
91 else 92 else
92 printk("probably OK (after B9730xxxx).\n"); 93 printk(KERN_CONT "probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); 94 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 } 95 }
95 96
@@ -183,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
183 * approved Athlon 184 * approved Athlon
184 */ 185 */
185 WARN_ONCE(1, "WARNING: This combination of AMD" 186 WARN_ONCE(1, "WARNING: This combination of AMD"
186 "processors is not suitable for SMP.\n"); 187 " processors is not suitable for SMP.\n");
187 if (!test_taint(TAINT_UNSAFE_SMP)) 188 if (!test_taint(TAINT_UNSAFE_SMP))
188 add_taint(TAINT_UNSAFE_SMP); 189 add_taint(TAINT_UNSAFE_SMP);
189 190
@@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
219 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { 220 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
220 rdmsr(MSR_K7_CLK_CTL, l, h); 221 rdmsr(MSR_K7_CLK_CTL, l, h);
221 if ((l & 0xfff00000) != 0x20000000) { 222 if ((l & 0xfff00000) != 0x20000000) {
222 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, 223 printk(KERN_INFO
223 ((l & 0x000fffff)|0x20000000)); 224 "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
225 l, ((l & 0x000fffff)|0x20000000));
224 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); 226 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
225 } 227 }
226 } 228 }
@@ -251,6 +253,64 @@ static int __cpuinit nearby_node(int apicid)
251#endif 253#endif
252 254
253/* 255/*
256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */
261#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{
264#ifdef CONFIG_PCI
265 u32 t, cpn;
266 u8 n, n_id;
267 int cpu = smp_processor_id();
268
269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return;
272
273 /* check for multi-node processor on boot cpu */
274 t = read_pci_config(0, 24, 3, 0xe8);
275 if (!(t & (1 << 29)))
276 return;
277
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
279
280 /* cores per node: each internal node has half the number of cores */
281 cpn = c->x86_max_cores >> 1;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306
307 /* fixup core id to be in range from 0 to cpn */
308 c->cpu_core_id = c->cpu_core_id % cpn;
309#endif
310}
311#endif
312
313/*
254 * On a AMD dual core setup the lower bits of the APIC id distingush the cores. 314 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
255 * Assumes number of cores is a power of two. 315 * Assumes number of cores is a power of two.
256 */ 316 */
@@ -267,17 +327,31 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
267 c->phys_proc_id = c->initial_apicid >> bits; 327 c->phys_proc_id = c->initial_apicid >> bits;
268 /* use socket ID also for last level cache */ 328 /* use socket ID also for last level cache */
269 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 329 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
330 /* fixup topology information on multi-node processors */
331 if ((c->x86 == 0x10) && (c->x86_model == 9))
332 amd_fixup_dcm(c);
270#endif 333#endif
271} 334}
272 335
336int amd_get_nb_id(int cpu)
337{
338 int id = 0;
339#ifdef CONFIG_SMP
340 id = per_cpu(cpu_llc_id, cpu);
341#endif
342 return id;
343}
344EXPORT_SYMBOL_GPL(amd_get_nb_id);
345
273static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 346static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
274{ 347{
275#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 348#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
276 int cpu = smp_processor_id(); 349 int cpu = smp_processor_id();
277 int node; 350 int node;
278 unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; 351 unsigned apicid = c->apicid;
352
353 node = per_cpu(cpu_llc_id, cpu);
279 354
280 node = c->phys_proc_id;
281 if (apicid_to_node[apicid] != NUMA_NO_NODE) 355 if (apicid_to_node[apicid] != NUMA_NO_NODE)
282 node = apicid_to_node[apicid]; 356 node = apicid_to_node[apicid];
283 if (!node_online(node)) { 357 if (!node_online(node)) {
@@ -398,18 +472,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
398 u32 level; 472 u32 level;
399 473
400 level = cpuid_eax(1); 474 level = cpuid_eax(1);
401 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) 475 if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
402 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 476 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
403 477
404 /* 478 /*
405 * Some BIOSes incorrectly force this feature, but only K8 479 * Some BIOSes incorrectly force this feature, but only K8
406 * revision D (model = 0x14) and later actually support it. 480 * revision D (model = 0x14) and later actually support it.
481 * (AMD Erratum #110, docId: 25759).
407 */ 482 */
408 if (c->x86_model < 0x14) 483 if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
484 u64 val;
485
409 clear_cpu_cap(c, X86_FEATURE_LAHF_LM); 486 clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
487 if (!rdmsrl_amd_safe(0xc001100d, &val)) {
488 val &= ~(1ULL << 32);
489 wrmsrl_amd_safe(0xc001100d, val);
490 }
491 }
492
410 } 493 }
411 if (c->x86 == 0x10 || c->x86 == 0x11) 494 if (c->x86 == 0x10 || c->x86 == 0x11)
412 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 495 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
496
497 /* get apicid instead of initial apic id from cpuid */
498 c->apicid = hard_smp_processor_id();
413#else 499#else
414 500
415 /* 501 /*
@@ -494,27 +580,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
494 * benefit in doing so. 580 * benefit in doing so.
495 */ 581 */
496 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { 582 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
497 printk(KERN_DEBUG "tseg: %010llx\n", tseg); 583 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
498 if ((tseg>>PMD_SHIFT) < 584 if ((tseg>>PMD_SHIFT) <
499 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || 585 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
500 ((tseg>>PMD_SHIFT) < 586 ((tseg>>PMD_SHIFT) <
501 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && 587 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
502 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) 588 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
503 set_memory_4k((unsigned long)__va(tseg), 1); 589 set_memory_4k((unsigned long)__va(tseg), 1);
504 } 590 }
505 } 591 }
506#endif 592#endif
507} 593}
508 594
509#ifdef CONFIG_X86_32 595#ifdef CONFIG_X86_32
510static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 596static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
597 unsigned int size)
511{ 598{
512 /* AMD errata T13 (order #21922) */ 599 /* AMD errata T13 (order #21922) */
513 if ((c->x86 == 6)) { 600 if ((c->x86 == 6)) {
514 if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ 601 /* Duron Rev A0 */
602 if (c->x86_model == 3 && c->x86_mask == 0)
515 size = 64; 603 size = 64;
604 /* Tbird rev A1/A2 */
516 if (c->x86_model == 4 && 605 if (c->x86_model == 4 &&
517 (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ 606 (c->x86_mask == 0 || c->x86_mask == 1))
518 size = 256; 607 size = 256;
519 } 608 }
520 return size; 609 return size;
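
amd_fixup_dcm() above derives per-node information for dual-node (DCM) parts: half the reported cores belong to each internal node, the node of the current core comes from a northbridge PCI register, the last-level-cache ID becomes (socket << 1) + node, and the core ID is folded back into the 0..cores_per_node-1 range. A standalone model of that renumbering; the field names are mine and the PCI read is replaced by a plain parameter.

#include <stdio.h>

struct dcm_ids {
	int node_id;	/* internal node within the package: 0 or 1 */
	int llc_id;	/* last-level-cache id: (socket << 1) + node */
	int core_id;	/* core id renumbered within the node */
};

/*
 * phys_proc_id: socket id, max_cores: cores reported for the package,
 * core_id: original core id, nb_reg_node: the (t >> 30) & 0x3 value
 * read from the northbridge in the patch (only zero vs non-zero matters).
 */
static struct dcm_ids fixup_dcm(int phys_proc_id, int max_cores,
				int core_id, int nb_reg_node)
{
	struct dcm_ids r;
	int cores_per_node = max_cores >> 1;
	int low_half = core_id < cores_per_node;

	/* which half maps to which node flips with the NB register value */
	if (nb_reg_node == 0)
		r.node_id = low_half ? 0 : 1;
	else
		r.node_id = low_half ? 1 : 0;

	r.llc_id  = (phys_proc_id << 1) + r.node_id;
	r.core_id = core_id % cores_per_node;
	return r;
}

int main(void)
{
	/* 12-core dual-node package in socket 1, core 7, NB reports 0 */
	struct dcm_ids r = fixup_dcm(1, 12, 7, 0);

	printf("node %d, llc %d, core %d\n", r.node_id, r.llc_id, r.core_id);
	return 0;
}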
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c8e315f1aa83..01a265212395 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -81,7 +81,7 @@ static void __init check_fpu(void)
81 81
82 boot_cpu_data.fdiv_bug = fdiv_bug; 82 boot_cpu_data.fdiv_bug = fdiv_bug;
83 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
84 printk("Hmm, FPU with FDIV bug.\n"); 84 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
85} 85}
86 86
87static void __init check_hlt(void) 87static void __init check_hlt(void)
@@ -98,7 +98,7 @@ static void __init check_hlt(void)
98 halt(); 98 halt();
99 halt(); 99 halt();
100 halt(); 100 halt();
101 printk("OK.\n"); 101 printk(KERN_CONT "OK.\n");
102} 102}
103 103
104/* 104/*
@@ -122,9 +122,9 @@ static void __init check_popad(void)
122 * CPU hard. Too bad. 122 * CPU hard. Too bad.
123 */ 123 */
124 if (res != 12345678) 124 if (res != 12345678)
125 printk("Buggy.\n"); 125 printk(KERN_CONT "Buggy.\n");
126 else 126 else
127 printk("OK.\n"); 127 printk(KERN_CONT "OK.\n");
128#endif 128#endif
129} 129}
130 130
@@ -156,7 +156,7 @@ void __init check_bugs(void)
156{ 156{
157 identify_boot_cpu(); 157 identify_boot_cpu();
158#ifndef CONFIG_SMP 158#ifndef CONFIG_SMP
159 printk("CPU: "); 159 printk(KERN_INFO "CPU: ");
160 print_cpu_info(&boot_cpu_data); 160 print_cpu_info(&boot_cpu_data);
161#endif 161#endif
162 check_config(); 162 check_config();
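
The bugs.c hunks give every printk an explicit log level; partial lines that are finished later ("OK.\n", "Buggy.\n") use KERN_CONT so the continuation is not logged as a new, unprefixed message. A minimal kernel-context sketch of the pattern (not taken from this file, and only compilable inside a kernel build):

#include <linux/kernel.h>

static int demo_check(void)
{
	return 1;	/* stand-in for the real probe */
}

static void demo_report(void)
{
	printk(KERN_INFO "demo: running check... ");
	if (demo_check())
		printk(KERN_CONT "OK.\n");	/* continues the line above */
	else
		printk(KERN_CONT "Buggy.\n");
}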
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 9a3ed0649d4e..04f0fe5af83e 100644
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
@@ -15,7 +15,7 @@ void __init check_bugs(void)
15{ 15{
16 identify_boot_cpu(); 16 identify_boot_cpu();
17#if !defined(CONFIG_SMP) 17#if !defined(CONFIG_SMP)
18 printk("CPU: "); 18 printk(KERN_INFO "CPU: ");
19 print_cpu_info(&boot_cpu_data); 19 print_cpu_info(&boot_cpu_data);
20#endif 20#endif
21 alternative_instructions(); 21 alternative_instructions();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5ce60a88027b..cc25c2b4a567 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,13 +13,13 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h> 16#include <asm/perf_event.h>
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
19#include <asm/processor.h> 19#include <asm/processor.h>
20#include <asm/sections.h> 20#include <asm/sections.h>
21#include <asm/topology.h> 21#include <linux/topology.h>
22#include <asm/cpumask.h> 22#include <linux/cpumask.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/atomic.h> 24#include <asm/atomic.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
@@ -28,13 +28,12 @@
28#include <asm/desc.h> 28#include <asm/desc.h>
29#include <asm/i387.h> 29#include <asm/i387.h>
30#include <asm/mtrr.h> 30#include <asm/mtrr.h>
31#include <asm/numa.h> 31#include <linux/numa.h>
32#include <asm/asm.h> 32#include <asm/asm.h>
33#include <asm/cpu.h> 33#include <asm/cpu.h>
34#include <asm/mce.h> 34#include <asm/mce.h>
35#include <asm/msr.h> 35#include <asm/msr.h>
36#include <asm/pat.h> 36#include <asm/pat.h>
37#include <asm/smp.h>
38 37
39#ifdef CONFIG_X86_LOCAL_APIC 38#ifdef CONFIG_X86_LOCAL_APIC
40#include <asm/uv/uv.h> 39#include <asm/uv/uv.h>
@@ -94,45 +93,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
94 * TLS descriptors are currently at a different place compared to i386. 93 * TLS descriptors are currently at a different place compared to i386.
95 * Hopefully nobody expects them at a fixed place (Wine?) 94 * Hopefully nobody expects them at a fixed place (Wine?)
96 */ 95 */
97 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 96 [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
98 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 97 [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
99 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 98 [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
100 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 99 [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
101 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 100 [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
102 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 101 [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
103#else 102#else
104 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 103 [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
105 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 104 [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
106 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 105 [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
107 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, 106 [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
108 /* 107 /*
109 * Segments used for calling PnP BIOS have byte granularity. 108 * Segments used for calling PnP BIOS have byte granularity.
110 * They code segments and data segments have fixed 64k limits, 109 * They code segments and data segments have fixed 64k limits,
111 * the transfer segment sizes are set at run time. 110 * the transfer segment sizes are set at run time.
112 */ 111 */
113 /* 32-bit code */ 112 /* 32-bit code */
114 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, 113 [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
115 /* 16-bit code */ 114 /* 16-bit code */
116 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, 115 [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
117 /* 16-bit data */ 116 /* 16-bit data */
118 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, 117 [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
119 /* 16-bit data */ 118 /* 16-bit data */
120 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, 119 [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
121 /* 16-bit data */ 120 /* 16-bit data */
122 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, 121 [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
123 /* 122 /*
124 * The APM segments have byte granularity and their bases 123 * The APM segments have byte granularity and their bases
125 * are set at run time. All have 64k limits. 124 * are set at run time. All have 64k limits.
126 */ 125 */
127 /* 32-bit code */ 126 /* 32-bit code */
128 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, 127 [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
129 /* 16-bit code */ 128 /* 16-bit code */
130 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, 129 [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
131 /* data */ 130 /* data */
132 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 131 [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
133 132
134 [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, 133 [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
135 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, 134 [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
136 GDT_STACK_CANARY_INIT 135 GDT_STACK_CANARY_INIT
137#endif 136#endif
138} }; 137} };
@@ -870,7 +869,7 @@ void __init identify_boot_cpu(void)
870#else 869#else
871 vgetcpu_set_mode(); 870 vgetcpu_set_mode();
872#endif 871#endif
873 init_hw_perf_counters(); 872 init_hw_perf_events();
874} 873}
875 874
876void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 875void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -982,18 +981,26 @@ static __init int setup_disablecpuid(char *arg)
982__setup("clearcpuid=", setup_disablecpuid); 981__setup("clearcpuid=", setup_disablecpuid);
983 982
984#ifdef CONFIG_X86_64 983#ifdef CONFIG_X86_64
985struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 984struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
986 985
987DEFINE_PER_CPU_FIRST(union irq_stack_union, 986DEFINE_PER_CPU_FIRST(union irq_stack_union,
988 irq_stack_union) __aligned(PAGE_SIZE); 987 irq_stack_union) __aligned(PAGE_SIZE);
989 988
990DEFINE_PER_CPU(char *, irq_stack_ptr) = 989/*
991 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; 990 * The following four percpu variables are hot. Align current_task to
991 * cacheline size such that all four fall in the same cacheline.
992 */
993DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
994 &init_task;
995EXPORT_PER_CPU_SYMBOL(current_task);
992 996
993DEFINE_PER_CPU(unsigned long, kernel_stack) = 997DEFINE_PER_CPU(unsigned long, kernel_stack) =
994 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; 998 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
995EXPORT_PER_CPU_SYMBOL(kernel_stack); 999EXPORT_PER_CPU_SYMBOL(kernel_stack);
996 1000
1001DEFINE_PER_CPU(char *, irq_stack_ptr) =
1002 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
1003
997DEFINE_PER_CPU(unsigned int, irq_count) = -1; 1004DEFINE_PER_CPU(unsigned int, irq_count) = -1;
998 1005
999/* 1006/*
@@ -1008,8 +1015,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1008}; 1015};
1009 1016
1010static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks 1017static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1011 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) 1018 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1012 __aligned(PAGE_SIZE);
1013 1019
1014/* May not be marked __init: used by software suspend */ 1020/* May not be marked __init: used by software suspend */
1015void syscall_init(void) 1021void syscall_init(void)
@@ -1042,8 +1048,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
1042 1048
1043#else /* CONFIG_X86_64 */ 1049#else /* CONFIG_X86_64 */
1044 1050
1051DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1052EXPORT_PER_CPU_SYMBOL(current_task);
1053
1045#ifdef CONFIG_CC_STACKPROTECTOR 1054#ifdef CONFIG_CC_STACKPROTECTOR
1046DEFINE_PER_CPU(unsigned long, stack_canary); 1055DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
1047#endif 1056#endif
1048 1057
1049/* Make sure %fs and %gs are initialized properly in idle threads */ 1058/* Make sure %fs and %gs are initialized properly in idle threads */
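
The GDT table in common.c is converted from hand-written (limit/base word, access word) pairs to GDT_ENTRY_INIT(flags, base, limit), where flags carries the access byte in its low 8 bits and the granularity/size bits in its top nibble. Below is a userspace model of that packing, consistent with the conversions shown above and checked against one of the old literal pairs (the kernel CS entry 0x0000ffff/0x00cf9b00 versus GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)); it is a sketch, not the kernel macro verbatim.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct gdt_words {
	uint32_t a;	/* limit 15:0, base 15:0 */
	uint32_t b;	/* base 23:16, access byte, limit 19:16, flags, base 31:24 */
};

/* same packing as GDT_ENTRY_INIT(flags, base, limit) performs */
static struct gdt_words gdt_entry_init(uint16_t flags, uint32_t base,
				       uint32_t limit)
{
	struct gdt_words d;

	d.a = (limit & 0xffff) | ((base & 0xffff) << 16);
	d.b = ((base & 0xff0000) >> 16) | ((uint32_t)(flags & 0xf0ff) << 8) |
	      (limit & 0xf0000) | (base & 0xff000000);
	return d;
}

int main(void)
{
	/* old form: [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } } */
	struct gdt_words cs = gdt_entry_init(0xc09b, 0, 0xfffff);

	assert(cs.a == 0x0000ffff);
	assert(cs.b == 0x00cf9b00);
	printf("kernel CS: a=%#010x b=%#010x\n", cs.a, cs.b);
	return 0;
}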
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd0403..dca325c03999 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
36 36
37static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
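
The cpu_debug change is purely about declaration syntax: the array dimension now belongs to the type argument of DEFINE_PER_CPU() rather than to the variable name, which is what the updated per-cpu infrastructure expects. In kernel code the new idiom looks like the sketch below; struct and variable names are placeholders, and it only compiles inside a kernel build.

#include <linux/percpu.h>

struct demo_reg {
	unsigned long val;
};

#define DEMO_NR_REGS	16

/* array dimension is part of the type, not appended to the name */
static DEFINE_PER_CPU(struct demo_reg [DEMO_NR_REGS], demo_regs);

static void demo_store(int cpu, int idx, unsigned long v)
{
	per_cpu(demo_regs, cpu)[idx].val = v;
}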
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220ca..7d5c3b0ea8da 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <trace/power.h> 36#include <trace/events/power.h>
37 37
38#include <linux/acpi.h> 38#include <linux/acpi.h>
39#include <linux/io.h> 39#include <linux/io.h>
@@ -60,7 +60,6 @@ enum {
60}; 60};
61 61
62#define INTEL_MSR_RANGE (0xffff) 62#define INTEL_MSR_RANGE (0xffff)
63#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
64 63
65struct acpi_cpufreq_data { 64struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data; 65 struct acpi_processor_performance *acpi_data;
@@ -71,13 +70,7 @@ struct acpi_cpufreq_data {
71 70
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
73 72
74struct acpi_msr_data { 73static DEFINE_PER_CPU(struct aperfmperf, old_perf);
75 u64 saved_aperf, saved_mperf;
76};
77
78static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
79
80DEFINE_TRACE(power_mark);
81 74
82/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
83static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
@@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask)
244 return cmd.val; 237 return cmd.val;
245} 238}
246 239
247struct perf_pair {
248 union {
249 struct {
250 u32 lo;
251 u32 hi;
252 } split;
253 u64 whole;
254 } aperf, mperf;
255};
256
257/* Called via smp_call_function_single(), on the target CPU */ 240/* Called via smp_call_function_single(), on the target CPU */
258static void read_measured_perf_ctrs(void *_cur) 241static void read_measured_perf_ctrs(void *_cur)
259{ 242{
260 struct perf_pair *cur = _cur; 243 struct aperfmperf *am = _cur;
261 244
262 rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); 245 get_aperfmperf(am);
263 rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
264} 246}
265 247
266/* 248/*
@@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur)
279static unsigned int get_measured_perf(struct cpufreq_policy *policy, 261static unsigned int get_measured_perf(struct cpufreq_policy *policy,
280 unsigned int cpu) 262 unsigned int cpu)
281{ 263{
282 struct perf_pair readin, cur; 264 struct aperfmperf perf;
283 unsigned int perf_percent; 265 unsigned long ratio;
284 unsigned int retval; 266 unsigned int retval;
285 267
286 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) 268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
287 return 0; 269 return 0;
288 270
289 cur.aperf.whole = readin.aperf.whole - 271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
290 per_cpu(msr_data, cpu).saved_aperf; 272 per_cpu(old_perf, cpu) = perf;
291 cur.mperf.whole = readin.mperf.whole -
292 per_cpu(msr_data, cpu).saved_mperf;
293 per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
294 per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
295
296#ifdef __i386__
297 /*
298 * We dont want to do 64 bit divide with 32 bit kernel
299 * Get an approximate value. Return failure in case we cannot get
300 * an approximate value.
301 */
302 if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
303 int shift_count;
304 u32 h;
305
306 h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
307 shift_count = fls(h);
308
309 cur.aperf.whole >>= shift_count;
310 cur.mperf.whole >>= shift_count;
311 }
312
313 if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
314 int shift_count = 7;
315 cur.aperf.split.lo >>= shift_count;
316 cur.mperf.split.lo >>= shift_count;
317 }
318
319 if (cur.aperf.split.lo && cur.mperf.split.lo)
320 perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
321 else
322 perf_percent = 0;
323
324#else
325 if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
326 int shift_count = 7;
327 cur.aperf.whole >>= shift_count;
328 cur.mperf.whole >>= shift_count;
329 }
330
331 if (cur.aperf.whole && cur.mperf.whole)
332 perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
333 else
334 perf_percent = 0;
335
336#endif
337 273
338 retval = (policy->cpuinfo.max_freq * perf_percent) / 100; 274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
339 275
340 return retval; 276 return retval;
341} 277}
@@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
394 unsigned int next_perf_state = 0; /* Index into perf table */ 330 unsigned int next_perf_state = 0; /* Index into perf table */
395 unsigned int i; 331 unsigned int i;
396 int result = 0; 332 int result = 0;
397 struct power_trace it;
398 333
399 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); 334 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
400 335
@@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
426 } 361 }
427 } 362 }
428 363
429 trace_power_mark(&it, POWER_PSTATE, next_perf_state); 364 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
430 365
431 switch (data->cpu_feature) { 366 switch (data->cpu_feature) {
432 case SYSTEM_INTEL_MSR_CAPABLE: 367 case SYSTEM_INTEL_MSR_CAPABLE:
@@ -588,6 +523,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
588 }, 523 },
589 { } 524 { }
590}; 525};
526
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/
533 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) &&
535 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable())
537 return -ENODEV;
538 }
539 return 0;
540}
591#endif 541#endif
592 542
593static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) 543static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
@@ -602,6 +552,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
602 552
603 dprintk("acpi_cpufreq_cpu_init\n"); 553 dprintk("acpi_cpufreq_cpu_init\n");
604 554
555#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c);
557 if (result)
558 return result;
559#endif
560
605 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
606 if (!data) 562 if (!data)
607 return -ENOMEM; 563 return -ENOMEM;
@@ -731,12 +687,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
731 acpi_processor_notify_smm(THIS_MODULE); 687 acpi_processor_notify_smm(THIS_MODULE);
732 688
733 /* Check for APERF/MPERF support in hardware */ 689 /* Check for APERF/MPERF support in hardware */
734 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { 690 if (cpu_has(c, X86_FEATURE_APERFMPERF))
735 unsigned int ecx; 691 acpi_cpufreq_driver.getavg = get_measured_perf;
736 ecx = cpuid_ecx(6);
737 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
738 acpi_cpufreq_driver.getavg = get_measured_perf;
739 }
740 692
741 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 693 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
742 for (i = 0; i < perf->state_count; i++) 694 for (i = 0; i < perf->state_count; i++)
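
get_measured_perf() now keeps one struct aperfmperf snapshot per CPU and turns the APERF/MPERF deltas into a fixed-point ratio, which is why the open-coded 32-bit and 64-bit divide variants disappear. A userspace model of the delta-and-scale arithmetic follows; the shift width is an assumption, the kernel uses its own APERFMPERF_SHIFT constant.

#include <stdint.h>
#include <stdio.h>

#define RATIO_SHIFT	10	/* assumed fixed-point shift */

struct aperfmperf {
	uint64_t aperf;
	uint64_t mperf;
};

/* ratio = (delta_aperf << RATIO_SHIFT) / delta_mperf, updating *old */
static uint64_t aperfmperf_ratio(struct aperfmperf *old,
				 const struct aperfmperf *cur)
{
	uint64_t da = cur->aperf - old->aperf;
	uint64_t dm = cur->mperf - old->mperf;
	uint64_t ratio = dm ? (da << RATIO_SHIFT) / dm : 0;

	*old = *cur;
	return ratio;
}

int main(void)
{
	struct aperfmperf old = { 0, 0 };
	struct aperfmperf cur = { 750000, 1000000 };	/* ran at 75% */
	uint64_t ratio = aperfmperf_ratio(&old, &cur);
	unsigned int max_khz = 2400000;

	/* scale the maximum frequency by the fixed-point ratio: 1800000 kHz */
	printf("estimated freq: %llu kHz\n",
	       (unsigned long long)(((uint64_t)max_khz * ratio) >> RATIO_SHIFT));
	return 0;
}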
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 2a50ef891000..6394aa5c7985 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
605 return 0; 605 return 0;
606} 606}
607 607
608static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) 608static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
609 unsigned int entry)
609{ 610{
610 data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; 611 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
611} 612}
612 613
613static void print_basics(struct powernow_k8_data *data) 614static void print_basics(struct powernow_k8_data *data)
@@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
854 goto err_out; 855 goto err_out;
855 } 856 }
856 857
858 /* fill in data */
859 data->numps = data->acpi_data.state_count;
860 powernow_k8_acpi_pst_values(data, 0);
861
857 if (cpu_family == CPU_HW_PSTATE) 862 if (cpu_family == CPU_HW_PSTATE)
858 ret_val = fill_powernow_table_pstate(data, powernow_table); 863 ret_val = fill_powernow_table_pstate(data, powernow_table);
859 else 864 else
@@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
866 powernow_table[data->acpi_data.state_count].index = 0; 871 powernow_table[data->acpi_data.state_count].index = 0;
867 data->powernow_table = powernow_table; 872 data->powernow_table = powernow_table;
868 873
869 /* fill in data */
870 data->numps = data->acpi_data.state_count;
871 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 874 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
872 print_basics(data); 875 print_basics(data);
873 powernow_k8_acpi_pst_values(data, 0);
874 876
875 /* notify BIOS that we exist */ 877 /* notify BIOS that we exist */
876 acpi_processor_notify_smm(THIS_MODULE); 878 acpi_processor_notify_smm(THIS_MODULE);
@@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
914 "bad value %d.\n", i, index); 916 "bad value %d.\n", i, index);
915 printk(KERN_ERR PFX "Please report to BIOS " 917 printk(KERN_ERR PFX "Please report to BIOS "
916 "manufacturer\n"); 918 "manufacturer\n");
917 invalidate_entry(data, i); 919 invalidate_entry(powernow_table, i);
918 continue; 920 continue;
919 } 921 }
920 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); 922 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
921 if (!(hi & HW_PSTATE_VALID_MASK)) { 923 if (!(hi & HW_PSTATE_VALID_MASK)) {
922 dprintk("invalid pstate %d, ignoring\n", index); 924 dprintk("invalid pstate %d, ignoring\n", index);
923 invalidate_entry(data, i); 925 invalidate_entry(powernow_table, i);
924 continue; 926 continue;
925 } 927 }
926 928
@@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
941 struct cpufreq_frequency_table *powernow_table) 943 struct cpufreq_frequency_table *powernow_table)
942{ 944{
943 int i; 945 int i;
944 int cntlofreq = 0;
945 946
946 for (i = 0; i < data->acpi_data.state_count; i++) { 947 for (i = 0; i < data->acpi_data.state_count; i++) {
947 u32 fid; 948 u32 fid;
@@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
970 /* verify frequency is OK */ 971 /* verify frequency is OK */
971 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { 972 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
972 dprintk("invalid freq %u kHz, ignoring\n", freq); 973 dprintk("invalid freq %u kHz, ignoring\n", freq);
973 invalidate_entry(data, i); 974 invalidate_entry(powernow_table, i);
974 continue; 975 continue;
975 } 976 }
976 977
@@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
978 * BIOSs are using "off" to indicate invalid */ 979 * BIOSs are using "off" to indicate invalid */
979 if (vid == VID_OFF) { 980 if (vid == VID_OFF) {
980 dprintk("invalid vid %u, ignoring\n", vid); 981 dprintk("invalid vid %u, ignoring\n", vid);
981 invalidate_entry(data, i); 982 invalidate_entry(powernow_table, i);
982 continue; 983 continue;
983 } 984 }
984 985
985 /* verify only 1 entry from the lo frequency table */
986 if (fid < HI_FID_TABLE_BOTTOM) {
987 if (cntlofreq) {
988 /* if both entries are the same,
989 * ignore this one ... */
990 if ((freq != powernow_table[cntlofreq].frequency) ||
991 (index != powernow_table[cntlofreq].index)) {
992 printk(KERN_ERR PFX
993 "Too many lo freq table "
994 "entries\n");
995 return 1;
996 }
997
998 dprintk("double low frequency table entry, "
999 "ignoring it.\n");
1000 invalidate_entry(data, i);
1001 continue;
1002 } else
1003 cntlofreq = i;
1004 }
1005
1006 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { 986 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
1007 printk(KERN_INFO PFX "invalid freq entries " 987 printk(KERN_INFO PFX "invalid freq entries "
1008 "%u kHz vs. %u kHz\n", freq, 988 "%u kHz vs. %u kHz\n", freq,
1009 (unsigned int) 989 (unsigned int)
1010 (data->acpi_data.states[i].core_frequency 990 (data->acpi_data.states[i].core_frequency
1011 * 1000)); 991 * 1000));
1012 invalidate_entry(data, i); 992 invalidate_entry(powernow_table, i);
1013 continue; 993 continue;
1014 } 994 }
1015 } 995 }
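
invalidate_entry() now takes the cpufreq frequency table directly instead of reaching through the driver data, which lets the fill_powernow_table_* helpers flag bad entries before data->powernow_table has been assigned. A small standalone model of the mark-and-skip pattern used here; the table contents are made up and the sentinel value is assumed to be all-ones, as with the kernel's CPUFREQ_ENTRY_INVALID.

#include <stdio.h>

#define FREQ_ENTRY_INVALID	(~0U)	/* assumed sentinel value */

struct freq_entry {
	unsigned int index;
	unsigned int frequency;	/* kHz, or FREQ_ENTRY_INVALID */
};

static void invalidate_entry(struct freq_entry *table, unsigned int entry)
{
	table[entry].frequency = FREQ_ENTRY_INVALID;
}

int main(void)
{
	struct freq_entry table[] = {
		{ 0, 2600000 }, { 1, 1900000 }, { 2, 800000 },
	};
	unsigned int i, n = sizeof(table) / sizeof(table[0]);

	invalidate_entry(table, 1);	/* pretend entry 1 failed validation */

	for (i = 0; i < n; i++) {
		if (table[i].frequency == FREQ_ENTRY_INVALID)
			continue;
		printf("state %u: %u kHz\n", table[i].index,
		       table[i].frequency);
	}
	return 0;
}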
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 593171e967ef..19807b89f058 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -3,10 +3,10 @@
3#include <linux/delay.h> 3#include <linux/delay.h>
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <asm/dma.h> 5#include <asm/dma.h>
6#include <asm/io.h> 6#include <linux/io.h>
7#include <asm/processor-cyrix.h> 7#include <asm/processor-cyrix.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include <asm/timer.h> 9#include <linux/timer.h>
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/tsc.h> 11#include <asm/tsc.h>
12 12
@@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
282 * The 5510/5520 companion chips have a funky PIT. 282 * The 5510/5520 companion chips have a funky PIT.
283 */ 283 */
284 if (vendor == PCI_VENDOR_ID_CYRIX && 284 if (vendor == PCI_VENDOR_ID_CYRIX &&
285 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) 285 (device == PCI_DEVICE_ID_CYRIX_5510 ||
286 device == PCI_DEVICE_ID_CYRIX_5520))
286 mark_tsc_unstable("cyrix 5510/5520 detected"); 287 mark_tsc_unstable("cyrix 5510/5520 detected");
287 } 288 }
288#endif 289#endif
@@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
299 * ? : 0x7x 300 * ? : 0x7x
300 * GX1 : 0x8x GX1 datasheet 56 301 * GX1 : 0x8x GX1 datasheet 56
301 */ 302 */
302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 303 if ((0x30 <= dir1 && dir1 <= 0x6f) ||
304 (0x80 <= dir1 && dir1 <= 0x8f))
303 geode_configure(); 305 geode_configure();
304 return; 306 return;
305 } else { /* MediaGX */ 307 } else { /* MediaGX */
@@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
427 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); 429 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
428 local_irq_save(flags); 430 local_irq_save(flags);
429 ccr3 = getCx86(CX86_CCR3); 431 ccr3 = getCx86(CX86_CCR3);
430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 432 /* enable MAPEN */
431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ 433 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 434 /* enable cpuid */
435 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
436 /* disable MAPEN */
437 setCx86(CX86_CCR3, ccr3);
433 local_irq_restore(flags); 438 local_irq_restore(flags);
434 } 439 }
435 } 440 }
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index fb5b86af0b01..08be922de33a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -28,18 +28,10 @@
28static inline void __cpuinit 28static inline void __cpuinit
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29detect_hypervisor_vendor(struct cpuinfo_x86 *c)
30{ 30{
31 if (vmware_platform()) { 31 if (vmware_platform())
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
33 } else { 33 else
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; 34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35 }
36}
37
38unsigned long get_hypervisor_tsc_freq(void)
39{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
41 return vmware_get_tsc_khz();
42 return 0;
43} 35}
44 36
45static inline void __cpuinit 37static inline void __cpuinit
@@ -56,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
56 detect_hypervisor_vendor(c); 48 detect_hypervisor_vendor(c);
57 hypervisor_set_feature_bits(c); 49 hypervisor_set_feature_bits(c);
58} 50}
51
52void __init init_hypervisor_platform(void)
53{
54 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
56 vmware_platform_setup();
57}
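
With the TSC-frequency hook gone, init_hypervisor_platform() detects the hypervisor vendor once on the boot CPU and hands off to a vendor-specific setup routine. A userspace model of that detect-then-dispatch step follows; the vendor names and setup functions are placeholders.

#include <stdio.h>

enum hyper_vendor { HYPER_NONE, HYPER_VMWARE };

static int on_vmware_platform(void)
{
	return 1;	/* stand-in for the CPUID/backdoor-port detection */
}

static void vmware_setup(void)
{
	printf("vmware: platform setup (e.g. fixed TSC frequency)\n");
}

static enum hyper_vendor detect_hypervisor_vendor(void)
{
	return on_vmware_platform() ? HYPER_VMWARE : HYPER_NONE;
}

static void init_hypervisor_platform(void)
{
	enum hyper_vendor v = detect_hypervisor_vendor();

	if (v == HYPER_VMWARE)
		vmware_setup();
}

int main(void)
{
	init_hypervisor_platform();
	return 0;
}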
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3260ab044996..40e1835b35e8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,17 +7,17 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/uaccess.h>
10 11
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14#include <asm/uaccess.h>
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17#include <asm/cpu.h> 17#include <asm/cpu.h>
18 18
19#ifdef CONFIG_X86_64 19#ifdef CONFIG_X86_64
20#include <asm/topology.h> 20#include <linux/topology.h>
21#include <asm/numa_64.h> 21#include <asm/numa_64.h>
22#endif 22#endif
23 23
@@ -174,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
174#ifdef CONFIG_X86_F00F_BUG 174#ifdef CONFIG_X86_F00F_BUG
175 /* 175 /*
176 * All current models of Pentium and Pentium with MMX technology CPUs 176 * All current models of Pentium and Pentium with MMX technology CPUs
177 * have the F0 0F bug, which lets nonprivileged users lock up the system. 177 * have the F0 0F bug, which lets nonprivileged users lock up the
178 * system.
178 * Note that the workaround only should be initialized once... 179 * Note that the workaround only should be initialized once...
179 */ 180 */
180 c->f00f_bug = 0; 181 c->f00f_bug = 0;
@@ -207,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
207 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); 208 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
208 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); 209 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
209 lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; 210 lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
210 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 211 wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
211 } 212 }
212 } 213 }
213 214
@@ -283,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
283 /* Intel has a non-standard dependency on %ecx for this CPUID level. */ 284 /* Intel has a non-standard dependency on %ecx for this CPUID level. */
284 cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); 285 cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
285 if (eax & 0x1f) 286 if (eax & 0x1f)
286 return ((eax >> 26) + 1); 287 return (eax >> 26) + 1;
287 else 288 else
288 return 1; 289 return 1;
289} 290}
@@ -349,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
349 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
350 } 351 }
351 352
353 if (c->cpuid_level > 6) {
354 unsigned ecx = cpuid_ecx(6);
355 if (ecx & 0x01)
356 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
357 }
358
352 if (cpu_has_xmm2) 359 if (cpu_has_xmm2)
353 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 360 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
354 if (cpu_has_ds) { 361 if (cpu_has_ds) {
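
init_intel() now records APERF/MPERF support as a synthetic CPU feature by checking CPUID leaf 6, ECX bit 0, so acpi-cpufreq only has to test cpu_has(c, X86_FEATURE_APERFMPERF). The same check can be made from userspace on an x86 build with the compiler's <cpuid.h> helpers, as a quick sanity test:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int max_leaf = __get_cpuid_max(0, NULL);

	if (max_leaf < 6) {
		printf("CPUID leaf 6 not available\n");
		return 0;
	}
	__cpuid_count(6, 0, eax, ebx, ecx, edx);
	printf("APERF/MPERF supported: %s\n", (ecx & 0x01) ? "yes" : "no");
	return 0;
}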
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 789efe217e1a..804c40e2bc3e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
9 9
@@ -16,7 +16,7 @@
16#include <linux/pci.h> 16#include <linux/pci.h>
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21 21
22#define LVL_1_INST 1 22#define LVL_1_INST 1
@@ -25,14 +25,15 @@
25#define LVL_3 4 25#define LVL_3 4
26#define LVL_TRACE 5 26#define LVL_TRACE 5
27 27
28struct _cache_table 28struct _cache_table {
29{
30 unsigned char descriptor; 29 unsigned char descriptor;
31 char cache_type; 30 char cache_type;
32 short size; 31 short size;
33}; 32};
34 33
35/* all the cache descriptor types we care about (no TLB or trace cache entries) */ 34/* All the cache descriptor types we care about (no TLB or
35 trace cache entries) */
36
36static const struct _cache_table __cpuinitconst cache_table[] = 37static const struct _cache_table __cpuinitconst cache_table[] =
37{ 38{
38 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ 39 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
@@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
105}; 106};
106 107
107 108
108enum _cache_type 109enum _cache_type {
109{
110 CACHE_TYPE_NULL = 0, 110 CACHE_TYPE_NULL = 0,
111 CACHE_TYPE_DATA = 1, 111 CACHE_TYPE_DATA = 1,
112 CACHE_TYPE_INST = 2, 112 CACHE_TYPE_INST = 2,
@@ -170,31 +170,31 @@ unsigned short num_cache_leaves;
170 Maybe later */ 170 Maybe later */
171union l1_cache { 171union l1_cache {
172 struct { 172 struct {
173 unsigned line_size : 8; 173 unsigned line_size:8;
174 unsigned lines_per_tag : 8; 174 unsigned lines_per_tag:8;
175 unsigned assoc : 8; 175 unsigned assoc:8;
176 unsigned size_in_kb : 8; 176 unsigned size_in_kb:8;
177 }; 177 };
178 unsigned val; 178 unsigned val;
179}; 179};
180 180
181union l2_cache { 181union l2_cache {
182 struct { 182 struct {
183 unsigned line_size : 8; 183 unsigned line_size:8;
184 unsigned lines_per_tag : 4; 184 unsigned lines_per_tag:4;
185 unsigned assoc : 4; 185 unsigned assoc:4;
186 unsigned size_in_kb : 16; 186 unsigned size_in_kb:16;
187 }; 187 };
188 unsigned val; 188 unsigned val;
189}; 189};
190 190
191union l3_cache { 191union l3_cache {
192 struct { 192 struct {
193 unsigned line_size : 8; 193 unsigned line_size:8;
194 unsigned lines_per_tag : 4; 194 unsigned lines_per_tag:4;
195 unsigned assoc : 4; 195 unsigned assoc:4;
196 unsigned res : 2; 196 unsigned res:2;
197 unsigned size_encoded : 14; 197 unsigned size_encoded:14;
198 }; 198 };
199 unsigned val; 199 unsigned val;
200}; 200};
@@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
241 case 0: 241 case 0:
242 if (!l1->val) 242 if (!l1->val)
243 return; 243 return;
244 assoc = l1->assoc; 244 assoc = assocs[l1->assoc];
245 line_size = l1->line_size; 245 line_size = l1->line_size;
246 lines_per_tag = l1->lines_per_tag; 246 lines_per_tag = l1->lines_per_tag;
247 size_in_kb = l1->size_in_kb; 247 size_in_kb = l1->size_in_kb;
@@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
249 case 2: 249 case 2:
250 if (!l2.val) 250 if (!l2.val)
251 return; 251 return;
252 assoc = l2.assoc; 252 assoc = assocs[l2.assoc];
253 line_size = l2.line_size; 253 line_size = l2.line_size;
254 lines_per_tag = l2.lines_per_tag; 254 lines_per_tag = l2.lines_per_tag;
255 /* cpu_data has errata corrections for K7 applied */ 255 /* cpu_data has errata corrections for K7 applied */
@@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
258 case 3: 258 case 3:
259 if (!l3.val) 259 if (!l3.val)
260 return; 260 return;
261 assoc = l3.assoc; 261 assoc = assocs[l3.assoc];
262 line_size = l3.line_size; 262 line_size = l3.line_size;
263 lines_per_tag = l3.lines_per_tag; 263 lines_per_tag = l3.lines_per_tag;
264 size_in_kb = l3.size_encoded * 512; 264 size_in_kb = l3.size_encoded * 512;
265 if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
266 size_in_kb = size_in_kb >> 1;
267 assoc = assoc >> 1;
268 }
265 break; 269 break;
266 default: 270 default:
267 return; 271 return;
@@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
270 eax->split.is_self_initializing = 1; 274 eax->split.is_self_initializing = 1;
271 eax->split.type = types[leaf]; 275 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 276 eax->split.level = levels[leaf];
273 if (leaf == 3) 277 eax->split.num_threads_sharing = 0;
274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
276 else
277 eax->split.num_threads_sharing = 0;
278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
279 279
280 280
281 if (assoc == 0xf) 281 if (assoc == 0xffff)
282 eax->split.is_fully_associative = 1; 282 eax->split.is_fully_associative = 1;
283 ebx->split.coherency_line_size = line_size - 1; 283 ebx->split.coherency_line_size = line_size - 1;
284 ebx->split.ways_of_associativity = assocs[assoc] - 1; 284 ebx->split.ways_of_associativity = assoc - 1;
285 ebx->split.physical_line_partition = lines_per_tag - 1; 285 ebx->split.physical_line_partition = lines_per_tag - 1;
286 ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / 286 ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
287 (ebx->split.ways_of_associativity + 1) - 1; 287 (ebx->split.ways_of_associativity + 1) - 1;
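
For multi-node (DCM) parts the emulated CPUID-4 data above halves both the reported L3 size and associativity, since each internal node owns half the cache; the assocs[] lookup also now yields actual way counts, with 0xffff marking fully associative. A quick standalone check of the resulting set-count arithmetic (the CPUID register encoding stores this value minus one); the cache geometry in main() is made up for illustration.

#include <stdio.h>

static unsigned int l3_sets(unsigned int size_in_kb, unsigned int line_size,
			    unsigned int assoc, int dual_node)
{
	if (dual_node) {		/* each internal node sees half */
		size_in_kb >>= 1;
		assoc >>= 1;
	}
	return size_in_kb * 1024 / line_size / assoc;
}

int main(void)
{
	/* 6 MB, 64-byte lines, 48-way shared L3 on a dual-node package */
	printf("sets: %u\n", l3_sets(6144, 64, 48, 1));	/* 3 MB / 64 / 24 */
	return 0;
}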
@@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void)
350 350
351unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) 351unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
352{ 352{
353 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ 353 /* Cache sizes */
354 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
354 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ 355 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
355 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ 356 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
356 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; 357 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
@@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
377 378
378 retval = cpuid4_cache_lookup_regs(i, &this_leaf); 379 retval = cpuid4_cache_lookup_regs(i, &this_leaf);
379 if (retval >= 0) { 380 if (retval >= 0) {
380 switch(this_leaf.eax.split.level) { 381 switch (this_leaf.eax.split.level) {
381 case 1: 382 case 1:
382 if (this_leaf.eax.split.type == 383 if (this_leaf.eax.split.type ==
383 CACHE_TYPE_DATA) 384 CACHE_TYPE_DATA)
384 new_l1d = this_leaf.size/1024; 385 new_l1d = this_leaf.size/1024;
@@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
386 CACHE_TYPE_INST) 387 CACHE_TYPE_INST)
387 new_l1i = this_leaf.size/1024; 388 new_l1i = this_leaf.size/1024;
388 break; 389 break;
389 case 2: 390 case 2:
390 new_l2 = this_leaf.size/1024; 391 new_l2 = this_leaf.size/1024;
391 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 392 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
392 index_msb = get_count_order(num_threads_sharing); 393 index_msb = get_count_order(num_threads_sharing);
393 l2_id = c->apicid >> index_msb; 394 l2_id = c->apicid >> index_msb;
394 break; 395 break;
395 case 3: 396 case 3:
396 new_l3 = this_leaf.size/1024; 397 new_l3 = this_leaf.size/1024;
397 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 398 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
398 index_msb = get_count_order(num_threads_sharing); 399 index_msb = get_count_order(
400 num_threads_sharing);
399 l3_id = c->apicid >> index_msb; 401 l3_id = c->apicid >> index_msb;
400 break; 402 break;
401 default: 403 default:
402 break; 404 break;
403 } 405 }
404 } 406 }
@@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
421 /* Number of times to iterate */ 423 /* Number of times to iterate */
422 n = cpuid_eax(2) & 0xFF; 424 n = cpuid_eax(2) & 0xFF;
423 425
424 for ( i = 0 ; i < n ; i++ ) { 426 for (i = 0 ; i < n ; i++) {
425 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); 427 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
426 428
427 /* If bit 31 is set, this is an unknown format */ 429 /* If bit 31 is set, this is an unknown format */
428 for ( j = 0 ; j < 3 ; j++ ) { 430 for (j = 0 ; j < 3 ; j++)
429 if (regs[j] & (1 << 31)) regs[j] = 0; 431 if (regs[j] & (1 << 31))
430 } 432 regs[j] = 0;
431 433
432 /* Byte 0 is level count, not a descriptor */ 434 /* Byte 0 is level count, not a descriptor */
433 for ( j = 1 ; j < 16 ; j++ ) { 435 for (j = 1 ; j < 16 ; j++) {
434 unsigned char des = dp[j]; 436 unsigned char des = dp[j];
435 unsigned char k = 0; 437 unsigned char k = 0;
436 438
437 /* look up this descriptor in the table */ 439 /* look up this descriptor in the table */
438 while (cache_table[k].descriptor != 0) 440 while (cache_table[k].descriptor != 0) {
439 {
440 if (cache_table[k].descriptor == des) { 441 if (cache_table[k].descriptor == des) {
441 if (only_trace && cache_table[k].cache_type != LVL_TRACE) 442 if (only_trace && cache_table[k].cache_type != LVL_TRACE)
442 break; 443 break;
@@ -488,14 +489,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488 } 489 }
489 490
490 if (trace) 491 if (trace)
491 printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); 492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
492 else if ( l1i ) 493 else if (l1i)
493 printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); 494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
494 495
495 if (l1d) 496 if (l1d)
496 printk(", L1 D cache: %dK\n", l1d); 497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
497 else 498 else
498 printk("\n"); 499 printk(KERN_CONT "\n");
499 500
500 if (l2) 501 if (l2)
501 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); 502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
@@ -522,6 +523,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
522 int index_msb, i; 523 int index_msb, i;
523 struct cpuinfo_x86 *c = &cpu_data(cpu); 524 struct cpuinfo_x86 *c = &cpu_data(cpu);
524 525
526 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
527 struct cpuinfo_x86 *d;
528 for_each_online_cpu(i) {
529 if (!per_cpu(cpuid4_info, i))
530 continue;
531 d = &cpu_data(i);
532 this_leaf = CPUID4_INFO_IDX(i, index);
533 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map),
534 d->llc_shared_map);
535 }
536 return;
537 }
525 this_leaf = CPUID4_INFO_IDX(cpu, index); 538 this_leaf = CPUID4_INFO_IDX(cpu, index);
526 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; 539 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
527 540
@@ -558,8 +571,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
558 } 571 }
559} 572}
560#else 573#else
561static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} 574static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
562static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} 575{
576}
577
578static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
579{
580}
563#endif 581#endif
564 582
565static void __cpuinit free_cache_attributes(unsigned int cpu) 583static void __cpuinit free_cache_attributes(unsigned int cpu)
@@ -645,7 +663,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
645static ssize_t show_##file_name \ 663static ssize_t show_##file_name \
646 (struct _cpuid4_info *this_leaf, char *buf) \ 664 (struct _cpuid4_info *this_leaf, char *buf) \
647{ \ 665{ \
648 return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 666 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
649} 667}
650 668
651show_one_plus(level, eax.split.level, 0); 669show_one_plus(level, eax.split.level, 0);
@@ -656,7 +674,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
656 674
657static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 675static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
658{ 676{
659 return sprintf (buf, "%luK\n", this_leaf->size / 1024); 677 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
660} 678}
661 679
662static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, 680static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
@@ -669,7 +687,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
669 const struct cpumask *mask; 687 const struct cpumask *mask;
670 688
671 mask = to_cpumask(this_leaf->shared_cpu_map); 689 mask = to_cpumask(this_leaf->shared_cpu_map);
672 n = type? 690 n = type ?
673 cpulist_scnprintf(buf, len-2, mask) : 691 cpulist_scnprintf(buf, len-2, mask) :
674 cpumask_scnprintf(buf, len-2, mask); 692 cpumask_scnprintf(buf, len-2, mask);
675 buf[n++] = '\n'; 693 buf[n++] = '\n';
@@ -800,7 +818,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 818static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1); 819 show_cache_disable_1, store_cache_disable_1);
802 820
803static struct attribute * default_attrs[] = { 821static struct attribute *default_attrs[] = {
804 &type.attr, 822 &type.attr,
805 &level.attr, 823 &level.attr,
806 &coherency_line_size.attr, 824 &coherency_line_size.attr,
@@ -815,7 +833,7 @@ static struct attribute * default_attrs[] = {
815 NULL 833 NULL
816}; 834};
817 835
818static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 836static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
819{ 837{
820 struct _cache_attr *fattr = to_attr(attr); 838 struct _cache_attr *fattr = to_attr(attr);
821 struct _index_kobject *this_leaf = to_object(kobj); 839 struct _index_kobject *this_leaf = to_object(kobj);
@@ -828,8 +846,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
828 return ret; 846 return ret;
829} 847}
830 848
831static ssize_t store(struct kobject * kobj, struct attribute * attr, 849static ssize_t store(struct kobject *kobj, struct attribute *attr,
832 const char * buf, size_t count) 850 const char *buf, size_t count)
833{ 851{
834 struct _cache_attr *fattr = to_attr(attr); 852 struct _cache_attr *fattr = to_attr(attr);
835 struct _index_kobject *this_leaf = to_object(kobj); 853 struct _index_kobject *this_leaf = to_object(kobj);
@@ -883,7 +901,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
883 goto err_out; 901 goto err_out;
884 902
885 per_cpu(index_kobject, cpu) = kzalloc( 903 per_cpu(index_kobject, cpu) = kzalloc(
886 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 904 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
887 if (unlikely(per_cpu(index_kobject, cpu) == NULL)) 905 if (unlikely(per_cpu(index_kobject, cpu) == NULL))
888 goto err_out; 906 goto err_out;
889 907
@@ -917,7 +935,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
917 } 935 }
918 936
919 for (i = 0; i < num_cache_leaves; i++) { 937 for (i = 0; i < num_cache_leaves; i++) {
920 this_object = INDEX_KOBJECT_PTR(cpu,i); 938 this_object = INDEX_KOBJECT_PTR(cpu, i);
921 this_object->cpu = cpu; 939 this_object->cpu = cpu;
922 this_object->index = i; 940 this_object->index = i;
923 retval = kobject_init_and_add(&(this_object->kobj), 941 retval = kobject_init_and_add(&(this_object->kobj),
@@ -925,9 +943,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
925 per_cpu(cache_kobject, cpu), 943 per_cpu(cache_kobject, cpu),
926 "index%1lu", i); 944 "index%1lu", i);
927 if (unlikely(retval)) { 945 if (unlikely(retval)) {
928 for (j = 0; j < i; j++) { 946 for (j = 0; j < i; j++)
929 kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); 947 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
930 }
931 kobject_put(per_cpu(cache_kobject, cpu)); 948 kobject_put(per_cpu(cache_kobject, cpu));
932 cpuid4_cache_sysfs_exit(cpu); 949 cpuid4_cache_sysfs_exit(cpu);
933 return retval; 950 return retval;
@@ -952,7 +969,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
952 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); 969 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
953 970
954 for (i = 0; i < num_cache_leaves; i++) 971 for (i = 0; i < num_cache_leaves; i++)
955 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 972 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
956 kobject_put(per_cpu(cache_kobject, cpu)); 973 kobject_put(per_cpu(cache_kobject, cpu));
957 cpuid4_cache_sysfs_exit(cpu); 974 cpuid4_cache_sysfs_exit(cpu);
958} 975}
@@ -977,8 +994,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
977 return NOTIFY_OK; 994 return NOTIFY_OK;
978} 995}
979 996
980static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = 997static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = {
981{
982 .notifier_call = cacheinfo_cpu_callback, 998 .notifier_call = cacheinfo_cpu_callback,
983}; 999};
984 1000
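Note on the hunks above: the AMD path now reads the 4-bit CPUID associativity code through a lookup table (assoc = assocs[l2.assoc]) and tests for 0xffff as the fully-associative marker. The table itself is outside the hunks shown; a minimal sketch follows, with values assumed from AMD's CPUID 0x80000006 encoding rather than taken from this patch:

/*
 * Hypothetical decode table: maps the 4-bit CPUID associativity code to a
 * way count, using 0xffff for "fully associative" so that the later
 * `assoc == 0xffff` test in amd_cpuid4() fires.
 */
static const unsigned short assocs[] = {
	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
	[8] = 16, [0xa] = 32, [0xb] = 48,
	[0xc] = 64, [0xd] = 96, [0xe] = 128,
	[0xf] = 0xffff,		/* fully associative */
};

The leaf-3 halving of size_in_kb and assoc under X86_FEATURE_AMD_DCM presumably accounts for CPUID reporting the L3 of the whole multi-chip module rather than of one node.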
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 188a1ca5ad2b..4ac6d48fe11b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,8 @@
1obj-y = mce.o 1obj-y = mce.o mce-severity.o
2 2
3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o 3obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
8obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
9obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 6obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
10obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
11 8
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
deleted file mode 100644
index b945d5dbc609..000000000000
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ /dev/null
@@ -1,116 +0,0 @@
1/*
2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For AMD Athlon/Duron: */
17static void k7_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 1; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57
58 /* Clear it: */
59 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
60 /* Serialize: */
61 wmb();
62 add_taint(TAINT_MACHINE_CHECK);
63 }
64 }
65
66 if (recover & 2)
67 panic("CPU context corrupt");
68 if (recover & 1)
69 panic("Unable to continue");
70
71 printk(KERN_EMERG "Attempting to continue.\n");
72
73 mcgstl &= ~(1<<2);
74 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
75}
76
77
78/* AMD K7 machine check is Intel like: */
79void amd_mcheck_init(struct cpuinfo_x86 *c)
80{
81 u32 l, h;
82 int i;
83
84 if (!cpu_has(c, X86_FEATURE_MCE))
85 return;
86
87 machine_check_vector = k7_machine_check;
88 /* Make sure the vector pointer is visible before we enable MCEs: */
89 wmb();
90
91 printk(KERN_INFO "Intel machine check architecture supported.\n");
92
93 rdmsr(MSR_IA32_MCG_CAP, l, h);
94 if (l & (1<<8)) /* Control register present ? */
95 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
96 nr_mce_banks = l & 0xff;
97
98 /*
99 * Clear status for MC index 0 separately, we don't touch CTL,
100 * as some K7 Athlons cause spurious MCEs when its enabled:
101 */
102 if (boot_cpu_data.x86 == 6) {
103 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
104 i = 1;
105 } else
106 i = 0;
107
108 for (; i < nr_mce_banks; i++) {
109 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
110 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
111 }
112
113 set_in_cr4(X86_CR4_MCE);
114 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
115 smp_processor_id());
116}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a3a235a53f09..7029f0e2acad 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -18,7 +18,12 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/notifier.h>
22#include <linux/kdebug.h>
23#include <linux/cpu.h>
24#include <linux/sched.h>
21#include <asm/mce.h> 25#include <asm/mce.h>
26#include <asm/apic.h>
22 27
23/* Update fake mce registers on current CPU. */ 28/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m) 29static void inject_mce(struct mce *m)
@@ -39,44 +44,141 @@ static void inject_mce(struct mce *m)
39 i->finished = 1; 44 i->finished = 1;
40} 45}
41 46
42struct delayed_mce { 47static void raise_poll(struct mce *m)
43 struct timer_list timer; 48{
44 struct mce m; 49 unsigned long flags;
45}; 50 mce_banks_t b;
46 51
47/* Inject mce on current CPU */ 52 memset(&b, 0xff, sizeof(mce_banks_t));
48static void raise_mce(unsigned long data) 53 local_irq_save(flags);
54 machine_check_poll(0, &b);
55 local_irq_restore(flags);
56 m->finished = 0;
57}
58
59static void raise_exception(struct mce *m, struct pt_regs *pregs)
49{ 60{
50 struct delayed_mce *dm = (struct delayed_mce *)data; 61 struct pt_regs regs;
51 struct mce *m = &dm->m; 62 unsigned long flags;
52 int cpu = m->extcpu;
53 63
54 inject_mce(m); 64 if (!pregs) {
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs)); 65 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip; 66 regs.ip = m->ip;
59 regs.cs = m->cs; 67 regs.cs = m->cs;
68 pregs = &regs;
69 }
 70 /* in mcheck exception handler, irqs will be disabled */

71 local_irq_save(flags);
72 do_machine_check(pregs, 0);
73 local_irq_restore(flags);
74 m->finished = 0;
75}
76
77static cpumask_t mce_inject_cpumask;
78
79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data)
81{
82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs);
90 else if (m->status)
91 raise_poll(m);
92 return NOTIFY_STOP;
93}
94
95static struct notifier_block mce_raise_nb = {
96 .notifier_call = mce_raise_notify,
97 .priority = 1000,
98};
99
100/* Inject mce on current CPU */
101static int raise_local(struct mce *m)
102{
103 int context = MCJ_CTX(m->inject_flags);
104 int ret = 0;
105 int cpu = m->extcpu;
106
107 if (m->inject_flags & MCJ_EXCEPTION) {
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); 108 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0); 109 switch (context) {
110 case MCJ_CTX_IRQ:
111 /*
112 * Could do more to fake interrupts like
113 * calling irq_enter, but the necessary
114 * machinery isn't exported currently.
115 */
116 /*FALL THROUGH*/
117 case MCJ_CTX_PROCESS:
118 raise_exception(m, NULL);
119 break;
120 default:
121 printk(KERN_INFO "Invalid MCE context\n");
122 ret = -EINVAL;
123 }
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); 124 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else { 125 } else if (m->status) {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); 126 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b); 127 raise_poll(m);
68 mce_notify_irq(); 128 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n", 129 printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
70 cpu); 130 } else
71 } 131 m->finished = 0;
72 kfree(dm); 132
133 return ret;
134}
135
136static void raise_mce(struct mce *m)
137{
138 int context = MCJ_CTX(m->inject_flags);
139
140 inject_mce(m);
141
142 if (context == MCJ_CTX_RANDOM)
143 return;
144
145#ifdef CONFIG_X86_LOCAL_APIC
146 if (m->inject_flags & MCJ_NMI_BROADCAST) {
147 unsigned long start;
148 int cpu;
149 get_online_cpus();
150 mce_inject_cpumask = cpu_online_map;
151 cpu_clear(get_cpu(), mce_inject_cpumask);
152 for_each_online_cpu(cpu) {
153 struct mce *mcpu = &per_cpu(injectm, cpu);
154 if (!mcpu->finished ||
155 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
156 cpu_clear(cpu, mce_inject_cpumask);
157 }
158 if (!cpus_empty(mce_inject_cpumask))
159 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR);
160 start = jiffies;
161 while (!cpus_empty(mce_inject_cpumask)) {
162 if (!time_before(jiffies, start + 2*HZ)) {
163 printk(KERN_ERR
164 "Timeout waiting for mce inject NMI %lx\n",
165 *cpus_addr(mce_inject_cpumask));
166 break;
167 }
168 cpu_relax();
169 }
170 raise_local(m);
171 put_cpu();
172 put_online_cpus();
173 } else
174#endif
175 raise_local(m);
73} 176}
74 177
75/* Error injection interface */ 178/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf, 179static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off) 180 size_t usize, loff_t *off)
78{ 181{
79 struct delayed_mce *dm;
80 struct mce m; 182 struct mce m;
81 183
82 if (!capable(CAP_SYS_ADMIN)) 184 if (!capable(CAP_SYS_ADMIN))
@@ -96,19 +198,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) 198 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL; 199 return -EINVAL;
98 200
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /* 201 /*
104 * Need to give user space some time to set everything up, 202 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere. 203 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */ 204 */
108 memcpy(&dm->m, &m, sizeof(struct mce)); 205 schedule_timeout(2);
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm); 206 raise_mce(&m);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize; 207 return usize;
113} 208}
114 209
@@ -116,6 +211,7 @@ static int inject_init(void)
116{ 211{
117 printk(KERN_INFO "Machine check injector initialized\n"); 212 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write; 213 mce_chrdev_ops.write = mce_write;
214 register_die_notifier(&mce_raise_nb);
119 return 0; 215 return 0;
120} 216}
121 217
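With the timer-based injection gone, the injector does its per-CPU work from an NMI handler registered on the die-notifier chain (mce_raise_notify above). For readers unfamiliar with that interface, here is a stripped-down sketch of the pattern; the scaffolding and names are illustrative only and not part of the patch:

#include <linux/notifier.h>
#include <linux/kdebug.h>

static int example_nmi_notify(struct notifier_block *self,
			      unsigned long val, void *data)
{
	if (val != DIE_NMI_IPI)		/* only react to the NMI IPI event */
		return NOTIFY_DONE;	/* let other handlers see it */
	/* per-CPU work goes here, as raise_exception()/raise_poll() do above */
	return NOTIFY_STOP;		/* consumed; stop walking the chain */
}

static struct notifier_block example_nb = {
	.notifier_call	= example_nmi_notify,
	.priority	= 1000,		/* run early, as mce_raise_nb does */
};

/* registered once at init time, e.g. register_die_notifier(&example_nb); */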
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 54dcb8ff12e5..32996f9fab67 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,4 @@
1#include <linux/sysdev.h>
1#include <asm/mce.h> 2#include <asm/mce.h>
2 3
3enum severity_level { 4enum severity_level {
@@ -10,6 +11,20 @@ enum severity_level {
10 MCE_PANIC_SEVERITY, 11 MCE_PANIC_SEVERITY,
11}; 12};
12 13
14#define ATTR_LEN 16
15
16/* One object for each MCE bank, shared by all CPUs */
17struct mce_bank {
18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */
21 char attrname[ATTR_LEN]; /* attribute name */
22};
23
13int mce_severity(struct mce *a, int tolerant, char **msg); 24int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void);
14 26
15extern int mce_ser; 27extern int mce_ser;
28
29extern struct mce_bank *mce_banks;
30
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index ff0807f97056..8a85dd1b1aa1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
139 } 139 }
140} 140}
141 141
142#ifdef CONFIG_DEBUG_FS
142static void *s_start(struct seq_file *f, loff_t *pos) 143static void *s_start(struct seq_file *f, loff_t *pos)
143{ 144{
144 if (*pos >= ARRAY_SIZE(severities)) 145 if (*pos >= ARRAY_SIZE(severities))
@@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void)
197{ 198{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 199 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199 200
200 dmce = debugfs_create_dir("mce", NULL); 201 dmce = mce_get_debugfs_dir();
201 if (dmce == NULL) 202 if (dmce == NULL)
202 goto err_out; 203 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage", 204 fseverities_coverage = debugfs_create_file("severities-coverage",
@@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void)
209 return 0; 210 return 0;
210 211
211err_out: 212err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM; 213 return -ENOMEM;
217} 214}
218late_initcall(severities_debugfs_init); 215late_initcall(severities_debugfs_init);
216#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62f..2f5aab26320e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/debugfs.h>
37 38
38#include <asm/processor.h> 39#include <asm/processor.h>
39#include <asm/hw_irq.h> 40#include <asm/hw_irq.h>
@@ -45,21 +46,8 @@
45 46
46#include "mce-internal.h" 47#include "mce-internal.h"
47 48
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52 smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57 unexpected_machine_check;
58
59int mce_disabled __read_mostly; 49int mce_disabled __read_mostly;
60 50
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR 227 51#define MISC_MCELOG_MINOR 227
64 52
65#define SPINUNIT 100 /* 100ns */ 53#define SPINUNIT 100 /* 100ns */
@@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
77 */ 65 */
78static int tolerant __read_mostly = 1; 66static int tolerant __read_mostly = 1;
79static int banks __read_mostly; 67static int banks __read_mostly;
80static u64 *bank __read_mostly;
81static int rip_msr __read_mostly; 68static int rip_msr __read_mostly;
82static int mce_bootlog __read_mostly = -1; 69static int mce_bootlog __read_mostly = -1;
83static int monarch_timeout __read_mostly = -1; 70static int monarch_timeout __read_mostly = -1;
@@ -87,13 +74,13 @@ int mce_cmci_disabled __read_mostly;
87int mce_ignore_ce __read_mostly; 74int mce_ignore_ce __read_mostly;
88int mce_ser __read_mostly; 75int mce_ser __read_mostly;
89 76
77struct mce_bank *mce_banks __read_mostly;
78
90/* User mode helper program triggered by machine check event */ 79/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify; 80static unsigned long mce_need_notify;
92static char mce_helper[128]; 81static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL }; 82static char *mce_helper_argv[2] = { mce_helper, NULL };
94 83
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 84static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen); 85static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 86static int cpu_missing;
@@ -104,11 +91,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 91 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105}; 92};
106 93
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work); 94static DEFINE_PER_CPU(struct work_struct, mce_work);
113 95
114/* Do initial initialization of a struct mce */ 96/* Do initial initialization of a struct mce */
@@ -183,6 +165,11 @@ void mce_log(struct mce *mce)
183 set_bit(0, &mce_need_notify); 165 set_bit(0, &mce_need_notify);
184} 166}
185 167
168void __weak decode_mce(struct mce *m)
169{
170 return;
171}
172
186static void print_mce(struct mce *m) 173static void print_mce(struct mce *m)
187{ 174{
188 printk(KERN_EMERG 175 printk(KERN_EMERG
@@ -205,6 +192,8 @@ static void print_mce(struct mce *m)
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 192 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 193 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 194 m->apicid);
195
196 decode_mce(m);
208} 197}
209 198
210static void print_mce_head(void) 199static void print_mce_head(void)
@@ -215,13 +204,19 @@ static void print_mce_head(void)
215static void print_mce_tail(void) 204static void print_mce_tail(void)
216{ 205{
217 printk(KERN_EMERG "This is not a software problem!\n" 206 printk(KERN_EMERG "This is not a software problem!\n"
218 "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 207#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
208 "Run through mcelog --ascii to decode and contact your hardware vendor\n"
209#endif
210 );
219} 211}
220 212
221#define PANIC_TIMEOUT 5 /* 5 seconds */ 213#define PANIC_TIMEOUT 5 /* 5 seconds */
222 214
223static atomic_t mce_paniced; 215static atomic_t mce_paniced;
224 216
217static int fake_panic;
218static atomic_t mce_fake_paniced;
219
225/* Panic in progress. Enable interrupts and wait for final IPI */ 220/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void) 221static void wait_for_panic(void)
227{ 222{
@@ -239,15 +234,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
239{ 234{
240 int i; 235 int i;
241 236
242 /* 237 if (!fake_panic) {
243 * Make sure only one CPU runs in machine check panic 238 /*
244 */ 239 * Make sure only one CPU runs in machine check panic
245 if (atomic_add_return(1, &mce_paniced) > 1) 240 */
246 wait_for_panic(); 241 if (atomic_inc_return(&mce_paniced) > 1)
247 barrier(); 242 wait_for_panic();
243 barrier();
248 244
249 bust_spinlocks(1); 245 bust_spinlocks(1);
250 console_verbose(); 246 console_verbose();
247 } else {
248 /* Don't log too much for fake panic */
249 if (atomic_inc_return(&mce_fake_paniced) > 1)
250 return;
251 }
251 print_mce_head(); 252 print_mce_head();
252 /* First print corrected ones that are still unlogged */ 253 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) { 254 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -274,9 +275,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
274 print_mce_tail(); 275 print_mce_tail();
275 if (exp) 276 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp); 277 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0) 278 if (!fake_panic) {
278 panic_timeout = mce_panic_timeout; 279 if (panic_timeout == 0)
279 panic(msg); 280 panic_timeout = mce_panic_timeout;
281 panic(msg);
282 } else
283 printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
280} 284}
281 285
282/* Support code for software error injection */ 286/* Support code for software error injection */
@@ -286,11 +290,11 @@ static int msr_to_offset(u32 msr)
286 unsigned bank = __get_cpu_var(injectm.bank); 290 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr) 291 if (msr == rip_msr)
288 return offsetof(struct mce, ip); 292 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4) 293 if (msr == MSR_IA32_MCx_STATUS(bank))
290 return offsetof(struct mce, status); 294 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4) 295 if (msr == MSR_IA32_MCx_ADDR(bank))
292 return offsetof(struct mce, addr); 296 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4) 297 if (msr == MSR_IA32_MCx_MISC(bank))
294 return offsetof(struct mce, misc); 298 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS) 299 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus); 300 return offsetof(struct mce, mcgstatus);
@@ -495,7 +499,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
495 499
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 500 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) { 501 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b)) 502 if (!mce_banks[i].ctl || !test_bit(i, *b))
499 continue; 503 continue;
500 504
501 m.misc = 0; 505 m.misc = 0;
@@ -504,7 +508,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
504 m.tsc = 0; 508 m.tsc = 0;
505 509
506 barrier(); 510 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 511 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
508 if (!(m.status & MCI_STATUS_VAL)) 512 if (!(m.status & MCI_STATUS_VAL))
509 continue; 513 continue;
510 514
@@ -519,9 +523,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
519 continue; 523 continue;
520 524
521 if (m.status & MCI_STATUS_MISCV) 525 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 526 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
523 if (m.status & MCI_STATUS_ADDRV) 527 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 528 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
525 529
526 if (!(flags & MCP_TIMESTAMP)) 530 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0; 531 m.tsc = 0;
@@ -537,7 +541,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
537 /* 541 /*
538 * Clear state for this bank. 542 * Clear state for this bank.
539 */ 543 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 544 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
541 } 545 }
542 546
543 /* 547 /*
@@ -558,7 +562,7 @@ static int mce_no_way_out(struct mce *m, char **msg)
558 int i; 562 int i;
559 563
560 for (i = 0; i < banks; i++) { 564 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 565 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 566 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1; 567 return 1;
564 } 568 }
@@ -618,7 +622,7 @@ out:
618 * This way we prevent any potential data corruption in an unrecoverable case 622 * This way we prevent any potential data corruption in an unrecoverable case
619 * and also make sure that all CPUs' errors are always examined. 623 * and also make sure that all CPUs' errors are always examined.
620 * 624 *
621 * Also this detects the case of an machine check event coming from outer 625 * Also this detects the case of a machine check event coming from outer
622 * space (not detected by any CPUs) In this case some external agent wants 626 * space (not detected by any CPUs) In this case some external agent wants
623 * us to shut down, so panic too. 627 * us to shut down, so panic too.
624 * 628 *
@@ -671,7 +675,7 @@ static void mce_reign(void)
671 * No machine check event found. Must be some external 675 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic. 676 * source or one CPU is hung. Panic.
673 */ 677 */
674 if (!m && tolerant < 3) 678 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL); 679 mce_panic("Machine check from unknown source", NULL, NULL);
676 680
677 /* 681 /*
@@ -705,7 +709,7 @@ static int mce_start(int *no_way_out)
705 * global_nwo should be updated before mce_callin 709 * global_nwo should be updated before mce_callin
706 */ 710 */
707 smp_wmb(); 711 smp_wmb();
708 order = atomic_add_return(1, &mce_callin); 712 order = atomic_inc_return(&mce_callin);
709 713
710 /* 714 /*
711 * Wait for everyone. 715 * Wait for everyone.
@@ -842,7 +846,7 @@ static void mce_clear_state(unsigned long *toclear)
842 846
843 for (i = 0; i < banks; i++) { 847 for (i = 0; i < banks; i++) {
844 if (test_bit(i, toclear)) 848 if (test_bit(i, toclear))
845 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 849 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
846 } 850 }
847} 851}
848 852
@@ -895,11 +899,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
895 mce_setup(&m); 899 mce_setup(&m);
896 900
897 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 901 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
898 no_way_out = mce_no_way_out(&m, &msg);
899
900 final = &__get_cpu_var(mces_seen); 902 final = &__get_cpu_var(mces_seen);
901 *final = m; 903 *final = m;
902 904
905 no_way_out = mce_no_way_out(&m, &msg);
906
903 barrier(); 907 barrier();
904 908
905 /* 909 /*
@@ -916,14 +920,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
916 order = mce_start(&no_way_out); 920 order = mce_start(&no_way_out);
917 for (i = 0; i < banks; i++) { 921 for (i = 0; i < banks; i++) {
918 __clear_bit(i, toclear); 922 __clear_bit(i, toclear);
919 if (!bank[i]) 923 if (!mce_banks[i].ctl)
920 continue; 924 continue;
921 925
922 m.misc = 0; 926 m.misc = 0;
923 m.addr = 0; 927 m.addr = 0;
924 m.bank = i; 928 m.bank = i;
925 929
926 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 930 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
927 if ((m.status & MCI_STATUS_VAL) == 0) 931 if ((m.status & MCI_STATUS_VAL) == 0)
928 continue; 932 continue;
929 933
@@ -964,9 +968,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
964 kill_it = 1; 968 kill_it = 1;
965 969
966 if (m.status & MCI_STATUS_MISCV) 970 if (m.status & MCI_STATUS_MISCV)
967 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 971 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
968 if (m.status & MCI_STATUS_ADDRV) 972 if (m.status & MCI_STATUS_ADDRV)
969 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 973 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
970 974
971 /* 975 /*
972 * Action optional error. Queue address for later processing. 976 * Action optional error. Queue address for later processing.
@@ -1091,7 +1095,7 @@ void mce_log_therm_throt_event(__u64 status)
1091 */ 1095 */
1092static int check_interval = 5 * 60; /* 5 minutes */ 1096static int check_interval = 5 * 60; /* 5 minutes */
1093 1097
1094static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1098static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1095static DEFINE_PER_CPU(struct timer_list, mce_timer); 1099static DEFINE_PER_CPU(struct timer_list, mce_timer);
1096 1100
1097static void mcheck_timer(unsigned long data) 1101static void mcheck_timer(unsigned long data)
@@ -1110,7 +1114,7 @@ static void mcheck_timer(unsigned long data)
1110 * Alert userspace if needed. If we logged an MCE, reduce the 1114 * Alert userspace if needed. If we logged an MCE, reduce the
1111 * polling interval, otherwise increase the polling interval. 1115 * polling interval, otherwise increase the polling interval.
1112 */ 1116 */
1113 n = &__get_cpu_var(next_interval); 1117 n = &__get_cpu_var(mce_next_interval);
1114 if (mce_notify_irq()) 1118 if (mce_notify_irq())
1115 *n = max(*n/2, HZ/100); 1119 *n = max(*n/2, HZ/100);
1116 else 1120 else
@@ -1159,10 +1163,25 @@ int mce_notify_irq(void)
1159} 1163}
1160EXPORT_SYMBOL_GPL(mce_notify_irq); 1164EXPORT_SYMBOL_GPL(mce_notify_irq);
1161 1165
1166static int mce_banks_init(void)
1167{
1168 int i;
1169
1170 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1171 if (!mce_banks)
1172 return -ENOMEM;
1173 for (i = 0; i < banks; i++) {
1174 struct mce_bank *b = &mce_banks[i];
1175 b->ctl = -1ULL;
1176 b->init = 1;
1177 }
1178 return 0;
1179}
1180
1162/* 1181/*
1163 * Initialize Machine Checks for a CPU. 1182 * Initialize Machine Checks for a CPU.
1164 */ 1183 */
1165static int mce_cap_init(void) 1184static int __cpuinit mce_cap_init(void)
1166{ 1185{
1167 unsigned b; 1186 unsigned b;
1168 u64 cap; 1187 u64 cap;
@@ -1182,11 +1201,10 @@ static int mce_cap_init(void)
1182 /* Don't support asymmetric configurations today */ 1201 /* Don't support asymmetric configurations today */
1183 WARN_ON(banks != 0 && b != banks); 1202 WARN_ON(banks != 0 && b != banks);
1184 banks = b; 1203 banks = b;
1185 if (!bank) { 1204 if (!mce_banks) {
1186 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1205 int err = mce_banks_init();
1187 if (!bank) 1206 if (err)
1188 return -ENOMEM; 1207 return err;
1189 memset(bank, 0xff, banks * sizeof(u64));
1190 } 1208 }
1191 1209
1192 /* Use accurate RIP reporting if available. */ 1210 /* Use accurate RIP reporting if available. */
@@ -1218,15 +1236,16 @@ static void mce_init(void)
1218 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1236 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1219 1237
1220 for (i = 0; i < banks; i++) { 1238 for (i = 0; i < banks; i++) {
1221 if (skip_bank_init(i)) 1239 struct mce_bank *b = &mce_banks[i];
1240 if (!b->init)
1222 continue; 1241 continue;
1223 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1242 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1224 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1243 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1225 } 1244 }
1226} 1245}
1227 1246
1228/* Add per CPU specific workarounds here */ 1247/* Add per CPU specific workarounds here */
1229static int mce_cpu_quirks(struct cpuinfo_x86 *c) 1248static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1230{ 1249{
1231 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1250 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1232 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1251 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1241,7 +1260,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1241 * trips off incorrectly with the IOMMU & 3ware 1260 * trips off incorrectly with the IOMMU & 3ware
1242 * & Cerberus: 1261 * & Cerberus:
1243 */ 1262 */
1244 clear_bit(10, (unsigned long *)&bank[4]); 1263 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1245 } 1264 }
1246 if (c->x86 <= 17 && mce_bootlog < 0) { 1265 if (c->x86 <= 17 && mce_bootlog < 0) {
1247 /* 1266 /*
@@ -1255,7 +1274,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1255 * by default. 1274 * by default.
1256 */ 1275 */
1257 if (c->x86 == 6 && banks > 0) 1276 if (c->x86 == 6 && banks > 0)
1258 bank[0] = 0; 1277 mce_banks[0].ctl = 0;
1259 } 1278 }
1260 1279
1261 if (c->x86_vendor == X86_VENDOR_INTEL) { 1280 if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1268,8 +1287,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1268 * valid event later, merely don't write CTL0. 1287 * valid event later, merely don't write CTL0.
1269 */ 1288 */
1270 1289
1271 if (c->x86 == 6 && c->x86_model < 0x1A) 1290 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1272 __set_bit(0, &dont_init_banks); 1291 mce_banks[0].init = 0;
1273 1292
1274 /* 1293 /*
1275 * All newer Intel systems support MCE broadcasting. Enable 1294 * All newer Intel systems support MCE broadcasting. Enable
@@ -1325,7 +1344,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1325static void mce_init_timer(void) 1344static void mce_init_timer(void)
1326{ 1345{
1327 struct timer_list *t = &__get_cpu_var(mce_timer); 1346 struct timer_list *t = &__get_cpu_var(mce_timer);
1328 int *n = &__get_cpu_var(next_interval); 1347 int *n = &__get_cpu_var(mce_next_interval);
1329 1348
1330 if (mce_ignore_ce) 1349 if (mce_ignore_ce)
1331 return; 1350 return;
@@ -1338,6 +1357,17 @@ static void mce_init_timer(void)
1338 add_timer_on(t, smp_processor_id()); 1357 add_timer_on(t, smp_processor_id());
1339} 1358}
1340 1359
1360/* Handle unconfigured int18 (should never happen) */
1361static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1362{
1363 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1364 smp_processor_id());
1365}
1366
1367/* Call the installed machine check handler for this CPU setup. */
1368void (*machine_check_vector)(struct pt_regs *, long error_code) =
1369 unexpected_machine_check;
1370
1341/* 1371/*
1342 * Called for each booted CPU to set up machine checks. 1372 * Called for each booted CPU to set up machine checks.
1343 * Must be called with preempt off: 1373 * Must be called with preempt off:
@@ -1551,8 +1581,10 @@ static struct miscdevice mce_log_device = {
1551 */ 1581 */
1552static int __init mcheck_enable(char *str) 1582static int __init mcheck_enable(char *str)
1553{ 1583{
1554 if (*str == 0) 1584 if (*str == 0) {
1555 enable_p5_mce(); 1585 enable_p5_mce();
1586 return 1;
1587 }
1556 if (*str == '=') 1588 if (*str == '=')
1557 str++; 1589 str++;
1558 if (!strcmp(str, "off")) 1590 if (!strcmp(str, "off"))
@@ -1593,8 +1625,9 @@ static int mce_disable(void)
1593 int i; 1625 int i;
1594 1626
1595 for (i = 0; i < banks; i++) { 1627 for (i = 0; i < banks; i++) {
1596 if (!skip_bank_init(i)) 1628 struct mce_bank *b = &mce_banks[i];
1597 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1629 if (b->init)
1630 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1598 } 1631 }
1599 return 0; 1632 return 0;
1600} 1633}
@@ -1669,14 +1702,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);
1669__cpuinitdata 1702__cpuinitdata
1670void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1703void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1671 1704
1672static struct sysdev_attribute *bank_attrs; 1705static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1706{
1707 return container_of(attr, struct mce_bank, attr);
1708}
1673 1709
1674static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1710static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1675 char *buf) 1711 char *buf)
1676{ 1712{
1677 u64 b = bank[attr - bank_attrs]; 1713 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1678
1679 return sprintf(buf, "%llx\n", b);
1680} 1714}
1681 1715
1682static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1716static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
@@ -1687,7 +1721,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1687 if (strict_strtoull(buf, 0, &new) < 0) 1721 if (strict_strtoull(buf, 0, &new) < 0)
1688 return -EINVAL; 1722 return -EINVAL;
1689 1723
1690 bank[attr - bank_attrs] = new; 1724 attr_to_bank(attr)->ctl = new;
1691 mce_restart(); 1725 mce_restart();
1692 1726
1693 return size; 1727 return size;
@@ -1829,7 +1863,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1829 } 1863 }
1830 for (j = 0; j < banks; j++) { 1864 for (j = 0; j < banks; j++) {
1831 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1865 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1832 &bank_attrs[j]); 1866 &mce_banks[j].attr);
1833 if (err) 1867 if (err)
1834 goto error2; 1868 goto error2;
1835 } 1869 }
@@ -1838,10 +1872,10 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1838 return 0; 1872 return 0;
1839error2: 1873error2:
1840 while (--j >= 0) 1874 while (--j >= 0)
1841 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); 1875 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1842error: 1876error:
1843 while (--i >= 0) 1877 while (--i >= 0)
1844 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1878 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1845 1879
1846 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1880 sysdev_unregister(&per_cpu(mce_dev, cpu));
1847 1881
@@ -1859,7 +1893,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1859 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1893 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1860 1894
1861 for (i = 0; i < banks; i++) 1895 for (i = 0; i < banks; i++)
1862 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1896 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1863 1897
1864 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1898 sysdev_unregister(&per_cpu(mce_dev, cpu));
1865 cpumask_clear_cpu(cpu, mce_dev_initialized); 1899 cpumask_clear_cpu(cpu, mce_dev_initialized);
@@ -1876,8 +1910,9 @@ static void mce_disable_cpu(void *h)
1876 if (!(action & CPU_TASKS_FROZEN)) 1910 if (!(action & CPU_TASKS_FROZEN))
1877 cmci_clear(); 1911 cmci_clear();
1878 for (i = 0; i < banks; i++) { 1912 for (i = 0; i < banks; i++) {
1879 if (!skip_bank_init(i)) 1913 struct mce_bank *b = &mce_banks[i];
1880 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1914 if (b->init)
1915 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1881 } 1916 }
1882} 1917}
1883 1918
@@ -1892,8 +1927,9 @@ static void mce_reenable_cpu(void *h)
1892 if (!(action & CPU_TASKS_FROZEN)) 1927 if (!(action & CPU_TASKS_FROZEN))
1893 cmci_reenable(); 1928 cmci_reenable();
1894 for (i = 0; i < banks; i++) { 1929 for (i = 0; i < banks; i++) {
1895 if (!skip_bank_init(i)) 1930 struct mce_bank *b = &mce_banks[i];
1896 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1931 if (b->init)
1932 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1897 } 1933 }
1898} 1934}
1899 1935
@@ -1925,7 +1961,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1925 case CPU_DOWN_FAILED: 1961 case CPU_DOWN_FAILED:
1926 case CPU_DOWN_FAILED_FROZEN: 1962 case CPU_DOWN_FAILED_FROZEN:
1927 t->expires = round_jiffies(jiffies + 1963 t->expires = round_jiffies(jiffies +
1928 __get_cpu_var(next_interval)); 1964 __get_cpu_var(mce_next_interval));
1929 add_timer_on(t, cpu); 1965 add_timer_on(t, cpu);
1930 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1966 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1931 break; 1967 break;
@@ -1941,35 +1977,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1941 .notifier_call = mce_cpu_callback, 1977 .notifier_call = mce_cpu_callback,
1942}; 1978};
1943 1979
1944static __init int mce_init_banks(void) 1980static __init void mce_init_banks(void)
1945{ 1981{
1946 int i; 1982 int i;
1947 1983
1948 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1949 GFP_KERNEL);
1950 if (!bank_attrs)
1951 return -ENOMEM;
1952
1953 for (i = 0; i < banks; i++) { 1984 for (i = 0; i < banks; i++) {
1954 struct sysdev_attribute *a = &bank_attrs[i]; 1985 struct mce_bank *b = &mce_banks[i];
1986 struct sysdev_attribute *a = &b->attr;
1955 1987
1956 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1988 a->attr.name = b->attrname;
1957 if (!a->attr.name) 1989 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
1958 goto nomem;
1959 1990
1960 a->attr.mode = 0644; 1991 a->attr.mode = 0644;
1961 a->show = show_bank; 1992 a->show = show_bank;
1962 a->store = set_bank; 1993 a->store = set_bank;
1963 } 1994 }
1964 return 0;
1965
1966nomem:
1967 while (--i >= 0)
1968 kfree(bank_attrs[i].attr.name);
1969 kfree(bank_attrs);
1970 bank_attrs = NULL;
1971
1972 return -ENOMEM;
1973} 1995}
1974 1996
1975static __init int mce_init_device(void) 1997static __init int mce_init_device(void)
@@ -1982,9 +2004,7 @@ static __init int mce_init_device(void)
1982 2004
1983 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2005 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1984 2006
1985 err = mce_init_banks(); 2007 mce_init_banks();
1986 if (err)
1987 return err;
1988 2008
1989 err = sysdev_class_register(&mce_sysclass); 2009 err = sysdev_class_register(&mce_sysclass);
1990 if (err) 2010 if (err)
@@ -2004,57 +2024,65 @@ static __init int mce_init_device(void)
2004 2024
2005device_initcall(mce_init_device); 2025device_initcall(mce_init_device);
2006 2026
2007#else /* CONFIG_X86_OLD_MCE: */ 2027/*
2008 2028 * Old style boot options parsing. Only for compatibility.
2009int nr_mce_banks; 2029 */
2010EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 2030static int __init mcheck_disable(char *str)
2031{
2032 mce_disabled = 1;
2033 return 1;
2034}
2035__setup("nomce", mcheck_disable);
2011 2036
2012/* This has to be run for each processor */ 2037#ifdef CONFIG_DEBUG_FS
2013void mcheck_init(struct cpuinfo_x86 *c) 2038struct dentry *mce_get_debugfs_dir(void)
2014{ 2039{
2015 if (mce_disabled) 2040 static struct dentry *dmce;
2016 return;
2017 2041
2018 switch (c->x86_vendor) { 2042 if (!dmce)
2019 case X86_VENDOR_AMD: 2043 dmce = debugfs_create_dir("mce", NULL);
2020 amd_mcheck_init(c);
2021 break;
2022 2044
2023 case X86_VENDOR_INTEL: 2045 return dmce;
2024 if (c->x86 == 5) 2046}
2025 intel_p5_mcheck_init(c);
2026 if (c->x86 == 6)
2027 intel_p6_mcheck_init(c);
2028 if (c->x86 == 15)
2029 intel_p4_mcheck_init(c);
2030 break;
2031 2047
2032 case X86_VENDOR_CENTAUR: 2048static void mce_reset(void)
2033 if (c->x86 == 5) 2049{
2034 winchip_mcheck_init(c); 2050 cpu_missing = 0;
2035 break; 2051 atomic_set(&mce_fake_paniced, 0);
2052 atomic_set(&mce_executing, 0);
2053 atomic_set(&mce_callin, 0);
2054 atomic_set(&global_nwo, 0);
2055}
2036 2056
2037 default: 2057static int fake_panic_get(void *data, u64 *val)
2038 break; 2058{
2039 } 2059 *val = fake_panic;
2040 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 2060 return 0;
2041} 2061}
2042 2062
2043static int __init mcheck_enable(char *str) 2063static int fake_panic_set(void *data, u64 val)
2044{ 2064{
2045 mce_p5_enabled = 1; 2065 mce_reset();
2046 return 1; 2066 fake_panic = val;
2067 return 0;
2047} 2068}
2048__setup("mce", mcheck_enable);
2049 2069
2050#endif /* CONFIG_X86_OLD_MCE */ 2070DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2071 fake_panic_set, "%llu\n");
2051 2072
2052/* 2073static int __init mce_debugfs_init(void)
2053 * Old style boot options parsing. Only for compatibility.
2054 */
2055static int __init mcheck_disable(char *str)
2056{ 2074{
2057 mce_disabled = 1; 2075 struct dentry *dmce, *ffake_panic;
2058 return 1; 2076
2077 dmce = mce_get_debugfs_dir();
2078 if (!dmce)
2079 return -ENOMEM;
2080 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2081 &fake_panic_fops);
2082 if (!ffake_panic)
2083 return -ENOMEM;
2084
2085 return 0;
2059} 2086}
2060__setup("nomce", mcheck_disable); 2087late_initcall(mce_debugfs_init);
2088#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae21620bda..83a3d1f4efca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
73 73
74#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = { 75static unsigned char shared_bank[NR_BANKS] = {
@@ -489,12 +489,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
489 int i, err = 0; 489 int i, err = 0;
490 struct threshold_bank *b = NULL; 490 struct threshold_bank *b = NULL;
491 char name[32]; 491 char name[32];
492#ifdef CONFIG_SMP
493 struct cpuinfo_x86 *c = &cpu_data(cpu);
494#endif
492 495
493 sprintf(name, "threshold_bank%i", bank); 496 sprintf(name, "threshold_bank%i", bank);
494 497
495#ifdef CONFIG_SMP 498#ifdef CONFIG_SMP
496 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 499 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
497 i = cpumask_first(cpu_core_mask(cpu)); 500 i = cpumask_first(c->llc_shared_map);
498 501
499 /* first core not up yet */ 502 /* first core not up yet */
500 if (cpu_data(i).cpu_core_id) 503 if (cpu_data(i).cpu_core_id)
@@ -514,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
514 if (err) 517 if (err)
515 goto out; 518 goto out;
516 519
517 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 520 cpumask_copy(b->cpus, c->llc_shared_map);
518 per_cpu(threshold_banks, cpu)[bank] = b; 521 per_cpu(threshold_banks, cpu)[bank] = b;
519 522
520 goto out; 523 goto out;
@@ -539,7 +542,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
539#ifndef CONFIG_SMP 542#ifndef CONFIG_SMP
540 cpumask_setall(b->cpus); 543 cpumask_setall(b->cpus);
541#else 544#else
542 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 545 cpumask_copy(b->cpus, c->llc_shared_map);
543#endif 546#endif
544 547
545 per_cpu(threshold_banks, cpu)[bank] = b; 548 per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0f7a32..889f665fe93d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot)
90 if (test_bit(i, owned)) 90 if (test_bit(i, owned))
91 continue; 91 continue;
92 92
93 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 93 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
94 94
95 /* Already owned by someone else? */ 95 /* Already owned by someone else? */
96 if (val & CMCI_EN) { 96 if (val & CMCI_EN) {
@@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot)
101 } 101 }
102 102
103 val |= CMCI_EN | CMCI_THRESHOLD; 103 val |= CMCI_EN | CMCI_THRESHOLD;
104 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 104 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
105 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 105 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
106 106
107 /* Did the enable bit stick? -- the bank supports CMCI */ 107 /* Did the enable bit stick? -- the bank supports CMCI */
108 if (val & CMCI_EN) { 108 if (val & CMCI_EN) {
@@ -152,9 +152,9 @@ void cmci_clear(void)
152 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 152 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
153 continue; 153 continue;
154 /* Disable CMCI */ 154 /* Disable CMCI */
155 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 155 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
157 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 157 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
158 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 158 __clear_bit(i, __get_cpu_var(mce_banks_owned));
159 } 159 }
160 spin_unlock_irqrestore(&cmci_discover_lock, flags); 160 spin_unlock_irqrestore(&cmci_discover_lock, flags);
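
Both CMCI hunks replace the open-coded MSR_IA32_MC0_CTL2 + i with the MSR_IA32_MCx_CTL2(i) helper. A user-space sketch of what such a helper amounts to; the 0x280 base follows the architectural IA32_MCi_CTL2 numbering and is shown only for illustration, the authoritative definition lives in <asm/msr-index.h>:

    /*
     * The per-bank CTL2 MSRs are contiguous, so bank i lives at 0x280 + i.
     * Nothing here touches real MSRs; it only prints the computed indices.
     */
    #include <stdio.h>

    #define MSR_IA32_MC0_CTL2	0x280
    #define MSR_IA32_MCx_CTL2(x)	(MSR_IA32_MC0_CTL2 + (x))

    int main(void)
    {
    	int bank;

    	for (bank = 0; bank < 4; bank++)
    		printf("bank %d: IA32_MC%d_CTL2 = 0x%x\n",
    		       bank, bank, MSR_IA32_MCx_CTL2(bank));
    	return 0;
    }
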
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
deleted file mode 100644
index f5f2d6f71fb6..000000000000
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ /dev/null
@@ -1,94 +0,0 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9#include <linux/interrupt.h>
10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17
18#include <asm/processor.h>
19#include <asm/system.h>
20#include <asm/mce.h>
21#include <asm/msr.h>
22
23static int firstbank;
24
25#define MCE_RATE (15*HZ) /* timer rate is 15s */
26
27static void mce_checkregs(void *info)
28{
29 u32 low, high;
30 int i;
31
32 for (i = firstbank; i < nr_mce_banks; i++) {
33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
34
35 if (!(high & (1<<31)))
36 continue;
37
38 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
39 "correctable incident occurred on CPU %d.\n",
40 smp_processor_id());
41
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
43
44 /*
45 * Scrub the error so we don't pick it up in MCE_RATE
46 * seconds time:
47 */
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
49
50 /* Serialize: */
51 wmb();
52 add_taint(TAINT_MACHINE_CHECK);
53 }
54}
55
56static void mce_work_fn(struct work_struct *work);
57static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
58
59static void mce_work_fn(struct work_struct *work)
60{
61 on_each_cpu(mce_checkregs, NULL, 1);
62 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
63}
64
65static int __init init_nonfatal_mce_checker(void)
66{
67 struct cpuinfo_x86 *c = &boot_cpu_data;
68
69 /* Check for MCE support */
70 if (!cpu_has(c, X86_FEATURE_MCE))
71 return -ENODEV;
72
73 /* Check for PPro style MCA */
74 if (!cpu_has(c, X86_FEATURE_MCA))
75 return -ENODEV;
76
77 /* Some Athlons misbehave when we frob bank 0 */
78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
79 boot_cpu_data.x86 == 6)
80 firstbank = 1;
81 else
82 firstbank = 0;
83
84 /*
85 * Check for non-fatal errors every MCE_RATE s
86 */
87 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89
90 return 0;
91}
92module_init(init_nonfatal_mce_checker);
93
94MODULE_LICENSE("GPL");
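
The deleted poller above walks the machine-check banks at MSR_IA32_MC0_STATUS + i*4 (each bank owns four MSRs: CTL, STATUS, ADDR, MISC) and treats bit 31 of the high dword, i.e. bit 63 of the full register, as the VAL "an error is logged" flag. A minimal user-space sketch of that addressing and test, with an illustrative status value and no real MSR access:

    #include <stdio.h>
    #include <stdint.h>

    #define MSR_IA32_MC0_STATUS	0x401
    #define MC_STATUS(bank)	(MSR_IA32_MC0_STATUS + (bank) * 4)

    static int bank_has_error(uint32_t high)
    {
    	return (high & (1u << 31)) != 0;	/* VAL bit of MCi_STATUS */
    }

    int main(void)
    {
    	uint32_t high = 0x94000000;		/* made-up status, VAL set */
    	int bank = 2;

    	printf("bank %d status MSR: 0x%x, error logged: %s\n",
    	       bank, MC_STATUS(bank), bank_has_error(high) ? "yes" : "no");
    	return 0;
    }
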
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
deleted file mode 100644
index 4482aea9aa2e..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ /dev/null
@@ -1,163 +0,0 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/smp.h>
8
9#include <asm/processor.h>
10#include <asm/mce.h>
11#include <asm/msr.h>
12
13/* as supported by the P4/Xeon family */
14struct intel_mce_extended_msrs {
15 u32 eax;
16 u32 ebx;
17 u32 ecx;
18 u32 edx;
19 u32 esi;
20 u32 edi;
21 u32 ebp;
22 u32 esp;
23 u32 eflags;
24 u32 eip;
25 /* u32 *reserved[]; */
26};
27
28static int mce_num_extended_msrs;
29
30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
32{
33 u32 h;
34
35 rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
36 rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
37 rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
38 rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
39 rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
40 rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
41 rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
42 rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
43 rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
44 rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
45}
46
47static void intel_machine_check(struct pt_regs *regs, long error_code)
48{
49 u32 alow, ahigh, high, low;
50 u32 mcgstl, mcgsth;
51 int recover = 1;
52 int i;
53
54 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
55 if (mcgstl & (1<<0)) /* Recoverable ? */
56 recover = 0;
57
58 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
59 smp_processor_id(), mcgsth, mcgstl);
60
61 if (mce_num_extended_msrs > 0) {
62 struct intel_mce_extended_msrs dbg;
63
64 intel_get_extended_msrs(&dbg);
65
66 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
67 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
68 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
69 smp_processor_id(), dbg.eip, dbg.eflags,
70 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
71 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
72 }
73
74 for (i = 0; i < nr_mce_banks; i++) {
75 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
76 if (high & (1<<31)) {
77 char misc[20];
78 char addr[24];
79
80 misc[0] = addr[0] = '\0';
81 if (high & (1<<29))
82 recover |= 1;
83 if (high & (1<<25))
84 recover |= 2;
85 high &= ~(1<<31);
86 if (high & (1<<27)) {
87 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
88 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
89 }
90 if (high & (1<<26)) {
91 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
92 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
93 }
94 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
95 smp_processor_id(), i, high, low, misc, addr);
96 }
97 }
98
99 if (recover & 2)
100 panic("CPU context corrupt");
101 if (recover & 1)
102 panic("Unable to continue");
103
104 printk(KERN_EMERG "Attempting to continue.\n");
105
106 /*
107 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
108 * recoverable/continuable.This will allow BIOS to look at the MSRs
109 * for errors if the OS could not log the error.
110 */
111 for (i = 0; i < nr_mce_banks; i++) {
112 u32 msr;
113 msr = MSR_IA32_MC0_STATUS+i*4;
114 rdmsr(msr, low, high);
115 if (high&(1<<31)) {
116 /* Clear it */
117 wrmsr(msr, 0UL, 0UL);
118 /* Serialize */
119 wmb();
120 add_taint(TAINT_MACHINE_CHECK);
121 }
122 }
123 mcgstl &= ~(1<<2);
124 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
125}
126
127void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
128{
129 u32 l, h;
130 int i;
131
132 machine_check_vector = intel_machine_check;
133 wmb();
134
135 printk(KERN_INFO "Intel machine check architecture supported.\n");
136 rdmsr(MSR_IA32_MCG_CAP, l, h);
137 if (l & (1<<8)) /* Control register present ? */
138 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
139 nr_mce_banks = l & 0xff;
140
141 for (i = 0; i < nr_mce_banks; i++) {
142 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
143 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
144 }
145
146 set_in_cr4(X86_CR4_MCE);
147 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
148 smp_processor_id());
149
150 /* Check for P4/Xeon extended MCE MSRs */
151 rdmsr(MSR_IA32_MCG_CAP, l, h);
152 if (l & (1<<9)) {/* MCG_EXT_P */
153 mce_num_extended_msrs = (l >> 16) & 0xff;
154 printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
155 " available\n",
156 smp_processor_id(), mce_num_extended_msrs);
157
158#ifdef CONFIG_X86_MCE_P4THERMAL
159 /* Check for P4/Xeon Thermal monitor */
160 intel_init_thermal(c);
161#endif
162 }
163}
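
The deleted P4 handler classifies each bank by bits in the high dword of MCi_STATUS: bit 31 valid, bit 29 uncorrected (recover |= 1), bit 27 MISC valid, bit 26 ADDR valid, bit 25 processor-context-corrupt (recover |= 2); these are bits 63, 61, 59, 58 and 57 of the 64-bit register. A small standalone decoder mirroring that test, fed a made-up status value:

    #include <stdio.h>
    #include <stdint.h>

    struct mc_status {
    	int valid;		/* an error is logged        */
    	int uncorrected;	/* uncorrected error (UC)    */
    	int pcc;		/* processor context corrupt */
    	int has_misc;		/* MCi_MISC holds data       */
    	int has_addr;		/* MCi_ADDR holds data       */
    };

    static struct mc_status decode_high(uint32_t high)
    {
    	struct mc_status s = {
    		.valid       = !!(high & (1u << 31)),
    		.uncorrected = !!(high & (1u << 29)),
    		.has_misc    = !!(high & (1u << 27)),
    		.has_addr    = !!(high & (1u << 26)),
    		.pcc         = !!(high & (1u << 25)),
    	};
    	return s;
    }

    int main(void)
    {
    	struct mc_status s = decode_high(0xb6000000);	/* example value */

    	printf("valid=%d uc=%d pcc=%d miscv=%d addrv=%d\n",
    	       s.valid, s.uncorrected, s.pcc, s.has_misc, s.has_addr);
    	return 0;
    }
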
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
deleted file mode 100644
index 01e4f8178183..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For PII/PIII */
17static void intel_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 0; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57 }
58 }
59
60 if (recover & 2)
61 panic("CPU context corrupt");
62 if (recover & 1)
63 panic("Unable to continue");
64
65 printk(KERN_EMERG "Attempting to continue.\n");
66 /*
67 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
68 * recoverable/continuable.This will allow BIOS to look at the MSRs
69 * for errors if the OS could not log the error:
70 */
71 for (i = 0; i < nr_mce_banks; i++) {
72 unsigned int msr;
73
74 msr = MSR_IA32_MC0_STATUS+i*4;
75 rdmsr(msr, low, high);
76 if (high & (1<<31)) {
77 /* Clear it: */
78 wrmsr(msr, 0UL, 0UL);
79 /* Serialize: */
80 wmb();
81 add_taint(TAINT_MACHINE_CHECK);
82 }
83 }
84 mcgstl &= ~(1<<2);
85 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
86}
87
88/* Set up machine check reporting for processors with Intel style MCE: */
89void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
90{
91 u32 l, h;
92 int i;
93
94 /* Check for MCE support */
95 if (!cpu_has(c, X86_FEATURE_MCE))
96 return;
97
98 /* Check for PPro style MCA */
99 if (!cpu_has(c, X86_FEATURE_MCA))
100 return;
101
102 /* Ok machine check is available */
103 machine_check_vector = intel_machine_check;
104 /* Make sure the vector pointer is visible before we enable MCEs: */
105 wmb();
106
107 printk(KERN_INFO "Intel machine check architecture supported.\n");
108 rdmsr(MSR_IA32_MCG_CAP, l, h);
109 if (l & (1<<8)) /* Control register present ? */
110 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
111 nr_mce_banks = l & 0xff;
112
113 /*
114 * Following the example in IA-32 SDM Vol 3:
115 * - MC0_CTL should not be written
116 * - Status registers on all banks should be cleared on reset
117 */
118 for (i = 1; i < nr_mce_banks; i++)
119 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
120
121 for (i = 0; i < nr_mce_banks; i++)
122 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
123
124 set_in_cr4(X86_CR4_MCE);
125 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
126 smp_processor_id());
127}
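
Both deleted init routines decode IA32_MCG_CAP the same way: bits 7:0 give the bank count, bit 8 says an MCG_CTL register exists, and on the P4 path bit 9 plus bits 23:16 describe the extended state MSRs. A standalone sketch of that decode, using a made-up capability value:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint32_t cap = 0x00090306;		/* made-up MCG_CAP value */
    	unsigned banks     = cap & 0xff;	/* bits  7:0  */
    	int ctl_present    = !!(cap & (1u << 8));
    	int ext_present    = !!(cap & (1u << 9));
    	unsigned ext_count = (cap >> 16) & 0xff;/* bits 23:16 */

    	printf("%u banks, MCG_CTL %spresent, %u extended MSRs\n",
    	       banks, ctl_present ? "" : "not ",
    	       ext_present ? ext_count : 0);
    	return 0;
    }
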
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 5957a93e5173..63a56d147e4a 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -260,9 +260,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
260 return; 260 return;
261 } 261 }
262 262
263 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
264 tm2 = 1;
265
266 /* Check whether a vector already exists */ 263 /* Check whether a vector already exists */
267 if (h & APIC_VECTOR_MASK) { 264 if (h & APIC_VECTOR_MASK) {
268 printk(KERN_DEBUG 265 printk(KERN_DEBUG
@@ -271,6 +268,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
271 return; 268 return;
272 } 269 }
273 270
271 /* early Pentium M models use different method for enabling TM2 */
272 if (cpu_has(c, X86_FEATURE_TM2)) {
273 if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
274 rdmsr(MSR_THERM2_CTL, l, h);
275 if (l & MSR_THERM2_CTL_TM_SELECT)
276 tm2 = 1;
277 } else if (l & MSR_IA32_MISC_ENABLE_TM2)
278 tm2 = 1;
279 }
280
274 /* We'll mask the thermal vector in the lapic till we're ready: */ 281 /* We'll mask the thermal vector in the lapic till we're ready: */
275 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; 282 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
276 apic_write(APIC_LVTTHMR, h); 283 apic_write(APIC_LVTTHMR, h);
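
The new TM2 check reads MSR_THERM2_CTL on early Pentium M (family 6, models 9 and 13) and falls back to the TM2 bit in IA32_MISC_ENABLE elsewhere. A sketch of that decision as a pure function over already-read register values; the bit positions used below are assumptions for illustration only, and the X86_FEATURE_TM2 gate present in the patch is omitted:

    #include <stdio.h>
    #include <stdint.h>

    #define THERM2_CTL_TM_SELECT	(1u << 16)	/* assumed bit position */
    #define MISC_ENABLE_TM2		(1u << 13)	/* assumed bit position */

    static int tm2_enabled(unsigned family, unsigned model,
    			   uint32_t therm2_ctl, uint32_t misc_enable)
    {
    	/* Early Pentium M (family 6, models 9/13) signal TM2 differently. */
    	if (family == 6 && (model == 9 || model == 13))
    		return !!(therm2_ctl & THERM2_CTL_TM_SELECT);

    	return !!(misc_enable & MISC_ENABLE_TM2);
    }

    int main(void)
    {
    	printf("Pentium M example:   tm2=%d\n",
    	       tm2_enabled(6, 13, THERM2_CTL_TM_SELECT, 0));
    	printf("other model example: tm2=%d\n",
    	       tm2_enabled(6, 23, 0, MISC_ENABLE_TM2));
    	return 0;
    }
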
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index ee2331b0e58f..33af14110dfd 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -7,15 +7,15 @@
7 7
8static void 8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base, 9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned long *size, mtrr_type * type) 10 unsigned long *size, mtrr_type *type)
11{ 11{
12 unsigned long low, high; 12 unsigned long low, high;
13 13
14 rdmsr(MSR_K6_UWCCR, low, high); 14 rdmsr(MSR_K6_UWCCR, low, high);
15 /* Upper dword is region 1, lower is region 0 */ 15 /* Upper dword is region 1, lower is region 0 */
16 if (reg == 1) 16 if (reg == 1)
17 low = high; 17 low = high;
18 /* The base masks off on the right alignment */ 18 /* The base masks off on the right alignment */
19 *base = (low & 0xFFFE0000) >> PAGE_SHIFT; 19 *base = (low & 0xFFFE0000) >> PAGE_SHIFT;
20 *type = 0; 20 *type = 0;
21 if (low & 1) 21 if (low & 1)
@@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base,
27 return; 27 return;
28 } 28 }
29 /* 29 /*
30 * This needs a little explaining. The size is stored as an 30 * This needs a little explaining. The size is stored as an
31 * inverted mask of bits of 128K granularity 15 bits long offset 31 * inverted mask of bits of 128K granularity 15 bits long offset
32 * 2 bits 32 * 2 bits.
33 * 33 *
34 * So to get a size we do invert the mask and add 1 to the lowest 34 * So to get a size we do invert the mask and add 1 to the lowest
35 * mask bit (4 as its 2 bits in). This gives us a size we then shift 35 * mask bit (4 as its 2 bits in). This gives us a size we then shift
36 * to turn into 128K blocks 36 * to turn into 128K blocks.
37 * 37 *
38 * eg 111 1111 1111 1100 is 512K 38 * eg 111 1111 1111 1100 is 512K
39 * 39 *
40 * invert 000 0000 0000 0011 40 * invert 000 0000 0000 0011
41 * +1 000 0000 0000 0100 41 * +1 000 0000 0000 0100
42 * *128K ... 42 * *128K ...
43 */ 43 */
44 low = (~low) & 0x1FFFC; 44 low = (~low) & 0x1FFFC;
45 *size = (low + 4) << (15 - PAGE_SHIFT); 45 *size = (low + 4) << (15 - PAGE_SHIFT);
46 return;
47} 46}
48 47
49static void amd_set_mtrr(unsigned int reg, unsigned long base, 48/**
50 unsigned long size, mtrr_type type) 49 * amd_set_mtrr - Set variable MTRR register on the local CPU.
51/* [SUMMARY] Set variable MTRR register on the local CPU. 50 *
52 <reg> The register to set. 51 * @reg The register to set.
53 <base> The base address of the region. 52 * @base The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled. 53 * @size The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region. 54 * @type The type of the region.
56 [RETURNS] Nothing. 55 *
57*/ 56 * Returns nothing.
57 */
58static void
59amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
58{ 60{
59 u32 regs[2]; 61 u32 regs[2];
60 62
61 /* 63 /*
62 * Low is MTRR0 , High MTRR 1 64 * Low is MTRR0, High MTRR 1
63 */ 65 */
64 rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); 66 rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
65 /* 67 /*
66 * Blank to disable 68 * Blank to disable
67 */ 69 */
68 if (size == 0) 70 if (size == 0) {
69 regs[reg] = 0; 71 regs[reg] = 0;
70 else 72 } else {
71 /* Set the register to the base, the type (off by one) and an 73 /*
72 inverted bitmask of the size The size is the only odd 74 * Set the register to the base, the type (off by one) and an
73 bit. We are fed say 512K We invert this and we get 111 1111 75 * inverted bitmask of the size The size is the only odd
74 1111 1011 but if you subtract one and invert you get the 76 * bit. We are fed say 512K We invert this and we get 111 1111
75 desired 111 1111 1111 1100 mask 77 * 1111 1011 but if you subtract one and invert you get the
76 78 * desired 111 1111 1111 1100 mask
77 But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ 79 *
80 * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks!
81 */
78 regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) 82 regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
79 | (base << PAGE_SHIFT) | (type + 1); 83 | (base << PAGE_SHIFT) | (type + 1);
84 }
80 85
81 /* 86 /*
82 * The writeback rule is quite specific. See the manual. Its 87 * The writeback rule is quite specific. See the manual. Its
83 * disable local interrupts, write back the cache, set the mtrr 88 * disable local interrupts, write back the cache, set the mtrr
84 */ 89 */
85 wbinvd(); 90 wbinvd();
86 wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); 91 wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
87} 92}
88 93
89static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) 94static int
95amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
90{ 96{
91 /* Apply the K6 block alignment and size rules 97 /*
92 In order 98 * Apply the K6 block alignment and size rules
93 o Uncached or gathering only 99 * In order
94 o 128K or bigger block 100 * o Uncached or gathering only
95 o Power of 2 block 101 * o 128K or bigger block
96 o base suitably aligned to the power 102 * o Power of 2 block
97 */ 103 * o base suitably aligned to the power
104 */
98 if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) 105 if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
99 || (size & ~(size - 1)) - size || (base & (size - 1))) 106 || (size & ~(size - 1)) - size || (base & (size - 1)))
100 return -EINVAL; 107 return -EINVAL;
@@ -115,5 +122,3 @@ int __init amd_init_mtrr(void)
115 set_mtrr_ops(&amd_mtrr_ops); 122 set_mtrr_ops(&amd_mtrr_ops);
116 return 0; 123 return 0;
117} 124}
118
119//arch_initcall(amd_mtrr_init);
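
The comments reflowed above describe the K6 size encoding: an inverted mask of 128K blocks held in a 15-bit field at offset 2, written as -size because ~(x - 1) == -x. A user-space round trip of exactly that arithmetic for the 512K example used in the comment; nothing is written to real MSRs:

    #include <stdio.h>
    #include <assert.h>

    #define PAGE_SHIFT 12

    static unsigned long encode_size(unsigned long size_pages)
    {
    	/* same expression as amd_set_mtrr(), minus the base and type bits */
    	return (-size_pages >> (15 - PAGE_SHIFT)) & 0x0001FFFC;
    }

    static unsigned long decode_size(unsigned long low)
    {
    	unsigned long field = (~low) & 0x1FFFC;

    	return (field + 4) << (15 - PAGE_SHIFT);	/* in pages */
    }

    int main(void)
    {
    	unsigned long size_pages = (512 * 1024) >> PAGE_SHIFT;	/* 512K */
    	unsigned long low = encode_size(size_pages);

    	printf("512K encodes to 0x%05lx, decodes back to %lu pages\n",
    	       low, decode_size(low));
    	assert(decode_size(low) == size_pages);

    	/* the two's-complement identity the comment relies on */
    	assert(~(size_pages - 1) == (unsigned long)-size_pages);
    	return 0;
    }

Running it prints 0x1fff0 for the stored field, which is the 111 1111 1111 1100 mask from the comment shifted into its offset-2 position.
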
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index cb9aa3a7a7ab..de89f14eff3a 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -1,7 +1,9 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/mm.h> 2#include <linux/mm.h>
3
3#include <asm/mtrr.h> 4#include <asm/mtrr.h>
4#include <asm/msr.h> 5#include <asm/msr.h>
6
5#include "mtrr.h" 7#include "mtrr.h"
6 8
7static struct { 9static struct {
@@ -12,25 +14,25 @@ static struct {
12static u8 centaur_mcr_reserved; 14static u8 centaur_mcr_reserved;
13static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ 15static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
14 16
15/* 17/**
16 * Report boot time MCR setups 18 * centaur_get_free_region - Get a free MTRR.
19 *
20 * @base: The starting (base) address of the region.
21 * @size: The size (in bytes) of the region.
22 *
23 * Returns: the index of the region on success, else -1 on error.
17 */ 24 */
18
19static int 25static int
20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) 26centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region.
24 [RETURNS] The index of the region on success, else -1 on error.
25*/
26{ 27{
27 int i, max;
28 mtrr_type ltype;
29 unsigned long lbase, lsize; 28 unsigned long lbase, lsize;
29 mtrr_type ltype;
30 int i, max;
30 31
31 max = num_var_ranges; 32 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max) 33 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg; 34 return replace_reg;
35
34 for (i = 0; i < max; ++i) { 36 for (i = 0; i < max; ++i) {
35 if (centaur_mcr_reserved & (1 << i)) 37 if (centaur_mcr_reserved & (1 << i))
36 continue; 38 continue;
@@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
38 if (lsize == 0) 40 if (lsize == 0)
39 return i; 41 return i;
40 } 42 }
43
41 return -ENOSPC; 44 return -ENOSPC;
42} 45}
43 46
44void 47/*
45mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) 48 * Report boot time MCR setups
49 */
50void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
46{ 51{
47 centaur_mcr[mcr].low = lo; 52 centaur_mcr[mcr].low = lo;
48 centaur_mcr[mcr].high = hi; 53 centaur_mcr[mcr].high = hi;
@@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base,
54{ 59{
55 *base = centaur_mcr[reg].high >> PAGE_SHIFT; 60 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; 61 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
57 *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ 62 *type = MTRR_TYPE_WRCOMB; /* write-combining */
63
58 if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) 64 if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
59 *type = MTRR_TYPE_UNCACHABLE; 65 *type = MTRR_TYPE_UNCACHABLE;
60 if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) 66 if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
61 *type = MTRR_TYPE_WRBACK; 67 *type = MTRR_TYPE_WRBACK;
62 if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) 68 if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
63 *type = MTRR_TYPE_WRBACK; 69 *type = MTRR_TYPE_WRBACK;
64
65} 70}
66 71
67static void centaur_set_mcr(unsigned int reg, unsigned long base, 72static void
68 unsigned long size, mtrr_type type) 73centaur_set_mcr(unsigned int reg, unsigned long base,
74 unsigned long size, mtrr_type type)
69{ 75{
70 unsigned long low, high; 76 unsigned long low, high;
71 77
72 if (size == 0) { 78 if (size == 0) {
73 /* Disable */ 79 /* Disable */
74 high = low = 0; 80 high = low = 0;
75 } else { 81 } else {
76 high = base << PAGE_SHIFT; 82 high = base << PAGE_SHIFT;
77 if (centaur_mcr_type == 0) 83 if (centaur_mcr_type == 0) {
78 low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ 84 /* Only support write-combining... */
79 else { 85 low = -size << PAGE_SHIFT | 0x1f;
86 } else {
80 if (type == MTRR_TYPE_UNCACHABLE) 87 if (type == MTRR_TYPE_UNCACHABLE)
81 low = -size << PAGE_SHIFT | 0x02; /* NC */ 88 low = -size << PAGE_SHIFT | 0x02; /* NC */
82 else 89 else
83 low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ 90 low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */
84 } 91 }
85 } 92 }
86 centaur_mcr[reg].high = high; 93 centaur_mcr[reg].high = high;
@@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base,
88 wrmsr(MSR_IDT_MCR0 + reg, low, high); 95 wrmsr(MSR_IDT_MCR0 + reg, low, high);
89} 96}
90 97
91#if 0 98static int
92/* 99centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
93 * Initialise the later (saner) Winchip MCR variant. In this version
94 * the BIOS can pass us the registers it has used (but not their values)
95 * and the control register is read/write
96 */
97
98static void __init
99centaur_mcr1_init(void)
100{
101 unsigned i;
102 u32 lo, hi;
103
104 /* Unfortunately, MCR's are read-only, so there is no way to
105 * find out what the bios might have done.
106 */
107
108 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
109 if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */
110 lo &= ~0x1C0; /* clear key */
111 lo |= 0x040; /* set key to 1 */
112 wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */
113 }
114
115 centaur_mcr_type = 1;
116
117 /*
118 * Clear any unconfigured MCR's.
119 */
120
121 for (i = 0; i < 8; ++i) {
122 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) {
123 if (!(lo & (1 << (9 + i))))
124 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
125 else
126 /*
127 * If the BIOS set up an MCR we cannot see it
128 * but we don't wish to obliterate it
129 */
130 centaur_mcr_reserved |= (1 << i);
131 }
132 }
133 /*
134 * Throw the main write-combining switch...
135 * However if OOSTORE is enabled then people have already done far
136 * cleverer things and we should behave.
137 */
138
139 lo |= 15; /* Write combine enables */
140 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
141}
142
143/*
144 * Initialise the original winchip with read only MCR registers
145 * no used bitmask for the BIOS to pass on and write only control
146 */
147
148static void __init
149centaur_mcr0_init(void)
150{
151 unsigned i;
152
153 /* Unfortunately, MCR's are read-only, so there is no way to
154 * find out what the bios might have done.
155 */
156
157 /* Clear any unconfigured MCR's.
158 * This way we are sure that the centaur_mcr array contains the actual
159 * values. The disadvantage is that any BIOS tweaks are thus undone.
160 *
161 */
162 for (i = 0; i < 8; ++i) {
163 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0)
164 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
165 }
166
167 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */
168}
169
170/*
171 * Initialise Winchip series MCR registers
172 */
173
174static void __init
175centaur_mcr_init(void)
176{
177 struct set_mtrr_context ctxt;
178
179 set_mtrr_prepare_save(&ctxt);
180 set_mtrr_cache_disable(&ctxt);
181
182 if (boot_cpu_data.x86_model == 4)
183 centaur_mcr0_init();
184 else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9)
185 centaur_mcr1_init();
186
187 set_mtrr_done(&ctxt);
188}
189#endif
190
191static int centaur_validate_add_page(unsigned long base,
192 unsigned long size, unsigned int type)
193{ 100{
194 /* 101 /*
195 * FIXME: Winchip2 supports uncached 102 * FIXME: Winchip2 supports uncached
196 */ 103 */
197 if (type != MTRR_TYPE_WRCOMB && 104 if (type != MTRR_TYPE_WRCOMB &&
198 (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { 105 (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
199 printk(KERN_WARNING 106 pr_warning("mtrr: only write-combining%s supported\n",
200 "mtrr: only write-combining%s supported\n", 107 centaur_mcr_type ? " and uncacheable are" : " is");
201 centaur_mcr_type ? " and uncacheable are"
202 : " is");
203 return -EINVAL; 108 return -EINVAL;
204 } 109 }
205 return 0; 110 return 0;
@@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base,
207 112
208static struct mtrr_ops centaur_mtrr_ops = { 113static struct mtrr_ops centaur_mtrr_ops = {
209 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
210// .init = centaur_mcr_init,
211 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
212 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
213 .get_free_region = centaur_get_free_region, 117 .get_free_region = centaur_get_free_region,
@@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void)
220 set_mtrr_ops(&centaur_mtrr_ops); 124 set_mtrr_ops(&centaur_mtrr_ops);
221 return 0; 125 return 0;
222} 126}
223
224//arch_initcall(centaur_init_mtrr);
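
centaur_set_mcr() and centaur_get_mcr() above store the base address in the high word and -size plus low flag bits in the low word, all in 32-bit arithmetic. A user-space round trip of that encoding; the 0x1f write-combining flag value is taken from the code shown, and nothing here touches real MSRs:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    static void mcr_encode(uint32_t base_pfn, uint32_t size_pfn,
    			   uint32_t *low, uint32_t *high)
    {
    	*high = base_pfn << PAGE_SHIFT;
    	*low  = (uint32_t)(-size_pfn << PAGE_SHIFT) | 0x1f;	/* WC */
    }

    static void mcr_decode(uint32_t low, uint32_t high,
    			   uint32_t *base_pfn, uint32_t *size_pfn)
    {
    	*base_pfn = high >> PAGE_SHIFT;
    	*size_pfn = (uint32_t)-(low & 0xfffff000) >> PAGE_SHIFT;
    }

    int main(void)
    {
    	uint32_t low, high, base, size;

    	mcr_encode(0x10000 /* 256MB base */, 0x100 /* 1MB */, &low, &high);
    	mcr_decode(low, high, &base, &size);

    	printf("MCR low=0x%08x high=0x%08x -> base_pfn=0x%x size_pfn=0x%x\n",
    	       low, high, base, size);
    	return 0;
    }
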
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 1d584a18a50d..315738c74aad 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -1,51 +1,75 @@
1/* MTRR (Memory Type Range Register) cleanup 1/*
2 2 * MTRR (Memory Type Range Register) cleanup
3 Copyright (C) 2009 Yinghai Lu 3 *
4 4 * Copyright (C) 2009 Yinghai Lu
5 This library is free software; you can redistribute it and/or 5 *
6 modify it under the terms of the GNU Library General Public 6 * This library is free software; you can redistribute it and/or
7 License as published by the Free Software Foundation; either 7 * modify it under the terms of the GNU Library General Public
8 version 2 of the License, or (at your option) any later version. 8 * License as published by the Free Software Foundation; either
9 9 * version 2 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful, 10 *
11 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * This library is distributed in the hope that it will be useful,
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 Library General Public License for more details. 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 14 * Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public 15 *
16 License along with this library; if not, write to the Free 16 * You should have received a copy of the GNU Library General Public
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 * License along with this library; if not, write to the Free
18*/ 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 19 */
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/mutex.h>
26#include <linux/sort.h> 25#include <linux/sort.h>
26#include <linux/mutex.h>
27#include <linux/uaccess.h>
28#include <linux/kvm_para.h>
27 29
30#include <asm/processor.h>
28#include <asm/e820.h> 31#include <asm/e820.h>
29#include <asm/mtrr.h> 32#include <asm/mtrr.h>
30#include <asm/uaccess.h>
31#include <asm/processor.h>
32#include <asm/msr.h> 33#include <asm/msr.h>
33#include <asm/kvm_para.h>
34#include "mtrr.h"
35 34
36/* should be related to MTRR_VAR_RANGES nums */ 35#include "mtrr.h"
37#define RANGE_NUM 256
38 36
39struct res_range { 37struct res_range {
40 unsigned long start; 38 unsigned long start;
41 unsigned long end; 39 unsigned long end;
40};
41
42struct var_mtrr_range_state {
43 unsigned long base_pfn;
44 unsigned long size_pfn;
45 mtrr_type type;
46};
47
48struct var_mtrr_state {
49 unsigned long range_startk;
50 unsigned long range_sizek;
51 unsigned long chunk_sizek;
52 unsigned long gran_sizek;
53 unsigned int reg;
42}; 54};
43 55
56/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256
58
59static struct res_range __initdata range[RANGE_NUM];
60static int __initdata nr_range;
61
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
63
64static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66
67
44static int __init 68static int __init
45add_range(struct res_range *range, int nr_range, unsigned long start, 69add_range(struct res_range *range, int nr_range,
46 unsigned long end) 70 unsigned long start, unsigned long end)
47{ 71{
48 /* out of slots */ 72 /* Out of slots: */
49 if (nr_range >= RANGE_NUM) 73 if (nr_range >= RANGE_NUM)
50 return nr_range; 74 return nr_range;
51 75
@@ -58,12 +82,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start,
58} 82}
59 83
60static int __init 84static int __init
61add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, 85add_range_with_merge(struct res_range *range, int nr_range,
62 unsigned long end) 86 unsigned long start, unsigned long end)
63{ 87{
64 int i; 88 int i;
65 89
66 /* try to merge it with old one */ 90 /* Try to merge it with old one: */
67 for (i = 0; i < nr_range; i++) { 91 for (i = 0; i < nr_range; i++) {
68 unsigned long final_start, final_end; 92 unsigned long final_start, final_end;
69 unsigned long common_start, common_end; 93 unsigned long common_start, common_end;
@@ -84,7 +108,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
84 return nr_range; 108 return nr_range;
85 } 109 }
86 110
87 /* need to add that */ 111 /* Need to add it: */
88 return add_range(range, nr_range, start, end); 112 return add_range(range, nr_range, start, end);
89} 113}
90 114
@@ -117,7 +141,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117 } 141 }
118 142
119 if (start > range[j].start && end < range[j].end) { 143 if (start > range[j].start && end < range[j].end) {
120 /* find the new spare */ 144 /* Find the new spare: */
121 for (i = 0; i < RANGE_NUM; i++) { 145 for (i = 0; i < RANGE_NUM; i++) {
122 if (range[i].end == 0) 146 if (range[i].end == 0)
123 break; 147 break;
@@ -146,14 +170,8 @@ static int __init cmp_range(const void *x1, const void *x2)
146 return start1 - start2; 170 return start1 - start2;
147} 171}
148 172
149struct var_mtrr_range_state { 173#define BIOS_BUG_MSG KERN_WARNING \
150 unsigned long base_pfn; 174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
151 unsigned long size_pfn;
152 mtrr_type type;
153};
154
155static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
156static int __initdata debug_print;
157 175
158static int __init 176static int __init
159x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 177x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
@@ -180,7 +198,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
180 range[i].start, range[i].end + 1); 198 range[i].start, range[i].end + 1);
181 } 199 }
182 200
183 /* take out UC ranges */ 201 /* Take out UC ranges: */
184 for (i = 0; i < num_var_ranges; i++) { 202 for (i = 0; i < num_var_ranges; i++) {
185 type = range_state[i].type; 203 type = range_state[i].type;
186 if (type != MTRR_TYPE_UNCACHABLE && 204 if (type != MTRR_TYPE_UNCACHABLE &&
@@ -193,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
193 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && 211 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
194 (mtrr_state.enabled & 1)) { 212 (mtrr_state.enabled & 1)) {
195 /* Var MTRR contains UC entry below 1M? Skip it: */ 213 /* Var MTRR contains UC entry below 1M? Skip it: */
196 printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " 214 printk(BIOS_BUG_MSG, i);
197 "contains strange UC entry under 1M, check "
198 "with your system vendor!\n", i);
199 if (base + size <= (1<<(20-PAGE_SHIFT))) 215 if (base + size <= (1<<(20-PAGE_SHIFT)))
200 continue; 216 continue;
201 size -= (1<<(20-PAGE_SHIFT)) - base; 217 size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -237,17 +253,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
237 return nr_range; 253 return nr_range;
238} 254}
239 255
240static struct res_range __initdata range[RANGE_NUM];
241static int __initdata nr_range;
242
243#ifdef CONFIG_MTRR_SANITIZER 256#ifdef CONFIG_MTRR_SANITIZER
244 257
245static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 258static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
246{ 259{
247 unsigned long sum; 260 unsigned long sum = 0;
248 int i; 261 int i;
249 262
250 sum = 0;
251 for (i = 0; i < nr_range; i++) 263 for (i = 0; i < nr_range; i++)
252 sum += range[i].end + 1 - range[i].start; 264 sum += range[i].end + 1 - range[i].start;
253 265
@@ -278,17 +290,9 @@ static int __init mtrr_cleanup_debug_setup(char *str)
278} 290}
279early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); 291early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
280 292
281struct var_mtrr_state {
282 unsigned long range_startk;
283 unsigned long range_sizek;
284 unsigned long chunk_sizek;
285 unsigned long gran_sizek;
286 unsigned int reg;
287};
288
289static void __init 293static void __init
290set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, 294set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
291 unsigned char type, unsigned int address_bits) 295 unsigned char type, unsigned int address_bits)
292{ 296{
293 u32 base_lo, base_hi, mask_lo, mask_hi; 297 u32 base_lo, base_hi, mask_lo, mask_hi;
294 u64 base, mask; 298 u64 base, mask;
@@ -301,7 +305,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
301 mask = (1ULL << address_bits) - 1; 305 mask = (1ULL << address_bits) - 1;
302 mask &= ~((((u64)sizek) << 10) - 1); 306 mask &= ~((((u64)sizek) << 10) - 1);
303 307
304 base = ((u64)basek) << 10; 308 base = ((u64)basek) << 10;
305 309
306 base |= type; 310 base |= type;
307 mask |= 0x800; 311 mask |= 0x800;
@@ -317,15 +321,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
317 321
318static void __init 322static void __init
319save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, 323save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
320 unsigned char type) 324 unsigned char type)
321{ 325{
322 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); 326 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
323 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); 327 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
324 range_state[reg].type = type; 328 range_state[reg].type = type;
325} 329}
326 330
327static void __init 331static void __init set_var_mtrr_all(unsigned int address_bits)
328set_var_mtrr_all(unsigned int address_bits)
329{ 332{
330 unsigned long basek, sizek; 333 unsigned long basek, sizek;
331 unsigned char type; 334 unsigned char type;
@@ -342,11 +345,11 @@ set_var_mtrr_all(unsigned int address_bits)
342 345
343static unsigned long to_size_factor(unsigned long sizek, char *factorp) 346static unsigned long to_size_factor(unsigned long sizek, char *factorp)
344{ 347{
345 char factor;
346 unsigned long base = sizek; 348 unsigned long base = sizek;
349 char factor;
347 350
348 if (base & ((1<<10) - 1)) { 351 if (base & ((1<<10) - 1)) {
349 /* not MB alignment */ 352 /* Not MB-aligned: */
350 factor = 'K'; 353 factor = 'K';
351 } else if (base & ((1<<20) - 1)) { 354 } else if (base & ((1<<20) - 1)) {
352 factor = 'M'; 355 factor = 'M';
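
to_size_factor(), touched in the hunk above, reduces a size given in KB to the largest unit (K, M or G) that divides it evenly for the debug printouts. A standalone version; the M and G branches are completed here in the obvious way, since the hunk only shows the start of the function:

    #include <stdio.h>

    static unsigned long to_size_factor(unsigned long sizek, char *factorp)
    {
    	unsigned long base = sizek;
    	char factor;

    	if (base & ((1 << 10) - 1)) {		/* not MB-aligned */
    		factor = 'K';
    	} else if (base & ((1 << 20) - 1)) {	/* not GB-aligned */
    		factor = 'M';
    		base >>= 10;
    	} else {
    		factor = 'G';
    		base >>= 20;
    	}
    	*factorp = factor;
    	return base;
    }

    int main(void)
    {
    	unsigned long sizes[] = { 512, 64 * 1024, 2 * 1024 * 1024 };
    	char f;
    	int i;

    	for (i = 0; i < 3; i++)
    		printf("%lu KB -> %lu%cB\n", sizes[i],
    		       to_size_factor(sizes[i], &f), f);
    	return 0;
    }
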
@@ -372,11 +375,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
372 unsigned long max_align, align; 375 unsigned long max_align, align;
373 unsigned long sizek; 376 unsigned long sizek;
374 377
375 /* Compute the maximum size I can make a range */ 378 /* Compute the maximum size with which we can make a range: */
376 if (range_startk) 379 if (range_startk)
377 max_align = ffs(range_startk) - 1; 380 max_align = ffs(range_startk) - 1;
378 else 381 else
379 max_align = 32; 382 max_align = 32;
383
380 align = fls(range_sizek) - 1; 384 align = fls(range_sizek) - 1;
381 if (align > max_align) 385 if (align > max_align)
382 align = max_align; 386 align = max_align;
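
range_to_mtrr(), whose head is shown above, limits each variable MTRR to a power-of-two, naturally aligned block: the alignment of the start address bounds the block via ffs(), and the remaining size bounds it via fls(). A worked standalone example of that clamp, with local ffs/fls helpers so it stays self-contained:

    #include <stdio.h>

    static int my_ffs(unsigned long x)	/* lowest set bit, 1-based */
    {
    	int i;

    	if (!x)
    		return 0;
    	for (i = 1; !(x & 1); i++)
    		x >>= 1;
    	return i;
    }

    static int my_fls(unsigned long x)	/* highest set bit, 1-based */
    {
    	int i = 0;

    	while (x) {
    		i++;
    		x >>= 1;
    	}
    	return i;
    }

    int main(void)
    {
    	unsigned long range_startk = 3072;	/* 3MB start: 1MB aligned */
    	unsigned long range_sizek  = 5120;	/* 5MB left to cover      */
    	unsigned long max_align, align;

    	max_align = range_startk ? my_ffs(range_startk) - 1 : 32;
    	align = my_fls(range_sizek) - 1;
    	if (align > max_align)
    		align = max_align;

    	printf("largest block here: %lu KB\n", 1UL << align);
    	return 0;
    }

With a 3MB start and 5MB left, the start alignment wins and the example prints 1024 KB, i.e. the next MTRR written would be a 1MB block.
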
@@ -386,11 +390,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
386 char start_factor = 'K', size_factor = 'K'; 390 char start_factor = 'K', size_factor = 'K';
387 unsigned long start_base, size_base; 391 unsigned long start_base, size_base;
388 392
389 start_base = to_size_factor(range_startk, 393 start_base = to_size_factor(range_startk, &start_factor);
390 &start_factor), 394 size_base = to_size_factor(sizek, &size_factor);
391 size_base = to_size_factor(sizek, &size_factor),
392 395
393 printk(KERN_DEBUG "Setting variable MTRR %d, " 396 Dprintk("Setting variable MTRR %d, "
394 "base: %ld%cB, range: %ld%cB, type %s\n", 397 "base: %ld%cB, range: %ld%cB, type %s\n",
395 reg, start_base, start_factor, 398 reg, start_base, start_factor,
396 size_base, size_factor, 399 size_base, size_factor,
@@ -425,10 +428,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
425 chunk_sizek = state->chunk_sizek; 428 chunk_sizek = state->chunk_sizek;
426 gran_sizek = state->gran_sizek; 429 gran_sizek = state->gran_sizek;
427 430
428 /* align with gran size, prevent small block used up MTRRs */ 431 /* Align with gran size, prevent small block used up MTRRs: */
429 range_basek = ALIGN(state->range_startk, gran_sizek); 432 range_basek = ALIGN(state->range_startk, gran_sizek);
430 if ((range_basek > basek) && basek) 433 if ((range_basek > basek) && basek)
431 return second_sizek; 434 return second_sizek;
435
432 state->range_sizek -= (range_basek - state->range_startk); 436 state->range_sizek -= (range_basek - state->range_startk);
433 range_sizek = ALIGN(state->range_sizek, gran_sizek); 437 range_sizek = ALIGN(state->range_sizek, gran_sizek);
434 438
@@ -439,22 +443,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
439 } 443 }
440 state->range_sizek = range_sizek; 444 state->range_sizek = range_sizek;
441 445
442 /* try to append some small hole */ 446 /* Try to append some small hole: */
443 range0_basek = state->range_startk; 447 range0_basek = state->range_startk;
444 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 448 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
445 449
446 /* no increase */ 450 /* No increase: */
447 if (range0_sizek == state->range_sizek) { 451 if (range0_sizek == state->range_sizek) {
448 if (debug_print) 452 Dprintk("rangeX: %016lx - %016lx\n",
449 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 453 range0_basek<<10,
450 range0_basek<<10, 454 (range0_basek + state->range_sizek)<<10);
451 (range0_basek + state->range_sizek)<<10);
452 state->reg = range_to_mtrr(state->reg, range0_basek, 455 state->reg = range_to_mtrr(state->reg, range0_basek,
453 state->range_sizek, MTRR_TYPE_WRBACK); 456 state->range_sizek, MTRR_TYPE_WRBACK);
454 return 0; 457 return 0;
455 } 458 }
456 459
457 /* only cut back, when it is not the last */ 460 /* Only cut back when it is not the last: */
458 if (sizek) { 461 if (sizek) {
459 while (range0_basek + range0_sizek > (basek + sizek)) { 462 while (range0_basek + range0_sizek > (basek + sizek)) {
460 if (range0_sizek >= chunk_sizek) 463 if (range0_sizek >= chunk_sizek)
@@ -470,16 +473,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
470second_try: 473second_try:
471 range_basek = range0_basek + range0_sizek; 474 range_basek = range0_basek + range0_sizek;
472 475
473 /* one hole in the middle */ 476 /* One hole in the middle: */
474 if (range_basek > basek && range_basek <= (basek + sizek)) 477 if (range_basek > basek && range_basek <= (basek + sizek))
475 second_sizek = range_basek - basek; 478 second_sizek = range_basek - basek;
476 479
477 if (range0_sizek > state->range_sizek) { 480 if (range0_sizek > state->range_sizek) {
478 481
479 /* one hole in middle or at end */ 482 /* One hole in middle or at the end: */
480 hole_sizek = range0_sizek - state->range_sizek - second_sizek; 483 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
481 484
482 /* hole size should be less than half of range0 size */ 485 /* Hole size should be less than half of range0 size: */
483 if (hole_sizek >= (range0_sizek >> 1) && 486 if (hole_sizek >= (range0_sizek >> 1) &&
484 range0_sizek >= chunk_sizek) { 487 range0_sizek >= chunk_sizek) {
485 range0_sizek -= chunk_sizek; 488 range0_sizek -= chunk_sizek;
@@ -491,32 +494,30 @@ second_try:
491 } 494 }
492 495
493 if (range0_sizek) { 496 if (range0_sizek) {
494 if (debug_print) 497 Dprintk("range0: %016lx - %016lx\n",
495 printk(KERN_DEBUG "range0: %016lx - %016lx\n", 498 range0_basek<<10,
496 range0_basek<<10, 499 (range0_basek + range0_sizek)<<10);
497 (range0_basek + range0_sizek)<<10);
498 state->reg = range_to_mtrr(state->reg, range0_basek, 500 state->reg = range_to_mtrr(state->reg, range0_basek,
499 range0_sizek, MTRR_TYPE_WRBACK); 501 range0_sizek, MTRR_TYPE_WRBACK);
500 } 502 }
501 503
502 if (range0_sizek < state->range_sizek) { 504 if (range0_sizek < state->range_sizek) {
503 /* need to handle left over */ 505 /* Need to handle left over range: */
504 range_sizek = state->range_sizek - range0_sizek; 506 range_sizek = state->range_sizek - range0_sizek;
505 507
506 if (debug_print) 508 Dprintk("range: %016lx - %016lx\n",
507 printk(KERN_DEBUG "range: %016lx - %016lx\n", 509 range_basek<<10,
508 range_basek<<10, 510 (range_basek + range_sizek)<<10);
509 (range_basek + range_sizek)<<10); 511
510 state->reg = range_to_mtrr(state->reg, range_basek, 512 state->reg = range_to_mtrr(state->reg, range_basek,
511 range_sizek, MTRR_TYPE_WRBACK); 513 range_sizek, MTRR_TYPE_WRBACK);
512 } 514 }
513 515
514 if (hole_sizek) { 516 if (hole_sizek) {
515 hole_basek = range_basek - hole_sizek - second_sizek; 517 hole_basek = range_basek - hole_sizek - second_sizek;
516 if (debug_print) 518 Dprintk("hole: %016lx - %016lx\n",
517 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 519 hole_basek<<10,
518 hole_basek<<10, 520 (hole_basek + hole_sizek)<<10);
519 (hole_basek + hole_sizek)<<10);
520 state->reg = range_to_mtrr(state->reg, hole_basek, 521 state->reg = range_to_mtrr(state->reg, hole_basek,
521 hole_sizek, MTRR_TYPE_UNCACHABLE); 522 hole_sizek, MTRR_TYPE_UNCACHABLE);
522 } 523 }
@@ -537,23 +538,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
537 basek = base_pfn << (PAGE_SHIFT - 10); 538 basek = base_pfn << (PAGE_SHIFT - 10);
538 sizek = size_pfn << (PAGE_SHIFT - 10); 539 sizek = size_pfn << (PAGE_SHIFT - 10);
539 540
540 /* See if I can merge with the last range */ 541 /* See if I can merge with the last range: */
541 if ((basek <= 1024) || 542 if ((basek <= 1024) ||
542 (state->range_startk + state->range_sizek == basek)) { 543 (state->range_startk + state->range_sizek == basek)) {
543 unsigned long endk = basek + sizek; 544 unsigned long endk = basek + sizek;
544 state->range_sizek = endk - state->range_startk; 545 state->range_sizek = endk - state->range_startk;
545 return; 546 return;
546 } 547 }
547 /* Write the range mtrrs */ 548 /* Write the range mtrrs: */
548 if (state->range_sizek != 0) 549 if (state->range_sizek != 0)
549 second_sizek = range_to_mtrr_with_hole(state, basek, sizek); 550 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
550 551
551 /* Allocate an msr */ 552 /* Allocate an msr: */
552 state->range_startk = basek + second_sizek; 553 state->range_startk = basek + second_sizek;
553 state->range_sizek = sizek - second_sizek; 554 state->range_sizek = sizek - second_sizek;
554} 555}
555 556
556/* mininum size of mtrr block that can take hole */ 557/* Mininum size of mtrr block that can take hole: */
557static u64 mtrr_chunk_size __initdata = (256ULL<<20); 558static u64 mtrr_chunk_size __initdata = (256ULL<<20);
558 559
559static int __init parse_mtrr_chunk_size_opt(char *p) 560static int __init parse_mtrr_chunk_size_opt(char *p)
@@ -565,7 +566,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p)
565} 566}
566early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); 567early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
567 568
568/* granity of mtrr of block */ 569/* Granularity of mtrr of block: */
569static u64 mtrr_gran_size __initdata; 570static u64 mtrr_gran_size __initdata;
570 571
571static int __init parse_mtrr_gran_size_opt(char *p) 572static int __init parse_mtrr_gran_size_opt(char *p)
@@ -577,7 +578,7 @@ static int __init parse_mtrr_gran_size_opt(char *p)
577} 578}
578early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); 579early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
579 580
580static int nr_mtrr_spare_reg __initdata = 581static unsigned long nr_mtrr_spare_reg __initdata =
581 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; 582 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
582 583
583static int __init parse_mtrr_spare_reg(char *arg) 584static int __init parse_mtrr_spare_reg(char *arg)
@@ -586,7 +587,6 @@ static int __init parse_mtrr_spare_reg(char *arg)
586 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); 587 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
587 return 0; 588 return 0;
588} 589}
589
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591 591
592static int __init 592static int __init
@@ -594,8 +594,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
594 u64 chunk_size, u64 gran_size) 594 u64 chunk_size, u64 gran_size)
595{ 595{
596 struct var_mtrr_state var_state; 596 struct var_mtrr_state var_state;
597 int i;
598 int num_reg; 597 int num_reg;
598 int i;
599 599
600 var_state.range_startk = 0; 600 var_state.range_startk = 0;
601 var_state.range_sizek = 0; 601 var_state.range_sizek = 0;
@@ -605,17 +605,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
605 605
606 memset(range_state, 0, sizeof(range_state)); 606 memset(range_state, 0, sizeof(range_state));
607 607
608 /* Write the range etc */ 608 /* Write the range: */
609 for (i = 0; i < nr_range; i++) 609 for (i = 0; i < nr_range; i++) {
610 set_var_mtrr_range(&var_state, range[i].start, 610 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1); 611 range[i].end - range[i].start + 1);
612 }
612 613
613 /* Write the last range */ 614 /* Write the last range: */
614 if (var_state.range_sizek != 0) 615 if (var_state.range_sizek != 0)
615 range_to_mtrr_with_hole(&var_state, 0, 0); 616 range_to_mtrr_with_hole(&var_state, 0, 0);
616 617
617 num_reg = var_state.reg; 618 num_reg = var_state.reg;
618 /* Clear out the extra MTRR's */ 619 /* Clear out the extra MTRR's: */
619 while (var_state.reg < num_var_ranges) { 620 while (var_state.reg < num_var_ranges) {
620 save_var_mtrr(var_state.reg, 0, 0, 0); 621 save_var_mtrr(var_state.reg, 0, 0, 0);
621 var_state.reg++; 622 var_state.reg++;
@@ -625,11 +626,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
625} 626}
626 627
627struct mtrr_cleanup_result { 628struct mtrr_cleanup_result {
628 unsigned long gran_sizek; 629 unsigned long gran_sizek;
629 unsigned long chunk_sizek; 630 unsigned long chunk_sizek;
630 unsigned long lose_cover_sizek; 631 unsigned long lose_cover_sizek;
631 unsigned int num_reg; 632 unsigned int num_reg;
632 int bad; 633 int bad;
633}; 634};
634 635
635/* 636/*
@@ -645,10 +646,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
645 646
646static void __init print_out_mtrr_range_state(void) 647static void __init print_out_mtrr_range_state(void)
647{ 648{
648 int i;
649 char start_factor = 'K', size_factor = 'K'; 649 char start_factor = 'K', size_factor = 'K';
650 unsigned long start_base, size_base; 650 unsigned long start_base, size_base;
651 mtrr_type type; 651 mtrr_type type;
652 int i;
652 653
653 for (i = 0; i < num_var_ranges; i++) { 654 for (i = 0; i < num_var_ranges; i++) {
654 655
@@ -676,10 +677,10 @@ static int __init mtrr_need_cleanup(void)
676 int i; 677 int i;
677 mtrr_type type; 678 mtrr_type type;
678 unsigned long size; 679 unsigned long size;
679 /* extra one for all 0 */ 680 /* Extra one for all 0: */
680 int num[MTRR_NUM_TYPES + 1]; 681 int num[MTRR_NUM_TYPES + 1];
681 682
682 /* check entries number */ 683 /* Check entries number: */
683 memset(num, 0, sizeof(num)); 684 memset(num, 0, sizeof(num));
684 for (i = 0; i < num_var_ranges; i++) { 685 for (i = 0; i < num_var_ranges; i++) {
685 type = range_state[i].type; 686 type = range_state[i].type;
@@ -693,88 +694,86 @@ static int __init mtrr_need_cleanup(void)
693 num[type]++; 694 num[type]++;
694 } 695 }
695 696
696 /* check if we got UC entries */ 697 /* Check if we got UC entries: */
697 if (!num[MTRR_TYPE_UNCACHABLE]) 698 if (!num[MTRR_TYPE_UNCACHABLE])
698 return 0; 699 return 0;
699 700
700 /* check if we only had WB and UC */ 701 /* Check if we only had WB and UC */
701 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != 702 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
702 num_var_ranges - num[MTRR_NUM_TYPES]) 703 num_var_ranges - num[MTRR_NUM_TYPES])
703 return 0; 704 return 0;
704 705
705 return 1; 706 return 1;
706} 707}
707 708
708static unsigned long __initdata range_sums; 709static unsigned long __initdata range_sums;
709static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, 710
710 unsigned long extra_remove_base, 711static void __init
711 unsigned long extra_remove_size, 712mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
712 int i) 713 unsigned long x_remove_base,
714 unsigned long x_remove_size, int i)
713{ 715{
714 int num_reg;
715 static struct res_range range_new[RANGE_NUM]; 716 static struct res_range range_new[RANGE_NUM];
716 static int nr_range_new;
717 unsigned long range_sums_new; 717 unsigned long range_sums_new;
718 static int nr_range_new;
719 int num_reg;
718 720
719 /* convert ranges to var ranges state */ 721 /* Convert ranges to var ranges state: */
720 num_reg = x86_setup_var_mtrrs(range, nr_range, 722 num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
721 chunk_size, gran_size);
722 723
723 /* we got new setting in range_state, check it */ 724 /* We got new setting in range_state, check it: */
724 memset(range_new, 0, sizeof(range_new)); 725 memset(range_new, 0, sizeof(range_new));
725 nr_range_new = x86_get_mtrr_mem_range(range_new, 0, 726 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
726 extra_remove_base, extra_remove_size); 727 x_remove_base, x_remove_size);
727 range_sums_new = sum_ranges(range_new, nr_range_new); 728 range_sums_new = sum_ranges(range_new, nr_range_new);
728 729
729 result[i].chunk_sizek = chunk_size >> 10; 730 result[i].chunk_sizek = chunk_size >> 10;
730 result[i].gran_sizek = gran_size >> 10; 731 result[i].gran_sizek = gran_size >> 10;
731 result[i].num_reg = num_reg; 732 result[i].num_reg = num_reg;
733
732 if (range_sums < range_sums_new) { 734 if (range_sums < range_sums_new) {
733 result[i].lose_cover_sizek = 735 result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT;
734 (range_sums_new - range_sums) << PSHIFT;
735 result[i].bad = 1; 736 result[i].bad = 1;
736 } else 737 } else {
737 result[i].lose_cover_sizek = 738 result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT;
738 (range_sums - range_sums_new) << PSHIFT; 739 }
739 740
740 /* double check it */ 741 /* Double check it: */
741 if (!result[i].bad && !result[i].lose_cover_sizek) { 742 if (!result[i].bad && !result[i].lose_cover_sizek) {
742 if (nr_range_new != nr_range || 743 if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range)))
743 memcmp(range, range_new, sizeof(range))) 744 result[i].bad = 1;
744 result[i].bad = 1;
745 } 745 }
746 746
747 if (!result[i].bad && (range_sums - range_sums_new < 747 if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg]))
748 min_loss_pfn[num_reg])) { 748 min_loss_pfn[num_reg] = range_sums - range_sums_new;
749 min_loss_pfn[num_reg] =
750 range_sums - range_sums_new;
751 }
752} 749}
753 750
754static void __init mtrr_print_out_one_result(int i) 751static void __init mtrr_print_out_one_result(int i)
755{ 752{
756 char gran_factor, chunk_factor, lose_factor;
757 unsigned long gran_base, chunk_base, lose_base; 753 unsigned long gran_base, chunk_base, lose_base;
754 char gran_factor, chunk_factor, lose_factor;
758 755
759 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 756 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
760 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 757 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
761 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 758 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
762 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", 759
763 result[i].bad ? "*BAD*" : " ", 760 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
764 gran_base, gran_factor, chunk_base, chunk_factor); 761 result[i].bad ? "*BAD*" : " ",
765 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", 762 gran_base, gran_factor, chunk_base, chunk_factor);
766 result[i].num_reg, result[i].bad ? "-" : "", 763 pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n",
767 lose_base, lose_factor); 764 result[i].num_reg, result[i].bad ? "-" : "",
765 lose_base, lose_factor);
768} 766}
769 767
770static int __init mtrr_search_optimal_index(void) 768static int __init mtrr_search_optimal_index(void)
771{ 769{
772 int i;
773 int num_reg_good; 770 int num_reg_good;
774 int index_good; 771 int index_good;
772 int i;
775 773
776 if (nr_mtrr_spare_reg >= num_var_ranges) 774 if (nr_mtrr_spare_reg >= num_var_ranges)
777 nr_mtrr_spare_reg = num_var_ranges - 1; 775 nr_mtrr_spare_reg = num_var_ranges - 1;
776
778 num_reg_good = -1; 777 num_reg_good = -1;
779 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 778 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
780 if (!min_loss_pfn[i]) 779 if (!min_loss_pfn[i])
@@ -796,24 +795,24 @@ static int __init mtrr_search_optimal_index(void)
796 return index_good; 795 return index_good;
797} 796}
798 797
799
800int __init mtrr_cleanup(unsigned address_bits) 798int __init mtrr_cleanup(unsigned address_bits)
801{ 799{
802 unsigned long extra_remove_base, extra_remove_size; 800 unsigned long x_remove_base, x_remove_size;
803 unsigned long base, size, def, dummy; 801 unsigned long base, size, def, dummy;
804 mtrr_type type;
805 u64 chunk_size, gran_size; 802 u64 chunk_size, gran_size;
803 mtrr_type type;
806 int index_good; 804 int index_good;
807 int i; 805 int i;
808 806
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 807 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 808 return 0;
809
811 rdmsr(MSR_MTRRdefType, def, dummy); 810 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 811 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 812 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 813 return 0;
815 814
816 /* get it and store it aside */ 815 /* Get it and store it aside: */
817 memset(range_state, 0, sizeof(range_state)); 816 memset(range_state, 0, sizeof(range_state));
818 for (i = 0; i < num_var_ranges; i++) { 817 for (i = 0; i < num_var_ranges; i++) {
819 mtrr_if->get(i, &base, &size, &type); 818 mtrr_if->get(i, &base, &size, &type);
@@ -822,29 +821,28 @@ int __init mtrr_cleanup(unsigned address_bits)
822 range_state[i].type = type; 821 range_state[i].type = type;
823 } 822 }
824 823
825 /* check if we need handle it and can handle it */ 824 /* Check if we need handle it and can handle it: */
826 if (!mtrr_need_cleanup()) 825 if (!mtrr_need_cleanup())
827 return 0; 826 return 0;
828 827
829 /* print original var MTRRs at first, for debugging: */ 828 /* Print original var MTRRs at first, for debugging: */
830 printk(KERN_DEBUG "original variable MTRRs\n"); 829 printk(KERN_DEBUG "original variable MTRRs\n");
831 print_out_mtrr_range_state(); 830 print_out_mtrr_range_state();
832 831
833 memset(range, 0, sizeof(range)); 832 memset(range, 0, sizeof(range));
834 extra_remove_size = 0; 833 x_remove_size = 0;
835 extra_remove_base = 1 << (32 - PAGE_SHIFT); 834 x_remove_base = 1 << (32 - PAGE_SHIFT);
836 if (mtrr_tom2) 835 if (mtrr_tom2)
837 extra_remove_size = 836 x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
838 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 837
839 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 838 nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size);
840 extra_remove_size);
841 /* 839 /*
842 * [0, 1M) should always be coverred by var mtrr with WB 840 * [0, 1M) should always be covered by var mtrr with WB
843 * and fixed mtrrs should take effective before var mtrr for it 841 * and fixed mtrrs should take effect before var mtrr for it:
844 */ 842 */
845 nr_range = add_range_with_merge(range, nr_range, 0, 843 nr_range = add_range_with_merge(range, nr_range, 0,
846 (1ULL<<(20 - PAGE_SHIFT)) - 1); 844 (1ULL<<(20 - PAGE_SHIFT)) - 1);
847 /* sort the ranges */ 845 /* Sort the ranges: */
848 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 846 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
849 847
850 range_sums = sum_ranges(range, nr_range); 848 range_sums = sum_ranges(range, nr_range);
@@ -854,7 +852,7 @@ int __init mtrr_cleanup(unsigned address_bits)
854 if (mtrr_chunk_size && mtrr_gran_size) { 852 if (mtrr_chunk_size && mtrr_gran_size) {
855 i = 0; 853 i = 0;
856 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, 854 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
857 extra_remove_base, extra_remove_size, i); 855 x_remove_base, x_remove_size, i);
858 856
859 mtrr_print_out_one_result(i); 857 mtrr_print_out_one_result(i);
860 858
@@ -880,7 +878,7 @@ int __init mtrr_cleanup(unsigned address_bits)
880 continue; 878 continue;
881 879
882 mtrr_calc_range_state(chunk_size, gran_size, 880 mtrr_calc_range_state(chunk_size, gran_size,
883 extra_remove_base, extra_remove_size, i); 881 x_remove_base, x_remove_size, i);
884 if (debug_print) { 882 if (debug_print) {
885 mtrr_print_out_one_result(i); 883 mtrr_print_out_one_result(i);
886 printk(KERN_INFO "\n"); 884 printk(KERN_INFO "\n");
@@ -890,7 +888,7 @@ int __init mtrr_cleanup(unsigned address_bits)
890 } 888 }
891 } 889 }
892 890
893 /* try to find the optimal index */ 891 /* Try to find the optimal index: */
894 index_good = mtrr_search_optimal_index(); 892 index_good = mtrr_search_optimal_index();
895 893
896 if (index_good != -1) { 894 if (index_good != -1) {
@@ -898,7 +896,7 @@ int __init mtrr_cleanup(unsigned address_bits)
898 i = index_good; 896 i = index_good;
899 mtrr_print_out_one_result(i); 897 mtrr_print_out_one_result(i);
900 898
901 /* convert ranges to var ranges state */ 899 /* Convert ranges to var ranges state: */
902 chunk_size = result[i].chunk_sizek; 900 chunk_size = result[i].chunk_sizek;
903 chunk_size <<= 10; 901 chunk_size <<= 10;
904 gran_size = result[i].gran_sizek; 902 gran_size = result[i].gran_sizek;
@@ -941,8 +939,8 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
941 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't 939 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
942 * apply to are wrong, but so far we don't know of any such case in the wild. 940 * apply to are wrong, but so far we don't know of any such case in the wild.
943 */ 941 */
944#define Tom2Enabled (1U << 21) 942#define Tom2Enabled (1U << 21)
945#define Tom2ForceMemTypeWB (1U << 22) 943#define Tom2ForceMemTypeWB (1U << 22)
946 944
947int __init amd_special_default_mtrr(void) 945int __init amd_special_default_mtrr(void)
948{ 946{
@@ -952,7 +950,7 @@ int __init amd_special_default_mtrr(void)
952 return 0; 950 return 0;
953 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 951 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
954 return 0; 952 return 0;
955 /* In case some hypervisor doesn't pass SYSCFG through */ 953 /* In case some hypervisor doesn't pass SYSCFG through: */
956 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) 954 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
957 return 0; 955 return 0;
958 /* 956 /*
@@ -965,19 +963,21 @@ int __init amd_special_default_mtrr(void)
965 return 0; 963 return 0;
966} 964}
967 965
968static u64 __init real_trim_memory(unsigned long start_pfn, 966static u64 __init
969 unsigned long limit_pfn) 967real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
970{ 968{
971 u64 trim_start, trim_size; 969 u64 trim_start, trim_size;
970
972 trim_start = start_pfn; 971 trim_start = start_pfn;
973 trim_start <<= PAGE_SHIFT; 972 trim_start <<= PAGE_SHIFT;
973
974 trim_size = limit_pfn; 974 trim_size = limit_pfn;
975 trim_size <<= PAGE_SHIFT; 975 trim_size <<= PAGE_SHIFT;
976 trim_size -= trim_start; 976 trim_size -= trim_start;
977 977
978 return e820_update_range(trim_start, trim_size, E820_RAM, 978 return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
979 E820_RESERVED);
980} 979}
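
real_trim_memory() is essentially a unit conversion: it turns a page-frame range into a byte-addressed (start, size) pair before asking e820_update_range() to reclassify it as reserved. The same arithmetic as a standalone helper (PAGE_SHIFT assumed to be 12, i.e. 4 KiB pages):

    #include <stdint.h>

    #define DEMO_PAGE_SHIFT 12    /* 4 KiB pages on x86 */

    /* Convert the page range [start_pfn, limit_pfn) into a byte-addressed
     * base and size, as real_trim_memory() does before updating e820. */
    static void pfn_range_to_bytes(unsigned long start_pfn,
                                   unsigned long limit_pfn,
                                   uint64_t *base, uint64_t *size)
    {
        *base = (uint64_t)start_pfn << DEMO_PAGE_SHIFT;
        *size = ((uint64_t)limit_pfn << DEMO_PAGE_SHIFT) - *base;
    }
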
980
981/** 981/**
982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs 982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
983 * @end_pfn: ending page frame number 983 * @end_pfn: ending page frame number
@@ -985,7 +985,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn,
985 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain 985 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
986 * memory configurations. This routine checks that the highest MTRR matches 986 * memory configurations. This routine checks that the highest MTRR matches
987 * the end of memory, to make sure the MTRRs having a write back type cover 987 * the end of memory, to make sure the MTRRs having a write back type cover
988 * all of the memory the kernel is intending to use. If not, it'll trim any 988 * all of the memory the kernel is intending to use. If not, it'll trim any
989 * memory off the end by adjusting end_pfn, removing it from the kernel's 989 * memory off the end by adjusting end_pfn, removing it from the kernel's
990 * allocation pools, warning the user with an obnoxious message. 990 * allocation pools, warning the user with an obnoxious message.
991 */ 991 */
@@ -994,21 +994,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
994 unsigned long i, base, size, highest_pfn = 0, def, dummy; 994 unsigned long i, base, size, highest_pfn = 0, def, dummy;
995 mtrr_type type; 995 mtrr_type type;
996 u64 total_trim_size; 996 u64 total_trim_size;
997
998 /* extra one for all 0 */ 997 /* extra one for all 0 */
999 int num[MTRR_NUM_TYPES + 1]; 998 int num[MTRR_NUM_TYPES + 1];
999
1000 /* 1000 /*
1001 * Make sure we only trim uncachable memory on machines that 1001 * Make sure we only trim uncachable memory on machines that
1002 * support the Intel MTRR architecture: 1002 * support the Intel MTRR architecture:
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006
1006 rdmsr(MSR_MTRRdefType, def, dummy); 1007 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1008 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1009 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1010 return 0;
1010 1011
1011 /* get it and store it aside */ 1012 /* Get it and store it aside: */
1012 memset(range_state, 0, sizeof(range_state)); 1013 memset(range_state, 0, sizeof(range_state));
1013 for (i = 0; i < num_var_ranges; i++) { 1014 for (i = 0; i < num_var_ranges; i++) {
1014 mtrr_if->get(i, &base, &size, &type); 1015 mtrr_if->get(i, &base, &size, &type);
@@ -1017,7 +1018,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1017 range_state[i].type = type; 1018 range_state[i].type = type;
1018 } 1019 }
1019 1020
1020 /* Find highest cached pfn */ 1021 /* Find highest cached pfn: */
1021 for (i = 0; i < num_var_ranges; i++) { 1022 for (i = 0; i < num_var_ranges; i++) {
1022 type = range_state[i].type; 1023 type = range_state[i].type;
1023 if (type != MTRR_TYPE_WRBACK) 1024 if (type != MTRR_TYPE_WRBACK)
@@ -1028,13 +1029,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1028 highest_pfn = base + size; 1029 highest_pfn = base + size;
1029 } 1030 }
1030 1031
1031 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1032 /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
1032 if (!highest_pfn) { 1033 if (!highest_pfn) {
1033 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); 1034 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1034 return 0; 1035 return 0;
1035 } 1036 }
1036 1037
1037 /* check entries number */ 1038 /* Check entries number: */
1038 memset(num, 0, sizeof(num)); 1039 memset(num, 0, sizeof(num));
1039 for (i = 0; i < num_var_ranges; i++) { 1040 for (i = 0; i < num_var_ranges; i++) {
1040 type = range_state[i].type; 1041 type = range_state[i].type;
@@ -1046,11 +1047,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1046 num[type]++; 1047 num[type]++;
1047 } 1048 }
1048 1049
1049 /* no entry for WB? */ 1050 /* No entry for WB? */
1050 if (!num[MTRR_TYPE_WRBACK]) 1051 if (!num[MTRR_TYPE_WRBACK])
1051 return 0; 1052 return 0;
1052 1053
1053 /* check if we only had WB and UC */ 1054 /* Check if we only had WB and UC: */
1054 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != 1055 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1055 num_var_ranges - num[MTRR_NUM_TYPES]) 1056 num_var_ranges - num[MTRR_NUM_TYPES])
1056 return 0; 1057 return 0;
@@ -1066,31 +1067,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1066 } 1067 }
1067 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 1068 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1068 1069
1070 /* Check the head: */
1069 total_trim_size = 0; 1071 total_trim_size = 0;
1070 /* check the head */
1071 if (range[0].start) 1072 if (range[0].start)
1072 total_trim_size += real_trim_memory(0, range[0].start); 1073 total_trim_size += real_trim_memory(0, range[0].start);
1073 /* check the holes */ 1074
1075 /* Check the holes: */
1074 for (i = 0; i < nr_range - 1; i++) { 1076 for (i = 0; i < nr_range - 1; i++) {
1075 if (range[i].end + 1 < range[i+1].start) 1077 if (range[i].end + 1 < range[i+1].start)
1076 total_trim_size += real_trim_memory(range[i].end + 1, 1078 total_trim_size += real_trim_memory(range[i].end + 1,
1077 range[i+1].start); 1079 range[i+1].start);
1078 } 1080 }
1079 /* check the top */ 1081
1082 /* Check the top: */
1080 i = nr_range - 1; 1083 i = nr_range - 1;
1081 if (range[i].end + 1 < end_pfn) 1084 if (range[i].end + 1 < end_pfn)
1082 total_trim_size += real_trim_memory(range[i].end + 1, 1085 total_trim_size += real_trim_memory(range[i].end + 1,
1083 end_pfn); 1086 end_pfn);
1084 1087
1085 if (total_trim_size) { 1088 if (total_trim_size) {
1086 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" 1089 pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
1087 " all of memory, losing %lluMB of RAM.\n",
1088 total_trim_size >> 20);
1089 1090
1090 if (!changed_by_mtrr_cleanup) 1091 if (!changed_by_mtrr_cleanup)
1091 WARN_ON(1); 1092 WARN_ON(1);
1092 1093
1093 printk(KERN_INFO "update e820 for mtrr\n"); 1094 pr_info("update e820 for mtrr\n");
1094 update_e820(); 1095 update_e820();
1095 1096
1096 return 1; 1097 return 1;
@@ -1098,4 +1099,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1098 1099
1099 return 0; 1100 return 0;
1100} 1101}
1101
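
mtrr_trim_uncached_memory() above walks the sorted, merged write-back ranges and trims everything the MTRRs fail to cover: the head before the first range, the holes between consecutive ranges, and the top up to end_pfn. A compact user-space model of that walk (the range struct here is illustrative; like the kernel's res_range, the end field is inclusive):

    struct pfn_range { unsigned long start, end; };    /* end is inclusive */

    /* Count the pages that fall outside the covered ranges, following the
     * head/holes/top order used by mtrr_trim_uncached_memory(). */
    static unsigned long count_uncovered(const struct pfn_range *r, int nr,
                                         unsigned long end_pfn)
    {
        unsigned long uncovered = 0;

        if (r[0].start)                                 /* head */
            uncovered += r[0].start;

        for (int i = 0; i < nr - 1; i++)                /* holes */
            if (r[i].end + 1 < r[i + 1].start)
                uncovered += r[i + 1].start - (r[i].end + 1);

        if (r[nr - 1].end + 1 < end_pfn)                /* top  */
            uncovered += end_pfn - (r[nr - 1].end + 1);

        return uncovered;
    }
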
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index ff14c320040c..228d982ce09c 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -1,38 +1,40 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/io.h>
2#include <linux/mm.h> 3#include <linux/mm.h>
3#include <asm/mtrr.h> 4
4#include <asm/msr.h>
5#include <asm/io.h>
6#include <asm/processor-cyrix.h> 5#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
8#include "mtrr.h" 10#include "mtrr.h"
9 11
10static void 12static void
11cyrix_get_arr(unsigned int reg, unsigned long *base, 13cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned long *size, mtrr_type * type) 14 unsigned long *size, mtrr_type * type)
13{ 15{
14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift; 16 unsigned char arr, ccr3, rcr, shift;
17 unsigned long flags;
16 18
17 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ 19 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
18 20
19 /* Save flags and disable interrupts */
20 local_irq_save(flags); 21 local_irq_save(flags);
21 22
22 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
23 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 24 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
24 ((unsigned char *) base)[3] = getCx86(arr); 25 ((unsigned char *)base)[3] = getCx86(arr);
25 ((unsigned char *) base)[2] = getCx86(arr + 1); 26 ((unsigned char *)base)[2] = getCx86(arr + 1);
26 ((unsigned char *) base)[1] = getCx86(arr + 2); 27 ((unsigned char *)base)[1] = getCx86(arr + 2);
27 rcr = getCx86(CX86_RCR_BASE + reg); 28 rcr = getCx86(CX86_RCR_BASE + reg);
28 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 29 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
29 30
30 /* Enable interrupts if it was enabled previously */
31 local_irq_restore(flags); 31 local_irq_restore(flags);
32
32 shift = ((unsigned char *) base)[1] & 0x0f; 33 shift = ((unsigned char *) base)[1] & 0x0f;
33 *base >>= PAGE_SHIFT; 34 *base >>= PAGE_SHIFT;
34 35
35 /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 36 /*
37 * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
36 * Note: shift==0xf means 4G, this is unsupported. 38 * Note: shift==0xf means 4G, this is unsupported.
37 */ 39 */
38 if (shift) 40 if (shift)
@@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
76 } 78 }
77} 79}
78 80
81/*
82 * cyrix_get_free_region - get a free ARR.
83 *
84 * @base: the starting (base) address of the region.
85 * @size: the size (in bytes) of the region.
86 *
87 * Returns: the index of the region on success, else -1 on error.
88*/
79static int 89static int
80cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) 90cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region.
84 [RETURNS] The index of the region on success, else -1 on error.
85*/
86{ 91{
87 int i;
88 mtrr_type ltype;
89 unsigned long lbase, lsize; 92 unsigned long lbase, lsize;
93 mtrr_type ltype;
94 int i;
90 95
91 switch (replace_reg) { 96 switch (replace_reg) {
92 case 7: 97 case 7:
@@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
107 cyrix_get_arr(7, &lbase, &lsize, &ltype); 112 cyrix_get_arr(7, &lbase, &lsize, &ltype);
108 if (lsize == 0) 113 if (lsize == 0)
109 return 7; 114 return 7;
110 /* Else try ARR0-ARR6 first */ 115 /* Else try ARR0-ARR6 first */
111 } else { 116 } else {
112 for (i = 0; i < 7; i++) { 117 for (i = 0; i < 7; i++) {
113 cyrix_get_arr(i, &lbase, &lsize, &ltype); 118 cyrix_get_arr(i, &lbase, &lsize, &ltype);
114 if (lsize == 0) 119 if (lsize == 0)
115 return i; 120 return i;
116 } 121 }
117 /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ 122 /*
123 * ARR0-ARR6 isn't free
124 * try ARR7 but its size must be at least 256K
125 */
118 cyrix_get_arr(i, &lbase, &lsize, &ltype); 126 cyrix_get_arr(i, &lbase, &lsize, &ltype);
119 if ((lsize == 0) && (size >= 0x40)) 127 if ((lsize == 0) && (size >= 0x40))
120 return i; 128 return i;
@@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
122 return -ENOSPC; 130 return -ENOSPC;
123} 131}
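
cyrix_get_free_region() searches the eight Cyrix ARRs for an empty slot, with ARR7 treated specially: a caller that explicitly asks to replace register 7 gets ARR7 or nothing, otherwise ARR0-ARR6 are tried first and ARR7 is only a fallback for regions of at least 256 KiB (size >= 0x40 pages). A simplified sketch of that selection order, with slot_free() standing in for the cyrix_get_arr() size check:

    /* Illustrative only: slot_free(arr) stands in for reading the ARR via
     * cyrix_get_arr() and checking that its size is zero. */
    static int find_free_arr(int want_arr7, unsigned long size_pages,
                             int (*slot_free)(int arr))
    {
        if (want_arr7)                          /* caller asked for ARR7 */
            return slot_free(7) ? 7 : -1;

        for (int i = 0; i < 7; i++)             /* ARR0-ARR6 first       */
            if (slot_free(i))
                return i;

        /* ARR7 is the last resort and must map at least 256 KiB,
         * i.e. 0x40 pages of 4 KiB. */
        if (slot_free(7) && size_pages >= 0x40)
            return 7;

        return -1;
    }
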
124 132
125static u32 cr4 = 0; 133static u32 cr4, ccr3;
126static u32 ccr3;
127 134
128static void prepare_set(void) 135static void prepare_set(void)
129{ 136{
130 u32 cr0; 137 u32 cr0;
131 138
132 /* Save value of CR4 and clear Page Global Enable (bit 7) */ 139 /* Save value of CR4 and clear Page Global Enable (bit 7) */
133 if ( cpu_has_pge ) { 140 if (cpu_has_pge) {
134 cr4 = read_cr4(); 141 cr4 = read_cr4();
135 write_cr4(cr4 & ~X86_CR4_PGE); 142 write_cr4(cr4 & ~X86_CR4_PGE);
136 } 143 }
137 144
138 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 145 /*
139 a side-effect */ 146 * Disable and flush caches.
147 * Note that wbinvd flushes the TLBs as a side-effect
148 */
140 cr0 = read_cr0() | X86_CR0_CD; 149 cr0 = read_cr0() | X86_CR0_CD;
141 wbinvd(); 150 wbinvd();
142 write_cr0(cr0); 151 write_cr0(cr0);
@@ -147,22 +156,21 @@ static void prepare_set(void)
147 156
148 /* Cyrix ARRs - everything else was excluded at the top */ 157 /* Cyrix ARRs - everything else was excluded at the top */
149 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); 158 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
150
151} 159}
152 160
153static void post_set(void) 161static void post_set(void)
154{ 162{
155 /* Flush caches and TLBs */ 163 /* Flush caches and TLBs */
156 wbinvd(); 164 wbinvd();
157 165
158 /* Cyrix ARRs - everything else was excluded at the top */ 166 /* Cyrix ARRs - everything else was excluded at the top */
159 setCx86(CX86_CCR3, ccr3); 167 setCx86(CX86_CCR3, ccr3);
160 168
161 /* Enable caches */ 169 /* Enable caches */
162 write_cr0(read_cr0() & 0xbfffffff); 170 write_cr0(read_cr0() & 0xbfffffff);
163 171
164 /* Restore value of CR4 */ 172 /* Restore value of CR4 */
165 if ( cpu_has_pge ) 173 if (cpu_has_pge)
166 write_cr4(cr4); 174 write_cr4(cr4);
167} 175}
168 176
@@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
178 size >>= 6; 186 size >>= 6;
179 187
180 size &= 0x7fff; /* make sure arr_size <= 14 */ 188 size &= 0x7fff; /* make sure arr_size <= 14 */
181 for (arr_size = 0; size; arr_size++, size >>= 1) ; 189 for (arr_size = 0; size; arr_size++, size >>= 1)
190 ;
182 191
183 if (reg < 7) { 192 if (reg < 7) {
184 switch (type) { 193 switch (type) {
@@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
215 prepare_set(); 224 prepare_set();
216 225
217 base <<= PAGE_SHIFT; 226 base <<= PAGE_SHIFT;
218 setCx86(arr, ((unsigned char *) &base)[3]); 227 setCx86(arr + 0, ((unsigned char *)&base)[3]);
219 setCx86(arr + 1, ((unsigned char *) &base)[2]); 228 setCx86(arr + 1, ((unsigned char *)&base)[2]);
220 setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); 229 setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size);
221 setCx86(CX86_RCR_BASE + reg, arr_type); 230 setCx86(CX86_RCR_BASE + reg, arr_type);
222 231
223 post_set(); 232 post_set();
224} 233}
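
cyrix_set_arr() encodes a region by writing the top three bytes of the page-shifted base into consecutive ARR registers, most significant byte first (on little-endian x86 those are bytes [3], [2] and [1] of the 32-bit value), and ORs the size exponent produced by the empty-bodied for loop into the last byte. The two pieces of arithmetic, written out as standalone helpers for illustration:

    /* Bit length of the (already shifted) size value, i.e. the loop
     *     for (arr_size = 0; size; arr_size++, size >>= 1);
     * without the empty body. */
    static unsigned int arr_size_bits(unsigned long size)
    {
        unsigned int bits = 0;

        while (size) {
            bits++;
            size >>= 1;
        }
        return bits;
    }

    /* Split a 32-bit base into the three bytes the ARR registers take,
     * most significant first (illustrative helper, not kernel code). */
    static void split_arr_base(unsigned long base, unsigned char out[3])
    {
        out[0] = (base >> 24) & 0xff;
        out[1] = (base >> 16) & 0xff;
        out[2] = (base >> 8) & 0xff;
    }
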
225 234
226typedef struct { 235typedef struct {
227 unsigned long base; 236 unsigned long base;
228 unsigned long size; 237 unsigned long size;
229 mtrr_type type; 238 mtrr_type type;
230} arr_state_t; 239} arr_state_t;
231 240
232static arr_state_t arr_state[8] = { 241static arr_state_t arr_state[8] = {
@@ -247,16 +256,17 @@ static void cyrix_set_all(void)
247 setCx86(CX86_CCR0 + i, ccr_state[i]); 256 setCx86(CX86_CCR0 + i, ccr_state[i]);
248 for (; i < 7; i++) 257 for (; i < 7; i++)
249 setCx86(CX86_CCR4 + i, ccr_state[i]); 258 setCx86(CX86_CCR4 + i, ccr_state[i]);
250 for (i = 0; i < 8; i++) 259
251 cyrix_set_arr(i, arr_state[i].base, 260 for (i = 0; i < 8; i++) {
261 cyrix_set_arr(i, arr_state[i].base,
252 arr_state[i].size, arr_state[i].type); 262 arr_state[i].size, arr_state[i].type);
263 }
253 264
254 post_set(); 265 post_set();
255} 266}
256 267
257static struct mtrr_ops cyrix_mtrr_ops = { 268static struct mtrr_ops cyrix_mtrr_ops = {
258 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
259// .init = cyrix_arr_init,
260 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
261 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
262 .get = cyrix_get_arr, 272 .get = cyrix_get_arr,
@@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void)
270 set_mtrr_ops(&cyrix_mtrr_ops); 280 set_mtrr_ops(&cyrix_mtrr_ops);
271 return 0; 281 return 0;
272} 282}
273
274//arch_initcall(cyrix_init_mtrr);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0543f69f0b27..55da0c5f68dd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,28 +1,34 @@
1/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 1/*
2 because MTRRs can span upto 40 bits (36bits on most modern x86) */ 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86)
4 */
5#define DEBUG
6
7#include <linux/module.h>
3#include <linux/init.h> 8#include <linux/init.h>
4#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/io.h>
5#include <linux/mm.h> 11#include <linux/mm.h>
6#include <linux/module.h> 12
7#include <asm/io.h>
8#include <asm/mtrr.h>
9#include <asm/msr.h>
10#include <asm/system.h>
11#include <asm/cpufeature.h>
12#include <asm/processor-flags.h> 13#include <asm/processor-flags.h>
14#include <asm/cpufeature.h>
13#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
16#include <asm/system.h>
17#include <asm/mtrr.h>
18#include <asm/msr.h>
14#include <asm/pat.h> 19#include <asm/pat.h>
20
15#include "mtrr.h" 21#include "mtrr.h"
16 22
17struct fixed_range_block { 23struct fixed_range_block {
18 int base_msr; /* start address of an MTRR block */ 24 int base_msr; /* start address of an MTRR block */
19 int ranges; /* number of MTRRs in this block */ 25 int ranges; /* number of MTRRs in this block */
20}; 26};
21 27
22static struct fixed_range_block fixed_range_blocks[] = { 28static struct fixed_range_block fixed_range_blocks[] = {
23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ 29 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ 30 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ 31 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 32 {}
27}; 33};
28 34
@@ -30,10 +36,10 @@ static unsigned long smp_changes_mask;
30static int mtrr_state_set; 36static int mtrr_state_set;
31u64 mtrr_tom2; 37u64 mtrr_tom2;
32 38
33struct mtrr_state_type mtrr_state = {}; 39struct mtrr_state_type mtrr_state;
34EXPORT_SYMBOL_GPL(mtrr_state); 40EXPORT_SYMBOL_GPL(mtrr_state);
35 41
36/** 42/*
37 * BIOS is expected to clear MtrrFixDramModEn bit, see for example 43 * BIOS is expected to clear MtrrFixDramModEn bit, see for example
38 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD 44 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
39 * Opteron Processors" (26094 Rev. 3.30 February 2006), section 45 * Opteron Processors" (26094 Rev. 3.30 February 2006), section
@@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
104 * Look of multiple ranges matching this address and pick type 110 * Look of multiple ranges matching this address and pick type
105 * as per MTRR precedence 111 * as per MTRR precedence
106 */ 112 */
107 if (!(mtrr_state.enabled & 2)) { 113 if (!(mtrr_state.enabled & 2))
108 return mtrr_state.def_type; 114 return mtrr_state.def_type;
109 }
110 115
111 prev_match = 0xFF; 116 prev_match = 0xFF;
112 for (i = 0; i < num_var_ranges; ++i) { 117 for (i = 0; i < num_var_ranges; ++i) {
@@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
125 if (start_state != end_state) 130 if (start_state != end_state)
126 return 0xFE; 131 return 0xFE;
127 132
128 if ((start & mask) != (base & mask)) { 133 if ((start & mask) != (base & mask))
129 continue; 134 continue;
130 }
131 135
132 curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; 136 curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
133 if (prev_match == 0xFF) { 137 if (prev_match == 0xFF) {
@@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
148 curr_match = MTRR_TYPE_WRTHROUGH; 152 curr_match = MTRR_TYPE_WRTHROUGH;
149 } 153 }
150 154
151 if (prev_match != curr_match) { 155 if (prev_match != curr_match)
152 return MTRR_TYPE_UNCACHABLE; 156 return MTRR_TYPE_UNCACHABLE;
153 }
154 } 157 }
155 158
156 if (mtrr_tom2) { 159 if (mtrr_tom2) {
@@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
164 return mtrr_state.def_type; 167 return mtrr_state.def_type;
165} 168}
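
The overlap handling in mtrr_type_lookup() follows the architectural precedence rules: identical types stay as they are, uncachable always wins, a write-back/write-through mix degrades to write-through, and any other disagreement is treated as uncachable. A self-contained sketch of that combining step, using the standard MTRR type values:

    #define MTRR_TYPE_UNCACHABLE    0
    #define MTRR_TYPE_WRTHROUGH     4
    #define MTRR_TYPE_WRBACK        6

    /* Combine the memory types of two overlapping variable MTRRs the way
     * mtrr_type_lookup() resolves prev_match vs. curr_match. */
    static unsigned char combine_mtrr_types(unsigned char a, unsigned char b)
    {
        if (a == b)
            return a;

        if (a == MTRR_TYPE_UNCACHABLE || b == MTRR_TYPE_UNCACHABLE)
            return MTRR_TYPE_UNCACHABLE;

        if ((a == MTRR_TYPE_WRBACK && b == MTRR_TYPE_WRTHROUGH) ||
            (a == MTRR_TYPE_WRTHROUGH && b == MTRR_TYPE_WRBACK))
            return MTRR_TYPE_WRTHROUGH;

        /* Any other mix is undefined by the architecture; be safe. */
        return MTRR_TYPE_UNCACHABLE;
    }
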
166 169
167/* Get the MSR pair relating to a var range */ 170/* Get the MSR pair relating to a var range */
168static void 171static void
169get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 172get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
170{ 173{
@@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
172 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 175 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
173} 176}
174 177
175/* fill the MSR pair relating to a var range */ 178/* Fill the MSR pair relating to a var range */
176void fill_mtrr_var_range(unsigned int index, 179void fill_mtrr_var_range(unsigned int index,
177 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) 180 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
178{ 181{
@@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index,
186 vr[index].mask_hi = mask_hi; 189 vr[index].mask_hi = mask_hi;
187} 190}
188 191
189static void 192static void get_fixed_ranges(mtrr_type *frs)
190get_fixed_ranges(mtrr_type * frs)
191{ 193{
192 unsigned int *p = (unsigned int *) frs; 194 unsigned int *p = (unsigned int *)frs;
193 int i; 195 int i;
194 196
195 k8_check_syscfg_dram_mod_en(); 197 k8_check_syscfg_dram_mod_en();
@@ -217,22 +219,22 @@ static void __init print_fixed_last(void)
217 if (!last_fixed_end) 219 if (!last_fixed_end)
218 return; 220 return;
219 221
220 printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, 222 pr_debug(" %05X-%05X %s\n", last_fixed_start,
221 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); 223 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
222 224
223 last_fixed_end = 0; 225 last_fixed_end = 0;
224} 226}
225 227
226static void __init update_fixed_last(unsigned base, unsigned end, 228static void __init update_fixed_last(unsigned base, unsigned end,
227 mtrr_type type) 229 mtrr_type type)
228{ 230{
229 last_fixed_start = base; 231 last_fixed_start = base;
230 last_fixed_end = end; 232 last_fixed_end = end;
231 last_fixed_type = type; 233 last_fixed_type = type;
232} 234}
233 235
234static void __init print_fixed(unsigned base, unsigned step, 236static void __init
235 const mtrr_type *types) 237print_fixed(unsigned base, unsigned step, const mtrr_type *types)
236{ 238{
237 unsigned i; 239 unsigned i;
238 240
@@ -259,54 +261,55 @@ static void __init print_mtrr_state(void)
259 unsigned int i; 261 unsigned int i;
260 int high_width; 262 int high_width;
261 263
262 printk(KERN_DEBUG "MTRR default type: %s\n", 264 pr_debug("MTRR default type: %s\n",
263 mtrr_attrib_to_str(mtrr_state.def_type)); 265 mtrr_attrib_to_str(mtrr_state.def_type));
264 if (mtrr_state.have_fixed) { 266 if (mtrr_state.have_fixed) {
265 printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", 267 pr_debug("MTRR fixed ranges %sabled:\n",
266 mtrr_state.enabled & 1 ? "en" : "dis"); 268 mtrr_state.enabled & 1 ? "en" : "dis");
267 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); 269 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
268 for (i = 0; i < 2; ++i) 270 for (i = 0; i < 2; ++i)
269 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); 271 print_fixed(0x80000 + i * 0x20000, 0x04000,
272 mtrr_state.fixed_ranges + (i + 1) * 8);
270 for (i = 0; i < 8; ++i) 273 for (i = 0; i < 8; ++i)
271 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); 274 print_fixed(0xC0000 + i * 0x08000, 0x01000,
275 mtrr_state.fixed_ranges + (i + 3) * 8);
272 276
273 /* tail */ 277 /* tail */
274 print_fixed_last(); 278 print_fixed_last();
275 } 279 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", 280 pr_debug("MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis"); 281 mtrr_state.enabled & 2 ? "en" : "dis");
278 if (size_or_mask & 0xffffffffUL) 282 if (size_or_mask & 0xffffffffUL)
279 high_width = ffs(size_or_mask & 0xffffffffUL) - 1; 283 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
280 else 284 else
281 high_width = ffs(size_or_mask>>32) + 32 - 1; 285 high_width = ffs(size_or_mask>>32) + 32 - 1;
282 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; 286 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
287
283 for (i = 0; i < num_var_ranges; ++i) { 288 for (i = 0; i < num_var_ranges; ++i) {
284 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 289 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
285 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", 290 pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
286 i, 291 i,
287 high_width, 292 high_width,
288 mtrr_state.var_ranges[i].base_hi, 293 mtrr_state.var_ranges[i].base_hi,
289 mtrr_state.var_ranges[i].base_lo >> 12, 294 mtrr_state.var_ranges[i].base_lo >> 12,
290 high_width, 295 high_width,
291 mtrr_state.var_ranges[i].mask_hi, 296 mtrr_state.var_ranges[i].mask_hi,
292 mtrr_state.var_ranges[i].mask_lo >> 12, 297 mtrr_state.var_ranges[i].mask_lo >> 12,
293 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); 298 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
294 else 299 else
295 printk(KERN_DEBUG " %u disabled\n", i); 300 pr_debug(" %u disabled\n", i);
296 }
297 if (mtrr_tom2) {
298 printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
299 mtrr_tom2, mtrr_tom2>>20);
300 } 301 }
302 if (mtrr_tom2)
303 pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
301} 304}
302 305
303/* Grab all of the MTRR state for this CPU into *state */ 306/* Grab all of the MTRR state for this CPU into *state */
304void __init get_mtrr_state(void) 307void __init get_mtrr_state(void)
305{ 308{
306 unsigned int i;
307 struct mtrr_var_range *vrs; 309 struct mtrr_var_range *vrs;
308 unsigned lo, dummy;
309 unsigned long flags; 310 unsigned long flags;
311 unsigned lo, dummy;
312 unsigned int i;
310 313
311 vrs = mtrr_state.var_ranges; 314 vrs = mtrr_state.var_ranges;
312 315
@@ -324,6 +327,7 @@ void __init get_mtrr_state(void)
324 327
325 if (amd_special_default_mtrr()) { 328 if (amd_special_default_mtrr()) {
326 unsigned low, high; 329 unsigned low, high;
330
327 /* TOP_MEM2 */ 331 /* TOP_MEM2 */
328 rdmsr(MSR_K8_TOP_MEM2, low, high); 332 rdmsr(MSR_K8_TOP_MEM2, low, high);
329 mtrr_tom2 = high; 333 mtrr_tom2 = high;
@@ -344,10 +348,9 @@ void __init get_mtrr_state(void)
344 348
345 post_set(); 349 post_set();
346 local_irq_restore(flags); 350 local_irq_restore(flags);
347
348} 351}
349 352
350/* Some BIOS's are fucked and don't set all MTRRs the same! */ 353/* Some BIOS's are messed up and don't set all MTRRs the same! */
351void __init mtrr_state_warn(void) 354void __init mtrr_state_warn(void)
352{ 355{
353 unsigned long mask = smp_changes_mask; 356 unsigned long mask = smp_changes_mask;
@@ -355,28 +358,33 @@ void __init mtrr_state_warn(void)
355 if (!mask) 358 if (!mask)
356 return; 359 return;
357 if (mask & MTRR_CHANGE_MASK_FIXED) 360 if (mask & MTRR_CHANGE_MASK_FIXED)
358 printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); 361 pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
359 if (mask & MTRR_CHANGE_MASK_VARIABLE) 362 if (mask & MTRR_CHANGE_MASK_VARIABLE)
360 printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); 363 pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
361 if (mask & MTRR_CHANGE_MASK_DEFTYPE) 364 if (mask & MTRR_CHANGE_MASK_DEFTYPE)
362 printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); 365 pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
366
363 printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); 367 printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
364 printk(KERN_INFO "mtrr: corrected configuration.\n"); 368 printk(KERN_INFO "mtrr: corrected configuration.\n");
365} 369}
366 370
367/* Doesn't attempt to pass an error out to MTRR users 371/*
368 because it's quite complicated in some cases and probably not 372 * Doesn't attempt to pass an error out to MTRR users
369 worth it because the best error handling is to ignore it. */ 373 * because it's quite complicated in some cases and probably not
374 * worth it because the best error handling is to ignore it.
375 */
370void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) 376void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
371{ 377{
372 if (wrmsr_safe(msr, a, b) < 0) 378 if (wrmsr_safe(msr, a, b) < 0) {
373 printk(KERN_ERR 379 printk(KERN_ERR
374 "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", 380 "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
375 smp_processor_id(), msr, a, b); 381 smp_processor_id(), msr, a, b);
382 }
376} 383}
377 384
378/** 385/**
379 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have 386 * set_fixed_range - checks & updates a fixed-range MTRR if it
387 * differs from the value it should have
380 * @msr: MSR address of the MTTR which should be checked and updated 388 * @msr: MSR address of the MTTR which should be checked and updated
381 * @changed: pointer which indicates whether the MTRR needed to be changed 389 * @changed: pointer which indicates whether the MTRR needed to be changed
382 * @msrwords: pointer to the MSR values which the MSR should have 390 * @msrwords: pointer to the MSR values which the MSR should have
@@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
401 * 409 *
402 * Returns: The index of the region on success, else negative on error. 410 * Returns: The index of the region on success, else negative on error.
403 */ 411 */
404int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) 412int
413generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
405{ 414{
406 int i, max;
407 mtrr_type ltype;
408 unsigned long lbase, lsize; 415 unsigned long lbase, lsize;
416 mtrr_type ltype;
417 int i, max;
409 418
410 max = num_var_ranges; 419 max = num_var_ranges;
411 if (replace_reg >= 0 && replace_reg < max) 420 if (replace_reg >= 0 && replace_reg < max)
412 return replace_reg; 421 return replace_reg;
422
413 for (i = 0; i < max; ++i) { 423 for (i = 0; i < max; ++i) {
414 mtrr_if->get(i, &lbase, &lsize, &ltype); 424 mtrr_if->get(i, &lbase, &lsize, &ltype);
415 if (lsize == 0) 425 if (lsize == 0)
416 return i; 426 return i;
417 } 427 }
428
418 return -ENOSPC; 429 return -ENOSPC;
419} 430}
420 431
@@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
434 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 445 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
435 446
436 if ((mask_lo & 0x800) == 0) { 447 if ((mask_lo & 0x800) == 0) {
437 /* Invalid (i.e. free) range */ 448 /* Invalid (i.e. free) range */
438 *base = 0; 449 *base = 0;
439 *size = 0; 450 *size = 0;
440 *type = 0; 451 *type = 0;
@@ -471,27 +482,31 @@ out_put_cpu:
471} 482}
472 483
473/** 484/**
474 * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set 485 * set_fixed_ranges - checks & updates the fixed-range MTRRs if they
486 * differ from the saved set
475 * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() 487 * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
476 */ 488 */
477static int set_fixed_ranges(mtrr_type * frs) 489static int set_fixed_ranges(mtrr_type *frs)
478{ 490{
479 unsigned long long *saved = (unsigned long long *) frs; 491 unsigned long long *saved = (unsigned long long *)frs;
480 bool changed = false; 492 bool changed = false;
481 int block=-1, range; 493 int block = -1, range;
482 494
483 k8_check_syscfg_dram_mod_en(); 495 k8_check_syscfg_dram_mod_en();
484 496
485 while (fixed_range_blocks[++block].ranges) 497 while (fixed_range_blocks[++block].ranges) {
486 for (range=0; range < fixed_range_blocks[block].ranges; range++) 498 for (range = 0; range < fixed_range_blocks[block].ranges; range++)
487 set_fixed_range(fixed_range_blocks[block].base_msr + range, 499 set_fixed_range(fixed_range_blocks[block].base_msr + range,
488 &changed, (unsigned int *) saved++); 500 &changed, (unsigned int *)saved++);
501 }
489 502
490 return changed; 503 return changed;
491} 504}
492 505
493/* Set the MSR pair relating to a var range. Returns TRUE if 506/*
494 changes are made */ 507 * Set the MSR pair relating to a var range.
508 * Returns true if changes are made.
509 */
495static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) 510static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
496{ 511{
497 unsigned int lo, hi; 512 unsigned int lo, hi;
@@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
501 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) 516 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
502 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 517 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
503 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 518 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
519
504 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); 520 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
505 changed = true; 521 changed = true;
506 } 522 }
@@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi;
526 */ 542 */
527static unsigned long set_mtrr_state(void) 543static unsigned long set_mtrr_state(void)
528{ 544{
529 unsigned int i;
530 unsigned long change_mask = 0; 545 unsigned long change_mask = 0;
546 unsigned int i;
531 547
532 for (i = 0; i < num_var_ranges; i++) 548 for (i = 0; i < num_var_ranges; i++) {
533 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) 549 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
534 change_mask |= MTRR_CHANGE_MASK_VARIABLE; 550 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
551 }
535 552
536 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) 553 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
537 change_mask |= MTRR_CHANGE_MASK_FIXED; 554 change_mask |= MTRR_CHANGE_MASK_FIXED;
538 555
539 /* Set_mtrr_restore restores the old value of MTRRdefType, 556 /*
540 so to set it we fiddle with the saved value */ 557 * Set_mtrr_restore restores the old value of MTRRdefType,
558 * so to set it we fiddle with the saved value:
559 */
541 if ((deftype_lo & 0xff) != mtrr_state.def_type 560 if ((deftype_lo & 0xff) != mtrr_state.def_type
542 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { 561 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
543 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); 562
563 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
564 (mtrr_state.enabled << 10);
544 change_mask |= MTRR_CHANGE_MASK_DEFTYPE; 565 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
545 } 566 }
546 567
@@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void)
548} 569}
549 570
550 571
551static unsigned long cr4 = 0; 572static unsigned long cr4;
552static DEFINE_SPINLOCK(set_atomicity_lock); 573static DEFINE_SPINLOCK(set_atomicity_lock);
553 574
554/* 575/*
555 * Since we are disabling the cache don't allow any interrupts - they 576 * Since we are disabling the cache don't allow any interrupts,
556 * would run extremely slow and would only increase the pain. The caller must 577 * they would run extremely slow and would only increase the pain.
557 * ensure that local interrupts are disabled and are reenabled after post_set() 578 *
558 * has been called. 579 * The caller must ensure that local interrupts are disabled and
580 * are reenabled after post_set() has been called.
559 */ 581 */
560
561static void prepare_set(void) __acquires(set_atomicity_lock) 582static void prepare_set(void) __acquires(set_atomicity_lock)
562{ 583{
563 unsigned long cr0; 584 unsigned long cr0;
564 585
565 /* Note that this is not ideal, since the cache is only flushed/disabled 586 /*
566 for this CPU while the MTRRs are changed, but changing this requires 587 * Note that this is not ideal
567 more invasive changes to the way the kernel boots */ 588 * since the cache is only flushed/disabled for this CPU while the
589 * MTRRs are changed, but changing this requires more invasive
590 * changes to the way the kernel boots
591 */
568 592
569 spin_lock(&set_atomicity_lock); 593 spin_lock(&set_atomicity_lock);
570 594
571 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
572 cr0 = read_cr0() | X86_CR0_CD; 596 cr0 = read_cr0() | X86_CR0_CD;
573 write_cr0(cr0); 597 write_cr0(cr0);
574 wbinvd(); 598 wbinvd();
575 599
576 /* Save value of CR4 and clear Page Global Enable (bit 7) */ 600 /* Save value of CR4 and clear Page Global Enable (bit 7) */
577 if ( cpu_has_pge ) { 601 if (cpu_has_pge) {
578 cr4 = read_cr4(); 602 cr4 = read_cr4();
579 write_cr4(cr4 & ~X86_CR4_PGE); 603 write_cr4(cr4 & ~X86_CR4_PGE);
580 } 604 }
@@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
582 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ 606 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
583 __flush_tlb(); 607 __flush_tlb();
584 608
585 /* Save MTRR state */ 609 /* Save MTRR state */
586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); 610 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
587 611
588 /* Disable MTRRs, and set the default type to uncached */ 612 /* Disable MTRRs, and set the default type to uncached */
589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); 613 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
590} 614}
591 615
592static void post_set(void) __releases(set_atomicity_lock) 616static void post_set(void) __releases(set_atomicity_lock)
593{ 617{
594 /* Flush TLBs (no need to flush caches - they are disabled) */ 618 /* Flush TLBs (no need to flush caches - they are disabled) */
595 __flush_tlb(); 619 __flush_tlb();
596 620
597 /* Intel (P6) standard MTRRs */ 621 /* Intel (P6) standard MTRRs */
598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); 622 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
599 623
600 /* Enable caches */ 624 /* Enable caches */
601 write_cr0(read_cr0() & 0xbfffffff); 625 write_cr0(read_cr0() & 0xbfffffff);
602 626
603 /* Restore value of CR4 */ 627 /* Restore value of CR4 */
604 if ( cpu_has_pge ) 628 if (cpu_has_pge)
605 write_cr4(cr4); 629 write_cr4(cr4);
606 spin_unlock(&set_atomicity_lock); 630 spin_unlock(&set_atomicity_lock);
607} 631}
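
The comment block above spells out the contract: callers must keep local interrupts off for the whole window between prepare_set() and post_set(), because the CPU runs with its caches disabled in between. That is the pattern generic_set_all() and generic_set_mtrr() follow later in this file; as a kernel-context sketch (assuming the helpers visible in this diff), the caller side looks like:

    /* Caller-side pattern around prepare_set()/post_set(); update_msrs()
     * stands in for whatever MTRR MSR writes the caller wants to do. */
    static void set_mtrr_on_this_cpu(void (*update_msrs)(void))
    {
        unsigned long flags;

        local_irq_save(flags);
        prepare_set();          /* CD=1, wbinvd, MTRRs temporarily off   */

        update_msrs();          /* rewrite variable and/or fixed MTRRs   */

        post_set();             /* restore MTRRdefType, re-enable caches */
        local_irq_restore(flags);
    }
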
@@ -623,24 +647,27 @@ static void generic_set_all(void)
623 post_set(); 647 post_set();
624 local_irq_restore(flags); 648 local_irq_restore(flags);
625 649
626 /* Use the atomic bitops to update the global mask */ 650 /* Use the atomic bitops to update the global mask */
627 for (count = 0; count < sizeof mask * 8; ++count) { 651 for (count = 0; count < sizeof mask * 8; ++count) {
628 if (mask & 0x01) 652 if (mask & 0x01)
629 set_bit(count, &smp_changes_mask); 653 set_bit(count, &smp_changes_mask);
630 mask >>= 1; 654 mask >>= 1;
631 } 655 }
632 656
633} 657}
634 658
659/**
660 * generic_set_mtrr - set variable MTRR register on the local CPU.
661 *
662 * @reg: The register to set.
663 * @base: The base address of the region.
664 * @size: The size of the region. If this is 0 the region is disabled.
665 * @type: The type of the region.
666 *
667 * Returns nothing.
668 */
635static void generic_set_mtrr(unsigned int reg, unsigned long base, 669static void generic_set_mtrr(unsigned int reg, unsigned long base,
636 unsigned long size, mtrr_type type) 670 unsigned long size, mtrr_type type)
637/* [SUMMARY] Set variable MTRR register on the local CPU.
638 <reg> The register to set.
639 <base> The base address of the region.
640 <size> The size of the region. If this is 0 the region is disabled.
641 <type> The type of the region.
642 [RETURNS] Nothing.
643*/
644{ 671{
645 unsigned long flags; 672 unsigned long flags;
646 struct mtrr_var_range *vr; 673 struct mtrr_var_range *vr;
@@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
651 prepare_set(); 678 prepare_set();
652 679
653 if (size == 0) { 680 if (size == 0) {
654 /* The invalid bit is kept in the mask, so we simply clear the 681 /*
655 relevant mask register to disable a range. */ 682 * The invalid bit is kept in the mask, so we simply
683 * clear the relevant mask register to disable a range.
684 */
656 mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); 685 mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
657 memset(vr, 0, sizeof(struct mtrr_var_range)); 686 memset(vr, 0, sizeof(struct mtrr_var_range));
658 } else { 687 } else {
@@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
669 local_irq_restore(flags); 698 local_irq_restore(flags);
670} 699}
671 700
672int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) 701int generic_validate_add_page(unsigned long base, unsigned long size,
702 unsigned int type)
673{ 703{
674 unsigned long lbase, last; 704 unsigned long lbase, last;
675 705
676 /* For Intel PPro stepping <= 7, must be 4 MiB aligned 706 /*
677 and not touch 0x70000000->0x7003FFFF */ 707 * For Intel PPro stepping <= 7
708 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
709 */
678 if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && 710 if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
679 boot_cpu_data.x86_model == 1 && 711 boot_cpu_data.x86_model == 1 &&
680 boot_cpu_data.x86_mask <= 7) { 712 boot_cpu_data.x86_mask <= 7) {
681 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { 713 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
682 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); 714 pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
683 return -EINVAL; 715 return -EINVAL;
684 } 716 }
685 if (!(base + size < 0x70000 || base > 0x7003F) && 717 if (!(base + size < 0x70000 || base > 0x7003F) &&
686 (type == MTRR_TYPE_WRCOMB 718 (type == MTRR_TYPE_WRCOMB
687 || type == MTRR_TYPE_WRBACK)) { 719 || type == MTRR_TYPE_WRBACK)) {
688 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); 720 pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
689 return -EINVAL; 721 return -EINVAL;
690 } 722 }
691 } 723 }
692 724
693 /* Check upper bits of base and last are equal and lower bits are 0 725 /*
694 for base and 1 for last */ 726 * Check upper bits of base and last are equal and lower bits are 0
727 * for base and 1 for last
728 */
695 last = base + size - 1; 729 last = base + size - 1;
696 for (lbase = base; !(lbase & 1) && (last & 1); 730 for (lbase = base; !(lbase & 1) && (last & 1);
697 lbase = lbase >> 1, last = last >> 1) ; 731 lbase = lbase >> 1, last = last >> 1)
732 ;
698 if (lbase != last) { 733 if (lbase != last) {
699 printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", 734 pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
700 base, size);
701 return -EINVAL; 735 return -EINVAL;
702 } 736 }
703 return 0; 737 return 0;
704} 738}
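
The shift loop in generic_validate_add_page() ("upper bits of base and last are equal and lower bits are 0 for base and 1 for last") is an alignment test: it succeeds exactly when size is a power of two and base is a multiple of size. The same check written with plain bit tricks, as a user-space sketch:

    #include <stdbool.h>

    /* Equivalent of the lbase/last shift loop: the range is acceptable
     * only if size (in pages) is a power of two and base is aligned to
     * that size. */
    static bool range_is_aligned(unsigned long base, unsigned long size)
    {
        if (size == 0 || (size & (size - 1)))   /* power of two?      */
            return false;
        return (base & (size - 1)) == 0;        /* size-aligned base? */
    }
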
705 739
706
707static int generic_have_wrcomb(void) 740static int generic_have_wrcomb(void)
708{ 741{
709 unsigned long config, dummy; 742 unsigned long config, dummy;
710 rdmsr(MSR_MTRRcap, config, dummy); 743 rdmsr(MSR_MTRRcap, config, dummy);
711 return (config & (1 << 10)); 744 return config & (1 << 10);
712} 745}
713 746
714int positive_have_wrcomb(void) 747int positive_have_wrcomb(void)
@@ -716,14 +749,15 @@ int positive_have_wrcomb(void)
716 return 1; 749 return 1;
717} 750}
718 751
719/* generic structure... 752/*
753 * Generic structure...
720 */ 754 */
721struct mtrr_ops generic_mtrr_ops = { 755struct mtrr_ops generic_mtrr_ops = {
722 .use_intel_if = 1, 756 .use_intel_if = 1,
723 .set_all = generic_set_all, 757 .set_all = generic_set_all,
724 .get = generic_get_mtrr, 758 .get = generic_get_mtrr,
725 .get_free_region = generic_get_free_region, 759 .get_free_region = generic_get_free_region,
726 .set = generic_set_mtrr, 760 .set = generic_set_mtrr,
727 .validate_add_page = generic_validate_add_page, 761 .validate_add_page = generic_validate_add_page,
728 .have_wrcomb = generic_have_wrcomb, 762 .have_wrcomb = generic_have_wrcomb,
729}; 763};
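
generic_mtrr_ops, like the Cyrix table earlier in this diff, is a vtable: each vendor fills in a struct of callbacks and the core dispatches through a single mtrr_if pointer (mtrr_if->get, mtrr_if->set, ...). A stripped-down model of that pattern, with stub implementations for illustration only:

    /* Minimal model of the mtrr_ops dispatch used by the core code. */
    struct demo_mtrr_ops {
        void (*get)(unsigned int reg, unsigned long *base,
                    unsigned long *size, unsigned char *type);
        void (*set)(unsigned int reg, unsigned long base,
                    unsigned long size, unsigned char type);
    };

    static void demo_get(unsigned int reg, unsigned long *base,
                         unsigned long *size, unsigned char *type)
    {
        (void)reg;
        *base = 0; *size = 0; *type = 0;    /* report the slot as free */
    }

    static void demo_set(unsigned int reg, unsigned long base,
                         unsigned long size, unsigned char type)
    {
        (void)reg; (void)base; (void)size; (void)type;
        /* a real implementation programs the vendor's registers here */
    }

    static const struct demo_mtrr_ops demo_ops = {
        .get = demo_get,
        .set = demo_set,
    };
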
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index fb73a52913a4..f04e72527604 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -1,27 +1,28 @@
1#include <linux/init.h>
2#include <linux/proc_fs.h>
3#include <linux/capability.h> 1#include <linux/capability.h>
4#include <linux/ctype.h>
5#include <linux/module.h>
6#include <linux/seq_file.h> 2#include <linux/seq_file.h>
7#include <asm/uaccess.h> 3#include <linux/uaccess.h>
4#include <linux/proc_fs.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/init.h>
8 8
9#define LINE_SIZE 80 9#define LINE_SIZE 80
10 10
11#include <asm/mtrr.h> 11#include <asm/mtrr.h>
12
12#include "mtrr.h" 13#include "mtrr.h"
13 14
14#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 15#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
15 16
16static const char *const mtrr_strings[MTRR_NUM_TYPES] = 17static const char *const mtrr_strings[MTRR_NUM_TYPES] =
17{ 18{
18 "uncachable", /* 0 */ 19 "uncachable", /* 0 */
19 "write-combining", /* 1 */ 20 "write-combining", /* 1 */
20 "?", /* 2 */ 21 "?", /* 2 */
21 "?", /* 3 */ 22 "?", /* 3 */
22 "write-through", /* 4 */ 23 "write-through", /* 4 */
23 "write-protect", /* 5 */ 24 "write-protect", /* 5 */
24 "write-back", /* 6 */ 25 "write-back", /* 6 */
25}; 26};
26 27
27const char *mtrr_attrib_to_str(int x) 28const char *mtrr_attrib_to_str(int x)
@@ -35,8 +36,8 @@ static int
35mtrr_file_add(unsigned long base, unsigned long size, 36mtrr_file_add(unsigned long base, unsigned long size,
36 unsigned int type, bool increment, struct file *file, int page) 37 unsigned int type, bool increment, struct file *file, int page)
37{ 38{
39 unsigned int *fcount = FILE_FCOUNT(file);
38 int reg, max; 40 int reg, max;
39 unsigned int *fcount = FILE_FCOUNT(file);
40 41
41 max = num_var_ranges; 42 max = num_var_ranges;
42 if (fcount == NULL) { 43 if (fcount == NULL) {
@@ -61,8 +62,8 @@ static int
61mtrr_file_del(unsigned long base, unsigned long size, 62mtrr_file_del(unsigned long base, unsigned long size,
62 struct file *file, int page) 63 struct file *file, int page)
63{ 64{
64 int reg;
65 unsigned int *fcount = FILE_FCOUNT(file); 65 unsigned int *fcount = FILE_FCOUNT(file);
66 int reg;
66 67
67 if (!page) { 68 if (!page) {
68 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) 69 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
@@ -81,13 +82,14 @@ mtrr_file_del(unsigned long base, unsigned long size,
81 return reg; 82 return reg;
82} 83}
83 84
84/* RED-PEN: seq_file can seek now. this is ignored. */ 85/*
86 * seq_file can seek but we ignore it.
87 *
88 * Format of control line:
89 * "base=%Lx size=%Lx type=%s" or "disable=%d"
90 */
85static ssize_t 91static ssize_t
86mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) 92mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
87/* Format of control line:
88 "base=%Lx size=%Lx type=%s" OR:
89 "disable=%d"
90*/
91{ 93{
92 int i, err; 94 int i, err;
93 unsigned long reg; 95 unsigned long reg;
@@ -100,15 +102,18 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
100 return -EPERM; 102 return -EPERM;
101 if (!len) 103 if (!len)
102 return -EINVAL; 104 return -EINVAL;
105
103 memset(line, 0, LINE_SIZE); 106 memset(line, 0, LINE_SIZE);
104 if (len > LINE_SIZE) 107 if (len > LINE_SIZE)
105 len = LINE_SIZE; 108 len = LINE_SIZE;
106 if (copy_from_user(line, buf, len - 1)) 109 if (copy_from_user(line, buf, len - 1))
107 return -EFAULT; 110 return -EFAULT;
111
108 linelen = strlen(line); 112 linelen = strlen(line);
109 ptr = line + linelen - 1; 113 ptr = line + linelen - 1;
110 if (linelen && *ptr == '\n') 114 if (linelen && *ptr == '\n')
111 *ptr = '\0'; 115 *ptr = '\0';
116
112 if (!strncmp(line, "disable=", 8)) { 117 if (!strncmp(line, "disable=", 8)) {
113 reg = simple_strtoul(line + 8, &ptr, 0); 118 reg = simple_strtoul(line + 8, &ptr, 0);
114 err = mtrr_del_page(reg, 0, 0); 119 err = mtrr_del_page(reg, 0, 0);
@@ -116,28 +121,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
116 return err; 121 return err;
117 return len; 122 return len;
118 } 123 }
124
119 if (strncmp(line, "base=", 5)) 125 if (strncmp(line, "base=", 5))
120 return -EINVAL; 126 return -EINVAL;
127
121 base = simple_strtoull(line + 5, &ptr, 0); 128 base = simple_strtoull(line + 5, &ptr, 0);
122 for (; isspace(*ptr); ++ptr) ; 129 while (isspace(*ptr))
130 ptr++;
131
123 if (strncmp(ptr, "size=", 5)) 132 if (strncmp(ptr, "size=", 5))
124 return -EINVAL; 133 return -EINVAL;
134
125 size = simple_strtoull(ptr + 5, &ptr, 0); 135 size = simple_strtoull(ptr + 5, &ptr, 0);
126 if ((base & 0xfff) || (size & 0xfff)) 136 if ((base & 0xfff) || (size & 0xfff))
127 return -EINVAL; 137 return -EINVAL;
128 for (; isspace(*ptr); ++ptr) ; 138 while (isspace(*ptr))
139 ptr++;
140
129 if (strncmp(ptr, "type=", 5)) 141 if (strncmp(ptr, "type=", 5))
130 return -EINVAL; 142 return -EINVAL;
131 ptr += 5; 143 ptr += 5;
132 for (; isspace(*ptr); ++ptr) ; 144 while (isspace(*ptr))
145 ptr++;
146
133 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 147 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
134 if (strcmp(ptr, mtrr_strings[i])) 148 if (strcmp(ptr, mtrr_strings[i]))
135 continue; 149 continue;
136 base >>= PAGE_SHIFT; 150 base >>= PAGE_SHIFT;
137 size >>= PAGE_SHIFT; 151 size >>= PAGE_SHIFT;
138 err = 152 err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
139 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
140 true);
141 if (err < 0) 153 if (err < 0)
142 return err; 154 return err;
143 return len; 155 return len;
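
As the rewritten comment above notes, mtrr_write() accepts control lines of the form "base=%Lx size=%Lx type=%s" or "disable=%d". A small user-space parser for the same two forms, for illustration only (the kernel uses simple_strtoull() and its own whitespace skipping rather than sscanf):

    #include <stdio.h>
    #include <string.h>

    struct mtrr_req {
        unsigned long long base;
        unsigned long long size;
        char               type[32];
        int                disable;    /* register to disable, or -1 */
    };

    /* Parse "disable=N" or "base=0x... size=0x... type=write-combining".
     * Returns 0 on success, -1 on a malformed line. */
    static int parse_mtrr_line(const char *line, struct mtrr_req *req)
    {
        memset(req, 0, sizeof(*req));
        req->disable = -1;

        if (sscanf(line, "disable=%d", &req->disable) == 1)
            return 0;

        if (sscanf(line, "base=%llx size=%llx type=%31s",
                   &req->base, &req->size, req->type) == 3)
            return 0;

        return -1;
    }
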
@@ -181,7 +193,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
181 case MTRRIOC32_SET_PAGE_ENTRY: 193 case MTRRIOC32_SET_PAGE_ENTRY:
182 case MTRRIOC32_DEL_PAGE_ENTRY: 194 case MTRRIOC32_DEL_PAGE_ENTRY:
183 case MTRRIOC32_KILL_PAGE_ENTRY: { 195 case MTRRIOC32_KILL_PAGE_ENTRY: {
184 struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; 196 struct mtrr_sentry32 __user *s32;
197
198 s32 = (struct mtrr_sentry32 __user *)__arg;
185 err = get_user(sentry.base, &s32->base); 199 err = get_user(sentry.base, &s32->base);
186 err |= get_user(sentry.size, &s32->size); 200 err |= get_user(sentry.size, &s32->size);
187 err |= get_user(sentry.type, &s32->type); 201 err |= get_user(sentry.type, &s32->type);
@@ -191,7 +205,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
191 } 205 }
192 case MTRRIOC32_GET_ENTRY: 206 case MTRRIOC32_GET_ENTRY:
193 case MTRRIOC32_GET_PAGE_ENTRY: { 207 case MTRRIOC32_GET_PAGE_ENTRY: {
194 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; 208 struct mtrr_gentry32 __user *g32;
209
210 g32 = (struct mtrr_gentry32 __user *)__arg;
195 err = get_user(gentry.regnum, &g32->regnum); 211 err = get_user(gentry.regnum, &g32->regnum);
196 err |= get_user(gentry.base, &g32->base); 212 err |= get_user(gentry.base, &g32->base);
197 err |= get_user(gentry.size, &g32->size); 213 err |= get_user(gentry.size, &g32->size);
@@ -314,7 +330,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
314 if (err) 330 if (err)
315 return err; 331 return err;
316 332
317 switch(cmd) { 333 switch (cmd) {
318 case MTRRIOC_GET_ENTRY: 334 case MTRRIOC_GET_ENTRY:
319 case MTRRIOC_GET_PAGE_ENTRY: 335 case MTRRIOC_GET_PAGE_ENTRY:
320 if (copy_to_user(arg, &gentry, sizeof gentry)) 336 if (copy_to_user(arg, &gentry, sizeof gentry))
@@ -323,7 +339,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
323#ifdef CONFIG_COMPAT 339#ifdef CONFIG_COMPAT
324 case MTRRIOC32_GET_ENTRY: 340 case MTRRIOC32_GET_ENTRY:
325 case MTRRIOC32_GET_PAGE_ENTRY: { 341 case MTRRIOC32_GET_PAGE_ENTRY: {
326 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; 342 struct mtrr_gentry32 __user *g32;
343
344 g32 = (struct mtrr_gentry32 __user *)__arg;
327 err = put_user(gentry.base, &g32->base); 345 err = put_user(gentry.base, &g32->base);
328 err |= put_user(gentry.size, &g32->size); 346 err |= put_user(gentry.size, &g32->size);
329 err |= put_user(gentry.regnum, &g32->regnum); 347 err |= put_user(gentry.regnum, &g32->regnum);
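
Besides the text interface, mtrr_ioctl() above services the binary ioctl commands, with the MTRRIOC32_* cases covering 32-bit callers on a 64-bit kernel. A sketch of the native path from userspace, with illustrative region values:

#include <asm/mtrr.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct mtrr_sentry sentry = {
		.base = 0xd8000000,	/* illustrative aperture base */
		.size = 0x100000,	/* 1 MB */
		.type = MTRR_TYPE_WRCOMB,
	};
	int fd = open("/proc/mtrr", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, MTRRIOC_ADD_ENTRY, &sentry) < 0)
		perror("MTRRIOC_ADD_ENTRY");
	close(fd);
	return 0;
}
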
@@ -335,11 +353,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
335 return err; 353 return err;
336} 354}
337 355
338static int 356static int mtrr_close(struct inode *ino, struct file *file)
339mtrr_close(struct inode *ino, struct file *file)
340{ 357{
341 int i, max;
342 unsigned int *fcount = FILE_FCOUNT(file); 358 unsigned int *fcount = FILE_FCOUNT(file);
359 int i, max;
343 360
344 if (fcount != NULL) { 361 if (fcount != NULL) {
345 max = num_var_ranges; 362 max = num_var_ranges;
@@ -359,22 +376,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset);
359 376
360static int mtrr_open(struct inode *inode, struct file *file) 377static int mtrr_open(struct inode *inode, struct file *file)
361{ 378{
362 if (!mtrr_if) 379 if (!mtrr_if)
363 return -EIO; 380 return -EIO;
364 if (!mtrr_if->get) 381 if (!mtrr_if->get)
365 return -ENXIO; 382 return -ENXIO;
366 return single_open(file, mtrr_seq_show, NULL); 383 return single_open(file, mtrr_seq_show, NULL);
367} 384}
368 385
369static const struct file_operations mtrr_fops = { 386static const struct file_operations mtrr_fops = {
370 .owner = THIS_MODULE, 387 .owner = THIS_MODULE,
371 .open = mtrr_open, 388 .open = mtrr_open,
372 .read = seq_read, 389 .read = seq_read,
373 .llseek = seq_lseek, 390 .llseek = seq_lseek,
374 .write = mtrr_write, 391 .write = mtrr_write,
375 .unlocked_ioctl = mtrr_ioctl, 392 .unlocked_ioctl = mtrr_ioctl,
376 .compat_ioctl = mtrr_ioctl, 393 .compat_ioctl = mtrr_ioctl,
377 .release = mtrr_close, 394 .release = mtrr_close,
378}; 395};
379 396
380static int mtrr_seq_show(struct seq_file *seq, void *offset) 397static int mtrr_seq_show(struct seq_file *seq, void *offset)
@@ -388,23 +405,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
388 max = num_var_ranges; 405 max = num_var_ranges;
389 for (i = 0; i < max; i++) { 406 for (i = 0; i < max; i++) {
390 mtrr_if->get(i, &base, &size, &type); 407 mtrr_if->get(i, &base, &size, &type);
391 if (size == 0) 408 if (size == 0) {
392 mtrr_usage_table[i] = 0; 409 mtrr_usage_table[i] = 0;
393 else { 410 continue;
394 if (size < (0x100000 >> PAGE_SHIFT)) {
395 /* less than 1MB */
396 factor = 'K';
397 size <<= PAGE_SHIFT - 10;
398 } else {
399 factor = 'M';
400 size >>= 20 - PAGE_SHIFT;
401 }
402 /* RED-PEN: base can be > 32bit */
403 len += seq_printf(seq,
404 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
405 i, base, base >> (20 - PAGE_SHIFT), size, factor,
406 mtrr_usage_table[i], mtrr_attrib_to_str(type));
407 } 411 }
412 if (size < (0x100000 >> PAGE_SHIFT)) {
413 /* less than 1MB */
414 factor = 'K';
415 size <<= PAGE_SHIFT - 10;
416 } else {
417 factor = 'M';
418 size >>= 20 - PAGE_SHIFT;
419 }
420 /* Base can be > 32bit */
421 len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
422 "(%5luMB), size=%5lu%cB, count=%d: %s\n",
423 i, base, base >> (20 - PAGE_SHIFT), size,
424 factor, mtrr_usage_table[i],
425 mtrr_attrib_to_str(type));
408 } 426 }
409 return 0; 427 return 0;
410} 428}
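
For reference, the reworked seq_printf() format above makes a read of /proc/mtrr come out along these lines (the register contents shown are illustrative, not from the patch):

reg00: base=0x000000000 (    0MB), size= 2048MB, count=1: write-back
reg01: base=0x0d8000000 ( 3456MB), size=    1MB, count=1: write-combining
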
@@ -422,6 +440,5 @@ static int __init mtrr_if_init(void)
422 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); 440 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
423 return 0; 441 return 0;
424} 442}
425
426arch_initcall(mtrr_if_init); 443arch_initcall(mtrr_if_init);
427#endif /* CONFIG_PROC_FS */ 444#endif /* CONFIG_PROC_FS */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 8fc248b5aeaf..84e83de54575 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -25,43 +25,49 @@
25 Operating System Writer's Guide" (Intel document number 242692), 25 Operating System Writer's Guide" (Intel document number 242692),
26 section 11.11.7 26 section 11.11.7
27 27
28 This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> 28 This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
29 on 6-7 March 2002. 29 on 6-7 March 2002.
30 Source: Intel Architecture Software Developers Manual, Volume 3: 30 Source: Intel Architecture Software Developers Manual, Volume 3:
31 System Programming Guide; Section 9.11. (1997 edition - PPro). 31 System Programming Guide; Section 9.11. (1997 edition - PPro).
32*/ 32*/
33 33
34#define DEBUG
35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37
38#include <linux/kvm_para.h>
39#include <linux/uaccess.h>
34#include <linux/module.h> 40#include <linux/module.h>
41#include <linux/mutex.h>
35#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/sort.h>
44#include <linux/cpu.h>
36#include <linux/pci.h> 45#include <linux/pci.h>
37#include <linux/smp.h> 46#include <linux/smp.h>
38#include <linux/cpu.h>
39#include <linux/mutex.h>
40#include <linux/sort.h>
41 47
48#include <asm/processor.h>
42#include <asm/e820.h> 49#include <asm/e820.h>
43#include <asm/mtrr.h> 50#include <asm/mtrr.h>
44#include <asm/uaccess.h>
45#include <asm/processor.h>
46#include <asm/msr.h> 51#include <asm/msr.h>
47#include <asm/kvm_para.h> 52
48#include "mtrr.h" 53#include "mtrr.h"
49 54
50u32 num_var_ranges = 0; 55u32 num_var_ranges;
51 56
52unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; 57unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
53static DEFINE_MUTEX(mtrr_mutex); 58static DEFINE_MUTEX(mtrr_mutex);
54 59
55u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init;
56 62
57static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; 63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
58 64
59struct mtrr_ops * mtrr_if = NULL; 65struct mtrr_ops *mtrr_if;
60 66
61static void set_mtrr(unsigned int reg, unsigned long base, 67static void set_mtrr(unsigned int reg, unsigned long base,
62 unsigned long size, mtrr_type type); 68 unsigned long size, mtrr_type type);
63 69
64void set_mtrr_ops(struct mtrr_ops * ops) 70void set_mtrr_ops(struct mtrr_ops *ops)
65{ 71{
66 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
67 mtrr_ops[ops->vendor] = ops; 73 mtrr_ops[ops->vendor] = ops;
@@ -72,30 +78,36 @@ static int have_wrcomb(void)
72{ 78{
73 struct pci_dev *dev; 79 struct pci_dev *dev;
74 u8 rev; 80 u8 rev;
75 81
76 if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { 82 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
77 /* ServerWorks LE chipsets < rev 6 have problems with write-combining 83 if (dev != NULL) {
78 Don't allow it and leave room for other chipsets to be tagged */ 84 /*
85 * ServerWorks LE chipsets < rev 6 have problems with
86 * write-combining. Don't allow it and leave room for other
87 * chipsets to be tagged
88 */
79 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 89 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
80 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { 90 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
81 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 91 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
82 if (rev <= 5) { 92 if (rev <= 5) {
83 printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); 93 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
84 pci_dev_put(dev); 94 pci_dev_put(dev);
85 return 0; 95 return 0;
86 } 96 }
87 } 97 }
88 /* Intel 450NX errata # 23. Non ascending cacheline evictions to 98 /*
89 write combining memory may resulting in data corruption */ 99 * Intel 450NX errata # 23. Non ascending cacheline evictions to
100 * write combining memory may resulting in data corruption
 100 * write combining memory may result in data corruption
101 */
90 if (dev->vendor == PCI_VENDOR_ID_INTEL && 102 if (dev->vendor == PCI_VENDOR_ID_INTEL &&
91 dev->device == PCI_DEVICE_ID_INTEL_82451NX) { 103 dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
92 printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); 104 pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
93 pci_dev_put(dev); 105 pci_dev_put(dev);
94 return 0; 106 return 0;
95 } 107 }
96 pci_dev_put(dev); 108 pci_dev_put(dev);
97 } 109 }
98 return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); 110 return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
99} 111}
100 112
101/* This function returns the number of variable MTRRs */ 113/* This function returns the number of variable MTRRs */
@@ -103,12 +115,13 @@ static void __init set_num_var_ranges(void)
103{ 115{
104 unsigned long config = 0, dummy; 116 unsigned long config = 0, dummy;
105 117
106 if (use_intel()) { 118 if (use_intel())
107 rdmsr(MSR_MTRRcap, config, dummy); 119 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 120 else if (is_cpu(AMD))
109 config = 2; 121 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 122 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
111 config = 8; 123 config = 8;
124
112 num_var_ranges = config & 0xff; 125 num_var_ranges = config & 0xff;
113} 126}
114 127
@@ -130,10 +143,12 @@ struct set_mtrr_data {
130 mtrr_type smp_type; 143 mtrr_type smp_type;
131}; 144};
132 145
146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 *
149 * Returns nothing.
150 */
133static void ipi_handler(void *info) 151static void ipi_handler(void *info)
134/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
135 [RETURNS] Nothing.
136*/
137{ 152{
138#ifdef CONFIG_SMP 153#ifdef CONFIG_SMP
139 struct set_mtrr_data *data = info; 154 struct set_mtrr_data *data = info;
@@ -142,18 +157,22 @@ static void ipi_handler(void *info)
142 local_irq_save(flags); 157 local_irq_save(flags);
143 158
144 atomic_dec(&data->count); 159 atomic_dec(&data->count);
145 while(!atomic_read(&data->gate)) 160 while (!atomic_read(&data->gate))
146 cpu_relax(); 161 cpu_relax();
147 162
148 /* The master has cleared me to execute */ 163 /* The master has cleared me to execute */
149 if (data->smp_reg != ~0U) 164 if (data->smp_reg != ~0U) {
150 mtrr_if->set(data->smp_reg, data->smp_base, 165 mtrr_if->set(data->smp_reg, data->smp_base,
151 data->smp_size, data->smp_type); 166 data->smp_size, data->smp_type);
152 else 167 } else if (mtrr_aps_delayed_init) {
168 /*
 169 * Initialize the MTRRs in addition to the synchronisation.

170 */
153 mtrr_if->set_all(); 171 mtrr_if->set_all();
172 }
154 173
155 atomic_dec(&data->count); 174 atomic_dec(&data->count);
156 while(atomic_read(&data->gate)) 175 while (atomic_read(&data->gate))
157 cpu_relax(); 176 cpu_relax();
158 177
159 atomic_dec(&data->count); 178 atomic_dec(&data->count);
@@ -161,7 +180,8 @@ static void ipi_handler(void *info)
161#endif 180#endif
162} 181}
163 182
164static inline int types_compatible(mtrr_type type1, mtrr_type type2) { 183static inline int types_compatible(mtrr_type type1, mtrr_type type2)
184{
165 return type1 == MTRR_TYPE_UNCACHABLE || 185 return type1 == MTRR_TYPE_UNCACHABLE ||
166 type2 == MTRR_TYPE_UNCACHABLE || 186 type2 == MTRR_TYPE_UNCACHABLE ||
167 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || 187 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
@@ -176,10 +196,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 * @type: mtrr type 196 * @type: mtrr type
177 * 197 *
178 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 198 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
179 * 199 *
180 * 1. Send IPI to do the following: 200 * 1. Send IPI to do the following:
181 * 2. Disable Interrupts 201 * 2. Disable Interrupts
182 * 3. Wait for all procs to do so 202 * 3. Wait for all procs to do so
183 * 4. Enter no-fill cache mode 203 * 4. Enter no-fill cache mode
184 * 5. Flush caches 204 * 5. Flush caches
185 * 6. Clear PGE bit 205 * 6. Clear PGE bit
@@ -189,26 +209,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
189 * 10. Enable all range registers 209 * 10. Enable all range registers
190 * 11. Flush all TLBs and caches again 210 * 11. Flush all TLBs and caches again
191 * 12. Enter normal cache mode and reenable caching 211 * 12. Enter normal cache mode and reenable caching
192 * 13. Set PGE 212 * 13. Set PGE
193 * 14. Wait for buddies to catch up 213 * 14. Wait for buddies to catch up
194 * 15. Enable interrupts. 214 * 15. Enable interrupts.
195 * 215 *
196 * What does that mean for us? Well, first we set data.count to the number 216 * What does that mean for us? Well, first we set data.count to the number
197 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 217 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
198 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 218 * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
199 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 219 * Meanwhile, they are waiting for that flag to be set. Once it's set, each
200 * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it 220 * CPU goes through the transition of updating MTRRs.
201 * differently, so we call mtrr_if->set() callback and let them take care of it. 221 * The CPU vendors may each do it differently,
202 * When they're done, they again decrement data->count and wait for data.gate to 222 * so we call mtrr_if->set() callback and let them take care of it.
203 * be reset. 223 * When they're done, they again decrement data->count and wait for data.gate
204 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. 224 * to be reset.
225 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
205 * Everyone then enables interrupts and we all continue on. 226 * Everyone then enables interrupts and we all continue on.
206 * 227 *
207 * Note that the mechanism is the same for UP systems, too; all the SMP stuff 228 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
208 * becomes nops. 229 * becomes nops.
209 */ 230 */
210static void set_mtrr(unsigned int reg, unsigned long base, 231static void
211 unsigned long size, mtrr_type type) 232set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
212{ 233{
213 struct set_mtrr_data data; 234 struct set_mtrr_data data;
214 unsigned long flags; 235 unsigned long flags;
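
The count/gate protocol described in the comment above is easy to lose in the diff noise, so here is a hedged userspace analogue of the same rendezvous, written with C11 atomics and pthreads purely for illustration. The real code runs in IPI context with interrupts disabled and uses cpu_relax() in its spin loops; NCPUS and the thread scaffolding are made up.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int count;
static atomic_int gate;

static void *other_cpu(void *arg)
{
	(void)arg;
	atomic_fetch_sub(&count, 1);	/* "interrupts disabled" */
	while (!atomic_load(&gate))
		;			/* wait for the master's go-ahead */

	/* ...per-CPU MTRR update would happen here... */

	atomic_fetch_sub(&count, 1);	/* done with the update */
	while (atomic_load(&gate))
		;			/* wait for the gate to drop */
	atomic_fetch_sub(&count, 1);	/* last touch of the shared data */
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS - 1];
	int i;

	atomic_store(&count, NCPUS - 1);
	atomic_store(&gate, 0);
	for (i = 0; i < NCPUS - 1; i++)
		pthread_create(&tid[i], NULL, other_cpu, NULL);

	while (atomic_load(&count))
		;			/* everyone has checked in */
	atomic_store(&count, NCPUS - 1);
	atomic_store(&gate, 1);		/* release them */

	/* ...master performs its own MTRR update here... */

	while (atomic_load(&count))
		;
	atomic_store(&count, NCPUS - 1);
	atomic_store(&gate, 0);
	while (atomic_load(&count))
		;			/* master is the last to touch data */

	for (i = 0; i < NCPUS - 1; i++)
		pthread_join(tid[i], NULL);
	printf("rendezvous complete\n");
	return 0;
}
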
@@ -218,121 +239,124 @@ static void set_mtrr(unsigned int reg, unsigned long base,
218 data.smp_size = size; 239 data.smp_size = size;
219 data.smp_type = type; 240 data.smp_type = type;
220 atomic_set(&data.count, num_booting_cpus() - 1); 241 atomic_set(&data.count, num_booting_cpus() - 1);
221 /* make sure data.count is visible before unleashing other CPUs */ 242
243 /* Make sure data.count is visible before unleashing other CPUs */
222 smp_wmb(); 244 smp_wmb();
223 atomic_set(&data.gate,0); 245 atomic_set(&data.gate, 0);
224 246
225 /* Start the ball rolling on other CPUs */ 247 /* Start the ball rolling on other CPUs */
226 if (smp_call_function(ipi_handler, &data, 0) != 0) 248 if (smp_call_function(ipi_handler, &data, 0) != 0)
227 panic("mtrr: timed out waiting for other CPUs\n"); 249 panic("mtrr: timed out waiting for other CPUs\n");
228 250
229 local_irq_save(flags); 251 local_irq_save(flags);
230 252
231 while(atomic_read(&data.count)) 253 while (atomic_read(&data.count))
232 cpu_relax(); 254 cpu_relax();
233 255
234 /* ok, reset count and toggle gate */ 256 /* Ok, reset count and toggle gate */
235 atomic_set(&data.count, num_booting_cpus() - 1); 257 atomic_set(&data.count, num_booting_cpus() - 1);
236 smp_wmb(); 258 smp_wmb();
237 atomic_set(&data.gate,1); 259 atomic_set(&data.gate, 1);
238 260
239 /* do our MTRR business */ 261 /* Do our MTRR business */
240 262
241 /* HACK! 263 /*
264 * HACK!
242 * We use this same function to initialize the mtrrs on boot. 265 * We use this same function to initialize the mtrrs on boot.
243 * The state of the boot cpu's mtrrs has been saved, and we want 266 * The state of the boot cpu's mtrrs has been saved, and we want
244 * to replicate across all the APs. 267 * to replicate across all the APs.
245 * If we're doing that @reg is set to something special... 268 * If we're doing that @reg is set to something special...
246 */ 269 */
247 if (reg != ~0U) 270 if (reg != ~0U)
248 mtrr_if->set(reg,base,size,type); 271 mtrr_if->set(reg, base, size, type);
272 else if (!mtrr_aps_delayed_init)
273 mtrr_if->set_all();
249 274
250 /* wait for the others */ 275 /* Wait for the others */
251 while(atomic_read(&data.count)) 276 while (atomic_read(&data.count))
252 cpu_relax(); 277 cpu_relax();
253 278
254 atomic_set(&data.count, num_booting_cpus() - 1); 279 atomic_set(&data.count, num_booting_cpus() - 1);
255 smp_wmb(); 280 smp_wmb();
256 atomic_set(&data.gate,0); 281 atomic_set(&data.gate, 0);
257 282
258 /* 283 /*
259 * Wait here for everyone to have seen the gate change 284 * Wait here for everyone to have seen the gate change
260 * So we're the last ones to touch 'data' 285 * So we're the last ones to touch 'data'
261 */ 286 */
262 while(atomic_read(&data.count)) 287 while (atomic_read(&data.count))
263 cpu_relax(); 288 cpu_relax();
264 289
265 local_irq_restore(flags); 290 local_irq_restore(flags);
266} 291}
267 292
268/** 293/**
269 * mtrr_add_page - Add a memory type region 294 * mtrr_add_page - Add a memory type region
270 * @base: Physical base address of region in pages (in units of 4 kB!) 295 * @base: Physical base address of region in pages (in units of 4 kB!)
271 * @size: Physical size of region in pages (4 kB) 296 * @size: Physical size of region in pages (4 kB)
272 * @type: Type of MTRR desired 297 * @type: Type of MTRR desired
273 * @increment: If this is true do usage counting on the region 298 * @increment: If this is true do usage counting on the region
274 * 299 *
275 * Memory type region registers control the caching on newer Intel and 300 * Memory type region registers control the caching on newer Intel and
276 * non Intel processors. This function allows drivers to request an 301 * non Intel processors. This function allows drivers to request an
277 * MTRR is added. The details and hardware specifics of each processor's 302 * MTRR is added. The details and hardware specifics of each processor's
278 * implementation are hidden from the caller, but nevertheless the 303 * implementation are hidden from the caller, but nevertheless the
279 * caller should expect to need to provide a power of two size on an 304 * caller should expect to need to provide a power of two size on an
280 * equivalent power of two boundary. 305 * equivalent power of two boundary.
281 * 306 *
282 * If the region cannot be added either because all regions are in use 307 * If the region cannot be added either because all regions are in use
283 * or the CPU cannot support it a negative value is returned. On success 308 * or the CPU cannot support it a negative value is returned. On success
284 * the register number for this entry is returned, but should be treated 309 * the register number for this entry is returned, but should be treated
285 * as a cookie only. 310 * as a cookie only.
286 * 311 *
287 * On a multiprocessor machine the changes are made to all processors. 312 * On a multiprocessor machine the changes are made to all processors.
288 * This is required on x86 by the Intel processors. 313 * This is required on x86 by the Intel processors.
289 * 314 *
290 * The available types are 315 * The available types are
291 * 316 *
292 * %MTRR_TYPE_UNCACHABLE - No caching 317 * %MTRR_TYPE_UNCACHABLE - No caching
293 * 318 *
294 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever 319 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
295 * 320 *
296 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts 321 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
297 * 322 *
298 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes 323 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
299 * 324 *
300 * BUGS: Needs a quiet flag for the cases where drivers do not mind 325 * BUGS: Needs a quiet flag for the cases where drivers do not mind
301 * failures and do not wish system log messages to be sent. 326 * failures and do not wish system log messages to be sent.
302 */ 327 */
303 328int mtrr_add_page(unsigned long base, unsigned long size,
304int mtrr_add_page(unsigned long base, unsigned long size,
305 unsigned int type, bool increment) 329 unsigned int type, bool increment)
306{ 330{
331 unsigned long lbase, lsize;
307 int i, replace, error; 332 int i, replace, error;
308 mtrr_type ltype; 333 mtrr_type ltype;
309 unsigned long lbase, lsize;
310 334
311 if (!mtrr_if) 335 if (!mtrr_if)
312 return -ENXIO; 336 return -ENXIO;
313 337
314 if ((error = mtrr_if->validate_add_page(base,size,type))) 338 error = mtrr_if->validate_add_page(base, size, type);
339 if (error)
315 return error; 340 return error;
316 341
317 if (type >= MTRR_NUM_TYPES) { 342 if (type >= MTRR_NUM_TYPES) {
318 printk(KERN_WARNING "mtrr: type: %u invalid\n", type); 343 pr_warning("mtrr: type: %u invalid\n", type);
319 return -EINVAL; 344 return -EINVAL;
320 } 345 }
321 346
322 /* If the type is WC, check that this processor supports it */ 347 /* If the type is WC, check that this processor supports it */
323 if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { 348 if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
324 printk(KERN_WARNING 349 pr_warning("mtrr: your processor doesn't support write-combining\n");
325 "mtrr: your processor doesn't support write-combining\n");
326 return -ENOSYS; 350 return -ENOSYS;
327 } 351 }
328 352
329 if (!size) { 353 if (!size) {
330 printk(KERN_WARNING "mtrr: zero sized request\n"); 354 pr_warning("mtrr: zero sized request\n");
331 return -EINVAL; 355 return -EINVAL;
332 } 356 }
333 357
334 if (base & size_or_mask || size & size_or_mask) { 358 if (base & size_or_mask || size & size_or_mask) {
335 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); 359 pr_warning("mtrr: base or size exceeds the MTRR width\n");
336 return -EINVAL; 360 return -EINVAL;
337 } 361 }
338 362
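
Note that mtrr_add_page() works entirely in 4 kB page units, which is what the size_or_mask width check above assumes. A hedged sketch of the conversion a caller performs; the wrapper and its region are hypothetical (mtrr_add() further below does exactly this shift for drivers):

static int example_add_wc_pages(unsigned long phys_base, unsigned long bytes)
{
	unsigned long base_pages = phys_base >> PAGE_SHIFT;
	unsigned long size_pages = bytes >> PAGE_SHIFT;

	return mtrr_add_page(base_pages, size_pages, MTRR_TYPE_WRCOMB, true);
}
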
@@ -341,36 +365,40 @@ int mtrr_add_page(unsigned long base, unsigned long size,
341 365
342 /* No CPU hotplug when we change MTRR entries */ 366 /* No CPU hotplug when we change MTRR entries */
343 get_online_cpus(); 367 get_online_cpus();
344 /* Search for existing MTRR */ 368
369 /* Search for existing MTRR */
345 mutex_lock(&mtrr_mutex); 370 mutex_lock(&mtrr_mutex);
346 for (i = 0; i < num_var_ranges; ++i) { 371 for (i = 0; i < num_var_ranges; ++i) {
347 mtrr_if->get(i, &lbase, &lsize, &ltype); 372 mtrr_if->get(i, &lbase, &lsize, &ltype);
348 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) 373 if (!lsize || base > lbase + lsize - 1 ||
374 base + size - 1 < lbase)
349 continue; 375 continue;
350 /* At this point we know there is some kind of overlap/enclosure */ 376 /*
377 * At this point we know there is some kind of
378 * overlap/enclosure
379 */
351 if (base < lbase || base + size - 1 > lbase + lsize - 1) { 380 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
352 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { 381 if (base <= lbase &&
382 base + size - 1 >= lbase + lsize - 1) {
353 /* New region encloses an existing region */ 383 /* New region encloses an existing region */
354 if (type == ltype) { 384 if (type == ltype) {
355 replace = replace == -1 ? i : -2; 385 replace = replace == -1 ? i : -2;
356 continue; 386 continue;
357 } 387 } else if (types_compatible(type, ltype))
358 else if (types_compatible(type, ltype))
359 continue; 388 continue;
360 } 389 }
361 printk(KERN_WARNING 390 pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
362 "mtrr: 0x%lx000,0x%lx000 overlaps existing" 391 " 0x%lx000,0x%lx000\n", base, size, lbase,
363 " 0x%lx000,0x%lx000\n", base, size, lbase, 392 lsize);
364 lsize);
365 goto out; 393 goto out;
366 } 394 }
367 /* New region is enclosed by an existing region */ 395 /* New region is enclosed by an existing region */
368 if (ltype != type) { 396 if (ltype != type) {
369 if (types_compatible(type, ltype)) 397 if (types_compatible(type, ltype))
370 continue; 398 continue;
371 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", 399 pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
372 base, size, mtrr_attrib_to_str(ltype), 400 base, size, mtrr_attrib_to_str(ltype),
373 mtrr_attrib_to_str(type)); 401 mtrr_attrib_to_str(type));
374 goto out; 402 goto out;
375 } 403 }
376 if (increment) 404 if (increment)
@@ -378,7 +406,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
378 error = i; 406 error = i;
379 goto out; 407 goto out;
380 } 408 }
381 /* Search for an empty MTRR */ 409 /* Search for an empty MTRR */
382 i = mtrr_if->get_free_region(base, size, replace); 410 i = mtrr_if->get_free_region(base, size, replace);
383 if (i >= 0) { 411 if (i >= 0) {
384 set_mtrr(i, base, size, type); 412 set_mtrr(i, base, size, type);
@@ -393,8 +421,9 @@ int mtrr_add_page(unsigned long base, unsigned long size,
393 mtrr_usage_table[replace] = 0; 421 mtrr_usage_table[replace] = 0;
394 } 422 }
395 } 423 }
396 } else 424 } else {
397 printk(KERN_INFO "mtrr: no more MTRRs available\n"); 425 pr_info("mtrr: no more MTRRs available\n");
426 }
398 error = i; 427 error = i;
399 out: 428 out:
400 mutex_unlock(&mtrr_mutex); 429 mutex_unlock(&mtrr_mutex);
@@ -405,10 +434,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,
405static int mtrr_check(unsigned long base, unsigned long size) 434static int mtrr_check(unsigned long base, unsigned long size)
406{ 435{
407 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { 436 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
408 printk(KERN_WARNING 437 pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
409 "mtrr: size and base must be multiples of 4 kiB\n"); 438 pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
410 printk(KERN_DEBUG
411 "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
412 dump_stack(); 439 dump_stack();
413 return -1; 440 return -1;
414 } 441 }
@@ -416,66 +443,64 @@ static int mtrr_check(unsigned long base, unsigned long size)
416} 443}
417 444
418/** 445/**
419 * mtrr_add - Add a memory type region 446 * mtrr_add - Add a memory type region
420 * @base: Physical base address of region 447 * @base: Physical base address of region
421 * @size: Physical size of region 448 * @size: Physical size of region
422 * @type: Type of MTRR desired 449 * @type: Type of MTRR desired
423 * @increment: If this is true do usage counting on the region 450 * @increment: If this is true do usage counting on the region
424 * 451 *
425 * Memory type region registers control the caching on newer Intel and 452 * Memory type region registers control the caching on newer Intel and
426 * non Intel processors. This function allows drivers to request an 453 * non Intel processors. This function allows drivers to request an
427 * MTRR is added. The details and hardware specifics of each processor's 454 * MTRR is added. The details and hardware specifics of each processor's
428 * implementation are hidden from the caller, but nevertheless the 455 * implementation are hidden from the caller, but nevertheless the
429 * caller should expect to need to provide a power of two size on an 456 * caller should expect to need to provide a power of two size on an
430 * equivalent power of two boundary. 457 * equivalent power of two boundary.
431 * 458 *
432 * If the region cannot be added either because all regions are in use 459 * If the region cannot be added either because all regions are in use
433 * or the CPU cannot support it a negative value is returned. On success 460 * or the CPU cannot support it a negative value is returned. On success
434 * the register number for this entry is returned, but should be treated 461 * the register number for this entry is returned, but should be treated
435 * as a cookie only. 462 * as a cookie only.
436 * 463 *
437 * On a multiprocessor machine the changes are made to all processors. 464 * On a multiprocessor machine the changes are made to all processors.
438 * This is required on x86 by the Intel processors. 465 * This is required on x86 by the Intel processors.
439 * 466 *
440 * The available types are 467 * The available types are
441 * 468 *
442 * %MTRR_TYPE_UNCACHABLE - No caching 469 * %MTRR_TYPE_UNCACHABLE - No caching
443 * 470 *
444 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever 471 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
445 * 472 *
446 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts 473 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
447 * 474 *
448 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes 475 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
449 * 476 *
450 * BUGS: Needs a quiet flag for the cases where drivers do not mind 477 * BUGS: Needs a quiet flag for the cases where drivers do not mind
451 * failures and do not wish system log messages to be sent. 478 * failures and do not wish system log messages to be sent.
452 */ 479 */
453 480int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
454int 481 bool increment)
455mtrr_add(unsigned long base, unsigned long size, unsigned int type,
456 bool increment)
457{ 482{
458 if (mtrr_check(base, size)) 483 if (mtrr_check(base, size))
459 return -EINVAL; 484 return -EINVAL;
460 return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, 485 return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
461 increment); 486 increment);
462} 487}
488EXPORT_SYMBOL(mtrr_add);
463 489
464/** 490/**
465 * mtrr_del_page - delete a memory type region 491 * mtrr_del_page - delete a memory type region
466 * @reg: Register returned by mtrr_add 492 * @reg: Register returned by mtrr_add
467 * @base: Physical base address 493 * @base: Physical base address
468 * @size: Size of region 494 * @size: Size of region
469 * 495 *
470 * If register is supplied then base and size are ignored. This is 496 * If register is supplied then base and size are ignored. This is
471 * how drivers should call it. 497 * how drivers should call it.
472 * 498 *
473 * Releases an MTRR region. If the usage count drops to zero the 499 * Releases an MTRR region. If the usage count drops to zero the
474 * register is freed and the region returns to default state. 500 * register is freed and the region returns to default state.
475 * On success the register is returned, on failure a negative error 501 * On success the register is returned, on failure a negative error
476 * code. 502 * code.
477 */ 503 */
478
479int mtrr_del_page(int reg, unsigned long base, unsigned long size) 504int mtrr_del_page(int reg, unsigned long base, unsigned long size)
480{ 505{
481 int i, max; 506 int i, max;
@@ -500,22 +525,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
500 } 525 }
501 } 526 }
502 if (reg < 0) { 527 if (reg < 0) {
503 printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, 528 pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
504 size); 529 base, size);
505 goto out; 530 goto out;
506 } 531 }
507 } 532 }
508 if (reg >= max) { 533 if (reg >= max) {
509 printk(KERN_WARNING "mtrr: register: %d too big\n", reg); 534 pr_warning("mtrr: register: %d too big\n", reg);
510 goto out; 535 goto out;
511 } 536 }
512 mtrr_if->get(reg, &lbase, &lsize, &ltype); 537 mtrr_if->get(reg, &lbase, &lsize, &ltype);
513 if (lsize < 1) { 538 if (lsize < 1) {
514 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); 539 pr_warning("mtrr: MTRR %d not used\n", reg);
515 goto out; 540 goto out;
516 } 541 }
517 if (mtrr_usage_table[reg] < 1) { 542 if (mtrr_usage_table[reg] < 1) {
518 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); 543 pr_warning("mtrr: reg: %d has count=0\n", reg);
519 goto out; 544 goto out;
520 } 545 }
521 if (--mtrr_usage_table[reg] < 1) 546 if (--mtrr_usage_table[reg] < 1)
@@ -526,33 +551,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
526 put_online_cpus(); 551 put_online_cpus();
527 return error; 552 return error;
528} 553}
554
529/** 555/**
530 * mtrr_del - delete a memory type region 556 * mtrr_del - delete a memory type region
531 * @reg: Register returned by mtrr_add 557 * @reg: Register returned by mtrr_add
532 * @base: Physical base address 558 * @base: Physical base address
533 * @size: Size of region 559 * @size: Size of region
534 * 560 *
535 * If register is supplied then base and size are ignored. This is 561 * If register is supplied then base and size are ignored. This is
536 * how drivers should call it. 562 * how drivers should call it.
537 * 563 *
538 * Releases an MTRR region. If the usage count drops to zero the 564 * Releases an MTRR region. If the usage count drops to zero the
539 * register is freed and the region returns to default state. 565 * register is freed and the region returns to default state.
540 * On success the register is returned, on failure a negative error 566 * On success the register is returned, on failure a negative error
541 * code. 567 * code.
542 */ 568 */
543 569int mtrr_del(int reg, unsigned long base, unsigned long size)
544int
545mtrr_del(int reg, unsigned long base, unsigned long size)
546{ 570{
547 if (mtrr_check(base, size)) 571 if (mtrr_check(base, size))
548 return -EINVAL; 572 return -EINVAL;
549 return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); 573 return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
550} 574}
551
552EXPORT_SYMBOL(mtrr_add);
553EXPORT_SYMBOL(mtrr_del); 575EXPORT_SYMBOL(mtrr_del);
554 576
555/* HACK ALERT! 577/*
578 * HACK ALERT!
556 * These should be called implicitly, but we can't yet until all the initcall 579 * These should be called implicitly, but we can't yet until all the initcall
557 * stuff is done... 580 * stuff is done...
558 */ 581 */
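
mtrr_add() and mtrr_del() above remain the driver-facing pair around mtrr_add_page()/mtrr_del_page(). A hedged sketch of the usual calling pattern with a made-up framebuffer region; the return value of mtrr_add() is the cookie mtrr_del() expects back:

#include <linux/kernel.h>
#include <asm/mtrr.h>

/* Hypothetical driver state; only mtrr_add()/mtrr_del() come from here. */
static int example_wc_cookie = -1;

static void example_enable_wc(unsigned long fb_base, unsigned long fb_size)
{
	/* Size must be a power of two on an equally aligned boundary. */
	example_wc_cookie = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, true);
	if (example_wc_cookie < 0)
		pr_info("example: write-combining not enabled (%d)\n",
			example_wc_cookie);
}

static void example_disable_wc(void)
{
	if (example_wc_cookie >= 0)
		mtrr_del(example_wc_cookie, 0, 0);	/* base/size ignored */
}
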
@@ -576,29 +599,28 @@ struct mtrr_value {
576 599
577static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; 600static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
578 601
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 602static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
580{ 603{
581 int i; 604 int i;
582 605
583 for (i = 0; i < num_var_ranges; i++) { 606 for (i = 0; i < num_var_ranges; i++) {
584 mtrr_if->get(i, 607 mtrr_if->get(i, &mtrr_value[i].lbase,
585 &mtrr_value[i].lbase, 608 &mtrr_value[i].lsize,
586 &mtrr_value[i].lsize, 609 &mtrr_value[i].ltype);
587 &mtrr_value[i].ltype);
588 } 610 }
589 return 0; 611 return 0;
590} 612}
591 613
592static int mtrr_restore(struct sys_device * sysdev) 614static int mtrr_restore(struct sys_device *sysdev)
593{ 615{
594 int i; 616 int i;
595 617
596 for (i = 0; i < num_var_ranges; i++) { 618 for (i = 0; i < num_var_ranges; i++) {
597 if (mtrr_value[i].lsize) 619 if (mtrr_value[i].lsize) {
598 set_mtrr(i, 620 set_mtrr(i, mtrr_value[i].lbase,
599 mtrr_value[i].lbase, 621 mtrr_value[i].lsize,
600 mtrr_value[i].lsize, 622 mtrr_value[i].ltype);
601 mtrr_value[i].ltype); 623 }
602 } 624 }
603 return 0; 625 return 0;
604} 626}
@@ -615,26 +637,29 @@ int __initdata changed_by_mtrr_cleanup;
615/** 637/**
616 * mtrr_bp_init - initialize mtrrs on the boot CPU 638 * mtrr_bp_init - initialize mtrrs on the boot CPU
617 * 639 *
618 * This needs to be called early; before any of the other CPUs are 640 * This needs to be called early; before any of the other CPUs are
619 * initialized (i.e. before smp_init()). 641 * initialized (i.e. before smp_init()).
620 * 642 *
621 */ 643 */
622void __init mtrr_bp_init(void) 644void __init mtrr_bp_init(void)
623{ 645{
624 u32 phys_addr; 646 u32 phys_addr;
647
625 init_ifs(); 648 init_ifs();
626 649
627 phys_addr = 32; 650 phys_addr = 32;
628 651
629 if (cpu_has_mtrr) { 652 if (cpu_has_mtrr) {
630 mtrr_if = &generic_mtrr_ops; 653 mtrr_if = &generic_mtrr_ops;
631 size_or_mask = 0xff000000; /* 36 bits */ 654 size_or_mask = 0xff000000; /* 36 bits */
632 size_and_mask = 0x00f00000; 655 size_and_mask = 0x00f00000;
633 phys_addr = 36; 656 phys_addr = 36;
634 657
635 /* This is an AMD specific MSR, but we assume(hope?) that 658 /*
636 Intel will implement it to when they extend the address 659 * This is an AMD specific MSR, but we assume(hope?) that
637 bus of the Xeon. */ 660 * Intel will implement it too when they extend the address
661 * bus of the Xeon.
662 */
638 if (cpuid_eax(0x80000000) >= 0x80000008) { 663 if (cpuid_eax(0x80000000) >= 0x80000008) {
639 phys_addr = cpuid_eax(0x80000008) & 0xff; 664 phys_addr = cpuid_eax(0x80000008) & 0xff;
640 /* CPUID workaround for Intel 0F33/0F34 CPU */ 665 /* CPUID workaround for Intel 0F33/0F34 CPU */
@@ -649,9 +674,11 @@ void __init mtrr_bp_init(void)
649 size_and_mask = ~size_or_mask & 0xfffff00000ULL; 674 size_and_mask = ~size_or_mask & 0xfffff00000ULL;
650 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && 675 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
651 boot_cpu_data.x86 == 6) { 676 boot_cpu_data.x86 == 6) {
652 /* VIA C* family have Intel style MTRRs, but 677 /*
653 don't support PAE */ 678 * VIA C* family have Intel style MTRRs,
654 size_or_mask = 0xfff00000; /* 32 bits */ 679 * but don't support PAE
680 */
681 size_or_mask = 0xfff00000; /* 32 bits */
655 size_and_mask = 0; 682 size_and_mask = 0;
656 phys_addr = 32; 683 phys_addr = 32;
657 } 684 }
@@ -694,30 +721,28 @@ void __init mtrr_bp_init(void)
694 changed_by_mtrr_cleanup = 1; 721 changed_by_mtrr_cleanup = 1;
695 mtrr_if->set_all(); 722 mtrr_if->set_all();
696 } 723 }
697
698 } 724 }
699 } 725 }
700} 726}
701 727
702void mtrr_ap_init(void) 728void mtrr_ap_init(void)
703{ 729{
704 unsigned long flags; 730 if (!use_intel() || mtrr_aps_delayed_init)
705
706 if (!mtrr_if || !use_intel())
707 return; 731 return;
708 /* 732 /*
709 * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, 733 * Ideally we should hold mtrr_mutex here to avoid mtrr entries
710 * but this routine will be called in cpu boot time, holding the lock 734 * changed, but this routine will be called in cpu boot time,
711 * breaks it. This routine is called in two cases: 1.very earily time 735 * holding the lock breaks it.
712 * of software resume, when there absolutely isn't mtrr entry changes; 736 *
713 * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to 737 * This routine is called in two cases:
714 * prevent mtrr entry changes 738 *
739 * 1. very earily time of software resume, when there absolutely
740 * isn't mtrr entry changes;
741 *
742 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
743 * lock to prevent mtrr entry changes
715 */ 744 */
716 local_irq_save(flags); 745 set_mtrr(~0U, 0, 0, 0);
717
718 mtrr_if->set_all();
719
720 local_irq_restore(flags);
721} 746}
722 747
723/** 748/**
@@ -728,23 +753,55 @@ void mtrr_save_state(void)
728 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); 753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
729} 754}
730 755
756void set_mtrr_aps_delayed_init(void)
757{
758 if (!use_intel())
759 return;
760
761 mtrr_aps_delayed_init = true;
762}
763
764/*
765 * MTRR initialization for all AP's
766 */
767void mtrr_aps_init(void)
768{
769 if (!use_intel())
770 return;
771
772 set_mtrr(~0U, 0, 0, 0);
773 mtrr_aps_delayed_init = false;
774}
775
776void mtrr_bp_restore(void)
777{
778 if (!use_intel())
779 return;
780
781 mtrr_if->set_all();
782}
783
731static int __init mtrr_init_finialize(void) 784static int __init mtrr_init_finialize(void)
732{ 785{
733 if (!mtrr_if) 786 if (!mtrr_if)
734 return 0; 787 return 0;
788
735 if (use_intel()) { 789 if (use_intel()) {
736 if (!changed_by_mtrr_cleanup) 790 if (!changed_by_mtrr_cleanup)
737 mtrr_state_warn(); 791 mtrr_state_warn();
738 } else { 792 return 0;
739 /* The CPUs haven't MTRR and seem to not support SMP. They have
740 * specific drivers, we use a tricky method to support
741 * suspend/resume for them.
742 * TBD: is there any system with such CPU which supports
743 * suspend/resume? if no, we should remove the code.
744 */
745 sysdev_driver_register(&cpu_sysdev_class,
746 &mtrr_sysdev_driver);
747 } 793 }
794
795 /*
 796 * The CPU has no MTRR and seems not to support SMP. Such CPUs have
 797 * specific drivers; we use a tricky method to support
798 * suspend/resume for them.
799 *
800 * TBD: is there any system with such CPU which supports
801 * suspend/resume? If no, we should remove the code.
802 */
803 sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver);
804
748 return 0; 805 return 0;
749} 806}
750subsys_initcall(mtrr_init_finialize); 807subsys_initcall(mtrr_init_finialize);
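
The mtrr_aps_delayed_init machinery added above changes when APs actually program their MTRRs: mtrr_ap_init() now bails out while delayed init is armed, and mtrr_aps_init() performs one set_mtrr(~0U, ...) rendezvous for all of them. A hedged sketch of the intended call order; only the mtrr_* calls come from this patch, the surrounding boot code is paraphrased:

void __init example_boot_sequence(void)
{
	mtrr_bp_init();			/* boot CPU: choose mtrr_if, clean up */
	set_mtrr_aps_delayed_init();	/* arm delayed init (Intel only) */

	/* ...bring APs online; their mtrr_ap_init() returns early... */

	mtrr_aps_init();		/* single rendezvous programs every AP */
}
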
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 7538b767f206..a501dee9a87a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * local mtrr defines. 2 * local MTRR defines.
3 */ 3 */
4 4
5#include <linux/types.h> 5#include <linux/types.h>
@@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
14struct mtrr_ops { 14struct mtrr_ops {
15 u32 vendor; 15 u32 vendor;
16 u32 use_intel_if; 16 u32 use_intel_if;
17// void (*init)(void);
18 void (*set)(unsigned int reg, unsigned long base, 17 void (*set)(unsigned int reg, unsigned long base,
19 unsigned long size, mtrr_type type); 18 unsigned long size, mtrr_type type);
20 void (*set_all)(void); 19 void (*set_all)(void);
21 20
22 void (*get)(unsigned int reg, unsigned long *base, 21 void (*get)(unsigned int reg, unsigned long *base,
23 unsigned long *size, mtrr_type * type); 22 unsigned long *size, mtrr_type *type);
24 int (*get_free_region)(unsigned long base, unsigned long size, 23 int (*get_free_region)(unsigned long base, unsigned long size,
25 int replace_reg); 24 int replace_reg);
26 int (*validate_add_page)(unsigned long base, unsigned long size, 25 int (*validate_add_page)(unsigned long base, unsigned long size,
@@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void);
39 38
40/* library functions for processor-specific routines */ 39/* library functions for processor-specific routines */
41struct set_mtrr_context { 40struct set_mtrr_context {
42 unsigned long flags; 41 unsigned long flags;
43 unsigned long cr4val; 42 unsigned long cr4val;
44 u32 deftype_lo; 43 u32 deftype_lo;
45 u32 deftype_hi; 44 u32 deftype_hi;
46 u32 ccr3; 45 u32 ccr3;
47}; 46};
48 47
49void set_mtrr_done(struct set_mtrr_context *ctxt); 48void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
54 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
55void get_mtrr_state(void); 54void get_mtrr_state(void);
56 55
57extern void set_mtrr_ops(struct mtrr_ops * ops); 56extern void set_mtrr_ops(struct mtrr_ops *ops);
58 57
59extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
60extern struct mtrr_ops * mtrr_if; 59extern struct mtrr_ops *mtrr_if;
61 60
62#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
63#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
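
The ops table above is how each vendor back end plugs into the core. A hedged sketch of a registration follows; every example_* callback is a hypothetical stand-in, and only positive_have_wrcomb() is taken from the declarations shown here:

static struct mtrr_ops example_mtrr_ops = {
	.vendor		   = X86_VENDOR_AMD,
	.set		   = example_set_mtrr,
	.set_all	   = example_set_all,
	.get		   = example_get_mtrr,
	.get_free_region   = example_get_free_region,
	.validate_add_page = example_validate_add_page,
	.have_wrcomb	   = positive_have_wrcomb,
};

static void __init example_init_ops(void)
{
	set_mtrr_ops(&example_mtrr_ops);	/* stored in mtrr_ops[vendor] */
}
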
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 1f5fb1588d1f..dfc80b4e6b0d 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -1,24 +1,25 @@
1#include <linux/mm.h>
2#include <linux/init.h> 1#include <linux/init.h>
3#include <asm/io.h> 2#include <linux/io.h>
4#include <asm/mtrr.h> 3#include <linux/mm.h>
5#include <asm/msr.h> 4
6#include <asm/processor-cyrix.h> 5#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
8#include "mtrr.h" 7#include <asm/mtrr.h>
8#include <asm/msr.h>
9 9
10#include "mtrr.h"
10 11
11/* Put the processor into a state where MTRRs can be safely set */ 12/* Put the processor into a state where MTRRs can be safely set */
12void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) 13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
13{ 14{
14 unsigned int cr0; 15 unsigned int cr0;
15 16
16 /* Disable interrupts locally */ 17 /* Disable interrupts locally */
17 local_irq_save(ctxt->flags); 18 local_irq_save(ctxt->flags);
18 19
19 if (use_intel() || is_cpu(CYRIX)) { 20 if (use_intel() || is_cpu(CYRIX)) {
20 21
21 /* Save value of CR4 and clear Page Global Enable (bit 7) */ 22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
22 if (cpu_has_pge) { 23 if (cpu_has_pge) {
23 ctxt->cr4val = read_cr4(); 24 ctxt->cr4val = read_cr4();
24 write_cr4(ctxt->cr4val & ~X86_CR4_PGE); 25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
@@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
33 write_cr0(cr0); 34 write_cr0(cr0);
34 wbinvd(); 35 wbinvd();
35 36
36 if (use_intel()) 37 if (use_intel()) {
37 /* Save MTRR state */ 38 /* Save MTRR state */
38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); 39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 40 } else {
40 /* Cyrix ARRs - everything else were excluded at the top */ 41 /*
42 * Cyrix ARRs -
 43 * everything else was excluded at the top
44 */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
42 } 47 }
43} 48}
44 49
45void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) 50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 51{
47 if (use_intel()) 52 if (use_intel()) {
48 /* Disable MTRRs, and set the default type to uncached */ 53 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, 54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 55 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 56 } else {
52 /* Cyrix ARRs - everything else were excluded at the top */ 57 if (is_cpu(CYRIX)) {
53 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); 58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
54} 62}
55 63
56/* Restore the processor after a set_mtrr_prepare */ 64/* Restore the processor after a set_mtrr_prepare */
57void set_mtrr_done(struct set_mtrr_context *ctxt) 65void set_mtrr_done(struct set_mtrr_context *ctxt)
58{ 66{
59 if (use_intel() || is_cpu(CYRIX)) { 67 if (use_intel() || is_cpu(CYRIX)) {
60 68
61 /* Flush caches and TLBs */ 69 /* Flush caches and TLBs */
62 wbinvd(); 70 wbinvd();
63 71
64 /* Restore MTRRdefType */ 72 /* Restore MTRRdefType */
65 if (use_intel()) 73 if (use_intel()) {
66 /* Intel (P6) standard MTRRs */ 74 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); 75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
68 else 76 ctxt->deftype_hi);
69 /* Cyrix ARRs - everything else was excluded at the top */ 77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
70 setCx86(CX86_CCR3, ctxt->ccr3); 82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
71 84
72 /* Enable caches */ 85 /* Enable caches */
73 write_cr0(read_cr0() & 0xbfffffff); 86 write_cr0(read_cr0() & 0xbfffffff);
74 87
75 /* Restore value of CR4 */ 88 /* Restore value of CR4 */
76 if (cpu_has_pge) 89 if (cpu_has_pge)
77 write_cr4(ctxt->cr4val); 90 write_cr4(ctxt->cr4val);
78 } 91 }
79 /* Re-enable interrupts locally (if enabled previously) */ 92 /* Re-enable interrupts locally (if enabled previously) */
80 local_irq_restore(ctxt->flags); 93 local_irq_restore(ctxt->flags);
81} 94}
82
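
The three helpers above are meant to bracket a raw MTRR/ARR update: prepare (interrupts off, caches flushed, PGE cleared), disable, program, done. A hedged sketch of a caller; the function itself is hypothetical and only the set_mtrr_* helpers come from this file:

static void example_program_range(void)
{
	struct set_mtrr_context ctxt;

	set_mtrr_prepare_save(&ctxt);	/* save flags/CR4, deftype or CCR3 */
	set_mtrr_cache_disable(&ctxt);	/* MTRRs off, default type uncached */

	/* ...write the variable-range (or Cyrix ARR) registers here... */

	set_mtrr_done(&ctxt);		/* flush, restore deftype/CCR3, CR4, irqs */
}
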
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c
index 900332b800f8..a3c7adb06b78 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1,16 +1,17 @@
1/* 1/*
2 * Performance counter x86 architecture code 2 * Performance events x86 architecture code
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput 6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
9 * 10 *
10 * For licencing details see kernel-base/COPYING 11 * For licencing details see kernel-base/COPYING
11 */ 12 */
12 13
13#include <linux/perf_counter.h> 14#include <linux/perf_event.h>
14#include <linux/capability.h> 15#include <linux/capability.h>
15#include <linux/notifier.h> 16#include <linux/notifier.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
@@ -20,19 +21,60 @@
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/uaccess.h> 22#include <linux/uaccess.h>
22#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/cpu.h>
23 25
24#include <asm/apic.h> 26#include <asm/apic.h>
25#include <asm/stacktrace.h> 27#include <asm/stacktrace.h>
26#include <asm/nmi.h> 28#include <asm/nmi.h>
27 29
28static u64 perf_counter_mask __read_mostly; 30static u64 perf_event_mask __read_mostly;
29 31
30struct cpu_hw_counters { 32/* The maximal number of PEBS events: */
31 struct perf_counter *counters[X86_PMC_IDX_MAX]; 33#define MAX_PEBS_EVENTS 4
34
35/* The size of a BTS record in bytes: */
36#define BTS_RECORD_SIZE 24
37
38/* The size of a per-cpu BTS buffer in bytes: */
39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
40
41/* The BTS overflow threshold in bytes from the end of the buffer: */
42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
43
44
45/*
46 * Bits in the debugctlmsr controlling branch tracing.
47 */
48#define X86_DEBUGCTL_TR (1 << 6)
49#define X86_DEBUGCTL_BTS (1 << 7)
50#define X86_DEBUGCTL_BTINT (1 << 8)
51#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
52#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
53
54/*
55 * A debug store configuration.
56 *
57 * We only support architectures that use 64bit fields.
58 */
59struct debug_store {
60 u64 bts_buffer_base;
61 u64 bts_index;
62 u64 bts_absolute_maximum;
63 u64 bts_interrupt_threshold;
64 u64 pebs_buffer_base;
65 u64 pebs_index;
66 u64 pebs_absolute_maximum;
67 u64 pebs_interrupt_threshold;
68 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69};
70
71struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX];
32 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
34 unsigned long interrupts; 75 unsigned long interrupts;
35 int enabled; 76 int enabled;
77 struct debug_store *ds;
36}; 78};
37 79
38/* 80/*
@@ -44,25 +86,27 @@ struct x86_pmu {
44 int (*handle_irq)(struct pt_regs *); 86 int (*handle_irq)(struct pt_regs *);
45 void (*disable_all)(void); 87 void (*disable_all)(void);
46 void (*enable_all)(void); 88 void (*enable_all)(void);
47 void (*enable)(struct hw_perf_counter *, int); 89 void (*enable)(struct hw_perf_event *, int);
48 void (*disable)(struct hw_perf_counter *, int); 90 void (*disable)(struct hw_perf_event *, int);
49 unsigned eventsel; 91 unsigned eventsel;
50 unsigned perfctr; 92 unsigned perfctr;
51 u64 (*event_map)(int); 93 u64 (*event_map)(int);
52 u64 (*raw_event)(u64); 94 u64 (*raw_event)(u64);
53 int max_events; 95 int max_events;
54 int num_counters; 96 int num_events;
55 int num_counters_fixed; 97 int num_events_fixed;
56 int counter_bits; 98 int event_bits;
57 u64 counter_mask; 99 u64 event_mask;
58 int apic; 100 int apic;
59 u64 max_period; 101 u64 max_period;
60 u64 intel_ctrl; 102 u64 intel_ctrl;
103 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void);
61}; 105};
62 106
63static struct x86_pmu x86_pmu __read_mostly; 107static struct x86_pmu x86_pmu __read_mostly;
64 108
65static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { 109static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
66 .enabled = 1, 110 .enabled = 1,
67}; 111};
68 112
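
The debug_store layout and BTS_* constants introduced above describe the per-CPU branch trace store. A hedged sketch of how the fields relate; the helper is hypothetical, only the struct and the #defines come from the patch:

static void example_setup_bts(struct debug_store *ds, void *buffer)
{
	ds->bts_buffer_base	    = (u64)(unsigned long)buffer;
	ds->bts_index		    = ds->bts_buffer_base;
	/* 2048 records of 24 bytes each... */
	ds->bts_absolute_maximum    = ds->bts_buffer_base + BTS_BUFFER_SIZE;
	/* ...with an interrupt threshold 128 records before the end. */
	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - BTS_OVFL_TH;
}
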
@@ -80,35 +124,35 @@ static const u64 p6_perfmon_event_map[] =
80 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, 124 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
81}; 125};
82 126
83static u64 p6_pmu_event_map(int event) 127static u64 p6_pmu_event_map(int hw_event)
84{ 128{
85 return p6_perfmon_event_map[event]; 129 return p6_perfmon_event_map[hw_event];
86} 130}
87 131
88/* 132/*
89 * Counter setting that is specified not to count anything. 133 * Event setting that is specified not to count anything.
90 * We use this to effectively disable a counter. 134 * We use this to effectively disable a counter.
91 * 135 *
92 * L2_RQSTS with 0 MESI unit mask. 136 * L2_RQSTS with 0 MESI unit mask.
93 */ 137 */
94#define P6_NOP_COUNTER 0x0000002EULL 138#define P6_NOP_EVENT 0x0000002EULL
95 139
96static u64 p6_pmu_raw_event(u64 event) 140static u64 p6_pmu_raw_event(u64 hw_event)
97{ 141{
98#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL 142#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
99#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL 143#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
100#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL 144#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
101#define P6_EVNTSEL_INV_MASK 0x00800000ULL 145#define P6_EVNTSEL_INV_MASK 0x00800000ULL
102#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL 146#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
103 147
104#define P6_EVNTSEL_MASK \ 148#define P6_EVNTSEL_MASK \
105 (P6_EVNTSEL_EVENT_MASK | \ 149 (P6_EVNTSEL_EVENT_MASK | \
106 P6_EVNTSEL_UNIT_MASK | \ 150 P6_EVNTSEL_UNIT_MASK | \
107 P6_EVNTSEL_EDGE_MASK | \ 151 P6_EVNTSEL_EDGE_MASK | \
108 P6_EVNTSEL_INV_MASK | \ 152 P6_EVNTSEL_INV_MASK | \
109 P6_EVNTSEL_COUNTER_MASK) 153 P6_EVNTSEL_REG_MASK)
110 154
111 return event & P6_EVNTSEL_MASK; 155 return hw_event & P6_EVNTSEL_MASK;
112} 156}
113 157
114 158
@@ -126,16 +170,16 @@ static const u64 intel_perfmon_event_map[] =
126 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
127}; 171};
128 172
129static u64 intel_pmu_event_map(int event) 173static u64 intel_pmu_event_map(int hw_event)
130{ 174{
131 return intel_perfmon_event_map[event]; 175 return intel_perfmon_event_map[hw_event];
132} 176}
133 177
134/* 178/*
135 * Generalized hw caching related event table, filled 179 * Generalized hw caching related hw_event table, filled
136 * in on a per model basis. A value of 0 means 180 * in on a per model basis. A value of 0 means
137 * 'not supported', -1 means 'event makes no sense on 181 * 'not supported', -1 means 'hw_event makes no sense on
138 * this CPU', any other value means the raw event 182 * this CPU', any other value means the raw hw_event
139 * ID. 183 * ID.
140 */ 184 */
141 185
@@ -419,22 +463,22 @@ static const u64 atom_hw_cache_event_ids
419 }, 463 },
420}; 464};
421 465
422static u64 intel_pmu_raw_event(u64 event) 466static u64 intel_pmu_raw_event(u64 hw_event)
423{ 467{
424#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL 468#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
425#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
426#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
427#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
428#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL 472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
429 473
430#define CORE_EVNTSEL_MASK \ 474#define CORE_EVNTSEL_MASK \
431 (CORE_EVNTSEL_EVENT_MASK | \ 475 (CORE_EVNTSEL_EVENT_MASK | \
432 CORE_EVNTSEL_UNIT_MASK | \ 476 CORE_EVNTSEL_UNIT_MASK | \
433 CORE_EVNTSEL_EDGE_MASK | \ 477 CORE_EVNTSEL_EDGE_MASK | \
434 CORE_EVNTSEL_INV_MASK | \ 478 CORE_EVNTSEL_INV_MASK | \
435 CORE_EVNTSEL_COUNTER_MASK) 479 CORE_EVNTSEL_REG_MASK)
436 480
437 return event & CORE_EVNTSEL_MASK; 481 return hw_event & CORE_EVNTSEL_MASK;
438} 482}
439 483
440static const u64 amd_hw_cache_event_ids 484static const u64 amd_hw_cache_event_ids
@@ -541,52 +585,55 @@ static const u64 amd_perfmon_event_map[] =
541 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 585 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
542}; 586};
543 587
544static u64 amd_pmu_event_map(int event) 588static u64 amd_pmu_event_map(int hw_event)
545{ 589{
546 return amd_perfmon_event_map[event]; 590 return amd_perfmon_event_map[hw_event];
547} 591}
548 592
549static u64 amd_pmu_raw_event(u64 event) 593static u64 amd_pmu_raw_event(u64 hw_event)
550{ 594{
551#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL 595#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
552#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 596#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
553#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 597#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
554#define K7_EVNTSEL_INV_MASK 0x000800000ULL 598#define K7_EVNTSEL_INV_MASK 0x000800000ULL
555#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL 599#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
556 600
557#define K7_EVNTSEL_MASK \ 601#define K7_EVNTSEL_MASK \
558 (K7_EVNTSEL_EVENT_MASK | \ 602 (K7_EVNTSEL_EVENT_MASK | \
559 K7_EVNTSEL_UNIT_MASK | \ 603 K7_EVNTSEL_UNIT_MASK | \
560 K7_EVNTSEL_EDGE_MASK | \ 604 K7_EVNTSEL_EDGE_MASK | \
561 K7_EVNTSEL_INV_MASK | \ 605 K7_EVNTSEL_INV_MASK | \
562 K7_EVNTSEL_COUNTER_MASK) 606 K7_EVNTSEL_REG_MASK)
563 607
564 return event & K7_EVNTSEL_MASK; 608 return hw_event & K7_EVNTSEL_MASK;
565} 609}
566 610
567/* 611/*
568 * Propagate counter elapsed time into the generic counter. 612 * Propagate event elapsed time into the generic event.
569 * Can only be executed on the CPU where the counter is active. 613 * Can only be executed on the CPU where the event is active.
570 * Returns the delta events processed. 614 * Returns the delta events processed.
571 */ 615 */
572static u64 616static u64
573x86_perf_counter_update(struct perf_counter *counter, 617x86_perf_event_update(struct perf_event *event,
574 struct hw_perf_counter *hwc, int idx) 618 struct hw_perf_event *hwc, int idx)
575{ 619{
576 int shift = 64 - x86_pmu.counter_bits; 620 int shift = 64 - x86_pmu.event_bits;
577 u64 prev_raw_count, new_raw_count; 621 u64 prev_raw_count, new_raw_count;
578 s64 delta; 622 s64 delta;
579 623
624 if (idx == X86_PMC_IDX_FIXED_BTS)
625 return 0;
626
580 /* 627 /*
581 * Careful: an NMI might modify the previous counter value. 628 * Careful: an NMI might modify the previous event value.
582 * 629 *
583 * Our tactic to handle this is to first atomically read and 630 * Our tactic to handle this is to first atomically read and
584 * exchange a new raw count - then add that new-prev delta 631 * exchange a new raw count - then add that new-prev delta
585 * count to the generic counter atomically: 632 * count to the generic event atomically:
586 */ 633 */
587again: 634again:
588 prev_raw_count = atomic64_read(&hwc->prev_count); 635 prev_raw_count = atomic64_read(&hwc->prev_count);
589 rdmsrl(hwc->counter_base + idx, new_raw_count); 636 rdmsrl(hwc->event_base + idx, new_raw_count);
590 637
591 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 638 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
592 new_raw_count) != prev_raw_count) 639 new_raw_count) != prev_raw_count)
@@ -595,7 +642,7 @@ again:
595 /* 642 /*
596 * Now we have the new raw value and have updated the prev 643 * Now we have the new raw value and have updated the prev
597 * timestamp already. We can now calculate the elapsed delta 644 * timestamp already. We can now calculate the elapsed delta
598 * (counter-)time and add that to the generic counter. 645 * (event-)time and add that to the generic event.
599 * 646 *
600 * Careful, not all hw sign-extends above the physical width 647 * Careful, not all hw sign-extends above the physical width
601 * of the count. 648 * of the count.
@@ -603,13 +650,13 @@ again:
603 delta = (new_raw_count << shift) - (prev_raw_count << shift); 650 delta = (new_raw_count << shift) - (prev_raw_count << shift);
604 delta >>= shift; 651 delta >>= shift;
605 652
606 atomic64_add(delta, &counter->count); 653 atomic64_add(delta, &event->count);
607 atomic64_sub(delta, &hwc->period_left); 654 atomic64_sub(delta, &hwc->period_left);
608 655
609 return new_raw_count; 656 return new_raw_count;
610} 657}
611 658
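x86_perf_event_update() relies on a shift pair to sign-extend counters that are narrower than 64 bits: both raw counts are shifted up so the counter's top bit lands in bit 63, subtracted, then shifted back down arithmetically, which gives the right delta even across a counter wrap. Here is a standalone sketch, assuming the 32-bit effective width the P6 description later in this patch uses, and the arithmetic-right-shift behaviour the kernel code also depends on.

/*
 * User-space sketch of the shift trick in x86_perf_event_update().
 * event_bits = 32 matches the P6 case; the sample values are made up.
 */
#include <stdio.h>
#include <stdint.h>

static int64_t counter_delta(uint64_t prev, uint64_t now, int event_bits)
{
	int shift = 64 - event_bits;
	int64_t delta;

	delta = (int64_t)(now << shift) - (int64_t)(prev << shift);
	delta >>= shift;	/* arithmetic shift restores the sign */
	return delta;
}

int main(void)
{
	/* The 32-bit counter wrapped from 0xFFFFFFF0 to 0x00000010: delta is 32. */
	printf("delta = %lld\n",
	       (long long)counter_delta(0xFFFFFFF0ULL, 0x00000010ULL, 32));
	return 0;
}
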
612static atomic_t active_counters; 659static atomic_t active_events;
613static DEFINE_MUTEX(pmc_reserve_mutex); 660static DEFINE_MUTEX(pmc_reserve_mutex);
614 661
615static bool reserve_pmc_hardware(void) 662static bool reserve_pmc_hardware(void)
@@ -620,12 +667,12 @@ static bool reserve_pmc_hardware(void)
620 if (nmi_watchdog == NMI_LOCAL_APIC) 667 if (nmi_watchdog == NMI_LOCAL_APIC)
621 disable_lapic_nmi_watchdog(); 668 disable_lapic_nmi_watchdog();
622 669
623 for (i = 0; i < x86_pmu.num_counters; i++) { 670 for (i = 0; i < x86_pmu.num_events; i++) {
624 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 671 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
625 goto perfctr_fail; 672 goto perfctr_fail;
626 } 673 }
627 674
628 for (i = 0; i < x86_pmu.num_counters; i++) { 675 for (i = 0; i < x86_pmu.num_events; i++) {
629 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 676 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
630 goto eventsel_fail; 677 goto eventsel_fail;
631 } 678 }
@@ -638,7 +685,7 @@ eventsel_fail:
638 for (i--; i >= 0; i--) 685 for (i--; i >= 0; i--)
639 release_evntsel_nmi(x86_pmu.eventsel + i); 686 release_evntsel_nmi(x86_pmu.eventsel + i);
640 687
641 i = x86_pmu.num_counters; 688 i = x86_pmu.num_events;
642 689
643perfctr_fail: 690perfctr_fail:
644 for (i--; i >= 0; i--) 691 for (i--; i >= 0; i--)
@@ -656,7 +703,7 @@ static void release_pmc_hardware(void)
656#ifdef CONFIG_X86_LOCAL_APIC 703#ifdef CONFIG_X86_LOCAL_APIC
657 int i; 704 int i;
658 705
659 for (i = 0; i < x86_pmu.num_counters; i++) { 706 for (i = 0; i < x86_pmu.num_events; i++) {
660 release_perfctr_nmi(x86_pmu.perfctr + i); 707 release_perfctr_nmi(x86_pmu.perfctr + i);
661 release_evntsel_nmi(x86_pmu.eventsel + i); 708 release_evntsel_nmi(x86_pmu.eventsel + i);
662 } 709 }
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void)
666#endif 713#endif
667} 714}
668 715
669static void hw_perf_counter_destroy(struct perf_counter *counter) 716static inline bool bts_available(void)
670{ 717{
671 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { 718 return x86_pmu.enable_bts != NULL;
719}
720
721static inline void init_debug_store_on_cpu(int cpu)
722{
723 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
724
725 if (!ds)
726 return;
727
728 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
729 (u32)((u64)(unsigned long)ds),
730 (u32)((u64)(unsigned long)ds >> 32));
731}
732
733static inline void fini_debug_store_on_cpu(int cpu)
734{
735 if (!per_cpu(cpu_hw_events, cpu).ds)
736 return;
737
738 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
739}
740
741static void release_bts_hardware(void)
742{
743 int cpu;
744
745 if (!bts_available())
746 return;
747
748 get_online_cpus();
749
750 for_each_online_cpu(cpu)
751 fini_debug_store_on_cpu(cpu);
752
753 for_each_possible_cpu(cpu) {
754 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
755
756 if (!ds)
757 continue;
758
759 per_cpu(cpu_hw_events, cpu).ds = NULL;
760
761 kfree((void *)(unsigned long)ds->bts_buffer_base);
762 kfree(ds);
763 }
764
765 put_online_cpus();
766}
767
768static int reserve_bts_hardware(void)
769{
770 int cpu, err = 0;
771
772 if (!bts_available())
773 return 0;
774
775 get_online_cpus();
776
777 for_each_possible_cpu(cpu) {
778 struct debug_store *ds;
779 void *buffer;
780
781 err = -ENOMEM;
782 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
783 if (unlikely(!buffer))
784 break;
785
786 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
787 if (unlikely(!ds)) {
788 kfree(buffer);
789 break;
790 }
791
792 ds->bts_buffer_base = (u64)(unsigned long)buffer;
793 ds->bts_index = ds->bts_buffer_base;
794 ds->bts_absolute_maximum =
795 ds->bts_buffer_base + BTS_BUFFER_SIZE;
796 ds->bts_interrupt_threshold =
797 ds->bts_absolute_maximum - BTS_OVFL_TH;
798
799 per_cpu(cpu_hw_events, cpu).ds = ds;
800 err = 0;
801 }
802
803 if (err)
804 release_bts_hardware();
805 else {
806 for_each_online_cpu(cpu)
807 init_debug_store_on_cpu(cpu);
808 }
809
810 put_online_cpus();
811
812 return err;
813}
814
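reserve_bts_hardware() carves one debug-store area per CPU out of kmalloc'd memory and points the BTS index, absolute maximum and interrupt threshold into it. BTS_BUFFER_SIZE and BTS_OVFL_TH are defined earlier in the kernel file and are not visible in this hunk, so the sketch below uses stand-in values chosen only to make the arithmetic concrete; the 24-byte record size follows the from/to/flags layout used by the drain routine further down.

/*
 * User-space sketch of the debug-store bookkeeping set up by
 * reserve_bts_hardware().  BTS_BUFFER_SIZE and BTS_OVFL_TH are assumed
 * values; the field assignments mirror the kernel hunk above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define BTS_RECORD_SIZE		24			/* from, to, flags: 3 x u64 */
#define BTS_BUFFER_SIZE		(1 << 13)		/* assumed buffer size      */
#define BTS_OVFL_TH		(128 * BTS_RECORD_SIZE)	/* assumed threshold        */

struct debug_store_sketch {
	uint64_t bts_buffer_base;
	uint64_t bts_index;
	uint64_t bts_absolute_maximum;
	uint64_t bts_interrupt_threshold;
};

int main(void)
{
	void *buffer = calloc(1, BTS_BUFFER_SIZE);
	struct debug_store_sketch ds;

	if (!buffer)
		return 1;

	/* Mirrors the assignments in reserve_bts_hardware(). */
	ds.bts_buffer_base	   = (uint64_t)(uintptr_t)buffer;
	ds.bts_index		   = ds.bts_buffer_base;
	ds.bts_absolute_maximum	   = ds.bts_buffer_base + BTS_BUFFER_SIZE;
	ds.bts_interrupt_threshold = ds.bts_absolute_maximum - BTS_OVFL_TH;

	printf("buffer holds %d records, interrupt %d records before the end\n",
	       BTS_BUFFER_SIZE / BTS_RECORD_SIZE, BTS_OVFL_TH / BTS_RECORD_SIZE);
	free(buffer);
	return 0;
}
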
815static void hw_perf_event_destroy(struct perf_event *event)
816{
817 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
672 release_pmc_hardware(); 818 release_pmc_hardware();
819 release_bts_hardware();
673 mutex_unlock(&pmc_reserve_mutex); 820 mutex_unlock(&pmc_reserve_mutex);
674 } 821 }
675} 822}
@@ -680,7 +827,7 @@ static inline int x86_pmu_initialized(void)
680} 827}
681 828
682static inline int 829static inline int
683set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) 830set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
684{ 831{
685 unsigned int cache_type, cache_op, cache_result; 832 unsigned int cache_type, cache_op, cache_result;
686 u64 config, val; 833 u64 config, val;
@@ -712,13 +859,49 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
712 return 0; 859 return 0;
713} 860}
714 861
862static void intel_pmu_enable_bts(u64 config)
863{
864 unsigned long debugctlmsr;
865
866 debugctlmsr = get_debugctlmsr();
867
868 debugctlmsr |= X86_DEBUGCTL_TR;
869 debugctlmsr |= X86_DEBUGCTL_BTS;
870 debugctlmsr |= X86_DEBUGCTL_BTINT;
871
872 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
873 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
874
875 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
876 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
877
878 update_debugctlmsr(debugctlmsr);
879}
880
881static void intel_pmu_disable_bts(void)
882{
883 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 unsigned long debugctlmsr;
885
886 if (!cpuc->ds)
887 return;
888
889 debugctlmsr = get_debugctlmsr();
890
891 debugctlmsr &=
892 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
893 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
894
895 update_debugctlmsr(debugctlmsr);
896}
897
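intel_pmu_enable_bts()/intel_pmu_disable_bts() work purely by toggling bits in IA32_DEBUGCTL. The sketch below shows the value that would be written for a user-only event; the X86_DEBUGCTL_* and EVENTSEL privilege-bit constants are defined outside this hunk, so the values here follow the Intel SDM layout but should be read as assumptions, and the helper is illustrative rather than kernel API.

/*
 * Sketch of the IA32_DEBUGCTL composition in intel_pmu_enable_bts().
 * All constants below are assumed stand-ins for the kernel's defines.
 */
#include <stdio.h>
#include <stdint.h>

#define X86_DEBUGCTL_TR			(1 << 6)	/* enable trace messages        */
#define X86_DEBUGCTL_BTS		(1 << 7)	/* store branches in the DS area */
#define X86_DEBUGCTL_BTINT		(1 << 8)	/* interrupt on buffer threshold */
#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)	/* suppress ring-0 branches      */
#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)	/* suppress ring-3 branches      */

#define EVENTSEL_USR			(1 << 16)	/* assumed "count user mode" bit   */
#define EVENTSEL_OS			(1 << 17)	/* assumed "count kernel mode" bit */

static uint64_t bts_debugctl(uint64_t config)
{
	uint64_t debugctl = X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT;

	/* Mask out the privilege levels the event did not ask for. */
	if (!(config & EVENTSEL_OS))
		debugctl |= X86_DEBUGCTL_BTS_OFF_OS;
	if (!(config & EVENTSEL_USR))
		debugctl |= X86_DEBUGCTL_BTS_OFF_USR;

	return debugctl;
}

int main(void)
{
	/* A user-only BTS event: ring-0 branches get suppressed. */
	printf("debugctl = %#llx\n", (unsigned long long)bts_debugctl(EVENTSEL_USR));
	return 0;
}
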
715/* 898/*
716 * Setup the hardware configuration for a given attr_type 899 * Setup the hardware configuration for a given attr_type
717 */ 900 */
718static int __hw_perf_counter_init(struct perf_counter *counter) 901static int __hw_perf_event_init(struct perf_event *event)
719{ 902{
720 struct perf_counter_attr *attr = &counter->attr; 903 struct perf_event_attr *attr = &event->attr;
721 struct hw_perf_counter *hwc = &counter->hw; 904 struct hw_perf_event *hwc = &event->hw;
722 u64 config; 905 u64 config;
723 int err; 906 int err;
724 907
@@ -726,17 +909,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
726 return -ENODEV; 909 return -ENODEV;
727 910
728 err = 0; 911 err = 0;
729 if (!atomic_inc_not_zero(&active_counters)) { 912 if (!atomic_inc_not_zero(&active_events)) {
730 mutex_lock(&pmc_reserve_mutex); 913 mutex_lock(&pmc_reserve_mutex);
731 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) 914 if (atomic_read(&active_events) == 0) {
732 err = -EBUSY; 915 if (!reserve_pmc_hardware())
733 else 916 err = -EBUSY;
734 atomic_inc(&active_counters); 917 else
918 err = reserve_bts_hardware();
919 }
920 if (!err)
921 atomic_inc(&active_events);
735 mutex_unlock(&pmc_reserve_mutex); 922 mutex_unlock(&pmc_reserve_mutex);
736 } 923 }
737 if (err) 924 if (err)
738 return err; 925 return err;
739 926
927 event->destroy = hw_perf_event_destroy;
928
740 /* 929 /*
741 * Generate PMC IRQs: 930 * Generate PMC IRQs:
742 * (keep 'enabled' bit clear for now) 931 * (keep 'enabled' bit clear for now)
@@ -759,17 +948,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
759 /* 948 /*
760 * If we have a PMU initialized but no APIC 949 * If we have a PMU initialized but no APIC
761 * interrupts, we cannot sample hardware 950 * interrupts, we cannot sample hardware
762 * counters (user-space has to fall back and 951 * events (user-space has to fall back and
763 * sample via a hrtimer based software counter): 952 * sample via a hrtimer based software event):
764 */ 953 */
765 if (!x86_pmu.apic) 954 if (!x86_pmu.apic)
766 return -EOPNOTSUPP; 955 return -EOPNOTSUPP;
767 } 956 }
768 957
769 counter->destroy = hw_perf_counter_destroy;
770
771 /* 958 /*
 772 * Raw event type provides the config in the event structure 959 * Raw hw_event type provides the config in the hw_event structure
773 */ 960 */
774 if (attr->type == PERF_TYPE_RAW) { 961 if (attr->type == PERF_TYPE_RAW) {
775 hwc->config |= x86_pmu.raw_event(attr->config); 962 hwc->config |= x86_pmu.raw_event(attr->config);
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
793 if (config == -1LL) 980 if (config == -1LL)
794 return -EINVAL; 981 return -EINVAL;
795 982
983 /*
984 * Branch tracing:
985 */
986 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
987 (hwc->sample_period == 1)) {
988 /* BTS is not supported by this architecture. */
989 if (!bts_available())
990 return -EOPNOTSUPP;
991
992 /* BTS is currently only allowed for user-mode. */
993 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
994 return -EOPNOTSUPP;
995 }
996
796 hwc->config |= config; 997 hwc->config |= config;
797 998
798 return 0; 999 return 0;
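The new branch-tracing checks above only admit one very specific event shape to BTS: a branch-instructions event that samples every single branch and does not ask for kernel-mode counting. Folded into one standalone predicate below; the PERF_COUNT_HW_* value and the OS bit are assumed constants used only for illustration.

/*
 * Standalone restatement of the two BTS admission checks added above.
 * The constants are assumed stand-ins for the perf ABI / EVNTSEL values.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define HW_BRANCH_INSTRUCTIONS	4		/* assumed PERF_COUNT_HW_* value */
#define EVENTSEL_OS		(1u << 17)	/* assumed "count ring 0" bit    */

static int bts_admissible(uint64_t config, uint64_t sample_period,
			  uint32_t hwc_config, int bts_available)
{
	if (config != HW_BRANCH_INSTRUCTIONS || sample_period != 1)
		return 0;			/* stays on an ordinary PMC   */
	if (!bts_available)
		return -EOPNOTSUPP;		/* no DS/BTS on this CPU      */
	if (hwc_config & EVENTSEL_OS)
		return -EOPNOTSUPP;		/* kernel tracing not allowed */
	return 1;
}

int main(void)
{
	printf("%d\n", bts_admissible(HW_BRANCH_INSTRUCTIONS, 1, 0, 1));
	printf("%d\n", bts_admissible(HW_BRANCH_INSTRUCTIONS, 1, EVENTSEL_OS, 1));
	return 0;
}
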
@@ -800,7 +1001,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
800 1001
801static void p6_pmu_disable_all(void) 1002static void p6_pmu_disable_all(void)
802{ 1003{
803 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1004 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
804 u64 val; 1005 u64 val;
805 1006
806 if (!cpuc->enabled) 1007 if (!cpuc->enabled)
@@ -817,12 +1018,23 @@ static void p6_pmu_disable_all(void)
817 1018
818static void intel_pmu_disable_all(void) 1019static void intel_pmu_disable_all(void)
819{ 1020{
1021 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1022
1023 if (!cpuc->enabled)
1024 return;
1025
1026 cpuc->enabled = 0;
1027 barrier();
1028
820 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 1029 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
1030
1031 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1032 intel_pmu_disable_bts();
821} 1033}
822 1034
823static void amd_pmu_disable_all(void) 1035static void amd_pmu_disable_all(void)
824{ 1036{
825 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1037 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
826 int idx; 1038 int idx;
827 1039
828 if (!cpuc->enabled) 1040 if (!cpuc->enabled)
@@ -831,12 +1043,12 @@ static void amd_pmu_disable_all(void)
831 cpuc->enabled = 0; 1043 cpuc->enabled = 0;
832 /* 1044 /*
833 * ensure we write the disable before we start disabling the 1045 * ensure we write the disable before we start disabling the
834 * counters proper, so that amd_pmu_enable_counter() does the 1046 * events proper, so that amd_pmu_enable_event() does the
835 * right thing. 1047 * right thing.
836 */ 1048 */
837 barrier(); 1049 barrier();
838 1050
839 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1051 for (idx = 0; idx < x86_pmu.num_events; idx++) {
840 u64 val; 1052 u64 val;
841 1053
842 if (!test_bit(idx, cpuc->active_mask)) 1054 if (!test_bit(idx, cpuc->active_mask))
@@ -858,7 +1070,7 @@ void hw_perf_disable(void)
858 1070
859static void p6_pmu_enable_all(void) 1071static void p6_pmu_enable_all(void)
860{ 1072{
861 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1073 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
862 unsigned long val; 1074 unsigned long val;
863 1075
864 if (cpuc->enabled) 1076 if (cpuc->enabled)
@@ -875,12 +1087,30 @@ static void p6_pmu_enable_all(void)
875 1087
876static void intel_pmu_enable_all(void) 1088static void intel_pmu_enable_all(void)
877{ 1089{
1090 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1091
1092 if (cpuc->enabled)
1093 return;
1094
1095 cpuc->enabled = 1;
1096 barrier();
1097
878 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 1098 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1099
1100 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
1101 struct perf_event *event =
1102 cpuc->events[X86_PMC_IDX_FIXED_BTS];
1103
1104 if (WARN_ON_ONCE(!event))
1105 return;
1106
1107 intel_pmu_enable_bts(event->hw.config);
1108 }
879} 1109}
880 1110
881static void amd_pmu_enable_all(void) 1111static void amd_pmu_enable_all(void)
882{ 1112{
883 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1113 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 int idx; 1114 int idx;
885 1115
886 if (cpuc->enabled) 1116 if (cpuc->enabled)
@@ -889,14 +1119,14 @@ static void amd_pmu_enable_all(void)
889 cpuc->enabled = 1; 1119 cpuc->enabled = 1;
890 barrier(); 1120 barrier();
891 1121
892 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1122 for (idx = 0; idx < x86_pmu.num_events; idx++) {
893 struct perf_counter *counter = cpuc->counters[idx]; 1123 struct perf_event *event = cpuc->events[idx];
894 u64 val; 1124 u64 val;
895 1125
896 if (!test_bit(idx, cpuc->active_mask)) 1126 if (!test_bit(idx, cpuc->active_mask))
897 continue; 1127 continue;
898 1128
899 val = counter->hw.config; 1129 val = event->hw.config;
900 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1130 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
901 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 1131 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
902 } 1132 }
@@ -923,19 +1153,19 @@ static inline void intel_pmu_ack_status(u64 ack)
923 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 1153 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
924} 1154}
925 1155
926static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1156static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
927{ 1157{
928 (void)checking_wrmsrl(hwc->config_base + idx, 1158 (void)checking_wrmsrl(hwc->config_base + idx,
929 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 1159 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
930} 1160}
931 1161
932static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1162static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
933{ 1163{
934 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 1164 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
935} 1165}
936 1166
937static inline void 1167static inline void
938intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) 1168intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
939{ 1169{
940 int idx = __idx - X86_PMC_IDX_FIXED; 1170 int idx = __idx - X86_PMC_IDX_FIXED;
941 u64 ctrl_val, mask; 1171 u64 ctrl_val, mask;
@@ -948,10 +1178,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
948} 1178}
949 1179
950static inline void 1180static inline void
951p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1181p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
952{ 1182{
953 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
954 u64 val = P6_NOP_COUNTER; 1184 u64 val = P6_NOP_EVENT;
955 1185
956 if (cpuc->enabled) 1186 if (cpuc->enabled)
957 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -960,36 +1190,44 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
960} 1190}
961 1191
962static inline void 1192static inline void
963intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1193intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
964{ 1194{
1195 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1196 intel_pmu_disable_bts();
1197 return;
1198 }
1199
965 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1200 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
966 intel_pmu_disable_fixed(hwc, idx); 1201 intel_pmu_disable_fixed(hwc, idx);
967 return; 1202 return;
968 } 1203 }
969 1204
970 x86_pmu_disable_counter(hwc, idx); 1205 x86_pmu_disable_event(hwc, idx);
971} 1206}
972 1207
973static inline void 1208static inline void
974amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1209amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
975{ 1210{
976 x86_pmu_disable_counter(hwc, idx); 1211 x86_pmu_disable_event(hwc, idx);
977} 1212}
978 1213
979static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); 1214static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
980 1215
981/* 1216/*
982 * Set the next IRQ period, based on the hwc->period_left value. 1217 * Set the next IRQ period, based on the hwc->period_left value.
983 * To be called with the counter disabled in hw: 1218 * To be called with the event disabled in hw:
984 */ 1219 */
985static int 1220static int
986x86_perf_counter_set_period(struct perf_counter *counter, 1221x86_perf_event_set_period(struct perf_event *event,
987 struct hw_perf_counter *hwc, int idx) 1222 struct hw_perf_event *hwc, int idx)
988{ 1223{
989 s64 left = atomic64_read(&hwc->period_left); 1224 s64 left = atomic64_read(&hwc->period_left);
990 s64 period = hwc->sample_period; 1225 s64 period = hwc->sample_period;
991 int err, ret = 0; 1226 int err, ret = 0;
992 1227
1228 if (idx == X86_PMC_IDX_FIXED_BTS)
1229 return 0;
1230
993 /* 1231 /*
 994 * If we are way outside a reasonable range then just skip forward: 1232
995 */ 1233 */
@@ -1007,7 +1245,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1007 ret = 1; 1245 ret = 1;
1008 } 1246 }
1009 /* 1247 /*
1010 * Quirk: certain CPUs don't like it if just 1 event is left: 1248 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1011 */ 1249 */
1012 if (unlikely(left < 2)) 1250 if (unlikely(left < 2))
1013 left = 2; 1251 left = 2;
@@ -1015,24 +1253,24 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1015 if (left > x86_pmu.max_period) 1253 if (left > x86_pmu.max_period)
1016 left = x86_pmu.max_period; 1254 left = x86_pmu.max_period;
1017 1255
1018 per_cpu(prev_left[idx], smp_processor_id()) = left; 1256 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1019 1257
1020 /* 1258 /*
1021 * The hw counter starts counting from this counter offset, 1259 * The hw event starts counting from this event offset,
1022 * mark it to be able to extra future deltas: 1260 * mark it to be able to extra future deltas:
1023 */ 1261 */
1024 atomic64_set(&hwc->prev_count, (u64)-left); 1262 atomic64_set(&hwc->prev_count, (u64)-left);
1025 1263
1026 err = checking_wrmsrl(hwc->counter_base + idx, 1264 err = checking_wrmsrl(hwc->event_base + idx,
1027 (u64)(-left) & x86_pmu.counter_mask); 1265 (u64)(-left) & x86_pmu.event_mask);
1028 1266
1029 perf_counter_update_userpage(counter); 1267 perf_event_update_userpage(event);
1030 1268
1031 return ret; 1269 return ret;
1032} 1270}
1033 1271
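x86_perf_event_set_period() arms the counter with the two's complement of the remaining period, truncated to the counter width, so the counter overflows (and raises the PMI) after exactly 'left' increments. A small sketch of that arithmetic, assuming the 32-bit P6-style width:

/*
 * Sketch of the -left programming done by x86_perf_event_set_period().
 * event_mask = (1ULL << 32) - 1 matches the P6 setup in this patch.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t event_mask = (1ULL << 32) - 1;
	int64_t left = 100000;				/* remaining period */
	uint64_t programmed = (uint64_t)(-left) & event_mask;

	/* The counter increments until it wraps past event_mask. */
	uint64_t increments_to_overflow = (event_mask - programmed) + 1;

	printf("write %#llx, overflow after %llu events\n",
	       (unsigned long long)programmed,
	       (unsigned long long)increments_to_overflow);
	return 0;
}
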
1034static inline void 1272static inline void
1035intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) 1273intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1036{ 1274{
1037 int idx = __idx - X86_PMC_IDX_FIXED; 1275 int idx = __idx - X86_PMC_IDX_FIXED;
1038 u64 ctrl_val, bits, mask; 1276 u64 ctrl_val, bits, mask;
@@ -1057,9 +1295,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
1057 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1295 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1058} 1296}
1059 1297
1060static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1298static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1061{ 1299{
1062 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1300 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1063 u64 val; 1301 u64 val;
1064 1302
1065 val = hwc->config; 1303 val = hwc->config;
@@ -1070,128 +1308,149 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1070} 1308}
1071 1309
1072 1310
1073static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1311static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1074{ 1312{
1313 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1314 if (!__get_cpu_var(cpu_hw_events).enabled)
1315 return;
1316
1317 intel_pmu_enable_bts(hwc->config);
1318 return;
1319 }
1320
1075 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1321 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1076 intel_pmu_enable_fixed(hwc, idx); 1322 intel_pmu_enable_fixed(hwc, idx);
1077 return; 1323 return;
1078 } 1324 }
1079 1325
1080 x86_pmu_enable_counter(hwc, idx); 1326 x86_pmu_enable_event(hwc, idx);
1081} 1327}
1082 1328
1083static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1329static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1084{ 1330{
1085 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1331 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1086 1332
1087 if (cpuc->enabled) 1333 if (cpuc->enabled)
1088 x86_pmu_enable_counter(hwc, idx); 1334 x86_pmu_enable_event(hwc, idx);
1089} 1335}
1090 1336
1091static int 1337static int
1092fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) 1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1093{ 1339{
1094 unsigned int event; 1340 unsigned int hw_event;
1095 1341
1096 if (!x86_pmu.num_counters_fixed) 1342 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1097 return -1;
1098 1343
1099 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 1344 if (unlikely((hw_event ==
1345 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1346 (hwc->sample_period == 1)))
1347 return X86_PMC_IDX_FIXED_BTS;
1348
1349 if (!x86_pmu.num_events_fixed)
1350 return -1;
1100 1351
1101 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1102 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1103 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1104 return X86_PMC_IDX_FIXED_CPU_CYCLES; 1355 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1105 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) 1356 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1106 return X86_PMC_IDX_FIXED_BUS_CYCLES; 1357 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1107 1358
1108 return -1; 1359 return -1;
1109} 1360}
1110 1361
1111/* 1362/*
1112 * Find a PMC slot for the freshly enabled / scheduled in counter: 1363 * Find a PMC slot for the freshly enabled / scheduled in event:
1113 */ 1364 */
1114static int x86_pmu_enable(struct perf_counter *counter) 1365static int x86_pmu_enable(struct perf_event *event)
1115{ 1366{
1116 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1117 struct hw_perf_counter *hwc = &counter->hw; 1368 struct hw_perf_event *hwc = &event->hw;
1118 int idx; 1369 int idx;
1119 1370
1120 idx = fixed_mode_idx(counter, hwc); 1371 idx = fixed_mode_idx(event, hwc);
1121 if (idx >= 0) { 1372 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN;
1376
1377 hwc->config_base = 0;
1378 hwc->event_base = 0;
1379 hwc->idx = idx;
1380 } else if (idx >= 0) {
1122 /* 1381 /*
1123 * Try to get the fixed counter, if that is already taken 1382 * Try to get the fixed event, if that is already taken
1124 * then try to get a generic counter: 1383 * then try to get a generic event:
1125 */ 1384 */
1126 if (test_and_set_bit(idx, cpuc->used_mask)) 1385 if (test_and_set_bit(idx, cpuc->used_mask))
1127 goto try_generic; 1386 goto try_generic;
1128 1387
1129 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1388 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1130 /* 1389 /*
1131 * We set it so that counter_base + idx in wrmsr/rdmsr maps to 1390 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1132 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: 1391 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1133 */ 1392 */
1134 hwc->counter_base = 1393 hwc->event_base =
1135 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; 1394 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1136 hwc->idx = idx; 1395 hwc->idx = idx;
1137 } else { 1396 } else {
1138 idx = hwc->idx; 1397 idx = hwc->idx;
1139 /* Try to get the previous generic counter again */ 1398 /* Try to get the previous generic event again */
1140 if (test_and_set_bit(idx, cpuc->used_mask)) { 1399 if (test_and_set_bit(idx, cpuc->used_mask)) {
1141try_generic: 1400try_generic:
1142 idx = find_first_zero_bit(cpuc->used_mask, 1401 idx = find_first_zero_bit(cpuc->used_mask,
1143 x86_pmu.num_counters); 1402 x86_pmu.num_events);
1144 if (idx == x86_pmu.num_counters) 1403 if (idx == x86_pmu.num_events)
1145 return -EAGAIN; 1404 return -EAGAIN;
1146 1405
1147 set_bit(idx, cpuc->used_mask); 1406 set_bit(idx, cpuc->used_mask);
1148 hwc->idx = idx; 1407 hwc->idx = idx;
1149 } 1408 }
1150 hwc->config_base = x86_pmu.eventsel; 1409 hwc->config_base = x86_pmu.eventsel;
1151 hwc->counter_base = x86_pmu.perfctr; 1410 hwc->event_base = x86_pmu.perfctr;
1152 } 1411 }
1153 1412
1154 perf_counters_lapic_init(); 1413 perf_events_lapic_init();
1155 1414
1156 x86_pmu.disable(hwc, idx); 1415 x86_pmu.disable(hwc, idx);
1157 1416
1158 cpuc->counters[idx] = counter; 1417 cpuc->events[idx] = event;
1159 set_bit(idx, cpuc->active_mask); 1418 set_bit(idx, cpuc->active_mask);
1160 1419
1161 x86_perf_counter_set_period(counter, hwc, idx); 1420 x86_perf_event_set_period(event, hwc, idx);
1162 x86_pmu.enable(hwc, idx); 1421 x86_pmu.enable(hwc, idx);
1163 1422
1164 perf_counter_update_userpage(counter); 1423 perf_event_update_userpage(event);
1165 1424
1166 return 0; 1425 return 0;
1167} 1426}
1168 1427
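x86_pmu_enable() hands out counter slots through the used_mask bitmap: the BTS or fixed-purpose slot is tried first, and everything else falls back to the first free generic counter. A user-space sketch of the generic fallback, with tiny stand-ins for the kernel's test_and_set_bit()/find_first_zero_bit():

/*
 * Sketch of the generic-counter slot allocation in x86_pmu_enable().
 * NUM_EVENTS and the bitmap helpers are illustrative stand-ins.
 */
#include <stdio.h>

#define NUM_EVENTS	4	/* generic counters, AMD-style */

static int test_and_set_slot(unsigned long *mask, int idx)
{
	int was_set = (*mask >> idx) & 1;

	*mask |= 1UL << idx;
	return was_set;
}

static int grab_generic_slot(unsigned long *used_mask)
{
	int idx;

	for (idx = 0; idx < NUM_EVENTS; idx++)
		if (!test_and_set_slot(used_mask, idx))
			return idx;
	return -1;		/* -EAGAIN in the kernel: all counters busy */
}

int main(void)
{
	unsigned long used_mask = 0;
	int i;

	for (i = 0; i < NUM_EVENTS + 1; i++)
		printf("event %d -> slot %d\n", i, grab_generic_slot(&used_mask));
	return 0;
}
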
1169static void x86_pmu_unthrottle(struct perf_counter *counter) 1428static void x86_pmu_unthrottle(struct perf_event *event)
1170{ 1429{
1171 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1430 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1172 struct hw_perf_counter *hwc = &counter->hw; 1431 struct hw_perf_event *hwc = &event->hw;
1173 1432
1174 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || 1433 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1175 cpuc->counters[hwc->idx] != counter)) 1434 cpuc->events[hwc->idx] != event))
1176 return; 1435 return;
1177 1436
1178 x86_pmu.enable(hwc, hwc->idx); 1437 x86_pmu.enable(hwc, hwc->idx);
1179} 1438}
1180 1439
1181void perf_counter_print_debug(void) 1440void perf_event_print_debug(void)
1182{ 1441{
1183 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1442 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1184 struct cpu_hw_counters *cpuc; 1443 struct cpu_hw_events *cpuc;
1185 unsigned long flags; 1444 unsigned long flags;
1186 int cpu, idx; 1445 int cpu, idx;
1187 1446
1188 if (!x86_pmu.num_counters) 1447 if (!x86_pmu.num_events)
1189 return; 1448 return;
1190 1449
1191 local_irq_save(flags); 1450 local_irq_save(flags);
1192 1451
1193 cpu = smp_processor_id(); 1452 cpu = smp_processor_id();
1194 cpuc = &per_cpu(cpu_hw_counters, cpu); 1453 cpuc = &per_cpu(cpu_hw_events, cpu);
1195 1454
1196 if (x86_pmu.version >= 2) { 1455 if (x86_pmu.version >= 2) {
1197 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); 1456 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
@@ -1207,11 +1466,11 @@ void perf_counter_print_debug(void)
1207 } 1466 }
1208 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1467 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1209 1468
1210 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1469 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1211 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1212 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1471 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1213 1472
1214 prev_left = per_cpu(prev_left[idx], cpu); 1473 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1215 1474
1216 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1475 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1217 cpu, idx, pmc_ctrl); 1476 cpu, idx, pmc_ctrl);
@@ -1220,7 +1479,7 @@ void perf_counter_print_debug(void)
1220 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1479 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1221 cpu, idx, prev_left); 1480 cpu, idx, prev_left);
1222 } 1481 }
1223 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1482 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1224 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1483 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1225 1484
1226 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1485 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1229,10 +1488,69 @@ void perf_counter_print_debug(void)
1229 local_irq_restore(flags); 1488 local_irq_restore(flags);
1230} 1489}
1231 1490
1232static void x86_pmu_disable(struct perf_counter *counter) 1491static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1492{
1493 struct debug_store *ds = cpuc->ds;
1494 struct bts_record {
1495 u64 from;
1496 u64 to;
1497 u64 flags;
1498 };
1499 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1500 struct bts_record *at, *top;
1501 struct perf_output_handle handle;
1502 struct perf_event_header header;
1503 struct perf_sample_data data;
1504 struct pt_regs regs;
1505
1506 if (!event)
1507 return;
1508
1509 if (!ds)
1510 return;
1511
1512 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1513 top = (struct bts_record *)(unsigned long)ds->bts_index;
1514
1515 if (top <= at)
1516 return;
1517
1518 ds->bts_index = ds->bts_buffer_base;
1519
1520
1521 data.period = event->hw.last_period;
1522 data.addr = 0;
1523 regs.ip = 0;
1524
1525 /*
1526 * Prepare a generic sample, i.e. fill in the invariant fields.
1527 * We will overwrite the from and to address before we output
1528 * the sample.
1529 */
1530 perf_prepare_sample(&header, &data, event, &regs);
1531
1532 if (perf_output_begin(&handle, event,
1533 header.size * (top - at), 1, 1))
1534 return;
1535
1536 for (; at < top; at++) {
1537 data.ip = at->from;
1538 data.addr = at->to;
1539
1540 perf_output_sample(&handle, &header, &data, event);
1541 }
1542
1543 perf_output_end(&handle);
1544
1545 /* There's new data available. */
1546 event->hw.interrupts++;
1547 event->pending_kill = POLL_IN;
1548}
1549
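intel_pmu_drain_bts_buffer() consumes every record between bts_buffer_base and bts_index, emits one perf sample per record, and rewinds the index so the hardware refills from the start. Below is a user-space model of that walk, using the same from/to/flags record layout declared inside the function; the sample buffer contents are invented.

/*
 * User-space model of the buffer walk in intel_pmu_drain_bts_buffer().
 */
#include <stdio.h>
#include <stdint.h>

struct bts_record {
	uint64_t from;
	uint64_t to;
	uint64_t flags;
};

static int drain_bts(struct bts_record *base, struct bts_record **index)
{
	struct bts_record *at = base, *top = *index;
	int n = 0;

	if (top <= at)
		return 0;		/* nothing buffered */

	*index = base;			/* hardware refills from the start */

	for (; at < top; at++, n++)	/* one sample per branch record */
		printf("branch %#llx -> %#llx\n",
		       (unsigned long long)at->from, (unsigned long long)at->to);
	return n;
}

int main(void)
{
	struct bts_record buf[4] = {
		{ 0x1000, 0x2000, 0 },
		{ 0x2004, 0x3000, 0 },
	};
	struct bts_record *index = &buf[2];	/* two records pending */

	printf("drained %d records\n", drain_bts(buf, &index));
	return 0;
}
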
1550static void x86_pmu_disable(struct perf_event *event)
1233{ 1551{
1234 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1552 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1235 struct hw_perf_counter *hwc = &counter->hw; 1553 struct hw_perf_event *hwc = &event->hw;
1236 int idx = hwc->idx; 1554 int idx = hwc->idx;
1237 1555
1238 /* 1556 /*
@@ -1244,59 +1562,67 @@ static void x86_pmu_disable(struct perf_counter *counter)
1244 1562
1245 /* 1563 /*
1246 * Make sure the cleared pointer becomes visible before we 1564 * Make sure the cleared pointer becomes visible before we
1247 * (potentially) free the counter: 1565 * (potentially) free the event:
1248 */ 1566 */
1249 barrier(); 1567 barrier();
1250 1568
1251 /* 1569 /*
1252 * Drain the remaining delta count out of a counter 1570 * Drain the remaining delta count out of an event
1253 * that we are disabling: 1571 * that we are disabling:
1254 */ 1572 */
1255 x86_perf_counter_update(counter, hwc, idx); 1573 x86_perf_event_update(event, hwc, idx);
1256 cpuc->counters[idx] = NULL; 1574
1575 /* Drain the remaining BTS records. */
1576 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1577 intel_pmu_drain_bts_buffer(cpuc);
1578
1579 cpuc->events[idx] = NULL;
1257 clear_bit(idx, cpuc->used_mask); 1580 clear_bit(idx, cpuc->used_mask);
1258 1581
1259 perf_counter_update_userpage(counter); 1582 perf_event_update_userpage(event);
1260} 1583}
1261 1584
1262/* 1585/*
1263 * Save and restart an expired counter. Called by NMI contexts, 1586 * Save and restart an expired event. Called by NMI contexts,
1264 * so it has to be careful about preempting normal counter ops: 1587 * so it has to be careful about preempting normal event ops:
1265 */ 1588 */
1266static int intel_pmu_save_and_restart(struct perf_counter *counter) 1589static int intel_pmu_save_and_restart(struct perf_event *event)
1267{ 1590{
1268 struct hw_perf_counter *hwc = &counter->hw; 1591 struct hw_perf_event *hwc = &event->hw;
1269 int idx = hwc->idx; 1592 int idx = hwc->idx;
1270 int ret; 1593 int ret;
1271 1594
1272 x86_perf_counter_update(counter, hwc, idx); 1595 x86_perf_event_update(event, hwc, idx);
1273 ret = x86_perf_counter_set_period(counter, hwc, idx); 1596 ret = x86_perf_event_set_period(event, hwc, idx);
1274 1597
1275 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 1598 if (event->state == PERF_EVENT_STATE_ACTIVE)
1276 intel_pmu_enable_counter(hwc, idx); 1599 intel_pmu_enable_event(hwc, idx);
1277 1600
1278 return ret; 1601 return ret;
1279} 1602}
1280 1603
1281static void intel_pmu_reset(void) 1604static void intel_pmu_reset(void)
1282{ 1605{
1606 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1283 unsigned long flags; 1607 unsigned long flags;
1284 int idx; 1608 int idx;
1285 1609
1286 if (!x86_pmu.num_counters) 1610 if (!x86_pmu.num_events)
1287 return; 1611 return;
1288 1612
1289 local_irq_save(flags); 1613 local_irq_save(flags);
1290 1614
1291 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 1615 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1292 1616
1293 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1617 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1294 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 1618 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1295 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 1619 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1296 } 1620 }
1297 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1621 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1298 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1622 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1299 } 1623 }
1624 if (ds)
1625 ds->bts_index = ds->bts_buffer_base;
1300 1626
1301 local_irq_restore(flags); 1627 local_irq_restore(flags);
1302} 1628}
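The IRQ handlers in the following hunks decide whether a counter actually overflowed by testing its top bit: a counter armed with -period keeps bit (event_bits - 1) set until it wraps, so a clear top bit means the period elapsed. A sketch of that convention, using the 48-bit width and max_period the AMD PMU description later in this patch declares:

/*
 * Sketch of the "use highest bit to detect overflow" convention tested
 * by the handle_irq loops with 1ULL << (event_bits - 1).
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int event_bits = 48;
	const uint64_t event_mask = (1ULL << 48) - 1;
	const uint64_t top_bit = 1ULL << (event_bits - 1);
	uint64_t period = (1ULL << 47) - 1;		/* max_period          */
	uint64_t count = (uint64_t)(-(int64_t)period) & event_mask;

	printf("armed:      top bit %s\n", (count & top_bit) ? "set" : "clear");

	count = (count + period) & event_mask;		/* period events later */
	printf("overflowed: top bit %s\n", (count & top_bit) ? "set" : "clear");
	return 0;
}
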
@@ -1304,39 +1630,38 @@ static void intel_pmu_reset(void)
1304static int p6_pmu_handle_irq(struct pt_regs *regs) 1630static int p6_pmu_handle_irq(struct pt_regs *regs)
1305{ 1631{
1306 struct perf_sample_data data; 1632 struct perf_sample_data data;
1307 struct cpu_hw_counters *cpuc; 1633 struct cpu_hw_events *cpuc;
1308 struct perf_counter *counter; 1634 struct perf_event *event;
1309 struct hw_perf_counter *hwc; 1635 struct hw_perf_event *hwc;
1310 int idx, handled = 0; 1636 int idx, handled = 0;
1311 u64 val; 1637 u64 val;
1312 1638
1313 data.regs = regs;
1314 data.addr = 0; 1639 data.addr = 0;
1315 1640
1316 cpuc = &__get_cpu_var(cpu_hw_counters); 1641 cpuc = &__get_cpu_var(cpu_hw_events);
1317 1642
1318 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1643 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1319 if (!test_bit(idx, cpuc->active_mask)) 1644 if (!test_bit(idx, cpuc->active_mask))
1320 continue; 1645 continue;
1321 1646
1322 counter = cpuc->counters[idx]; 1647 event = cpuc->events[idx];
1323 hwc = &counter->hw; 1648 hwc = &event->hw;
1324 1649
1325 val = x86_perf_counter_update(counter, hwc, idx); 1650 val = x86_perf_event_update(event, hwc, idx);
1326 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1651 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1327 continue; 1652 continue;
1328 1653
1329 /* 1654 /*
1330 * counter overflow 1655 * event overflow
1331 */ 1656 */
1332 handled = 1; 1657 handled = 1;
1333 data.period = counter->hw.last_period; 1658 data.period = event->hw.last_period;
1334 1659
1335 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1660 if (!x86_perf_event_set_period(event, hwc, idx))
1336 continue; 1661 continue;
1337 1662
1338 if (perf_counter_overflow(counter, 1, &data)) 1663 if (perf_event_overflow(event, 1, &data, regs))
1339 p6_pmu_disable_counter(hwc, idx); 1664 p6_pmu_disable_event(hwc, idx);
1340 } 1665 }
1341 1666
1342 if (handled) 1667 if (handled)
@@ -1352,16 +1677,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1352static int intel_pmu_handle_irq(struct pt_regs *regs) 1677static int intel_pmu_handle_irq(struct pt_regs *regs)
1353{ 1678{
1354 struct perf_sample_data data; 1679 struct perf_sample_data data;
1355 struct cpu_hw_counters *cpuc; 1680 struct cpu_hw_events *cpuc;
1356 int bit, loops; 1681 int bit, loops;
1357 u64 ack, status; 1682 u64 ack, status;
1358 1683
1359 data.regs = regs;
1360 data.addr = 0; 1684 data.addr = 0;
1361 1685
1362 cpuc = &__get_cpu_var(cpu_hw_counters); 1686 cpuc = &__get_cpu_var(cpu_hw_events);
1363 1687
1364 perf_disable(); 1688 perf_disable();
1689 intel_pmu_drain_bts_buffer(cpuc);
1365 status = intel_pmu_get_status(); 1690 status = intel_pmu_get_status();
1366 if (!status) { 1691 if (!status) {
1367 perf_enable(); 1692 perf_enable();
@@ -1371,8 +1696,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1371 loops = 0; 1696 loops = 0;
1372again: 1697again:
1373 if (++loops > 100) { 1698 if (++loops > 100) {
1374 WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); 1699 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1375 perf_counter_print_debug(); 1700 perf_event_print_debug();
1376 intel_pmu_reset(); 1701 intel_pmu_reset();
1377 perf_enable(); 1702 perf_enable();
1378 return 1; 1703 return 1;
@@ -1381,19 +1706,19 @@ again:
1381 inc_irq_stat(apic_perf_irqs); 1706 inc_irq_stat(apic_perf_irqs);
1382 ack = status; 1707 ack = status;
1383 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 1708 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1384 struct perf_counter *counter = cpuc->counters[bit]; 1709 struct perf_event *event = cpuc->events[bit];
1385 1710
1386 clear_bit(bit, (unsigned long *) &status); 1711 clear_bit(bit, (unsigned long *) &status);
1387 if (!test_bit(bit, cpuc->active_mask)) 1712 if (!test_bit(bit, cpuc->active_mask))
1388 continue; 1713 continue;
1389 1714
1390 if (!intel_pmu_save_and_restart(counter)) 1715 if (!intel_pmu_save_and_restart(event))
1391 continue; 1716 continue;
1392 1717
1393 data.period = counter->hw.last_period; 1718 data.period = event->hw.last_period;
1394 1719
1395 if (perf_counter_overflow(counter, 1, &data)) 1720 if (perf_event_overflow(event, 1, &data, regs))
1396 intel_pmu_disable_counter(&counter->hw, bit); 1721 intel_pmu_disable_event(&event->hw, bit);
1397 } 1722 }
1398 1723
1399 intel_pmu_ack_status(ack); 1724 intel_pmu_ack_status(ack);
@@ -1413,39 +1738,38 @@ again:
1413static int amd_pmu_handle_irq(struct pt_regs *regs) 1738static int amd_pmu_handle_irq(struct pt_regs *regs)
1414{ 1739{
1415 struct perf_sample_data data; 1740 struct perf_sample_data data;
1416 struct cpu_hw_counters *cpuc; 1741 struct cpu_hw_events *cpuc;
1417 struct perf_counter *counter; 1742 struct perf_event *event;
1418 struct hw_perf_counter *hwc; 1743 struct hw_perf_event *hwc;
1419 int idx, handled = 0; 1744 int idx, handled = 0;
1420 u64 val; 1745 u64 val;
1421 1746
1422 data.regs = regs;
1423 data.addr = 0; 1747 data.addr = 0;
1424 1748
1425 cpuc = &__get_cpu_var(cpu_hw_counters); 1749 cpuc = &__get_cpu_var(cpu_hw_events);
1426 1750
1427 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1751 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1428 if (!test_bit(idx, cpuc->active_mask)) 1752 if (!test_bit(idx, cpuc->active_mask))
1429 continue; 1753 continue;
1430 1754
1431 counter = cpuc->counters[idx]; 1755 event = cpuc->events[idx];
1432 hwc = &counter->hw; 1756 hwc = &event->hw;
1433 1757
1434 val = x86_perf_counter_update(counter, hwc, idx); 1758 val = x86_perf_event_update(event, hwc, idx);
1435 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1759 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1436 continue; 1760 continue;
1437 1761
1438 /* 1762 /*
1439 * counter overflow 1763 * event overflow
1440 */ 1764 */
1441 handled = 1; 1765 handled = 1;
1442 data.period = counter->hw.last_period; 1766 data.period = event->hw.last_period;
1443 1767
1444 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1768 if (!x86_perf_event_set_period(event, hwc, idx))
1445 continue; 1769 continue;
1446 1770
1447 if (perf_counter_overflow(counter, 1, &data)) 1771 if (perf_event_overflow(event, 1, &data, regs))
1448 amd_pmu_disable_counter(hwc, idx); 1772 amd_pmu_disable_event(hwc, idx);
1449 } 1773 }
1450 1774
1451 if (handled) 1775 if (handled)
@@ -1459,18 +1783,18 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
1459 irq_enter(); 1783 irq_enter();
1460 ack_APIC_irq(); 1784 ack_APIC_irq();
1461 inc_irq_stat(apic_pending_irqs); 1785 inc_irq_stat(apic_pending_irqs);
1462 perf_counter_do_pending(); 1786 perf_event_do_pending();
1463 irq_exit(); 1787 irq_exit();
1464} 1788}
1465 1789
1466void set_perf_counter_pending(void) 1790void set_perf_event_pending(void)
1467{ 1791{
1468#ifdef CONFIG_X86_LOCAL_APIC 1792#ifdef CONFIG_X86_LOCAL_APIC
1469 apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1793 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1470#endif 1794#endif
1471} 1795}
1472 1796
1473void perf_counters_lapic_init(void) 1797void perf_events_lapic_init(void)
1474{ 1798{
1475#ifdef CONFIG_X86_LOCAL_APIC 1799#ifdef CONFIG_X86_LOCAL_APIC
1476 if (!x86_pmu.apic || !x86_pmu_initialized()) 1800 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1484,13 +1808,13 @@ void perf_counters_lapic_init(void)
1484} 1808}
1485 1809
1486static int __kprobes 1810static int __kprobes
1487perf_counter_nmi_handler(struct notifier_block *self, 1811perf_event_nmi_handler(struct notifier_block *self,
1488 unsigned long cmd, void *__args) 1812 unsigned long cmd, void *__args)
1489{ 1813{
1490 struct die_args *args = __args; 1814 struct die_args *args = __args;
1491 struct pt_regs *regs; 1815 struct pt_regs *regs;
1492 1816
1493 if (!atomic_read(&active_counters)) 1817 if (!atomic_read(&active_events))
1494 return NOTIFY_DONE; 1818 return NOTIFY_DONE;
1495 1819
1496 switch (cmd) { 1820 switch (cmd) {
@@ -1509,7 +1833,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
1509#endif 1833#endif
1510 /* 1834 /*
1511 * Can't rely on the handled return value to say it was our NMI, two 1835 * Can't rely on the handled return value to say it was our NMI, two
1512 * counters could trigger 'simultaneously' raising two back-to-back NMIs. 1836 * events could trigger 'simultaneously' raising two back-to-back NMIs.
1513 * 1837 *
1514 * If the first NMI handles both, the latter will be empty and daze 1838 * If the first NMI handles both, the latter will be empty and daze
1515 * the CPU. 1839 * the CPU.
@@ -1519,8 +1843,8 @@ perf_counter_nmi_handler(struct notifier_block *self,
1519 return NOTIFY_STOP; 1843 return NOTIFY_STOP;
1520} 1844}
1521 1845
1522static __read_mostly struct notifier_block perf_counter_nmi_notifier = { 1846static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1523 .notifier_call = perf_counter_nmi_handler, 1847 .notifier_call = perf_event_nmi_handler,
1524 .next = NULL, 1848 .next = NULL,
1525 .priority = 1 1849 .priority = 1
1526}; 1850};
@@ -1530,8 +1854,8 @@ static struct x86_pmu p6_pmu = {
1530 .handle_irq = p6_pmu_handle_irq, 1854 .handle_irq = p6_pmu_handle_irq,
1531 .disable_all = p6_pmu_disable_all, 1855 .disable_all = p6_pmu_disable_all,
1532 .enable_all = p6_pmu_enable_all, 1856 .enable_all = p6_pmu_enable_all,
1533 .enable = p6_pmu_enable_counter, 1857 .enable = p6_pmu_enable_event,
1534 .disable = p6_pmu_disable_counter, 1858 .disable = p6_pmu_disable_event,
1535 .eventsel = MSR_P6_EVNTSEL0, 1859 .eventsel = MSR_P6_EVNTSEL0,
1536 .perfctr = MSR_P6_PERFCTR0, 1860 .perfctr = MSR_P6_PERFCTR0,
1537 .event_map = p6_pmu_event_map, 1861 .event_map = p6_pmu_event_map,
@@ -1540,16 +1864,16 @@ static struct x86_pmu p6_pmu = {
1540 .apic = 1, 1864 .apic = 1,
1541 .max_period = (1ULL << 31) - 1, 1865 .max_period = (1ULL << 31) - 1,
1542 .version = 0, 1866 .version = 0,
1543 .num_counters = 2, 1867 .num_events = 2,
1544 /* 1868 /*
1545 * Counters have 40 bits implemented. However they are designed such 1869 * Events have 40 bits implemented. However they are designed such
1546 * that bits [32-39] are sign extensions of bit 31. As such the 1870 * that bits [32-39] are sign extensions of bit 31. As such the
1547 * effective width of a counter for P6-like PMU is 32 bits only. 1871 * effective width of an event for P6-like PMU is 32 bits only.
1548 * 1872 *
1549 * See IA-32 Intel Architecture Software developer manual Vol 3B 1873 * See IA-32 Intel Architecture Software developer manual Vol 3B
1550 */ 1874 */
1551 .counter_bits = 32, 1875 .event_bits = 32,
1552 .counter_mask = (1ULL << 32) - 1, 1876 .event_mask = (1ULL << 32) - 1,
1553}; 1877};
1554 1878
1555static struct x86_pmu intel_pmu = { 1879static struct x86_pmu intel_pmu = {
@@ -1557,8 +1881,8 @@ static struct x86_pmu intel_pmu = {
1557 .handle_irq = intel_pmu_handle_irq, 1881 .handle_irq = intel_pmu_handle_irq,
1558 .disable_all = intel_pmu_disable_all, 1882 .disable_all = intel_pmu_disable_all,
1559 .enable_all = intel_pmu_enable_all, 1883 .enable_all = intel_pmu_enable_all,
1560 .enable = intel_pmu_enable_counter, 1884 .enable = intel_pmu_enable_event,
1561 .disable = intel_pmu_disable_counter, 1885 .disable = intel_pmu_disable_event,
1562 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 1886 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1563 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 1887 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1564 .event_map = intel_pmu_event_map, 1888 .event_map = intel_pmu_event_map,
@@ -1568,9 +1892,11 @@ static struct x86_pmu intel_pmu = {
1568 /* 1892 /*
1569 * Intel PMCs cannot be accessed sanely above 32 bit width, 1893 * Intel PMCs cannot be accessed sanely above 32 bit width,
1570 * so we install an artificial 1<<31 period regardless of 1894 * so we install an artificial 1<<31 period regardless of
1571 * the generic counter period: 1895 * the generic event period:
1572 */ 1896 */
1573 .max_period = (1ULL << 31) - 1, 1897 .max_period = (1ULL << 31) - 1,
1898 .enable_bts = intel_pmu_enable_bts,
1899 .disable_bts = intel_pmu_disable_bts,
1574}; 1900};
1575 1901
1576static struct x86_pmu amd_pmu = { 1902static struct x86_pmu amd_pmu = {
@@ -1578,16 +1904,16 @@ static struct x86_pmu amd_pmu = {
1578 .handle_irq = amd_pmu_handle_irq, 1904 .handle_irq = amd_pmu_handle_irq,
1579 .disable_all = amd_pmu_disable_all, 1905 .disable_all = amd_pmu_disable_all,
1580 .enable_all = amd_pmu_enable_all, 1906 .enable_all = amd_pmu_enable_all,
1581 .enable = amd_pmu_enable_counter, 1907 .enable = amd_pmu_enable_event,
1582 .disable = amd_pmu_disable_counter, 1908 .disable = amd_pmu_disable_event,
1583 .eventsel = MSR_K7_EVNTSEL0, 1909 .eventsel = MSR_K7_EVNTSEL0,
1584 .perfctr = MSR_K7_PERFCTR0, 1910 .perfctr = MSR_K7_PERFCTR0,
1585 .event_map = amd_pmu_event_map, 1911 .event_map = amd_pmu_event_map,
1586 .raw_event = amd_pmu_raw_event, 1912 .raw_event = amd_pmu_raw_event,
1587 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 1913 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1588 .num_counters = 4, 1914 .num_events = 4,
1589 .counter_bits = 48, 1915 .event_bits = 48,
1590 .counter_mask = (1ULL << 48) - 1, 1916 .event_mask = (1ULL << 48) - 1,
1591 .apic = 1, 1917 .apic = 1,
1592 /* use highest bit to detect overflow */ 1918 /* use highest bit to detect overflow */
1593 .max_period = (1ULL << 47) - 1, 1919 .max_period = (1ULL << 47) - 1,
@@ -1644,7 +1970,7 @@ static int intel_pmu_init(void)
1644 1970
1645 /* 1971 /*
1646 * Check whether the Architectural PerfMon supports 1972 * Check whether the Architectural PerfMon supports
1647 * Branch Misses Retired Event or not. 1973 * Branch Misses Retired hw_event or not.
1648 */ 1974 */
1649 cpuid(10, &eax.full, &ebx, &unused, &edx.full); 1975 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1650 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) 1976 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
@@ -1656,15 +1982,15 @@ static int intel_pmu_init(void)
1656 1982
1657 x86_pmu = intel_pmu; 1983 x86_pmu = intel_pmu;
1658 x86_pmu.version = version; 1984 x86_pmu.version = version;
1659 x86_pmu.num_counters = eax.split.num_counters; 1985 x86_pmu.num_events = eax.split.num_events;
1660 x86_pmu.counter_bits = eax.split.bit_width; 1986 x86_pmu.event_bits = eax.split.bit_width;
1661 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; 1987 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
1662 1988
1663 /* 1989 /*
1664 * Quirk: v2 perfmon does not report fixed-purpose counters, so 1990 * Quirk: v2 perfmon does not report fixed-purpose events, so
1665 * assume at least 3 counters: 1991 * assume at least 3 events:
1666 */ 1992 */
1667 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 1993 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
1668 1994
1669 /* 1995 /*
1670 * Install the hw-cache-events table: 1996 * Install the hw-cache-events table:
@@ -1711,11 +2037,11 @@ static int amd_pmu_init(void)
1711 return 0; 2037 return 0;
1712} 2038}
1713 2039
1714void __init init_hw_perf_counters(void) 2040void __init init_hw_perf_events(void)
1715{ 2041{
1716 int err; 2042 int err;
1717 2043
1718 pr_info("Performance Counters: "); 2044 pr_info("Performance Events: ");
1719 2045
1720 switch (boot_cpu_data.x86_vendor) { 2046 switch (boot_cpu_data.x86_vendor) {
1721 case X86_VENDOR_INTEL: 2047 case X86_VENDOR_INTEL:
@@ -1728,45 +2054,45 @@ void __init init_hw_perf_counters(void)
1728 return; 2054 return;
1729 } 2055 }
1730 if (err != 0) { 2056 if (err != 0) {
1731 pr_cont("no PMU driver, software counters only.\n"); 2057 pr_cont("no PMU driver, software events only.\n");
1732 return; 2058 return;
1733 } 2059 }
1734 2060
1735 pr_cont("%s PMU driver.\n", x86_pmu.name); 2061 pr_cont("%s PMU driver.\n", x86_pmu.name);
1736 2062
1737 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 2063 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
1738 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", 2064 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1739 x86_pmu.num_counters, X86_PMC_MAX_GENERIC); 2065 x86_pmu.num_events, X86_PMC_MAX_GENERIC);
1740 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 2066 x86_pmu.num_events = X86_PMC_MAX_GENERIC;
1741 } 2067 }
1742 perf_counter_mask = (1 << x86_pmu.num_counters) - 1; 2068 perf_event_mask = (1 << x86_pmu.num_events) - 1;
1743 perf_max_counters = x86_pmu.num_counters; 2069 perf_max_events = x86_pmu.num_events;
1744 2070
1745 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 2071 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
1746 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", 2072 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1747 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); 2073 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
1748 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; 2074 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
1749 } 2075 }
1750 2076
1751 perf_counter_mask |= 2077 perf_event_mask |=
1752 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; 2078 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
1753 x86_pmu.intel_ctrl = perf_counter_mask; 2079 x86_pmu.intel_ctrl = perf_event_mask;
1754 2080
1755 perf_counters_lapic_init(); 2081 perf_events_lapic_init();
1756 register_die_notifier(&perf_counter_nmi_notifier); 2082 register_die_notifier(&perf_event_nmi_notifier);
1757 2083
1758 pr_info("... version: %d\n", x86_pmu.version); 2084 pr_info("... version: %d\n", x86_pmu.version);
1759 pr_info("... bit width: %d\n", x86_pmu.counter_bits); 2085 pr_info("... bit width: %d\n", x86_pmu.event_bits);
1760 pr_info("... generic counters: %d\n", x86_pmu.num_counters); 2086 pr_info("... generic registers: %d\n", x86_pmu.num_events);
1761 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); 2087 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask);
1762 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 2088 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1763 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); 2089 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
1764 pr_info("... counter mask: %016Lx\n", perf_counter_mask); 2090 pr_info("... event mask: %016Lx\n", perf_event_mask);
1765} 2091}
1766 2092
1767static inline void x86_pmu_read(struct perf_counter *counter) 2093static inline void x86_pmu_read(struct perf_event *event)
1768{ 2094{
1769 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); 2095 x86_perf_event_update(event, &event->hw, event->hw.idx);
1770} 2096}
1771 2097
1772static const struct pmu pmu = { 2098static const struct pmu pmu = {
@@ -1776,13 +2102,16 @@ static const struct pmu pmu = {
1776 .unthrottle = x86_pmu_unthrottle, 2102 .unthrottle = x86_pmu_unthrottle,
1777}; 2103};
1778 2104
1779const struct pmu *hw_perf_counter_init(struct perf_counter *counter) 2105const struct pmu *hw_perf_event_init(struct perf_event *event)
1780{ 2106{
1781 int err; 2107 int err;
1782 2108
1783 err = __hw_perf_counter_init(counter); 2109 err = __hw_perf_event_init(event);
1784 if (err) 2110 if (err) {
2111 if (event->destroy)
2112 event->destroy(event);
1785 return ERR_PTR(err); 2113 return ERR_PTR(err);
2114 }
1786 2115
1787 return &pmu; 2116 return &pmu;
1788} 2117}
@@ -1798,8 +2127,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1798 entry->ip[entry->nr++] = ip; 2127 entry->ip[entry->nr++] = ip;
1799} 2128}
1800 2129
1801static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 2130static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
1802static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 2131static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1803static DEFINE_PER_CPU(int, in_nmi_frame); 2132static DEFINE_PER_CPU(int, in_nmi_frame);
1804 2133
1805 2134
@@ -1952,9 +2281,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1952 struct perf_callchain_entry *entry; 2281 struct perf_callchain_entry *entry;
1953 2282
1954 if (in_nmi()) 2283 if (in_nmi())
1955 entry = &__get_cpu_var(nmi_entry); 2284 entry = &__get_cpu_var(pmc_nmi_entry);
1956 else 2285 else
1957 entry = &__get_cpu_var(irq_entry); 2286 entry = &__get_cpu_var(pmc_irq_entry);
1958 2287
1959 entry->nr = 0; 2288 entry->nr = 0;
1960 2289
@@ -1962,3 +2291,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1962 2291
1963 return entry; 2292 return entry;
1964} 2293}
2294
2295void hw_perf_event_setup_online(int cpu)
2296{
2297 init_debug_store_on_cpu(cpu);
2298}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index e60ed740d2b3..fab786f60ed6 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_counter.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
@@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
68 /* returns the bit offset of the performance counter register */ 68 /* returns the bit offset of the performance counter register */
69 switch (boot_cpu_data.x86_vendor) { 69 switch (boot_cpu_data.x86_vendor) {
70 case X86_VENDOR_AMD: 70 case X86_VENDOR_AMD:
71 return (msr - MSR_K7_PERFCTR0); 71 return msr - MSR_K7_PERFCTR0;
72 case X86_VENDOR_INTEL: 72 case X86_VENDOR_INTEL:
73 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 73 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
74 return (msr - MSR_ARCH_PERFMON_PERFCTR0); 74 return msr - MSR_ARCH_PERFMON_PERFCTR0;
75 75
76 switch (boot_cpu_data.x86) { 76 switch (boot_cpu_data.x86) {
77 case 6: 77 case 6:
78 return (msr - MSR_P6_PERFCTR0); 78 return msr - MSR_P6_PERFCTR0;
79 case 15: 79 case 15:
80 return (msr - MSR_P4_BPU_PERFCTR0); 80 return msr - MSR_P4_BPU_PERFCTR0;
81 } 81 }
82 } 82 }
83 return 0; 83 return 0;
@@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
92 /* returns the bit offset of the event selection register */ 92 /* returns the bit offset of the event selection register */
93 switch (boot_cpu_data.x86_vendor) { 93 switch (boot_cpu_data.x86_vendor) {
94 case X86_VENDOR_AMD: 94 case X86_VENDOR_AMD:
95 return (msr - MSR_K7_EVNTSEL0); 95 return msr - MSR_K7_EVNTSEL0;
96 case X86_VENDOR_INTEL: 96 case X86_VENDOR_INTEL:
97 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 97 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
98 return (msr - MSR_ARCH_PERFMON_EVENTSEL0); 98 return msr - MSR_ARCH_PERFMON_EVENTSEL0;
99 99
100 switch (boot_cpu_data.x86) { 100 switch (boot_cpu_data.x86) {
101 case 6: 101 case 6:
102 return (msr - MSR_P6_EVNTSEL0); 102 return msr - MSR_P6_EVNTSEL0;
103 case 15: 103 case 15:
104 return (msr - MSR_P4_BSU_ESCR0); 104 return msr - MSR_P4_BSU_ESCR0;
105 } 105 }
106 } 106 }
107 return 0; 107 return 0;
@@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
113{ 113{
114 BUG_ON(counter > NMI_MAX_COUNTER_BITS); 114 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
115 115
116 return (!test_bit(counter, perfctr_nmi_owner)); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118 118
 119/* checks an msr for availability */ 119/* checks an msr for availability */
@@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr)
124 counter = nmi_perfctr_msr_to_bit(msr); 124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS); 125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126 126
127 return (!test_bit(counter, perfctr_nmi_owner)); 127 return !test_bit(counter, perfctr_nmi_owner);
128} 128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 130
@@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz)
237 */ 237 */
238 counter_val = (u64)cpu_khz * 1000; 238 counter_val = (u64)cpu_khz * 1000;
239 do_div(counter_val, retval); 239 do_div(counter_val, retval);
240 if (counter_val > 0x7fffffffULL) { 240 if (counter_val > 0x7fffffffULL) {
241 u64 count = (u64)cpu_khz * 1000; 241 u64 count = (u64)cpu_khz * 1000;
242 do_div(count, 0x7fffffffUL); 242 do_div(count, 0x7fffffffUL);
243 retval = count + 1; 243 retval = count + 1;
@@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
251 u64 count = (u64)cpu_khz * 1000; 251 u64 count = (u64)cpu_khz * 1000;
252 252
253 do_div(count, nmi_hz); 253 do_div(count, nmi_hz);
254 if(descr) 254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count); 255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsrl(perfctr_msr, 0 - count); 256 wrmsrl(perfctr_msr, 0 - count);
257} 257}
@@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
262 u64 count = (u64)cpu_khz * 1000; 262 u64 count = (u64)cpu_khz * 1000;
263 263
264 do_div(count, nmi_hz); 264 do_div(count, nmi_hz);
265 if(descr) 265 if (descr)
266 pr_debug("setting %s to -0x%08Lx\n", descr, count); 266 pr_debug("setting %s to -0x%08Lx\n", descr, count);
267 wrmsr(perfctr_msr, (u32)(-count), 0); 267 wrmsr(perfctr_msr, (u32)(-count), 0);
268} 268}
@@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz)
296 296
297 /* setup the timer */ 297 /* setup the timer */
298 wrmsr(evntsel_msr, evntsel, 0); 298 wrmsr(evntsel_msr, evntsel, 0);
299 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 299 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
300 300
301 /* initialize the wd struct before enabling */ 301 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 302 wd->perfctr_msr = perfctr_msr;
@@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz)
387 /* setup the timer */ 387 /* setup the timer */
388 wrmsr(evntsel_msr, evntsel, 0); 388 wrmsr(evntsel_msr, evntsel, 0);
389 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 389 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
390 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 390 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
391 391
392 /* initialize the wd struct before enabling */ 392 /* initialize the wd struct before enabling */
393 wd->perfctr_msr = perfctr_msr; 393 wd->perfctr_msr = perfctr_msr;
@@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
415 apic_write(APIC_LVTPC, APIC_DM_NMI); 415 apic_write(APIC_LVTPC, APIC_DM_NMI);
416 416
417 /* P6/ARCH_PERFMON has 32 bit counter write */ 417 /* P6/ARCH_PERFMON has 32 bit counter write */
418 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); 418 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
419} 419}
420 420
421static const struct wd_ops p6_wd_ops = { 421static const struct wd_ops p6_wd_ops = {
@@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz)
490 if (smp_num_siblings == 2) { 490 if (smp_num_siblings == 2) {
491 unsigned int ebx, apicid; 491 unsigned int ebx, apicid;
492 492
493 ebx = cpuid_ebx(1); 493 ebx = cpuid_ebx(1);
494 apicid = (ebx >> 24) & 0xff; 494 apicid = (ebx >> 24) & 0xff;
495 ht_num = apicid & 1; 495 ht_num = apicid & 1;
496 } else 496 } else
497#endif 497#endif
498 ht_num = 0; 498 ht_num = 0;
@@ -544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz)
544 } 544 }
545 545
546 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 546 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
547 | P4_ESCR_OS 547 | P4_ESCR_OS
548 | P4_ESCR_USR; 548 | P4_ESCR_USR;
549 549
550 cccr_val |= P4_CCCR_THRESHOLD(15) 550 cccr_val |= P4_CCCR_THRESHOLD(15)
@@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
612{ 612{
613 unsigned dummy; 613 unsigned dummy;
614 /* 614 /*
615 * P4 quirks: 615 * P4 quirks:
616 * - An overflown perfctr will assert its interrupt 616 * - An overflown perfctr will assert its interrupt
617 * until the OVF flag in its CCCR is cleared. 617 * until the OVF flag in its CCCR is cleared.
618 * - LVTPC is masked on interrupt and must be 618 * - LVTPC is masked on interrupt and must be
@@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
662 * NOTE: Corresponding bit = 0 in ebx indicates event present. 662 * NOTE: Corresponding bit = 0 in ebx indicates event present.
663 */ 663 */
664 cpuid(10, &(eax.full), &ebx, &unused, &unused); 664 cpuid(10, &(eax.full), &ebx, &unused, &unused);
665 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || 665 if ((eax.split.mask_length <
666 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
666 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) 667 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
667 return 0; 668 return 0;
668 669
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index d5e30397246b..62ac8cb6ba27 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
116 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); 116 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
117#endif 117#endif
118 seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); 118 seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
119#ifdef CONFIG_X86_64
120 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); 119 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
121 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 120 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
122 c->x86_phys_bits, c->x86_virt_bits); 121 c->x86_phys_bits, c->x86_virt_bits);
123#endif
124 122
125 seq_printf(m, "power management:"); 123 seq_printf(m, "power management:");
126 for (i = 0; i < 32; i++) { 124 for (i = 0; i < 32; i++) {
@@ -128,7 +126,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
128 if (i < ARRAY_SIZE(x86_power_flags) && 126 if (i < ARRAY_SIZE(x86_power_flags) &&
129 x86_power_flags[i]) 127 x86_power_flags[i])
130 seq_printf(m, "%s%s", 128 seq_printf(m, "%s%s",
131 x86_power_flags[i][0]?" ":"", 129 x86_power_flags[i][0] ? " " : "",
132 x86_power_flags[i]); 130 x86_power_flags[i]);
133 else 131 else
134 seq_printf(m, " [%d]", i); 132 seq_printf(m, " [%d]", i);
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 000000000000..a640ae5ad201
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
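
The new sched.c relies on calc_aperfmperf_ratio(), which is not part of this hunk; the sketch below is a hypothetical stand-in showing the intended arithmetic: the frequency-scaled "power" of a CPU is the ratio of APERF (actual) to MPERF (reference) cycle deltas over the sampling window, expressed in SCHED_LOAD_SCALE units.

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SCALE 1024UL	/* assumed: 1 << 10, the scheduler's unit load */

/* Hypothetical stand-in for calc_aperfmperf_ratio(). */
static unsigned long aperfmperf_ratio(uint64_t aperf_delta, uint64_t mperf_delta)
{
	if (!mperf_delta)
		return SCHED_LOAD_SCALE;
	return (unsigned long)((aperf_delta * SCHED_LOAD_SCALE) / mperf_delta);
}

int main(void)
{
	/* CPU ran at ~50% of its reference frequency during the window */
	printf("ratio: %lu\n", aperfmperf_ratio(500000, 1000000));	/* -> 512 */
	/* turbo: actual cycles exceed reference cycles */
	printf("ratio: %lu\n", aperfmperf_ratio(1200000, 1000000));	/* -> 1228 */
	return 0;
}

A half-speed window yields 512 and a turbo window can exceed 1024, which is what arch_scale_freq_power() feeds back to the scheduler.
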
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 284c399e3234..1cbed97b59cf 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,6 +24,7 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <asm/div64.h> 25#include <asm/div64.h>
26#include <asm/vmware.h> 26#include <asm/vmware.h>
27#include <asm/x86_init.h>
27 28
28#define CPUID_VMWARE_INFO_LEAF 0x40000000 29#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 30#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -47,19 +48,33 @@ static inline int __vmware_platform(void)
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; 48 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48} 49}
49 50
50static unsigned long __vmware_get_tsc_khz(void) 51static unsigned long vmware_get_tsc_khz(void)
51{ 52{
52 uint64_t tsc_hz; 53 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx; 54 uint32_t eax, ebx, ecx, edx;
55
56 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
57
58 tsc_hz = eax | (((uint64_t)ebx) << 32);
59 do_div(tsc_hz, 1000);
60 BUG_ON(tsc_hz >> 32);
61 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
62 (unsigned long) tsc_hz / 1000,
63 (unsigned long) tsc_hz % 1000);
64 return tsc_hz;
65}
66
67void __init vmware_platform_setup(void)
68{
69 uint32_t eax, ebx, ecx, edx;
54 70
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 71 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56 72
57 if (ebx == UINT_MAX) 73 if (ebx != UINT_MAX)
58 return 0; 74 x86_platform.calibrate_tsc = vmware_get_tsc_khz;
59 tsc_hz = eax | (((uint64_t)ebx) << 32); 75 else
60 do_div(tsc_hz, 1000); 76 printk(KERN_WARNING
61 BUG_ON(tsc_hz >> 32); 77 "Failed to get TSC freq from the hypervisor\n");
62 return tsc_hz;
63} 78}
64 79
65/* 80/*
@@ -87,12 +102,6 @@ int vmware_platform(void)
87 return 0; 102 return 0;
88} 103}
89 104
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/* 105/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 106 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can 107 * Still, due to timing difference when running on virtual cpus, the TSC can
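
The refactored vmware_get_tsc_khz() keeps the same arithmetic as the old __vmware_get_tsc_khz(): the GETHZ hypercall returns the TSC frequency in Hz split across eax (low 32 bits) and ebx (high 32 bits), and the result is divided down to kHz for x86_platform.calibrate_tsc. A standalone sketch of just that conversion:

#include <stdio.h>
#include <stdint.h>

static unsigned long gethz_to_khz(uint32_t eax, uint32_t ebx)
{
	/* reassemble the 64-bit Hz value, then convert to kHz */
	uint64_t tsc_hz = (uint64_t)eax | ((uint64_t)ebx << 32);

	return (unsigned long)(tsc_hz / 1000);	/* do_div(tsc_hz, 1000) in the kernel */
}

int main(void)
{
	/* 2.8 GHz reported by the hypervisor */
	uint64_t hz = 2800000000ULL;

	printf("%lu kHz\n", gethz_to_khz((uint32_t)hz, (uint32_t)(hz >> 32)));
	return 0;
}
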
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index b07af8861244..6a52d4b36a30 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev) 185static char *cpuid_devnode(struct device *dev, mode_t *mode)
186{ 186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188} 188}
@@ -203,7 +203,7 @@ static int __init cpuid_init(void)
203 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
204 goto out_chrdev; 204 goto out_chrdev;
205 } 205 }
206 cpuid_class->nodename = cpuid_nodename; 206 cpuid_class->devnode = cpuid_devnode;
207 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
208 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
209 if (err != 0) 209 if (err != 0)
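
The nodename→devnode rename is an API change only; the callback still just formats the per-CPU device path. A userspace-style approximation, with kasprintf(GFP_KERNEL, ...) modeled by malloc+snprintf:

#include <stdio.h>
#include <stdlib.h>

/* Given a minor number, produce the path under /dev where the node appears. */
static char *cpuid_devnode_path(unsigned int minor)
{
	char *name = malloc(32);

	if (name)
		snprintf(name, 32, "cpu/%u/cpuid", minor);
	return name;
}

int main(void)
{
	char *node = cpuid_devnode_path(3);

	printf("/dev/%s\n", node ? node : "(alloc failed)");
	free(node);
	return 0;
}
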
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index b4f14c6c09d9..37250fe490b1 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -27,9 +27,7 @@ static void doublefault_fn(void)
27 27
28 if (ptr_ok(gdt)) { 28 if (ptr_ok(gdt)) {
29 gdt += GDT_ENTRY_TSS << 3; 29 gdt += GDT_ENTRY_TSS << 3;
30 tss = *(u16 *)(gdt+2); 30 tss = get_desc_base((struct desc_struct *)gdt);
31 tss += *(u8 *)(gdt+4) << 16;
32 tss += *(u8 *)(gdt+7) << 24;
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss); 31 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34 32
35 if (ptr_ok(tss)) { 33 if (ptr_ok(tss)) {
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 48bfe1386038..ef42a038f1a6 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -509,15 +509,15 @@ enum bts_field {
509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask) 509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
510}; 510};
511 511
512static inline unsigned long bts_get(const char *base, enum bts_field field) 512static inline unsigned long bts_get(const char *base, unsigned long field)
513{ 513{
514 base += (ds_cfg.sizeof_ptr_field * field); 514 base += (ds_cfg.sizeof_ptr_field * field);
515 return *(unsigned long *)base; 515 return *(unsigned long *)base;
516} 516}
517 517
518static inline void bts_set(char *base, enum bts_field field, unsigned long val) 518static inline void bts_set(char *base, unsigned long field, unsigned long val)
519{ 519{
520 base += (ds_cfg.sizeof_ptr_field * field);; 520 base += (ds_cfg.sizeof_ptr_field * field);
521 (*(unsigned long *)base) = val; 521 (*(unsigned long *)base) = val;
522} 522}
523 523
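
bts_get()/bts_set() treat a BTS record as an array of pointer-sized fields, so field i lives at base + ds_cfg.sizeof_ptr_field * i. A small model of that addressing (memcpy is used here instead of the kernel's direct dereference to stay alignment-safe in a standalone program):

#include <stdio.h>
#include <string.h>

static const unsigned int sizeof_ptr_field = sizeof(unsigned long);

static unsigned long bts_get(const char *base, unsigned long field)
{
	unsigned long val;

	memcpy(&val, base + sizeof_ptr_field * field, sizeof(val));
	return val;
}

static void bts_set(char *base, unsigned long field, unsigned long val)
{
	memcpy(base + sizeof_ptr_field * field, &val, sizeof(val));
}

int main(void)
{
	char record[3 * sizeof(unsigned long)] = { 0 };

	bts_set(record, 0, 0x1000);	/* e.g. branch "from" address */
	bts_set(record, 1, 0x2000);	/* e.g. branch "to" address */
	printf("from=%#lx to=%#lx\n", bts_get(record, 0), bts_get(record, 1));
	return 0;
}
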
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c8405718a4c3..2d8a371d4339 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -15,7 +15,6 @@
15#include <linux/bug.h> 15#include <linux/bug.h>
16#include <linux/nmi.h> 16#include <linux/nmi.h>
17#include <linux/sysfs.h> 17#include <linux/sysfs.h>
18#include <linux/ftrace.h>
19 18
20#include <asm/stacktrace.h> 19#include <asm/stacktrace.h>
21 20
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9e..f7dd2a7c3bf4 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a3276766..a071e6be177e 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 5cb5725b2bae..85419bb7d4ab 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
115{ 115{
116 int x = e820x->nr_map; 116 int x = e820x->nr_map;
117 117
118 if (x == ARRAY_SIZE(e820x->map)) { 118 if (x >= ARRAY_SIZE(e820x->map)) {
119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); 119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
120 return; 120 return;
121 } 121 }
@@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void)
1331 struct resource *res; 1331 struct resource *res;
1332 u64 end; 1332 u64 end;
1333 1333
1334 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1334 res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
1335 e820_res = res; 1335 e820_res = res;
1336 for (i = 0; i < e820.nr_map; i++) { 1336 for (i = 0; i < e820.nr_map; i++) {
1337 end = e820.map[i].addr + e820.map[i].size - 1; 1337 end = e820.map[i].addr + e820.map[i].size - 1;
@@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void)
1455 return who; 1455 return who;
1456} 1456}
1457 1457
1458char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1459{
1460 if (x86_quirks->arch_memory_setup) {
1461 char *who = x86_quirks->arch_memory_setup();
1462
1463 if (who)
1464 return who;
1465 }
1466 return default_machine_specific_memory_setup();
1467}
1468
1469/* Overridden in paravirt.c if CONFIG_PARAVIRT */
1470char * __init __attribute__((weak)) memory_setup(void)
1471{
1472 return machine_specific_memory_setup();
1473}
1474
1475void __init setup_memory_map(void) 1458void __init setup_memory_map(void)
1476{ 1459{
1477 char *who; 1460 char *who;
1478 1461
1479 who = memory_setup(); 1462 who = x86_init.resources.memory_setup();
1480 memcpy(&e820_saved, &e820, sizeof(struct e820map)); 1463 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1481 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1464 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1482 e820_print_map(who); 1465 e820_print_map(who);
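
The e820 change above replaces the weak machine_specific_memory_setup()/memory_setup() hooks with a call through x86_init.resources.memory_setup(). The sketch below illustrates the ops-table pattern; the structure layout and names are simplified stand-ins, not the kernel's exact x86_init definition.

#include <stdio.h>

struct x86_init_resources {
	char *(*memory_setup)(void);
};

struct x86_init_ops {
	struct x86_init_resources resources;
};

static char *default_machine_specific_memory_setup(void)
{
	return "BIOS-e820";
}

/* Platform code (paravirt, subarch setup) overwrites the pointer instead of
 * relying on weak-symbol overrides. */
static struct x86_init_ops x86_init = {
	.resources.memory_setup = default_machine_specific_memory_setup,
};

int main(void)
{
	printf("BIOS-provided physical RAM map: (%s)\n",
	       x86_init.resources.memory_setup());
	return 0;
}
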
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 335f049d110f..2acfd3fdc0cc 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -160,721 +160,6 @@ static struct console early_serial_console = {
160 .index = -1, 160 .index = -1,
161}; 161};
162 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void __init dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void __init dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int request, int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int __init ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happend */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int __init ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void __init default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565
566static void __init nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
878/* Direct interface for emergencies */ 163/* Direct interface for emergencies */
879static struct console *early_console = &early_vga_console; 164static struct console *early_console = &early_vga_console;
880static int __initdata early_console_initialized; 165static int __initdata early_console_initialized;
@@ -891,10 +176,19 @@ asmlinkage void early_printk(const char *fmt, ...)
891 va_end(ap); 176 va_end(ap);
892} 177}
893 178
179static inline void early_console_register(struct console *con, int keep_early)
180{
181 early_console = con;
182 if (keep_early)
183 early_console->flags &= ~CON_BOOT;
184 else
185 early_console->flags |= CON_BOOT;
186 register_console(early_console);
187}
894 188
895static int __init setup_early_printk(char *buf) 189static int __init setup_early_printk(char *buf)
896{ 190{
897 int keep_early; 191 int keep;
898 192
899 if (!buf) 193 if (!buf)
900 return 0; 194 return 0;
@@ -903,42 +197,34 @@ static int __init setup_early_printk(char *buf)
903 return 0; 197 return 0;
904 early_console_initialized = 1; 198 early_console_initialized = 1;
905 199
906 keep_early = (strstr(buf, "keep") != NULL); 200 keep = (strstr(buf, "keep") != NULL);
907 201
908 if (!strncmp(buf, "serial", 6)) { 202 while (*buf != '\0') {
909 early_serial_init(buf + 6); 203 if (!strncmp(buf, "serial", 6)) {
910 early_console = &early_serial_console; 204 early_serial_init(buf + 6);
911 } else if (!strncmp(buf, "ttyS", 4)) { 205 early_console_register(&early_serial_console, keep);
912 early_serial_init(buf); 206 }
913 early_console = &early_serial_console; 207 if (!strncmp(buf, "ttyS", 4)) {
914 } else if (!strncmp(buf, "vga", 3) 208 early_serial_init(buf + 4);
915 && boot_params.screen_info.orig_video_isVGA == 1) { 209 early_console_register(&early_serial_console, keep);
916 max_xpos = boot_params.screen_info.orig_video_cols; 210 }
917 max_ypos = boot_params.screen_info.orig_video_lines; 211 if (!strncmp(buf, "vga", 3) &&
918 current_ypos = boot_params.screen_info.orig_y; 212 boot_params.screen_info.orig_video_isVGA == 1) {
919 early_console = &early_vga_console; 213 max_xpos = boot_params.screen_info.orig_video_cols;
214 max_ypos = boot_params.screen_info.orig_video_lines;
215 current_ypos = boot_params.screen_info.orig_y;
216 early_console_register(&early_vga_console, keep);
217 }
920#ifdef CONFIG_EARLY_PRINTK_DBGP 218#ifdef CONFIG_EARLY_PRINTK_DBGP
921 } else if (!strncmp(buf, "dbgp", 4)) { 219 if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
922 if (early_dbgp_init(buf+4) < 0) 220 early_console_register(&early_dbgp_console, keep);
923 return 0;
924 early_console = &early_dbgp_console;
925 /*
926 * usb subsys will reset ehci controller, so don't keep
927 * that early console
928 */
929 keep_early = 0;
930#endif 221#endif
931#ifdef CONFIG_HVC_XEN 222#ifdef CONFIG_HVC_XEN
932 } else if (!strncmp(buf, "xen", 3)) { 223 if (!strncmp(buf, "xen", 3))
933 early_console = &xenboot_console; 224 early_console_register(&xenboot_console, keep);
934#endif 225#endif
226 buf++;
935 } 227 }
936
937 if (keep_early)
938 early_console->flags &= ~CON_BOOT;
939 else
940 early_console->flags |= CON_BOOT;
941 register_console(early_console);
942 return 0; 228 return 0;
943} 229}
944 230
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index fe26ba3e3451..ad5bd988fb79 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -42,6 +42,7 @@
42#include <asm/time.h> 42#include <asm/time.h>
43#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
45#include <asm/x86_init.h>
45 46
46#define EFI_DEBUG 1 47#define EFI_DEBUG 1
47#define PFX "EFI: " 48#define PFX "EFI: "
@@ -453,6 +454,9 @@ void __init efi_init(void)
453 if (add_efi_memmap) 454 if (add_efi_memmap)
454 do_add_efi_memmap(); 455 do_add_efi_memmap();
455 456
457 x86_platform.get_wallclock = efi_get_time;
458 x86_platform.set_wallclock = efi_set_rtc_mmss;
459
456 /* Setup for EFI runtime service */ 460 /* Setup for EFI runtime service */
457 reboot_type = BOOT_EFI; 461 reboot_type = BOOT_EFI;
458 462
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be745107..b5c061f8f358 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)
146END(ftrace_graph_caller) 146END(ftrace_graph_caller)
147 147
148GLOBAL(return_to_handler) 148GLOBAL(return_to_handler)
149 subq $80, %rsp 149 subq $24, %rsp
150 150
151 /* Save the return values */ 151 /* Save the return values */
152 movq %rax, (%rsp) 152 movq %rax, (%rsp)
@@ -155,10 +155,10 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 72(%rsp) 158 movq %rax, 16(%rsp)
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $72, %rsp 161 addq $16, %rsp
162 retq 162 retq
163#endif 163#endif
164 164
@@ -536,20 +536,13 @@ sysret_signal:
536 bt $TIF_SYSCALL_AUDIT,%edx 536 bt $TIF_SYSCALL_AUDIT,%edx
537 jc sysret_audit 537 jc sysret_audit
538#endif 538#endif
539 /* edx: work flags (arg3) */ 539 /*
540 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 540 * We have a signal, or exit tracing or single-step.
541 xorl %esi,%esi # oldset -> arg2 541 * These all wind up with the iret return path anyway,
542 SAVE_REST 542 * so just join that path right now.
543 FIXUP_TOP_OF_STACK %r11 543 */
544 call do_notify_resume 544 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
545 RESTORE_TOP_OF_STACK %r11 545 jmp int_check_syscall_exit_work
546 RESTORE_REST
547 movl $_TIF_WORK_MASK,%edi
548 /* Use IRET because user could have changed frame. This
549 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
550 DISABLE_INTERRUPTS(CLBR_NONE)
551 TRACE_IRQS_OFF
552 jmp int_with_check
553 546
554badsys: 547badsys:
555 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 548 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -654,6 +647,7 @@ int_careful:
654int_very_careful: 647int_very_careful:
655 TRACE_IRQS_ON 648 TRACE_IRQS_ON
656 ENABLE_INTERRUPTS(CLBR_NONE) 649 ENABLE_INTERRUPTS(CLBR_NONE)
650int_check_syscall_exit_work:
657 SAVE_REST 651 SAVE_REST
658 /* Check for syscall exit trace */ 652 /* Check for syscall exit trace */
659 testl $_TIF_WORK_SYSCALL_EXIT,%edx 653 testl $_TIF_WORK_SYSCALL_EXIT,%edx
@@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \
1021apicinterrupt SPURIOUS_APIC_VECTOR \ 1015apicinterrupt SPURIOUS_APIC_VECTOR \
1022 spurious_interrupt smp_spurious_interrupt 1016 spurious_interrupt smp_spurious_interrupt
1023 1017
1024#ifdef CONFIG_PERF_COUNTERS 1018#ifdef CONFIG_PERF_EVENTS
1025apicinterrupt LOCAL_PENDING_VECTOR \ 1019apicinterrupt LOCAL_PENDING_VECTOR \
1026 perf_pending_interrupt smp_perf_pending_interrupt 1020 perf_pending_interrupt smp_perf_pending_interrupt
1027#endif 1021#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9fe..9dbb527e1652 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
417 unsigned long return_hooker = (unsigned long) 417 unsigned long return_hooker = (unsigned long)
418 &return_to_handler; 418 &return_to_handler;
419 419
420 /* Nmi's are currently unsupported */
421 if (unlikely(in_nmi()))
422 return;
423
424 if (unlikely(atomic_read(&current->tracing_graph_pause))) 420 if (unlikely(atomic_read(&current->tracing_graph_pause)))
425 return; 421 return;
426 422
@@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
498 494
499struct syscall_metadata *syscall_nr_to_meta(int nr) 495struct syscall_metadata *syscall_nr_to_meta(int nr)
500{ 496{
501 if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) 497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
502 return NULL; 498 return NULL;
503 499
504 return syscalls_metadata[nr]; 500 return syscalls_metadata[nr];
505} 501}
506 502
507void arch_init_ftrace_syscalls(void) 503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
508{ 530{
509 int i; 531 int i;
510 struct syscall_metadata *meta; 532 struct syscall_metadata *meta;
511 unsigned long **psys_syscall_table = &sys_call_table; 533 unsigned long **psys_syscall_table = &sys_call_table;
512 static atomic_t refs;
513
514 if (atomic_inc_return(&refs) != 1)
515 goto end;
516 534
517 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
518 FTRACE_SYSCALL_MAX, GFP_KERNEL); 536 NR_syscalls, GFP_KERNEL);
519 if (!syscalls_metadata) { 537 if (!syscalls_metadata) {
520 WARN_ON(1); 538 WARN_ON(1);
521 return; 539 return -ENOMEM;
522 } 540 }
523 541
524 for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { 542 for (i = 0; i < NR_syscalls; i++) {
525 meta = find_syscall_meta(psys_syscall_table[i]); 543 meta = find_syscall_meta(psys_syscall_table[i]);
526 syscalls_metadata[i] = meta; 544 syscalls_metadata[i] = meta;
527 } 545 }
528 return; 546 return 0;
529
530 /* Paranoid: avoid overflow */
531end:
532 atomic_dec(&refs);
533} 547}
548arch_initcall(arch_init_ftrace_syscalls);
534#endif 549#endif
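
syscall_name_to_nr() introduced above is a plain linear scan over the syscalls_metadata table that arch_init_ftrace_syscalls() now populates at arch_initcall time. A miniature model with a hand-rolled table (not the real syscall_metadata layout):

#include <stdio.h>
#include <string.h>

struct syscall_metadata {
	const char *name;
};

static struct syscall_metadata meta_read  = { .name = "sys_read"  };
static struct syscall_metadata meta_write = { .name = "sys_write" };

static struct syscall_metadata *syscalls_metadata[] = {
	&meta_read, &meta_write, NULL	/* slot without metadata */
};

#define NR_syscalls (sizeof(syscalls_metadata) / sizeof(syscalls_metadata[0]))

static int syscall_name_to_nr(const char *name)
{
	unsigned int i;

	for (i = 0; i < NR_syscalls; i++) {
		if (syscalls_metadata[i] &&
		    !strcmp(syscalls_metadata[i]->name, name))
			return i;
	}
	return -1;
}

int main(void)
{
	printf("sys_write -> %d\n", syscall_name_to_nr("sys_write"));
	printf("sys_fork  -> %d\n", syscall_name_to_nr("sys_fork"));
	return 0;
}
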
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3f8579f8d42c..4f8e2507e8f3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -11,8 +11,21 @@
11#include <asm/setup.h> 11#include <asm/setup.h>
12#include <asm/sections.h> 12#include <asm/sections.h>
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/bios_ebda.h> 14#include <asm/page.h>
15#include <asm/trampoline.h> 15#include <asm/trampoline.h>
16#include <asm/apic.h>
17#include <asm/io_apic.h>
18#include <asm/bios_ebda.h>
19
20static void __init i386_default_early_setup(void)
21{
22 /* Initilize 32bit specific setup functions */
23 x86_init.resources.probe_roms = probe_roms;
24 x86_init.resources.reserve_resources = i386_reserve_resources;
25 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
26
27 reserve_ebda_region();
28}
16 29
17void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
18{ 31{
@@ -29,7 +42,16 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 42 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 43 }
31#endif 44#endif
32 reserve_ebda_region(); 45
46 /* Call the subarch specific early setup function */
47 switch (boot_params.hdr.hardware_subarch) {
48 case X86_SUBARCH_MRST:
49 x86_mrst_early_setup();
50 break;
51 default:
52 i386_default_early_setup();
53 break;
54 }
33 55
34 /* 56 /*
35 * At this point everything still needed from the boot loader 57 * At this point everything still needed from the boot loader
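
The head32.c change dispatches early setup on boot_params.hdr.hardware_subarch. A sketch of that dispatch; the enum values follow the subarch_entries order visible in the head_32.S hunk below (PC, lguest, Xen, Moorestown), but are written out here only as an assumption for illustration:

#include <stdio.h>

enum x86_subarch {
	X86_SUBARCH_PC = 0,	/* assumed ordering, per subarch_entries */
	X86_SUBARCH_LGUEST,
	X86_SUBARCH_XEN,
	X86_SUBARCH_MRST,
};

static void i386_default_early_setup(void) { printf("default PC early setup\n"); }
static void x86_mrst_early_setup(void)     { printf("Moorestown early setup\n"); }

static void dispatch_early_setup(unsigned int hardware_subarch)
{
	switch (hardware_subarch) {
	case X86_SUBARCH_MRST:
		x86_mrst_early_setup();
		break;
	default:
		i386_default_early_setup();
		break;
	}
}

int main(void)
{
	dispatch_early_setup(X86_SUBARCH_MRST);
	dispatch_early_setup(X86_SUBARCH_PC);
	return 0;
}
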
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 70eaa852c732..0b06cd778fd9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -23,8 +23,8 @@
23#include <asm/sections.h> 23#include <asm/sections.h>
24#include <asm/kdebug.h> 24#include <asm/kdebug.h>
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 26#include <asm/trampoline.h>
27#include <asm/bios_ebda.h>
28 28
29static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
30{ 30{
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index cc827ac9e8d3..218aad7ee76e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -157,6 +157,7 @@ subarch_entries:
157 .long default_entry /* normal x86/PC */ 157 .long default_entry /* normal x86/PC */
158 .long lguest_entry /* lguest hypervisor */ 158 .long lguest_entry /* lguest hypervisor */
159 .long xen_entry /* Xen hypervisor */ 159 .long xen_entry /* Xen hypervisor */
160 .long default_entry /* Moorestown MID */
160num_subarch_entries = (. - subarch_entries) / 4 161num_subarch_entries = (. - subarch_entries) / 4
161.previous 162.previous
162#endif /* CONFIG_PARAVIRT */ 163#endif /* CONFIG_PARAVIRT */
@@ -439,7 +440,6 @@ is386: movl $2,%ecx # set MP
439 jne 1f 440 jne 1f
440 movl $per_cpu__gdt_page,%eax 441 movl $per_cpu__gdt_page,%eax
441 movl $per_cpu__stack_canary,%ecx 442 movl $per_cpu__stack_canary,%ecx
442 subl $20, %ecx
443 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 443 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
444 shrl $16, %ecx 444 shrl $16, %ecx
445 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 445 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -608,7 +608,7 @@ ENTRY(initial_code)
608/* 608/*
609 * BSS section 609 * BSS section
610 */ 610 */
611.section ".bss.page_aligned","wa" 611__PAGE_ALIGNED_BSS
612 .align PAGE_SIZE_asm 612 .align PAGE_SIZE_asm
613#ifdef CONFIG_X86_PAE 613#ifdef CONFIG_X86_PAE
614swapper_pg_pmd: 614swapper_pg_pmd:
@@ -626,7 +626,7 @@ ENTRY(empty_zero_page)
626 * This starts the data section. 626 * This starts the data section.
627 */ 627 */
628#ifdef CONFIG_X86_PAE 628#ifdef CONFIG_X86_PAE
629.section ".data.page_aligned","wa" 629__PAGE_ALIGNED_DATA
630 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
631 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
632ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fa54f78e2a05..d0bc0a13a437 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -418,7 +418,7 @@ ENTRY(phys_base)
418ENTRY(idt_table) 418ENTRY(idt_table)
419 .skip IDT_ENTRIES * 16 419 .skip IDT_ENTRIES * 16
420 420
421 .section .bss.page_aligned, "aw", @nobits 421 __PAGE_ALIGNED_BSS
422 .align PAGE_SIZE 422 .align PAGE_SIZE
423ENTRY(empty_zero_page) 423ENTRY(empty_zero_page)
424 .skip PAGE_SIZE 424 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 5cf36c053ac4..23c167925a5c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -19,12 +19,6 @@
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22#ifdef CONFIG_X86_32
23static void pit_disable_clocksource(void);
24#else
25static inline void pit_disable_clocksource(void) { }
26#endif
27
28/* 22/*
29 * HPET replaces the PIT, when enabled. So we need to know, which of 23 * HPET replaces the PIT, when enabled. So we need to know, which of
30 * the two timers is used 24 * the two timers is used
@@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode,
57 outb_pit(0, PIT_CH0); 51 outb_pit(0, PIT_CH0);
58 outb_pit(0, PIT_CH0); 52 outb_pit(0, PIT_CH0);
59 } 53 }
60 pit_disable_clocksource();
61 break; 54 break;
62 55
63 case CLOCK_EVT_MODE_ONESHOT: 56 case CLOCK_EVT_MODE_ONESHOT:
64 /* One shot setup */ 57 /* One shot setup */
65 pit_disable_clocksource();
66 outb_pit(0x38, PIT_MODE); 58 outb_pit(0x38, PIT_MODE);
67 break; 59 break;
68 60
@@ -200,17 +192,6 @@ static struct clocksource pit_cs = {
200 .shift = 20, 192 .shift = 20,
201}; 193};
202 194
203static void pit_disable_clocksource(void)
204{
205 /*
206 * Use mult to check whether it is registered or not
207 */
208 if (pit_cs.mult) {
209 clocksource_unregister(&pit_cs);
210 pit_cs.mult = 0;
211 }
212}
213
214static int __init init_pit_clocksource(void) 195static int __init init_pit_clocksource(void)
215{ 196{
216 /* 197 /*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 270ff83efc11..3a54dcb9cd0e 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
20 * way process stacks are handled. This is done by having a special 20 * way process stacks are handled. This is done by having a special
21 * "init_task" linker map entry.. 21 * "init_task" linker map entry..
22 */ 22 */
23union thread_union init_thread_union 23union thread_union init_thread_union __init_task_data =
24 __attribute__((__section__(".data.init_task"))) = 24 { INIT_THREAD_INFO(init_task) };
25 { INIT_THREAD_INFO(init_task) };
26 25
27/* 26/*
28 * Initial task structure. 27 * Initial task structure.
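The open-coded __attribute__((__section__(...))) on init_thread_union is folded into the new __init_task_data marker. A minimal sketch of that helper, assuming the definition <linux/init_task.h> carries at this point (same section as before, now spelled once in a shared header):

/* Assumed definition of the marker used above. */
#define __init_task_data \
	__attribute__((__section__(".data.init_task")))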
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b0cdde6932f5..74656d1d4e30 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
104 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
105# endif 105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j) 109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); 110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
@@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
200 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
201# endif 201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu); 205 sum += per_cpu(mce_poll_count, cpu);
206#endif 206#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 3b09634a5153..7d35d0fe2329 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
218void fixup_irqs(void) 218void fixup_irqs(void)
219{ 219{
220 unsigned int irq; 220 unsigned int irq;
221 static int warned;
222 struct irq_desc *desc; 221 struct irq_desc *desc;
223 222
224 for_each_irq_desc(irq, desc) { 223 for_each_irq_desc(irq, desc) {
@@ -236,8 +235,8 @@ void fixup_irqs(void)
236 } 235 }
237 if (desc->chip->set_affinity) 236 if (desc->chip->set_affinity)
238 desc->chip->set_affinity(irq, affinity); 237 desc->chip->set_affinity(irq, affinity);
239 else if (desc->action && !(warned++)) 238 else if (desc->action)
240 printk("Cannot set affinity for irq %i\n", irq); 239 printk_once("Cannot set affinity for irq %i\n", irq);
241 } 240 }
242 241
243#if 0 242#if 0
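fixup_irqs() drops its private `warned` counter in favour of printk_once(), which gives the same print-once behaviour without a static flag in every caller. Roughly, and only as a sketch of the generic helper in <linux/kernel.h> (the internal flag name is an assumption):

#define printk_once(x...) ({			\
	static bool __print_once;		\
						\
	if (!__print_once) {			\
		__print_once = true;		\
		printk(x);			\
	}					\
})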
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 92b7703d3d58..40f30773fb29 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
116 return 0; 116 return 0;
117} 117}
118 118
119static void __init init_ISA_irqs(void) 119void __init init_ISA_irqs(void)
120{ 120{
121 int i; 121 int i;
122 122
@@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void)
140 } 140 }
141} 141}
142 142
143/* Overridden in paravirt.c */ 143void __init init_IRQ(void)
144void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 144{
145 x86_init.irqs.intr_init();
146}
145 147
146static void __init smp_intr_init(void) 148static void __init smp_intr_init(void)
147{ 149{
@@ -190,7 +192,7 @@ static void __init apic_intr_init(void)
190#ifdef CONFIG_X86_MCE_THRESHOLD 192#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 193 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 194#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 195#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); 196 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
195#endif 197#endif
196 198
@@ -206,39 +208,19 @@ static void __init apic_intr_init(void)
206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 208 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207 209
208 /* Performance monitoring interrupts: */ 210 /* Performance monitoring interrupts: */
209# ifdef CONFIG_PERF_COUNTERS 211# ifdef CONFIG_PERF_EVENTS
210 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 212 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
211# endif 213# endif
212 214
213#endif 215#endif
214} 216}
215 217
216/**
217 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
218 *
219 * Description:
220 * Perform any necessary interrupt initialisation prior to setting up
221 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
222 * interrupts should be initialised here if the machine emulates a PC
223 * in any way.
224 **/
225static void __init x86_quirk_pre_intr_init(void)
226{
227#ifdef CONFIG_X86_32
228 if (x86_quirks->arch_pre_intr_init) {
229 if (x86_quirks->arch_pre_intr_init())
230 return;
231 }
232#endif
233 init_ISA_irqs();
234}
235
236void __init native_init_IRQ(void) 218void __init native_init_IRQ(void)
237{ 219{
238 int i; 220 int i;
239 221
240 /* Execute any quirks before the call gates are initialised: */ 222 /* Execute any quirks before the call gates are initialised: */
241 x86_quirk_pre_intr_init(); 223 x86_init.irqs.pre_vector_init();
242 224
243 apic_intr_init(); 225 apic_intr_init();
244 226
@@ -258,12 +240,6 @@ void __init native_init_IRQ(void)
258 240
259#ifdef CONFIG_X86_32 241#ifdef CONFIG_X86_32
260 /* 242 /*
261 * Call quirks after call gates are initialised (usually add in
262 * the architecture specific gates):
263 */
264 x86_quirk_intr_init();
265
266 /*
267 * External FPU? Set up irq13 if so, for 243 * External FPU? Set up irq13 if so, for
268 * original braindamaged IBM FERR coupling. 244 * original braindamaged IBM FERR coupling.
269 */ 245 */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c664d515f613..63b0ec8d3d4a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,7 +34,6 @@
34struct kvm_para_state { 34struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 35 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 36 int mmu_queue_len;
37 enum paravirt_lazy_mode mode;
38}; 37};
39 38
40static DEFINE_PER_CPU(struct kvm_para_state, para_state); 39static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len)
77{ 76{
78 struct kvm_para_state *state = kvm_para_state(); 77 struct kvm_para_state *state = kvm_para_state();
79 78
80 if (state->mode != PARAVIRT_LAZY_MMU) { 79 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
81 kvm_mmu_op(buffer, len); 80 kvm_mmu_op(buffer, len);
82 return; 81 return;
83 } 82 }
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn)
185 184
186static void kvm_enter_lazy_mmu(void) 185static void kvm_enter_lazy_mmu(void)
187{ 186{
188 struct kvm_para_state *state = kvm_para_state();
189
190 paravirt_enter_lazy_mmu(); 187 paravirt_enter_lazy_mmu();
191 state->mode = paravirt_get_lazy_mode();
192} 188}
193 189
194static void kvm_leave_lazy_mmu(void) 190static void kvm_leave_lazy_mmu(void)
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void)
197 193
198 mmu_queue_flush(state); 194 mmu_queue_flush(state);
199 paravirt_leave_lazy_mmu(); 195 paravirt_leave_lazy_mmu();
200 state->mode = paravirt_get_lazy_mode();
201} 196}
202 197
203static void __init paravirt_ops_setup(void) 198static void __init paravirt_ops_setup(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 223af43f1526..feaeb0d3aa4f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,8 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25
26#include <asm/x86_init.h>
25#include <asm/reboot.h> 27#include <asm/reboot.h>
26 28
27#define KVM_SCALE 22 29#define KVM_SCALE 22
@@ -50,8 +52,8 @@ static unsigned long kvm_get_wallclock(void)
50 struct timespec ts; 52 struct timespec ts;
51 int low, high; 53 int low, high;
52 54
53 low = (int)__pa(&wall_clock); 55 low = (int)__pa_symbol(&wall_clock);
54 high = ((u64)__pa(&wall_clock) >> 32); 56 high = ((u64)__pa_symbol(&wall_clock) >> 32);
55 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 57 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
56 58
57 vcpu_time = &get_cpu_var(hv_clock); 59 vcpu_time = &get_cpu_var(hv_clock);
@@ -182,12 +184,13 @@ void __init kvmclock_init(void)
182 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 184 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
183 if (kvm_register_clock("boot clock")) 185 if (kvm_register_clock("boot clock"))
184 return; 186 return;
185 pv_time_ops.get_wallclock = kvm_get_wallclock;
186 pv_time_ops.set_wallclock = kvm_set_wallclock;
187 pv_time_ops.sched_clock = kvm_clock_read; 187 pv_time_ops.sched_clock = kvm_clock_read;
188 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; 188 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
189 x86_platform.get_wallclock = kvm_get_wallclock;
190 x86_platform.set_wallclock = kvm_set_wallclock;
189#ifdef CONFIG_X86_LOCAL_APIC 191#ifdef CONFIG_X86_LOCAL_APIC
190 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 192 x86_cpuinit.setup_percpu_clockev =
193 kvm_setup_secondary_clock;
191#endif 194#endif
192#ifdef CONFIG_SMP 195#ifdef CONFIG_SMP
193 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 196 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
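kvmclock now attaches its clock callbacks to the new x86_platform and x86_cpuinit tables rather than pv_time_ops/pv_apic_ops, so a KVM guest no longer needs the full paravirt plumbing just to override time setup. A sketch of the platform slots being assigned above, assuming the layout in <asm/x86_init.h> (member types inferred from the callers, not quoted):

struct x86_platform_ops {
	unsigned long (*calibrate_tsc)(void);		/* TSC frequency in kHz */
	unsigned long (*get_wallclock)(void);		/* read the RTC         */
	int (*set_wallclock)(unsigned long nowtime);	/* write the RTC        */
	/* ... */
};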
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 71f1d99a635d..ec6ef60cbd17 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
67#ifdef CONFIG_SMP 67#ifdef CONFIG_SMP
68 preempt_disable(); 68 preempt_disable();
69 load_LDT(pc); 69 load_LDT(pc);
70 if (!cpus_equal(current->mm->cpu_vm_mask, 70 if (!cpumask_equal(mm_cpumask(current->mm),
71 cpumask_of_cpu(smp_processor_id()))) 71 cpumask_of(smp_processor_id())))
72 smp_call_function(flush_ldt, current->mm, 1); 72 smp_call_function(flush_ldt, current->mm, 1);
73 preempt_enable(); 73 preempt_enable();
74#else 74#else
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9371448290ac..378e9a8f1bf8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
210{ 210{
211 ssize_t ret = -EINVAL; 211 ssize_t ret = -EINVAL;
212 212
213 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages); 214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
215 return ret; 215 return ret;
216 } 216 }
217 217
@@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = {
236static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
237 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
238 .name = "microcode", 238 .name = "microcode",
239 .devnode = "cpu/microcode", 239 .nodename = "cpu/microcode",
240 .fops = &microcode_fops, 240 .fops = &microcode_fops,
241}; 241};
242 242
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 651c93b28862..5be95ef4ffec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len)
45 return sum & 0xFF; 45 return sum & 0xFF;
46} 46}
47 47
48int __init default_mpc_apic_id(struct mpc_cpu *m)
49{
50 return m->apicid;
51}
52
48static void __init MP_processor_info(struct mpc_cpu *m) 53static void __init MP_processor_info(struct mpc_cpu *m)
49{ 54{
50 int apicid; 55 int apicid;
@@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
55 return; 60 return;
56 } 61 }
57 62
58 if (x86_quirks->mpc_apic_id) 63 apicid = x86_init.mpparse.mpc_apic_id(m);
59 apicid = x86_quirks->mpc_apic_id(m);
60 else
61 apicid = m->apicid;
62 64
63 if (m->cpuflag & CPU_BOOTPROCESSOR) { 65 if (m->cpuflag & CPU_BOOTPROCESSOR) {
64 bootup_cpu = " (Bootup-CPU)"; 66 bootup_cpu = " (Bootup-CPU)";
@@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m)
70} 72}
71 73
72#ifdef CONFIG_X86_IO_APIC 74#ifdef CONFIG_X86_IO_APIC
73static void __init MP_bus_info(struct mpc_bus *m) 75void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
74{ 76{
75 char str[7];
76 memcpy(str, m->bustype, 6); 77 memcpy(str, m->bustype, 6);
77 str[6] = 0; 78 str[6] = 0;
79 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
80}
78 81
79 if (x86_quirks->mpc_oem_bus_info) 82static void __init MP_bus_info(struct mpc_bus *m)
80 x86_quirks->mpc_oem_bus_info(m, str); 83{
81 else 84 char str[7];
82 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); 85
86 x86_init.mpparse.mpc_oem_bus_info(m, str);
83 87
84#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
85 if (m->busid >= MAX_MP_BUSSES) { 89 if (m->busid >= MAX_MP_BUSSES) {
@@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
96 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
97#endif 101#endif
98 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
99 if (x86_quirks->mpc_oem_pci_bus) 103 if (x86_init.mpparse.mpc_oem_pci_bus)
100 x86_quirks->mpc_oem_pci_bus(m); 104 x86_init.mpparse.mpc_oem_pci_bus(m);
101 105
102 clear_bit(m->busid, mp_bus_not_pci); 106 clear_bit(m->busid, mp_bus_not_pci);
103#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
@@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
291 1, mpc, mpc->length, 1); 295 1, mpc, mpc->length, 1);
292} 296}
293 297
298void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
299
294static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 300static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
295{ 301{
296 char str[16]; 302 char str[16];
@@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
312 if (early) 318 if (early)
313 return 1; 319 return 1;
314 320
315 if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { 321 if (mpc->oemptr)
316 struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; 322 x86_init.mpparse.smp_read_mpc_oem(mpc);
317 x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
318 }
319 323
320 /* 324 /*
321 * Now process the configuration blocks. 325 * Now process the configuration blocks.
322 */ 326 */
323 if (x86_quirks->mpc_record) 327 x86_init.mpparse.mpc_record(0);
324 *x86_quirks->mpc_record = 0;
325 328
326 while (count < mpc->length) { 329 while (count < mpc->length) {
327 switch (*mpt) { 330 switch (*mpt) {
@@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
353 count = mpc->length; 356 count = mpc->length;
354 break; 357 break;
355 } 358 }
356 if (x86_quirks->mpc_record) 359 x86_init.mpparse.mpc_record(1);
357 (*x86_quirks->mpc_record)++;
358 } 360 }
359 361
360#ifdef CONFIG_X86_BIGSMP 362#ifdef CONFIG_X86_BIGSMP
@@ -482,11 +484,11 @@ static void __init construct_ioapic_table(int mpc_default_type)
482 MP_bus_info(&bus); 484 MP_bus_info(&bus);
483 } 485 }
484 486
485 ioapic.type = MP_IOAPIC; 487 ioapic.type = MP_IOAPIC;
486 ioapic.apicid = 2; 488 ioapic.apicid = 2;
487 ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; 489 ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
488 ioapic.flags = MPC_APIC_USABLE; 490 ioapic.flags = MPC_APIC_USABLE;
489 ioapic.apicaddr = 0xFEC00000; 491 ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
490 MP_ioapic_info(&ioapic); 492 MP_ioapic_info(&ioapic);
491 493
492 /* 494 /*
@@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
608/* 610/*
609 * Scan the memory blocks for an SMP configuration block. 611 * Scan the memory blocks for an SMP configuration block.
610 */ 612 */
611static void __init __get_smp_config(unsigned int early) 613void __init default_get_smp_config(unsigned int early)
612{ 614{
613 struct mpf_intel *mpf = mpf_found; 615 struct mpf_intel *mpf = mpf_found;
614 616
@@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early)
625 if (acpi_lapic && acpi_ioapic) 627 if (acpi_lapic && acpi_ioapic)
626 return; 628 return;
627 629
628 if (x86_quirks->mach_get_smp_config) {
629 if (x86_quirks->mach_get_smp_config(early))
630 return;
631 }
632
633 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 630 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
634 mpf->specification); 631 mpf->specification);
635#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 632#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early)
670 */ 667 */
671} 668}
672 669
673void __init early_get_smp_config(void)
674{
675 __get_smp_config(1);
676}
677
678void __init get_smp_config(void)
679{
680 __get_smp_config(0);
681}
682
683static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
684{ 671{
685 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
@@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
745 return 0; 732 return 0;
746} 733}
747 734
748static void __init __find_smp_config(unsigned int reserve) 735void __init default_find_smp_config(unsigned int reserve)
749{ 736{
750 unsigned int address; 737 unsigned int address;
751 738
752 if (x86_quirks->mach_find_smp_config) {
753 if (x86_quirks->mach_find_smp_config(reserve))
754 return;
755 }
756 /* 739 /*
757 * FIXME: Linux assumes you have 640K of base ram.. 740 * FIXME: Linux assumes you have 640K of base ram..
758 * this continues the error... 741 * this continues the error...
@@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve)
787 smp_scan_config(address, 0x400, reserve); 770 smp_scan_config(address, 0x400, reserve);
788} 771}
789 772
790void __init early_find_smp_config(void)
791{
792 __find_smp_config(0);
793}
794
795void __init find_smp_config(void)
796{
797 __find_smp_config(1);
798}
799
800#ifdef CONFIG_X86_IO_APIC 773#ifdef CONFIG_X86_IO_APIC
801static u8 __initdata irq_used[MAX_IRQ_SOURCES]; 774static u8 __initdata irq_used[MAX_IRQ_SOURCES];
802 775
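All of the scattered `if (x86_quirks->...)` checks in the MP-table parser become unconditional calls through x86_init.mpparse, with defaults that reproduce the old inline behaviour. A hedged sketch of the assumed default wiring (a fragment of the larger x86_init initializer; any name not visible in the hunks above is illustrative):

	/* Fragment of the assumed x86_init defaults for MP-table parsing. */
	.mpparse = {
		.mpc_record		= x86_init_uint_noop,	/* old "(*mpc_record)++" hook */
		.mpc_apic_id		= default_mpc_apic_id,
		.smp_read_mpc_oem	= default_smp_read_mpc_oem,
		.mpc_oem_bus_info	= default_mpc_oem_bus_info,
		.find_smp_config	= default_find_smp_config,
		.get_smp_config		= default_get_smp_config,
	},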
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
new file mode 100644
index 000000000000..3b7078abc871
--- /dev/null
+++ b/arch/x86/kernel/mrst.c
@@ -0,0 +1,24 @@
1/*
2 * mrst.c: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/init.h>
13
14#include <asm/setup.h>
15
16/*
17 * Moorestown specific x86_init function overrides and early setup
18 * calls.
19 */
20void __init x86_mrst_early_setup(void)
21{
22 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop;
24}
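The new Moorestown file shows how thin a platform override can now be: it only replaces two resource callbacks with no-ops and inherits every other default. The stub it points at is presumably the trivial helper exported next to the x86_init tables:

/* Assumed definition of the shared stub used above (section annotations omitted). */
void x86_init_noop(void) { }

Any field a platform leaves untouched keeps its native default, so a board file only has to name the hooks it actually changes.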
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 98fd6cd4e3a4..6a3cefc7dda1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -1,6 +1,7 @@
1/* ----------------------------------------------------------------------- * 1/* ----------------------------------------------------------------------- *
2 * 2 *
3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved 3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
4 * Copyright 2009 Intel Corporation; author: H. Peter Anvin
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -80,11 +81,8 @@ static ssize_t msr_read(struct file *file, char __user *buf,
80 81
81 for (; count; count -= 8) { 82 for (; count; count -= 8) {
82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 83 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
83 if (err) { 84 if (err)
84 if (err == -EFAULT) /* Fix idiotic error code */
85 err = -EIO;
86 break; 85 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) { 86 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT; 87 err = -EFAULT;
90 break; 88 break;
@@ -115,11 +113,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
115 break; 113 break;
116 } 114 }
117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 115 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
118 if (err) { 116 if (err)
119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break; 117 break;
122 }
123 tmp += 2; 118 tmp += 2;
124 bytes += 8; 119 bytes += 8;
125 } 120 }
@@ -127,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
127 return bytes ? bytes : err; 122 return bytes ? bytes : err;
128} 123}
129 124
125static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
126{
127 u32 __user *uregs = (u32 __user *)arg;
128 u32 regs[8];
129 int cpu = iminor(file->f_path.dentry->d_inode);
130 int err;
131
132 switch (ioc) {
133 case X86_IOC_RDMSR_REGS:
134 if (!(file->f_mode & FMODE_READ)) {
135 err = -EBADF;
136 break;
137 }
138 if (copy_from_user(&regs, uregs, sizeof regs)) {
139 err = -EFAULT;
140 break;
141 }
142 err = rdmsr_safe_regs_on_cpu(cpu, regs);
143 if (err)
144 break;
145 if (copy_to_user(uregs, &regs, sizeof regs))
146 err = -EFAULT;
147 break;
148
149 case X86_IOC_WRMSR_REGS:
150 if (!(file->f_mode & FMODE_WRITE)) {
151 err = -EBADF;
152 break;
153 }
154 if (copy_from_user(&regs, uregs, sizeof regs)) {
155 err = -EFAULT;
156 break;
157 }
158 err = wrmsr_safe_regs_on_cpu(cpu, regs);
159 if (err)
160 break;
161 if (copy_to_user(uregs, &regs, sizeof regs))
162 err = -EFAULT;
163 break;
164
165 default:
166 err = -ENOTTY;
167 break;
168 }
169
170 return err;
171}
172
130static int msr_open(struct inode *inode, struct file *file) 173static int msr_open(struct inode *inode, struct file *file)
131{ 174{
132 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 175 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
@@ -157,6 +200,8 @@ static const struct file_operations msr_fops = {
157 .read = msr_read, 200 .read = msr_read,
158 .write = msr_write, 201 .write = msr_write,
159 .open = msr_open, 202 .open = msr_open,
203 .unlocked_ioctl = msr_ioctl,
204 .compat_ioctl = msr_ioctl,
160}; 205};
161 206
162static int __cpuinit msr_device_create(int cpu) 207static int __cpuinit msr_device_create(int cpu)
@@ -196,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
196 .notifier_call = msr_class_cpu_callback, 241 .notifier_call = msr_class_cpu_callback,
197}; 242};
198 243
199static char *msr_nodename(struct device *dev) 244static char *msr_devnode(struct device *dev, mode_t *mode)
200{ 245{
201 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); 246 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
202} 247}
@@ -217,7 +262,7 @@ static int __init msr_init(void)
217 err = PTR_ERR(msr_class); 262 err = PTR_ERR(msr_class);
218 goto out_chrdev; 263 goto out_chrdev;
219 } 264 }
220 msr_class->nodename = msr_nodename; 265 msr_class->devnode = msr_devnode;
221 for_each_online_cpu(i) { 266 for_each_online_cpu(i) {
222 err = msr_device_create(i); 267 err = msr_device_create(i);
223 if (err != 0) 268 if (err != 0)
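The new ioctl path lets user space run a full-register rdmsr/wrmsr on a specific CPU via /dev/cpu/N/msr. A minimal usage sketch, assuming X86_IOC_RDMSR_REGS is visible from <asm/msr.h> in the installed headers and that the regs[8] array follows the rdmsr_safe_regs layout (index 1 = %ecx = MSR number, indices 0/2 = %eax/%edx = result); both assumptions should be checked against the headers:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/msr.h>		/* X86_IOC_RDMSR_REGS, if exported */

int main(void)
{
	uint32_t regs[8] = { 0 };
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 1;
	regs[1] = 0x10;		/* %ecx: MSR index, 0x10 = TSC as an example */
	if (ioctl(fd, X86_IOC_RDMSR_REGS, regs) == 0)
		printf("MSR 0x10 = %#llx\n",
		       ((unsigned long long)regs[2] << 32) | regs[0]);
	close(fd);
	return 0;
}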
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 70ec9b951d76..1b1739d16310 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x)
54 return x; 54 return x;
55} 55}
56 56
57static void __init default_banner(void) 57void __init default_banner(void)
58{ 58{
59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
60 pv_info.name); 60 pv_info.name);
61} 61}
62 62
63char *memory_setup(void)
64{
65 return pv_init_ops.memory_setup();
66}
67
68/* Simple instruction patching code. */ 63/* Simple instruction patching code. */
69#define DEF_NATIVE(ops, name, code) \ 64#define DEF_NATIVE(ops, name, code) \
70 extern const char start_##ops##_##name[], end_##ops##_##name[]; \ 65 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
@@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
188 return insn_len; 183 return insn_len;
189} 184}
190 185
191void init_IRQ(void)
192{
193 pv_irq_ops.init_IRQ();
194}
195
196static void native_flush_tlb(void) 186static void native_flush_tlb(void)
197{ 187{
198 __native_flush_tlb(); 188 __native_flush_tlb();
@@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void);
218extern void native_usergs_sysret32(void); 208extern void native_usergs_sysret32(void);
219extern void native_usergs_sysret64(void); 209extern void native_usergs_sysret64(void);
220 210
221static int __init print_banner(void)
222{
223 pv_init_ops.banner();
224 return 0;
225}
226core_initcall(print_banner);
227
228static struct resource reserve_ioports = { 211static struct resource reserve_ioports = {
229 .start = 0, 212 .start = 0,
230 .end = IO_SPACE_LIMIT, 213 .end = IO_SPACE_LIMIT,
@@ -320,21 +303,13 @@ struct pv_info pv_info = {
320 303
321struct pv_init_ops pv_init_ops = { 304struct pv_init_ops pv_init_ops = {
322 .patch = native_patch, 305 .patch = native_patch,
323 .banner = default_banner,
324 .arch_setup = paravirt_nop,
325 .memory_setup = machine_specific_memory_setup,
326}; 306};
327 307
328struct pv_time_ops pv_time_ops = { 308struct pv_time_ops pv_time_ops = {
329 .time_init = hpet_time_init,
330 .get_wallclock = native_get_wallclock,
331 .set_wallclock = native_set_wallclock,
332 .sched_clock = native_sched_clock, 309 .sched_clock = native_sched_clock,
333 .get_tsc_khz = native_calibrate_tsc,
334}; 310};
335 311
336struct pv_irq_ops pv_irq_ops = { 312struct pv_irq_ops pv_irq_ops = {
337 .init_IRQ = native_init_IRQ,
338 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), 313 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
339 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), 314 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
340 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), 315 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
@@ -362,8 +337,9 @@ struct pv_cpu_ops pv_cpu_ops = {
362#endif 337#endif
363 .wbinvd = native_wbinvd, 338 .wbinvd = native_wbinvd,
364 .read_msr = native_read_msr_safe, 339 .read_msr = native_read_msr_safe,
365 .read_msr_amd = native_read_msr_amd_safe, 340 .rdmsr_regs = native_rdmsr_safe_regs,
366 .write_msr = native_write_msr_safe, 341 .write_msr = native_write_msr_safe,
342 .wrmsr_regs = native_wrmsr_safe_regs,
367 .read_tsc = native_read_tsc, 343 .read_tsc = native_read_tsc,
368 .read_pmc = native_read_pmc, 344 .read_pmc = native_read_pmc,
369 .read_tscp = native_read_tscp, 345 .read_tscp = native_read_tscp,
@@ -408,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = {
408 384
409struct pv_apic_ops pv_apic_ops = { 385struct pv_apic_ops pv_apic_ops = {
410#ifdef CONFIG_X86_LOCAL_APIC 386#ifdef CONFIG_X86_LOCAL_APIC
411 .setup_boot_clock = setup_boot_APIC_clock,
412 .setup_secondary_clock = setup_secondary_APIC_clock,
413 .startup_ipi_hook = paravirt_nop, 387 .startup_ipi_hook = paravirt_nop,
414#endif 388#endif
415}; 389};
@@ -423,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = {
423#endif 397#endif
424 398
425struct pv_mmu_ops pv_mmu_ops = { 399struct pv_mmu_ops pv_mmu_ops = {
426#ifndef CONFIG_X86_64
427 .pagetable_setup_start = native_pagetable_setup_start,
428 .pagetable_setup_done = native_pagetable_setup_done,
429#else
430 .pagetable_setup_start = paravirt_nop,
431 .pagetable_setup_done = paravirt_nop,
432#endif
433 400
434 .read_cr2 = native_read_cr2, 401 .read_cr2 = native_read_cr2,
435 .write_cr2 = native_write_cr2, 402 .write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1a041bcf506b..64b838eac18c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -3,6 +3,7 @@
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/kmemleak.h>
6 7
7#include <asm/proto.h> 8#include <asm/proto.h>
8#include <asm/dma.h> 9#include <asm/dma.h>
@@ -32,7 +33,14 @@ int no_iommu __read_mostly;
32/* Set this to 1 if there is a HW IOMMU in the system */ 33/* Set this to 1 if there is a HW IOMMU in the system */
33int iommu_detected __read_mostly = 0; 34int iommu_detected __read_mostly = 0;
34 35
35int iommu_pass_through; 36/*
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
 38 * If this variable is 1, IOMMU implementations do no DMA translation for
 39 * devices and allow every device to access the whole physical memory. This is
 40 * useful if a user wants to use an IOMMU only for KVM device assignment to
41 * guests and not for driver dma translation.
42 */
43int iommu_pass_through __read_mostly;
36 44
37dma_addr_t bad_dma_address __read_mostly = 0; 45dma_addr_t bad_dma_address __read_mostly = 0;
38EXPORT_SYMBOL(bad_dma_address); 46EXPORT_SYMBOL(bad_dma_address);
@@ -88,6 +96,11 @@ void __init dma32_reserve_bootmem(void)
88 size = roundup(dma32_bootmem_size, align); 96 size = roundup(dma32_bootmem_size, align);
89 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 97 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
90 512ULL<<20); 98 512ULL<<20);
99 /*
100 * Kmemleak should not scan this block as it may not be mapped via the
101 * kernel direct mapping.
102 */
103 kmemleak_ignore(dma32_bootmem_ptr);
91 if (dma32_bootmem_ptr) 104 if (dma32_bootmem_ptr)
92 dma32_bootmem_size = size; 105 dma32_bootmem_size = size;
93 else 106 else
@@ -147,7 +160,7 @@ again:
147 return NULL; 160 return NULL;
148 161
149 addr = page_to_phys(page); 162 addr = page_to_phys(page);
150 if (!is_buffer_dma_capable(dma_mask, addr, size)) { 163 if (addr + size > dma_mask) {
151 __free_pages(page, get_order(size)); 164 __free_pages(page, get_order(size));
152 165
153 if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { 166 if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
@@ -212,10 +225,8 @@ static __init int iommu_setup(char *p)
212 if (!strncmp(p, "soft", 4)) 225 if (!strncmp(p, "soft", 4))
213 swiotlb = 1; 226 swiotlb = 1;
214#endif 227#endif
215 if (!strncmp(p, "pt", 2)) { 228 if (!strncmp(p, "pt", 2))
216 iommu_pass_through = 1; 229 iommu_pass_through = 1;
217 return 1;
218 }
219 230
220 gart_parse_options(p); 231 gart_parse_options(p);
221 232
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d2e56b8f48e7..98a827ee9ed7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
190static inline int 190static inline int
191need_iommu(struct device *dev, unsigned long addr, size_t size) 191need_iommu(struct device *dev, unsigned long addr, size_t size)
192{ 192{
193 return force_iommu || 193 return force_iommu || !dma_capable(dev, addr, size);
194 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
195} 194}
196 195
197static inline int 196static inline int
198nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 197nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
199{ 198{
200 return !is_buffer_dma_capable(*dev->dma_mask, addr, size); 199 return !dma_capable(dev, addr, size);
201} 200}
202 201
203/* Map a single continuous physical area into the IOMMU. 202/* Map a single continuous physical area into the IOMMU.
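pci-dma.c, pci-gart_64.c and pci-nommu.c all replace the open-coded is_buffer_dma_capable() mask test with the new dma_capable() helper, which takes the device instead of a raw mask. A sketch of the assumed helper, roughly what <asm/dma-mapping.h> provides at this point:

static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
{
	if (!dev->dma_mask)
		return false;
	/* the whole [addr, addr + size) range must fit under the mask */
	return addr + size <= *dev->dma_mask;
}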
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 71d412a09f30..a3933d4330cd 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { 17 if (hwdev && !dma_capable(hwdev, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) 18 if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
79 free_pages((unsigned long)vaddr, get_order(size)); 79 free_pages((unsigned long)vaddr, get_order(size));
80} 80}
81 81
82static void nommu_sync_single_for_device(struct device *dev,
83 dma_addr_t addr, size_t size,
84 enum dma_data_direction dir)
85{
86 flush_write_buffers();
87}
88
89
90static void nommu_sync_sg_for_device(struct device *dev,
91 struct scatterlist *sg, int nelems,
92 enum dma_data_direction dir)
93{
94 flush_write_buffers();
95}
96
82struct dma_map_ops nommu_dma_ops = { 97struct dma_map_ops nommu_dma_ops = {
83 .alloc_coherent = dma_generic_alloc_coherent, 98 .alloc_coherent = dma_generic_alloc_coherent,
84 .free_coherent = nommu_free_coherent, 99 .free_coherent = nommu_free_coherent,
85 .map_sg = nommu_map_sg, 100 .map_sg = nommu_map_sg,
86 .map_page = nommu_map_page, 101 .map_page = nommu_map_page,
87 .is_phys = 1, 102 .sync_single_for_device = nommu_sync_single_for_device,
103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1,
88}; 105};
89 106
90void __init no_iommu_init(void) 107void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6af96ee44200..aaa6b7839f1e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -13,31 +13,6 @@
13 13
14int swiotlb __read_mostly; 14int swiotlb __read_mostly;
15 15
16void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
37{
38 return 0;
39}
40
41static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 16static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
42 dma_addr_t *dma_handle, gfp_t flags) 17 dma_addr_t *dma_handle, gfp_t flags)
43{ 18{
@@ -71,9 +46,8 @@ void __init pci_swiotlb_init(void)
71{ 46{
72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
73#ifdef CONFIG_X86_64 48#ifdef CONFIG_X86_64
74 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || 49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
75 iommu_pass_through) 50 swiotlb = 1;
76 swiotlb = 1;
77#endif 51#endif
78 if (swiotlb_force) 52 if (swiotlb_force)
79 swiotlb = 1; 53 swiotlb = 1;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a4ba83..5284cd2b5776 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <trace/power.h> 12#include <trace/events/power.h>
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h> 15#include <asm/syscalls.h>
@@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait);
25 25
26struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
27 27
28DEFINE_TRACE(power_start);
29DEFINE_TRACE(power_end);
30
31int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 28int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
32{ 29{
33 *dst = *src; 30 *dst = *src;
@@ -299,9 +296,7 @@ static inline int hlt_use_halt(void)
299void default_idle(void) 296void default_idle(void)
300{ 297{
301 if (hlt_use_halt()) { 298 if (hlt_use_halt()) {
302 struct power_trace it; 299 trace_power_start(POWER_CSTATE, 1);
303
304 trace_power_start(&it, POWER_CSTATE, 1);
305 current_thread_info()->status &= ~TS_POLLING; 300 current_thread_info()->status &= ~TS_POLLING;
306 /* 301 /*
307 * TS_POLLING-cleared state must be visible before we 302 * TS_POLLING-cleared state must be visible before we
@@ -314,7 +309,6 @@ void default_idle(void)
314 else 309 else
315 local_irq_enable(); 310 local_irq_enable();
316 current_thread_info()->status |= TS_POLLING; 311 current_thread_info()->status |= TS_POLLING;
317 trace_power_end(&it);
318 } else { 312 } else {
319 local_irq_enable(); 313 local_irq_enable();
320 /* loop is done by the caller */ 314 /* loop is done by the caller */
@@ -372,9 +366,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
372 */ 366 */
373void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 367void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
374{ 368{
375 struct power_trace it; 369 trace_power_start(POWER_CSTATE, (ax>>4)+1);
376
377 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
378 if (!need_resched()) { 370 if (!need_resched()) {
379 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 371 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
380 clflush((void *)&current_thread_info()->flags); 372 clflush((void *)&current_thread_info()->flags);
@@ -384,15 +376,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
384 if (!need_resched()) 376 if (!need_resched())
385 __mwait(ax, cx); 377 __mwait(ax, cx);
386 } 378 }
387 trace_power_end(&it);
388} 379}
389 380
390/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 381/* Default MONITOR/MWAIT with no hints, used for default C1 state */
391static void mwait_idle(void) 382static void mwait_idle(void)
392{ 383{
393 struct power_trace it;
394 if (!need_resched()) { 384 if (!need_resched()) {
395 trace_power_start(&it, POWER_CSTATE, 1); 385 trace_power_start(POWER_CSTATE, 1);
396 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 386 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
397 clflush((void *)&current_thread_info()->flags); 387 clflush((void *)&current_thread_info()->flags);
398 388
@@ -402,7 +392,6 @@ static void mwait_idle(void)
402 __sti_mwait(0, 0); 392 __sti_mwait(0, 0);
403 else 393 else
404 local_irq_enable(); 394 local_irq_enable();
405 trace_power_end(&it);
406 } else 395 } else
407 local_irq_enable(); 396 local_irq_enable();
408} 397}
@@ -414,13 +403,11 @@ static void mwait_idle(void)
414 */ 403 */
415static void poll_idle(void) 404static void poll_idle(void)
416{ 405{
417 struct power_trace it; 406 trace_power_start(POWER_CSTATE, 0);
418
419 trace_power_start(&it, POWER_CSTATE, 0);
420 local_irq_enable(); 407 local_irq_enable();
421 while (!need_resched()) 408 while (!need_resched())
422 cpu_relax(); 409 cpu_relax();
423 trace_power_end(&it); 410 trace_power_end(0);
424} 411}
425 412
426/* 413/*
@@ -568,10 +555,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
568void __init init_c1e_mask(void) 555void __init init_c1e_mask(void)
569{ 556{
570 /* If we're using c1e_idle, we need to allocate c1e_mask. */ 557 /* If we're using c1e_idle, we need to allocate c1e_mask. */
571 if (pm_idle == c1e_idle) { 558 if (pm_idle == c1e_idle)
572 alloc_cpumask_var(&c1e_mask, GFP_KERNEL); 559 zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
573 cpumask_clear(c1e_mask);
574 }
575} 560}
576 561
577static int __init idle_setup(char *str) 562static int __init idle_setup(char *str)
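Besides the switch to the trace/events/power.h tracepoints, init_c1e_mask() moves to zalloc_cpumask_var(), which folds the allocate-then-clear pair into one call. For the common !CONFIG_CPUMASK_OFFSTACK build it reduces to little more than the sketch below (the off-stack variant really allocates before clearing):

/* Sketch for !CONFIG_CPUMASK_OFFSTACK: no allocation, just a cleared mask. */
static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
	cpumask_clear(*mask);
	return true;
}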
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 59f4524984af..4cf79567cdab 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -61,9 +61,6 @@
61 61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 63
64DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
65EXPORT_PER_CPU_SYMBOL(current_task);
66
67/* 64/*
68 * Return saved PC of a blocked thread. 65 * Return saved PC of a blocked thread.
69 */ 66 */
@@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
350 *next = &next_p->thread; 347 *next = &next_p->thread;
351 int cpu = smp_processor_id(); 348 int cpu = smp_processor_id();
352 struct tss_struct *tss = &per_cpu(init_tss, cpu); 349 struct tss_struct *tss = &per_cpu(init_tss, cpu);
350 bool preload_fpu;
353 351
354 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 352 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
355 353
356 __unlazy_fpu(prev_p); 354 /*
355 * If the task has used fpu the last 5 timeslices, just do a full
356 * restore of the math state immediately to avoid the trap; the
357 * chances of needing FPU soon are obviously high now
358 */
359 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
357 360
361 __unlazy_fpu(prev_p);
358 362
359 /* we're going to use this soon, after a few expensive things */ 363 /* we're going to use this soon, after a few expensive things */
360 if (next_p->fpu_counter > 5) 364 if (preload_fpu)
361 prefetch(next->xstate); 365 prefetch(next->xstate);
362 366
363 /* 367 /*
@@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
398 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) 402 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
399 __switch_to_xtra(prev_p, next_p, tss); 403 __switch_to_xtra(prev_p, next_p, tss);
400 404
405 /* If we're going to preload the fpu context, make sure clts
406 is run while we're batching the cpu state updates. */
407 if (preload_fpu)
408 clts();
409
401 /* 410 /*
402 * Leave lazy mode, flushing any hypercalls made here. 411 * Leave lazy mode, flushing any hypercalls made here.
403 * This must be done before restoring TLS segments so 412 * This must be done before restoring TLS segments so
@@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
407 */ 416 */
408 arch_end_context_switch(next_p); 417 arch_end_context_switch(next_p);
409 418
410 /* If the task has used fpu the last 5 timeslices, just do a full 419 if (preload_fpu)
411 * restore of the math state immediately to avoid the trap; the 420 __math_state_restore();
412 * chances of needing FPU soon are obviously high now
413 *
414 * tsk_used_math() checks prevent calling math_state_restore(),
415 * which can sleep in the case of !tsk_used_math()
416 */
417 if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
418 math_state_restore();
419 421
420 /* 422 /*
421 * Restore %gs if needed (which is common) 423 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ebefb5407b9d..ad535b683170 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -55,9 +55,6 @@
55 55
56asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
57 57
58DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
59EXPORT_PER_CPU_SYMBOL(current_task);
60
61DEFINE_PER_CPU(unsigned long, old_rsp); 58DEFINE_PER_CPU(unsigned long, old_rsp);
62static DEFINE_PER_CPU(unsigned char, is_idle); 59static DEFINE_PER_CPU(unsigned char, is_idle);
63 60
@@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
386 int cpu = smp_processor_id(); 383 int cpu = smp_processor_id();
387 struct tss_struct *tss = &per_cpu(init_tss, cpu); 384 struct tss_struct *tss = &per_cpu(init_tss, cpu);
388 unsigned fsindex, gsindex; 385 unsigned fsindex, gsindex;
386 bool preload_fpu;
387
388 /*
389 * If the task has used fpu the last 5 timeslices, just do a full
390 * restore of the math state immediately to avoid the trap; the
391 * chances of needing FPU soon are obviously high now
392 */
393 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
389 394
390 /* we're going to use this soon, after a few expensive things */ 395 /* we're going to use this soon, after a few expensive things */
391 if (next_p->fpu_counter > 5) 396 if (preload_fpu)
392 prefetch(next->xstate); 397 prefetch(next->xstate);
393 398
394 /* 399 /*
@@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
419 424
420 load_TLS(next, cpu); 425 load_TLS(next, cpu);
421 426
427 /* Must be after DS reload */
428 unlazy_fpu(prev_p);
429
430 /* Make sure cpu is ready for new context */
431 if (preload_fpu)
432 clts();
433
422 /* 434 /*
423 * Leave lazy mode, flushing any hypercalls made here. 435 * Leave lazy mode, flushing any hypercalls made here.
424 * This must be done before restoring TLS segments so 436 * This must be done before restoring TLS segments so
@@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
459 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 471 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
460 prev->gsindex = gsindex; 472 prev->gsindex = gsindex;
461 473
462 /* Must be after DS reload */
463 unlazy_fpu(prev_p);
464
465 /* 474 /*
466 * Switch the PDA and FPU contexts. 475 * Switch the PDA and FPU contexts.
467 */ 476 */
@@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
480 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 489 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
481 __switch_to_xtra(prev_p, next_p, tss); 490 __switch_to_xtra(prev_p, next_p, tss);
482 491
483 /* If the task has used fpu the last 5 timeslices, just do a full 492 /*
484 * restore of the math state immediately to avoid the trap; the 493 * Preload the FPU context, now that we've determined that the
485 * chances of needing FPU soon are obviously high now 494 * task is likely to be using it.
486 *
487 * tsk_used_math() checks prevent calling math_state_restore(),
488 * which can sleep in the case of !tsk_used_math()
489 */ 495 */
490 if (tsk_used_math(next_p) && next_p->fpu_counter > 5) 496 if (preload_fpu)
491 math_state_restore(); 497 __math_state_restore();
492 return prev_p; 498 return prev_p;
493} 499}
494 500
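Both __switch_to() variants now decide up front whether the incoming task is likely to need the FPU, run clts() while other CPU state is still being batched, and finish with the non-checking __math_state_restore(). The gist, as a hedged sketch excerpted from the flow above (comment ellipses stand for the unrelated switch work):

	/* Sketch of the reordered FPU hand-off; helpers as named in the hunks. */
	bool preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	if (preload_fpu)
		prefetch(next->xstate);		/* warm the cache early */
	/* ... */
	unlazy_fpu(prev_p);			/* save outgoing FPU state */
	if (preload_fpu)
		clts();				/* FPU use won't trap now */
	/* ... */
	arch_end_context_switch(next_p);
	if (preload_fpu)
		__math_state_restore();		/* restore without re-checking */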
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 09ecbde91c13..7b058a2dc66a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -35,10 +35,11 @@
35#include <asm/proto.h> 35#include <asm/proto.h>
36#include <asm/ds.h> 36#include <asm/ds.h>
37 37
38#include <trace/syscall.h>
39
40#include "tls.h" 38#include "tls.h"
41 39
40#define CREATE_TRACE_POINTS
41#include <trace/events/syscalls.h>
42
42enum x86_regset { 43enum x86_regset {
43 REGSET_GENERAL, 44 REGSET_GENERAL,
44 REGSET_FP, 45 REGSET_FP,
@@ -324,16 +325,6 @@ static int putreg(struct task_struct *child,
324 return set_flags(child, value); 325 return set_flags(child, value);
325 326
326#ifdef CONFIG_X86_64 327#ifdef CONFIG_X86_64
327 /*
328 * Orig_ax is really just a flag with small positive and
329 * negative values, so make sure to always sign-extend it
330 * from 32 bits so that it works correctly regardless of
331 * whether we come from a 32-bit environment or not.
332 */
333 case offsetof(struct user_regs_struct, orig_ax):
334 value = (long) (s32) value;
335 break;
336
337 case offsetof(struct user_regs_struct,fs_base): 328 case offsetof(struct user_regs_struct,fs_base):
338 if (value >= TASK_SIZE_OF(child)) 329 if (value >= TASK_SIZE_OF(child))
339 return -EIO; 330 return -EIO;
@@ -1125,10 +1116,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1125 1116
1126 case offsetof(struct user32, regs.orig_eax): 1117 case offsetof(struct user32, regs.orig_eax):
1127 /* 1118 /*
1128 * Sign-extend the value so that orig_eax = -1 1119 * A 32-bit debugger setting orig_eax means to restore
1129 * causes (long)orig_ax < 0 tests to fire correctly. 1120 * the state of the task restarting a 32-bit syscall.
1121 * Make sure we interpret the -ERESTART* codes correctly
1122 * in case the task is not actually still sitting at the
1123 * exit from a 32-bit syscall with TS_COMPAT still set.
1130 */ 1124 */
1131 regs->orig_ax = (long) (s32) value; 1125 regs->orig_ax = value;
1126 if (syscall_get_nr(child, regs) >= 0)
1127 task_thread_info(child)->status |= TS_COMPAT;
1132 break; 1128 break;
1133 1129
1134 case offsetof(struct user32, regs.eflags): 1130 case offsetof(struct user32, regs.eflags):
@@ -1497,8 +1493,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1497 tracehook_report_syscall_entry(regs)) 1493 tracehook_report_syscall_entry(regs))
1498 ret = -1L; 1494 ret = -1L;
1499 1495
1500 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1496 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1501 ftrace_syscall_enter(regs); 1497 trace_sys_enter(regs, regs->orig_ax);
1502 1498
1503 if (unlikely(current->audit_context)) { 1499 if (unlikely(current->audit_context)) {
1504 if (IS_IA32) 1500 if (IS_IA32)
@@ -1523,8 +1519,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1523 if (unlikely(current->audit_context)) 1519 if (unlikely(current->audit_context))
1524 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1525 1521
1526 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1527 ftrace_syscall_exit(regs); 1523 trace_sys_exit(regs, regs->ax);
1528 1524
1529 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1530 tracehook_report_syscall_exit(regs, 0); 1526 tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624bf..6c3b2c6fd772 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
508 508
509 pci_read_config_dword(nb_ht, 0x60, &val); 509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev); 511 pci_dev_put(nb_ht);
512} 512}
513 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
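The quirk change is a straightforward reference-count fix: the function must release the device it looked up, not the device it was handed. The general pairing, sketched with a hypothetical lookup (the actual call in quirk_amd_nb_node() may differ):

	/* Illustrative get/put pairing; the lookup shown here is an assumption. */
	struct pci_dev *nb_ht = pci_get_slot(dev->bus, PCI_DEVFN(0, 0));

	if (!nb_ht)
		return;
	/* ... read config space, set_dev_node(&dev->dev, ...) ... */
	pci_dev_put(nb_ht);	/* drop the reference taken by the lookup, not dev */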
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d101844..27349f92a6d7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/tboot.h>
7#include <acpi/reboot.h> 8#include <acpi/reboot.h>
8#include <asm/io.h> 9#include <asm/io.h>
9#include <asm/apic.h> 10#include <asm/apic.h>
@@ -508,6 +509,8 @@ static void native_machine_emergency_restart(void)
508 if (reboot_emergency) 509 if (reboot_emergency)
509 emergency_vmx_disable_all(); 510 emergency_vmx_disable_all();
510 511
512 tboot_shutdown(TB_SHUTDOWN_REBOOT);
513
511 /* Tell the BIOS if we want cold or warm reboot */ 514 /* Tell the BIOS if we want cold or warm reboot */
512 *((unsigned short *)__va(0x472)) = reboot_mode; 515 *((unsigned short *)__va(0x472)) = reboot_mode;
513 516
@@ -634,6 +637,8 @@ static void native_machine_halt(void)
634 /* stop other cpus and apics */ 637 /* stop other cpus and apics */
635 machine_shutdown(); 638 machine_shutdown();
636 639
640 tboot_shutdown(TB_SHUTDOWN_HALT);
641
637 /* stop this cpu */ 642 /* stop this cpu */
638 stop_this_cpu(NULL); 643 stop_this_cpu(NULL);
639} 644}
@@ -645,6 +650,8 @@ static void native_machine_power_off(void)
645 machine_shutdown(); 650 machine_shutdown();
646 pm_power_off(); 651 pm_power_off();
647 } 652 }
653 /* a fallback in case there is no PM info available */
654 tboot_shutdown(TB_SHUTDOWN_HALT);
648} 655}
649 656
650struct machine_ops machine_ops = { 657struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 5d465b207e72..1cfbbfc3ae26 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -8,6 +8,7 @@
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9 9
10#include <asm/vsyscall.h> 10#include <asm/vsyscall.h>
11#include <asm/x86_init.h>
11#include <asm/time.h> 12#include <asm/time.h>
12 13
13#ifdef CONFIG_X86_32 14#ifdef CONFIG_X86_32
@@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
165} 166}
166EXPORT_SYMBOL(rtc_cmos_write); 167EXPORT_SYMBOL(rtc_cmos_write);
167 168
168static int set_rtc_mmss(unsigned long nowtime) 169int update_persistent_clock(struct timespec now)
169{ 170{
170 unsigned long flags; 171 unsigned long flags;
171 int retval; 172 int retval;
172 173
173 spin_lock_irqsave(&rtc_lock, flags); 174 spin_lock_irqsave(&rtc_lock, flags);
174 retval = set_wallclock(nowtime); 175 retval = x86_platform.set_wallclock(now.tv_sec);
175 spin_unlock_irqrestore(&rtc_lock, flags); 176 spin_unlock_irqrestore(&rtc_lock, flags);
176 177
177 return retval; 178 return retval;
178} 179}
179 180
180/* not static: needed by APM */ 181/* not static: needed by APM */
181unsigned long read_persistent_clock(void) 182void read_persistent_clock(struct timespec *ts)
182{ 183{
183 unsigned long retval, flags; 184 unsigned long retval, flags;
184 185
185 spin_lock_irqsave(&rtc_lock, flags); 186 spin_lock_irqsave(&rtc_lock, flags);
186 retval = get_wallclock(); 187 retval = x86_platform.get_wallclock();
187 spin_unlock_irqrestore(&rtc_lock, flags); 188 spin_unlock_irqrestore(&rtc_lock, flags);
188 189
189 return retval; 190 ts->tv_sec = retval;
190} 191 ts->tv_nsec = 0;
191
192int update_persistent_clock(struct timespec now)
193{
194 return set_rtc_mmss(now.tv_sec);
195} 192}
196 193
197unsigned long long native_read_tsc(void) 194unsigned long long native_read_tsc(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef2..e09f0e2c14b5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -27,6 +27,7 @@
27#include <linux/screen_info.h> 27#include <linux/screen_info.h>
28#include <linux/ioport.h> 28#include <linux/ioport.h>
29#include <linux/acpi.h> 29#include <linux/acpi.h>
30#include <linux/sfi.h>
30#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
31#include <linux/initrd.h> 32#include <linux/initrd.h>
32#include <linux/bootmem.h> 33#include <linux/bootmem.h>
@@ -66,6 +67,7 @@
66 67
67#include <linux/percpu.h> 68#include <linux/percpu.h>
68#include <linux/crash_dump.h> 69#include <linux/crash_dump.h>
70#include <linux/tboot.h>
69 71
70#include <video/edid.h> 72#include <video/edid.h>
71 73
@@ -108,10 +110,6 @@
108#include <asm/numa_64.h> 110#include <asm/numa_64.h>
109#endif 111#endif
110 112
111#ifndef ARCH_SETUP
112#define ARCH_SETUP
113#endif
114
115/* 113/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access 115 * The direct mapping extends to max_pfn_mapped, so that we can directly access
@@ -133,9 +131,9 @@ int default_cpu_present_to_apicid(int mps_cpu)
133 return __default_cpu_present_to_apicid(mps_cpu); 131 return __default_cpu_present_to_apicid(mps_cpu);
134} 132}
135 133
136int default_check_phys_apicid_present(int boot_cpu_physical_apicid) 134int default_check_phys_apicid_present(int phys_apicid)
137{ 135{
138 return __default_check_phys_apicid_present(boot_cpu_physical_apicid); 136 return __default_check_phys_apicid_present(phys_apicid);
139} 137}
140#endif 138#endif
141 139
@@ -171,13 +169,6 @@ static struct resource bss_resource = {
171 169
172 170
173#ifdef CONFIG_X86_32 171#ifdef CONFIG_X86_32
174static struct resource video_ram_resource = {
175 .name = "Video RAM area",
176 .start = 0xa0000,
177 .end = 0xbffff,
178 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
179};
180
181/* cpu data as detected by the assembly code in head.S */ 172/* cpu data as detected by the assembly code in head.S */
182struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; 173struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
183/* common cpu data for all cpus */ 174/* common cpu data for all cpus */
@@ -605,7 +596,7 @@ static struct resource standard_io_resources[] = {
605 .flags = IORESOURCE_BUSY | IORESOURCE_IO } 596 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
606}; 597};
607 598
608static void __init reserve_standard_io_resources(void) 599void __init reserve_standard_io_resources(void)
609{ 600{
610 int i; 601 int i;
611 602
@@ -637,10 +628,6 @@ static int __init setup_elfcorehdr(char *arg)
637early_param("elfcorehdr", setup_elfcorehdr); 628early_param("elfcorehdr", setup_elfcorehdr);
638#endif 629#endif
639 630
640static struct x86_quirks default_x86_quirks __initdata;
641
642struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
643
644#ifdef CONFIG_X86_RESERVE_LOW_64K 631#ifdef CONFIG_X86_RESERVE_LOW_64K
645static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 632static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
646{ 633{
@@ -757,7 +744,7 @@ void __init setup_arch(char **cmdline_p)
757 } 744 }
758#endif 745#endif
759 746
760 ARCH_SETUP 747 x86_init.oem.arch_setup();
761 748
762 setup_memory_map(); 749 setup_memory_map();
763 parse_setup_data(); 750 parse_setup_data();
@@ -796,6 +783,16 @@ void __init setup_arch(char **cmdline_p)
796 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 783 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
797 *cmdline_p = command_line; 784 *cmdline_p = command_line;
798 785
786#ifdef CONFIG_X86_64
787 /*
788 * Must call this twice: Once just to detect whether hardware doesn't
789 * support NX (so that the early EHCI debug console setup can safely
790 * call set_fixmap()), and then again after parsing early parameters to
791 * honor the respective command line option.
792 */
793 check_efer();
794#endif
795
799 parse_early_param(); 796 parse_early_param();
800 797
801#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -833,11 +830,9 @@ void __init setup_arch(char **cmdline_p)
833 * VMware detection requires dmi to be available, so this 830 * VMware detection requires dmi to be available, so this
834 * needs to be done after dmi_scan_machine, for the BP. 831 * needs to be done after dmi_scan_machine, for the BP.
835 */ 832 */
836 init_hypervisor(&boot_cpu_data); 833 init_hypervisor_platform();
837 834
838#ifdef CONFIG_X86_32 835 x86_init.resources.probe_roms();
839 probe_roms();
840#endif
841 836
842 /* after parse_early_param, so could debug it */ 837 /* after parse_early_param, so could debug it */
843 insert_resource(&iomem_resource, &code_resource); 838 insert_resource(&iomem_resource, &code_resource);
@@ -972,10 +967,11 @@ void __init setup_arch(char **cmdline_p)
972 kvmclock_init(); 967 kvmclock_init();
973#endif 968#endif
974 969
975 paravirt_pagetable_setup_start(swapper_pg_dir); 970 x86_init.paging.pagetable_setup_start(swapper_pg_dir);
976 paging_init(); 971 paging_init();
977 paravirt_pagetable_setup_done(swapper_pg_dir); 972 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
978 paravirt_post_allocator_init(); 973
974 tboot_probe();
979 975
980#ifdef CONFIG_X86_64 976#ifdef CONFIG_X86_64
981 map_vsyscall(); 977 map_vsyscall();
@@ -990,13 +986,13 @@ void __init setup_arch(char **cmdline_p)
990 */ 986 */
991 acpi_boot_init(); 987 acpi_boot_init();
992 988
993#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) 989 sfi_init();
990
994 /* 991 /*
995 * get boot-time SMP configuration: 992 * get boot-time SMP configuration:
996 */ 993 */
997 if (smp_found_config) 994 if (smp_found_config)
998 get_smp_config(); 995 get_smp_config();
999#endif
1000 996
1001 prefill_possible_map(); 997 prefill_possible_map();
1002 998
@@ -1015,10 +1011,7 @@ void __init setup_arch(char **cmdline_p)
1015 e820_reserve_resources(); 1011 e820_reserve_resources();
1016 e820_mark_nosave_regions(max_low_pfn); 1012 e820_mark_nosave_regions(max_low_pfn);
1017 1013
1018#ifdef CONFIG_X86_32 1014 x86_init.resources.reserve_resources();
1019 request_resource(&iomem_resource, &video_ram_resource);
1020#endif
1021 reserve_standard_io_resources();
1022 1015
1023 e820_setup_gap(); 1016 e820_setup_gap();
1024 1017
@@ -1030,78 +1023,22 @@ void __init setup_arch(char **cmdline_p)
1030 conswitchp = &dummy_con; 1023 conswitchp = &dummy_con;
1031#endif 1024#endif
1032#endif 1025#endif
1026 x86_init.oem.banner();
1033} 1027}
1034 1028
1035#ifdef CONFIG_X86_32 1029#ifdef CONFIG_X86_32
1036 1030
1037/** 1031static struct resource video_ram_resource = {
1038 * x86_quirk_intr_init - post gate setup interrupt initialisation 1032 .name = "Video RAM area",
1039 * 1033 .start = 0xa0000,
1040 * Description: 1034 .end = 0xbffff,
1041 * Fill in any interrupts that may have been left out by the general 1035 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1042 * init_IRQ() routine. interrupts having to do with the machine rather
1043 * than the devices on the I/O bus (like APIC interrupts in intel MP
1044 * systems) are started here.
1045 **/
1046void __init x86_quirk_intr_init(void)
1047{
1048 if (x86_quirks->arch_intr_init) {
1049 if (x86_quirks->arch_intr_init())
1050 return;
1051 }
1052}
1053
1054/**
1055 * x86_quirk_trap_init - initialise system specific traps
1056 *
1057 * Description:
1058 * Called as the final act of trap_init(). Used in VISWS to initialise
1059 * the various board specific APIC traps.
1060 **/
1061void __init x86_quirk_trap_init(void)
1062{
1063 if (x86_quirks->arch_trap_init) {
1064 if (x86_quirks->arch_trap_init())
1065 return;
1066 }
1067}
1068
1069static struct irqaction irq0 = {
1070 .handler = timer_interrupt,
1071 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
1072 .name = "timer"
1073}; 1036};
1074 1037
1075/** 1038void __init i386_reserve_resources(void)
1076 * x86_quirk_pre_time_init - do any specific initialisations before.
1077 *
1078 **/
1079void __init x86_quirk_pre_time_init(void)
1080{ 1039{
1081 if (x86_quirks->arch_pre_time_init) 1040 request_resource(&iomem_resource, &video_ram_resource);
1082 x86_quirks->arch_pre_time_init(); 1041 reserve_standard_io_resources();
1083} 1042}
1084 1043
1085/**
1086 * x86_quirk_time_init - do any specific initialisations for the system timer.
1087 *
1088 * Description:
1089 * Must plug the system timer interrupt source at HZ into the IRQ listed
1090 * in irq_vectors.h:TIMER_IRQ
1091 **/
1092void __init x86_quirk_time_init(void)
1093{
1094 if (x86_quirks->arch_time_init) {
1095 /*
1096 * A nonzero return code does not mean failure, it means
1097 * that the architecture quirk does not want any
1098 * generic (timer) setup to be performed after this:
1099 */
1100 if (x86_quirks->arch_time_init())
1101 return;
1102 }
1103
1104 irq0.mask = cpumask_of_cpu(0);
1105 setup_irq(0, &irq0);
1106}
1107#endif /* CONFIG_X86_32 */ 1044#endif /* CONFIG_X86_32 */
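/*
 * Illustrative sketch, not part of the patch above: the ARCH_SETUP macro,
 * the x86_quirks table and the 32-bit-only probe_roms()/video RAM ifdefs are
 * replaced by calls through x86_init, so a sub-architecture now overrides the
 * relevant function pointers instead of patching setup_arch().  The demo_*
 * names are hypothetical; the x86_init members are the ones invoked in the
 * hunks above.
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/x86_init.h>

static void __init demo_arch_setup(void)
{
        /* early board/firmware quirks formerly hidden behind ARCH_SETUP */
}

static void __init demo_banner(void)
{
        pr_info("demo platform detected\n");
}

/* called from the platform's early setup code, before setup_arch() runs */
static void __init demo_platform_init(void)
{
        x86_init.oem.arch_setup = demo_arch_setup;
        x86_init.oem.banner     = demo_banner;
}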
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f212..d559af913e1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
55#define PERCPU_FIRST_CHUNK_RESERVE 0 55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif 56#endif
57 57
58#ifdef CONFIG_X86_32
58/** 59/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA 60 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 * 61 *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
83#endif 84#endif
84 return false; 85 return false;
85} 86}
87#endif
86 88
87/** 89/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu 90 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 126}
125 127
126/* 128/*
127 * Large page remap allocator 129 * Helpers for first chunk memory allocation
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 130 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 131static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
140struct pcpul_ent {
141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
148
149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
150{ 132{
151 size_t off = (size_t)pageno << PAGE_SHIFT; 133 return pcpu_alloc_bootmem(cpu, size, align);
152
153 if (off >= pcpul_size)
154 return NULL;
155
156 return virt_to_page(pcpul_map[cpu].ptr + off);
157} 134}
158 135
159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 136static void __init pcpu_fc_free(void *ptr, size_t size)
160{ 137{
161 size_t map_size, dyn_size; 138 free_bootmem(__pa(ptr), size);
162 unsigned int cpu;
163 int i, j;
164 ssize_t ret;
165
166 if (!chosen) {
167 size_t vm_size = VMALLOC_END - VMALLOC_START;
168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
169
170 /* on non-NUMA, embedding is better */
171 if (!pcpu_need_numa())
172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
185 return -EINVAL;
186 }
187
188 /*
189 * Currently supports only single page. Supporting multiple
190 * pages won't be too difficult if it ever becomes necessary.
191 */
192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
193 PERCPU_DYNAMIC_RESERVE);
194 if (pcpul_size > PMD_SIZE) {
195 pr_warning("PERCPU: static data is larger than large page, "
196 "can't use large page\n");
197 return -EINVAL;
198 }
199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
200
201 /* allocate pointer array and alloc large pages */
202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
203 pcpul_map = alloc_bootmem(map_size);
204
205 for_each_possible_cpu(cpu) {
206 pcpul_map[cpu].cpu = cpu;
207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
212 goto enomem;
213 }
214
215 /*
216 * Only use pcpul_size bytes and give back the rest.
217 *
218 * Ingo: The 2MB up-rounding bootmem is needed to make
219 * sure the partial 2MB page is still fully RAM - it's
220 * not well-specified to have a PAT-incompatible area
221 * (unmapped RAM, device memory, etc.) in that hole.
222 */
223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
224 PMD_SIZE - pcpul_size);
225
226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
227 }
228
229 /* allocate address and map */
230 pcpul_vm.flags = VM_ALLOC;
231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
233
234 for_each_possible_cpu(cpu) {
235 pmd_t *pmd, pmd_v;
236
237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
238 cpu * PMD_SIZE);
239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
242 }
243
244 /* we're ready, commit */
245 pr_info("PERCPU: Remapped at %p with large pages, static data "
246 "%zu bytes\n", pcpul_vm.addr, static_size);
247
248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
250 PMD_SIZE, pcpul_vm.addr, NULL);
251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
262
263enomem:
264 for_each_possible_cpu(cpu)
265 if (pcpul_map[cpu].ptr)
266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
267 free_bootmem(__pa(pcpul_map), map_size);
268 return -ENOMEM;
269} 139}
270 140
271/** 141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{ 142{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); 143#ifdef CONFIG_NEED_MULTIPLE_NODES
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; 144 if (early_cpu_to_node(from) == early_cpu_to_node(to))
291 int left = 0, right = nr_cpu_ids - 1; 145 return LOCAL_DISTANCE;
292 int pos; 146 else
293 147 return REMOTE_DISTANCE;
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
316}
317#else 148#else
318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 149 return LOCAL_DISTANCE;
319{
320 return -EINVAL;
321}
322#endif 150#endif
323
324/*
325 * Embedding allocator
326 *
327 * The first chunk is sized to just contain the static area plus
328 * module and dynamic reserves and embedded into linear physical
329 * mapping so that it can use PMD mapping without additional TLB
330 * pressure.
331 */
332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
333{
334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
335
336 /*
337 * If large page isn't supported, there's no benefit in doing
338 * this. Also, embedding allocation doesn't play well with
339 * NUMA.
340 */
341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
342 return -EINVAL;
343
344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
345 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
346} 151}
347 152
348/* 153static void __init pcpup_populate_pte(unsigned long addr)
349 * 4k page allocator
350 *
351 * This is the basic allocator. Static percpu area is allocated
352 * page-by-page and most of initialization is done by the generic
353 * setup function.
354 */
355static struct page **pcpu4k_pages __initdata;
356static int pcpu4k_nr_static_pages __initdata;
357
358static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
359{
360 if (pageno < pcpu4k_nr_static_pages)
361 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
362 return NULL;
363}
364
365static void __init pcpu4k_populate_pte(unsigned long addr)
366{ 154{
367 populate_extra_pte(addr); 155 populate_extra_pte(addr);
368} 156}
369 157
370static ssize_t __init setup_pcpu_4k(size_t static_size)
371{
372 size_t pages_size;
373 unsigned int cpu;
374 int i, j;
375 ssize_t ret;
376
377 pcpu4k_nr_static_pages = PFN_UP(static_size);
378
379 /* unaligned allocations can't be freed, round up to page size */
380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
381 * sizeof(pcpu4k_pages[0]));
382 pcpu4k_pages = alloc_bootmem(pages_size);
383
384 /* allocate and copy */
385 j = 0;
386 for_each_possible_cpu(cpu)
387 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
388 void *ptr;
389
390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
394 goto enomem;
395 }
396
397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
398 pcpu4k_pages[j++] = virt_to_page(ptr);
399 }
400
401 /* we're ready, commit */
402 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
403 pcpu4k_nr_static_pages, static_size);
404
405 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
406 PERCPU_FIRST_CHUNK_RESERVE, -1,
407 -1, NULL, pcpu4k_populate_pte);
408 goto out_free_ar;
409
410enomem:
411 while (--j >= 0)
412 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
413 ret = -ENOMEM;
414out_free_ar:
415 free_bootmem(__pa(pcpu4k_pages), pages_size);
416 return ret;
417}
418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
429static inline void setup_percpu_segment(int cpu) 158static inline void setup_percpu_segment(int cpu)
430{ 159{
431#ifdef CONFIG_X86_32 160#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
441 170
442void __init setup_per_cpu_areas(void) 171void __init setup_per_cpu_areas(void)
443{ 172{
444 size_t static_size = __per_cpu_end - __per_cpu_start;
445 unsigned int cpu; 173 unsigned int cpu;
446 unsigned long delta; 174 unsigned long delta;
447 size_t pcpu_unit_size; 175 int rc;
448 ssize_t ret;
449 176
450 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 177 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
451 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 178 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
452 179
453 /* 180 /*
454 * Allocate percpu area. If PSE is supported, try to make use 181 * Allocate percpu area. Embedding allocator is our favorite;
455 * of large page mappings. Please read comments on top of 182 * however, on NUMA configurations, it can result in very
456 * each allocator for details. 183 * sparse unit mapping and vmalloc area isn't spacious enough
184 * on 32bit. Use page in that case.
457 */ 185 */
458 ret = -EINVAL; 186#ifdef CONFIG_X86_32
459 if (strlen(pcpu_chosen_alloc)) { 187 if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
460 if (strcmp(pcpu_chosen_alloc, "4k")) { 188 pcpu_chosen_fc = PCPU_FC_PAGE;
461 if (!strcmp(pcpu_chosen_alloc, "lpage")) 189#endif
462 ret = setup_pcpu_lpage(static_size, true); 190 rc = -EINVAL;
463 else if (!strcmp(pcpu_chosen_alloc, "embed")) 191 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
464 ret = setup_pcpu_embed(static_size, true); 192 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
465 else 193 const size_t dyn_size = PERCPU_MODULE_RESERVE +
466 pr_warning("PERCPU: unknown allocator %s " 194 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
467 "specified\n", pcpu_chosen_alloc); 195
468 if (ret < 0) 196 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
469 pr_warning("PERCPU: %s allocator failed (%zd), " 197 dyn_size, atom_size,
470 "falling back to 4k\n", 198 pcpu_cpu_distance,
471 pcpu_chosen_alloc, ret); 199 pcpu_fc_alloc, pcpu_fc_free);
472 } 200 if (rc < 0)
473 } else { 201 pr_warning("PERCPU: %s allocator failed (%d), "
474 ret = setup_pcpu_lpage(static_size, false); 202 "falling back to page size\n",
475 if (ret < 0) 203 pcpu_fc_names[pcpu_chosen_fc], rc);
476 ret = setup_pcpu_embed(static_size, false);
477 } 204 }
478 if (ret < 0) 205 if (rc < 0)
479 ret = setup_pcpu_4k(static_size); 206 rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
480 if (ret < 0) 207 pcpu_fc_alloc, pcpu_fc_free,
481 panic("cannot allocate static percpu area (%zu bytes, err=%zd)", 208 pcpup_populate_pte);
482 static_size, ret); 209 if (rc < 0)
483 210 panic("cannot initialize percpu area (err=%d)", rc);
484 pcpu_unit_size = ret;
485 211
486 /* alrighty, percpu areas up and running */ 212 /* alrighty, percpu areas up and running */
487 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 213 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
488 for_each_possible_cpu(cpu) { 214 for_each_possible_cpu(cpu) {
489 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 215 per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
490 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 216 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
491 per_cpu(cpu_number, cpu) = cpu; 217 per_cpu(cpu_number, cpu) = cpu;
492 setup_percpu_segment(cpu); 218 setup_percpu_segment(cpu);
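/*
 * Usage note (assumption, inferred from the pcpu_chosen_fc/PCPU_FC_* symbols
 * used above): the x86-private "percpu_alloc=4k|lpage|embed" handling that
 * this hunk removes is now done by the generic percpu code, so the first
 * chunk allocator can still be forced from the kernel command line, e.g.:
 *
 *      percpu_alloc=embed      force the embedding first-chunk allocator
 *      percpu_alloc=page       force the page-by-page first-chunk allocator
 */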
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
new file mode 100644
index 000000000000..34e099382651
--- /dev/null
+++ b/arch/x86/kernel/sfi.c
@@ -0,0 +1,122 @@
1/*
2 * sfi.c - x86 architecture SFI support.
3 *
4 * Copyright (c) 2009, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#define KMSG_COMPONENT "SFI"
22#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24#include <linux/acpi.h>
25#include <linux/init.h>
26#include <linux/sfi.h>
27#include <linux/io.h>
28
29#include <asm/io_apic.h>
30#include <asm/mpspec.h>
31#include <asm/setup.h>
32#include <asm/apic.h>
33
34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36
37void __init mp_sfi_register_lapic_address(unsigned long address)
38{
39 mp_lapic_addr = address;
40
41 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
42 if (boot_cpu_physical_apicid == -1U)
43 boot_cpu_physical_apicid = read_apic_id();
44
45 pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
46}
47
48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id)
50{
51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS);
54 return;
55 }
56
57 pr_info("registering lapic[%d]\n", id);
58
59 generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
60}
61
62static int __init sfi_parse_cpus(struct sfi_table_header *table)
63{
64 struct sfi_table_simple *sb;
65 struct sfi_cpu_table_entry *pentry;
66 int i;
67 int cpu_num;
68
69 sb = (struct sfi_table_simple *)table;
70 cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
71 pentry = (struct sfi_cpu_table_entry *)sb->pentry;
72
73 for (i = 0; i < cpu_num; i++) {
74 mp_sfi_register_lapic(pentry->apic_id);
75 pentry++;
76 }
77
78 smp_found_config = 1;
79 return 0;
80}
81#endif /* CONFIG_X86_LOCAL_APIC */
82
83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85
86static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{
88 struct sfi_table_simple *sb;
89 struct sfi_apic_table_entry *pentry;
90 int i, num;
91
92 sb = (struct sfi_table_simple *)table;
93 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
94 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95
96 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++;
100 }
101
102 WARN(pic_mode, KERN_WARNING
103 "SFI: pic_mode shouldn't be 1 when IOAPIC table is present\n");
104 pic_mode = 0;
105 return 0;
106}
107#endif /* CONFIG_X86_IO_APIC */
108
109/*
110 * sfi_platform_init(): register lapics & io-apics
111 */
112int __init sfi_platform_init(void)
113{
114#ifdef CONFIG_X86_LOCAL_APIC
115 mp_sfi_register_lapic_address(sfi_lapic_addr);
116 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
117#endif
118#ifdef CONFIG_X86_IO_APIC
119 sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
120#endif
121 return 0;
122}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c578751e94e..6a44a76055ad 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs)
856void 856void
857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
858{ 858{
859#ifdef CONFIG_X86_NEW_MCE 859#ifdef CONFIG_X86_MCE
860 /* notify userspace of pending MCEs */ 860 /* notify userspace of pending MCEs */
861 if (thread_info_flags & _TIF_MCE_NOTIFY) 861 if (thread_info_flags & _TIF_MCE_NOTIFY)
862 mce_notify_process(); 862 mce_notify_process();
@@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
869 if (thread_info_flags & _TIF_NOTIFY_RESUME) { 869 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
870 clear_thread_flag(TIF_NOTIFY_RESUME); 870 clear_thread_flag(TIF_NOTIFY_RESUME);
871 tracehook_notify_resume(regs); 871 tracehook_notify_resume(regs);
872 if (current->replacement_session_keyring)
873 key_replace_session_keyring();
872 } 874 }
873 875
874#ifdef CONFIG_X86_32 876#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2fecda69ee64..565ebc65920e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
47#include <linux/bootmem.h> 47#include <linux/bootmem.h>
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h>
50 51
51#include <asm/acpi.h> 52#include <asm/acpi.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
@@ -323,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused)
323 /* enable local interrupts */ 324 /* enable local interrupts */
324 local_irq_enable(); 325 local_irq_enable();
325 326
326 setup_secondary_clock(); 327 x86_cpuinit.setup_percpu_clockev();
327 328
328 wmb(); 329 wmb();
329 cpu_idle(); 330 cpu_idle();
@@ -434,7 +435,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
434 * For perf, we return last level cache shared map. 435 * For perf, we return last level cache shared map.
435 * And for power savings, we return cpu_core_map 436 * And for power savings, we return cpu_core_map
436 */ 437 */
437 if (sched_mc_power_savings || sched_smt_power_savings) 438 if ((sched_mc_power_savings || sched_smt_power_savings) &&
439 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
438 return cpu_core_mask(cpu); 440 return cpu_core_mask(cpu);
439 else 441 else
440 return c->llc_shared_map; 442 return c->llc_shared_map;
@@ -1057,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1057#endif 1059#endif
1058 current_thread_info()->cpu = 0; /* needed? */ 1060 current_thread_info()->cpu = 0; /* needed? */
1059 for_each_possible_cpu(i) { 1061 for_each_possible_cpu(i) {
1060 alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1062 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1061 alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1063 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1062 alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1064 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
1063 cpumask_clear(per_cpu(cpu_core_map, i));
1064 cpumask_clear(per_cpu(cpu_sibling_map, i));
1065 cpumask_clear(cpu_data(i).llc_shared_map);
1066 } 1065 }
1067 set_cpu_sibling_map(0); 1066 set_cpu_sibling_map(0);
1068 1067
@@ -1112,13 +1111,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1112 1111
1113 printk(KERN_INFO "CPU%d: ", 0); 1112 printk(KERN_INFO "CPU%d: ", 0);
1114 print_cpu_info(&cpu_data(0)); 1113 print_cpu_info(&cpu_data(0));
1115 setup_boot_clock(); 1114 x86_init.timers.setup_percpu_clockev();
1116 1115
1117 if (is_uv_system()) 1116 if (is_uv_system())
1118 uv_system_init(); 1117 uv_system_init();
1118
1119 set_mtrr_aps_delayed_init();
1119out: 1120out:
1120 preempt_enable(); 1121 preempt_enable();
1121} 1122}
1123
1124void arch_enable_nonboot_cpus_begin(void)
1125{
1126 set_mtrr_aps_delayed_init();
1127}
1128
1129void arch_enable_nonboot_cpus_end(void)
1130{
1131 mtrr_aps_init();
1132}
1133
1122/* 1134/*
1123 * Early setup to make printk work. 1135 * Early setup to make printk work.
1124 */ 1136 */
@@ -1140,6 +1152,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1140 setup_ioapic_dest(); 1152 setup_ioapic_dest();
1141#endif 1153#endif
1142 check_nmi_watchdog(); 1154 check_nmi_watchdog();
1155 mtrr_aps_init();
1143} 1156}
1144 1157
1145static int __initdata setup_possible_cpus = -1; 1158static int __initdata setup_possible_cpus = -1;
@@ -1317,6 +1330,7 @@ void play_dead_common(void)
1317void native_play_dead(void) 1330void native_play_dead(void)
1318{ 1331{
1319 play_dead_common(); 1332 play_dead_common();
1333 tboot_shutdown(TB_SHUTDOWN_WFS);
1320 wbinvd_halt(); 1334 wbinvd_halt();
1321} 1335}
1322 1336
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index e8b9863ef8c4..3149032ff107 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -4,6 +4,7 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/ptrace.h> 6#include <linux/ptrace.h>
7#include <asm/desc.h>
7 8
8unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) 9unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
9{ 10{
@@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
23 * and APM bios ones we just ignore here. 24 * and APM bios ones we just ignore here.
24 */ 25 */
25 if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { 26 if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
26 u32 *desc; 27 struct desc_struct *desc;
27 unsigned long base; 28 unsigned long base;
28 29
29 seg &= ~7UL; 30 seg &= ~7UL;
@@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
33 addr = -1L; /* bogus selector, access would fault */ 34 addr = -1L; /* bogus selector, access would fault */
34 else { 35 else {
35 desc = child->mm->context.ldt + seg; 36 desc = child->mm->context.ldt + seg;
36 base = ((desc[0] >> 16) | 37 base = get_desc_base(desc);
37 ((desc[1] & 0xff) << 16) |
38 (desc[1] & 0xff000000));
39 38
40 /* 16-bit code segment? */ 39 /* 16-bit code segment? */
41 if (!((desc[1] >> 22) & 1)) 40 if (!desc->d)
42 addr &= 0xffff; 41 addr &= 0xffff;
43 addr += base; 42 addr += base;
44 } 43 }
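/*
 * Illustrative sketch, not part of the patch: get_desc_base() and the
 * desc->d test used above replace the open-coded bit-fiddling that this
 * hunk removes.  Roughly (descriptor field names assumed to match
 * asm/desc_defs.h):
 */
#include <asm/desc.h>

static inline unsigned long sketch_desc_base(const struct desc_struct *desc)
{
        return desc->base0 |                        /* base bits  0..15 */
               ((unsigned long)desc->base1 << 16) | /* base bits 16..23 */
               ((unsigned long)desc->base2 << 24);  /* base bits 24..31 */
}

/* A 16-bit code segment is one with desc->d == 0 (bit 22 of the high dword). */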
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..45e00eb09c3a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,9 +18,9 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h> 19#include <asm/syscalls.h>
20 20
21asmlinkage long sys_mmap(unsigned long addr, unsigned long len, 21SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
22 unsigned long prot, unsigned long flags, 22 unsigned long, prot, unsigned long, flags,
23 unsigned long fd, unsigned long off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file; 26 struct file *file;
@@ -226,7 +226,7 @@ bottomup:
226} 226}
227 227
228 228
229asmlinkage long sys_uname(struct new_utsname __user *name) 229SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
230{ 230{
231 int err; 231 int err;
232 down_read(&uts_sem); 232 down_read(&uts_sem);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321ddafda..0157cd26d7cc 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,4 +335,4 @@ ENTRY(sys_call_table)
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open 338 .long sys_perf_event_open
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000000..86c9f91b48ae
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
1/*
2 * tboot.c: main implementation of helper functions used by kernel for
3 * runtime support of Intel(R) Trusted Execution Technology
4 *
5 * Copyright (c) 2006-2009, Intel Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 */
21
22#include <linux/dma_remapping.h>
23#include <linux/init_task.h>
24#include <linux/spinlock.h>
25#include <linux/delay.h>
26#include <linux/sched.h>
27#include <linux/init.h>
28#include <linux/dmar.h>
29#include <linux/cpu.h>
30#include <linux/pfn.h>
31#include <linux/mm.h>
32#include <linux/tboot.h>
33
34#include <asm/trampoline.h>
35#include <asm/processor.h>
36#include <asm/bootparam.h>
37#include <asm/pgtable.h>
38#include <asm/pgalloc.h>
39#include <asm/fixmap.h>
40#include <asm/proto.h>
41#include <asm/setup.h>
42#include <asm/e820.h>
43#include <asm/io.h>
44
45#include "acpi/realmode/wakeup.h"
46
47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly;
49
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1
52
53#undef pr_fmt
54#define pr_fmt(fmt) "tboot: " fmt
55
56static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
57
58void __init tboot_probe(void)
59{
60 /* Look for valid page-aligned address for shared page. */
61 if (!boot_params.tboot_addr)
62 return;
63 /*
64 * also verify that it is mapped as we expect it before calling
65 * set_fixmap(), to reduce chance of garbage value causing crash
66 */
67 if (!e820_any_mapped(boot_params.tboot_addr,
68 boot_params.tboot_addr, E820_RESERVED)) {
69 pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
70 return;
71 }
72
73 /* only a natively booted kernel should be using TXT */
74 if (paravirt_enabled()) {
75 pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
76 return;
77 }
78
79 /* Map and check for tboot UUID. */
80 set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
81 tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
82 if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
83 pr_warning("tboot at 0x%llx is invalid\n",
84 boot_params.tboot_addr);
85 tboot = NULL;
86 return;
87 }
88 if (tboot->version < 5) {
89 pr_warning("tboot version is invalid: %u\n", tboot->version);
90 tboot = NULL;
91 return;
92 }
93
94 pr_info("found shared page at phys addr 0x%llx:\n",
95 boot_params.tboot_addr);
96 pr_debug("version: %d\n", tboot->version);
97 pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
98 pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
99 pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
100 pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
101}
102
103static pgd_t *tboot_pg_dir;
104static struct mm_struct tboot_mm = {
105 .mm_rb = RB_ROOT,
106 .pgd = swapper_pg_dir,
107 .mm_users = ATOMIC_INIT(2),
108 .mm_count = ATOMIC_INIT(1),
109 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
110 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
111 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
112 .cpu_vm_mask = CPU_MASK_ALL,
113};
114
115static inline void switch_to_tboot_pt(void)
116{
117 write_cr3(virt_to_phys(tboot_pg_dir));
118}
119
120static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
121 pgprot_t prot)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = pgd_offset(&tboot_mm, vaddr);
129 pud = pud_alloc(&tboot_mm, pgd, vaddr);
130 if (!pud)
131 return -1;
132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
133 if (!pmd)
134 return -1;
135 pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
136 if (!pte)
137 return -1;
138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
139 pte_unmap(pte);
140 return 0;
141}
142
143static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
144 unsigned long nr)
145{
146 /* Reuse the original kernel mapping */
147 tboot_pg_dir = pgd_alloc(&tboot_mm);
148 if (!tboot_pg_dir)
149 return -1;
150
151 for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
152 if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
153 return -1;
154 }
155
156 return 0;
157}
158
159static void tboot_create_trampoline(void)
160{
161 u32 map_base, map_size;
162
163 /* Create identity map for tboot shutdown code. */
164 map_base = PFN_DOWN(tboot->tboot_base);
165 map_size = PFN_UP(tboot->tboot_size);
166 if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
167 panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
168 map_base, map_size);
169}
170
171#ifdef CONFIG_ACPI_SLEEP
172
173static void add_mac_region(phys_addr_t start, unsigned long size)
174{
175 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size;
177
178 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE);
181 mr->size = round_up(end, PAGE_SIZE) - mr->start;
182 }
183}
184
185static int tboot_setup_sleep(void)
186{
187 tboot->num_mac_regions = 0;
188
189 /* S3 resume code */
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
191
192#ifdef CONFIG_X86_TRAMPOLINE
193 /* AP trampoline code */
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201
202 return 0;
203}
204
205#else /* no CONFIG_ACPI_SLEEP */
206
207static int tboot_setup_sleep(void)
208{
209 /* S3 shutdown requested, but S3 not supported by the kernel... */
210 BUG();
211 return -1;
212}
213
214#endif
215
216void tboot_shutdown(u32 shutdown_type)
217{
218 void (*shutdown)(void);
219
220 if (!tboot_enabled())
221 return;
222
223 /*
224 * if we're being called before the 1:1 mapping is set up then just
225 * return and let the normal shutdown happen; this should only be
226 * due to very early panic()
227 */
228 if (!tboot_pg_dir)
229 return;
230
231 /* if this is S3 then set regions to MAC */
232 if (shutdown_type == TB_SHUTDOWN_S3)
233 if (tboot_setup_sleep())
234 return;
235
236 tboot->shutdown_type = shutdown_type;
237
238 switch_to_tboot_pt();
239
240 shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
241 shutdown();
242
243 /* should not reach here */
244 while (1)
245 halt();
246}
247
248static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
249{
250#define TB_COPY_GAS(tbg, g) \
251 tbg.space_id = g.space_id; \
252 tbg.bit_width = g.bit_width; \
253 tbg.bit_offset = g.bit_offset; \
254 tbg.access_width = g.access_width; \
255 tbg.address = g.address;
256
257 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
258 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
259 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
260 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
261
262 /*
263 * We need phys addr of waking vector, but can't use virt_to_phys() on
264 * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
265 * addr.
266 */
267 tboot->acpi_sinfo.wakeup_vector = fadt->facs +
268 offsetof(struct acpi_table_facs, firmware_waking_vector);
269}
270
271void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
272{
273 static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
274 /* S0,1,2: */ -1, -1, -1,
275 /* S3: */ TB_SHUTDOWN_S3,
276 /* S4: */ TB_SHUTDOWN_S4,
277 /* S5: */ TB_SHUTDOWN_S5 };
278
279 if (!tboot_enabled())
280 return;
281
282 tboot_copy_fadt(&acpi_gbl_FADT);
283 tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
284 tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
285 /* we always use the 32b wakeup vector */
286 tboot->acpi_sinfo.vector_width = 32;
287
288 if (sleep_state >= ACPI_S_STATE_COUNT ||
289 acpi_shutdown_map[sleep_state] == -1) {
290 pr_warning("unsupported sleep state 0x%x\n", sleep_state);
291 return;
292 }
293
294 tboot_shutdown(acpi_shutdown_map[sleep_state]);
295}
296
297static atomic_t ap_wfs_count;
298
299static int tboot_wait_for_aps(int num_aps)
300{
301 unsigned long timeout;
302
303 timeout = AP_WAIT_TIMEOUT*HZ;
304 while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
305 timeout) {
306 mdelay(1);
307 timeout--;
308 }
309
310 if (!timeout)
311 pr_warning("tboot wait for APs timeout\n");
312
313 return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
314}
315
316static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
317 unsigned long action, void *hcpu)
318{
319 switch (action) {
320 case CPU_DYING:
321 atomic_inc(&ap_wfs_count);
322 if (num_online_cpus() == 1)
323 if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
324 return NOTIFY_BAD;
325 break;
326 }
327 return NOTIFY_OK;
328}
329
330static struct notifier_block tboot_cpu_notifier __cpuinitdata =
331{
332 .notifier_call = tboot_cpu_callback,
333};
334
335static __init int tboot_late_init(void)
336{
337 if (!tboot_enabled())
338 return 0;
339
340 tboot_create_trampoline();
341
342 atomic_set(&ap_wfs_count, 0);
343 register_hotcpu_notifier(&tboot_cpu_notifier);
344 return 0;
345}
346
347late_initcall(tboot_late_init);
348
349/*
350 * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
351 */
352
353#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
354#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
355
356/* # pages for each config regs space - used by fixmap */
357#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
358 TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
359
360/* offsets from pub/priv config space */
361#define TXTCR_HEAP_BASE 0x0300
362#define TXTCR_HEAP_SIZE 0x0308
363
364#define SHA1_SIZE 20
365
366struct sha1_hash {
367 u8 hash[SHA1_SIZE];
368};
369
370struct sinit_mle_data {
371 u32 version; /* currently 6 */
372 struct sha1_hash bios_acm_id;
373 u32 edx_senter_flags;
374 u64 mseg_valid;
375 struct sha1_hash sinit_hash;
376 struct sha1_hash mle_hash;
377 struct sha1_hash stm_hash;
378 struct sha1_hash lcp_policy_hash;
379 u32 lcp_policy_control;
380 u32 rlp_wakeup_addr;
381 u32 reserved;
382 u32 num_mdrs;
383 u32 mdrs_off;
384 u32 num_vtd_dmars;
385 u32 vtd_dmars_off;
386} __packed;
387
388struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
389{
390 void *heap_base, *heap_ptr, *config;
391
392 if (!tboot_enabled())
393 return dmar_tbl;
394
395 /*
396 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
397 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
398 */
399
400 /* map config space in order to get heap addr */
401 config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
402 PAGE_SIZE);
403 if (!config)
404 return NULL;
405
406 /* now map TXT heap */
407 heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
408 *(u64 *)(config + TXTCR_HEAP_SIZE));
409 iounmap(config);
410 if (!heap_base)
411 return NULL;
412
413 /* walk heap to SinitMleData */
414 /* skip BiosData */
415 heap_ptr = heap_base + *(u64 *)heap_base;
416 /* skip OsMleData */
417 heap_ptr += *(u64 *)heap_ptr;
418 /* skip OsSinitData */
419 heap_ptr += *(u64 *)heap_ptr;
420 /* now points to SinitMleDataSize; set to SinitMleData */
421 heap_ptr += sizeof(u64);
422 /* get addr of DMAR table */
423 dmar_tbl = (struct acpi_table_header *)(heap_ptr +
424 ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
425 sizeof(u64));
426
427 /* don't unmap heap because dmar.c needs access to this */
428
429 return dmar_tbl;
430}
431
432int tboot_force_iommu(void)
433{
434 if (!tboot_enabled())
435 return 0;
436
437 if (no_iommu || swiotlb || dmar_disabled)
438 pr_warning("Forcing Intel-IOMMU to enabled\n");
439
440 dmar_disabled = 0;
441#ifdef CONFIG_SWIOTLB
442 swiotlb = 0;
443#endif
444 no_iommu = 0;
445
446 return 1;
447}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 000000000000..dcb00d278512
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,120 @@
1/*
2 * Copyright (c) 1991,1992,1995 Linus Torvalds
3 * Copyright (c) 1994 Alan Modra
4 * Copyright (c) 1995 Markus Kuhn
5 * Copyright (c) 1996 Ingo Molnar
6 * Copyright (c) 1998 Andrea Arcangeli
7 * Copyright (c) 2002,2006 Vojtech Pavlik
8 * Copyright (c) 2003 Andi Kleen
9 *
10 */
11
12#include <linux/clockchips.h>
13#include <linux/interrupt.h>
14#include <linux/time.h>
15#include <linux/mca.h>
16
17#include <asm/vsyscall.h>
18#include <asm/x86_init.h>
19#include <asm/i8259.h>
20#include <asm/i8253.h>
21#include <asm/timer.h>
22#include <asm/hpet.h>
23#include <asm/time.h>
24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
31#endif
32
33unsigned long profile_pc(struct pt_regs *regs)
34{
35 unsigned long pc = instruction_pointer(regs);
36
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp = (unsigned long *)regs->sp;
42 /*
43 * Return address is either directly at stack pointer
44 * or above a saved flags. Eflags has bits 22-31 zero,
45 * kernel addresses don't.
46 */
47 if (sp[0] >> 22)
48 return sp[0];
49 if (sp[1] >> 22)
50 return sp[1];
51#endif
52 }
53 return pc;
54}
55EXPORT_SYMBOL(profile_pc);
56
57/*
58 * Default timer interrupt handler for PIT/HPET
59 */
60static irqreturn_t timer_interrupt(int irq, void *dev_id)
61{
62 /* Keep nmi watchdog up to date */
63 inc_irq_stat(irq0_irqs);
64
65 /* Optimized out for !IO_APIC and x86_64 */
66 if (timer_ack) {
67 /*
68 * Subtle, when I/O APICs are used we have to ack timer IRQ
69 * manually to deassert NMI lines for the watchdog if run
70 * on an 82489DX-based system.
71 */
72 spin_lock(&i8259A_lock);
73 outb(0x0c, PIC_MASTER_OCW3);
74 /* Ack the IRQ; AEOI will end it automatically. */
75 inb(PIC_MASTER_POLL);
76 spin_unlock(&i8259A_lock);
77 }
78
79 global_clock_event->event_handler(global_clock_event);
80
81 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
82 if (MCA_bus)
83 outb_p(inb_p(0x61) | 0x80, 0x61);
84
85 return IRQ_HANDLED;
86}
87
88static struct irqaction irq0 = {
89 .handler = timer_interrupt,
90 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
91 .name = "timer"
92};
93
94void __init setup_default_timer_irq(void)
95{
96 setup_irq(0, &irq0);
97}
98
99/* Default timer init function */
100void __init hpet_time_init(void)
101{
102 if (!hpet_enable())
103 setup_pit_timer();
104 setup_default_timer_irq();
105}
106
107static __init void x86_late_time_init(void)
108{
109 x86_init.timers.timer_init();
110 tsc_init();
111}
112
113/*
114 * Initialize TSC and delay the periodic timer init to
115 * late x86_late_time_init() so ioremap works.
116 */
117void __init time_init(void)
118{
119 late_time_init = x86_late_time_init;
120}
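/*
 * Illustrative sketch, not part of the new file above: the old
 * choose_time_init()/x86_quirk_* hooks are gone, so a platform that needs a
 * different timer setup now replaces x86_init.timers.timer_init instead (and
 * can reuse setup_default_timer_irq() from the file above).  The demo_* names
 * are hypothetical.
 */
#include <linux/init.h>
#include <asm/x86_init.h>
#include <asm/time.h>

static void __init demo_timer_init(void)
{
        /* program the platform's clock event device here */
        setup_default_timer_irq();
}

static void __init demo_time_setup(void)
{
        x86_init.timers.timer_init = demo_timer_init;
}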
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
deleted file mode 100644
index 5c5d87f0b2e1..000000000000
--- a/arch/x86/kernel/time_32.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
3 *
4 * This file contains the PC-specific time handling details:
5 * reading the RTC at bootup, etc..
6 * 1994-07-02 Alan Modra
7 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
8 * 1995-03-26 Markus Kuhn
9 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
10 * precision CMOS clock update
11 * 1996-05-03 Ingo Molnar
12 * fixed time warps in do_[slow|fast]_gettimeoffset()
13 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
14 * "A Kernel Model for Precision Timekeeping" by Dave Mills
15 * 1998-09-05 (Various)
16 * More robust do_fast_gettimeoffset() algorithm implemented
17 * (works with APM, Cyrix 6x86MX and Centaur C6),
18 * monotonic gettimeofday() with fast_get_timeoffset(),
19 * drift-proof precision TSC calibration on boot
20 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
21 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
22 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
23 * 1998-12-16 Andrea Arcangeli
24 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
25 * because was not accounting lost_ticks.
26 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
27 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
28 * serialize accesses to xtime/lost_ticks).
29 */
30
31#include <linux/init.h>
32#include <linux/interrupt.h>
33#include <linux/time.h>
34#include <linux/mca.h>
35
36#include <asm/setup.h>
37#include <asm/hpet.h>
38#include <asm/time.h>
39#include <asm/timer.h>
40
41#include <asm/do_timer.h>
42
43int timer_ack;
44
45unsigned long profile_pc(struct pt_regs *regs)
46{
47 unsigned long pc = instruction_pointer(regs);
48
49#ifdef CONFIG_SMP
50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
51#ifdef CONFIG_FRAME_POINTER
52 return *(unsigned long *)(regs->bp + sizeof(long));
53#else
54 unsigned long *sp = (unsigned long *)&regs->sp;
55
56 /* Return address is either directly at stack pointer
57 or above a saved flags. Eflags has bits 22-31 zero,
58 kernel addresses don't. */
59 if (sp[0] >> 22)
60 return sp[0];
61 if (sp[1] >> 22)
62 return sp[1];
63#endif
64 }
65#endif
66 return pc;
67}
68EXPORT_SYMBOL(profile_pc);
69
70/*
71 * This is the same as the above, except we _also_ save the current
72 * Time Stamp Counter value at the time of the timer interrupt, so that
73 * we later on can estimate the time of day more exactly.
74 */
75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{
77 /* Keep nmi watchdog up to date */
78 inc_irq_stat(irq0_irqs);
79
80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) {
82 /*
83 * Subtle, when I/O APICs are used we have to ack timer IRQ
84 * manually to deassert NMI lines for the watchdog if run
85 * on an 82489DX-based system.
86 */
87 spin_lock(&i8259A_lock);
88 outb(0x0c, PIC_MASTER_OCW3);
89 /* Ack the IRQ; AEOI will end it automatically. */
90 inb(PIC_MASTER_POLL);
91 spin_unlock(&i8259A_lock);
92 }
93#endif
94
95 do_timer_interrupt_hook();
96
97#ifdef CONFIG_MCA
98 if (MCA_bus) {
99 /* The PS/2 uses level-triggered interrupts. You can't
100 turn them off, nor would you want to (any attempt to
101 enable edge-triggered interrupts usually gets intercepted by a
102 special hardware circuit). Hence we have to acknowledge
103 the timer interrupt. Through some incredibly stupid
104 design idea, the reset for IRQ 0 is done by setting the
105 high bit of the PPI port B (0x61). Note that some PS/2s,
106 notably the 55SX, work fine if this is removed. */
107
108 u8 irq_v = inb_p(0x61); /* read the current state */
109 outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
110 }
111#endif
112
113 return IRQ_HANDLED;
114}
115
116/* Duplicate of time_init() below, with hpet_enable part added */
117void __init hpet_time_init(void)
118{
119 if (!hpet_enable())
120 setup_pit_timer();
121 x86_quirk_time_init();
122}
123
124/*
125 * This is called directly from init code; we must delay timer setup in the
126 * HPET case as we can't make the decision to turn on HPET this early in the
127 * boot process.
128 *
129 * The chosen time_init function will usually be hpet_time_init, above, but
130 * in the case of virtual hardware, an alternative function may be substituted.
131 */
132void __init time_init(void)
133{
134 x86_quirk_pre_time_init();
135 tsc_init();
136 late_time_init = choose_time_init();
137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
deleted file mode 100644
index 5ba343e61844..000000000000
--- a/arch/x86/kernel/time_64.c
+++ /dev/null
@@ -1,135 +0,0 @@
1/*
2 * "High Precision Event Timer" based timekeeping.
3 *
4 * Copyright (c) 1991,1992,1995 Linus Torvalds
5 * Copyright (c) 1994 Alan Modra
6 * Copyright (c) 1995 Markus Kuhn
7 * Copyright (c) 1996 Ingo Molnar
8 * Copyright (c) 1998 Andrea Arcangeli
9 * Copyright (c) 2002,2006 Vojtech Pavlik
10 * Copyright (c) 2003 Andi Kleen
11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
12 */
13
14#include <linux/clockchips.h>
15#include <linux/init.h>
16#include <linux/interrupt.h>
17#include <linux/module.h>
18#include <linux/time.h>
19#include <linux/mca.h>
20#include <linux/nmi.h>
21
22#include <asm/i8253.h>
23#include <asm/hpet.h>
24#include <asm/vgtod.h>
25#include <asm/time.h>
26#include <asm/timer.h>
27
28volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
29
30unsigned long profile_pc(struct pt_regs *regs)
31{
32 unsigned long pc = instruction_pointer(regs);
33
34 /* Assume the lock function has either no stack frame or a copy
35 of flags from PUSHF
36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp = (unsigned long *)regs->sp;
42 if (sp[0] >> 22)
43 return sp[0];
44 if (sp[1] >> 22)
45 return sp[1];
46#endif
47 }
48 return pc;
49}
50EXPORT_SYMBOL(profile_pc);
51
52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{
54 inc_irq_stat(irq0_irqs);
55
56 global_clock_event->event_handler(global_clock_event);
57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
65 return IRQ_HANDLED;
66}
67
68/* calibrate_cpu is used on systems with fixed rate TSCs to determine
69 * processor frequency */
70#define TICK_COUNT 100000000
71unsigned long __init calibrate_cpu(void)
72{
73 int tsc_start, tsc_now;
74 int i, no_ctr_free;
75 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
76 unsigned long flags;
77
78 for (i = 0; i < 4; i++)
79 if (avail_to_resrv_perfctr_nmi_bit(i))
80 break;
81 no_ctr_free = (i == 4);
82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
85 i = 3;
86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
87 wrmsrl(MSR_K7_EVNTSEL3, 0);
88 rdmsrl(MSR_K7_PERFCTR3, pmc3);
89 } else {
90 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
91 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
92 }
93 local_irq_save(flags);
94 /* start measuring cycles, incrementing from 0 */
95 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
96 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
97 rdtscl(tsc_start);
98 do {
99 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
100 tsc_now = get_cycles();
101 } while ((tsc_now - tsc_start) < TICK_COUNT);
102
103 local_irq_restore(flags);
104 if (no_ctr_free) {
105 wrmsrl(MSR_K7_EVNTSEL3, 0);
106 wrmsrl(MSR_K7_PERFCTR3, pmc3);
107 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
108 } else {
109 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
110 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
111 }
112
113 return pmc_now * tsc_khz / (tsc_now - tsc_start);
114}
115
116static struct irqaction irq0 = {
117 .handler = timer_interrupt,
118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
119 .name = "timer"
120};
121
122void __init hpet_time_init(void)
123{
124 if (!hpet_enable())
125 setup_pit_timer();
126
127 setup_irq(0, &irq0);
128}
129
130void __init time_init(void)
131{
132 tsc_init();
133
134 late_time_init = choose_time_init();
135}
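
Note that the 64-bit timekeeping file is removed rather than rewritten: hpet_time_init() and the timer irqaction survive as the default behind the new x86_init.timers.timer_init hook, and the AMD fixed-rate-TSC calibrate_cpu() helper reappears almost verbatim in arch/x86/kernel/tsc.c further down. The calibration itself is only a ratio: count unhalted core cycles in a performance counter while the TSC advances by a known amount, then scale by the TSC frequency. A standalone sketch of that arithmetic with made-up sample numbers (in the kernel the real values come from the MSR_K7_PERFCTRn counters and RDTSC):

#include <stdio.h>

int main(void)
{
	/* Illustrative values only; the kernel reads these from the
	 * K7 performance counter MSRs and the TSC.                  */
	unsigned long long tsc_khz   = 2200000ULL;	/* assumed TSC rate: 2.2 GHz   */
	unsigned long long tsc_delta = 100000000ULL;	/* the TICK_COUNT window       */
	unsigned long long pmc_delta = 127272727ULL;	/* core cycles in that window  */

	unsigned long long cpu_khz = pmc_delta * tsc_khz / tsc_delta;

	printf("estimated cpu_khz = %llu\n", cpu_khz);	/* ~2800000 with these numbers */
	return 0;
}
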
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 77b9689f8edb..503c1f2e8835 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -640,13 +640,13 @@ static int __init uv_ptc_init(void)
 	if (!is_uv_system())
 		return 0;
 
-	proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
+	proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
+				  &proc_uv_ptc_operations);
 	if (!proc_uv_ptc) {
 		printk(KERN_ERR "unable to create %s proc entry\n",
 		       UV_PTC_BASENAME);
 		return -EINVAL;
 	}
-	proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
 	return 0;
 }
 
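
The hunk above is the standard conversion from create_proc_entry() plus a later ->proc_fops assignment to proc_create(), which publishes the entry with its file_operations already attached and so closes the window where the entry is visible without them. A minimal sketch of the same pattern in module form (the entry name, show routine and module boilerplate here are illustrative, not part of this patch):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello from /proc\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct file_operations example_proc_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_init(void)
{
	/* fops are set atomically with creation, which is the point of
	 * the tlb_uv.c change above.                                  */
	if (!proc_create("example", 0444, NULL, &example_proc_fops))
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	remove_proc_entry("example", NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
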
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 808031a5ba19..699f7eeb896a 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -4,7 +4,7 @@
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6/* ready for x86_64 and x86 */ 6/* ready for x86_64 and x86 */
7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 7unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE);
8 8
9void __init reserve_trampoline_memory(void) 9void __init reserve_trampoline_memory(void)
10{ 10{
@@ -26,7 +26,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 26 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 27 * has made sure it's suitably aligned.
28 */ 28 */
29unsigned long setup_trampoline(void) 29unsigned long __cpuinit setup_trampoline(void)
30{ 30{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 32 return virt_to_phys(trampoline_base);
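
The __cpuinit/__cpuinitdata annotations added here (and the __CPUINITRODATA section macro used in the two trampoline_*.S hunks below) come from <linux/init.h>: they move CPU bring-up code and data into sections that are discarded after boot when CONFIG_HOTPLUG_CPU is not set, instead of open-coding the #ifdef section selection. A short hedged sketch of how the annotations are typically applied (the function and array are illustrative):

#include <linux/init.h>
#include <linux/kernel.h>

/* Data only needed while bringing secondary CPUs up; freed after boot
 * unless CPU hotplug keeps the bring-up path resident.               */
static unsigned char __cpuinitdata example_boot_scratch[64];

/* Likewise for code that is only reachable from CPU bring-up.        */
void __cpuinit example_prepare_cpu(int cpu)
{
	pr_info("preparing CPU %d, scratch at %p\n",
		cpu, example_boot_scratch);
}
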
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 66d874e5404c..8508237e8e43 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -28,16 +28,12 @@
28 */ 28 */
29 29
30#include <linux/linkage.h> 30#include <linux/linkage.h>
31#include <linux/init.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/page_types.h> 33#include <asm/page_types.h>
33 34
34/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35/* We can free up trampoline after bootup if cpu hotplug is not supported. */
35#ifndef CONFIG_HOTPLUG_CPU 36__CPUINITRODATA
36.section ".cpuinit.data","aw",@progbits
37#else
38.section .rodata,"a",@progbits
39#endif
40
41.code16 37.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index cddfb8d386b9..596d54c660a5 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,14 +25,15 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/init.h>
28#include <asm/pgtable_types.h> 29#include <asm/pgtable_types.h>
29#include <asm/page_types.h> 30#include <asm/page_types.h>
30#include <asm/msr.h> 31#include <asm/msr.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
33 34
34.section .rodata, "a", @progbits 35/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
35 36__CPUINITRODATA
36.code16 37.code16
37 38
38ENTRY(trampoline_data) 39ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5204332f475d..a665c71352b8 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -14,7 +14,6 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/kprobes.h> 15#include <linux/kprobes.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/utsname.h>
18#include <linux/kdebug.h> 17#include <linux/kdebug.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/module.h> 19#include <linux/module.h>
@@ -59,12 +58,12 @@
59#include <asm/mach_traps.h> 58#include <asm/mach_traps.h>
60 59
61#ifdef CONFIG_X86_64 60#ifdef CONFIG_X86_64
61#include <asm/x86_init.h>
62#include <asm/pgalloc.h> 62#include <asm/pgalloc.h>
63#include <asm/proto.h> 63#include <asm/proto.h>
64#else 64#else
65#include <asm/processor-flags.h> 65#include <asm/processor-flags.h>
66#include <asm/setup.h> 66#include <asm/setup.h>
67#include <asm/traps.h>
68 67
69asmlinkage int system_call(void); 68asmlinkage int system_call(void);
70 69
@@ -76,7 +75,7 @@ char ignore_fpu_irq;
76 * F0 0F bug workaround.. We have a special link segment 75 * F0 0F bug workaround.. We have a special link segment
77 * for this. 76 * for this.
78 */ 77 */
79gate_desc idt_table[256] 78gate_desc idt_table[NR_VECTORS]
80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
81#endif 80#endif
82 81
@@ -786,33 +785,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
786#endif 785#endif
787} 786}
788 787
789#ifdef CONFIG_X86_32 788asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
790unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
791{ 789{
792 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
793 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
794 unsigned long new_kesp = kesp - base;
795 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
796 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
797
798 /* Set up base for espfix segment */
799 desc &= 0x00f0ff0000000000ULL;
800 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
801 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
802 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
803 (lim_pages & 0xffff);
804 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
805
806 return new_kesp;
807} 790}
808#endif
809 791
810asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 792asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
811{ 793{
812} 794}
813 795
814asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) 796/*
797 * __math_state_restore assumes that cr0.TS is already clear and the
798 * fpu state is all ready for use. Used during context switch.
799 */
800void __math_state_restore(void)
815{ 801{
802 struct thread_info *thread = current_thread_info();
803 struct task_struct *tsk = thread->task;
804
805 /*
806 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
807 */
808 if (unlikely(restore_fpu_checking(tsk))) {
809 stts();
810 force_sig(SIGSEGV, tsk);
811 return;
812 }
813
814 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
815 tsk->fpu_counter++;
816} 816}
817 817
818/* 818/*
@@ -846,17 +846,8 @@ asmlinkage void math_state_restore(void)
846 } 846 }
847 847
848 clts(); /* Allow maths ops (or we recurse) */ 848 clts(); /* Allow maths ops (or we recurse) */
849 /*
850 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
851 */
852 if (unlikely(restore_fpu_checking(tsk))) {
853 stts();
854 force_sig(SIGSEGV, tsk);
855 return;
856 }
857 849
858 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 850 __math_state_restore();
859 tsk->fpu_counter++;
860} 851}
861EXPORT_SYMBOL_GPL(math_state_restore); 852EXPORT_SYMBOL_GPL(math_state_restore);
862 853
@@ -980,7 +971,5 @@ void __init trap_init(void)
980 */ 971 */
981 cpu_init(); 972 cpu_init();
982 973
983#ifdef CONFIG_X86_32 974 x86_init.irqs.trap_init();
984 x86_quirk_trap_init();
985#endif
986} 975}
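
Two things are worth noting in the traps.c hunk: the 32-bit espfix descriptor patching leaves this file, and the thermal/threshold interrupt stubs rely on weak linkage so a vendor-specific file can supply the real handler without an #ifdef at the call site; math_state_restore() is also split so that the context-switch path, which has already cleared CR0.TS, can call __math_state_restore() directly. The weak-symbol idiom on its own, as a small standalone sketch (GCC attribute, names illustrative):

#include <stdio.h>

/* Default stub: used only when no other translation unit provides a
 * strong definition, mirroring the smp_thermal_interrupt() stub.     */
void __attribute__((weak)) thermal_handler(void)
{
	printf("no thermal handler wired up\n");
}

int main(void)
{
	thermal_handler();	/* resolves to the stub unless overridden */
	return 0;
}
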
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 71f4368b357e..cd982f48e23e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,6 +17,8 @@
17#include <asm/time.h> 17#include <asm/time.h>
18#include <asm/delay.h> 18#include <asm/delay.h>
19#include <asm/hypervisor.h> 19#include <asm/hypervisor.h>
20#include <asm/nmi.h>
21#include <asm/x86_init.h>
20 22
21unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ 23unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
22EXPORT_SYMBOL(cpu_khz); 24EXPORT_SYMBOL(cpu_khz);
@@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void)
400{ 402{
401 u64 tsc1, tsc2, delta, ref1, ref2; 403 u64 tsc1, tsc2, delta, ref1, ref2;
402 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 404 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
403 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; 405 unsigned long flags, latch, ms, fast_calibrate;
404 int hpet = is_hpet_enabled(), i, loopmin; 406 int hpet = is_hpet_enabled(), i, loopmin;
405 407
406 hv_tsc_khz = get_hypervisor_tsc_freq();
407 if (hv_tsc_khz) {
408 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
409 return hv_tsc_khz;
410 }
411
412 local_irq_save(flags); 408 local_irq_save(flags);
413 fast_calibrate = quick_pit_calibrate(); 409 fast_calibrate = quick_pit_calibrate();
414 local_irq_restore(flags); 410 local_irq_restore(flags);
@@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void)
566 unsigned long cpu_khz_old = cpu_khz; 562 unsigned long cpu_khz_old = cpu_khz;
567 563
568 if (cpu_has_tsc) { 564 if (cpu_has_tsc) {
569 tsc_khz = calibrate_tsc(); 565 tsc_khz = x86_platform.calibrate_tsc();
570 cpu_khz = tsc_khz; 566 cpu_khz = tsc_khz;
571 cpu_data(0).loops_per_jiffy = 567 cpu_data(0).loops_per_jiffy =
572 cpufreq_scale(cpu_data(0).loops_per_jiffy, 568 cpufreq_scale(cpu_data(0).loops_per_jiffy,
@@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
670 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || 666 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
671 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || 667 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
672 (val == CPUFREQ_RESUMECHANGE)) { 668 (val == CPUFREQ_RESUMECHANGE)) {
673 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); 669 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
674 670
675 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 671 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
676 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) 672 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
@@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void)
744} 740}
745#endif 741#endif
746 742
743static void resume_tsc(void)
744{
745 clocksource_tsc.cycle_last = 0;
746}
747
747static struct clocksource clocksource_tsc = { 748static struct clocksource clocksource_tsc = {
748 .name = "tsc", 749 .name = "tsc",
749 .rating = 300, 750 .rating = 300,
750 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc,
751 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
752 .shift = 22, 754 .shift = 22,
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 755 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason)
761{ 763{
762 if (!tsc_unstable) { 764 if (!tsc_unstable) {
763 tsc_unstable = 1; 765 tsc_unstable = 1;
764 printk("Marking TSC unstable due to %s\n", reason); 766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
765 /* Change only the rating, when not registered */ 767 /* Change only the rating, when not registered */
766 if (clocksource_tsc.mult) 768 if (clocksource_tsc.mult)
767 clocksource_change_rating(&clocksource_tsc, 0); 769 clocksource_mark_unstable(&clocksource_tsc);
768 else 770 else {
771 clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
769 clocksource_tsc.rating = 0; 772 clocksource_tsc.rating = 0;
773 }
770 } 774 }
771} 775}
772 776
@@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void)
852 clocksource_register(&clocksource_tsc); 856 clocksource_register(&clocksource_tsc);
853} 857}
854 858
859#ifdef CONFIG_X86_64
860/*
861 * calibrate_cpu is used on systems with fixed rate TSCs to determine
862 * processor frequency
863 */
864#define TICK_COUNT 100000000
865static unsigned long __init calibrate_cpu(void)
866{
867 int tsc_start, tsc_now;
868 int i, no_ctr_free;
869 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
870 unsigned long flags;
871
872 for (i = 0; i < 4; i++)
873 if (avail_to_resrv_perfctr_nmi_bit(i))
874 break;
875 no_ctr_free = (i == 4);
876 if (no_ctr_free) {
877 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
878 "cpu_khz value may be incorrect.\n");
879 i = 3;
880 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
881 wrmsrl(MSR_K7_EVNTSEL3, 0);
882 rdmsrl(MSR_K7_PERFCTR3, pmc3);
883 } else {
884 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
885 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
886 }
887 local_irq_save(flags);
888 /* start measuring cycles, incrementing from 0 */
889 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
890 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
891 rdtscl(tsc_start);
892 do {
893 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
894 tsc_now = get_cycles();
895 } while ((tsc_now - tsc_start) < TICK_COUNT);
896
897 local_irq_restore(flags);
898 if (no_ctr_free) {
899 wrmsrl(MSR_K7_EVNTSEL3, 0);
900 wrmsrl(MSR_K7_PERFCTR3, pmc3);
901 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
902 } else {
903 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
904 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
905 }
906
907 return pmc_now * tsc_khz / (tsc_now - tsc_start);
908}
909#else
910static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
911#endif
912
855void __init tsc_init(void) 913void __init tsc_init(void)
856{ 914{
857 u64 lpj; 915 u64 lpj;
858 int cpu; 916 int cpu;
859 917
918 x86_init.timers.tsc_pre_init();
919
860 if (!cpu_has_tsc) 920 if (!cpu_has_tsc)
861 return; 921 return;
862 922
863 tsc_khz = calibrate_tsc(); 923 tsc_khz = x86_platform.calibrate_tsc();
864 cpu_khz = tsc_khz; 924 cpu_khz = tsc_khz;
865 925
866 if (!tsc_khz) { 926 if (!tsc_khz) {
@@ -868,11 +928,9 @@ void __init tsc_init(void)
868 return; 928 return;
869 } 929 }
870 930
871#ifdef CONFIG_X86_64
872 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && 931 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
873 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) 932 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
874 cpu_khz = calibrate_cpu(); 933 cpu_khz = calibrate_cpu();
875#endif
876 934
877 printk("Detected %lu.%03lu MHz processor.\n", 935 printk("Detected %lu.%03lu MHz processor.\n",
878 (unsigned long)cpu_khz / 1000, 936 (unsigned long)cpu_khz / 1000,
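
With this hunk, TSC calibration goes through x86_platform.calibrate_tsc rather than a direct call, and the old hypervisor shortcut inside native_calibrate_tsc() is dropped: a hypervisor or platform that already knows the frequency now overrides the hook instead of patching the native path, as the vmi_32.c hunk below does with vmi_tsc_khz. A rough sketch of the override, assuming the 2.6.32-era hook signature (unsigned long in kHz) and a hypothetical platform:

#include <linux/init.h>
#include <asm/x86_init.h>
#include <asm/tsc.h>

/* Hypothetical platform that learns the TSC frequency from firmware
 * or a hypervisor interface instead of PIT/HPET calibration.         */
static unsigned long example_calibrate_tsc(void)
{
	return 2400000;		/* kHz, assumed to be reported directly */
}

void __init example_platform_setup(void)
{
	/* tsc_init() will now call this instead of native_calibrate_tsc(). */
	x86_platform.calibrate_tsc = example_calibrate_tsc;
}
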
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 31ffc24eec4d..f068553a1b17 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -30,6 +30,7 @@
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/e820.h> 32#include <asm/e820.h>
33#include <asm/time.h>
33#include <asm/io.h> 34#include <asm/io.h>
34 35
35#include <linux/kernel_stat.h> 36#include <linux/kernel_stat.h>
@@ -53,7 +54,7 @@ int is_visws_box(void)
53 return visws_board_type >= 0; 54 return visws_board_type >= 0;
54} 55}
55 56
56static int __init visws_time_init(void) 57static void __init visws_time_init(void)
57{ 58{
58 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 59 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
59 60
@@ -66,21 +67,13 @@ static int __init visws_time_init(void)
66 /* Enable (unmask) the timer interrupt */ 67 /* Enable (unmask) the timer interrupt */
67 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); 68 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
68 69
69 /* 70 setup_default_timer_irq();
70 * Zero return means the generic timer setup code will set up
71 * the standard vector:
72 */
73 return 0;
74} 71}
75 72
76static int __init visws_pre_intr_init(void) 73/* Replaces the default init_ISA_irqs in the generic setup */
74static void __init visws_pre_intr_init(void)
77{ 75{
78 init_VISWS_APIC_irqs(); 76 init_VISWS_APIC_irqs();
79
80 /*
81 * We dont want ISA irqs to be set up by the generic code:
82 */
83 return 1;
84} 77}
85 78
86/* Quirk for machine specific memory setup. */ 79/* Quirk for machine specific memory setup. */
@@ -156,12 +149,8 @@ static void visws_machine_power_off(void)
156 outl(PIIX_SPECIAL_STOP, 0xCFC); 149 outl(PIIX_SPECIAL_STOP, 0xCFC);
157} 150}
158 151
159static int __init visws_get_smp_config(unsigned int early) 152static void __init visws_get_smp_config(unsigned int early)
160{ 153{
161 /*
162 * Prevent MP-table parsing by the generic code:
163 */
164 return 1;
165} 154}
166 155
167/* 156/*
@@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
208 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
209} 198}
210 199
211static int __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(unsigned int reserve)
212{ 201{
213 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
214 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve)
230 MP_processor_info(mp++); 219 MP_processor_info(mp++);
231 220
232 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 221 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
233
234 return 1;
235} 222}
236 223
237static int visws_trap_init(void); 224static void visws_trap_init(void);
238
239static struct x86_quirks visws_x86_quirks __initdata = {
240 .arch_time_init = visws_time_init,
241 .arch_pre_intr_init = visws_pre_intr_init,
242 .arch_memory_setup = visws_memory_setup,
243 .arch_intr_init = NULL,
244 .arch_trap_init = visws_trap_init,
245 .mach_get_smp_config = visws_get_smp_config,
246 .mach_find_smp_config = visws_find_smp_config,
247};
248 225
249void __init visws_early_detect(void) 226void __init visws_early_detect(void)
250{ 227{
@@ -257,11 +234,14 @@ void __init visws_early_detect(void)
 		return;
 
 	/*
-	 * Install special quirks for timer, interrupt and memory setup:
-	 * Fall back to generic behavior for traps:
-	 * Override generic MP-table parsing:
+	 * Override the default platform setup functions
 	 */
-	x86_quirks = &visws_x86_quirks;
+	x86_init.resources.memory_setup = visws_memory_setup;
+	x86_init.mpparse.get_smp_config = visws_get_smp_config;
+	x86_init.mpparse.find_smp_config = visws_find_smp_config;
+	x86_init.irqs.pre_vector_init = visws_pre_intr_init;
+	x86_init.irqs.trap_init = visws_trap_init;
+	x86_init.timers.timer_init = visws_time_init;
 
 	/*
 	 * Install reboot quirks:
@@ -400,12 +380,10 @@ static __init void cobalt_init(void)
400 co_apic_read(CO_APIC_ID)); 380 co_apic_read(CO_APIC_ID));
401} 381}
402 382
403static int __init visws_trap_init(void) 383static void __init visws_trap_init(void)
404{ 384{
405 lithium_init(); 385 lithium_init();
406 cobalt_init(); 386 cobalt_init();
407
408 return 1;
409} 387}
410 388
411/* 389/*
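
This file shows the pattern the x86_quirks removal is driving at: a platform detects itself early and overwrites only the x86_init/x86_platform function pointers it needs, instead of registering a monolithic quirks structure whose hooks signal intent through magic return values (the old "return 1 to suppress generic setup" convention disappears along with the int return types). A condensed sketch of the idiom for a hypothetical platform (names illustrative):

#include <linux/init.h>
#include <asm/x86_init.h>

static void __init example_time_init(void)
{
	/* program the board's timer, then set up IRQ0 as usual */
}

static void __init example_trap_init(void)
{
	/* board-specific interrupt controller / trap bring-up */
}

void __init example_early_detect(void)
{
	/* override only what differs from the PC defaults */
	x86_init.timers.timer_init = example_time_init;
	x86_init.irqs.trap_init    = example_trap_init;
}
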
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95a7289e4b0c..31e6f6cfe53e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -817,15 +817,15 @@ static inline int __init activate_vmi(void)
 	vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
 	vmi_timer_ops.cancel_alarm =
 		vmi_get_function(VMI_CALL_CancelAlarm);
-	pv_time_ops.time_init = vmi_time_init;
-	pv_time_ops.get_wallclock = vmi_get_wallclock;
-	pv_time_ops.set_wallclock = vmi_set_wallclock;
+	x86_init.timers.timer_init = vmi_time_init;
 #ifdef CONFIG_X86_LOCAL_APIC
-	pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
-	pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
+	x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
+	x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
 #endif
 	pv_time_ops.sched_clock = vmi_sched_clock;
-	pv_time_ops.get_tsc_khz = vmi_tsc_khz;
+	x86_platform.calibrate_tsc = vmi_tsc_khz;
+	x86_platform.get_wallclock = vmi_get_wallclock;
+	x86_platform.set_wallclock = vmi_set_wallclock;
 
 	/* We have true wallclock functions; disable CMOS clock sync */
 	no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 2b3eb82efeeb..611b9e2360d3 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void)
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); 68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69} 69}
70 70
71/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ 71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void) 72unsigned long vmi_tsc_khz(void)
73{ 73{
74 unsigned long long khz; 74 unsigned long long khz;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9fc178255c04..a46acccec38a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -45,9 +45,9 @@ PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64 47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */ 48 user PT_LOAD FLAGS(5); /* R_E */
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 percpu PT_LOAD FLAGS(7); /* RWE */ 50 percpu PT_LOAD FLAGS(6); /* RW_ */
51#endif 51#endif
52 init PT_LOAD FLAGS(7); /* RWE */ 52 init PT_LOAD FLAGS(7); /* RWE */
53#endif 53#endif
@@ -348,15 +348,12 @@ SECTIONS
348 _end = .; 348 _end = .;
349 } 349 }
350 350
351 /* Sections to be discarded */
352 /DISCARD/ : {
353 *(.exitcall.exit)
354 *(.eh_frame)
355 *(.discard)
356 }
357
358 STABS_DEBUG 351 STABS_DEBUG
359 DWARF_DEBUG 352 DWARF_DEBUG
353
354 /* Sections to be discarded */
355 DISCARDS
356 /DISCARD/ : { *(.eh_frame) }
360} 357}
361 358
362 359
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 25ee06a80aad..8cb4974ff599 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
89 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 89 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
90 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
90 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 91 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
91} 92}
92 93
@@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
227} 228}
228 229
229#ifdef CONFIG_SYSCTL 230#ifdef CONFIG_SYSCTL
230
231static int
232vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
233 void __user *buffer, size_t *lenp, loff_t *ppos)
234{
235 return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
236}
237
238static ctl_table kernel_table2[] = { 231static ctl_table kernel_table2[] = {
239 { .procname = "vsyscall64", 232 { .procname = "vsyscall64",
240 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), 233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
241 .mode = 0644, 234 .mode = 0644,
242 .proc_handler = vsyscall_sysctl_change }, 235 .proc_handler = proc_dointvec },
243 {} 236 {}
244}; 237};
245 238
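
The vsyscall_sysctl_change() wrapper added nothing over proc_dointvec(), so the table now points at the generic handler directly; update_vsyscall() additionally snapshots a coarse wall time for the low-resolution time interfaces. A minimal sketch of a sysctl entry wired straight to proc_dointvec(), mirroring the style of kernel_table2 above (the knob and table names are hypothetical):

#include <linux/sysctl.h>
#include <linux/init.h>

static int example_enabled;	/* exposed as an integer knob under /proc/sys */

static ctl_table example_table[] = {
	{
		.procname	= "example_enabled",
		.data		= &example_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,	/* no wrapper needed */
	},
	{}
};

static ctl_table example_root[] = {
	{ .procname = "example", .mode = 0555, .child = example_table },
	{}
};

static int __init example_sysctl_init(void)
{
	if (!register_sysctl_table(example_root))
		return -ENOMEM;
	return 0;
}
late_initcall(example_sysctl_init);
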
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 000000000000..4449a4a2c2ed
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,75 @@
1/*
2 * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
3 *
4 * For licencing details see kernel-base/COPYING
5 */
6#include <linux/init.h>
7
8#include <asm/bios_ebda.h>
9#include <asm/paravirt.h>
10#include <asm/mpspec.h>
11#include <asm/setup.h>
12#include <asm/apic.h>
13#include <asm/e820.h>
14#include <asm/time.h>
15#include <asm/irq.h>
16#include <asm/tsc.h>
17
18void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { }
21
22/*
23 * The platform setup functions are preset with the default functions
24 * for standard PC hardware.
25 */
26struct x86_init_ops x86_init __initdata = {
27
28 .resources = {
29 .probe_roms = x86_init_noop,
30 .reserve_resources = reserve_standard_io_resources,
31 .memory_setup = default_machine_specific_memory_setup,
32 },
33
34 .mpparse = {
35 .mpc_record = x86_init_uint_noop,
36 .setup_ioapic_ids = x86_init_noop,
37 .mpc_apic_id = default_mpc_apic_id,
38 .smp_read_mpc_oem = default_smp_read_mpc_oem,
39 .mpc_oem_bus_info = default_mpc_oem_bus_info,
40 .find_smp_config = default_find_smp_config,
41 .get_smp_config = default_get_smp_config,
42 },
43
44 .irqs = {
45 .pre_vector_init = init_ISA_irqs,
46 .intr_init = native_init_IRQ,
47 .trap_init = x86_init_noop,
48 },
49
50 .oem = {
51 .arch_setup = x86_init_noop,
52 .banner = default_banner,
53 },
54
55 .paging = {
56 .pagetable_setup_start = native_pagetable_setup_start,
57 .pagetable_setup_done = native_pagetable_setup_done,
58 },
59
60 .timers = {
61 .setup_percpu_clockev = setup_boot_APIC_clock,
62 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init,
64 },
65};
66
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
68 .setup_percpu_clockev = setup_secondary_APIC_clock,
69};
70
71struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss,
75};
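
Note the lifetime split in this new file: x86_init is __initdata (boot-only, freed with init memory) and x86_cpuinit is __cpuinitdata, while x86_platform carries no section annotation because its hooks are also used after boot, for example calibrate_tsc from recalibrate_cpu_khz() in the tsc.c hunk above, or the wallclock accessors from the RTC paths. A small sketch of a post-boot caller, assuming the 2.6.32-era hook signatures (unsigned long seconds from get_wallclock):

#include <linux/kernel.h>
#include <asm/x86_init.h>

/* Read the persistent clock through the platform hook; this works at
 * any point after boot precisely because x86_platform is not
 * __initdata.                                                         */
void example_log_wallclock(void)
{
	unsigned long now = x86_platform.get_wallclock();

	pr_info("persistent clock: %lu seconds since the epoch\n", now);
}
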
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8600a09e0c6c..b84e571f4175 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,12 +1,8 @@
1# 1#
2# KVM configuration 2# KVM configuration
3# 3#
4config HAVE_KVM
5 bool
6 4
7config HAVE_KVM_IRQCHIP 5source "virt/kvm/Kconfig"
8 bool
9 default y
10 6
11menuconfig VIRTUALIZATION 7menuconfig VIRTUALIZATION
12 bool "Virtualization" 8 bool "Virtualization"
@@ -29,6 +25,9 @@ config KVM
29 select PREEMPT_NOTIFIERS 25 select PREEMPT_NOTIFIERS
30 select MMU_NOTIFIER 26 select MMU_NOTIFIER
31 select ANON_INODES 27 select ANON_INODES
28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE
32 ---help--- 31 ---help---
33 Support hosting fully virtualized guest machines using hardware 32 Support hosting fully virtualized guest machines using hardware
34 virtualization extensions. You will need a fairly recent 33 virtualization extensions. You will need a fairly recent
@@ -63,18 +62,6 @@ config KVM_AMD
63 To compile this as a module, choose M here: the module 62 To compile this as a module, choose M here: the module
64 will be called kvm-amd. 63 will be called kvm-amd.
65 64
66config KVM_TRACE
67 bool "KVM trace support"
68 depends on KVM && SYSFS
69 select MARKERS
70 select RELAY
71 select DEBUG_FS
72 default n
73 ---help---
74 This option allows reading a trace of kvm-related events through
75 relayfs. Note the ABI is not considered stable and will be
76 modified in future updates.
77
78# OK, it's a little counter-intuitive to do this, but it puts it neatly under 65# OK, it's a little counter-intuitive to do this, but it puts it neatly under
79# the virtualization menu. 66# the virtualization menu.
80source drivers/lguest/Kconfig 67source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4efafe80..0e7fe78d0f74 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,22 +1,19 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-	coalesced_mmio.o irq_comm.o)
-ifeq ($(CONFIG_KVM_TRACE),y)
-common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
-endif
-ifeq ($(CONFIG_IOMMU_API),y)
-common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
-endif
-
+
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
-	i8254.o timer.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+CFLAGS_x86.o := -I.
+CFLAGS_svm.o := -I.
+CFLAGS_vmx.o := -I.
+
+kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
+				coalesced_mmio.o irq_comm.o eventfd.o)
+kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
+
+kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+			   i8254.o timer.o
+kvm-intel-y		+= vmx.o
+kvm-amd-y		+= svm.o
+
+obj-$(CONFIG_KVM)	+= kvm.o
+obj-$(CONFIG_KVM_INTEL)	+= kvm-intel.o
+obj-$(CONFIG_KVM_AMD)	+= kvm-amd.o
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c
index 616de4628d60..1be5cd640e93 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1,5 +1,5 @@
1/****************************************************************************** 1/******************************************************************************
2 * x86_emulate.c 2 * emulate.c
3 * 3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. 4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 * 5 *
@@ -30,7 +30,9 @@
30#define DPRINTF(x...) do {} while (0) 30#define DPRINTF(x...) do {} while (0)
31#endif 31#endif
32#include <linux/module.h> 32#include <linux/module.h>
33#include <asm/kvm_x86_emulate.h> 33#include <asm/kvm_emulate.h>
34
35#include "mmu.h" /* for is_long_mode() */
34 36
35/* 37/*
36 * Opcode effective-address decode tables. 38 * Opcode effective-address decode tables.
@@ -60,6 +62,7 @@
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 62#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */ 63#define SrcOne (7<<4) /* Implied '1' */
62#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 64#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
65#define SrcImmU (9<<4) /* Immediate operand, unsigned */
63#define SrcMask (0xf<<4) 66#define SrcMask (0xf<<4)
64/* Generic ModRM decode. */ 67/* Generic ModRM decode. */
65#define ModRM (1<<8) 68#define ModRM (1<<8)
@@ -97,11 +100,11 @@ static u32 opcode_table[256] = {
97 /* 0x10 - 0x17 */ 100 /* 0x10 - 0x17 */
98 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
99 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
100 0, 0, 0, 0, 103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
101 /* 0x18 - 0x1F */ 104 /* 0x18 - 0x1F */
102 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
103 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
104 0, 0, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
105 /* 0x20 - 0x27 */ 108 /* 0x20 - 0x27 */
106 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
107 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +198,7 @@ static u32 opcode_table[256] = {
195 ByteOp | SrcImmUByte, SrcImmUByte, 198 ByteOp | SrcImmUByte, SrcImmUByte,
196 /* 0xE8 - 0xEF */ 199 /* 0xE8 - 0xEF */
197 SrcImm | Stack, SrcImm | ImplicitOps, 200 SrcImm | Stack, SrcImm | ImplicitOps,
198 SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, 201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
199 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
200 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
201 /* 0xF0 - 0xF7 */ 204 /* 0xF0 - 0xF7 */
@@ -208,7 +211,7 @@ static u32 opcode_table[256] = {
208 211
209static u32 twobyte_table[256] = { 212static u32 twobyte_table[256] = {
210 /* 0x00 - 0x0F */ 213 /* 0x00 - 0x0F */
211 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, 214 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
212 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 215 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
213 /* 0x10 - 0x1F */ 216 /* 0x10 - 0x1F */
214 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -216,7 +219,9 @@ static u32 twobyte_table[256] = {
216 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 219 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0,
218 /* 0x30 - 0x3F */ 221 /* 0x30 - 0x3F */
219 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 222 ImplicitOps, 0, ImplicitOps, 0,
223 ImplicitOps, ImplicitOps, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
220 /* 0x40 - 0x47 */ 225 /* 0x40 - 0x47 */
221 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 226 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
222 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 227 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -319,8 +324,11 @@ static u32 group2_table[] = {
319}; 324};
320 325
321/* EFLAGS bit definitions. */ 326/* EFLAGS bit definitions. */
327#define EFLG_VM (1<<17)
328#define EFLG_RF (1<<16)
322#define EFLG_OF (1<<11) 329#define EFLG_OF (1<<11)
323#define EFLG_DF (1<<10) 330#define EFLG_DF (1<<10)
331#define EFLG_IF (1<<9)
324#define EFLG_SF (1<<7) 332#define EFLG_SF (1<<7)
325#define EFLG_ZF (1<<6) 333#define EFLG_ZF (1<<6)
326#define EFLG_AF (1<<4) 334#define EFLG_AF (1<<4)
@@ -1027,6 +1035,7 @@ done_prefixes:
1027 c->src.type = OP_MEM; 1035 c->src.type = OP_MEM;
1028 break; 1036 break;
1029 case SrcImm: 1037 case SrcImm:
1038 case SrcImmU:
1030 c->src.type = OP_IMM; 1039 c->src.type = OP_IMM;
1031 c->src.ptr = (unsigned long *)c->eip; 1040 c->src.ptr = (unsigned long *)c->eip;
1032 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1041 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
@@ -1044,6 +1053,19 @@ done_prefixes:
1044 c->src.val = insn_fetch(s32, 4, c->eip); 1053 c->src.val = insn_fetch(s32, 4, c->eip);
1045 break; 1054 break;
1046 } 1055 }
1056 if ((c->d & SrcMask) == SrcImmU) {
1057 switch (c->src.bytes) {
1058 case 1:
1059 c->src.val &= 0xff;
1060 break;
1061 case 2:
1062 c->src.val &= 0xffff;
1063 break;
1064 case 4:
1065 c->src.val &= 0xffffffff;
1066 break;
1067 }
1068 }
1047 break; 1069 break;
1048 case SrcImmByte: 1070 case SrcImmByte:
1049 case SrcImmUByte: 1071 case SrcImmUByte:
@@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1375 ctxt->interruptibility = mask; 1397 ctxt->interruptibility = mask;
1376} 1398}
1377 1399
1400static inline void
1401setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1402 struct kvm_segment *cs, struct kvm_segment *ss)
1403{
1404 memset(cs, 0, sizeof(struct kvm_segment));
1405 kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
1406 memset(ss, 0, sizeof(struct kvm_segment));
1407
1408 cs->l = 0; /* will be adjusted later */
1409 cs->base = 0; /* flat segment */
1410 cs->g = 1; /* 4kb granularity */
1411 cs->limit = 0xffffffff; /* 4GB limit */
1412 cs->type = 0x0b; /* Read, Execute, Accessed */
1413 cs->s = 1;
1414 cs->dpl = 0; /* will be adjusted later */
1415 cs->present = 1;
1416 cs->db = 1;
1417
1418 ss->unusable = 0;
1419 ss->base = 0; /* flat segment */
1420 ss->limit = 0xffffffff; /* 4GB limit */
1421 ss->g = 1; /* 4kb granularity */
1422 ss->s = 1;
1423 ss->type = 0x03; /* Read/Write, Accessed */
1424 ss->db = 1; /* 32bit stack segment */
1425 ss->dpl = 0;
1426 ss->present = 1;
1427}
1428
1429static int
1430emulate_syscall(struct x86_emulate_ctxt *ctxt)
1431{
1432 struct decode_cache *c = &ctxt->decode;
1433 struct kvm_segment cs, ss;
1434 u64 msr_data;
1435
1436 /* syscall is not available in real mode */
1437 if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
1438 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
1439 return -1;
1440
1441 setup_syscalls_segments(ctxt, &cs, &ss);
1442
1443 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1444 msr_data >>= 32;
1445 cs.selector = (u16)(msr_data & 0xfffc);
1446 ss.selector = (u16)(msr_data + 8);
1447
1448 if (is_long_mode(ctxt->vcpu)) {
1449 cs.db = 0;
1450 cs.l = 1;
1451 }
1452 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
1453 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
1454
1455 c->regs[VCPU_REGS_RCX] = c->eip;
1456 if (is_long_mode(ctxt->vcpu)) {
1457#ifdef CONFIG_X86_64
1458 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1459
1460 kvm_x86_ops->get_msr(ctxt->vcpu,
1461 ctxt->mode == X86EMUL_MODE_PROT64 ?
1462 MSR_LSTAR : MSR_CSTAR, &msr_data);
1463 c->eip = msr_data;
1464
1465 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
1466 ctxt->eflags &= ~(msr_data | EFLG_RF);
1467#endif
1468 } else {
1469 /* legacy mode */
1470 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1471 c->eip = (u32)msr_data;
1472
1473 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1474 }
1475
1476 return 0;
1477}
1478
1479static int
1480emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1481{
1482 struct decode_cache *c = &ctxt->decode;
1483 struct kvm_segment cs, ss;
1484 u64 msr_data;
1485
1486 /* inject #UD if LOCK prefix is used */
1487 if (c->lock_prefix)
1488 return -1;
1489
1490 /* inject #GP if in real mode or paging is disabled */
1491 if (ctxt->mode == X86EMUL_MODE_REAL ||
1492 !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1493 kvm_inject_gp(ctxt->vcpu, 0);
1494 return -1;
1495 }
1496
1497 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1498 * Therefore, we inject an #UD.
1499 */
1500 if (ctxt->mode == X86EMUL_MODE_PROT64)
1501 return -1;
1502
1503 setup_syscalls_segments(ctxt, &cs, &ss);
1504
1505 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1506 switch (ctxt->mode) {
1507 case X86EMUL_MODE_PROT32:
1508 if ((msr_data & 0xfffc) == 0x0) {
1509 kvm_inject_gp(ctxt->vcpu, 0);
1510 return -1;
1511 }
1512 break;
1513 case X86EMUL_MODE_PROT64:
1514 if (msr_data == 0x0) {
1515 kvm_inject_gp(ctxt->vcpu, 0);
1516 return -1;
1517 }
1518 break;
1519 }
1520
1521 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1522 cs.selector = (u16)msr_data;
1523 cs.selector &= ~SELECTOR_RPL_MASK;
1524 ss.selector = cs.selector + 8;
1525 ss.selector &= ~SELECTOR_RPL_MASK;
1526 if (ctxt->mode == X86EMUL_MODE_PROT64
1527 || is_long_mode(ctxt->vcpu)) {
1528 cs.db = 0;
1529 cs.l = 1;
1530 }
1531
1532 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
1533 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
1534
1535 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
1536 c->eip = msr_data;
1537
1538 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1539 c->regs[VCPU_REGS_RSP] = msr_data;
1540
1541 return 0;
1542}
1543
1544static int
1545emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1546{
1547 struct decode_cache *c = &ctxt->decode;
1548 struct kvm_segment cs, ss;
1549 u64 msr_data;
1550 int usermode;
1551
1552 /* inject #UD if LOCK prefix is used */
1553 if (c->lock_prefix)
1554 return -1;
1555
1556 /* inject #GP if in real mode or paging is disabled */
1557 if (ctxt->mode == X86EMUL_MODE_REAL
1558 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1559 kvm_inject_gp(ctxt->vcpu, 0);
1560 return -1;
1561 }
1562
1563 /* sysexit must be called from CPL 0 */
1564 if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
1565 kvm_inject_gp(ctxt->vcpu, 0);
1566 return -1;
1567 }
1568
1569 setup_syscalls_segments(ctxt, &cs, &ss);
1570
1571 if ((c->rex_prefix & 0x8) != 0x0)
1572 usermode = X86EMUL_MODE_PROT64;
1573 else
1574 usermode = X86EMUL_MODE_PROT32;
1575
1576 cs.dpl = 3;
1577 ss.dpl = 3;
1578 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1579 switch (usermode) {
1580 case X86EMUL_MODE_PROT32:
1581 cs.selector = (u16)(msr_data + 16);
1582 if ((msr_data & 0xfffc) == 0x0) {
1583 kvm_inject_gp(ctxt->vcpu, 0);
1584 return -1;
1585 }
1586 ss.selector = (u16)(msr_data + 24);
1587 break;
1588 case X86EMUL_MODE_PROT64:
1589 cs.selector = (u16)(msr_data + 32);
1590 if (msr_data == 0x0) {
1591 kvm_inject_gp(ctxt->vcpu, 0);
1592 return -1;
1593 }
1594 ss.selector = cs.selector + 8;
1595 cs.db = 0;
1596 cs.l = 1;
1597 break;
1598 }
1599 cs.selector |= SELECTOR_RPL_MASK;
1600 ss.selector |= SELECTOR_RPL_MASK;
1601
1602 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
1603 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
1604
1605 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
1606 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
1607
1608 return 0;
1609}
1610
1378int 1611int
1379x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 1612x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1380{ 1613{
@@ -1970,6 +2203,12 @@ twobyte_insn:
1970 goto cannot_emulate; 2203 goto cannot_emulate;
1971 } 2204 }
1972 break; 2205 break;
2206 case 0x05: /* syscall */
2207 if (emulate_syscall(ctxt) == -1)
2208 goto cannot_emulate;
2209 else
2210 goto writeback;
2211 break;
1973 case 0x06: 2212 case 0x06:
1974 emulate_clts(ctxt->vcpu); 2213 emulate_clts(ctxt->vcpu);
1975 c->dst.type = OP_NONE; 2214 c->dst.type = OP_NONE;
@@ -2036,6 +2275,18 @@ twobyte_insn:
2036 rc = X86EMUL_CONTINUE; 2275 rc = X86EMUL_CONTINUE;
2037 c->dst.type = OP_NONE; 2276 c->dst.type = OP_NONE;
2038 break; 2277 break;
2278 case 0x34: /* sysenter */
2279 if (emulate_sysenter(ctxt) == -1)
2280 goto cannot_emulate;
2281 else
2282 goto writeback;
2283 break;
2284 case 0x35: /* sysexit */
2285 if (emulate_sysexit(ctxt) == -1)
2286 goto cannot_emulate;
2287 else
2288 goto writeback;
2289 break;
2039 case 0x40 ... 0x4f: /* cmov */ 2290 case 0x40 ... 0x4f: /* cmov */
2040 c->dst.val = c->dst.orig_val = c->src.val; 2291 c->dst.val = c->dst.orig_val = c->src.val;
2041 if (!test_cc(c->b, ctxt->eflags)) 2292 if (!test_cc(c->b, ctxt->eflags))
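
The new emulate_syscall() derives both selectors from the STAR MSR: bits 47:32 hold the kernel CS base selector, SS is that value plus 8, and the low two bits are masked off since the target runs at CPL 0. A standalone sketch of that selector arithmetic, using a made-up STAR value (0x10/0x23 chosen because they are plausible kernel/user base selectors, not read from any real machine):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Made-up STAR layout: user base selector 0x23 in bits 63:48,
	 * kernel base selector 0x10 in bits 47:32.                    */
	uint64_t star = (0x23ULL << 48) | (0x10ULL << 32);

	uint64_t msr_data = star >> 32;			/* as emulate_syscall() does */
	uint16_t cs = (uint16_t)(msr_data & 0xfffc);	/* force RPL 0               */
	uint16_t ss = (uint16_t)(msr_data + 8);		/* stack segment follows CS  */

	printf("SYSCALL target: CS=%#x SS=%#x\n",
	       (unsigned)cs, (unsigned)ss);		/* CS=0x10 SS=0x18 here      */
	return 0;
}
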
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 21f68e00524f..82ad523b4901 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
231{ 231{
232 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 232 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
233 233
234 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) 234 if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
235 return atomic_read(&pit->pit_state.pit_timer.pending); 235 return atomic_read(&pit->pit_state.pit_timer.pending);
236 return 0; 236 return 0;
237} 237}
@@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
252 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 252 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
253 struct hrtimer *timer; 253 struct hrtimer *timer;
254 254
255 if (vcpu->vcpu_id != 0 || !pit) 255 if (!kvm_vcpu_is_bsp(vcpu) || !pit)
256 return; 256 return;
257 257
258 timer = &pit->pit_state.pit_timer.timer; 258 timer = &pit->pit_state.pit_timer.timer;
@@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
294 pt->timer.function = kvm_timer_fn; 294 pt->timer.function = kvm_timer_fn;
295 pt->t_ops = &kpit_ops; 295 pt->t_ops = &kpit_ops;
296 pt->kvm = ps->pit->kvm; 296 pt->kvm = ps->pit->kvm;
297 pt->vcpu_id = 0; 297 pt->vcpu = pt->kvm->bsp_vcpu;
298 298
299 atomic_set(&pt->pending, 0); 299 atomic_set(&pt->pending, 0);
300 ps->irq_ack = 1; 300 ps->irq_ack = 1;
@@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
332 case 1: 332 case 1:
333 /* FIXME: enhance mode 4 precision */ 333 /* FIXME: enhance mode 4 precision */
334 case 4: 334 case 4:
335 create_pit_timer(ps, val, 0); 335 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
336 create_pit_timer(ps, val, 0);
337 }
336 break; 338 break;
337 case 2: 339 case 2:
338 case 3: 340 case 3:
339 create_pit_timer(ps, val, 1); 341 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
342 create_pit_timer(ps, val, 1);
343 }
340 break; 344 break;
341 default: 345 default:
342 destroy_pit_timer(&ps->pit_timer); 346 destroy_pit_timer(&ps->pit_timer);
343 } 347 }
344} 348}
345 349
346void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) 350void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
351{
352 u8 saved_mode;
353 if (hpet_legacy_start) {
354 /* save existing mode for later reenablement */
355 saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
356 kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
357 pit_load_count(kvm, channel, val);
358 kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
359 } else {
360 pit_load_count(kvm, channel, val);
361 }
362}
363
364static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
365{
366 return container_of(dev, struct kvm_pit, dev);
367}
368
369static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
347{ 370{
348 mutex_lock(&kvm->arch.vpit->pit_state.lock); 371 return container_of(dev, struct kvm_pit, speaker_dev);
349 pit_load_count(kvm, channel, val);
350 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
351} 372}
352 373
353static void pit_ioport_write(struct kvm_io_device *this, 374static inline int pit_in_range(gpa_t addr)
354 gpa_t addr, int len, const void *data)
355{ 375{
356 struct kvm_pit *pit = (struct kvm_pit *)this->private; 376 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
377 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
378}
379
380static int pit_ioport_write(struct kvm_io_device *this,
381 gpa_t addr, int len, const void *data)
382{
383 struct kvm_pit *pit = dev_to_pit(this);
357 struct kvm_kpit_state *pit_state = &pit->pit_state; 384 struct kvm_kpit_state *pit_state = &pit->pit_state;
358 struct kvm *kvm = pit->kvm; 385 struct kvm *kvm = pit->kvm;
359 int channel, access; 386 int channel, access;
360 struct kvm_kpit_channel_state *s; 387 struct kvm_kpit_channel_state *s;
361 u32 val = *(u32 *) data; 388 u32 val = *(u32 *) data;
389 if (!pit_in_range(addr))
390 return -EOPNOTSUPP;
362 391
363 val &= 0xff; 392 val &= 0xff;
364 addr &= KVM_PIT_CHANNEL_MASK; 393 addr &= KVM_PIT_CHANNEL_MASK;
@@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this,
421 } 450 }
422 451
423 mutex_unlock(&pit_state->lock); 452 mutex_unlock(&pit_state->lock);
453 return 0;
424} 454}
425 455
426static void pit_ioport_read(struct kvm_io_device *this, 456static int pit_ioport_read(struct kvm_io_device *this,
427 gpa_t addr, int len, void *data) 457 gpa_t addr, int len, void *data)
428{ 458{
429 struct kvm_pit *pit = (struct kvm_pit *)this->private; 459 struct kvm_pit *pit = dev_to_pit(this);
430 struct kvm_kpit_state *pit_state = &pit->pit_state; 460 struct kvm_kpit_state *pit_state = &pit->pit_state;
431 struct kvm *kvm = pit->kvm; 461 struct kvm *kvm = pit->kvm;
432 int ret, count; 462 int ret, count;
433 struct kvm_kpit_channel_state *s; 463 struct kvm_kpit_channel_state *s;
464 if (!pit_in_range(addr))
465 return -EOPNOTSUPP;
434 466
435 addr &= KVM_PIT_CHANNEL_MASK; 467 addr &= KVM_PIT_CHANNEL_MASK;
436 s = &pit_state->channels[addr]; 468 s = &pit_state->channels[addr];
@@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this,
485 memcpy(data, (char *)&ret, len); 517 memcpy(data, (char *)&ret, len);
486 518
487 mutex_unlock(&pit_state->lock); 519 mutex_unlock(&pit_state->lock);
520 return 0;
488} 521}
489 522
490static int pit_in_range(struct kvm_io_device *this, gpa_t addr, 523static int speaker_ioport_write(struct kvm_io_device *this,
491 int len, int is_write) 524 gpa_t addr, int len, const void *data)
492{
493 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
494 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
495}
496
497static void speaker_ioport_write(struct kvm_io_device *this,
498 gpa_t addr, int len, const void *data)
499{ 525{
500 struct kvm_pit *pit = (struct kvm_pit *)this->private; 526 struct kvm_pit *pit = speaker_to_pit(this);
501 struct kvm_kpit_state *pit_state = &pit->pit_state; 527 struct kvm_kpit_state *pit_state = &pit->pit_state;
502 struct kvm *kvm = pit->kvm; 528 struct kvm *kvm = pit->kvm;
503 u32 val = *(u32 *) data; 529 u32 val = *(u32 *) data;
530 if (addr != KVM_SPEAKER_BASE_ADDRESS)
531 return -EOPNOTSUPP;
504 532
505 mutex_lock(&pit_state->lock); 533 mutex_lock(&pit_state->lock);
506 pit_state->speaker_data_on = (val >> 1) & 1; 534 pit_state->speaker_data_on = (val >> 1) & 1;
507 pit_set_gate(kvm, 2, val & 1); 535 pit_set_gate(kvm, 2, val & 1);
508 mutex_unlock(&pit_state->lock); 536 mutex_unlock(&pit_state->lock);
537 return 0;
509} 538}
510 539
511static void speaker_ioport_read(struct kvm_io_device *this, 540static int speaker_ioport_read(struct kvm_io_device *this,
512 gpa_t addr, int len, void *data) 541 gpa_t addr, int len, void *data)
513{ 542{
514 struct kvm_pit *pit = (struct kvm_pit *)this->private; 543 struct kvm_pit *pit = speaker_to_pit(this);
515 struct kvm_kpit_state *pit_state = &pit->pit_state; 544 struct kvm_kpit_state *pit_state = &pit->pit_state;
516 struct kvm *kvm = pit->kvm; 545 struct kvm *kvm = pit->kvm;
517 unsigned int refresh_clock; 546 unsigned int refresh_clock;
518 int ret; 547 int ret;
548 if (addr != KVM_SPEAKER_BASE_ADDRESS)
549 return -EOPNOTSUPP;
519 550
520 /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ 551 /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
521 refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; 552 refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
@@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this,
527 len = sizeof(ret); 558 len = sizeof(ret);
528 memcpy(data, (char *)&ret, len); 559 memcpy(data, (char *)&ret, len);
529 mutex_unlock(&pit_state->lock); 560 mutex_unlock(&pit_state->lock);
530} 561 return 0;
531
532static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
533 int len, int is_write)
534{
535 return (addr == KVM_SPEAKER_BASE_ADDRESS);
536} 562}
537 563
538void kvm_pit_reset(struct kvm_pit *pit) 564void kvm_pit_reset(struct kvm_pit *pit)
@@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
541 struct kvm_kpit_channel_state *c; 567 struct kvm_kpit_channel_state *c;
542 568
543 mutex_lock(&pit->pit_state.lock); 569 mutex_lock(&pit->pit_state.lock);
570 pit->pit_state.flags = 0;
544 for (i = 0; i < 3; i++) { 571 for (i = 0; i < 3; i++) {
545 c = &pit->pit_state.channels[i]; 572 c = &pit->pit_state.channels[i];
546 c->mode = 0xff; 573 c->mode = 0xff;
@@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
563 } 590 }
564} 591}
565 592
566struct kvm_pit *kvm_create_pit(struct kvm *kvm) 593static const struct kvm_io_device_ops pit_dev_ops = {
594 .read = pit_ioport_read,
595 .write = pit_ioport_write,
596};
597
598static const struct kvm_io_device_ops speaker_dev_ops = {
599 .read = speaker_ioport_read,
600 .write = speaker_ioport_write,
601};
602
603/* Caller must have writers lock on slots_lock */
604struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
567{ 605{
568 struct kvm_pit *pit; 606 struct kvm_pit *pit;
569 struct kvm_kpit_state *pit_state; 607 struct kvm_kpit_state *pit_state;
608 int ret;
570 609
571 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); 610 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
572 if (!pit) 611 if (!pit)
@@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
582 mutex_lock(&pit->pit_state.lock); 621 mutex_lock(&pit->pit_state.lock);
583 spin_lock_init(&pit->pit_state.inject_lock); 622 spin_lock_init(&pit->pit_state.inject_lock);
584 623
585 /* Initialize PIO device */
586 pit->dev.read = pit_ioport_read;
587 pit->dev.write = pit_ioport_write;
588 pit->dev.in_range = pit_in_range;
589 pit->dev.private = pit;
590 kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
591
592 pit->speaker_dev.read = speaker_ioport_read;
593 pit->speaker_dev.write = speaker_ioport_write;
594 pit->speaker_dev.in_range = speaker_in_range;
595 pit->speaker_dev.private = pit;
596 kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
597
598 kvm->arch.vpit = pit; 624 kvm->arch.vpit = pit;
599 pit->kvm = kvm; 625 pit->kvm = kvm;
600 626
@@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
613 pit->mask_notifier.func = pit_mask_notifer; 639 pit->mask_notifier.func = pit_mask_notifer;
614 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 640 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
615 641
642 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
643 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
644 if (ret < 0)
645 goto fail;
646
647 if (flags & KVM_PIT_SPEAKER_DUMMY) {
648 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
649 ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
650 &pit->speaker_dev);
651 if (ret < 0)
652 goto fail_unregister;
653 }
654
616 return pit; 655 return pit;
656
657fail_unregister:
658 __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
659
660fail:
661 if (pit->irq_source_id >= 0)
662 kvm_free_irq_source_id(kvm, pit->irq_source_id);
663
664 kfree(pit);
665 return NULL;
617} 666}
618 667
619void kvm_free_pit(struct kvm *kvm) 668void kvm_free_pit(struct kvm *kvm)
@@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm)
623 if (kvm->arch.vpit) { 672 if (kvm->arch.vpit) {
624 kvm_unregister_irq_mask_notifier(kvm, 0, 673 kvm_unregister_irq_mask_notifier(kvm, 0,
625 &kvm->arch.vpit->mask_notifier); 674 &kvm->arch.vpit->mask_notifier);
675 kvm_unregister_irq_ack_notifier(kvm,
676 &kvm->arch.vpit->pit_state.irq_ack_notifier);
626 mutex_lock(&kvm->arch.vpit->pit_state.lock); 677 mutex_lock(&kvm->arch.vpit->pit_state.lock);
627 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 678 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
628 hrtimer_cancel(timer); 679 hrtimer_cancel(timer);
@@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
637 struct kvm_vcpu *vcpu; 688 struct kvm_vcpu *vcpu;
638 int i; 689 int i;
639 690
640 mutex_lock(&kvm->lock); 691 mutex_lock(&kvm->irq_lock);
641 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
642 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
643 mutex_unlock(&kvm->lock); 694 mutex_unlock(&kvm->irq_lock);
644 695
645 /* 696 /*
646 * Provides NMI watchdog support via Virtual Wire mode. 697 * Provides NMI watchdog support via Virtual Wire mode.
@@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
652 * VCPU0, and only if its LVT0 is in EXTINT mode. 703 * VCPU0, and only if its LVT0 is in EXTINT mode.
653 */ 704 */
654 if (kvm->arch.vapics_in_nmi_mode > 0) 705 if (kvm->arch.vapics_in_nmi_mode > 0)
655 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 706 kvm_for_each_vcpu(i, vcpu, kvm)
656 vcpu = kvm->vcpus[i]; 707 kvm_apic_nmi_wd_deliver(vcpu);
657 if (vcpu)
658 kvm_apic_nmi_wd_deliver(vcpu);
659 }
660} 708}
661 709
662void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) 710void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
@@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
665 struct kvm *kvm = vcpu->kvm; 713 struct kvm *kvm = vcpu->kvm;
666 struct kvm_kpit_state *ps; 714 struct kvm_kpit_state *ps;
667 715
668 if (vcpu && pit) { 716 if (pit) {
669 int inject = 0; 717 int inject = 0;
670 ps = &pit->pit_state; 718 ps = &pit->pit_state;
671 719
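
The PIT conversion above replaces the old this->private back-pointer with container_of() on an embedded kvm_io_device, and moves the address-range checks into the handlers themselves, which now return -EOPNOTSUPP so the I/O bus code can move on to another device. The container_of() idiom on its own, as a small userspace sketch (the structures are stand-ins, not the KVM types):

#include <stdio.h>
#include <stddef.h>

/* Userspace stand-in for the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct io_device {
	const char *name;
};

struct pit {
	int channel_count;
	struct io_device dev;		/* embedded, so no ->private is needed */
};

static struct pit *dev_to_pit(struct io_device *dev)
{
	return container_of(dev, struct pit, dev);
}

int main(void)
{
	struct pit pit = { .channel_count = 3, .dev = { .name = "pit" } };
	struct io_device *dev = &pit.dev;	/* what a bus callback would receive */

	printf("%s has %d channels\n", dev->name,
	       dev_to_pit(dev)->channel_count);
	return 0;
}
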
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863ff60b7..d4c1c7ffdc09 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -21,6 +21,7 @@ struct kvm_kpit_channel_state {
21 21
22struct kvm_kpit_state { 22struct kvm_kpit_state {
23 struct kvm_kpit_channel_state channels[3]; 23 struct kvm_kpit_channel_state channels[3];
24 u32 flags;
24 struct kvm_timer pit_timer; 25 struct kvm_timer pit_timer;
25 bool is_periodic; 26 bool is_periodic;
26 u32 speaker_data_on; 27 u32 speaker_data_on;
@@ -49,8 +50,8 @@ struct kvm_pit {
49#define KVM_PIT_CHANNEL_MASK 0x3 50#define KVM_PIT_CHANNEL_MASK 0x3
50 51
51void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); 52void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
52void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); 53void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
53struct kvm_pit *kvm_create_pit(struct kvm *kvm); 54struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
54void kvm_free_pit(struct kvm *kvm); 55void kvm_free_pit(struct kvm *kvm);
55void kvm_pit_reset(struct kvm_pit *pit); 56void kvm_pit_reset(struct kvm_pit *pit);
56 57
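
With kvm_create_pit() now taking a flags word, userspace can ask for the dummy speaker port. A sketch of how a VMM might request it, assuming the KVM_CREATE_PIT2 ioctl and struct kvm_pit_config from the matching uapi update (that header change is not part of this hunk):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int sys_fd = open("/dev/kvm", O_RDWR);
	if (sys_fd < 0) { perror("open /dev/kvm"); return 1; }

	int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);
	if (vm_fd < 0) { perror("KVM_CREATE_VM"); return 1; }

	/* An in-kernel irqchip must exist before the PIT can be created. */
	if (ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0) < 0) { perror("KVM_CREATE_IRQCHIP"); return 1; }

	struct kvm_pit_config cfg = { .flags = KVM_PIT_SPEAKER_DUMMY };
	if (ioctl(vm_fd, KVM_CREATE_PIT2, &cfg) < 0)
		perror("KVM_CREATE_PIT2");	/* older kernels only offer KVM_CREATE_PIT */
	else
		printf("PIT created with dummy speaker device\n");
	return 0;
}
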
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1ccb50c74f18..01f151682802 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,50 +30,24 @@
30#include "irq.h" 30#include "irq.h"
31 31
32#include <linux/kvm_host.h> 32#include <linux/kvm_host.h>
33 33#include "trace.h"
34static void pic_lock(struct kvm_pic *s)
35 __acquires(&s->lock)
36{
37 spin_lock(&s->lock);
38}
39
40static void pic_unlock(struct kvm_pic *s)
41 __releases(&s->lock)
42{
43 struct kvm *kvm = s->kvm;
44 unsigned acks = s->pending_acks;
45 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu;
47
48 s->pending_acks = 0;
49 s->wakeup_needed = false;
50
51 spin_unlock(&s->lock);
52
53 while (acks) {
54 kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
55 __ffs(acks));
56 acks &= acks - 1;
57 }
58
59 if (wakeup) {
60 vcpu = s->kvm->vcpus[0];
61 if (vcpu)
62 kvm_vcpu_kick(vcpu);
63 }
64}
65 34
66static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 35static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
67{ 36{
68 s->isr &= ~(1 << irq); 37 s->isr &= ~(1 << irq);
69 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0])
40 irq += 8;
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
70} 42}
71 43
72void kvm_pic_clear_isr_ack(struct kvm *kvm) 44void kvm_pic_clear_isr_ack(struct kvm *kvm)
73{ 45{
74 struct kvm_pic *s = pic_irqchip(kvm); 46 struct kvm_pic *s = pic_irqchip(kvm);
47 spin_lock(&s->lock);
75 s->pics[0].isr_ack = 0xff; 48 s->pics[0].isr_ack = 0xff;
76 s->pics[1].isr_ack = 0xff; 49 s->pics[1].isr_ack = 0xff;
50 spin_unlock(&s->lock);
77} 51}
78 52
79/* 53/*
@@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s)
174 148
175void kvm_pic_update_irq(struct kvm_pic *s) 149void kvm_pic_update_irq(struct kvm_pic *s)
176{ 150{
177 pic_lock(s); 151 spin_lock(&s->lock);
178 pic_update_irq(s); 152 pic_update_irq(s);
179 pic_unlock(s); 153 spin_unlock(&s->lock);
180} 154}
181 155
182int kvm_pic_set_irq(void *opaque, int irq, int level) 156int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
184 struct kvm_pic *s = opaque; 158 struct kvm_pic *s = opaque;
185 int ret = -1; 159 int ret = -1;
186 160
187 pic_lock(s); 161 spin_lock(&s->lock);
188 if (irq >= 0 && irq < PIC_NUM_PINS) { 162 if (irq >= 0 && irq < PIC_NUM_PINS) {
189 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 163 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
190 pic_update_irq(s); 164 pic_update_irq(s);
165 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
166 s->pics[irq >> 3].imr, ret == 0);
191 } 167 }
192 pic_unlock(s); 168 spin_unlock(&s->lock);
193 169
194 return ret; 170 return ret;
195} 171}
@@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
217 int irq, irq2, intno; 193 int irq, irq2, intno;
218 struct kvm_pic *s = pic_irqchip(kvm); 194 struct kvm_pic *s = pic_irqchip(kvm);
219 195
220 pic_lock(s); 196 spin_lock(&s->lock);
221 irq = pic_get_irq(&s->pics[0]); 197 irq = pic_get_irq(&s->pics[0]);
222 if (irq >= 0) { 198 if (irq >= 0) {
223 pic_intack(&s->pics[0], irq); 199 pic_intack(&s->pics[0], irq);
@@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
242 intno = s->pics[0].irq_base + irq; 218 intno = s->pics[0].irq_base + irq;
243 } 219 }
244 pic_update_irq(s); 220 pic_update_irq(s);
245 pic_unlock(s); 221 spin_unlock(&s->lock);
246 kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
247 222
248 return intno; 223 return intno;
249} 224}
@@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
252{ 227{
253 int irq, irqbase, n; 228 int irq, irqbase, n;
254 struct kvm *kvm = s->pics_state->irq_request_opaque; 229 struct kvm *kvm = s->pics_state->irq_request_opaque;
255 struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; 230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
256 231
257 if (s == &s->pics_state->pics[0]) 232 if (s == &s->pics_state->pics[0])
258 irqbase = 0; 233 irqbase = 0;
@@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
263 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) 238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
264 if (s->irr & (1 << irq) || s->isr & (1 << irq)) { 239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
265 n = irq + irqbase; 240 n = irq + irqbase;
266 s->pics_state->pending_acks |= 1 << n; 241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
267 } 242 }
268 } 243 }
269 s->last_irr = 0; 244 s->last_irr = 0;
@@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
428 return s->elcr; 403 return s->elcr;
429} 404}
430 405
431static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, 406static int picdev_in_range(gpa_t addr)
432 int len, int is_write)
433{ 407{
434 switch (addr) { 408 switch (addr) {
435 case 0x20: 409 case 0x20:
@@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
444 } 418 }
445} 419}
446 420
447static void picdev_write(struct kvm_io_device *this, 421static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
422{
423 return container_of(dev, struct kvm_pic, dev);
424}
425
426static int picdev_write(struct kvm_io_device *this,
448 gpa_t addr, int len, const void *val) 427 gpa_t addr, int len, const void *val)
449{ 428{
450 struct kvm_pic *s = this->private; 429 struct kvm_pic *s = to_pic(this);
451 unsigned char data = *(unsigned char *)val; 430 unsigned char data = *(unsigned char *)val;
431 if (!picdev_in_range(addr))
432 return -EOPNOTSUPP;
452 433
453 if (len != 1) { 434 if (len != 1) {
454 if (printk_ratelimit()) 435 if (printk_ratelimit())
455 printk(KERN_ERR "PIC: non byte write\n"); 436 printk(KERN_ERR "PIC: non byte write\n");
456 return; 437 return 0;
457 } 438 }
458 pic_lock(s); 439 spin_lock(&s->lock);
459 switch (addr) { 440 switch (addr) {
460 case 0x20: 441 case 0x20:
461 case 0x21: 442 case 0x21:
@@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this,
468 elcr_ioport_write(&s->pics[addr & 1], addr, data); 449 elcr_ioport_write(&s->pics[addr & 1], addr, data);
469 break; 450 break;
470 } 451 }
471 pic_unlock(s); 452 spin_unlock(&s->lock);
453 return 0;
472} 454}
473 455
474static void picdev_read(struct kvm_io_device *this, 456static int picdev_read(struct kvm_io_device *this,
475 gpa_t addr, int len, void *val) 457 gpa_t addr, int len, void *val)
476{ 458{
477 struct kvm_pic *s = this->private; 459 struct kvm_pic *s = to_pic(this);
478 unsigned char data = 0; 460 unsigned char data = 0;
461 if (!picdev_in_range(addr))
462 return -EOPNOTSUPP;
479 463
480 if (len != 1) { 464 if (len != 1) {
481 if (printk_ratelimit()) 465 if (printk_ratelimit())
482 printk(KERN_ERR "PIC: non byte read\n"); 466 printk(KERN_ERR "PIC: non byte read\n");
483 return; 467 return 0;
484 } 468 }
485 pic_lock(s); 469 spin_lock(&s->lock);
486 switch (addr) { 470 switch (addr) {
487 case 0x20: 471 case 0x20:
488 case 0x21: 472 case 0x21:
@@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this,
496 break; 480 break;
497 } 481 }
498 *(unsigned char *)val = data; 482 *(unsigned char *)val = data;
499 pic_unlock(s); 483 spin_unlock(&s->lock);
484 return 0;
500} 485}
501 486
502/* 487/*
@@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this,
505static void pic_irq_request(void *opaque, int level) 490static void pic_irq_request(void *opaque, int level)
506{ 491{
507 struct kvm *kvm = opaque; 492 struct kvm *kvm = opaque;
508 struct kvm_vcpu *vcpu = kvm->vcpus[0]; 493 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
509 struct kvm_pic *s = pic_irqchip(kvm); 494 struct kvm_pic *s = pic_irqchip(kvm);
510 int irq = pic_get_irq(&s->pics[0]); 495 int irq = pic_get_irq(&s->pics[0]);
511 496
512 s->output = level; 497 s->output = level;
513 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 498 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
514 s->pics[0].isr_ack &= ~(1 << irq); 499 s->pics[0].isr_ack &= ~(1 << irq);
515 s->wakeup_needed = true; 500 kvm_vcpu_kick(vcpu);
516 } 501 }
517} 502}
518 503
504static const struct kvm_io_device_ops picdev_ops = {
505 .read = picdev_read,
506 .write = picdev_write,
507};
508
519struct kvm_pic *kvm_create_pic(struct kvm *kvm) 509struct kvm_pic *kvm_create_pic(struct kvm *kvm)
520{ 510{
521 struct kvm_pic *s; 511 struct kvm_pic *s;
512 int ret;
513
522 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 514 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
523 if (!s) 515 if (!s)
524 return NULL; 516 return NULL;
@@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
534 /* 526 /*
535 * Initialize PIO device 527 * Initialize PIO device
536 */ 528 */
537 s->dev.read = picdev_read; 529 kvm_iodevice_init(&s->dev, &picdev_ops);
538 s->dev.write = picdev_write; 530 ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
539 s->dev.in_range = picdev_in_range; 531 if (ret < 0) {
540 s->dev.private = s; 532 kfree(s);
541 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); 533 return NULL;
534 }
535
542 return s; 536 return s;
543} 537}
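
The PIC is now dispatched through a kvm_io_device_ops table and recovered from the embedded kvm_io_device with container_of() instead of a private pointer. A self-contained sketch of that embedding pattern with generic names, not the KVM structures themselves:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct io_device;

struct io_device_ops {
	int (*read)(struct io_device *dev, unsigned int addr, void *val);
};

struct io_device {
	const struct io_device_ops *ops;
};

/* A concrete device embeds the generic one; no separate 'private' pointer. */
struct pic {
	int irr;
	struct io_device dev;
};

static struct pic *to_pic(struct io_device *dev)
{
	return container_of(dev, struct pic, dev);	/* recover the outer struct */
}

static int pic_read(struct io_device *dev, unsigned int addr, void *val)
{
	struct pic *s = to_pic(dev);

	(void)addr;
	*(int *)val = s->irr;
	return 0;
}

static const struct io_device_ops pic_ops = { .read = pic_read };

int main(void)
{
	struct pic s = { .irr = 0x5, .dev = { .ops = &pic_ops } };
	int v;

	s.dev.ops->read(&s.dev, 0x20, &v);
	printf("read 0x%x\n", v);
	return 0;
}
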
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9f593188129e..7d6058a2fd38 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,7 +63,6 @@ struct kvm_kpic_state {
63 63
64struct kvm_pic { 64struct kvm_pic {
65 spinlock_t lock; 65 spinlock_t lock;
66 bool wakeup_needed;
67 unsigned pending_acks; 66 unsigned pending_acks;
68 struct kvm *kvm; 67 struct kvm *kvm;
69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1ff819dce7d3..7bcc5b6a4403 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
29 kvm_register_write(vcpu, VCPU_REGS_RIP, val); 29 kvm_register_write(vcpu, VCPU_REGS_RIP, val);
30} 30}
31 31
32static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
33{
34 if (!test_bit(VCPU_EXREG_PDPTR,
35 (unsigned long *)&vcpu->arch.regs_avail))
36 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
37
38 return vcpu->arch.pdptrs[index];
39}
40
32#endif 41#endif
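
kvm_pdptr_read() above only calls back into the vendor code when the PDPTR entry is not marked available, caching the result in vcpu->arch.pdptrs. A small sketch of this availability-bitmask pattern, with a hypothetical reload callback standing in for kvm_x86_ops->cache_reg():

#include <stdio.h>

enum { REG_PDPTR = 0 };

struct vcpu {
	unsigned long regs_avail;	/* bit set => cached value is valid */
	unsigned long long pdptrs[4];
};

/* Stand-in for the expensive vendor callback that refreshes the cache. */
static void cache_reg(struct vcpu *v, int reg)
{
	printf("slow path: reloading PDPTRs\n");
	for (int i = 0; i < 4; i++)
		v->pdptrs[i] = 0x1000ULL * (i + 1);
	v->regs_avail |= 1UL << reg;
}

static unsigned long long pdptr_read(struct vcpu *v, int index)
{
	if (!(v->regs_avail & (1UL << REG_PDPTR)))
		cache_reg(v, REG_PDPTR);	/* fill the cache once */
	return v->pdptrs[index];
}

int main(void)
{
	struct vcpu v = { 0 };

	printf("pdptr[2] = %#llx\n", pdptr_read(&v, 2));	/* reloads */
	printf("pdptr[2] = %#llx\n", pdptr_read(&v, 2));	/* served from cache */
	return 0;
}
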
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index ed66e4c078dc..000000000000
--- a/arch/x86/kvm/kvm_svm.h
+++ /dev/null
@@ -1,51 +0,0 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <linux/kvm_host.h>
8#include <asm/msr.h>
9
10#include <asm/svm.h>
11
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21
22struct kvm_vcpu;
23
24struct vcpu_svm {
25 struct kvm_vcpu vcpu;
26 struct vmcb *vmcb;
27 unsigned long vmcb_pa;
28 struct svm_cpu_data *svm_data;
29 uint64_t asid_generation;
30
31 u64 next_rip;
32
33 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
34 u64 host_gs_base;
35 unsigned long host_cr2;
36
37 u32 *msrpm;
38 struct vmcb *hsave;
39 u64 hsave_msr;
40
41 u64 nested_vmcb;
42
43 /* These are the merged vectors */
44 u32 *nested_msrpm;
45
46 /* gpa pointers to the real vectors */
47 u64 nested_vmcb_msrpm;
48};
49
50#endif
51
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 26bd6ba74e1c..55c7524dda54 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -6,7 +6,7 @@ struct kvm_timer {
6 bool reinject; 6 bool reinject;
7 struct kvm_timer_ops *t_ops; 7 struct kvm_timer_ops *t_ops;
8 struct kvm *kvm; 8 struct kvm *kvm;
9 int vcpu_id; 9 struct kvm_vcpu *vcpu;
10}; 10};
11 11
12struct kvm_timer_ops { 12struct kvm_timer_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83f81a3..1ae5ceba7eb2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,8 +32,11 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
35#include "kvm_cache_regs.h" 36#include "kvm_cache_regs.h"
36#include "irq.h" 37#include "irq.h"
38#include "trace.h"
39#include "x86.h"
37 40
38#ifndef CONFIG_X86_64 41#ifndef CONFIG_X86_64
39#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) 42#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
141 return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; 144 return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
142} 145}
143 146
147void kvm_apic_set_version(struct kvm_vcpu *vcpu)
148{
149 struct kvm_lapic *apic = vcpu->arch.apic;
150 struct kvm_cpuid_entry2 *feat;
151 u32 v = APIC_VERSION;
152
153 if (!irqchip_in_kernel(vcpu->kvm))
154 return;
155
156 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
157 if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
158 v |= APIC_LVR_DIRECTED_EOI;
159 apic_set_reg(apic, APIC_LVR, v);
160}
161
162static inline int apic_x2apic_mode(struct kvm_lapic *apic)
163{
164 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
165}
166
144static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { 167static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
145 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ 168 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
146 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 169 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap)
165 188
166static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 189static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
167{ 190{
191 apic->irr_pending = true;
168 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); 192 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
169} 193}
170 194
171static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) 195static inline int apic_search_irr(struct kvm_lapic *apic)
172{ 196{
173 apic_clear_vector(vec, apic->regs + APIC_IRR); 197 return find_highest_vector(apic->regs + APIC_IRR);
174} 198}
175 199
176static inline int apic_find_highest_irr(struct kvm_lapic *apic) 200static inline int apic_find_highest_irr(struct kvm_lapic *apic)
177{ 201{
178 int result; 202 int result;
179 203
180 result = find_highest_vector(apic->regs + APIC_IRR); 204 if (!apic->irr_pending)
205 return -1;
206
207 result = apic_search_irr(apic);
181 ASSERT(result == -1 || result >= 16); 208 ASSERT(result == -1 || result >= 16);
182 209
183 return result; 210 return result;
184} 211}
185 212
213static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
214{
215 apic->irr_pending = false;
216 apic_clear_vector(vec, apic->regs + APIC_IRR);
217 if (apic_search_irr(apic) != -1)
218 apic->irr_pending = true;
219}
220
186int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 221int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
187{ 222{
188 struct kvm_lapic *apic = vcpu->arch.apic; 223 struct kvm_lapic *apic = vcpu->arch.apic;
189 int highest_irr; 224 int highest_irr;
190 225
226 /* This may race with setting of irr in __apic_accept_irq() and
227 * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
228 * will cause vmexit immediately and the value will be recalculated
229 * on the next vmentry.
230 */
191 if (!apic) 231 if (!apic)
192 return 0; 232 return 0;
193 highest_irr = apic_find_highest_irr(apic); 233 highest_irr = apic_find_highest_irr(apic);
194 234
195 return highest_irr; 235 return highest_irr;
196} 236}
197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
198 237
199static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 238static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
200 int vector, int level, int trig_mode); 239 int vector, int level, int trig_mode);
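
irr_pending lets apic_find_highest_irr() return early without scanning the 256-bit IRR when nothing is queued: the flag is set unconditionally on delivery and only re-derived from the bitmap when a vector is cleared. A compact, illustrative sketch of that scan-avoidance idea over a plain bitmap:

#include <stdbool.h>
#include <stdio.h>

#define NVEC 256
#define BITS_PER_LONG (8 * sizeof(unsigned long))

struct irr {
	unsigned long bits[NVEC / (8 * sizeof(unsigned long))];
	bool pending;			/* true if some bit *may* be set */
};

static void irr_set(struct irr *irr, int vec)
{
	irr->pending = true;		/* cheap: never rescans */
	irr->bits[vec / BITS_PER_LONG] |= 1UL << (vec % BITS_PER_LONG);
}

static int irr_scan(const struct irr *irr)
{
	for (int vec = NVEC - 1; vec >= 0; vec--)
		if (irr->bits[vec / BITS_PER_LONG] & (1UL << (vec % BITS_PER_LONG)))
			return vec;
	return -1;
}

static int irr_highest(const struct irr *irr)
{
	return irr->pending ? irr_scan(irr) : -1;	/* fast path skips the scan */
}

static void irr_clear(struct irr *irr, int vec)
{
	irr->bits[vec / BITS_PER_LONG] &= ~(1UL << (vec % BITS_PER_LONG));
	irr->pending = irr_scan(irr) != -1;	/* only here is the flag re-derived */
}

int main(void)
{
	struct irr irr = { .pending = false };

	irr_set(&irr, 0x31);
	printf("highest = %d\n", irr_highest(&irr));
	irr_clear(&irr, 0x31);
	printf("highest = %d\n", irr_highest(&irr));
	return 0;
}
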
@@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
251int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) 290int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
252{ 291{
253 int result = 0; 292 int result = 0;
254 u8 logical_id; 293 u32 logical_id;
294
295 if (apic_x2apic_mode(apic)) {
296 logical_id = apic_get_reg(apic, APIC_LDR);
297 return logical_id & mda;
298 }
255 299
256 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); 300 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
257 301
@@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
331 break; 375 break;
332 376
333 result = !apic_test_and_set_irr(vector, apic); 377 result = !apic_test_and_set_irr(vector, apic);
378 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
379 trig_mode, vector, !result);
334 if (!result) { 380 if (!result) {
335 if (trig_mode) 381 if (trig_mode)
336 apic_debug("level trig mode repeatedly for " 382 apic_debug("level trig mode repeatedly for "
@@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic)
425 trigger_mode = IOAPIC_LEVEL_TRIG; 471 trigger_mode = IOAPIC_LEVEL_TRIG;
426 else 472 else
427 trigger_mode = IOAPIC_EDGE_TRIG; 473 trigger_mode = IOAPIC_EDGE_TRIG;
428 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
429} 479}
430 480
431static void apic_send_ipi(struct kvm_lapic *apic) 481static void apic_send_ipi(struct kvm_lapic *apic)
@@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic)
440 irq.level = icr_low & APIC_INT_ASSERT; 490 irq.level = icr_low & APIC_INT_ASSERT;
441 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; 491 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
442 irq.shorthand = icr_low & APIC_SHORT_MASK; 492 irq.shorthand = icr_low & APIC_SHORT_MASK;
443 irq.dest_id = GET_APIC_DEST_FIELD(icr_high); 493 if (apic_x2apic_mode(apic))
494 irq.dest_id = icr_high;
495 else
496 irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
497
498 trace_kvm_apic_ipi(icr_low, irq.dest_id);
444 499
445 apic_debug("icr_high 0x%x, icr_low 0x%x, " 500 apic_debug("icr_high 0x%x, icr_low 0x%x, "
446 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " 501 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
@@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic)
449 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
450 irq.vector); 505 irq.vector);
451 506
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
452 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
453} 510}
454 511
455static u32 apic_get_tmcct(struct kvm_lapic *apic) 512static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
495{ 552{
496 u32 val = 0; 553 u32 val = 0;
497 554
498 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
499
500 if (offset >= LAPIC_MMIO_LENGTH) 555 if (offset >= LAPIC_MMIO_LENGTH)
501 return 0; 556 return 0;
502 557
503 switch (offset) { 558 switch (offset) {
559 case APIC_ID:
560 if (apic_x2apic_mode(apic))
561 val = kvm_apic_id(apic);
562 else
563 val = kvm_apic_id(apic) << 24;
564 break;
504 case APIC_ARBPRI: 565 case APIC_ARBPRI:
505 printk(KERN_WARNING "Access APIC ARBPRI register " 566 printk(KERN_WARNING "Access APIC ARBPRI register "
506 "which is for P6\n"); 567 "which is for P6\n");
@@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
522 return val; 583 return val;
523} 584}
524 585
525static void apic_mmio_read(struct kvm_io_device *this, 586static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
526 gpa_t address, int len, void *data) 587{
588 return container_of(dev, struct kvm_lapic, dev);
589}
590
591static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
592 void *data)
527{ 593{
528 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
529 unsigned int offset = address - apic->base_address;
530 unsigned char alignment = offset & 0xf; 594 unsigned char alignment = offset & 0xf;
531 u32 result; 595 u32 result;
 596 /* this bitmask has a bit cleared for each reserved register */

597 static const u64 rmask = 0x43ff01ffffffe70cULL;
532 598
533 if ((alignment + len) > 4) { 599 if ((alignment + len) > 4) {
534 printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", 600 apic_debug("KVM_APIC_READ: alignment error %x %d\n",
535 (unsigned long)address, len); 601 offset, len);
536 return; 602 return 1;
537 } 603 }
604
605 if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
606 apic_debug("KVM_APIC_READ: read reserved register %x\n",
607 offset);
608 return 1;
609 }
610
538 result = __apic_read(apic, offset & ~0xf); 611 result = __apic_read(apic, offset & ~0xf);
539 612
613 trace_kvm_apic_read(offset, result);
614
540 switch (len) { 615 switch (len) {
541 case 1: 616 case 1:
542 case 2: 617 case 2:
@@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this,
548 "should be 1,2, or 4 instead\n", len); 623 "should be 1,2, or 4 instead\n", len);
549 break; 624 break;
550 } 625 }
626 return 0;
627}
628
629static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
630{
631 return apic_hw_enabled(apic) &&
632 addr >= apic->base_address &&
633 addr < apic->base_address + LAPIC_MMIO_LENGTH;
634}
635
636static int apic_mmio_read(struct kvm_io_device *this,
637 gpa_t address, int len, void *data)
638{
639 struct kvm_lapic *apic = to_lapic(this);
640 u32 offset = address - apic->base_address;
641
642 if (!apic_mmio_in_range(apic, address))
643 return -EOPNOTSUPP;
644
645 apic_reg_read(apic, offset, len, data);
646
647 return 0;
551} 648}
552 649
553static void update_divide_count(struct kvm_lapic *apic) 650static void update_divide_count(struct kvm_lapic *apic)
@@ -573,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic)
573 670
574 if (!apic->lapic_timer.period) 671 if (!apic->lapic_timer.period)
575 return; 672 return;
673 /*
674 * Do not allow the guest to program periodic timers with small
675 * interval, since the hrtimers are not throttled by the host
676 * scheduler.
677 */
678 if (apic_lvtt_period(apic)) {
679 if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
680 apic->lapic_timer.period = NSEC_PER_MSEC/2;
681 }
576 682
577 hrtimer_start(&apic->lapic_timer.timer, 683 hrtimer_start(&apic->lapic_timer.timer,
578 ktime_add_ns(now, apic->lapic_timer.period), 684 ktime_add_ns(now, apic->lapic_timer.period),
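
The clamp above keeps a guest from programming a periodic LAPIC timer faster than the host can reasonably service, since hrtimers are not throttled by the scheduler. The arithmetic is just a floor on the period; as a sketch, with the same 500 us limit as NSEC_PER_MSEC/2:

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL
#define MIN_PERIOD_NS (NSEC_PER_MSEC / 2)	/* 500 us floor, as in the hunk above */

static unsigned long long clamp_period(unsigned long long period_ns, int periodic)
{
	if (periodic && period_ns < MIN_PERIOD_NS)
		return MIN_PERIOD_NS;
	return period_ns;
}

int main(void)
{
	/* A guest asking for a 10 us periodic tick gets 500 us instead. */
	printf("%llu ns\n", clamp_period(10000, 1));
	/* One-shot timers are left alone. */
	printf("%llu ns\n", clamp_period(10000, 0));
	return 0;
}
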
@@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
603 apic->vcpu->kvm->arch.vapics_in_nmi_mode--; 709 apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
604} 710}
605 711
606static void apic_mmio_write(struct kvm_io_device *this, 712static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
607 gpa_t address, int len, const void *data)
608{ 713{
609 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 714 int ret = 0;
610 unsigned int offset = address - apic->base_address;
611 unsigned char alignment = offset & 0xf;
612 u32 val;
613
614 /*
615 * APIC register must be aligned on 128-bits boundary.
616 * 32/64/128 bits registers must be accessed thru 32 bits.
617 * Refer SDM 8.4.1
618 */
619 if (len != 4 || alignment) {
620 /* Don't shout loud, $infamous_os would cause only noise. */
621 apic_debug("apic write: bad size=%d %lx\n",
622 len, (long)address);
623 return;
624 }
625
626 val = *(u32 *) data;
627
628 /* too common printing */
629 if (offset != APIC_EOI)
630 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
631 "0x%x\n", __func__, offset, len, val);
632
633 offset &= 0xff0;
634 715
635 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); 716 trace_kvm_apic_write(reg, val);
636 717
637 switch (offset) { 718 switch (reg) {
638 case APIC_ID: /* Local APIC ID */ 719 case APIC_ID: /* Local APIC ID */
639 apic_set_reg(apic, APIC_ID, val); 720 if (!apic_x2apic_mode(apic))
721 apic_set_reg(apic, APIC_ID, val);
722 else
723 ret = 1;
640 break; 724 break;
641 725
642 case APIC_TASKPRI: 726 case APIC_TASKPRI:
@@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this,
649 break; 733 break;
650 734
651 case APIC_LDR: 735 case APIC_LDR:
652 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); 736 if (!apic_x2apic_mode(apic))
737 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
738 else
739 ret = 1;
653 break; 740 break;
654 741
655 case APIC_DFR: 742 case APIC_DFR:
656 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); 743 if (!apic_x2apic_mode(apic))
744 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
745 else
746 ret = 1;
657 break; 747 break;
658 748
659 case APIC_SPIV: 749 case APIC_SPIV: {
660 apic_set_reg(apic, APIC_SPIV, val & 0x3ff); 750 u32 mask = 0x3ff;
751 if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
752 mask |= APIC_SPIV_DIRECTED_EOI;
753 apic_set_reg(apic, APIC_SPIV, val & mask);
661 if (!(val & APIC_SPIV_APIC_ENABLED)) { 754 if (!(val & APIC_SPIV_APIC_ENABLED)) {
662 int i; 755 int i;
663 u32 lvt_val; 756 u32 lvt_val;
@@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
672 765
673 } 766 }
674 break; 767 break;
675 768 }
676 case APIC_ICR: 769 case APIC_ICR:
677 /* No delay here, so we always clear the pending bit */ 770 /* No delay here, so we always clear the pending bit */
678 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); 771 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
@@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
680 break; 773 break;
681 774
682 case APIC_ICR2: 775 case APIC_ICR2:
683 apic_set_reg(apic, APIC_ICR2, val & 0xff000000); 776 if (!apic_x2apic_mode(apic))
777 val &= 0xff000000;
778 apic_set_reg(apic, APIC_ICR2, val);
684 break; 779 break;
685 780
686 case APIC_LVT0: 781 case APIC_LVT0:
@@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
694 if (!apic_sw_enabled(apic)) 789 if (!apic_sw_enabled(apic))
695 val |= APIC_LVT_MASKED; 790 val |= APIC_LVT_MASKED;
696 791
697 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; 792 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
698 apic_set_reg(apic, offset, val); 793 apic_set_reg(apic, reg, val);
699 794
700 break; 795 break;
701 796
@@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
703 hrtimer_cancel(&apic->lapic_timer.timer); 798 hrtimer_cancel(&apic->lapic_timer.timer);
704 apic_set_reg(apic, APIC_TMICT, val); 799 apic_set_reg(apic, APIC_TMICT, val);
705 start_apic_timer(apic); 800 start_apic_timer(apic);
706 return; 801 break;
707 802
708 case APIC_TDCR: 803 case APIC_TDCR:
709 if (val & 4) 804 if (val & 4)
@@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this,
712 update_divide_count(apic); 807 update_divide_count(apic);
713 break; 808 break;
714 809
810 case APIC_ESR:
811 if (apic_x2apic_mode(apic) && val != 0) {
812 printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
813 ret = 1;
814 }
815 break;
816
817 case APIC_SELF_IPI:
818 if (apic_x2apic_mode(apic)) {
819 apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
820 } else
821 ret = 1;
822 break;
715 default: 823 default:
716 apic_debug("Local APIC Write to read-only register %x\n", 824 ret = 1;
717 offset);
718 break; 825 break;
719 } 826 }
720 827 if (ret)
828 apic_debug("Local APIC Write to read-only register %x\n", reg);
829 return ret;
721} 830}
722 831
723static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, 832static int apic_mmio_write(struct kvm_io_device *this,
724 int len, int size) 833 gpa_t address, int len, const void *data)
725{ 834{
726 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 835 struct kvm_lapic *apic = to_lapic(this);
727 int ret = 0; 836 unsigned int offset = address - apic->base_address;
837 u32 val;
728 838
839 if (!apic_mmio_in_range(apic, address))
840 return -EOPNOTSUPP;
729 841
730 if (apic_hw_enabled(apic) && 842 /*
731 (addr >= apic->base_address) && 843 * APIC register must be aligned on 128-bits boundary.
732 (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) 844 * 32/64/128 bits registers must be accessed thru 32 bits.
733 ret = 1; 845 * Refer SDM 8.4.1
846 */
847 if (len != 4 || (offset & 0xf)) {
848 /* Don't shout loud, $infamous_os would cause only noise. */
849 apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
850 return 0;
851 }
734 852
735 return ret; 853 val = *(u32*)data;
854
855 /* too common printing */
856 if (offset != APIC_EOI)
857 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
858 "0x%x\n", __func__, offset, len, val);
859
860 apic_reg_write(apic, offset & 0xff0, val);
861
862 return 0;
736} 863}
737 864
738void kvm_free_lapic(struct kvm_vcpu *vcpu) 865void kvm_free_lapic(struct kvm_vcpu *vcpu)
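
apic_mmio_write() is now a thin wrapper that validates the access (in range, 4 bytes, offset aligned to a register boundary) and hands the 32-bit value to apic_reg_write() with the offset masked down to a register index, so the same register core can be reused by a non-MMIO front end. A sketch of that validate-then-dispatch split with a made-up register core:

#include <stdint.h>
#include <stdio.h>

#define MMIO_BASE 0xfee00000u
#define MMIO_LEN  0x1000u

/* Register-level core; also reusable by another front end (e.g. MSR writes). */
static int reg_write(uint32_t reg, uint32_t val)
{
	printf("reg 0x%03x <= 0x%08x\n", reg, val);
	return 0;
}

static int mmio_write(uint32_t addr, int len, const void *data)
{
	uint32_t offset = addr - MMIO_BASE;

	if (addr < MMIO_BASE || addr >= MMIO_BASE + MMIO_LEN)
		return -1;			/* not ours; let another device claim it */
	if (len != 4 || (offset & 0xf))
		return 0;			/* silently ignore malformed accesses */

	return reg_write(offset & 0xff0, *(const uint32_t *)data);
}

int main(void)
{
	uint32_t val = 0xff;

	mmio_write(MMIO_BASE + 0x80, 4, &val);	/* e.g. the TPR register */
	return 0;
}
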
@@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
763 apic_set_tpr(apic, ((cr8 & 0x0f) << 4) 890 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
764 | (apic_get_reg(apic, APIC_TASKPRI) & 4)); 891 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
765} 892}
766EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
767 893
768u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 894u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
769{ 895{
@@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
776 902
777 return (tpr & 0xf0) >> 4; 903 return (tpr & 0xf0) >> 4;
778} 904}
779EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
780 905
781void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 906void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
782{ 907{
@@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
787 vcpu->arch.apic_base = value; 912 vcpu->arch.apic_base = value;
788 return; 913 return;
789 } 914 }
790 if (apic->vcpu->vcpu_id) 915
916 if (!kvm_vcpu_is_bsp(apic->vcpu))
791 value &= ~MSR_IA32_APICBASE_BSP; 917 value &= ~MSR_IA32_APICBASE_BSP;
792 918
793 vcpu->arch.apic_base = value; 919 vcpu->arch.apic_base = value;
920 if (apic_x2apic_mode(apic)) {
921 u32 id = kvm_apic_id(apic);
922 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
923 apic_set_reg(apic, APIC_LDR, ldr);
924 }
794 apic->base_address = apic->vcpu->arch.apic_base & 925 apic->base_address = apic->vcpu->arch.apic_base &
795 MSR_IA32_APICBASE_BASE; 926 MSR_IA32_APICBASE_BASE;
796 927
@@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
800 931
801} 932}
802 933
803u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
804{
805 return vcpu->arch.apic_base;
806}
807EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
808
809void kvm_lapic_reset(struct kvm_vcpu *vcpu) 934void kvm_lapic_reset(struct kvm_vcpu *vcpu)
810{ 935{
811 struct kvm_lapic *apic; 936 struct kvm_lapic *apic;
@@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
821 hrtimer_cancel(&apic->lapic_timer.timer); 946 hrtimer_cancel(&apic->lapic_timer.timer);
822 947
823 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 948 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
824 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 949 kvm_apic_set_version(apic->vcpu);
825 950
826 for (i = 0; i < APIC_LVT_NUM; i++) 951 for (i = 0; i < APIC_LVT_NUM; i++)
827 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); 952 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
@@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
842 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); 967 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
843 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 968 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
844 } 969 }
970 apic->irr_pending = false;
845 update_divide_count(apic); 971 update_divide_count(apic);
846 atomic_set(&apic->lapic_timer.pending, 0); 972 atomic_set(&apic->lapic_timer.pending, 0);
847 if (vcpu->vcpu_id == 0) 973 if (kvm_vcpu_is_bsp(vcpu))
848 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 974 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
849 apic_update_ppr(apic); 975 apic_update_ppr(apic);
850 976
@@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
855 vcpu, kvm_apic_id(apic), 981 vcpu, kvm_apic_id(apic),
856 vcpu->arch.apic_base, apic->base_address); 982 vcpu->arch.apic_base, apic->base_address);
857} 983}
858EXPORT_SYMBOL_GPL(kvm_lapic_reset);
859 984
860bool kvm_apic_present(struct kvm_vcpu *vcpu) 985bool kvm_apic_present(struct kvm_vcpu *vcpu)
861{ 986{
@@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
866{ 991{
867 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); 992 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
868} 993}
869EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
870 994
871/* 995/*
872 *---------------------------------------------------------------------- 996 *----------------------------------------------------------------------
@@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = {
917 .is_periodic = lapic_is_periodic, 1041 .is_periodic = lapic_is_periodic,
918}; 1042};
919 1043
1044static const struct kvm_io_device_ops apic_mmio_ops = {
1045 .read = apic_mmio_read,
1046 .write = apic_mmio_write,
1047};
1048
920int kvm_create_lapic(struct kvm_vcpu *vcpu) 1049int kvm_create_lapic(struct kvm_vcpu *vcpu)
921{ 1050{
922 struct kvm_lapic *apic; 1051 struct kvm_lapic *apic;
@@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
945 apic->lapic_timer.timer.function = kvm_timer_fn; 1074 apic->lapic_timer.timer.function = kvm_timer_fn;
946 apic->lapic_timer.t_ops = &lapic_timer_ops; 1075 apic->lapic_timer.t_ops = &lapic_timer_ops;
947 apic->lapic_timer.kvm = vcpu->kvm; 1076 apic->lapic_timer.kvm = vcpu->kvm;
948 apic->lapic_timer.vcpu_id = vcpu->vcpu_id; 1077 apic->lapic_timer.vcpu = vcpu;
949 1078
950 apic->base_address = APIC_DEFAULT_PHYS_BASE; 1079 apic->base_address = APIC_DEFAULT_PHYS_BASE;
951 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 1080 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
952 1081
953 kvm_lapic_reset(vcpu); 1082 kvm_lapic_reset(vcpu);
954 apic->dev.read = apic_mmio_read; 1083 kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
955 apic->dev.write = apic_mmio_write;
956 apic->dev.in_range = apic_mmio_range;
957 apic->dev.private = apic;
958 1084
959 return 0; 1085 return 0;
960nomem_free_apic: 1086nomem_free_apic:
@@ -962,7 +1088,6 @@ nomem_free_apic:
962nomem: 1088nomem:
963 return -ENOMEM; 1089 return -ENOMEM;
964} 1090}
965EXPORT_SYMBOL_GPL(kvm_create_lapic);
966 1091
967int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) 1092int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
968{ 1093{
@@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
985 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1110 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
986 int r = 0; 1111 int r = 0;
987 1112
988 if (vcpu->vcpu_id == 0) { 1113 if (kvm_vcpu_is_bsp(vcpu)) {
989 if (!apic_hw_enabled(vcpu->arch.apic)) 1114 if (!apic_hw_enabled(vcpu->arch.apic))
990 r = 1; 1115 r = 1;
991 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1116 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
@@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1025 1150
1026 apic->base_address = vcpu->arch.apic_base & 1151 apic->base_address = vcpu->arch.apic_base &
1027 MSR_IA32_APICBASE_BASE; 1152 MSR_IA32_APICBASE_BASE;
1028 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1153 kvm_apic_set_version(vcpu);
1154
1029 apic_update_ppr(apic); 1155 apic_update_ppr(apic);
1030 hrtimer_cancel(&apic->lapic_timer.timer); 1156 hrtimer_cancel(&apic->lapic_timer.timer);
1031 update_divide_count(apic); 1157 update_divide_count(apic);
@@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1092 1218
1093 vcpu->arch.apic->vapic_addr = vapic_addr; 1219 vcpu->arch.apic->vapic_addr = vapic_addr;
1094} 1220}
1221
1222int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1223{
1224 struct kvm_lapic *apic = vcpu->arch.apic;
1225 u32 reg = (msr - APIC_BASE_MSR) << 4;
1226
1227 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
1228 return 1;
1229
1230 /* if this is ICR write vector before command */
1231 if (msr == 0x830)
1232 apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
1233 return apic_reg_write(apic, reg, (u32)data);
1234}
1235
1236int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1237{
1238 struct kvm_lapic *apic = vcpu->arch.apic;
1239 u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
1240
1241 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
1242 return 1;
1243
1244 if (apic_reg_read(apic, reg, 4, &low))
1245 return 1;
1246 if (msr == 0x830)
1247 apic_reg_read(apic, APIC_ICR2, 4, &high);
1248
1249 *data = (((u64)high) << 32) | low;
1250
1251 return 0;
1252}
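
The x2APIC helpers map an MSR in the 0x800 range onto the corresponding MMIO-style register offset with reg = (msr - APIC_BASE_MSR) << 4, and treat the 64-bit ICR MSR (0x830) as the ICR2/ICR pair. The offset arithmetic in isolation:

#include <stdint.h>
#include <stdio.h>

#define APIC_BASE_MSR 0x800u	/* first MSR of the x2APIC range */

static uint32_t x2apic_msr_to_reg(uint32_t msr)
{
	/* APIC registers sit 16 bytes apart in the MMIO map, hence the shift by 4. */
	return (msr - APIC_BASE_MSR) << 4;
}

int main(void)
{
	printf("MSR 0x802 (ID)  -> reg 0x%03x\n", x2apic_msr_to_reg(0x802)); /* 0x020 */
	printf("MSR 0x80b (EOI) -> reg 0x%03x\n", x2apic_msr_to_reg(0x80b)); /* 0x0b0 */
	printf("MSR 0x830 (ICR) -> reg 0x%03x\n", x2apic_msr_to_reg(0x830)); /* 0x300 */
	return 0;
}
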
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a587f8349c46..40010b09c4aa 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,7 @@ struct kvm_lapic {
12 struct kvm_timer lapic_timer; 12 struct kvm_timer lapic_timer;
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending;
15 struct page *regs_page; 16 struct page *regs_page;
16 void *regs; 17 void *regs;
17 gpa_t vapic_addr; 18 gpa_t vapic_addr;
@@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
28void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 29void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
29void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 30void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
30u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
32void kvm_apic_set_version(struct kvm_vcpu *vcpu);
31 33
32int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 34int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
33int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 35int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
@@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
44void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 46void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
45void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); 47void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
46 48
49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
47#endif 51#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0ef5bb2b4043..eca41ae9f453 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
23#include <linux/types.h> 24#include <linux/types.h>
@@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644);
107 108
108#define PT32_LEVEL_MASK(level) \ 109#define PT32_LEVEL_MASK(level) \
109 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) 110 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
111#define PT32_LVL_OFFSET_MASK(level) \
112 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
113 * PT32_LEVEL_BITS))) - 1))
110 114
111#define PT32_INDEX(address, level)\ 115#define PT32_INDEX(address, level)\
112 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 116 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
@@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644);
115#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 119#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
116#define PT64_DIR_BASE_ADDR_MASK \ 120#define PT64_DIR_BASE_ADDR_MASK \
117 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 121 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
122#define PT64_LVL_ADDR_MASK(level) \
123 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
124 * PT64_LEVEL_BITS))) - 1))
125#define PT64_LVL_OFFSET_MASK(level) \
126 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
127 * PT64_LEVEL_BITS))) - 1))
118 128
119#define PT32_BASE_ADDR_MASK PAGE_MASK 129#define PT32_BASE_ADDR_MASK PAGE_MASK
120#define PT32_DIR_BASE_ADDR_MASK \ 130#define PT32_DIR_BASE_ADDR_MASK \
121 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) 131 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
132#define PT32_LVL_ADDR_MASK(level) \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
134 * PT32_LEVEL_BITS))) - 1))
122 135
123#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
124 | PT64_NX_MASK) 137 | PT64_NX_MASK)
@@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644);
129#define PFERR_RSVD_MASK (1U << 3) 142#define PFERR_RSVD_MASK (1U << 3)
130#define PFERR_FETCH_MASK (1U << 4) 143#define PFERR_FETCH_MASK (1U << 4)
131 144
145#define PT_PDPE_LEVEL 3
132#define PT_DIRECTORY_LEVEL 2 146#define PT_DIRECTORY_LEVEL 2
133#define PT_PAGE_TABLE_LEVEL 1 147#define PT_PAGE_TABLE_LEVEL 1
134 148
@@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644);
139#define ACC_USER_MASK PT_USER_MASK 153#define ACC_USER_MASK PT_USER_MASK
140#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
141 155
156#define CREATE_TRACE_POINTS
157#include "mmutrace.h"
158
142#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 159#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
143 160
144struct kvm_rmap_desc { 161struct kvm_rmap_desc {
145 u64 *shadow_ptes[RMAP_EXT]; 162 u64 *sptes[RMAP_EXT];
146 struct kvm_rmap_desc *more; 163 struct kvm_rmap_desc *more;
147}; 164};
148 165
@@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte)
239 return pte & PT_WRITABLE_MASK; 256 return pte & PT_WRITABLE_MASK;
240} 257}
241 258
242static int is_dirty_pte(unsigned long pte) 259static int is_dirty_gpte(unsigned long pte)
243{ 260{
244 return pte & shadow_dirty_mask; 261 return pte & PT_DIRTY_MASK;
245} 262}
246 263
247static int is_rmap_pte(u64 pte) 264static int is_rmap_spte(u64 pte)
248{ 265{
249 return is_shadow_present_pte(pte); 266 return is_shadow_present_pte(pte);
250} 267}
251 268
269static int is_last_spte(u64 pte, int level)
270{
271 if (level == PT_PAGE_TABLE_LEVEL)
272 return 1;
273 if (is_large_pte(pte))
274 return 1;
275 return 0;
276}
277
252static pfn_t spte_to_pfn(u64 pte) 278static pfn_t spte_to_pfn(u64 pte)
253{ 279{
254 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 280 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte)
261 return (gpte & PT32_DIR_PSE36_MASK) << shift; 287 return (gpte & PT32_DIR_PSE36_MASK) << shift;
262} 288}
263 289
264static void set_shadow_pte(u64 *sptep, u64 spte) 290static void __set_spte(u64 *sptep, u64 spte)
265{ 291{
266#ifdef CONFIG_X86_64 292#ifdef CONFIG_X86_64
267 set_64bit((unsigned long *)sptep, spte); 293 set_64bit((unsigned long *)sptep, spte);
@@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
380 * Return the pointer to the largepage write count for a given 406 * Return the pointer to the largepage write count for a given
381 * gfn, handling slots that are not large page aligned. 407 * gfn, handling slots that are not large page aligned.
382 */ 408 */
383static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) 409static int *slot_largepage_idx(gfn_t gfn,
410 struct kvm_memory_slot *slot,
411 int level)
384{ 412{
385 unsigned long idx; 413 unsigned long idx;
386 414
387 idx = (gfn / KVM_PAGES_PER_HPAGE) - 415 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
388 (slot->base_gfn / KVM_PAGES_PER_HPAGE); 416 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
389 return &slot->lpage_info[idx].write_count; 417 return &slot->lpage_info[level - 2][idx].write_count;
390} 418}
391 419
392static void account_shadowed(struct kvm *kvm, gfn_t gfn) 420static void account_shadowed(struct kvm *kvm, gfn_t gfn)
393{ 421{
422 struct kvm_memory_slot *slot;
394 int *write_count; 423 int *write_count;
424 int i;
395 425
396 gfn = unalias_gfn(kvm, gfn); 426 gfn = unalias_gfn(kvm, gfn);
397 write_count = slot_largepage_idx(gfn, 427
398 gfn_to_memslot_unaliased(kvm, gfn)); 428 slot = gfn_to_memslot_unaliased(kvm, gfn);
399 *write_count += 1; 429 for (i = PT_DIRECTORY_LEVEL;
430 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
431 write_count = slot_largepage_idx(gfn, slot, i);
432 *write_count += 1;
433 }
400} 434}
401 435
402static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 436static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
403{ 437{
438 struct kvm_memory_slot *slot;
404 int *write_count; 439 int *write_count;
440 int i;
405 441
406 gfn = unalias_gfn(kvm, gfn); 442 gfn = unalias_gfn(kvm, gfn);
407 write_count = slot_largepage_idx(gfn, 443 for (i = PT_DIRECTORY_LEVEL;
408 gfn_to_memslot_unaliased(kvm, gfn)); 444 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
409 *write_count -= 1; 445 slot = gfn_to_memslot_unaliased(kvm, gfn);
410 WARN_ON(*write_count < 0); 446 write_count = slot_largepage_idx(gfn, slot, i);
447 *write_count -= 1;
448 WARN_ON(*write_count < 0);
449 }
411} 450}
412 451
413static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) 452static int has_wrprotected_page(struct kvm *kvm,
453 gfn_t gfn,
454 int level)
414{ 455{
415 struct kvm_memory_slot *slot; 456 struct kvm_memory_slot *slot;
416 int *largepage_idx; 457 int *largepage_idx;
@@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
418 gfn = unalias_gfn(kvm, gfn); 459 gfn = unalias_gfn(kvm, gfn);
419 slot = gfn_to_memslot_unaliased(kvm, gfn); 460 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 if (slot) { 461 if (slot) {
421 largepage_idx = slot_largepage_idx(gfn, slot); 462 largepage_idx = slot_largepage_idx(gfn, slot, level);
422 return *largepage_idx; 463 return *largepage_idx;
423 } 464 }
424 465
425 return 1; 466 return 1;
426} 467}
427 468
428static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) 469static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
429{ 470{
471 unsigned long page_size = PAGE_SIZE;
430 struct vm_area_struct *vma; 472 struct vm_area_struct *vma;
431 unsigned long addr; 473 unsigned long addr;
432 int ret = 0; 474 int i, ret = 0;
433 475
434 addr = gfn_to_hva(kvm, gfn); 476 addr = gfn_to_hva(kvm, gfn);
435 if (kvm_is_error_hva(addr)) 477 if (kvm_is_error_hva(addr))
436 return ret; 478 return page_size;
437 479
438 down_read(&current->mm->mmap_sem); 480 down_read(&current->mm->mmap_sem);
439 vma = find_vma(current->mm, addr); 481 vma = find_vma(current->mm, addr);
440 if (vma && is_vm_hugetlb_page(vma)) 482 if (!vma)
441 ret = 1; 483 goto out;
484
485 page_size = vma_kernel_pagesize(vma);
486
487out:
442 up_read(&current->mm->mmap_sem); 488 up_read(&current->mm->mmap_sem);
443 489
490 for (i = PT_PAGE_TABLE_LEVEL;
491 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
492 if (page_size >= KVM_HPAGE_SIZE(i))
493 ret = i;
494 else
495 break;
496 }
497
444 return ret; 498 return ret;
445} 499}
446 500
447static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) 501static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
448{ 502{
449 struct kvm_memory_slot *slot; 503 struct kvm_memory_slot *slot;
450 504 int host_level;
451 if (has_wrprotected_page(vcpu->kvm, large_gfn)) 505 int level = PT_PAGE_TABLE_LEVEL;
452 return 0;
453
454 if (!host_largepage_backed(vcpu->kvm, large_gfn))
455 return 0;
456 506
457 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 507 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
458 if (slot && slot->dirty_bitmap) 508 if (slot && slot->dirty_bitmap)
459 return 0; 509 return PT_PAGE_TABLE_LEVEL;
460 510
461 return 1; 511 host_level = host_mapping_level(vcpu->kvm, large_gfn);
512
513 if (host_level == PT_PAGE_TABLE_LEVEL)
514 return host_level;
515
516 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
517
518 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
519 break;
520 }
521
522 return level - 1;
462} 523}
463 524
464/* 525/*
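
host_mapping_level() converts the VMA's backing page size into the largest paging level KVM may use, and mapping_level() then walks down from that while any candidate level is write-protected. A sketch of the size-to-level step, assuming the usual x86 4K/2M/1G sizes for levels 1 through 3:

#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL 1
#define KVM_NR_PAGE_SIZES   3			/* 4K, 2M, 1G on x86 */
#define PAGE_SHIFT          12
/* Assumed to mirror KVM_HPAGE_SIZE(): each level adds 9 address bits. */
#define HPAGE_SIZE(level)   (1UL << (PAGE_SHIFT + ((level) - 1) * 9))

static int host_mapping_level(unsigned long host_page_size)
{
	int level, ret = PT_PAGE_TABLE_LEVEL;

	for (level = PT_PAGE_TABLE_LEVEL;
	     level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++level) {
		if (host_page_size >= HPAGE_SIZE(level))
			ret = level;		/* host backing is big enough for this level */
		else
			break;
	}
	return ret;
}

int main(void)
{
	printf("4K backing -> level %d\n", host_mapping_level(4096UL));
	printf("2M backing -> level %d\n", host_mapping_level(2UL << 20));
	printf("1G backing -> level %d\n", host_mapping_level(1UL << 30));
	return 0;
}
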
@@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
466 * Note: gfn must be unaliased before this function get called 527 * Note: gfn must be unaliased before this function get called
467 */ 528 */
468 529
469static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) 530static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
470{ 531{
471 struct kvm_memory_slot *slot; 532 struct kvm_memory_slot *slot;
472 unsigned long idx; 533 unsigned long idx;
473 534
474 slot = gfn_to_memslot(kvm, gfn); 535 slot = gfn_to_memslot(kvm, gfn);
475 if (!lpage) 536 if (likely(level == PT_PAGE_TABLE_LEVEL))
476 return &slot->rmap[gfn - slot->base_gfn]; 537 return &slot->rmap[gfn - slot->base_gfn];
477 538
478 idx = (gfn / KVM_PAGES_PER_HPAGE) - 539 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
479 (slot->base_gfn / KVM_PAGES_PER_HPAGE); 540 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
480 541
481 return &slot->lpage_info[idx].rmap_pde; 542 return &slot->lpage_info[level - 2][idx].rmap_pde;
482} 543}
483 544
484/* 545/*
@@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
494 * the spte was not added. 555 * the spte was not added.
495 * 556 *
496 */ 557 */
497static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) 558static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
498{ 559{
499 struct kvm_mmu_page *sp; 560 struct kvm_mmu_page *sp;
500 struct kvm_rmap_desc *desc; 561 struct kvm_rmap_desc *desc;
501 unsigned long *rmapp; 562 unsigned long *rmapp;
502 int i, count = 0; 563 int i, count = 0;
503 564
504 if (!is_rmap_pte(*spte)) 565 if (!is_rmap_spte(*spte))
505 return count; 566 return count;
506 gfn = unalias_gfn(vcpu->kvm, gfn); 567 gfn = unalias_gfn(vcpu->kvm, gfn);
507 sp = page_header(__pa(spte)); 568 sp = page_header(__pa(spte));
508 sp->gfns[spte - sp->spt] = gfn; 569 sp->gfns[spte - sp->spt] = gfn;
509 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); 570 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
510 if (!*rmapp) { 571 if (!*rmapp) {
511 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 572 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
512 *rmapp = (unsigned long)spte; 573 *rmapp = (unsigned long)spte;
513 } else if (!(*rmapp & 1)) { 574 } else if (!(*rmapp & 1)) {
514 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); 575 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
515 desc = mmu_alloc_rmap_desc(vcpu); 576 desc = mmu_alloc_rmap_desc(vcpu);
516 desc->shadow_ptes[0] = (u64 *)*rmapp; 577 desc->sptes[0] = (u64 *)*rmapp;
517 desc->shadow_ptes[1] = spte; 578 desc->sptes[1] = spte;
518 *rmapp = (unsigned long)desc | 1; 579 *rmapp = (unsigned long)desc | 1;
519 } else { 580 } else {
520 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 581 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
521 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 582 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
522 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { 583 while (desc->sptes[RMAP_EXT-1] && desc->more) {
523 desc = desc->more; 584 desc = desc->more;
524 count += RMAP_EXT; 585 count += RMAP_EXT;
525 } 586 }
526 if (desc->shadow_ptes[RMAP_EXT-1]) { 587 if (desc->sptes[RMAP_EXT-1]) {
527 desc->more = mmu_alloc_rmap_desc(vcpu); 588 desc->more = mmu_alloc_rmap_desc(vcpu);
528 desc = desc->more; 589 desc = desc->more;
529 } 590 }
530 for (i = 0; desc->shadow_ptes[i]; ++i) 591 for (i = 0; desc->sptes[i]; ++i)
531 ; 592 ;
532 desc->shadow_ptes[i] = spte; 593 desc->sptes[i] = spte;
533 } 594 }
534 return count; 595 return count;
535} 596}
@@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
541{ 602{
542 int j; 603 int j;
543 604
544 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) 605 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
545 ; 606 ;
546 desc->shadow_ptes[i] = desc->shadow_ptes[j]; 607 desc->sptes[i] = desc->sptes[j];
547 desc->shadow_ptes[j] = NULL; 608 desc->sptes[j] = NULL;
548 if (j != 0) 609 if (j != 0)
549 return; 610 return;
550 if (!prev_desc && !desc->more) 611 if (!prev_desc && !desc->more)
551 *rmapp = (unsigned long)desc->shadow_ptes[0]; 612 *rmapp = (unsigned long)desc->sptes[0];
552 else 613 else
553 if (prev_desc) 614 if (prev_desc)
554 prev_desc->more = desc->more; 615 prev_desc->more = desc->more;
@@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
566 unsigned long *rmapp; 627 unsigned long *rmapp;
567 int i; 628 int i;
568 629
569 if (!is_rmap_pte(*spte)) 630 if (!is_rmap_spte(*spte))
570 return; 631 return;
571 sp = page_header(__pa(spte)); 632 sp = page_header(__pa(spte));
572 pfn = spte_to_pfn(*spte); 633 pfn = spte_to_pfn(*spte);
@@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
576 kvm_release_pfn_dirty(pfn); 637 kvm_release_pfn_dirty(pfn);
577 else 638 else
578 kvm_release_pfn_clean(pfn); 639 kvm_release_pfn_clean(pfn);
579 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); 640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
580 if (!*rmapp) { 641 if (!*rmapp) {
581 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 642 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
582 BUG(); 643 BUG();
@@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
593 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 654 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
594 prev_desc = NULL; 655 prev_desc = NULL;
595 while (desc) { 656 while (desc) {
596 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) 657 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
597 if (desc->shadow_ptes[i] == spte) { 658 if (desc->sptes[i] == spte) {
598 rmap_desc_remove_entry(rmapp, 659 rmap_desc_remove_entry(rmapp,
599 desc, i, 660 desc, i,
600 prev_desc); 661 prev_desc);
@@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
625 prev_desc = NULL; 686 prev_desc = NULL;
626 prev_spte = NULL; 687 prev_spte = NULL;
627 while (desc) { 688 while (desc) {
628 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { 689 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
629 if (prev_spte == spte) 690 if (prev_spte == spte)
630 return desc->shadow_ptes[i]; 691 return desc->sptes[i];
631 prev_spte = desc->shadow_ptes[i]; 692 prev_spte = desc->sptes[i];
632 } 693 }
633 desc = desc->more; 694 desc = desc->more;
634 } 695 }
@@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
639{ 700{
640 unsigned long *rmapp; 701 unsigned long *rmapp;
641 u64 *spte; 702 u64 *spte;
642 int write_protected = 0; 703 int i, write_protected = 0;
643 704
644 gfn = unalias_gfn(kvm, gfn); 705 gfn = unalias_gfn(kvm, gfn);
645 rmapp = gfn_to_rmap(kvm, gfn, 0); 706 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
646 707
647 spte = rmap_next(kvm, rmapp, NULL); 708 spte = rmap_next(kvm, rmapp, NULL);
648 while (spte) { 709 while (spte) {
@@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
650 BUG_ON(!(*spte & PT_PRESENT_MASK)); 711 BUG_ON(!(*spte & PT_PRESENT_MASK));
651 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 712 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
652 if (is_writeble_pte(*spte)) { 713 if (is_writeble_pte(*spte)) {
653 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); 714 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
654 write_protected = 1; 715 write_protected = 1;
655 } 716 }
656 spte = rmap_next(kvm, rmapp, spte); 717 spte = rmap_next(kvm, rmapp, spte);
@@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
664 } 725 }
665 726
666 /* check for huge page mappings */ 727 /* check for huge page mappings */
667 rmapp = gfn_to_rmap(kvm, gfn, 1); 728 for (i = PT_DIRECTORY_LEVEL;
668 spte = rmap_next(kvm, rmapp, NULL); 729 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
669 while (spte) { 730 rmapp = gfn_to_rmap(kvm, gfn, i);
670 BUG_ON(!spte); 731 spte = rmap_next(kvm, rmapp, NULL);
671 BUG_ON(!(*spte & PT_PRESENT_MASK)); 732 while (spte) {
672 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 733 BUG_ON(!spte);
673 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 734 BUG_ON(!(*spte & PT_PRESENT_MASK));
674 if (is_writeble_pte(*spte)) { 735 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
675 rmap_remove(kvm, spte); 736 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
676 --kvm->stat.lpages; 737 if (is_writeble_pte(*spte)) {
677 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 738 rmap_remove(kvm, spte);
678 spte = NULL; 739 --kvm->stat.lpages;
679 write_protected = 1; 740 __set_spte(spte, shadow_trap_nonpresent_pte);
741 spte = NULL;
742 write_protected = 1;
743 }
744 spte = rmap_next(kvm, rmapp, spte);
680 } 745 }
681 spte = rmap_next(kvm, rmapp, spte);
682 } 746 }
683 747
684 return write_protected; 748 return write_protected;
@@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
693 BUG_ON(!(*spte & PT_PRESENT_MASK)); 757 BUG_ON(!(*spte & PT_PRESENT_MASK));
694 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 758 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
695 rmap_remove(kvm, spte); 759 rmap_remove(kvm, spte);
696 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 760 __set_spte(spte, shadow_trap_nonpresent_pte);
697 need_tlb_flush = 1; 761 need_tlb_flush = 1;
698 } 762 }
699 return need_tlb_flush; 763 return need_tlb_flush;
@@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
702static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 766static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
703 int (*handler)(struct kvm *kvm, unsigned long *rmapp)) 767 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
704{ 768{
705 int i; 769 int i, j;
706 int retval = 0; 770 int retval = 0;
707 771
708 /* 772 /*
@@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
721 end = start + (memslot->npages << PAGE_SHIFT); 785 end = start + (memslot->npages << PAGE_SHIFT);
722 if (hva >= start && hva < end) { 786 if (hva >= start && hva < end) {
723 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 787 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
788
724 retval |= handler(kvm, &memslot->rmap[gfn_offset]); 789 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
725 retval |= handler(kvm, 790
726 &memslot->lpage_info[ 791 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
727 gfn_offset / 792 int idx = gfn_offset;
728 KVM_PAGES_PER_HPAGE].rmap_pde); 793 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
794 retval |= handler(kvm,
795 &memslot->lpage_info[j][idx].rmap_pde);
796 }
729 } 797 }
730 } 798 }
731 799
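The loop introduced above replaces the single rmap_pde lookup with one lookup per huge-page size; the index arithmetic is simply the gfn offset divided by the number of base pages covered at that level. A minimal standalone sketch of that arithmetic follows; the divisors 512 and 262144 assume the usual 4 KiB / 2 MiB / 1 GiB x86 page sizes (i.e. KVM_PAGES_PER_HPAGE(2) == 512 and KVM_PAGES_PER_HPAGE(3) == 262144) and are not taken from this patch.

#include <stdio.h>

int main(void)
{
	unsigned long gfn_offset = 0x12345;	/* offset of the faulting gfn within its memslot */

	/* assumed: 2 MiB entry spans 512 gfns, 1 GiB entry spans 262144 gfns */
	printf("2M index %lu\n", gfn_offset / 512);	/* 145 -> lpage_info[0][145] */
	printf("1G index %lu\n", gfn_offset / 262144);	/* 0   -> lpage_info[1][0]   */
	return 0;
}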
@@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
763 831
764#define RMAP_RECYCLE_THRESHOLD 1000 832#define RMAP_RECYCLE_THRESHOLD 1000
765 833
766static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) 834static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
767{ 835{
768 unsigned long *rmapp; 836 unsigned long *rmapp;
837 struct kvm_mmu_page *sp;
838
839 sp = page_header(__pa(spte));
769 840
770 gfn = unalias_gfn(vcpu->kvm, gfn); 841 gfn = unalias_gfn(vcpu->kvm, gfn);
771 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); 842 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
772 843
773 kvm_unmap_rmapp(vcpu->kvm, rmapp); 844 kvm_unmap_rmapp(vcpu->kvm, rmapp);
774 kvm_flush_remote_tlbs(vcpu->kvm); 845 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1109 return 1; 1180 return 1;
1110 } 1181 }
1111 1182
1183 trace_kvm_mmu_sync_page(sp);
1112 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1184 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1113 kvm_flush_remote_tlbs(vcpu->kvm); 1185 kvm_flush_remote_tlbs(vcpu->kvm);
1114 kvm_unlink_unsync_page(vcpu->kvm, sp); 1186 kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1231 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1303 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1232 role.quadrant = quadrant; 1304 role.quadrant = quadrant;
1233 } 1305 }
1234 pgprintk("%s: looking gfn %lx role %x\n", __func__,
1235 gfn, role.word);
1236 index = kvm_page_table_hashfn(gfn); 1306 index = kvm_page_table_hashfn(gfn);
1237 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1307 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1238 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1308 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
@@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1249 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1319 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1250 kvm_mmu_mark_parents_unsync(vcpu, sp); 1320 kvm_mmu_mark_parents_unsync(vcpu, sp);
1251 } 1321 }
1252 pgprintk("%s: found\n", __func__); 1322 trace_kvm_mmu_get_page(sp, false);
1253 return sp; 1323 return sp;
1254 } 1324 }
1255 ++vcpu->kvm->stat.mmu_cache_miss; 1325 ++vcpu->kvm->stat.mmu_cache_miss;
1256 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1326 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
1257 if (!sp) 1327 if (!sp)
1258 return sp; 1328 return sp;
1259 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1260 sp->gfn = gfn; 1329 sp->gfn = gfn;
1261 sp->role = role; 1330 sp->role = role;
1262 hlist_add_head(&sp->hash_link, bucket); 1331 hlist_add_head(&sp->hash_link, bucket);
@@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1269 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1338 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1270 else 1339 else
1271 nonpaging_prefetch_page(vcpu, sp); 1340 nonpaging_prefetch_page(vcpu, sp);
1341 trace_kvm_mmu_get_page(sp, true);
1272 return sp; 1342 return sp;
1273} 1343}
1274 1344
@@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1292{ 1362{
1293 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1363 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1294 return false; 1364 return false;
1365
1366 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1367 if (is_large_pte(*iterator->sptep))
1368 return false;
1369
1295 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1370 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1296 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1371 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1297 return true; 1372 return true;
@@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1312 1387
1313 pt = sp->spt; 1388 pt = sp->spt;
1314 1389
1315 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1316 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1317 if (is_shadow_present_pte(pt[i]))
1318 rmap_remove(kvm, &pt[i]);
1319 pt[i] = shadow_trap_nonpresent_pte;
1320 }
1321 return;
1322 }
1323
1324 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1390 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1325 ent = pt[i]; 1391 ent = pt[i];
1326 1392
1327 if (is_shadow_present_pte(ent)) { 1393 if (is_shadow_present_pte(ent)) {
1328 if (!is_large_pte(ent)) { 1394 if (!is_last_spte(ent, sp->role.level)) {
1329 ent &= PT64_BASE_ADDR_MASK; 1395 ent &= PT64_BASE_ADDR_MASK;
1330 mmu_page_remove_parent_pte(page_header(ent), 1396 mmu_page_remove_parent_pte(page_header(ent),
1331 &pt[i]); 1397 &pt[i]);
1332 } else { 1398 } else {
1333 --kvm->stat.lpages; 1399 if (is_large_pte(ent))
1400 --kvm->stat.lpages;
1334 rmap_remove(kvm, &pt[i]); 1401 rmap_remove(kvm, &pt[i]);
1335 } 1402 }
1336 } 1403 }
@@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1346static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) 1413static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1347{ 1414{
1348 int i; 1415 int i;
1416 struct kvm_vcpu *vcpu;
1349 1417
1350 for (i = 0; i < KVM_MAX_VCPUS; ++i) 1418 kvm_for_each_vcpu(i, vcpu, kvm)
1351 if (kvm->vcpus[i]) 1419 vcpu->arch.last_pte_updated = NULL;
1352 kvm->vcpus[i]->arch.last_pte_updated = NULL;
1353} 1420}
1354 1421
1355static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 1422static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1368 } 1435 }
1369 BUG_ON(!parent_pte); 1436 BUG_ON(!parent_pte);
1370 kvm_mmu_put_page(sp, parent_pte); 1437 kvm_mmu_put_page(sp, parent_pte);
1371 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); 1438 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1372 } 1439 }
1373} 1440}
1374 1441
@@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1400static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1467static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1401{ 1468{
1402 int ret; 1469 int ret;
1470
1471 trace_kvm_mmu_zap_page(sp);
1403 ++kvm->stat.mmu_shadow_zapped; 1472 ++kvm->stat.mmu_shadow_zapped;
1404 ret = mmu_zap_unsync_children(kvm, sp); 1473 ret = mmu_zap_unsync_children(kvm, sp);
1405 kvm_mmu_page_unlink_children(kvm, sp); 1474 kvm_mmu_page_unlink_children(kvm, sp);
@@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1516 1585
1517 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1586 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1518 if (pt[i] == shadow_notrap_nonpresent_pte) 1587 if (pt[i] == shadow_notrap_nonpresent_pte)
1519 set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); 1588 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1520 } 1589 }
1521} 1590}
1522 1591
@@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1646 struct kvm_mmu_page *s; 1715 struct kvm_mmu_page *s;
1647 struct hlist_node *node, *n; 1716 struct hlist_node *node, *n;
1648 1717
1718 trace_kvm_mmu_unsync_page(sp);
1649 index = kvm_page_table_hashfn(sp->gfn); 1719 index = kvm_page_table_hashfn(sp->gfn);
1650 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1720 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1651 /* don't unsync if pagetable is shadowed with multiple roles */ 1721 /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1682 return 0; 1752 return 0;
1683} 1753}
1684 1754
1685static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1755static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1686 unsigned pte_access, int user_fault, 1756 unsigned pte_access, int user_fault,
1687 int write_fault, int dirty, int largepage, 1757 int write_fault, int dirty, int level,
1688 gfn_t gfn, pfn_t pfn, bool speculative, 1758 gfn_t gfn, pfn_t pfn, bool speculative,
1689 bool can_unsync) 1759 bool can_unsync)
1690{ 1760{
@@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1707 spte |= shadow_nx_mask; 1777 spte |= shadow_nx_mask;
1708 if (pte_access & ACC_USER_MASK) 1778 if (pte_access & ACC_USER_MASK)
1709 spte |= shadow_user_mask; 1779 spte |= shadow_user_mask;
1710 if (largepage) 1780 if (level > PT_PAGE_TABLE_LEVEL)
1711 spte |= PT_PAGE_SIZE_MASK; 1781 spte |= PT_PAGE_SIZE_MASK;
1712 if (tdp_enabled) 1782 if (tdp_enabled)
1713 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1783 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
@@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1718 if ((pte_access & ACC_WRITE_MASK) 1788 if ((pte_access & ACC_WRITE_MASK)
1719 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1789 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1720 1790
1721 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { 1791 if (level > PT_PAGE_TABLE_LEVEL &&
1792 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1722 ret = 1; 1793 ret = 1;
1723 spte = shadow_trap_nonpresent_pte; 1794 spte = shadow_trap_nonpresent_pte;
1724 goto set_pte; 1795 goto set_pte;
@@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1732 * is responsibility of mmu_get_page / kvm_sync_page. 1803 * is responsibility of mmu_get_page / kvm_sync_page.
1733 * Same reasoning can be applied to dirty page accounting. 1804 * Same reasoning can be applied to dirty page accounting.
1734 */ 1805 */
1735 if (!can_unsync && is_writeble_pte(*shadow_pte)) 1806 if (!can_unsync && is_writeble_pte(*sptep))
1736 goto set_pte; 1807 goto set_pte;
1737 1808
1738 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1809 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1749 mark_page_dirty(vcpu->kvm, gfn); 1820 mark_page_dirty(vcpu->kvm, gfn);
1750 1821
1751set_pte: 1822set_pte:
1752 set_shadow_pte(shadow_pte, spte); 1823 __set_spte(sptep, spte);
1753 return ret; 1824 return ret;
1754} 1825}
1755 1826
1756static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1827static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1757 unsigned pt_access, unsigned pte_access, 1828 unsigned pt_access, unsigned pte_access,
1758 int user_fault, int write_fault, int dirty, 1829 int user_fault, int write_fault, int dirty,
1759 int *ptwrite, int largepage, gfn_t gfn, 1830 int *ptwrite, int level, gfn_t gfn,
1760 pfn_t pfn, bool speculative) 1831 pfn_t pfn, bool speculative)
1761{ 1832{
1762 int was_rmapped = 0; 1833 int was_rmapped = 0;
1763 int was_writeble = is_writeble_pte(*shadow_pte); 1834 int was_writeble = is_writeble_pte(*sptep);
1764 int rmap_count; 1835 int rmap_count;
1765 1836
1766 pgprintk("%s: spte %llx access %x write_fault %d" 1837 pgprintk("%s: spte %llx access %x write_fault %d"
1767 " user_fault %d gfn %lx\n", 1838 " user_fault %d gfn %lx\n",
1768 __func__, *shadow_pte, pt_access, 1839 __func__, *sptep, pt_access,
1769 write_fault, user_fault, gfn); 1840 write_fault, user_fault, gfn);
1770 1841
1771 if (is_rmap_pte(*shadow_pte)) { 1842 if (is_rmap_spte(*sptep)) {
1772 /* 1843 /*
1773 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1844 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1774 * the parent of the now unreachable PTE. 1845 * the parent of the now unreachable PTE.
1775 */ 1846 */
1776 if (largepage && !is_large_pte(*shadow_pte)) { 1847 if (level > PT_PAGE_TABLE_LEVEL &&
1848 !is_large_pte(*sptep)) {
1777 struct kvm_mmu_page *child; 1849 struct kvm_mmu_page *child;
1778 u64 pte = *shadow_pte; 1850 u64 pte = *sptep;
1779 1851
1780 child = page_header(pte & PT64_BASE_ADDR_MASK); 1852 child = page_header(pte & PT64_BASE_ADDR_MASK);
1781 mmu_page_remove_parent_pte(child, shadow_pte); 1853 mmu_page_remove_parent_pte(child, sptep);
1782 } else if (pfn != spte_to_pfn(*shadow_pte)) { 1854 } else if (pfn != spte_to_pfn(*sptep)) {
1783 pgprintk("hfn old %lx new %lx\n", 1855 pgprintk("hfn old %lx new %lx\n",
1784 spte_to_pfn(*shadow_pte), pfn); 1856 spte_to_pfn(*sptep), pfn);
1785 rmap_remove(vcpu->kvm, shadow_pte); 1857 rmap_remove(vcpu->kvm, sptep);
1786 } else 1858 } else
1787 was_rmapped = 1; 1859 was_rmapped = 1;
1788 } 1860 }
1789 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1861
1790 dirty, largepage, gfn, pfn, speculative, true)) { 1862 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1863 dirty, level, gfn, pfn, speculative, true)) {
1791 if (write_fault) 1864 if (write_fault)
1792 *ptwrite = 1; 1865 *ptwrite = 1;
1793 kvm_x86_ops->tlb_flush(vcpu); 1866 kvm_x86_ops->tlb_flush(vcpu);
1794 } 1867 }
1795 1868
1796 pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); 1869 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
1797 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 1870 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1798 is_large_pte(*shadow_pte)? "2MB" : "4kB", 1871 is_large_pte(*sptep)? "2MB" : "4kB",
1799 is_present_pte(*shadow_pte)?"RW":"R", gfn, 1872 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
1800 *shadow_pte, shadow_pte); 1873 *sptep, sptep);
1801 if (!was_rmapped && is_large_pte(*shadow_pte)) 1874 if (!was_rmapped && is_large_pte(*sptep))
1802 ++vcpu->kvm->stat.lpages; 1875 ++vcpu->kvm->stat.lpages;
1803 1876
1804 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1877 page_header_update_slot(vcpu->kvm, sptep, gfn);
1805 if (!was_rmapped) { 1878 if (!was_rmapped) {
1806 rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); 1879 rmap_count = rmap_add(vcpu, sptep, gfn);
1807 if (!is_rmap_pte(*shadow_pte)) 1880 if (!is_rmap_spte(*sptep))
1808 kvm_release_pfn_clean(pfn); 1881 kvm_release_pfn_clean(pfn);
1809 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1882 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1810 rmap_recycle(vcpu, gfn, largepage); 1883 rmap_recycle(vcpu, sptep, gfn);
1811 } else { 1884 } else {
1812 if (was_writeble) 1885 if (was_writeble)
1813 kvm_release_pfn_dirty(pfn); 1886 kvm_release_pfn_dirty(pfn);
@@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1815 kvm_release_pfn_clean(pfn); 1888 kvm_release_pfn_clean(pfn);
1816 } 1889 }
1817 if (speculative) { 1890 if (speculative) {
1818 vcpu->arch.last_pte_updated = shadow_pte; 1891 vcpu->arch.last_pte_updated = sptep;
1819 vcpu->arch.last_pte_gfn = gfn; 1892 vcpu->arch.last_pte_gfn = gfn;
1820 } 1893 }
1821} 1894}
@@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1825} 1898}
1826 1899
1827static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1900static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1828 int largepage, gfn_t gfn, pfn_t pfn) 1901 int level, gfn_t gfn, pfn_t pfn)
1829{ 1902{
1830 struct kvm_shadow_walk_iterator iterator; 1903 struct kvm_shadow_walk_iterator iterator;
1831 struct kvm_mmu_page *sp; 1904 struct kvm_mmu_page *sp;
@@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1833 gfn_t pseudo_gfn; 1906 gfn_t pseudo_gfn;
1834 1907
1835 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 1908 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
1836 if (iterator.level == PT_PAGE_TABLE_LEVEL 1909 if (iterator.level == level) {
1837 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1838 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1910 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1839 0, write, 1, &pt_write, 1911 0, write, 1, &pt_write,
1840 largepage, gfn, pfn, false); 1912 level, gfn, pfn, false);
1841 ++vcpu->stat.pf_fixed; 1913 ++vcpu->stat.pf_fixed;
1842 break; 1914 break;
1843 } 1915 }
@@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1853 return -ENOMEM; 1925 return -ENOMEM;
1854 } 1926 }
1855 1927
1856 set_shadow_pte(iterator.sptep, 1928 __set_spte(iterator.sptep,
1857 __pa(sp->spt) 1929 __pa(sp->spt)
1858 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1930 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1859 | shadow_user_mask | shadow_x_mask); 1931 | shadow_user_mask | shadow_x_mask);
1860 } 1932 }
1861 } 1933 }
1862 return pt_write; 1934 return pt_write;
@@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1865static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1937static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1866{ 1938{
1867 int r; 1939 int r;
1868 int largepage = 0; 1940 int level;
1869 pfn_t pfn; 1941 pfn_t pfn;
1870 unsigned long mmu_seq; 1942 unsigned long mmu_seq;
1871 1943
1872 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1944 level = mapping_level(vcpu, gfn);
1873 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1945
1874 largepage = 1; 1946 /*
1875 } 1947 * This path builds a PAE pagetable - so we can map 2mb pages at
1948 * maximum. Therefore check if the level is larger than that.
1949 */
1950 if (level > PT_DIRECTORY_LEVEL)
1951 level = PT_DIRECTORY_LEVEL;
1952
1953 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
1876 1954
1877 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1955 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1878 smp_rmb(); 1956 smp_rmb();
@@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1888 if (mmu_notifier_retry(vcpu, mmu_seq)) 1966 if (mmu_notifier_retry(vcpu, mmu_seq))
1889 goto out_unlock; 1967 goto out_unlock;
1890 kvm_mmu_free_some_pages(vcpu); 1968 kvm_mmu_free_some_pages(vcpu);
1891 r = __direct_map(vcpu, v, write, largepage, gfn, pfn); 1969 r = __direct_map(vcpu, v, write, level, gfn, pfn);
1892 spin_unlock(&vcpu->kvm->mmu_lock); 1970 spin_unlock(&vcpu->kvm->mmu_lock);
1893 1971
1894 1972
@@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1954 gfn_t root_gfn; 2032 gfn_t root_gfn;
1955 struct kvm_mmu_page *sp; 2033 struct kvm_mmu_page *sp;
1956 int direct = 0; 2034 int direct = 0;
2035 u64 pdptr;
1957 2036
1958 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; 2037 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1959 2038
@@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1981 2060
1982 ASSERT(!VALID_PAGE(root)); 2061 ASSERT(!VALID_PAGE(root));
1983 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2062 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1984 if (!is_present_pte(vcpu->arch.pdptrs[i])) { 2063 pdptr = kvm_pdptr_read(vcpu, i);
2064 if (!is_present_gpte(pdptr)) {
1985 vcpu->arch.mmu.pae_root[i] = 0; 2065 vcpu->arch.mmu.pae_root[i] = 0;
1986 continue; 2066 continue;
1987 } 2067 }
1988 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; 2068 root_gfn = pdptr >> PAGE_SHIFT;
1989 } else if (vcpu->arch.mmu.root_level == 0) 2069 } else if (vcpu->arch.mmu.root_level == 0)
1990 root_gfn = 0; 2070 root_gfn = 0;
1991 if (mmu_check_root(vcpu, root_gfn)) 2071 if (mmu_check_root(vcpu, root_gfn))
@@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2062{ 2142{
2063 pfn_t pfn; 2143 pfn_t pfn;
2064 int r; 2144 int r;
2065 int largepage = 0; 2145 int level;
2066 gfn_t gfn = gpa >> PAGE_SHIFT; 2146 gfn_t gfn = gpa >> PAGE_SHIFT;
2067 unsigned long mmu_seq; 2147 unsigned long mmu_seq;
2068 2148
@@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2073 if (r) 2153 if (r)
2074 return r; 2154 return r;
2075 2155
2076 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 2156 level = mapping_level(vcpu, gfn);
2077 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2157
2078 largepage = 1; 2158 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2079 } 2159
2080 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2160 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2081 smp_rmb(); 2161 smp_rmb();
2082 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2162 pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2089 goto out_unlock; 2169 goto out_unlock;
2090 kvm_mmu_free_some_pages(vcpu); 2170 kvm_mmu_free_some_pages(vcpu);
2091 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2171 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2092 largepage, gfn, pfn); 2172 level, gfn, pfn);
2093 spin_unlock(&vcpu->kvm->mmu_lock); 2173 spin_unlock(&vcpu->kvm->mmu_lock);
2094 2174
2095 return r; 2175 return r;
@@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2206 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 2286 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2207 rsvd_bits(maxphyaddr, 51); 2287 rsvd_bits(maxphyaddr, 51);
2208 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; 2288 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2209 context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; 2289 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2290 rsvd_bits(maxphyaddr, 51) |
2291 rsvd_bits(13, 29);
2210 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2292 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2211 rsvd_bits(maxphyaddr, 51) | 2293 rsvd_bits(maxphyaddr, 51) |
2212 rsvd_bits(13, 20); /* large page */ 2294 rsvd_bits(13, 20); /* large page */
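For reference, the new rsvd_bits_mask[1][2] entry marks guest-physical bits 13-29 of a PS=1 PDPTE as reserved, since a 1 GiB frame is aligned at bit 30. A minimal sketch of the value it produces, assuming rsvd_bits() keeps its usual meaning of "mask of bits s through e inclusive" (the helper body below is an assumption, not copied from this patch):

#include <stdio.h>
#include <stdint.h>

/* assumed definition: build a mask covering bits s..e inclusive */
static uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	/* reserved bits in a 1 GiB (level 3) guest PDPTE */
	printf("%#llx\n", (unsigned long long)rsvd_bits(13, 29));	/* 0x3fffe000 */
	return 0;
}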
@@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2357 spin_unlock(&vcpu->kvm->mmu_lock); 2439 spin_unlock(&vcpu->kvm->mmu_lock);
2358 if (r) 2440 if (r)
2359 goto out; 2441 goto out;
2442 /* set_cr3() should ensure TLB has been flushed */
2360 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2443 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2361 kvm_mmu_flush_tlb(vcpu);
2362out: 2444out:
2363 return r; 2445 return r;
2364} 2446}
@@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2378 2460
2379 pte = *spte; 2461 pte = *spte;
2380 if (is_shadow_present_pte(pte)) { 2462 if (is_shadow_present_pte(pte)) {
2381 if (sp->role.level == PT_PAGE_TABLE_LEVEL || 2463 if (is_last_spte(pte, sp->role.level))
2382 is_large_pte(pte))
2383 rmap_remove(vcpu->kvm, spte); 2464 rmap_remove(vcpu->kvm, spte);
2384 else { 2465 else {
2385 child = page_header(pte & PT64_BASE_ADDR_MASK); 2466 child = page_header(pte & PT64_BASE_ADDR_MASK);
2386 mmu_page_remove_parent_pte(child, spte); 2467 mmu_page_remove_parent_pte(child, spte);
2387 } 2468 }
2388 } 2469 }
2389 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 2470 __set_spte(spte, shadow_trap_nonpresent_pte);
2390 if (is_large_pte(pte)) 2471 if (is_large_pte(pte))
2391 --vcpu->kvm->stat.lpages; 2472 --vcpu->kvm->stat.lpages;
2392} 2473}
@@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2397 const void *new) 2478 const void *new)
2398{ 2479{
2399 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 2480 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2400 if (!vcpu->arch.update_pte.largepage || 2481 ++vcpu->kvm->stat.mmu_pde_zapped;
2401 sp->role.glevels == PT32_ROOT_LEVEL) { 2482 return;
2402 ++vcpu->kvm->stat.mmu_pde_zapped;
2403 return;
2404 }
2405 } 2483 }
2406 2484
2407 ++vcpu->kvm->stat.mmu_pte_updated; 2485 ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2447 u64 gpte = 0; 2525 u64 gpte = 0;
2448 pfn_t pfn; 2526 pfn_t pfn;
2449 2527
2450 vcpu->arch.update_pte.largepage = 0;
2451
2452 if (bytes != 4 && bytes != 8) 2528 if (bytes != 4 && bytes != 8)
2453 return; 2529 return;
2454 2530
@@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2472 if ((bytes == 4) && (gpa % 4 == 0)) 2548 if ((bytes == 4) && (gpa % 4 == 0))
2473 memcpy((void *)&gpte, new, 4); 2549 memcpy((void *)&gpte, new, 4);
2474 } 2550 }
2475 if (!is_present_pte(gpte)) 2551 if (!is_present_gpte(gpte))
2476 return; 2552 return;
2477 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2553 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2478 2554
2479 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
2480 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
2481 vcpu->arch.update_pte.largepage = 1;
2482 }
2483 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2555 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2484 smp_rmb(); 2556 smp_rmb();
2485 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2557 pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2622 gpa_t gpa; 2694 gpa_t gpa;
2623 int r; 2695 int r;
2624 2696
2697 if (tdp_enabled)
2698 return 0;
2699
2625 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2700 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
2626 2701
2627 spin_lock(&vcpu->kvm->mmu_lock); 2702 spin_lock(&vcpu->kvm->mmu_lock);
@@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2633 2708
2634void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2709void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2635{ 2710{
2636 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { 2711 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
2712 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2637 struct kvm_mmu_page *sp; 2713 struct kvm_mmu_page *sp;
2638 2714
2639 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2715 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
@@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2670 ++vcpu->stat.mmio_exits; 2746 ++vcpu->stat.mmio_exits;
2671 return 0; 2747 return 0;
2672 case EMULATE_FAIL: 2748 case EMULATE_FAIL:
2673 kvm_report_emulation_failure(vcpu, "pagetable"); 2749 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2674 return 1; 2750 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2751 return 0;
2675 default: 2752 default:
2676 BUG(); 2753 BUG();
2677 } 2754 }
@@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2712 2789
2713 ASSERT(vcpu); 2790 ASSERT(vcpu);
2714 2791
2715 if (vcpu->kvm->arch.n_requested_mmu_pages)
2716 vcpu->kvm->arch.n_free_mmu_pages =
2717 vcpu->kvm->arch.n_requested_mmu_pages;
2718 else
2719 vcpu->kvm->arch.n_free_mmu_pages =
2720 vcpu->kvm->arch.n_alloc_mmu_pages;
2721 /* 2792 /*
2722 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 2793 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2723 * Therefore we need to allocate shadow page tables in the first 2794 * Therefore we need to allocate shadow page tables in the first
@@ -3029,6 +3100,24 @@ out:
3029 return r; 3100 return r;
3030} 3101}
3031 3102
3103int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3104{
3105 struct kvm_shadow_walk_iterator iterator;
3106 int nr_sptes = 0;
3107
3108 spin_lock(&vcpu->kvm->mmu_lock);
3109 for_each_shadow_entry(vcpu, addr, iterator) {
3110 sptes[iterator.level-1] = *iterator.sptep;
3111 nr_sptes++;
3112 if (!is_shadow_present_pte(*iterator.sptep))
3113 break;
3114 }
3115 spin_unlock(&vcpu->kvm->mmu_lock);
3116
3117 return nr_sptes;
3118}
3119EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3120
3032#ifdef AUDIT 3121#ifdef AUDIT
3033 3122
3034static const char *audit_msg; 3123static const char *audit_msg;
@@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva)
3041 return gva; 3130 return gva;
3042} 3131}
3043 3132
3133
3134typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
3135 u64 *sptep);
3136
3137static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3138 inspect_spte_fn fn)
3139{
3140 int i;
3141
3142 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3143 u64 ent = sp->spt[i];
3144
3145 if (is_shadow_present_pte(ent)) {
3146 if (!is_last_spte(ent, sp->role.level)) {
3147 struct kvm_mmu_page *child;
3148 child = page_header(ent & PT64_BASE_ADDR_MASK);
3149 __mmu_spte_walk(kvm, child, fn);
3150 } else
3151 fn(kvm, sp, &sp->spt[i]);
3152 }
3153 }
3154}
3155
3156static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3157{
3158 int i;
3159 struct kvm_mmu_page *sp;
3160
3161 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3162 return;
3163 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3164 hpa_t root = vcpu->arch.mmu.root_hpa;
3165 sp = page_header(root);
3166 __mmu_spte_walk(vcpu->kvm, sp, fn);
3167 return;
3168 }
3169 for (i = 0; i < 4; ++i) {
3170 hpa_t root = vcpu->arch.mmu.pae_root[i];
3171
3172 if (root && VALID_PAGE(root)) {
3173 root &= PT64_BASE_ADDR_MASK;
3174 sp = page_header(root);
3175 __mmu_spte_walk(vcpu->kvm, sp, fn);
3176 }
3177 }
3178 return;
3179}
3180
3044static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, 3181static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3045 gva_t va, int level) 3182 gva_t va, int level)
3046{ 3183{
@@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3055 continue; 3192 continue;
3056 3193
3057 va = canonicalize(va); 3194 va = canonicalize(va);
3058 if (level > 1) { 3195 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3059 if (ent == shadow_notrap_nonpresent_pte) 3196 audit_mappings_page(vcpu, ent, va, level - 1);
3060 printk(KERN_ERR "audit: (%s) nontrapping pte" 3197 else {
3061 " in nonleaf level: levels %d gva %lx"
3062 " level %d pte %llx\n", audit_msg,
3063 vcpu->arch.mmu.root_level, va, level, ent);
3064 else
3065 audit_mappings_page(vcpu, ent, va, level - 1);
3066 } else {
3067 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3198 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
3068 gfn_t gfn = gpa >> PAGE_SHIFT; 3199 gfn_t gfn = gpa >> PAGE_SHIFT;
3069 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3200 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3070 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3201 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3071 3202
3203 if (is_error_pfn(pfn)) {
3204 kvm_release_pfn_clean(pfn);
3205 continue;
3206 }
3207
3072 if (is_shadow_present_pte(ent) 3208 if (is_shadow_present_pte(ent)
3073 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3209 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3074 printk(KERN_ERR "xx audit error: (%s) levels %d" 3210 printk(KERN_ERR "xx audit error: (%s) levels %d"
@@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3122 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 3258 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3123 while (d) { 3259 while (d) {
3124 for (k = 0; k < RMAP_EXT; ++k) 3260 for (k = 0; k < RMAP_EXT; ++k)
3125 if (d->shadow_ptes[k]) 3261 if (d->sptes[k])
3126 ++nmaps; 3262 ++nmaps;
3127 else 3263 else
3128 break; 3264 break;
@@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3133 return nmaps; 3269 return nmaps;
3134} 3270}
3135 3271
3136static int count_writable_mappings(struct kvm_vcpu *vcpu) 3272void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3273{
3274 unsigned long *rmapp;
3275 struct kvm_mmu_page *rev_sp;
3276 gfn_t gfn;
3277
3278 if (*sptep & PT_WRITABLE_MASK) {
3279 rev_sp = page_header(__pa(sptep));
3280 gfn = rev_sp->gfns[sptep - rev_sp->spt];
3281
3282 if (!gfn_to_memslot(kvm, gfn)) {
3283 if (!printk_ratelimit())
3284 return;
3285 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3286 audit_msg, gfn);
3287 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3288 audit_msg, sptep - rev_sp->spt,
3289 rev_sp->gfn);
3290 dump_stack();
3291 return;
3292 }
3293
3294 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3295 is_large_pte(*sptep));
3296 if (!*rmapp) {
3297 if (!printk_ratelimit())
3298 return;
3299 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3300 audit_msg, *sptep);
3301 dump_stack();
3302 }
3303 }
3304
3305}
3306
3307void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3308{
3309 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3310}
3311
3312static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3137{ 3313{
3138 int nmaps = 0;
3139 struct kvm_mmu_page *sp; 3314 struct kvm_mmu_page *sp;
3140 int i; 3315 int i;
3141 3316
@@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
3152 continue; 3327 continue;
3153 if (!(ent & PT_WRITABLE_MASK)) 3328 if (!(ent & PT_WRITABLE_MASK))
3154 continue; 3329 continue;
3155 ++nmaps; 3330 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
3156 } 3331 }
3157 } 3332 }
3158 return nmaps; 3333 return;
3159} 3334}
3160 3335
3161static void audit_rmap(struct kvm_vcpu *vcpu) 3336static void audit_rmap(struct kvm_vcpu *vcpu)
3162{ 3337{
3163 int n_rmap = count_rmaps(vcpu); 3338 check_writable_mappings_rmap(vcpu);
3164 int n_actual = count_writable_mappings(vcpu); 3339 count_rmaps(vcpu);
3165
3166 if (n_rmap != n_actual)
3167 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
3168 __func__, audit_msg, n_rmap, n_actual);
3169} 3340}
3170 3341
3171static void audit_write_protection(struct kvm_vcpu *vcpu) 3342static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3173 struct kvm_mmu_page *sp; 3344 struct kvm_mmu_page *sp;
3174 struct kvm_memory_slot *slot; 3345 struct kvm_memory_slot *slot;
3175 unsigned long *rmapp; 3346 unsigned long *rmapp;
3347 u64 *spte;
3176 gfn_t gfn; 3348 gfn_t gfn;
3177 3349
3178 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3350 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3179 if (sp->role.direct) 3351 if (sp->role.direct)
3180 continue; 3352 continue;
3353 if (sp->unsync)
3354 continue;
3181 3355
3182 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3356 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
3183 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); 3357 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
3184 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3358 rmapp = &slot->rmap[gfn - slot->base_gfn];
3185 if (*rmapp) 3359
3186 printk(KERN_ERR "%s: (%s) shadow page has writable" 3360 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3187 " mappings: gfn %lx role %x\n", 3361 while (spte) {
3362 if (*spte & PT_WRITABLE_MASK)
3363 printk(KERN_ERR "%s: (%s) shadow page has "
3364 "writable mappings: gfn %lx role %x\n",
3188 __func__, audit_msg, sp->gfn, 3365 __func__, audit_msg, sp->gfn,
3189 sp->role.word); 3366 sp->role.word);
3367 spte = rmap_next(vcpu->kvm, rmapp, spte);
3368 }
3190 } 3369 }
3191} 3370}
3192 3371
@@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3198 audit_msg = msg; 3377 audit_msg = msg;
3199 audit_rmap(vcpu); 3378 audit_rmap(vcpu);
3200 audit_write_protection(vcpu); 3379 audit_write_protection(vcpu);
3201 audit_mappings(vcpu); 3380 if (strcmp("pre pte write", audit_msg) != 0)
3381 audit_mappings(vcpu);
3382 audit_writable_sptes_have_rmaps(vcpu);
3202 dbg = olddbg; 3383 dbg = olddbg;
3203} 3384}
3204 3385
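The exported kvm_mmu_get_spte_hierarchy() added above records, for a given address, the spte seen at each level of the shadow walk (stored at index level - 1) and returns how many levels it visited. A hypothetical debugging caller, shown only as a sketch and assuming a 4-level shadow root, might look like this:

/* Hypothetical caller, for illustration only; not part of this patch. */
static void dump_spte_hierarchy(struct kvm_vcpu *vcpu, u64 gpa)
{
	u64 sptes[4];
	int nr_sptes, level;

	nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);

	/* entries are filled from the root downwards, at index level - 1 */
	for (level = 4; level > 4 - nr_sptes; --level)
		printk(KERN_DEBUG "gpa %llx: level %d spte %llx\n",
		       gpa, level, sptes[level - 1]);
}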
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3494a2fb136e..61a1b3884b49 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,8 @@
37#define PT32_ROOT_LEVEL 2 37#define PT32_ROOT_LEVEL 2
38#define PT32E_ROOT_LEVEL 3 38#define PT32E_ROOT_LEVEL 3
39 39
40int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
41
40static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 42static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
41{ 43{
42 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 44 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
@@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
75 return vcpu->arch.cr0 & X86_CR0_PG; 77 return vcpu->arch.cr0 & X86_CR0_PG;
76} 78}
77 79
78static inline int is_present_pte(unsigned long pte) 80static inline int is_present_gpte(unsigned long pte)
79{ 81{
80 return pte & PT_PRESENT_MASK; 82 return pte & PT_PRESENT_MASK;
81} 83}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
new file mode 100644
index 000000000000..3e4a5c6ca2a9
--- /dev/null
+++ b/arch/x86/kvm/mmutrace.h
@@ -0,0 +1,220 @@
1#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVMMMU_H
3
4#include <linux/tracepoint.h>
5#include <linux/ftrace_event.h>
6
7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu
9#define TRACE_INCLUDE_PATH .
10#define TRACE_INCLUDE_FILE mmutrace
11
12#define KVM_MMU_PAGE_FIELDS \
13 __field(__u64, gfn) \
14 __field(__u32, role) \
15 __field(__u32, root_count) \
16 __field(__u32, unsync)
17
18#define KVM_MMU_PAGE_ASSIGN(sp) \
19 __entry->gfn = sp->gfn; \
20 __entry->role = sp->role.word; \
21 __entry->root_count = sp->root_count; \
22 __entry->unsync = sp->unsync;
23
24#define KVM_MMU_PAGE_PRINTK() ({ \
25 const char *ret = p->buffer + p->len; \
26 static const char *access_str[] = { \
27 "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
28 }; \
29 union kvm_mmu_page_role role; \
30 \
31 role.word = __entry->role; \
32 \
33 trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \
34 " %snxe root %u %s%c", \
35 __entry->gfn, role.level, role.glevels, \
36 role.quadrant, \
37 role.direct ? " direct" : "", \
38 access_str[role.access], \
39 role.invalid ? " invalid" : "", \
40 role.cr4_pge ? "" : "!", \
41 role.nxe ? "" : "!", \
42 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \
44 ret; \
45 })
46
47#define kvm_mmu_trace_pferr_flags \
48 { PFERR_PRESENT_MASK, "P" }, \
49 { PFERR_WRITE_MASK, "W" }, \
50 { PFERR_USER_MASK, "U" }, \
51 { PFERR_RSVD_MASK, "RSVD" }, \
52 { PFERR_FETCH_MASK, "F" }
53
54/*
55 * A pagetable walk has started
56 */
57TRACE_EVENT(
58 kvm_mmu_pagetable_walk,
59 TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
60 TP_ARGS(addr, write_fault, user_fault, fetch_fault),
61
62 TP_STRUCT__entry(
63 __field(__u64, addr)
64 __field(__u32, pferr)
65 ),
66
67 TP_fast_assign(
68 __entry->addr = addr;
69 __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
70 | (!!fetch_fault << 4);
71 ),
72
73 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
74 __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
75);
76
77
78/* We just walked a paging element */
79TRACE_EVENT(
80 kvm_mmu_paging_element,
81 TP_PROTO(u64 pte, int level),
82 TP_ARGS(pte, level),
83
84 TP_STRUCT__entry(
85 __field(__u64, pte)
86 __field(__u32, level)
87 ),
88
89 TP_fast_assign(
90 __entry->pte = pte;
91 __entry->level = level;
92 ),
93
94 TP_printk("pte %llx level %u", __entry->pte, __entry->level)
95);
96
97/* We set a pte accessed bit */
98TRACE_EVENT(
99 kvm_mmu_set_accessed_bit,
100 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
101 TP_ARGS(table_gfn, index, size),
102
103 TP_STRUCT__entry(
104 __field(__u64, gpa)
105 ),
106
107 TP_fast_assign(
108 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
109 + index * size;
110 ),
111
112 TP_printk("gpa %llx", __entry->gpa)
113);
114
115/* We set a pte dirty bit */
116TRACE_EVENT(
117 kvm_mmu_set_dirty_bit,
118 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
119 TP_ARGS(table_gfn, index, size),
120
121 TP_STRUCT__entry(
122 __field(__u64, gpa)
123 ),
124
125 TP_fast_assign(
126 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
127 + index * size;
128 ),
129
130 TP_printk("gpa %llx", __entry->gpa)
131);
132
133TRACE_EVENT(
134 kvm_mmu_walker_error,
135 TP_PROTO(u32 pferr),
136 TP_ARGS(pferr),
137
138 TP_STRUCT__entry(
139 __field(__u32, pferr)
140 ),
141
142 TP_fast_assign(
143 __entry->pferr = pferr;
144 ),
145
146 TP_printk("pferr %x %s", __entry->pferr,
147 __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
148);
149
150TRACE_EVENT(
151 kvm_mmu_get_page,
152 TP_PROTO(struct kvm_mmu_page *sp, bool created),
153 TP_ARGS(sp, created),
154
155 TP_STRUCT__entry(
156 KVM_MMU_PAGE_FIELDS
157 __field(bool, created)
158 ),
159
160 TP_fast_assign(
161 KVM_MMU_PAGE_ASSIGN(sp)
162 __entry->created = created;
163 ),
164
165 TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
166 __entry->created ? "new" : "existing")
167);
168
169TRACE_EVENT(
170 kvm_mmu_sync_page,
171 TP_PROTO(struct kvm_mmu_page *sp),
172 TP_ARGS(sp),
173
174 TP_STRUCT__entry(
175 KVM_MMU_PAGE_FIELDS
176 ),
177
178 TP_fast_assign(
179 KVM_MMU_PAGE_ASSIGN(sp)
180 ),
181
182 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
183);
184
185TRACE_EVENT(
186 kvm_mmu_unsync_page,
187 TP_PROTO(struct kvm_mmu_page *sp),
188 TP_ARGS(sp),
189
190 TP_STRUCT__entry(
191 KVM_MMU_PAGE_FIELDS
192 ),
193
194 TP_fast_assign(
195 KVM_MMU_PAGE_ASSIGN(sp)
196 ),
197
198 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
199);
200
201TRACE_EVENT(
202 kvm_mmu_zap_page,
203 TP_PROTO(struct kvm_mmu_page *sp),
204 TP_ARGS(sp),
205
206 TP_STRUCT__entry(
207 KVM_MMU_PAGE_FIELDS
208 ),
209
210 TP_fast_assign(
211 KVM_MMU_PAGE_ASSIGN(sp)
212 ),
213
214 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
215);
216
217#endif /* _TRACE_KVMMMU_H */
218
219/* This part must be outside protection */
220#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67785f635399..d2fec9c12d22 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -27,7 +27,8 @@
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name 28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 30 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
31 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
33 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
@@ -43,7 +44,8 @@
43 #define guest_walker guest_walker32 44 #define guest_walker guest_walker32
44 #define FNAME(name) paging##32_##name 45 #define FNAME(name) paging##32_##name
45 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
46 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
47 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
48 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
49 #define PT_LEVEL_BITS PT32_LEVEL_BITS 51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
@@ -53,8 +55,8 @@
53 #error Invalid PTTYPE value 55 #error Invalid PTTYPE value
54#endif 56#endif
55 57
56#define gpte_to_gfn FNAME(gpte_to_gfn) 58#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
57#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) 59#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
58 60
59/* 61/*
60 * The guest_walker structure emulates the behavior of the hardware page 62 * The guest_walker structure emulates the behavior of the hardware page
@@ -71,14 +73,9 @@ struct guest_walker {
71 u32 error_code; 73 u32 error_code;
72}; 74};
73 75
74static gfn_t gpte_to_gfn(pt_element_t gpte) 76static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
75{ 77{
76 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
77}
78
79static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
80{
81 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
82} 79}
83 80
84static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, 81static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
@@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
125 gpa_t pte_gpa; 122 gpa_t pte_gpa;
126 int rsvd_fault = 0; 123 int rsvd_fault = 0;
127 124
128 pgprintk("%s: addr %lx\n", __func__, addr); 125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
126 fetch_fault);
129walk: 127walk:
130 walker->level = vcpu->arch.mmu.root_level; 128 walker->level = vcpu->arch.mmu.root_level;
131 pte = vcpu->arch.cr3; 129 pte = vcpu->arch.cr3;
132#if PTTYPE == 64 130#if PTTYPE == 64
133 if (!is_long_mode(vcpu)) { 131 if (!is_long_mode(vcpu)) {
134 pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; 132 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
135 if (!is_present_pte(pte)) 133 trace_kvm_mmu_paging_element(pte, walker->level);
134 if (!is_present_gpte(pte))
136 goto not_present; 135 goto not_present;
137 --walker->level; 136 --walker->level;
138 } 137 }
@@ -150,12 +149,11 @@ walk:
150 pte_gpa += index * sizeof(pt_element_t); 149 pte_gpa += index * sizeof(pt_element_t);
151 walker->table_gfn[walker->level - 1] = table_gfn; 150 walker->table_gfn[walker->level - 1] = table_gfn;
152 walker->pte_gpa[walker->level - 1] = pte_gpa; 151 walker->pte_gpa[walker->level - 1] = pte_gpa;
153 pgprintk("%s: table_gfn[%d] %lx\n", __func__,
154 walker->level - 1, table_gfn);
155 152
156 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); 153 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
154 trace_kvm_mmu_paging_element(pte, walker->level);
157 155
158 if (!is_present_pte(pte)) 156 if (!is_present_gpte(pte))
159 goto not_present; 157 goto not_present;
160 158
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 159 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
@@ -175,6 +173,8 @@ walk:
175#endif 173#endif
176 174
177 if (!(pte & PT_ACCESSED_MASK)) { 175 if (!(pte & PT_ACCESSED_MASK)) {
176 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
177 sizeof(pte));
178 mark_page_dirty(vcpu->kvm, table_gfn); 178 mark_page_dirty(vcpu->kvm, table_gfn);
179 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 179 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
180 index, pte, pte|PT_ACCESSED_MASK)) 180 index, pte, pte|PT_ACCESSED_MASK))
@@ -186,18 +186,24 @@ walk:
186 186
187 walker->ptes[walker->level - 1] = pte; 187 walker->ptes[walker->level - 1] = pte;
188 188
189 if (walker->level == PT_PAGE_TABLE_LEVEL) { 189 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
190 walker->gfn = gpte_to_gfn(pte); 190 ((walker->level == PT_DIRECTORY_LEVEL) &&
191 break; 191 (pte & PT_PAGE_SIZE_MASK) &&
192 } 192 (PTTYPE == 64 || is_pse(vcpu))) ||
193 193 ((walker->level == PT_PDPE_LEVEL) &&
194 if (walker->level == PT_DIRECTORY_LEVEL 194 (pte & PT_PAGE_SIZE_MASK) &&
195 && (pte & PT_PAGE_SIZE_MASK) 195 is_long_mode(vcpu))) {
196 && (PTTYPE == 64 || is_pse(vcpu))) { 196 int lvl = walker->level;
197 walker->gfn = gpte_to_gfn_pde(pte); 197
198 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); 198 walker->gfn = gpte_to_gfn_lvl(pte, lvl);
199 if (PTTYPE == 32 && is_cpuid_PSE36()) 199 walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
200 >> PAGE_SHIFT;
201
202 if (PTTYPE == 32 &&
203 walker->level == PT_DIRECTORY_LEVEL &&
204 is_cpuid_PSE36())
200 walker->gfn += pse36_gfn_delta(pte); 205 walker->gfn += pse36_gfn_delta(pte);
206
201 break; 207 break;
202 } 208 }
203 209
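A worked example of the new huge-page leaf handling in walk_addr()/gpte_to_gfn_lvl(): for a 2 MiB guest PDE the level address mask keeps bits 21 and up, and the level offset mask keeps bits 12-20 of the faulting address, so the resulting gfn is the PDE's frame plus the 4 KiB index inside the 2 MiB region. The standalone sketch below hard-codes those masks under the usual 9-bits-per-level, 4 KiB-base assumption; the real PT_LVL_*_MASK macros also clamp to the physical-address width.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* assumed level-2 (2 MiB) masks: address bits 21+, offset bits 12-20 */
	const uint64_t lvl2_addr_mask   = ~((1ULL << 21) - 1);
	const uint64_t lvl2_offset_mask = 0x1ffULL << 12;

	uint64_t gpte = 0x40000000ULL | 0x80;	/* 2 MiB page at 1 GiB, PS bit set */
	uint64_t addr = 0x40123456ULL;		/* faulting guest address          */

	uint64_t gfn = ((gpte & lvl2_addr_mask) >> 12)
		     + ((addr & lvl2_offset_mask) >> 12);

	printf("gfn = %#llx\n", (unsigned long long)gfn);	/* 0x40123 */
	return 0;
}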
@@ -205,9 +211,10 @@ walk:
205 --walker->level; 211 --walker->level;
206 } 212 }
207 213
208 if (write_fault && !is_dirty_pte(pte)) { 214 if (write_fault && !is_dirty_gpte(pte)) {
209 bool ret; 215 bool ret;
210 216
217 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
211 mark_page_dirty(vcpu->kvm, table_gfn); 218 mark_page_dirty(vcpu->kvm, table_gfn);
212 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 219 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
213 pte|PT_DIRTY_MASK); 220 pte|PT_DIRTY_MASK);
@@ -239,6 +246,7 @@ err:
239 walker->error_code |= PFERR_FETCH_MASK; 246 walker->error_code |= PFERR_FETCH_MASK;
240 if (rsvd_fault) 247 if (rsvd_fault)
241 walker->error_code |= PFERR_RSVD_MASK; 248 walker->error_code |= PFERR_RSVD_MASK;
249 trace_kvm_mmu_walker_error(walker->error_code);
242 return 0; 250 return 0;
243} 251}
244 252
@@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
248 pt_element_t gpte; 256 pt_element_t gpte;
249 unsigned pte_access; 257 unsigned pte_access;
250 pfn_t pfn; 258 pfn_t pfn;
251 int largepage = vcpu->arch.update_pte.largepage;
252 259
253 gpte = *(const pt_element_t *)pte; 260 gpte = *(const pt_element_t *)pte;
254 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 261 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
255 if (!is_present_pte(gpte)) 262 if (!is_present_gpte(gpte))
256 set_shadow_pte(spte, shadow_notrap_nonpresent_pte); 263 __set_spte(spte, shadow_notrap_nonpresent_pte);
257 return; 264 return;
258 } 265 }
259 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 266 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -267,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
267 return; 274 return;
268 kvm_get_pfn(pfn); 275 kvm_get_pfn(pfn);
269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
270 gpte & PT_DIRTY_MASK, NULL, largepage, 277 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
271 gpte_to_gfn(gpte), pfn, true); 278 gpte_to_gfn(gpte), pfn, true);
272} 279}
273 280
@@ -276,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
276 */ 283 */
277static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 284static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
278 struct guest_walker *gw, 285 struct guest_walker *gw,
279 int user_fault, int write_fault, int largepage, 286 int user_fault, int write_fault, int hlevel,
280 int *ptwrite, pfn_t pfn) 287 int *ptwrite, pfn_t pfn)
281{ 288{
282 unsigned access = gw->pt_access; 289 unsigned access = gw->pt_access;
@@ -289,19 +296,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
289 pt_element_t curr_pte; 296 pt_element_t curr_pte;
290 struct kvm_shadow_walk_iterator iterator; 297 struct kvm_shadow_walk_iterator iterator;
291 298
292 if (!is_present_pte(gw->ptes[gw->level - 1])) 299 if (!is_present_gpte(gw->ptes[gw->level - 1]))
293 return NULL; 300 return NULL;
294 301
295 for_each_shadow_entry(vcpu, addr, iterator) { 302 for_each_shadow_entry(vcpu, addr, iterator) {
296 level = iterator.level; 303 level = iterator.level;
297 sptep = iterator.sptep; 304 sptep = iterator.sptep;
298 if (level == PT_PAGE_TABLE_LEVEL 305 if (iterator.level == hlevel) {
299 || (largepage && level == PT_DIRECTORY_LEVEL)) {
300 mmu_set_spte(vcpu, sptep, access, 306 mmu_set_spte(vcpu, sptep, access,
301 gw->pte_access & access, 307 gw->pte_access & access,
302 user_fault, write_fault, 308 user_fault, write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 309 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 ptwrite, largepage, 310 ptwrite, level,
305 gw->gfn, pfn, false); 311 gw->gfn, pfn, false);
306 break; 312 break;
307 } 313 }
@@ -311,16 +317,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
311 317
312 if (is_large_pte(*sptep)) { 318 if (is_large_pte(*sptep)) {
313 rmap_remove(vcpu->kvm, sptep); 319 rmap_remove(vcpu->kvm, sptep);
314 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 320 __set_spte(sptep, shadow_trap_nonpresent_pte);
315 kvm_flush_remote_tlbs(vcpu->kvm); 321 kvm_flush_remote_tlbs(vcpu->kvm);
316 } 322 }
317 323
318 if (level == PT_DIRECTORY_LEVEL 324 if (level <= gw->level) {
319 && gw->level == PT_DIRECTORY_LEVEL) { 325 int delta = level - gw->level + 1;
320 direct = 1; 326 direct = 1;
321 if (!is_dirty_pte(gw->ptes[level - 1])) 327 if (!is_dirty_gpte(gw->ptes[level - delta]))
322 access &= ~ACC_WRITE_MASK; 328 access &= ~ACC_WRITE_MASK;
323 table_gfn = gpte_to_gfn(gw->ptes[level - 1]); 329 table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
330 /* advance table_gfn when emulating 1gb pages with 4k */
331 if (delta == 0)
332 table_gfn += PT_INDEX(addr, level);
324 } else { 333 } else {
325 direct = 0; 334 direct = 0;
326 table_gfn = gw->table_gfn[level - 2]; 335 table_gfn = gw->table_gfn[level - 2];
@@ -369,11 +378,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
369 int user_fault = error_code & PFERR_USER_MASK; 378 int user_fault = error_code & PFERR_USER_MASK;
370 int fetch_fault = error_code & PFERR_FETCH_MASK; 379 int fetch_fault = error_code & PFERR_FETCH_MASK;
371 struct guest_walker walker; 380 struct guest_walker walker;
372 u64 *shadow_pte; 381 u64 *sptep;
373 int write_pt = 0; 382 int write_pt = 0;
374 int r; 383 int r;
375 pfn_t pfn; 384 pfn_t pfn;
376 int largepage = 0; 385 int level = PT_PAGE_TABLE_LEVEL;
377 unsigned long mmu_seq; 386 unsigned long mmu_seq;
378 387
379 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 388 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -399,14 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
399 return 0; 408 return 0;
400 } 409 }
401 410
402 if (walker.level == PT_DIRECTORY_LEVEL) { 411 if (walker.level >= PT_DIRECTORY_LEVEL) {
403 gfn_t large_gfn; 412 level = min(walker.level, mapping_level(vcpu, walker.gfn));
404 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); 413 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
405 if (is_largepage_backed(vcpu, large_gfn)) {
406 walker.gfn = large_gfn;
407 largepage = 1;
408 }
409 } 414 }
415
410 mmu_seq = vcpu->kvm->mmu_notifier_seq; 416 mmu_seq = vcpu->kvm->mmu_notifier_seq;
411 smp_rmb(); 417 smp_rmb();
412 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 418 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
@@ -422,11 +428,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
422 if (mmu_notifier_retry(vcpu, mmu_seq)) 428 if (mmu_notifier_retry(vcpu, mmu_seq))
423 goto out_unlock; 429 goto out_unlock;
424 kvm_mmu_free_some_pages(vcpu); 430 kvm_mmu_free_some_pages(vcpu);
425 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 431 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
426 largepage, &write_pt, pfn); 432 level, &write_pt, pfn);
427
428 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 433 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
429 shadow_pte, *shadow_pte, write_pt); 434 sptep, *sptep, write_pt);
430 435
431 if (!write_pt) 436 if (!write_pt)
432 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 437 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -459,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
459 sptep = iterator.sptep; 464 sptep = iterator.sptep;
460 465
461 /* FIXME: properly handle invlpg on large guest pages */ 466 /* FIXME: properly handle invlpg on large guest pages */
462 if (level == PT_PAGE_TABLE_LEVEL || 467 if (level == PT_PAGE_TABLE_LEVEL ||
463 ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { 468 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
469 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
464 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 470 struct kvm_mmu_page *sp = page_header(__pa(sptep));
465 471
466 pte_gpa = (sp->gfn << PAGE_SHIFT); 472 pte_gpa = (sp->gfn << PAGE_SHIFT);
@@ -472,7 +478,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
472 --vcpu->kvm->stat.lpages; 478 --vcpu->kvm->stat.lpages;
473 need_flush = 1; 479 need_flush = 1;
474 } 480 }
475 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 481 __set_spte(sptep, shadow_trap_nonpresent_pte);
476 break; 482 break;
477 } 483 }
478 484
@@ -489,7 +495,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
489 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 495 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
490 sizeof(pt_element_t))) 496 sizeof(pt_element_t)))
491 return; 497 return;
492 if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { 498 if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
493 if (mmu_topup_memory_caches(vcpu)) 499 if (mmu_topup_memory_caches(vcpu))
494 return; 500 return;
495 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, 501 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
@@ -536,7 +542,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
536 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); 542 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
537 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); 543 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
538 for (j = 0; j < ARRAY_SIZE(pt); ++j) 544 for (j = 0; j < ARRAY_SIZE(pt); ++j)
539 if (r || is_present_pte(pt[j])) 545 if (r || is_present_gpte(pt[j]))
540 sp->spt[i+j] = shadow_trap_nonpresent_pte; 546 sp->spt[i+j] = shadow_trap_nonpresent_pte;
541 else 547 else
542 sp->spt[i+j] = shadow_notrap_nonpresent_pte; 548 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
@@ -574,23 +580,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
574 sizeof(pt_element_t))) 580 sizeof(pt_element_t)))
575 return -EINVAL; 581 return -EINVAL;
576 582
577 if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || 583 if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
578 !(gpte & PT_ACCESSED_MASK)) { 584 !(gpte & PT_ACCESSED_MASK)) {
579 u64 nonpresent; 585 u64 nonpresent;
580 586
581 rmap_remove(vcpu->kvm, &sp->spt[i]); 587 rmap_remove(vcpu->kvm, &sp->spt[i]);
582 if (is_present_pte(gpte)) 588 if (is_present_gpte(gpte))
583 nonpresent = shadow_trap_nonpresent_pte; 589 nonpresent = shadow_trap_nonpresent_pte;
584 else 590 else
585 nonpresent = shadow_notrap_nonpresent_pte; 591 nonpresent = shadow_notrap_nonpresent_pte;
586 set_shadow_pte(&sp->spt[i], nonpresent); 592 __set_spte(&sp->spt[i], nonpresent);
587 continue; 593 continue;
588 } 594 }
589 595
590 nr_present++; 596 nr_present++;
591 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 597 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
592 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 598 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
593 is_dirty_pte(gpte), 0, gfn, 599 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
594 spte_to_pfn(sp->spt[i]), true, false); 600 spte_to_pfn(sp->spt[i]), true, false);
595 } 601 }
596 602
@@ -603,9 +609,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
603#undef PT_BASE_ADDR_MASK 609#undef PT_BASE_ADDR_MASK
604#undef PT_INDEX 610#undef PT_INDEX
605#undef PT_LEVEL_MASK 611#undef PT_LEVEL_MASK
606#undef PT_DIR_BASE_ADDR_MASK 612#undef PT_LVL_ADDR_MASK
613#undef PT_LVL_OFFSET_MASK
607#undef PT_LEVEL_BITS 614#undef PT_LEVEL_BITS
608#undef PT_MAX_FULL_LEVELS 615#undef PT_MAX_FULL_LEVELS
609#undef gpte_to_gfn 616#undef gpte_to_gfn
610#undef gpte_to_gfn_pde 617#undef gpte_to_gfn_lvl
611#undef CMPXCHG 618#undef CMPXCHG
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b1f658ad2f06..944cc9c04b3c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -15,7 +15,6 @@
15 */ 15 */
16#include <linux/kvm_host.h> 16#include <linux/kvm_host.h>
17 17
18#include "kvm_svm.h"
19#include "irq.h" 18#include "irq.h"
20#include "mmu.h" 19#include "mmu.h"
21#include "kvm_cache_regs.h" 20#include "kvm_cache_regs.h"
@@ -26,10 +25,12 @@
26#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
27#include <linux/highmem.h> 26#include <linux/highmem.h>
28#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/ftrace_event.h>
29 29
30#include <asm/desc.h> 30#include <asm/desc.h>
31 31
32#include <asm/virtext.h> 32#include <asm/virtext.h>
33#include "trace.h"
33 34
34#define __ex(x) __kvm_handle_fault_on_reboot(x) 35#define __ex(x) __kvm_handle_fault_on_reboot(x)
35 36
@@ -46,6 +47,10 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
47#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
48 49
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
52#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
53
49#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
50 55
51/* Turn on to get debugging output*/ 56/* Turn on to get debugging output*/
@@ -57,6 +62,58 @@ MODULE_LICENSE("GPL");
57#define nsvm_printk(fmt, args...) do {} while(0) 62#define nsvm_printk(fmt, args...) do {} while(0)
58#endif 63#endif
59 64
65static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
68 MSR_FS_BASE,
69#endif
70 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
71};
72
73#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
74
75struct kvm_vcpu;
76
77struct nested_state {
78 struct vmcb *hsave;
79 u64 hsave_msr;
80 u64 vmcb;
81
82 /* These are the merged vectors */
83 u32 *msrpm;
84
85 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm;
87
88 /* cache for intercepts of the guest */
89 u16 intercept_cr_read;
90 u16 intercept_cr_write;
91 u16 intercept_dr_read;
92 u16 intercept_dr_write;
93 u32 intercept_exceptions;
94 u64 intercept;
95
96};
97
98struct vcpu_svm {
99 struct kvm_vcpu vcpu;
100 struct vmcb *vmcb;
101 unsigned long vmcb_pa;
102 struct svm_cpu_data *svm_data;
103 uint64_t asid_generation;
104 uint64_t sysenter_esp;
105 uint64_t sysenter_eip;
106
107 u64 next_rip;
108
109 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
110 u64 host_gs_base;
111
112 u32 *msrpm;
113
114 struct nested_state nested;
115};
116
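[Editorial sketch, not part of the patch.] The vcpu_svm structure moved into this file above embeds the generic struct kvm_vcpu as its first member, and the surrounding code recovers the containing structure from a plain vcpu pointer via to_svm(). A minimal standalone sketch of that embedding idiom, using the usual container_of() pattern and deliberately simplified stand-in struct bodies, is:

    #include <stddef.h>

    /* Sketch only: the real definitions live in svm.c and the kernel headers. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct kvm_vcpu { int vcpu_id; };
    struct vcpu_svm { struct kvm_vcpu vcpu; unsigned long vmcb_pa; };

    static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
    {
            /* Valid because 'vcpu' is embedded inside struct vcpu_svm. */
            return container_of(vcpu, struct vcpu_svm, vcpu);
    }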
60/* enable NPT for AMD64 and X86 with PAE */ 117/* enable NPT for AMD64 and X86 with PAE */
61#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 118#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
62static bool npt_enabled = true; 119static bool npt_enabled = true;
@@ -67,15 +124,14 @@ static int npt = 1;
67 124
68module_param(npt, int, S_IRUGO); 125module_param(npt, int, S_IRUGO);
69 126
70static int nested = 0; 127static int nested = 1;
71module_param(nested, int, S_IRUGO); 128module_param(nested, int, S_IRUGO);
72 129
73static void svm_flush_tlb(struct kvm_vcpu *vcpu); 130static void svm_flush_tlb(struct kvm_vcpu *vcpu);
131static void svm_complete_interrupts(struct vcpu_svm *svm);
74 132
75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); 133static int nested_svm_exit_handled(struct vcpu_svm *svm);
76static int nested_svm_vmexit(struct vcpu_svm *svm); 134static int nested_svm_vmexit(struct vcpu_svm *svm);
77static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
78 void *arg2, void *opaque);
79static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 135static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
80 bool has_error_code, u32 error_code); 136 bool has_error_code, u32 error_code);
81 137
@@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
86 142
87static inline bool is_nested(struct vcpu_svm *svm) 143static inline bool is_nested(struct vcpu_svm *svm)
88{ 144{
89 return svm->nested_vmcb; 145 return svm->nested.vmcb;
146}
147
148static inline void enable_gif(struct vcpu_svm *svm)
149{
150 svm->vcpu.arch.hflags |= HF_GIF_MASK;
151}
152
153static inline void disable_gif(struct vcpu_svm *svm)
154{
155 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
156}
157
158static inline bool gif_set(struct vcpu_svm *svm)
159{
160 return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
90} 161}
91 162
92static unsigned long iopm_base; 163static unsigned long iopm_base;
@@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
147 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); 218 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
148} 219}
149 220
150static inline unsigned long kvm_read_cr2(void)
151{
152 unsigned long cr2;
153
154 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
155 return cr2;
156}
157
158static inline void kvm_write_cr2(unsigned long val)
159{
160 asm volatile ("mov %0, %%cr2" :: "r" (val));
161}
162
163static inline void force_new_asid(struct kvm_vcpu *vcpu) 221static inline void force_new_asid(struct kvm_vcpu *vcpu)
164{ 222{
165 to_svm(vcpu)->asid_generation--; 223 to_svm(vcpu)->asid_generation--;
@@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage)
263 321
264 struct svm_cpu_data *svm_data; 322 struct svm_cpu_data *svm_data;
265 uint64_t efer; 323 uint64_t efer;
266 struct desc_ptr gdt_descr; 324 struct descriptor_table gdt_descr;
267 struct desc_struct *gdt; 325 struct desc_struct *gdt;
268 int me = raw_smp_processor_id(); 326 int me = raw_smp_processor_id();
269 327
@@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage)
283 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 341 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
284 svm_data->next_asid = svm_data->max_asid + 1; 342 svm_data->next_asid = svm_data->max_asid + 1;
285 343
286 asm volatile ("sgdt %0" : "=m"(gdt_descr)); 344 kvm_get_gdt(&gdt_descr);
287 gdt = (struct desc_struct *)gdt_descr.address; 345 gdt = (struct desc_struct *)gdt_descr.base;
288 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
289 347
290 rdmsrl(MSR_EFER, efer); 348 rdmsrl(MSR_EFER, efer);
@@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
367#endif 425#endif
368 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); 426 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
369 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); 427 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
370 set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
371 set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
372} 428}
373 429
374static void svm_enable_lbrv(struct vcpu_svm *svm) 430static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm)
595 } 651 }
596 force_new_asid(&svm->vcpu); 652 force_new_asid(&svm->vcpu);
597 653
598 svm->nested_vmcb = 0; 654 svm->nested.vmcb = 0;
599 svm->vcpu.arch.hflags = HF_GIF_MASK; 655 svm->vcpu.arch.hflags = 0;
656
657 enable_gif(svm);
600} 658}
601 659
602static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 660static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
605 663
606 init_vmcb(svm); 664 init_vmcb(svm);
607 665
608 if (vcpu->vcpu_id != 0) { 666 if (!kvm_vcpu_is_bsp(vcpu)) {
609 kvm_rip_write(vcpu, 0); 667 kvm_rip_write(vcpu, 0);
610 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 668 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
611 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 669 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
@@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
656 hsave_page = alloc_page(GFP_KERNEL); 714 hsave_page = alloc_page(GFP_KERNEL);
657 if (!hsave_page) 715 if (!hsave_page)
658 goto uninit; 716 goto uninit;
659 svm->hsave = page_address(hsave_page); 717 svm->nested.hsave = page_address(hsave_page);
660 718
661 svm->nested_msrpm = page_address(nested_msrpm_pages); 719 svm->nested.msrpm = page_address(nested_msrpm_pages);
662 720
663 svm->vmcb = page_address(page); 721 svm->vmcb = page_address(page);
664 clear_page(svm->vmcb); 722 clear_page(svm->vmcb);
@@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
669 fx_init(&svm->vcpu); 727 fx_init(&svm->vcpu);
670 svm->vcpu.fpu_active = 1; 728 svm->vcpu.fpu_active = 1;
671 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 729 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
672 if (svm->vcpu.vcpu_id == 0) 730 if (kvm_vcpu_is_bsp(&svm->vcpu))
673 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 731 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
674 732
675 return &svm->vcpu; 733 return &svm->vcpu;
@@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
688 746
689 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); 747 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
690 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 748 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
691 __free_page(virt_to_page(svm->hsave)); 749 __free_page(virt_to_page(svm->nested.hsave));
692 __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); 750 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
693 kvm_vcpu_uninit(vcpu); 751 kvm_vcpu_uninit(vcpu);
694 kmem_cache_free(kvm_vcpu_cache, svm); 752 kmem_cache_free(kvm_vcpu_cache, svm);
695} 753}
@@ -740,6 +798,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
740 to_svm(vcpu)->vmcb->save.rflags = rflags; 798 to_svm(vcpu)->vmcb->save.rflags = rflags;
741} 799}
742 800
801static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
802{
803 switch (reg) {
804 case VCPU_EXREG_PDPTR:
805 BUG_ON(!npt_enabled);
806 load_pdptrs(vcpu, vcpu->arch.cr3);
807 break;
808 default:
809 BUG();
810 }
811}
812
743static void svm_set_vintr(struct vcpu_svm *svm) 813static void svm_set_vintr(struct vcpu_svm *svm)
744{ 814{
745 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 815 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
@@ -1061,7 +1131,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
1061 val = 0; 1131 val = 0;
1062 } 1132 }
1063 1133
1064 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
1065 return val; 1134 return val;
1066} 1135}
1067 1136
@@ -1070,8 +1139,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1070{ 1139{
1071 struct vcpu_svm *svm = to_svm(vcpu); 1140 struct vcpu_svm *svm = to_svm(vcpu);
1072 1141
1073 KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
1074
1075 *exception = 0; 1142 *exception = 0;
1076 1143
1077 switch (dr) { 1144 switch (dr) {
@@ -1119,25 +1186,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1119 fault_address = svm->vmcb->control.exit_info_2; 1186 fault_address = svm->vmcb->control.exit_info_2;
1120 error_code = svm->vmcb->control.exit_info_1; 1187 error_code = svm->vmcb->control.exit_info_1;
1121 1188
1122 if (!npt_enabled) 1189 trace_kvm_page_fault(fault_address, error_code);
1123 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, 1190 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1124 (u32)fault_address, (u32)(fault_address >> 32), 1191 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1125 handler);
1126 else
1127 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1128 (u32)fault_address, (u32)(fault_address >> 32),
1129 handler);
1130 /*
1131 * FIXME: Tis shouldn't be necessary here, but there is a flush
1132 * missing in the MMU code. Until we find this bug, flush the
1133 * complete TLB here on an NPF
1134 */
1135 if (npt_enabled)
1136 svm_flush_tlb(&svm->vcpu);
1137 else {
1138 if (kvm_event_needs_reinjection(&svm->vcpu))
1139 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1140 }
1141 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1192 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1142} 1193}
1143 1194
@@ -1253,14 +1304,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1253 1304
1254static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1305static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1255{ 1306{
1256 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1257 return 1; 1307 return 1;
1258} 1308}
1259 1309
1260static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1310static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1261{ 1311{
1262 ++svm->vcpu.stat.irq_exits; 1312 ++svm->vcpu.stat.irq_exits;
1263 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1264 return 1; 1313 return 1;
1265} 1314}
1266 1315
@@ -1303,44 +1352,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
1303static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 1352static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1304 bool has_error_code, u32 error_code) 1353 bool has_error_code, u32 error_code)
1305{ 1354{
1306 if (is_nested(svm)) { 1355 if (!is_nested(svm))
1307 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1356 return 0;
1308 svm->vmcb->control.exit_code_hi = 0;
1309 svm->vmcb->control.exit_info_1 = error_code;
1310 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1311 if (nested_svm_exit_handled(svm, false)) {
1312 nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
1313
1314 nested_svm_vmexit(svm);
1315 return 1;
1316 }
1317 }
1318 1357
1319 return 0; 1358 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1359 svm->vmcb->control.exit_code_hi = 0;
1360 svm->vmcb->control.exit_info_1 = error_code;
1361 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1362
1363 return nested_svm_exit_handled(svm);
1320} 1364}
1321 1365
1322static inline int nested_svm_intr(struct vcpu_svm *svm) 1366static inline int nested_svm_intr(struct vcpu_svm *svm)
1323{ 1367{
1324 if (is_nested(svm)) { 1368 if (!is_nested(svm))
1325 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1369 return 0;
1326 return 0;
1327 1370
1328 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1371 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1329 return 0; 1372 return 0;
1330 1373
1331 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1374 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1375 return 0;
1332 1376
1333 if (nested_svm_exit_handled(svm, false)) { 1377 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1334 nsvm_printk("VMexit -> INTR\n"); 1378
1335 nested_svm_vmexit(svm); 1379 if (nested_svm_exit_handled(svm)) {
1336 return 1; 1380 nsvm_printk("VMexit -> INTR\n");
1337 } 1381 return 1;
1338 } 1382 }
1339 1383
1340 return 0; 1384 return 0;
1341} 1385}
1342 1386
1343static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) 1387static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1344{ 1388{
1345 struct page *page; 1389 struct page *page;
1346 1390
@@ -1348,236 +1392,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
1348 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1392 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1349 up_read(&current->mm->mmap_sem); 1393 up_read(&current->mm->mmap_sem);
1350 1394
1351 if (is_error_page(page)) { 1395 if (is_error_page(page))
1352 printk(KERN_INFO "%s: could not find page at 0x%llx\n", 1396 goto error;
1353 __func__, gpa); 1397
1354 kvm_release_page_clean(page); 1398 return kmap_atomic(page, idx);
1355 kvm_inject_gp(&svm->vcpu, 0); 1399
1356 return NULL; 1400error:
1357 } 1401 kvm_release_page_clean(page);
1358 return page; 1402 kvm_inject_gp(&svm->vcpu, 0);
1403
1404 return NULL;
1359} 1405}
1360 1406
1361static int nested_svm_do(struct vcpu_svm *svm, 1407static void nested_svm_unmap(void *addr, enum km_type idx)
1362 u64 arg1_gpa, u64 arg2_gpa, void *opaque,
1363 int (*handler)(struct vcpu_svm *svm,
1364 void *arg1,
1365 void *arg2,
1366 void *opaque))
1367{ 1408{
1368 struct page *arg1_page; 1409 struct page *page;
1369 struct page *arg2_page = NULL;
1370 void *arg1;
1371 void *arg2 = NULL;
1372 int retval;
1373 1410
1374 arg1_page = nested_svm_get_page(svm, arg1_gpa); 1411 if (!addr)
1375 if(arg1_page == NULL) 1412 return;
1376 return 1;
1377 1413
1378 if (arg2_gpa) { 1414 page = kmap_atomic_to_page(addr);
1379 arg2_page = nested_svm_get_page(svm, arg2_gpa); 1415
1380 if(arg2_page == NULL) { 1416 kunmap_atomic(addr, idx);
1381 kvm_release_page_clean(arg1_page); 1417 kvm_release_page_dirty(page);
1382 return 1; 1418}
1383 } 1419
1384 } 1420static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1421{
1422 u32 param = svm->vmcb->control.exit_info_1 & 1;
1423 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1424 bool ret = false;
1425 u32 t0, t1;
1426 u8 *msrpm;
1385 1427
1386 arg1 = kmap_atomic(arg1_page, KM_USER0); 1428 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1387 if (arg2_gpa) 1429 return false;
1388 arg2 = kmap_atomic(arg2_page, KM_USER1);
1389 1430
1390 retval = handler(svm, arg1, arg2, opaque); 1431 msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
1432
1433 if (!msrpm)
1434 goto out;
1435
1436 switch (msr) {
1437 case 0 ... 0x1fff:
1438 t0 = (msr * 2) % 8;
1439 t1 = msr / 8;
1440 break;
1441 case 0xc0000000 ... 0xc0001fff:
1442 t0 = (8192 + msr - 0xc0000000) * 2;
1443 t1 = (t0 / 8);
1444 t0 %= 8;
1445 break;
1446 case 0xc0010000 ... 0xc0011fff:
1447 t0 = (16384 + msr - 0xc0010000) * 2;
1448 t1 = (t0 / 8);
1449 t0 %= 8;
1450 break;
1451 default:
1452 ret = true;
1453 goto out;
1454 }
1391 1455
1392 kunmap_atomic(arg1, KM_USER0); 1456 ret = msrpm[t1] & ((1 << param) << t0);
1393 if (arg2_gpa)
1394 kunmap_atomic(arg2, KM_USER1);
1395 1457
1396 kvm_release_page_dirty(arg1_page); 1458out:
1397 if (arg2_gpa) 1459 nested_svm_unmap(msrpm, KM_USER0);
1398 kvm_release_page_dirty(arg2_page);
1399 1460
1400 return retval; 1461 return ret;
1401} 1462}
1402 1463
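[Editorial sketch, not part of the patch.] nested_svm_exit_handled_msr() above decides whether an MSR access should be reflected to the L1 hypervisor by consulting the guest's MSR permission map, with param selecting the write bit (WRMSR) versus the read bit (RDMSR). As a hedged standalone sketch of that lookup — following the AMD-documented MSRPM layout of three 2 KiB regions with two intercept bits per MSR, read bit first; note the in-kernel index arithmetic above differs slightly, and msrpm_intercepted is a hypothetical name — one could write:

    /* Sketch only: returns nonzero if 'msrpm' intercepts the access. */
    static int msrpm_intercepted(const unsigned char *msrpm,
                                 unsigned int msr, int is_write)
    {
            unsigned int base, bit;

            if (msr <= 0x1fff) {
                    base = 0x0;                      /* bytes 0x0000-0x07ff */
            } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
                    base = 0x800;                    /* bytes 0x0800-0x0fff */
                    msr -= 0xc0000000;
            } else if (msr >= 0xc0010000 && msr <= 0xc0011fff) {
                    base = 0x1000;                   /* bytes 0x1000-0x17ff */
                    msr -= 0xc0010000;
            } else {
                    return 1;                        /* unknown MSR: treat as intercepted */
            }

            bit = msr * 2 + (is_write ? 1 : 0);      /* two bits per MSR, read bit first */
            return msrpm[base + bit / 8] & (1 << (bit % 8));
    }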
1403static int nested_svm_exit_handled_real(struct vcpu_svm *svm, 1464static int nested_svm_exit_special(struct vcpu_svm *svm)
1404 void *arg1,
1405 void *arg2,
1406 void *opaque)
1407{ 1465{
1408 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1409 bool kvm_overrides = *(bool *)opaque;
1410 u32 exit_code = svm->vmcb->control.exit_code; 1466 u32 exit_code = svm->vmcb->control.exit_code;
1411 1467
1412 if (kvm_overrides) { 1468 switch (exit_code) {
1413 switch (exit_code) { 1469 case SVM_EXIT_INTR:
1414 case SVM_EXIT_INTR: 1470 case SVM_EXIT_NMI:
1415 case SVM_EXIT_NMI: 1471 return NESTED_EXIT_HOST;
1416 return 0;
1417 /* For now we are always handling NPFs when using them */ 1472 /* For now we are always handling NPFs when using them */
1418 case SVM_EXIT_NPF: 1473 case SVM_EXIT_NPF:
1419 if (npt_enabled) 1474 if (npt_enabled)
1420 return 0; 1475 return NESTED_EXIT_HOST;
1421 break; 1476 break;
1422 /* When we're shadowing, trap PFs */ 1477 /* When we're shadowing, trap PFs */
1423 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1478 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1424 if (!npt_enabled) 1479 if (!npt_enabled)
1425 return 0; 1480 return NESTED_EXIT_HOST;
1426 break; 1481 break;
1427 default: 1482 default:
1428 break; 1483 break;
1429 }
1430 } 1484 }
1431 1485
1486 return NESTED_EXIT_CONTINUE;
1487}
1488
1489/*
1490 * If this function returns true, this #vmexit was already handled
1491 */
1492static int nested_svm_exit_handled(struct vcpu_svm *svm)
1493{
1494 u32 exit_code = svm->vmcb->control.exit_code;
1495 int vmexit = NESTED_EXIT_HOST;
1496
1432 switch (exit_code) { 1497 switch (exit_code) {
1498 case SVM_EXIT_MSR:
1499 vmexit = nested_svm_exit_handled_msr(svm);
1500 break;
1433 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 1501 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1434 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 1502 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1435 if (nested_vmcb->control.intercept_cr_read & cr_bits) 1503 if (svm->nested.intercept_cr_read & cr_bits)
1436 return 1; 1504 vmexit = NESTED_EXIT_DONE;
1437 break; 1505 break;
1438 } 1506 }
1439 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { 1507 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
1440 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); 1508 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
1441 if (nested_vmcb->control.intercept_cr_write & cr_bits) 1509 if (svm->nested.intercept_cr_write & cr_bits)
1442 return 1; 1510 vmexit = NESTED_EXIT_DONE;
1443 break; 1511 break;
1444 } 1512 }
1445 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { 1513 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1446 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); 1514 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1447 if (nested_vmcb->control.intercept_dr_read & dr_bits) 1515 if (svm->nested.intercept_dr_read & dr_bits)
1448 return 1; 1516 vmexit = NESTED_EXIT_DONE;
1449 break; 1517 break;
1450 } 1518 }
1451 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { 1519 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1452 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); 1520 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1453 if (nested_vmcb->control.intercept_dr_write & dr_bits) 1521 if (svm->nested.intercept_dr_write & dr_bits)
1454 return 1; 1522 vmexit = NESTED_EXIT_DONE;
1455 break; 1523 break;
1456 } 1524 }
1457 case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { 1525 case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1458 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 1526 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1459 if (nested_vmcb->control.intercept_exceptions & excp_bits) 1527 if (svm->nested.intercept_exceptions & excp_bits)
1460 return 1; 1528 vmexit = NESTED_EXIT_DONE;
1461 break; 1529 break;
1462 } 1530 }
1463 default: { 1531 default: {
1464 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1532 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1465 nsvm_printk("exit code: 0x%x\n", exit_code); 1533 nsvm_printk("exit code: 0x%x\n", exit_code);
1466 if (nested_vmcb->control.intercept & exit_bits) 1534 if (svm->nested.intercept & exit_bits)
1467 return 1; 1535 vmexit = NESTED_EXIT_DONE;
1468 } 1536 }
1469 } 1537 }
1470 1538
1471 return 0; 1539 if (vmexit == NESTED_EXIT_DONE) {
1472} 1540 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1473 1541 nested_svm_vmexit(svm);
1474static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
1475 void *arg1, void *arg2,
1476 void *opaque)
1477{
1478 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1479 u8 *msrpm = (u8 *)arg2;
1480 u32 t0, t1;
1481 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1482 u32 param = svm->vmcb->control.exit_info_1 & 1;
1483
1484 if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1485 return 0;
1486
1487 switch(msr) {
1488 case 0 ... 0x1fff:
1489 t0 = (msr * 2) % 8;
1490 t1 = msr / 8;
1491 break;
1492 case 0xc0000000 ... 0xc0001fff:
1493 t0 = (8192 + msr - 0xc0000000) * 2;
1494 t1 = (t0 / 8);
1495 t0 %= 8;
1496 break;
1497 case 0xc0010000 ... 0xc0011fff:
1498 t0 = (16384 + msr - 0xc0010000) * 2;
1499 t1 = (t0 / 8);
1500 t0 %= 8;
1501 break;
1502 default:
1503 return 1;
1504 break;
1505 } 1542 }
1506 if (msrpm[t1] & ((1 << param) << t0))
1507 return 1;
1508 1543
1509 return 0; 1544 return vmexit;
1545}
1546
1547static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
1548{
1549 struct vmcb_control_area *dst = &dst_vmcb->control;
1550 struct vmcb_control_area *from = &from_vmcb->control;
1551
1552 dst->intercept_cr_read = from->intercept_cr_read;
1553 dst->intercept_cr_write = from->intercept_cr_write;
1554 dst->intercept_dr_read = from->intercept_dr_read;
1555 dst->intercept_dr_write = from->intercept_dr_write;
1556 dst->intercept_exceptions = from->intercept_exceptions;
1557 dst->intercept = from->intercept;
1558 dst->iopm_base_pa = from->iopm_base_pa;
1559 dst->msrpm_base_pa = from->msrpm_base_pa;
1560 dst->tsc_offset = from->tsc_offset;
1561 dst->asid = from->asid;
1562 dst->tlb_ctl = from->tlb_ctl;
1563 dst->int_ctl = from->int_ctl;
1564 dst->int_vector = from->int_vector;
1565 dst->int_state = from->int_state;
1566 dst->exit_code = from->exit_code;
1567 dst->exit_code_hi = from->exit_code_hi;
1568 dst->exit_info_1 = from->exit_info_1;
1569 dst->exit_info_2 = from->exit_info_2;
1570 dst->exit_int_info = from->exit_int_info;
1571 dst->exit_int_info_err = from->exit_int_info_err;
1572 dst->nested_ctl = from->nested_ctl;
1573 dst->event_inj = from->event_inj;
1574 dst->event_inj_err = from->event_inj_err;
1575 dst->nested_cr3 = from->nested_cr3;
1576 dst->lbr_ctl = from->lbr_ctl;
1510} 1577}
1511 1578
1512static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) 1579static int nested_svm_vmexit(struct vcpu_svm *svm)
1513{ 1580{
1514 bool k = kvm_override; 1581 struct vmcb *nested_vmcb;
1515 1582 struct vmcb *hsave = svm->nested.hsave;
1516 switch (svm->vmcb->control.exit_code) { 1583 struct vmcb *vmcb = svm->vmcb;
1517 case SVM_EXIT_MSR:
1518 return nested_svm_do(svm, svm->nested_vmcb,
1519 svm->nested_vmcb_msrpm, NULL,
1520 nested_svm_exit_handled_msr);
1521 default: break;
1522 }
1523 1584
1524 return nested_svm_do(svm, svm->nested_vmcb, 0, &k, 1585 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1525 nested_svm_exit_handled_real); 1586 if (!nested_vmcb)
1526} 1587 return 1;
1527
1528static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
1529 void *arg2, void *opaque)
1530{
1531 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1532 struct vmcb *hsave = svm->hsave;
1533 u64 nested_save[] = { nested_vmcb->save.cr0,
1534 nested_vmcb->save.cr3,
1535 nested_vmcb->save.cr4,
1536 nested_vmcb->save.efer,
1537 nested_vmcb->control.intercept_cr_read,
1538 nested_vmcb->control.intercept_cr_write,
1539 nested_vmcb->control.intercept_dr_read,
1540 nested_vmcb->control.intercept_dr_write,
1541 nested_vmcb->control.intercept_exceptions,
1542 nested_vmcb->control.intercept,
1543 nested_vmcb->control.msrpm_base_pa,
1544 nested_vmcb->control.iopm_base_pa,
1545 nested_vmcb->control.tsc_offset };
1546 1588
1547 /* Give the current vmcb to the guest */ 1589 /* Give the current vmcb to the guest */
1548 memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); 1590 disable_gif(svm);
1549 nested_vmcb->save.cr0 = nested_save[0]; 1591
1550 if (!npt_enabled) 1592 nested_vmcb->save.es = vmcb->save.es;
1551 nested_vmcb->save.cr3 = nested_save[1]; 1593 nested_vmcb->save.cs = vmcb->save.cs;
1552 nested_vmcb->save.cr4 = nested_save[2]; 1594 nested_vmcb->save.ss = vmcb->save.ss;
1553 nested_vmcb->save.efer = nested_save[3]; 1595 nested_vmcb->save.ds = vmcb->save.ds;
1554 nested_vmcb->control.intercept_cr_read = nested_save[4]; 1596 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1555 nested_vmcb->control.intercept_cr_write = nested_save[5]; 1597 nested_vmcb->save.idtr = vmcb->save.idtr;
1556 nested_vmcb->control.intercept_dr_read = nested_save[6]; 1598 if (npt_enabled)
1557 nested_vmcb->control.intercept_dr_write = nested_save[7]; 1599 nested_vmcb->save.cr3 = vmcb->save.cr3;
1558 nested_vmcb->control.intercept_exceptions = nested_save[8]; 1600 nested_vmcb->save.cr2 = vmcb->save.cr2;
1559 nested_vmcb->control.intercept = nested_save[9]; 1601 nested_vmcb->save.rflags = vmcb->save.rflags;
1560 nested_vmcb->control.msrpm_base_pa = nested_save[10]; 1602 nested_vmcb->save.rip = vmcb->save.rip;
1561 nested_vmcb->control.iopm_base_pa = nested_save[11]; 1603 nested_vmcb->save.rsp = vmcb->save.rsp;
1562 nested_vmcb->control.tsc_offset = nested_save[12]; 1604 nested_vmcb->save.rax = vmcb->save.rax;
1605 nested_vmcb->save.dr7 = vmcb->save.dr7;
1606 nested_vmcb->save.dr6 = vmcb->save.dr6;
1607 nested_vmcb->save.cpl = vmcb->save.cpl;
1608
1609 nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
1610 nested_vmcb->control.int_vector = vmcb->control.int_vector;
1611 nested_vmcb->control.int_state = vmcb->control.int_state;
1612 nested_vmcb->control.exit_code = vmcb->control.exit_code;
1613 nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
1614 nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
1615 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1616 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1617 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1618 nested_vmcb->control.tlb_ctl = 0;
1619 nested_vmcb->control.event_inj = 0;
1620 nested_vmcb->control.event_inj_err = 0;
1563 1621
1564 /* We always set V_INTR_MASKING and remember the old value in hflags */ 1622 /* We always set V_INTR_MASKING and remember the old value in hflags */
1565 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1623 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1566 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; 1624 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
1567 1625
1568 if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
1569 (nested_vmcb->control.int_vector)) {
1570 nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
1571 nested_vmcb->control.int_vector);
1572 }
1573
1574 /* Restore the original control entries */ 1626 /* Restore the original control entries */
1575 svm->vmcb->control = hsave->control; 1627 copy_vmcb_control_area(vmcb, hsave);
1576 1628
1577 /* Kill any pending exceptions */ 1629 /* Kill any pending exceptions */
1578 if (svm->vcpu.arch.exception.pending == true) 1630 if (svm->vcpu.arch.exception.pending == true)
1579 nsvm_printk("WARNING: Pending Exception\n"); 1631 nsvm_printk("WARNING: Pending Exception\n");
1580 svm->vcpu.arch.exception.pending = false; 1632
1633 kvm_clear_exception_queue(&svm->vcpu);
1634 kvm_clear_interrupt_queue(&svm->vcpu);
1581 1635
1582 /* Restore selected save entries */ 1636 /* Restore selected save entries */
1583 svm->vmcb->save.es = hsave->save.es; 1637 svm->vmcb->save.es = hsave->save.es;
@@ -1603,19 +1657,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
1603 svm->vmcb->save.cpl = 0; 1657 svm->vmcb->save.cpl = 0;
1604 svm->vmcb->control.exit_int_info = 0; 1658 svm->vmcb->control.exit_int_info = 0;
1605 1659
1606 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1607 /* Exit nested SVM mode */ 1660 /* Exit nested SVM mode */
1608 svm->nested_vmcb = 0; 1661 svm->nested.vmcb = 0;
1609 1662
1610 return 0; 1663 nested_svm_unmap(nested_vmcb, KM_USER0);
1611}
1612
1613static int nested_svm_vmexit(struct vcpu_svm *svm)
1614{
1615 nsvm_printk("VMexit\n");
1616 if (nested_svm_do(svm, svm->nested_vmcb, 0,
1617 NULL, nested_svm_vmexit_real))
1618 return 1;
1619 1664
1620 kvm_mmu_reset_context(&svm->vcpu); 1665 kvm_mmu_reset_context(&svm->vcpu);
1621 kvm_mmu_load(&svm->vcpu); 1666 kvm_mmu_load(&svm->vcpu);
@@ -1623,38 +1668,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1623 return 0; 1668 return 0;
1624} 1669}
1625 1670
1626static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, 1671static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
1627 void *arg2, void *opaque)
1628{ 1672{
1673 u32 *nested_msrpm;
1629 int i; 1674 int i;
1630 u32 *nested_msrpm = (u32*)arg1; 1675
1676 nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
1677 if (!nested_msrpm)
1678 return false;
1679
1631 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) 1680 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
1632 svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; 1681 svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
1633 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
1634 1682
1635 return 0; 1683 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
1684
1685 nested_svm_unmap(nested_msrpm, KM_USER0);
1686
1687 return true;
1636} 1688}
1637 1689
1638static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, 1690static bool nested_svm_vmrun(struct vcpu_svm *svm)
1639 void *arg2, void *opaque)
1640{ 1691{
1641 struct vmcb *nested_vmcb = (struct vmcb *)arg1; 1692 struct vmcb *nested_vmcb;
1642 struct vmcb *hsave = svm->hsave; 1693 struct vmcb *hsave = svm->nested.hsave;
1694 struct vmcb *vmcb = svm->vmcb;
1695
1696 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
1697 if (!nested_vmcb)
1698 return false;
1643 1699
1644 /* nested_vmcb is our indicator if nested SVM is activated */ 1700 /* nested_vmcb is our indicator if nested SVM is activated */
1645 svm->nested_vmcb = svm->vmcb->save.rax; 1701 svm->nested.vmcb = svm->vmcb->save.rax;
1646 1702
1647 /* Clear internal status */ 1703 /* Clear internal status */
1648 svm->vcpu.arch.exception.pending = false; 1704 kvm_clear_exception_queue(&svm->vcpu);
1705 kvm_clear_interrupt_queue(&svm->vcpu);
1649 1706
1650 /* Save the old vmcb, so we don't need to pick what we save, but 1707 /* Save the old vmcb, so we don't need to pick what we save, but
1651 can restore everything when a VMEXIT occurs */ 1708 can restore everything when a VMEXIT occurs */
1652 memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); 1709 hsave->save.es = vmcb->save.es;
1653 /* We need to remember the original CR3 in the SPT case */ 1710 hsave->save.cs = vmcb->save.cs;
1654 if (!npt_enabled) 1711 hsave->save.ss = vmcb->save.ss;
1655 hsave->save.cr3 = svm->vcpu.arch.cr3; 1712 hsave->save.ds = vmcb->save.ds;
1656 hsave->save.cr4 = svm->vcpu.arch.cr4; 1713 hsave->save.gdtr = vmcb->save.gdtr;
1657 hsave->save.rip = svm->next_rip; 1714 hsave->save.idtr = vmcb->save.idtr;
1715 hsave->save.efer = svm->vcpu.arch.shadow_efer;
1716 hsave->save.cr0 = svm->vcpu.arch.cr0;
1717 hsave->save.cr4 = svm->vcpu.arch.cr4;
1718 hsave->save.rflags = vmcb->save.rflags;
1719 hsave->save.rip = svm->next_rip;
1720 hsave->save.rsp = vmcb->save.rsp;
1721 hsave->save.rax = vmcb->save.rax;
1722 if (npt_enabled)
1723 hsave->save.cr3 = vmcb->save.cr3;
1724 else
1725 hsave->save.cr3 = svm->vcpu.arch.cr3;
1726
1727 copy_vmcb_control_area(hsave, vmcb);
1658 1728
1659 if (svm->vmcb->save.rflags & X86_EFLAGS_IF) 1729 if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
1660 svm->vcpu.arch.hflags |= HF_HIF_MASK; 1730 svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -1679,7 +1749,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1679 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 1749 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1680 kvm_mmu_reset_context(&svm->vcpu); 1750 kvm_mmu_reset_context(&svm->vcpu);
1681 } 1751 }
1682 svm->vmcb->save.cr2 = nested_vmcb->save.cr2; 1752 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
1683 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 1753 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1684 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 1754 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1685 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 1755 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
@@ -1706,7 +1776,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1706 1776
1707 svm->vmcb->control.intercept |= nested_vmcb->control.intercept; 1777 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1708 1778
1709 svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; 1779 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1780
1781 /* cache intercepts */
1782 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
1783 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
1784 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
1785 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
1786 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
1787 svm->nested.intercept = nested_vmcb->control.intercept;
1710 1788
1711 force_new_asid(&svm->vcpu); 1789 force_new_asid(&svm->vcpu);
1712 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; 1790 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
@@ -1734,12 +1812,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1734 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1812 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1735 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1813 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1736 1814
1737 svm->vcpu.arch.hflags |= HF_GIF_MASK; 1815 nested_svm_unmap(nested_vmcb, KM_USER0);
1738 1816
1739 return 0; 1817 enable_gif(svm);
1818
1819 return true;
1740} 1820}
1741 1821
1742static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) 1822static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1743{ 1823{
1744 to_vmcb->save.fs = from_vmcb->save.fs; 1824 to_vmcb->save.fs = from_vmcb->save.fs;
1745 to_vmcb->save.gs = from_vmcb->save.gs; 1825 to_vmcb->save.gs = from_vmcb->save.gs;
@@ -1753,44 +1833,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1753 to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; 1833 to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
1754 to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; 1834 to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
1755 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1835 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1756
1757 return 1;
1758}
1759
1760static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
1761 void *arg2, void *opaque)
1762{
1763 return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
1764}
1765
1766static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
1767 void *arg2, void *opaque)
1768{
1769 return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
1770} 1836}
1771 1837
1772static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1838static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1773{ 1839{
1840 struct vmcb *nested_vmcb;
1841
1774 if (nested_svm_check_permissions(svm)) 1842 if (nested_svm_check_permissions(svm))
1775 return 1; 1843 return 1;
1776 1844
1777 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1845 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1778 skip_emulated_instruction(&svm->vcpu); 1846 skip_emulated_instruction(&svm->vcpu);
1779 1847
1780 nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); 1848 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
1849 if (!nested_vmcb)
1850 return 1;
1851
1852 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
1853 nested_svm_unmap(nested_vmcb, KM_USER0);
1781 1854
1782 return 1; 1855 return 1;
1783} 1856}
1784 1857
1785static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1858static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1786{ 1859{
1860 struct vmcb *nested_vmcb;
1861
1787 if (nested_svm_check_permissions(svm)) 1862 if (nested_svm_check_permissions(svm))
1788 return 1; 1863 return 1;
1789 1864
1790 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1865 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1791 skip_emulated_instruction(&svm->vcpu); 1866 skip_emulated_instruction(&svm->vcpu);
1792 1867
1793 nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); 1868 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
1869 if (!nested_vmcb)
1870 return 1;
1871
1872 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
1873 nested_svm_unmap(nested_vmcb, KM_USER0);
1794 1874
1795 return 1; 1875 return 1;
1796} 1876}
@@ -1798,19 +1878,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1798static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1878static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1799{ 1879{
1800 nsvm_printk("VMrun\n"); 1880 nsvm_printk("VMrun\n");
1881
1801 if (nested_svm_check_permissions(svm)) 1882 if (nested_svm_check_permissions(svm))
1802 return 1; 1883 return 1;
1803 1884
1804 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1885 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1805 skip_emulated_instruction(&svm->vcpu); 1886 skip_emulated_instruction(&svm->vcpu);
1806 1887
1807 if (nested_svm_do(svm, svm->vmcb->save.rax, 0, 1888 if (!nested_svm_vmrun(svm))
1808 NULL, nested_svm_vmrun))
1809 return 1; 1889 return 1;
1810 1890
1811 if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, 1891 if (!nested_svm_vmrun_msrpm(svm))
1812 NULL, nested_svm_vmrun_msrpm)) 1892 goto failed;
1813 return 1; 1893
1894 return 1;
1895
1896failed:
1897
1898 svm->vmcb->control.exit_code = SVM_EXIT_ERR;
1899 svm->vmcb->control.exit_code_hi = 0;
1900 svm->vmcb->control.exit_info_1 = 0;
1901 svm->vmcb->control.exit_info_2 = 0;
1902
1903 nested_svm_vmexit(svm);
1814 1904
1815 return 1; 1905 return 1;
1816} 1906}
@@ -1823,7 +1913,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1823 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1913 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1824 skip_emulated_instruction(&svm->vcpu); 1914 skip_emulated_instruction(&svm->vcpu);
1825 1915
1826 svm->vcpu.arch.hflags |= HF_GIF_MASK; 1916 enable_gif(svm);
1827 1917
1828 return 1; 1918 return 1;
1829} 1919}
@@ -1836,7 +1926,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1836 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1926 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1837 skip_emulated_instruction(&svm->vcpu); 1927 skip_emulated_instruction(&svm->vcpu);
1838 1928
1839 svm->vcpu.arch.hflags &= ~HF_GIF_MASK; 1929 disable_gif(svm);
1840 1930
1841 /* After a CLGI no interrupts should come */ 1931 /* After a CLGI no interrupts should come */
1842 svm_clear_vintr(svm); 1932 svm_clear_vintr(svm);
@@ -1845,6 +1935,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1845 return 1; 1935 return 1;
1846} 1936}
1847 1937
1938static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1939{
1940 struct kvm_vcpu *vcpu = &svm->vcpu;
1941 nsvm_printk("INVLPGA\n");
1942
1943 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1944 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
1945
1946 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1947 skip_emulated_instruction(&svm->vcpu);
1948 return 1;
1949}
1950
1848static int invalid_op_interception(struct vcpu_svm *svm, 1951static int invalid_op_interception(struct vcpu_svm *svm,
1849 struct kvm_run *kvm_run) 1952 struct kvm_run *kvm_run)
1850{ 1953{
@@ -1953,7 +2056,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1953 struct vcpu_svm *svm = to_svm(vcpu); 2056 struct vcpu_svm *svm = to_svm(vcpu);
1954 2057
1955 switch (ecx) { 2058 switch (ecx) {
1956 case MSR_IA32_TIME_STAMP_COUNTER: { 2059 case MSR_IA32_TSC: {
1957 u64 tsc; 2060 u64 tsc;
1958 2061
1959 rdtscll(tsc); 2062 rdtscll(tsc);
@@ -1981,10 +2084,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1981 *data = svm->vmcb->save.sysenter_cs; 2084 *data = svm->vmcb->save.sysenter_cs;
1982 break; 2085 break;
1983 case MSR_IA32_SYSENTER_EIP: 2086 case MSR_IA32_SYSENTER_EIP:
1984 *data = svm->vmcb->save.sysenter_eip; 2087 *data = svm->sysenter_eip;
1985 break; 2088 break;
1986 case MSR_IA32_SYSENTER_ESP: 2089 case MSR_IA32_SYSENTER_ESP:
1987 *data = svm->vmcb->save.sysenter_esp; 2090 *data = svm->sysenter_esp;
1988 break; 2091 break;
1989 /* Nobody will change the following 5 values in the VMCB so 2092 /* Nobody will change the following 5 values in the VMCB so
1990 we can safely return them on rdmsr. They will always be 0 2093 we can safely return them on rdmsr. They will always be 0
@@ -2005,7 +2108,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2005 *data = svm->vmcb->save.last_excp_to; 2108 *data = svm->vmcb->save.last_excp_to;
2006 break; 2109 break;
2007 case MSR_VM_HSAVE_PA: 2110 case MSR_VM_HSAVE_PA:
2008 *data = svm->hsave_msr; 2111 *data = svm->nested.hsave_msr;
2009 break; 2112 break;
2010 case MSR_VM_CR: 2113 case MSR_VM_CR:
2011 *data = 0; 2114 *data = 0;
@@ -2027,8 +2130,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2027 if (svm_get_msr(&svm->vcpu, ecx, &data)) 2130 if (svm_get_msr(&svm->vcpu, ecx, &data))
2028 kvm_inject_gp(&svm->vcpu, 0); 2131 kvm_inject_gp(&svm->vcpu, 0);
2029 else { 2132 else {
2030 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, 2133 trace_kvm_msr_read(ecx, data);
2031 (u32)(data >> 32), handler);
2032 2134
2033 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 2135 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
2034 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 2136 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
@@ -2043,7 +2145,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2043 struct vcpu_svm *svm = to_svm(vcpu); 2145 struct vcpu_svm *svm = to_svm(vcpu);
2044 2146
2045 switch (ecx) { 2147 switch (ecx) {
2046 case MSR_IA32_TIME_STAMP_COUNTER: { 2148 case MSR_IA32_TSC: {
2047 u64 tsc; 2149 u64 tsc;
2048 2150
2049 rdtscll(tsc); 2151 rdtscll(tsc);
@@ -2071,9 +2173,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2071 svm->vmcb->save.sysenter_cs = data; 2173 svm->vmcb->save.sysenter_cs = data;
2072 break; 2174 break;
2073 case MSR_IA32_SYSENTER_EIP: 2175 case MSR_IA32_SYSENTER_EIP:
2176 svm->sysenter_eip = data;
2074 svm->vmcb->save.sysenter_eip = data; 2177 svm->vmcb->save.sysenter_eip = data;
2075 break; 2178 break;
2076 case MSR_IA32_SYSENTER_ESP: 2179 case MSR_IA32_SYSENTER_ESP:
2180 svm->sysenter_esp = data;
2077 svm->vmcb->save.sysenter_esp = data; 2181 svm->vmcb->save.sysenter_esp = data;
2078 break; 2182 break;
2079 case MSR_IA32_DEBUGCTLMSR: 2183 case MSR_IA32_DEBUGCTLMSR:
@@ -2091,24 +2195,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2091 else 2195 else
2092 svm_disable_lbrv(svm); 2196 svm_disable_lbrv(svm);
2093 break; 2197 break;
2094 case MSR_K7_EVNTSEL0:
2095 case MSR_K7_EVNTSEL1:
2096 case MSR_K7_EVNTSEL2:
2097 case MSR_K7_EVNTSEL3:
2098 case MSR_K7_PERFCTR0:
2099 case MSR_K7_PERFCTR1:
2100 case MSR_K7_PERFCTR2:
2101 case MSR_K7_PERFCTR3:
2102 /*
2103 * Just discard all writes to the performance counters; this
2104 * should keep both older linux and windows 64-bit guests
2105 * happy
2106 */
2107 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
2108
2109 break;
2110 case MSR_VM_HSAVE_PA: 2198 case MSR_VM_HSAVE_PA:
2111 svm->hsave_msr = data; 2199 svm->nested.hsave_msr = data;
2200 break;
2201 case MSR_VM_CR:
2202 case MSR_VM_IGNNE:
2203 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2112 break; 2204 break;
2113 default: 2205 default:
2114 return kvm_set_msr_common(vcpu, ecx, data); 2206 return kvm_set_msr_common(vcpu, ecx, data);
@@ -2122,8 +2214,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2122 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2214 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2123 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2215 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2124 2216
2125 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), 2217 trace_kvm_msr_write(ecx, data);
2126 handler);
2127 2218
2128 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2219 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2129 if (svm_set_msr(&svm->vcpu, ecx, data)) 2220 if (svm_set_msr(&svm->vcpu, ecx, data))
@@ -2144,8 +2235,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2144static int interrupt_window_interception(struct vcpu_svm *svm, 2235static int interrupt_window_interception(struct vcpu_svm *svm,
2145 struct kvm_run *kvm_run) 2236 struct kvm_run *kvm_run)
2146{ 2237{
2147 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
2148
2149 svm_clear_vintr(svm); 2238 svm_clear_vintr(svm);
2150 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2239 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2151 /* 2240 /*
@@ -2201,7 +2290,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2201 [SVM_EXIT_INVD] = emulate_on_interception, 2290 [SVM_EXIT_INVD] = emulate_on_interception,
2202 [SVM_EXIT_HLT] = halt_interception, 2291 [SVM_EXIT_HLT] = halt_interception,
2203 [SVM_EXIT_INVLPG] = invlpg_interception, 2292 [SVM_EXIT_INVLPG] = invlpg_interception,
2204 [SVM_EXIT_INVLPGA] = invalid_op_interception, 2293 [SVM_EXIT_INVLPGA] = invlpga_interception,
2205 [SVM_EXIT_IOIO] = io_interception, 2294 [SVM_EXIT_IOIO] = io_interception,
2206 [SVM_EXIT_MSR] = msr_interception, 2295 [SVM_EXIT_MSR] = msr_interception,
2207 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2296 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
@@ -2224,20 +2313,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2224 struct vcpu_svm *svm = to_svm(vcpu); 2313 struct vcpu_svm *svm = to_svm(vcpu);
2225 u32 exit_code = svm->vmcb->control.exit_code; 2314 u32 exit_code = svm->vmcb->control.exit_code;
2226 2315
2227 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, 2316 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2228 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
2229 2317
2230 if (is_nested(svm)) { 2318 if (is_nested(svm)) {
2319 int vmexit;
2320
2231 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2321 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
2232 exit_code, svm->vmcb->control.exit_info_1, 2322 exit_code, svm->vmcb->control.exit_info_1,
2233 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2323 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
2234 if (nested_svm_exit_handled(svm, true)) { 2324
2235 nested_svm_vmexit(svm); 2325 vmexit = nested_svm_exit_special(svm);
2236 nsvm_printk("-> #VMEXIT\n"); 2326
2327 if (vmexit == NESTED_EXIT_CONTINUE)
2328 vmexit = nested_svm_exit_handled(svm);
2329
2330 if (vmexit == NESTED_EXIT_DONE)
2237 return 1; 2331 return 1;
2238 }
2239 } 2332 }
2240 2333
2334 svm_complete_interrupts(svm);
2335
2241 if (npt_enabled) { 2336 if (npt_enabled) {
2242 int mmu_reload = 0; 2337 int mmu_reload = 0;
2243 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 2338 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -2246,12 +2341,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2246 } 2341 }
2247 vcpu->arch.cr0 = svm->vmcb->save.cr0; 2342 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2248 vcpu->arch.cr3 = svm->vmcb->save.cr3; 2343 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2249 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2250 if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
2251 kvm_inject_gp(vcpu, 0);
2252 return 1;
2253 }
2254 }
2255 if (mmu_reload) { 2344 if (mmu_reload) {
2256 kvm_mmu_reset_context(vcpu); 2345 kvm_mmu_reset_context(vcpu);
2257 kvm_mmu_load(vcpu); 2346 kvm_mmu_load(vcpu);
@@ -2319,7 +2408,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2319{ 2408{
2320 struct vmcb_control_area *control; 2409 struct vmcb_control_area *control;
2321 2410
2322 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); 2411 trace_kvm_inj_virq(irq);
2323 2412
2324 ++svm->vcpu.stat.irq_injections; 2413 ++svm->vcpu.stat.irq_injections;
2325 control = &svm->vmcb->control; 2414 control = &svm->vmcb->control;
@@ -2329,21 +2418,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2329 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 2418 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2330} 2419}
2331 2420
2332static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
2333{
2334 struct vcpu_svm *svm = to_svm(vcpu);
2335
2336 svm->vmcb->control.event_inj = nr |
2337 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2338}
2339
2340static void svm_set_irq(struct kvm_vcpu *vcpu) 2421static void svm_set_irq(struct kvm_vcpu *vcpu)
2341{ 2422{
2342 struct vcpu_svm *svm = to_svm(vcpu); 2423 struct vcpu_svm *svm = to_svm(vcpu);
2343 2424
2344 nested_svm_intr(svm); 2425 BUG_ON(!(gif_set(svm)));
2345 2426
2346 svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); 2427 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2428 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2347} 2429}
2348 2430
2349static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 2431static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -2371,13 +2453,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2371 struct vmcb *vmcb = svm->vmcb; 2453 struct vmcb *vmcb = svm->vmcb;
2372 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2454 return (vmcb->save.rflags & X86_EFLAGS_IF) &&
2373 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2455 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2374 (svm->vcpu.arch.hflags & HF_GIF_MASK); 2456 gif_set(svm) &&
2457 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
2375} 2458}
2376 2459
2377static void enable_irq_window(struct kvm_vcpu *vcpu) 2460static void enable_irq_window(struct kvm_vcpu *vcpu)
2378{ 2461{
2379 svm_set_vintr(to_svm(vcpu)); 2462 struct vcpu_svm *svm = to_svm(vcpu);
2380 svm_inject_irq(to_svm(vcpu), 0x0); 2463 nsvm_printk("Trying to open IRQ window\n");
2464
2465 nested_svm_intr(svm);
2466
2467 /* In case GIF=0 we can't rely on the CPU to tell us when
2468 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
2469 * The next time we get that intercept, this function will be
2470 * called again though and we'll get the vintr intercept. */
2471 if (gif_set(svm)) {
2472 svm_set_vintr(svm);
2473 svm_inject_irq(svm, 0x0);
2474 }
2381} 2475}
2382 2476
2383static void enable_nmi_window(struct kvm_vcpu *vcpu) 2477static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -2456,6 +2550,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2456 case SVM_EXITINTINFO_TYPE_EXEPT: 2550 case SVM_EXITINTINFO_TYPE_EXEPT:
2457 /* In case of a software exception do not reinject an exception 2551 /* In case of a software exception do not reinject an exception
2458 vector, but re-execute the instruction instead */ 2552 vector, but re-execute the instruction instead */
2553 if (is_nested(svm))
2554 break;
2459 if (kvm_exception_is_soft(vector)) 2555 if (kvm_exception_is_soft(vector))
2460 break; 2556 break;
2461 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 2557 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
@@ -2498,9 +2594,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2498 fs_selector = kvm_read_fs(); 2594 fs_selector = kvm_read_fs();
2499 gs_selector = kvm_read_gs(); 2595 gs_selector = kvm_read_gs();
2500 ldt_selector = kvm_read_ldt(); 2596 ldt_selector = kvm_read_ldt();
2501 svm->host_cr2 = kvm_read_cr2(); 2597 svm->vmcb->save.cr2 = vcpu->arch.cr2;
2502 if (!is_nested(svm))
2503 svm->vmcb->save.cr2 = vcpu->arch.cr2;
2504 /* required for live migration with NPT */ 2598 /* required for live migration with NPT */
2505 if (npt_enabled) 2599 if (npt_enabled)
2506 svm->vmcb->save.cr3 = vcpu->arch.cr3; 2600 svm->vmcb->save.cr3 = vcpu->arch.cr3;
@@ -2585,8 +2679,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2585 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 2679 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
2586 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 2680 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
2587 2681
2588 kvm_write_cr2(svm->host_cr2);
2589
2590 kvm_load_fs(fs_selector); 2682 kvm_load_fs(fs_selector);
2591 kvm_load_gs(gs_selector); 2683 kvm_load_gs(gs_selector);
2592 kvm_load_ldt(ldt_selector); 2684 kvm_load_ldt(ldt_selector);
@@ -2602,7 +2694,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2602 2694
2603 svm->next_rip = 0; 2695 svm->next_rip = 0;
2604 2696
2605 svm_complete_interrupts(svm); 2697 if (npt_enabled) {
2698 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
2699 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
2700 }
2606} 2701}
2607 2702
2608#undef R 2703#undef R
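With NPT enabled the guest can rewrite its PDPTEs without a VM exit, so the run path above now drops the cached VCPU_EXREG_PDPTR state after every guest entry. Isolated as a sketch (the helper name is hypothetical; the two bit operations are exactly the ones added above):

static void sketch_invalidate_pdptr_cache(struct kvm_vcpu *vcpu)
{
	/* The in-memory copy may be stale; clearing "avail" forces the
	 * next PDPTR access to refill it through kvm_x86_ops->cache_reg(),
	 * and clearing "dirty" prevents writing stale values back. */
	vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
	vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
}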
@@ -2673,6 +2768,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2673 return 0; 2768 return 0;
2674} 2769}
2675 2770
2771static const struct trace_print_flags svm_exit_reasons_str[] = {
2772 { SVM_EXIT_READ_CR0, "read_cr0" },
2773 { SVM_EXIT_READ_CR3, "read_cr3" },
2774 { SVM_EXIT_READ_CR4, "read_cr4" },
2775 { SVM_EXIT_READ_CR8, "read_cr8" },
2776 { SVM_EXIT_WRITE_CR0, "write_cr0" },
2777 { SVM_EXIT_WRITE_CR3, "write_cr3" },
2778 { SVM_EXIT_WRITE_CR4, "write_cr4" },
2779 { SVM_EXIT_WRITE_CR8, "write_cr8" },
2780 { SVM_EXIT_READ_DR0, "read_dr0" },
2781 { SVM_EXIT_READ_DR1, "read_dr1" },
2782 { SVM_EXIT_READ_DR2, "read_dr2" },
2783 { SVM_EXIT_READ_DR3, "read_dr3" },
2784 { SVM_EXIT_WRITE_DR0, "write_dr0" },
2785 { SVM_EXIT_WRITE_DR1, "write_dr1" },
2786 { SVM_EXIT_WRITE_DR2, "write_dr2" },
2787 { SVM_EXIT_WRITE_DR3, "write_dr3" },
2788 { SVM_EXIT_WRITE_DR5, "write_dr5" },
2789 { SVM_EXIT_WRITE_DR7, "write_dr7" },
2790 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
2791 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
2792 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
2793 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
2794 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
2795 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
2796 { SVM_EXIT_INTR, "interrupt" },
2797 { SVM_EXIT_NMI, "nmi" },
2798 { SVM_EXIT_SMI, "smi" },
2799 { SVM_EXIT_INIT, "init" },
2800 { SVM_EXIT_VINTR, "vintr" },
2801 { SVM_EXIT_CPUID, "cpuid" },
2802 { SVM_EXIT_INVD, "invd" },
2803 { SVM_EXIT_HLT, "hlt" },
2804 { SVM_EXIT_INVLPG, "invlpg" },
2805 { SVM_EXIT_INVLPGA, "invlpga" },
2806 { SVM_EXIT_IOIO, "io" },
2807 { SVM_EXIT_MSR, "msr" },
2808 { SVM_EXIT_TASK_SWITCH, "task_switch" },
2809 { SVM_EXIT_SHUTDOWN, "shutdown" },
2810 { SVM_EXIT_VMRUN, "vmrun" },
2811 { SVM_EXIT_VMMCALL, "hypercall" },
2812 { SVM_EXIT_VMLOAD, "vmload" },
2813 { SVM_EXIT_VMSAVE, "vmsave" },
2814 { SVM_EXIT_STGI, "stgi" },
2815 { SVM_EXIT_CLGI, "clgi" },
2816 { SVM_EXIT_SKINIT, "skinit" },
2817 { SVM_EXIT_WBINVD, "wbinvd" },
2818 { SVM_EXIT_MONITOR, "monitor" },
2819 { SVM_EXIT_MWAIT, "mwait" },
2820 { SVM_EXIT_NPF, "npf" },
2821 { -1, NULL }
2822};
2823
2824static bool svm_gb_page_enable(void)
2825{
2826 return true;
2827}
2828
2676static struct kvm_x86_ops svm_x86_ops = { 2829static struct kvm_x86_ops svm_x86_ops = {
2677 .cpu_has_kvm_support = has_svm, 2830 .cpu_has_kvm_support = has_svm,
2678 .disabled_by_bios = is_disabled, 2831 .disabled_by_bios = is_disabled,
@@ -2710,6 +2863,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2710 .set_gdt = svm_set_gdt, 2863 .set_gdt = svm_set_gdt,
2711 .get_dr = svm_get_dr, 2864 .get_dr = svm_get_dr,
2712 .set_dr = svm_set_dr, 2865 .set_dr = svm_set_dr,
2866 .cache_reg = svm_cache_reg,
2713 .get_rflags = svm_get_rflags, 2867 .get_rflags = svm_get_rflags,
2714 .set_rflags = svm_set_rflags, 2868 .set_rflags = svm_set_rflags,
2715 2869
@@ -2733,6 +2887,9 @@ static struct kvm_x86_ops svm_x86_ops = {
2733 .set_tss_addr = svm_set_tss_addr, 2887 .set_tss_addr = svm_set_tss_addr,
2734 .get_tdp_level = get_npt_level, 2888 .get_tdp_level = get_npt_level,
2735 .get_mt_mask = svm_get_mt_mask, 2889 .get_mt_mask = svm_get_mt_mask,
2890
2891 .exit_reasons_str = svm_exit_reasons_str,
2892 .gb_page_enable = svm_gb_page_enable,
2736}; 2893};
2737 2894
2738static int __init svm_init(void) 2895static int __init svm_init(void)
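svm_exit_reasons_str is not consumed by svm.c itself; it is exported through kvm_x86_ops so the kvm_exit tracepoint (see trace.h below) can translate the numeric exit code via ftrace_print_symbols_seq(). Written out as a plain lookup, the table is used roughly like this (the function is hypothetical):

static const char *sketch_svm_exit_name(u32 exit_code)
{
	const struct trace_print_flags *p;

	for (p = svm_exit_reasons_str; p->name; p++)
		if (p->mask == exit_code)
			return p->name;
	return "unknown";	/* reached the { -1, NULL } sentinel */
}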
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 86dbac072d0c..eea40439066c 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
9 int restart_timer = 0; 9 int restart_timer = 0;
10 wait_queue_head_t *q = &vcpu->wq; 10 wait_queue_head_t *q = &vcpu->wq;
11 11
12 /* FIXME: this code should not know anything about vcpus */ 12 /*
13 if (!atomic_inc_and_test(&ktimer->pending)) 13 * There is a race window between reading and incrementing, but we do
14 * not care about potentially losing timer events in the !reinject
15 * case anyway.
16 */
17 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
18 atomic_inc(&ktimer->pending);
19 /* FIXME: this code should not know anything about vcpus */
14 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 20 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
15 21 }
16 if (!ktimer->reinject)
17 atomic_set(&ktimer->pending, 1);
18 22
19 if (waitqueue_active(q)) 23 if (waitqueue_active(q))
20 wake_up_interruptible(q); 24 wake_up_interruptible(q);
@@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
33 struct kvm_vcpu *vcpu; 37 struct kvm_vcpu *vcpu;
34 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); 38 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
35 39
36 vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; 40 vcpu = ktimer->vcpu;
37 if (!vcpu) 41 if (!vcpu)
38 return HRTIMER_NORESTART; 42 return HRTIMER_NORESTART;
39 43
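The rewritten __kvm_timer_fn() boils down to one rule: with reinject enabled every tick is accumulated, otherwise a tick is recorded only when none is already pending and further ticks are coalesced. A standalone kernel-style sketch of that rule, with a deliberately simplified timer struct (both names are illustrative):

struct sketch_timer {
	atomic_t pending;	/* ticks not yet delivered to the guest */
	bool reinject;
};

static bool sketch_timer_tick(struct sketch_timer *t)
{
	/* The read-then-increment race is tolerated: in !reinject mode a
	 * coalesced tick may simply be dropped. */
	if (t->reinject || !atomic_read(&t->pending)) {
		atomic_inc(&t->pending);
		return true;	/* caller should wake the waiting vcpu */
	}
	return false;
}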
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 000000000000..0d480e77eacf
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,355 @@
1#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVM_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH arch/x86/kvm
9#define TRACE_INCLUDE_FILE trace
10
11/*
12 * Tracepoint for guest mode entry.
13 */
14TRACE_EVENT(kvm_entry,
15 TP_PROTO(unsigned int vcpu_id),
16 TP_ARGS(vcpu_id),
17
18 TP_STRUCT__entry(
19 __field( unsigned int, vcpu_id )
20 ),
21
22 TP_fast_assign(
23 __entry->vcpu_id = vcpu_id;
24 ),
25
26 TP_printk("vcpu %u", __entry->vcpu_id)
27);
28
29/*
30 * Tracepoint for hypercall.
31 */
32TRACE_EVENT(kvm_hypercall,
33 TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
34 unsigned long a2, unsigned long a3),
35 TP_ARGS(nr, a0, a1, a2, a3),
36
37 TP_STRUCT__entry(
38 __field( unsigned long, nr )
39 __field( unsigned long, a0 )
40 __field( unsigned long, a1 )
41 __field( unsigned long, a2 )
42 __field( unsigned long, a3 )
43 ),
44
45 TP_fast_assign(
46 __entry->nr = nr;
47 __entry->a0 = a0;
48 __entry->a1 = a1;
49 __entry->a2 = a2;
50 __entry->a3 = a3;
51 ),
52
53 TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
54 __entry->nr, __entry->a0, __entry->a1, __entry->a2,
55 __entry->a3)
56);
57
58/*
59 * Tracepoint for PIO.
60 */
61TRACE_EVENT(kvm_pio,
62 TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
63 unsigned int count),
64 TP_ARGS(rw, port, size, count),
65
66 TP_STRUCT__entry(
67 __field( unsigned int, rw )
68 __field( unsigned int, port )
69 __field( unsigned int, size )
70 __field( unsigned int, count )
71 ),
72
73 TP_fast_assign(
74 __entry->rw = rw;
75 __entry->port = port;
76 __entry->size = size;
77 __entry->count = count;
78 ),
79
80 TP_printk("pio_%s at 0x%x size %d count %d",
81 __entry->rw ? "write" : "read",
82 __entry->port, __entry->size, __entry->count)
83);
84
85/*
86 * Tracepoint for cpuid.
87 */
88TRACE_EVENT(kvm_cpuid,
89 TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
90 unsigned long rcx, unsigned long rdx),
91 TP_ARGS(function, rax, rbx, rcx, rdx),
92
93 TP_STRUCT__entry(
94 __field( unsigned int, function )
95 __field( unsigned long, rax )
96 __field( unsigned long, rbx )
97 __field( unsigned long, rcx )
98 __field( unsigned long, rdx )
99 ),
100
101 TP_fast_assign(
102 __entry->function = function;
103 __entry->rax = rax;
104 __entry->rbx = rbx;
105 __entry->rcx = rcx;
106 __entry->rdx = rdx;
107 ),
108
109 TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
110 __entry->function, __entry->rax,
111 __entry->rbx, __entry->rcx, __entry->rdx)
112);
113
114#define AREG(x) { APIC_##x, "APIC_" #x }
115
116#define kvm_trace_symbol_apic \
117 AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
118 AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
119 AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
120 AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
121 AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
122 AREG(ECTRL)
123/*
124 * Tracepoint for apic access.
125 */
126TRACE_EVENT(kvm_apic,
127 TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
128 TP_ARGS(rw, reg, val),
129
130 TP_STRUCT__entry(
131 __field( unsigned int, rw )
132 __field( unsigned int, reg )
133 __field( unsigned int, val )
134 ),
135
136 TP_fast_assign(
137 __entry->rw = rw;
138 __entry->reg = reg;
139 __entry->val = val;
140 ),
141
142 TP_printk("apic_%s %s = 0x%x",
143 __entry->rw ? "write" : "read",
144 __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
145 __entry->val)
146);
147
148#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
149#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
150
151/*
152 * Tracepoint for kvm guest exit:
153 */
154TRACE_EVENT(kvm_exit,
155 TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
156 TP_ARGS(exit_reason, guest_rip),
157
158 TP_STRUCT__entry(
159 __field( unsigned int, exit_reason )
160 __field( unsigned long, guest_rip )
161 ),
162
163 TP_fast_assign(
164 __entry->exit_reason = exit_reason;
165 __entry->guest_rip = guest_rip;
166 ),
167
168 TP_printk("reason %s rip 0x%lx",
169 ftrace_print_symbols_seq(p, __entry->exit_reason,
170 kvm_x86_ops->exit_reasons_str),
171 __entry->guest_rip)
172);
173
174/*
175 * Tracepoint for kvm interrupt injection:
176 */
177TRACE_EVENT(kvm_inj_virq,
178 TP_PROTO(unsigned int irq),
179 TP_ARGS(irq),
180
181 TP_STRUCT__entry(
182 __field( unsigned int, irq )
183 ),
184
185 TP_fast_assign(
186 __entry->irq = irq;
187 ),
188
189 TP_printk("irq %u", __entry->irq)
190);
191
192/*
193 * Tracepoint for page fault.
194 */
195TRACE_EVENT(kvm_page_fault,
196 TP_PROTO(unsigned long fault_address, unsigned int error_code),
197 TP_ARGS(fault_address, error_code),
198
199 TP_STRUCT__entry(
200 __field( unsigned long, fault_address )
201 __field( unsigned int, error_code )
202 ),
203
204 TP_fast_assign(
205 __entry->fault_address = fault_address;
206 __entry->error_code = error_code;
207 ),
208
209 TP_printk("address %lx error_code %x",
210 __entry->fault_address, __entry->error_code)
211);
212
213/*
214 * Tracepoint for guest MSR access.
215 */
216TRACE_EVENT(kvm_msr,
217 TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
218 TP_ARGS(rw, ecx, data),
219
220 TP_STRUCT__entry(
221 __field( unsigned int, rw )
222 __field( unsigned int, ecx )
223 __field( unsigned long, data )
224 ),
225
226 TP_fast_assign(
227 __entry->rw = rw;
228 __entry->ecx = ecx;
229 __entry->data = data;
230 ),
231
232 TP_printk("msr_%s %x = 0x%lx",
233 __entry->rw ? "write" : "read",
234 __entry->ecx, __entry->data)
235);
236
237#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
238#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
239
240/*
241 * Tracepoint for guest CR access.
242 */
243TRACE_EVENT(kvm_cr,
244 TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
245 TP_ARGS(rw, cr, val),
246
247 TP_STRUCT__entry(
248 __field( unsigned int, rw )
249 __field( unsigned int, cr )
250 __field( unsigned long, val )
251 ),
252
253 TP_fast_assign(
254 __entry->rw = rw;
255 __entry->cr = cr;
256 __entry->val = val;
257 ),
258
259 TP_printk("cr_%s %x = 0x%lx",
260 __entry->rw ? "write" : "read",
261 __entry->cr, __entry->val)
262);
263
264#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
265#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
266
267TRACE_EVENT(kvm_pic_set_irq,
268 TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
269 TP_ARGS(chip, pin, elcr, imr, coalesced),
270
271 TP_STRUCT__entry(
272 __field( __u8, chip )
273 __field( __u8, pin )
274 __field( __u8, elcr )
275 __field( __u8, imr )
276 __field( bool, coalesced )
277 ),
278
279 TP_fast_assign(
280 __entry->chip = chip;
281 __entry->pin = pin;
282 __entry->elcr = elcr;
283 __entry->imr = imr;
284 __entry->coalesced = coalesced;
285 ),
286
287 TP_printk("chip %u pin %u (%s%s)%s",
288 __entry->chip, __entry->pin,
289 (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
290 (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
291 __entry->coalesced ? " (coalesced)" : "")
292);
293
294#define kvm_apic_dst_shorthand \
295 {0x0, "dst"}, \
296 {0x1, "self"}, \
297 {0x2, "all"}, \
298 {0x3, "all-but-self"}
299
300TRACE_EVENT(kvm_apic_ipi,
301 TP_PROTO(__u32 icr_low, __u32 dest_id),
302 TP_ARGS(icr_low, dest_id),
303
304 TP_STRUCT__entry(
305 __field( __u32, icr_low )
306 __field( __u32, dest_id )
307 ),
308
309 TP_fast_assign(
310 __entry->icr_low = icr_low;
311 __entry->dest_id = dest_id;
312 ),
313
314 TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
315 __entry->dest_id, (u8)__entry->icr_low,
316 __print_symbolic((__entry->icr_low >> 8 & 0x7),
317 kvm_deliver_mode),
318 (__entry->icr_low & (1<<11)) ? "logical" : "physical",
319 (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
320 (__entry->icr_low & (1<<15)) ? "level" : "edge",
321 __print_symbolic((__entry->icr_low >> 18 & 0x3),
322 kvm_apic_dst_shorthand))
323);
324
325TRACE_EVENT(kvm_apic_accept_irq,
326 TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
327 TP_ARGS(apicid, dm, tm, vec, coalesced),
328
329 TP_STRUCT__entry(
330 __field( __u32, apicid )
331 __field( __u16, dm )
332 __field( __u8, tm )
333 __field( __u8, vec )
334 __field( bool, coalesced )
335 ),
336
337 TP_fast_assign(
338 __entry->apicid = apicid;
339 __entry->dm = dm;
340 __entry->tm = tm;
341 __entry->vec = vec;
342 __entry->coalesced = coalesced;
343 ),
344
345 TP_printk("apicid %x vec %u (%s|%s)%s",
346 __entry->apicid, __entry->vec,
347 __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
348 __entry->tm ? "level" : "edge",
349 __entry->coalesced ? " (coalesced)" : "")
350);
351
352#endif /* _TRACE_KVM_H */
353
354/* This part must be outside protection */
355#include <trace/define_trace.h>
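trace.h only declares the events; exactly one .c file per trace header has to define CREATE_TRACE_POINTS before including it so the event bodies get emitted, after which any file that includes the header may call the trace_* stubs. A minimal, hypothetical consumer following that pattern (in the real tree the instantiation lives elsewhere in arch/x86/kvm, and vmx.c below just includes the header):

#include <linux/ftrace_event.h>

#define CREATE_TRACE_POINTS	/* emit the event bodies exactly once */
#include "trace.h"

static void sketch_report_exit(unsigned int exit_reason, unsigned long rip)
{
	/* Compiles down to a cheap no-op unless the kvm:kvm_exit event is
	 * enabled through the tracing debugfs interface. */
	trace_kvm_exit(exit_reason, rip);
}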
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29f912927a58..f3812014bd0b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -25,6 +25,7 @@
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h>
28#include "kvm_cache_regs.h" 29#include "kvm_cache_regs.h"
29#include "x86.h" 30#include "x86.h"
30 31
@@ -34,6 +35,8 @@
34#include <asm/virtext.h> 35#include <asm/virtext.h>
35#include <asm/mce.h> 36#include <asm/mce.h>
36 37
38#include "trace.h"
39
37#define __ex(x) __kvm_handle_fault_on_reboot(x) 40#define __ex(x) __kvm_handle_fault_on_reboot(x)
38 41
39MODULE_AUTHOR("Qumranet"); 42MODULE_AUTHOR("Qumranet");
@@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
51static int __read_mostly enable_ept = 1; 54static int __read_mostly enable_ept = 1;
52module_param_named(ept, enable_ept, bool, S_IRUGO); 55module_param_named(ept, enable_ept, bool, S_IRUGO);
53 56
57static int __read_mostly enable_unrestricted_guest = 1;
58module_param_named(unrestricted_guest,
59 enable_unrestricted_guest, bool, S_IRUGO);
60
54static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
55module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
56 63
@@ -84,6 +91,14 @@ struct vcpu_vmx {
84 int guest_efer_loaded; 91 int guest_efer_loaded;
85 } host_state; 92 } host_state;
86 struct { 93 struct {
94 int vm86_active;
95 u8 save_iopl;
96 struct kvm_save_segment {
97 u16 selector;
98 unsigned long base;
99 u32 limit;
100 u32 ar;
101 } tr, es, ds, fs, gs;
87 struct { 102 struct {
88 bool pending; 103 bool pending;
89 u8 vector; 104 u8 vector;
@@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field {
161 VMX_SEGMENT_FIELD(LDTR), 176 VMX_SEGMENT_FIELD(LDTR),
162}; 177};
163 178
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180
164/* 181/*
165 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it 182 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
166 * away by decrementing the array size. 183 * away by decrementing the array size.
@@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void)
256 cpu_has_vmx_virtualize_apic_accesses(); 273 cpu_has_vmx_virtualize_apic_accesses();
257} 274}
258 275
276static inline bool cpu_has_vmx_ept_execute_only(void)
277{
278 return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
279}
280
281static inline bool cpu_has_vmx_eptp_uncacheable(void)
282{
283 return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
284}
285
286static inline bool cpu_has_vmx_eptp_writeback(void)
287{
288 return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
289}
290
291static inline bool cpu_has_vmx_ept_2m_page(void)
292{
293 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
294}
295
259static inline int cpu_has_vmx_invept_individual_addr(void) 296static inline int cpu_has_vmx_invept_individual_addr(void)
260{ 297{
261 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 298 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void)
277 SECONDARY_EXEC_ENABLE_EPT; 314 SECONDARY_EXEC_ENABLE_EPT;
278} 315}
279 316
317static inline int cpu_has_vmx_unrestricted_guest(void)
318{
319 return vmcs_config.cpu_based_2nd_exec_ctrl &
320 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321}
322
280static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
281{ 324{
282 return flexpriority_enabled && 325 return flexpriority_enabled &&
@@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
497 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); 540 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
498 if (!vcpu->fpu_active) 541 if (!vcpu->fpu_active)
499 eb |= 1u << NM_VECTOR; 542 eb |= 1u << NM_VECTOR;
543 /*
544 * Unconditionally intercept #DB so we can maintain dr6 without
545 * reading it every exit.
546 */
547 eb |= 1u << DB_VECTOR;
500 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 548 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
501 if (vcpu->guest_debug &
502 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
503 eb |= 1u << DB_VECTOR;
504 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 549 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
505 eb |= 1u << BP_VECTOR; 550 eb |= 1u << BP_VECTOR;
506 } 551 }
507 if (vcpu->arch.rmode.vm86_active) 552 if (to_vmx(vcpu)->rmode.vm86_active)
508 eb = ~0; 553 eb = ~0;
509 if (enable_ept) 554 if (enable_ept)
510 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 555 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
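The exception bitmap is simply one intercept bit per vector; the policy above, recomputed as a pure function (the vector macros are the real ones, the function itself is illustrative):

static u32 sketch_exception_bitmap(bool fpu_active, bool use_sw_bp,
				   bool vm86_active, bool ept)
{
	u32 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);

	if (!fpu_active)
		eb |= 1u << NM_VECTOR;		/* lazy FPU switching */
	eb |= 1u << DB_VECTOR;			/* always trap #DB to track dr6 */
	if (use_sw_bp)
		eb |= 1u << BP_VECTOR;
	if (vm86_active)
		eb = ~0;			/* real-mode emulation traps everything */
	if (ept)
		eb &= ~(1u << PF_VECTOR);	/* #PF handled as EPT violations */
	return eb;
}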
@@ -528,12 +573,15 @@ static void reload_tss(void)
528static void load_transition_efer(struct vcpu_vmx *vmx) 573static void load_transition_efer(struct vcpu_vmx *vmx)
529{ 574{
530 int efer_offset = vmx->msr_offset_efer; 575 int efer_offset = vmx->msr_offset_efer;
531 u64 host_efer = vmx->host_msrs[efer_offset].data; 576 u64 host_efer;
532 u64 guest_efer = vmx->guest_msrs[efer_offset].data; 577 u64 guest_efer;
533 u64 ignore_bits; 578 u64 ignore_bits;
534 579
535 if (efer_offset < 0) 580 if (efer_offset < 0)
536 return; 581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584
537 /* 585 /*
538 * NX is emulated; LMA and LME handled by hardware; SCE meaningless 586 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
539 * outside long mode 587 * outside long mode
@@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
735 783
736static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 784static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
737{ 785{
738 return vmcs_readl(GUEST_RFLAGS); 786 unsigned long rflags;
787
788 rflags = vmcs_readl(GUEST_RFLAGS);
789 if (to_vmx(vcpu)->rmode.vm86_active)
790 rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
791 return rflags;
739} 792}
740 793
741static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 794static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
742{ 795{
743 if (vcpu->arch.rmode.vm86_active) 796 if (to_vmx(vcpu)->rmode.vm86_active)
744 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 797 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
745 vmcs_writel(GUEST_RFLAGS, rflags); 798 vmcs_writel(GUEST_RFLAGS, rflags);
746} 799}
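Because vm86 emulation forces IOPL=3 and VM=1 into the hardware RFLAGS, the accessors above strip those bits on read and re-add them on write, so the guest never sees the emulation artifacts. A small property-style sketch of that contract while rmode.vm86_active is set (the checker is hypothetical):

static bool sketch_rflags_roundtrip(struct kvm_vcpu *vcpu,
				    unsigned long guest_rflags)
{
	vmx_set_rflags(vcpu, guest_rflags);
	/* What the guest reads back is its own value minus the bits the
	 * vm86 container owns. */
	return vmx_get_rflags(vcpu) ==
	       (guest_rflags & ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM));
}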
@@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
797 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 850 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
798 } 851 }
799 852
800 if (vcpu->arch.rmode.vm86_active) { 853 if (vmx->rmode.vm86_active) {
801 vmx->rmode.irq.pending = true; 854 vmx->rmode.irq.pending = true;
802 vmx->rmode.irq.vector = nr; 855 vmx->rmode.irq.vector = nr;
803 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 856 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
804 if (nr == BP_VECTOR || nr == OF_VECTOR) 857 if (kvm_exception_is_soft(nr))
805 vmx->rmode.irq.rip++; 858 vmx->rmode.irq.rip +=
859 vmx->vcpu.arch.event_exit_inst_len;
806 intr_info |= INTR_TYPE_SOFT_INTR; 860 intr_info |= INTR_TYPE_SOFT_INTR;
807 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 861 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
808 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 862 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
940 case MSR_EFER: 994 case MSR_EFER:
941 return kvm_get_msr_common(vcpu, msr_index, pdata); 995 return kvm_get_msr_common(vcpu, msr_index, pdata);
942#endif 996#endif
943 case MSR_IA32_TIME_STAMP_COUNTER: 997 case MSR_IA32_TSC:
944 data = guest_read_tsc(); 998 data = guest_read_tsc();
945 break; 999 break;
946 case MSR_IA32_SYSENTER_CS: 1000 case MSR_IA32_SYSENTER_CS:
@@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
953 data = vmcs_readl(GUEST_SYSENTER_ESP); 1007 data = vmcs_readl(GUEST_SYSENTER_ESP);
954 break; 1008 break;
955 default: 1009 default:
956 vmx_load_host_state(to_vmx(vcpu));
957 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1010 msr = find_msr_entry(to_vmx(vcpu), msr_index);
958 if (msr) { 1011 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu));
959 data = msr->data; 1013 data = msr->data;
960 break; 1014 break;
961 } 1015 }
@@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1000 case MSR_IA32_SYSENTER_ESP: 1054 case MSR_IA32_SYSENTER_ESP:
1001 vmcs_writel(GUEST_SYSENTER_ESP, data); 1055 vmcs_writel(GUEST_SYSENTER_ESP, data);
1002 break; 1056 break;
1003 case MSR_IA32_TIME_STAMP_COUNTER: 1057 case MSR_IA32_TSC:
1004 rdtscll(host_tsc); 1058 rdtscll(host_tsc);
1005 guest_write_tsc(data, host_tsc); 1059 guest_write_tsc(data, host_tsc);
1006 break; 1060 break;
1007 case MSR_P6_PERFCTR0:
1008 case MSR_P6_PERFCTR1:
1009 case MSR_P6_EVNTSEL0:
1010 case MSR_P6_EVNTSEL1:
1011 /*
1012 * Just discard all writes to the performance counters; this
1013 * should keep both older linux and windows 64-bit guests
1014 * happy
1015 */
1016 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
1017
1018 break;
1019 case MSR_IA32_CR_PAT: 1061 case MSR_IA32_CR_PAT:
1020 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1062 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1021 vmcs_write64(GUEST_IA32_PAT, data); 1063 vmcs_write64(GUEST_IA32_PAT, data);
@@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1024 } 1066 }
1025 /* Otherwise falls through to kvm_set_msr_common */ 1067 /* Otherwise falls through to kvm_set_msr_common */
1026 default: 1068 default:
1027 vmx_load_host_state(vmx);
1028 msr = find_msr_entry(vmx, msr_index); 1069 msr = find_msr_entry(vmx, msr_index);
1029 if (msr) { 1070 if (msr) {
1071 vmx_load_host_state(vmx);
1030 msr->data = data; 1072 msr->data = data;
1031 break; 1073 break;
1032 } 1074 }
@@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1046 case VCPU_REGS_RIP: 1088 case VCPU_REGS_RIP:
1047 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 1089 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
1048 break; 1090 break;
1091 case VCPU_EXREG_PDPTR:
1092 if (enable_ept)
1093 ept_save_pdptrs(vcpu);
1094 break;
1049 default: 1095 default:
1050 break; 1096 break;
1051 } 1097 }
@@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1203 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 1249 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
1204 SECONDARY_EXEC_WBINVD_EXITING | 1250 SECONDARY_EXEC_WBINVD_EXITING |
1205 SECONDARY_EXEC_ENABLE_VPID | 1251 SECONDARY_EXEC_ENABLE_VPID |
1206 SECONDARY_EXEC_ENABLE_EPT; 1252 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1207 if (adjust_vmx_controls(min2, opt2, 1254 if (adjust_vmx_controls(min2, opt2,
1208 MSR_IA32_VMX_PROCBASED_CTLS2, 1255 MSR_IA32_VMX_PROCBASED_CTLS2,
1209 &_cpu_based_2nd_exec_control) < 0) 1256 &_cpu_based_2nd_exec_control) < 0)
@@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1217 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 1264 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1218 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 1265 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1219 enabled */ 1266 enabled */
1220 min &= ~(CPU_BASED_CR3_LOAD_EXITING | 1267 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
1221 CPU_BASED_CR3_STORE_EXITING | 1268 CPU_BASED_CR3_STORE_EXITING |
1222 CPU_BASED_INVLPG_EXITING); 1269 CPU_BASED_INVLPG_EXITING);
1223 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1224 &_cpu_based_exec_control) < 0)
1225 return -EIO;
1226 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 1270 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
1227 vmx_capability.ept, vmx_capability.vpid); 1271 vmx_capability.ept, vmx_capability.vpid);
1228 } 1272 }
@@ -1333,8 +1377,13 @@ static __init int hardware_setup(void)
1333 if (!cpu_has_vmx_vpid()) 1377 if (!cpu_has_vmx_vpid())
1334 enable_vpid = 0; 1378 enable_vpid = 0;
1335 1379
1336 if (!cpu_has_vmx_ept()) 1380 if (!cpu_has_vmx_ept()) {
1337 enable_ept = 0; 1381 enable_ept = 0;
1382 enable_unrestricted_guest = 0;
1383 }
1384
1385 if (!cpu_has_vmx_unrestricted_guest())
1386 enable_unrestricted_guest = 0;
1338 1387
1339 if (!cpu_has_vmx_flexpriority()) 1388 if (!cpu_has_vmx_flexpriority())
1340 flexpriority_enabled = 0; 1389 flexpriority_enabled = 0;
@@ -1342,6 +1391,9 @@ static __init int hardware_setup(void)
1342 if (!cpu_has_vmx_tpr_shadow()) 1391 if (!cpu_has_vmx_tpr_shadow())
1343 kvm_x86_ops->update_cr8_intercept = NULL; 1392 kvm_x86_ops->update_cr8_intercept = NULL;
1344 1393
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages();
1396
1345 return alloc_kvm_area(); 1397 return alloc_kvm_area();
1346} 1398}
1347 1399
@@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1372 struct vcpu_vmx *vmx = to_vmx(vcpu); 1424 struct vcpu_vmx *vmx = to_vmx(vcpu);
1373 1425
1374 vmx->emulation_required = 1; 1426 vmx->emulation_required = 1;
1375 vcpu->arch.rmode.vm86_active = 0; 1427 vmx->rmode.vm86_active = 0;
1376 1428
1377 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1429 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1378 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); 1430 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1379 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); 1431 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1380 1432
1381 flags = vmcs_readl(GUEST_RFLAGS); 1433 flags = vmcs_readl(GUEST_RFLAGS);
1382 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1434 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1383 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); 1435 flags |= (vmx->rmode.save_iopl << IOPL_SHIFT);
1384 vmcs_writel(GUEST_RFLAGS, flags); 1436 vmcs_writel(GUEST_RFLAGS, flags);
1385 1437
1386 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1438 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1391 if (emulate_invalid_guest_state) 1443 if (emulate_invalid_guest_state)
1392 return; 1444 return;
1393 1445
1394 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1446 fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
1395 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1447 fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
1396 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1448 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
1397 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1449 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
1398 1450
1399 vmcs_write16(GUEST_SS_SELECTOR, 0); 1451 vmcs_write16(GUEST_SS_SELECTOR, 0);
1400 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1452 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1433 unsigned long flags; 1485 unsigned long flags;
1434 struct vcpu_vmx *vmx = to_vmx(vcpu); 1486 struct vcpu_vmx *vmx = to_vmx(vcpu);
1435 1487
1488 if (enable_unrestricted_guest)
1489 return;
1490
1436 vmx->emulation_required = 1; 1491 vmx->emulation_required = 1;
1437 vcpu->arch.rmode.vm86_active = 1; 1492 vmx->rmode.vm86_active = 1;
1438 1493
1439 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1494 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1440 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1495 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1441 1496
1442 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); 1497 vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1443 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 1498 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1444 1499
1445 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); 1500 vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1446 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1501 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1447 1502
1448 flags = vmcs_readl(GUEST_RFLAGS); 1503 flags = vmcs_readl(GUEST_RFLAGS);
1449 vcpu->arch.rmode.save_iopl 1504 vmx->rmode.save_iopl
1450 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1505 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1451 1506
1452 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1507 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1468 vmcs_writel(GUEST_CS_BASE, 0xf0000); 1523 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1469 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 1524 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1470 1525
1471 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1526 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
1472 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1527 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
1473 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1528 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
1474 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1529 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
1475 1530
1476continue_rmode: 1531continue_rmode:
1477 kvm_mmu_reset_context(vcpu); 1532 kvm_mmu_reset_context(vcpu);
@@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1545 1600
1546static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 1601static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1547{ 1602{
1603 if (!test_bit(VCPU_EXREG_PDPTR,
1604 (unsigned long *)&vcpu->arch.regs_dirty))
1605 return;
1606
1548 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1607 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1549 if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
1550 printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
1551 return;
1552 }
1553 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); 1608 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
1554 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); 1609 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
1555 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); 1610 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
@@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1557 } 1612 }
1558} 1613}
1559 1614
1615static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1616{
1617 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1618 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1619 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1620 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1621 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1622 }
1623
1624 __set_bit(VCPU_EXREG_PDPTR,
1625 (unsigned long *)&vcpu->arch.regs_avail);
1626 __set_bit(VCPU_EXREG_PDPTR,
1627 (unsigned long *)&vcpu->arch.regs_dirty);
1628}
1629
1560static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 1630static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1561 1631
1562static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 1632static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
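ept_save_pdptrs() is the producer half of a lazy register cache keyed on VCPU_EXREG_PDPTR: regs_avail means the in-memory pdptrs[] copy is current, regs_dirty means ept_load_pdptrs() must write it back to the VMCS before the next entry. The consumer side of that pattern looks roughly like this (helper name hypothetical):

static u64 sketch_read_pdptr(struct kvm_vcpu *vcpu, int index)
{
	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);	/* ept_save_pdptrs() on VMX */

	return vcpu->arch.pdptrs[index];
}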
@@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1571 CPU_BASED_CR3_STORE_EXITING)); 1641 CPU_BASED_CR3_STORE_EXITING));
1572 vcpu->arch.cr0 = cr0; 1642 vcpu->arch.cr0 = cr0;
1573 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1643 vmx_set_cr4(vcpu, vcpu->arch.cr4);
1574 *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
1575 *hw_cr0 &= ~X86_CR0_WP;
1576 } else if (!is_paging(vcpu)) { 1644 } else if (!is_paging(vcpu)) {
1577 /* From nonpaging to paging */ 1645 /* From nonpaging to paging */
1578 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1646 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1581 CPU_BASED_CR3_STORE_EXITING)); 1649 CPU_BASED_CR3_STORE_EXITING));
1582 vcpu->arch.cr0 = cr0; 1650 vcpu->arch.cr0 = cr0;
1583 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1651 vmx_set_cr4(vcpu, vcpu->arch.cr4);
1584 if (!(vcpu->arch.cr0 & X86_CR0_WP))
1585 *hw_cr0 &= ~X86_CR0_WP;
1586 } 1652 }
1653
1654 if (!(cr0 & X86_CR0_WP))
1655 *hw_cr0 &= ~X86_CR0_WP;
1587} 1656}
1588 1657
1589static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, 1658static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
@@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1598 1667
1599static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1668static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1600{ 1669{
1601 unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | 1670 struct vcpu_vmx *vmx = to_vmx(vcpu);
1602 KVM_VM_CR0_ALWAYS_ON; 1671 unsigned long hw_cr0;
1672
1673 if (enable_unrestricted_guest)
1674 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
1675 | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
1676 else
1677 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1603 1678
1604 vmx_fpu_deactivate(vcpu); 1679 vmx_fpu_deactivate(vcpu);
1605 1680
1606 if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) 1681 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1607 enter_pmode(vcpu); 1682 enter_pmode(vcpu);
1608 1683
1609 if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) 1684 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
1610 enter_rmode(vcpu); 1685 enter_rmode(vcpu);
1611 1686
1612#ifdef CONFIG_X86_64 1687#ifdef CONFIG_X86_64
@@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1650 if (enable_ept) { 1725 if (enable_ept) {
1651 eptp = construct_eptp(cr3); 1726 eptp = construct_eptp(cr3);
1652 vmcs_write64(EPT_POINTER, eptp); 1727 vmcs_write64(EPT_POINTER, eptp);
1653 ept_sync_context(eptp);
1654 ept_load_pdptrs(vcpu);
1655 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1656 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 1729 vcpu->kvm->arch.ept_identity_map_addr;
1657 } 1730 }
1658 1731
1659 vmx_flush_tlb(vcpu); 1732 vmx_flush_tlb(vcpu);
@@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1664 1737
1665static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1738static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1666{ 1739{
1667 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? 1740 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
1668 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1741 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1669 1742
1670 vcpu->arch.cr4 = cr4; 1743 vcpu->arch.cr4 = cr4;
@@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1707 1780
1708static int vmx_get_cpl(struct kvm_vcpu *vcpu) 1781static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1709{ 1782{
1710 struct kvm_segment kvm_seg;
1711
1712 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ 1783 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
1713 return 0; 1784 return 0;
1714 1785
1715 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 1786 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
1716 return 3; 1787 return 3;
1717 1788
1718 vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); 1789 return vmcs_read16(GUEST_CS_SELECTOR) & 3;
1719 return kvm_seg.selector & 3;
1720} 1790}
1721 1791
1722static u32 vmx_segment_access_rights(struct kvm_segment *var) 1792static u32 vmx_segment_access_rights(struct kvm_segment *var)
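The simplified vmx_get_cpl() relies on the architectural fact that, once real mode and virtual-8086 mode are excluded, the RPL field of the loaded CS selector equals the current privilege level, so reading GUEST_CS_SELECTOR and masking with 3 replaces the full segment fetch. As a one-liner (illustrative):

static inline int sketch_cpl_from_cs(u16 cs_selector)
{
	return cs_selector & 3;		/* bits 1:0 of CS = RPL = CPL */
}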
@@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
1744static void vmx_set_segment(struct kvm_vcpu *vcpu, 1814static void vmx_set_segment(struct kvm_vcpu *vcpu,
1745 struct kvm_segment *var, int seg) 1815 struct kvm_segment *var, int seg)
1746{ 1816{
1817 struct vcpu_vmx *vmx = to_vmx(vcpu);
1747 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1818 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1748 u32 ar; 1819 u32 ar;
1749 1820
1750 if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { 1821 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
1751 vcpu->arch.rmode.tr.selector = var->selector; 1822 vmx->rmode.tr.selector = var->selector;
1752 vcpu->arch.rmode.tr.base = var->base; 1823 vmx->rmode.tr.base = var->base;
1753 vcpu->arch.rmode.tr.limit = var->limit; 1824 vmx->rmode.tr.limit = var->limit;
1754 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); 1825 vmx->rmode.tr.ar = vmx_segment_access_rights(var);
1755 return; 1826 return;
1756 } 1827 }
1757 vmcs_writel(sf->base, var->base); 1828 vmcs_writel(sf->base, var->base);
1758 vmcs_write32(sf->limit, var->limit); 1829 vmcs_write32(sf->limit, var->limit);
1759 vmcs_write16(sf->selector, var->selector); 1830 vmcs_write16(sf->selector, var->selector);
1760 if (vcpu->arch.rmode.vm86_active && var->s) { 1831 if (vmx->rmode.vm86_active && var->s) {
1761 /* 1832 /*
1762 * Hack real-mode segments into vm86 compatibility. 1833 * Hack real-mode segments into vm86 compatibility.
1763 */ 1834 */
@@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1766 ar = 0xf3; 1837 ar = 0xf3;
1767 } else 1838 } else
1768 ar = vmx_segment_access_rights(var); 1839 ar = vmx_segment_access_rights(var);
1840
1841 /*
1842 * Fix the "Accessed" bit in AR field of segment registers for older
1843 * qemu binaries.
1844 * IA32 arch specifies that at the time of processor reset the
1845 * "Accessed" bit in the AR field of segment registers is 1. And qemu
1846 * is setting it to 0 in the userland code. This causes an invalid guest
1847 * state vmexit when "unrestricted guest" mode is turned on.
1848 * A fix for this setup issue in cpu_reset is being pushed into the qemu
1849 * tree. Newer qemu binaries with that fix will not need this
1850 * kvm hack.
1851 */
1852 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
1853 ar |= 0x1; /* Accessed */
1854
1769 vmcs_write32(sf->ar_bytes, ar); 1855 vmcs_write32(sf->ar_bytes, ar);
1770} 1856}
1771 1857
@@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2040 if (likely(kvm->arch.ept_identity_pagetable_done)) 2126 if (likely(kvm->arch.ept_identity_pagetable_done))
2041 return 1; 2127 return 1;
2042 ret = 0; 2128 ret = 0;
2043 identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; 2129 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2044 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2130 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2045 if (r < 0) 2131 if (r < 0)
2046 goto out; 2132 goto out;
@@ -2062,11 +2148,19 @@ out:
2062static void seg_setup(int seg) 2148static void seg_setup(int seg)
2063{ 2149{
2064 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2150 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2151 unsigned int ar;
2065 2152
2066 vmcs_write16(sf->selector, 0); 2153 vmcs_write16(sf->selector, 0);
2067 vmcs_writel(sf->base, 0); 2154 vmcs_writel(sf->base, 0);
2068 vmcs_write32(sf->limit, 0xffff); 2155 vmcs_write32(sf->limit, 0xffff);
2069 vmcs_write32(sf->ar_bytes, 0xf3); 2156 if (enable_unrestricted_guest) {
2157 ar = 0x93;
2158 if (seg == VCPU_SREG_CS)
2159 ar |= 0x08; /* code segment */
2160 } else
2161 ar = 0xf3;
2162
2163 vmcs_write32(sf->ar_bytes, ar);
2070} 2164}
2071 2165
2072static int alloc_apic_access_page(struct kvm *kvm) 2166static int alloc_apic_access_page(struct kvm *kvm)
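The access-rights values written by seg_setup() encode the low byte of the VMX AR field (P, DPL, S, type): 0xf3 is a present DPL-3 read/write data segment suitable for vm86 emulation, 0x93 is the same at DPL 0, and OR-ing in 0x08 turns it into a code segment for CS. The selection logic, sketched as a pure helper (illustrative):

static unsigned int sketch_seg_setup_ar(bool unrestricted, bool is_cs)
{
	unsigned int ar = unrestricted ? 0x93 : 0xf3;

	if (unrestricted && is_cs)
		ar |= 0x08;	/* data -> code segment type */
	return ar;
}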
@@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2101 goto out; 2195 goto out;
2102 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 2196 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
2103 kvm_userspace_mem.flags = 0; 2197 kvm_userspace_mem.flags = 0;
2104 kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 2198 kvm_userspace_mem.guest_phys_addr =
2199 kvm->arch.ept_identity_map_addr;
2105 kvm_userspace_mem.memory_size = PAGE_SIZE; 2200 kvm_userspace_mem.memory_size = PAGE_SIZE;
2106 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 2201 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2107 if (r) 2202 if (r)
2108 goto out; 2203 goto out;
2109 2204
2110 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2205 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2111 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); 2206 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2112out: 2207out:
2113 up_write(&kvm->slots_lock); 2208 up_write(&kvm->slots_lock);
2114 return r; 2209 return r;
@@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2209 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2210 if (!enable_ept) 2305 if (!enable_ept)
2211 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2307 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2212 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2213 } 2310 }
2214 2311
@@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2326 goto out; 2423 goto out;
2327 } 2424 }
2328 2425
2329 vmx->vcpu.arch.rmode.vm86_active = 0; 2426 vmx->rmode.vm86_active = 0;
2330 2427
2331 vmx->soft_vnmi_blocked = 0; 2428 vmx->soft_vnmi_blocked = 0;
2332 2429
2333 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 2430 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2334 kvm_set_cr8(&vmx->vcpu, 0); 2431 kvm_set_cr8(&vmx->vcpu, 0);
2335 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 2432 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
2336 if (vmx->vcpu.vcpu_id == 0) 2433 if (kvm_vcpu_is_bsp(&vmx->vcpu))
2337 msr |= MSR_IA32_APICBASE_BSP; 2434 msr |= MSR_IA32_APICBASE_BSP;
2338 kvm_set_apic_base(&vmx->vcpu, msr); 2435 kvm_set_apic_base(&vmx->vcpu, msr);
2339 2436
@@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2344 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2441 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2345 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 2442 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
2346 */ 2443 */
2347 if (vmx->vcpu.vcpu_id == 0) { 2444 if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
2348 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 2445 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
2349 vmcs_writel(GUEST_CS_BASE, 0x000f0000); 2446 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
2350 } else { 2447 } else {
@@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2373 vmcs_writel(GUEST_SYSENTER_EIP, 0); 2470 vmcs_writel(GUEST_SYSENTER_EIP, 0);
2374 2471
2375 vmcs_writel(GUEST_RFLAGS, 0x02); 2472 vmcs_writel(GUEST_RFLAGS, 0x02);
2376 if (vmx->vcpu.vcpu_id == 0) 2473 if (kvm_vcpu_is_bsp(&vmx->vcpu))
2377 kvm_rip_write(vcpu, 0xfff0); 2474 kvm_rip_write(vcpu, 0xfff0);
2378 else 2475 else
2379 kvm_rip_write(vcpu, 0); 2476 kvm_rip_write(vcpu, 0);
@@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2461 uint32_t intr; 2558 uint32_t intr;
2462 int irq = vcpu->arch.interrupt.nr; 2559 int irq = vcpu->arch.interrupt.nr;
2463 2560
2464 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2561 trace_kvm_inj_virq(irq);
2465 2562
2466 ++vcpu->stat.irq_injections; 2563 ++vcpu->stat.irq_injections;
2467 if (vcpu->arch.rmode.vm86_active) { 2564 if (vmx->rmode.vm86_active) {
2468 vmx->rmode.irq.pending = true; 2565 vmx->rmode.irq.pending = true;
2469 vmx->rmode.irq.vector = irq; 2566 vmx->rmode.irq.vector = irq;
2470 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2567 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2568 if (vcpu->arch.interrupt.soft)
2569 vmx->rmode.irq.rip +=
2570 vmx->vcpu.arch.event_exit_inst_len;
2471 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2571 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2472 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); 2572 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2473 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 2573 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2502 } 2602 }
2503 2603
2504 ++vcpu->stat.nmi_injections; 2604 ++vcpu->stat.nmi_injections;
2505 if (vcpu->arch.rmode.vm86_active) { 2605 if (vmx->rmode.vm86_active) {
2506 vmx->rmode.irq.pending = true; 2606 vmx->rmode.irq.pending = true;
2507 vmx->rmode.irq.vector = NMI_VECTOR; 2607 vmx->rmode.irq.vector = NMI_VECTOR;
2508 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2608 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2659 if (enable_ept) 2759 if (enable_ept)
2660 BUG(); 2760 BUG();
2661 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2761 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2662 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2762 trace_kvm_page_fault(cr2, error_code);
2663 (u32)((u64)cr2 >> 32), handler); 2763
2664 if (kvm_event_needs_reinjection(vcpu)) 2764 if (kvm_event_needs_reinjection(vcpu))
2665 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2765 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2666 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2766 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2667 } 2767 }
2668 2768
2669 if (vcpu->arch.rmode.vm86_active && 2769 if (vmx->rmode.vm86_active &&
2670 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 2770 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2671 error_code)) { 2771 error_code)) {
2672 if (vcpu->arch.halt_request) { 2772 if (vcpu->arch.halt_request) {
@@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
2707 struct kvm_run *kvm_run) 2807 struct kvm_run *kvm_run)
2708{ 2808{
2709 ++vcpu->stat.irq_exits; 2809 ++vcpu->stat.irq_exits;
2710 KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
2711 return 1; 2810 return 1;
2712} 2811}
2713 2812
@@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2755 2854
2756static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2757{ 2856{
2758 unsigned long exit_qualification; 2857 unsigned long exit_qualification, val;
2759 int cr; 2858 int cr;
2760 int reg; 2859 int reg;
2761 2860
@@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2764 reg = (exit_qualification >> 8) & 15; 2863 reg = (exit_qualification >> 8) & 15;
2765 switch ((exit_qualification >> 4) & 3) { 2864 switch ((exit_qualification >> 4) & 3) {
2766 case 0: /* mov to cr */ 2865 case 0: /* mov to cr */
2767 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, 2866 val = kvm_register_read(vcpu, reg);
2768 (u32)kvm_register_read(vcpu, reg), 2867 trace_kvm_cr_write(cr, val);
2769 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2770 handler);
2771 switch (cr) { 2868 switch (cr) {
2772 case 0: 2869 case 0:
2773 kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); 2870 kvm_set_cr0(vcpu, val);
2774 skip_emulated_instruction(vcpu); 2871 skip_emulated_instruction(vcpu);
2775 return 1; 2872 return 1;
2776 case 3: 2873 case 3:
2777 kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); 2874 kvm_set_cr3(vcpu, val);
2778 skip_emulated_instruction(vcpu); 2875 skip_emulated_instruction(vcpu);
2779 return 1; 2876 return 1;
2780 case 4: 2877 case 4:
2781 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); 2878 kvm_set_cr4(vcpu, val);
2782 skip_emulated_instruction(vcpu); 2879 skip_emulated_instruction(vcpu);
2783 return 1; 2880 return 1;
2784 case 8: { 2881 case 8: {
@@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2800 vcpu->arch.cr0 &= ~X86_CR0_TS; 2897 vcpu->arch.cr0 &= ~X86_CR0_TS;
2801 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2898 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2802 vmx_fpu_activate(vcpu); 2899 vmx_fpu_activate(vcpu);
2803 KVMTRACE_0D(CLTS, vcpu, handler);
2804 skip_emulated_instruction(vcpu); 2900 skip_emulated_instruction(vcpu);
2805 return 1; 2901 return 1;
2806 case 1: /*mov from cr*/ 2902 case 1: /*mov from cr*/
2807 switch (cr) { 2903 switch (cr) {
2808 case 3: 2904 case 3:
2809 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 2905 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2810 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, 2906 trace_kvm_cr_read(cr, vcpu->arch.cr3);
2811 (u32)kvm_register_read(vcpu, reg),
2812 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2813 handler);
2814 skip_emulated_instruction(vcpu); 2907 skip_emulated_instruction(vcpu);
2815 return 1; 2908 return 1;
2816 case 8: 2909 case 8:
2817 kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); 2910 val = kvm_get_cr8(vcpu);
2818 KVMTRACE_2D(CR_READ, vcpu, (u32)cr, 2911 kvm_register_write(vcpu, reg, val);
2819 (u32)kvm_register_read(vcpu, reg), handler); 2912 trace_kvm_cr_read(cr, val);
2820 skip_emulated_instruction(vcpu); 2913 skip_emulated_instruction(vcpu);
2821 return 1; 2914 return 1;
2822 } 2915 }
@@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2841 unsigned long val; 2934 unsigned long val;
2842 int dr, reg; 2935 int dr, reg;
2843 2936
2937 if (!kvm_require_cpl(vcpu, 0))
2938 return 1;
2844 dr = vmcs_readl(GUEST_DR7); 2939 dr = vmcs_readl(GUEST_DR7);
2845 if (dr & DR7_GD) { 2940 if (dr & DR7_GD) {
2846 /* 2941 /*
@@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2884 val = 0; 2979 val = 0;
2885 } 2980 }
2886 kvm_register_write(vcpu, reg, val); 2981 kvm_register_write(vcpu, reg, val);
2887 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2888 } else { 2982 } else {
2889 val = vcpu->arch.regs[reg]; 2983 val = vcpu->arch.regs[reg];
2890 switch (dr) { 2984 switch (dr) {
@@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2917 } 3011 }
2918 break; 3012 break;
2919 } 3013 }
2920 KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
2921 } 3014 }
2922 skip_emulated_instruction(vcpu); 3015 skip_emulated_instruction(vcpu);
2923 return 1; 3016 return 1;
@@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2939 return 1; 3032 return 1;
2940 } 3033 }
2941 3034
2942 KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), 3035 trace_kvm_msr_read(ecx, data);
2943 handler);
2944 3036
2945 /* FIXME: handling of bits 32:63 of rax, rdx */ 3037 /* FIXME: handling of bits 32:63 of rax, rdx */
2946 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; 3038 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
@@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2955 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2956 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3048 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2957 3049
2958 KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), 3050 trace_kvm_msr_write(ecx, data);
2959 handler);
2960 3051
2961 if (vmx_set_msr(vcpu, ecx, data) != 0) { 3052 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2962 kvm_inject_gp(vcpu, 0); 3053 kvm_inject_gp(vcpu, 0);
@@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2983 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 3074 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2984 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3075 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2985 3076
2986 KVMTRACE_0D(PEND_INTR, vcpu, handler);
2987 ++vcpu->stat.irq_window_exits; 3077 ++vcpu->stat.irq_window_exits;
2988 3078
2989 /* 3079 /*
@@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3049 printk(KERN_ERR 3139 printk(KERN_ERR
3050 "Fail to handle apic access vmexit! Offset is 0x%lx\n", 3140 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3051 offset); 3141 offset);
3052 return -ENOTSUPP; 3142 return -ENOEXEC;
3053 } 3143 }
3054 return 1; 3144 return 1;
3055} 3145}
@@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3118 3208
3119 if (exit_qualification & (1 << 6)) { 3209 if (exit_qualification & (1 << 6)) {
3120 printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); 3210 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
3121 return -ENOTSUPP; 3211 return -EINVAL;
3122 } 3212 }
3123 3213
3124 gla_validity = (exit_qualification >> 7) & 0x3; 3214 gla_validity = (exit_qualification >> 7) & 0x3;
@@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3130 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3131 (long unsigned int)exit_qualification); 3221 (long unsigned int)exit_qualification);
3132 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3133 kvm_run->hw.hardware_exit_reason = 0; 3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3134 return -ENOTSUPP; 3224 return 0;
3135 } 3225 }
3136 3226
3137 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3227 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3228 trace_kvm_page_fault(gpa, exit_qualification);
3138 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3229 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
3139} 3230}
3140 3231
3232static u64 ept_rsvd_mask(u64 spte, int level)
3233{
3234 int i;
3235 u64 mask = 0;
3236
3237 for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
3238 mask |= (1ULL << i);
3239
3240 if (level > 2)
3241 /* bits 7:3 reserved */
3242 mask |= 0xf8;
3243 else if (level == 2) {
3244 if (spte & (1ULL << 7))
3245 /* 2MB ref, bits 20:12 reserved */
3246 mask |= 0x1ff000;
3247 else
3248 /* bits 6:3 reserved */
3249 mask |= 0x78;
3250 }
3251
3252 return mask;
3253}
3254
3255static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3256 int level)
3257{
3258 printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
3259
3260 /* 010b (write-only) */
3261 WARN_ON((spte & 0x7) == 0x2);
3262
3263 /* 110b (write/execute) */
3264 WARN_ON((spte & 0x7) == 0x6);
3265
3266 /* 100b (execute-only) and value not supported by logical processor */
3267 if (!cpu_has_vmx_ept_execute_only())
3268 WARN_ON((spte & 0x7) == 0x4);
3269
3270 /* not 000b */
3271 if ((spte & 0x7)) {
3272 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
3273
3274 if (rsvd_bits != 0) {
3275 printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
3276 __func__, rsvd_bits);
3277 WARN_ON(1);
3278 }
3279
3280 if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
3281 u64 ept_mem_type = (spte & 0x38) >> 3;
3282
3283 if (ept_mem_type == 2 || ept_mem_type == 3 ||
3284 ept_mem_type == 7) {
3285 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
3286 __func__, ept_mem_type);
3287 WARN_ON(1);
3288 }
3289 }
3290 }
3291}
3292
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3294{
3295 u64 sptes[4];
3296 int nr_sptes, i;
3297 gpa_t gpa;
3298
3299 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3300
3301 printk(KERN_ERR "EPT: Misconfiguration.\n");
3302 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3303
3304 nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
3305
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311
3312 return 0;
3313}
3314
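For illustration only, here is a stand-alone sketch of the reserved-bit computation that the new ept_rsvd_mask() above performs; it is not part of the patch. The value 40 passed from main() stands in for boot_cpu_data.x86_phys_bits, which the kernel reads at run time.

#include <stdio.h>
#include <stdint.h>

/* Mirror of ept_rsvd_mask() from the hunk above, with the physical
 * address width passed in explicitly instead of read from cpuinfo. */
static uint64_t ept_rsvd_mask(uint64_t spte, int level, int phys_bits)
{
        uint64_t mask = 0;
        int i;

        /* bits above the physical address width are always reserved */
        for (i = 51; i > phys_bits; i--)
                mask |= 1ULL << i;

        if (level > 2)
                mask |= 0xf8;                   /* bits 7:3 reserved */
        else if (level == 2) {
                if (spte & (1ULL << 7))
                        mask |= 0x1ff000;       /* 2MB page: bits 20:12 reserved */
                else
                        mask |= 0x78;           /* bits 6:3 reserved */
        }
        return mask;
}

int main(void)
{
        int level;

        for (level = 1; level <= 4; level++)
                printf("level %d: reserved mask 0x%016llx\n", level,
                       (unsigned long long)ept_rsvd_mask(0, level, 40));
        return 0;
}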
3141static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3142{ 3316{
3143 u32 cpu_based_vm_exec_control; 3317 u32 cpu_based_vm_exec_control;
@@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3217 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3391 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3218 [EXIT_REASON_WBINVD] = handle_wbinvd, 3392 [EXIT_REASON_WBINVD] = handle_wbinvd,
3219 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3393 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3220 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3221 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3222}; 3397};
3223 3398
3224static const int kvm_vmx_max_exit_handlers = 3399static const int kvm_vmx_max_exit_handlers =
@@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3234 u32 exit_reason = vmx->exit_reason; 3409 u32 exit_reason = vmx->exit_reason;
3235 u32 vectoring_info = vmx->idt_vectoring_info; 3410 u32 vectoring_info = vmx->idt_vectoring_info;
3236 3411
3237 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3238 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
3239 3413
3240 /* If we need to emulate an MMIO from handle_invalid_guest_state 3414 /* If we need to emulate an MMIO from handle_invalid_guest_state
3241 * we just return 0 */ 3415 * we just return 0 */
@@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3247 3421
3248 /* Accesses to CR3 don't cause a VM exit in paging mode, so we need 3422 /* Accesses to CR3 don't cause a VM exit in paging mode, so we need
3249 * to sync with the guest's real CR3. */ 3423 * to sync with the guest's real CR3. */
3250 if (enable_ept && is_paging(vcpu)) { 3424 if (enable_ept && is_paging(vcpu))
3251 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3252 ept_load_pdptrs(vcpu);
3253 }
3254 3426
3255 if (unlikely(vmx->fail)) { 3427 if (unlikely(vmx->fail)) {
3256 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3326 3498
3327 /* We need to handle NMIs before interrupts are enabled */ 3499 /* We need to handle NMIs before interrupts are enabled */
3328 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 3500 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3329 (exit_intr_info & INTR_INFO_VALID_MASK)) { 3501 (exit_intr_info & INTR_INFO_VALID_MASK))
3330 KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3331 asm("int $2"); 3502 asm("int $2");
3332 }
3333 3503
3334 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3504 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3335 3505
@@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3434{ 3604{
3435 struct vcpu_vmx *vmx = to_vmx(vcpu); 3605 struct vcpu_vmx *vmx = to_vmx(vcpu);
3436 3606
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3437 /* Record the guest's net vcpu time for enforced NMI injections. */ 3611 /* Record the guest's net vcpu time for enforced NMI injections. */
3438 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3439 vmx->entry_time = ktime_get(); 3613 vmx->entry_time = ktime_get();
@@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3449 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 3623 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3450 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 3624 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3451 3625
3626 /* When single-stepping over STI and MOV SS, we must clear the
3627 * corresponding interruptibility bits in the guest state; otherwise
3628 * vmentry fails, since it then expects bit 14 (BS) of the pending debug
3629 * exceptions field to be set, which is not correct for the guest
3630 * debugging case. */
3631 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3632 vmx_set_interrupt_shadow(vcpu, 0);
3633
3452 /* 3634 /*
3453 * Loading guest fpu may have cleared host cr0.ts 3635 * Loading guest fpu may have cleared host cr0.ts
3454 */ 3636 */
3455 vmcs_writel(HOST_CR0, read_cr0()); 3637 vmcs_writel(HOST_CR0, read_cr0());
3456 3638
3457 set_debugreg(vcpu->arch.dr6, 6); 3639 if (vcpu->arch.switch_db_regs)
3640 set_debugreg(vcpu->arch.dr6, 6);
3458 3641
3459 asm( 3642 asm(
3460 /* Store host registers */ 3643 /* Store host registers */
@@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3465 "mov %%"R"sp, %c[host_rsp](%0) \n\t" 3648 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
3466 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 3649 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3467 "1: \n\t" 3650 "1: \n\t"
3651 /* Reload cr2 if changed */
3652 "mov %c[cr2](%0), %%"R"ax \n\t"
3653 "mov %%cr2, %%"R"dx \n\t"
3654 "cmp %%"R"ax, %%"R"dx \n\t"
3655 "je 2f \n\t"
3656 "mov %%"R"ax, %%cr2 \n\t"
3657 "2: \n\t"
3468 /* Check if vmlaunch or vmresume is needed */ 3658
3469 "cmpl $0, %c[launched](%0) \n\t" 3659 "cmpl $0, %c[launched](%0) \n\t"
3470 /* Load guest registers. Don't clobber flags. */ 3660 /* Load guest registers. Don't clobber flags. */
3471 "mov %c[cr2](%0), %%"R"ax \n\t"
3472 "mov %%"R"ax, %%cr2 \n\t"
3473 "mov %c[rax](%0), %%"R"ax \n\t" 3661 "mov %c[rax](%0), %%"R"ax \n\t"
3474 "mov %c[rbx](%0), %%"R"bx \n\t" 3662 "mov %c[rbx](%0), %%"R"bx \n\t"
3475 "mov %c[rdx](%0), %%"R"dx \n\t" 3663 "mov %c[rdx](%0), %%"R"dx \n\t"
@@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3547#endif 3735#endif
3548 ); 3736 );
3549 3737
3550 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 3738 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
3739 | (1 << VCPU_EXREG_PDPTR));
3551 vcpu->arch.regs_dirty = 0; 3740 vcpu->arch.regs_dirty = 0;
3552 3741
3553 get_debugreg(vcpu->arch.dr6, 6); 3742 if (vcpu->arch.switch_db_regs)
3743 get_debugreg(vcpu->arch.dr6, 6);
3554 3744
3555 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3745 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3556 if (vmx->rmode.irq.pending) 3746 if (vmx->rmode.irq.pending)
@@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3633 if (alloc_apic_access_page(kvm) != 0) 3823 if (alloc_apic_access_page(kvm) != 0)
3634 goto free_vmcs; 3824 goto free_vmcs;
3635 3825
3636 if (enable_ept) 3826 if (enable_ept) {
3827 if (!kvm->arch.ept_identity_map_addr)
3828 kvm->arch.ept_identity_map_addr =
3829 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3637 if (alloc_identity_pagetable(kvm) != 0) 3830 if (alloc_identity_pagetable(kvm) != 0)
3638 goto free_vmcs; 3831 goto free_vmcs;
3832 }
3639 3833
3640 return &vmx->vcpu; 3834 return &vmx->vcpu;
3641 3835
@@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3699 return ret; 3893 return ret;
3700} 3894}
3701 3895
3896static const struct trace_print_flags vmx_exit_reasons_str[] = {
3897 { EXIT_REASON_EXCEPTION_NMI, "exception" },
3898 { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
3899 { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
3900 { EXIT_REASON_NMI_WINDOW, "nmi_window" },
3901 { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
3902 { EXIT_REASON_CR_ACCESS, "cr_access" },
3903 { EXIT_REASON_DR_ACCESS, "dr_access" },
3904 { EXIT_REASON_CPUID, "cpuid" },
3905 { EXIT_REASON_MSR_READ, "rdmsr" },
3906 { EXIT_REASON_MSR_WRITE, "wrmsr" },
3907 { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
3908 { EXIT_REASON_HLT, "halt" },
3909 { EXIT_REASON_INVLPG, "invlpg" },
3910 { EXIT_REASON_VMCALL, "hypercall" },
3911 { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
3912 { EXIT_REASON_APIC_ACCESS, "apic_access" },
3913 { EXIT_REASON_WBINVD, "wbinvd" },
3914 { EXIT_REASON_TASK_SWITCH, "task_switch" },
3915 { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
3916 { -1, NULL }
3917};
3918
3919static bool vmx_gb_page_enable(void)
3920{
3921 return false;
3922}
3923
3702static struct kvm_x86_ops vmx_x86_ops = { 3924static struct kvm_x86_ops vmx_x86_ops = {
3703 .cpu_has_kvm_support = cpu_has_kvm_support, 3925 .cpu_has_kvm_support = cpu_has_kvm_support,
3704 .disabled_by_bios = vmx_disabled_by_bios, 3926 .disabled_by_bios = vmx_disabled_by_bios,
@@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
3758 .set_tss_addr = vmx_set_tss_addr, 3980 .set_tss_addr = vmx_set_tss_addr,
3759 .get_tdp_level = get_ept_level, 3981 .get_tdp_level = get_ept_level,
3760 .get_mt_mask = vmx_get_mt_mask, 3982 .get_mt_mask = vmx_get_mt_mask,
3983
3984 .exit_reasons_str = vmx_exit_reasons_str,
3985 .gb_page_enable = vmx_gb_page_enable,
3761}; 3986};
3762 3987
3763static int __init vmx_init(void) 3988static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d4529011828..be451ee44249 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,16 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS
43#include "trace.h"
40 44
41#include <asm/uaccess.h> 45#include <asm/uaccess.h>
42#include <asm/msr.h> 46#include <asm/msr.h>
43#include <asm/desc.h> 47#include <asm/desc.h>
44#include <asm/mtrr.h> 48#include <asm/mtrr.h>
49#include <asm/mce.h>
45 50
46#define MAX_IO_MSRS 256 51#define MAX_IO_MSRS 256
47#define CR0_RESERVED_BITS \ 52#define CR0_RESERVED_BITS \
@@ -55,6 +60,10 @@
55 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 60 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
56 61
57#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 62#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
63
64#define KVM_MAX_MCE_BANKS 32
65#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
66
58/* EFER defaults: 67/* EFER defaults:
59 * - enable syscall by default because it's emulated by KVM 68
60 * - enable LME and LMA per default on 64 bit KVM 69 * - enable LME and LMA per default on 64 bit KVM
@@ -68,14 +77,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
68#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 77#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
69#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 78#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
70 79
80static void update_cr8_intercept(struct kvm_vcpu *vcpu);
71static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 81static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
72 struct kvm_cpuid_entry2 __user *entries); 82 struct kvm_cpuid_entry2 __user *entries);
73struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
74 u32 function, u32 index);
75 83
76struct kvm_x86_ops *kvm_x86_ops; 84struct kvm_x86_ops *kvm_x86_ops;
77EXPORT_SYMBOL_GPL(kvm_x86_ops); 85EXPORT_SYMBOL_GPL(kvm_x86_ops);
78 86
87int ignore_msrs = 0;
88module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
89
79struct kvm_stats_debugfs_item debugfs_entries[] = { 90struct kvm_stats_debugfs_item debugfs_entries[] = {
80 { "pf_fixed", VCPU_STAT(pf_fixed) }, 91 { "pf_fixed", VCPU_STAT(pf_fixed) },
81 { "pf_guest", VCPU_STAT(pf_guest) }, 92 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -122,18 +133,16 @@ unsigned long segment_base(u16 selector)
122 if (selector == 0) 133 if (selector == 0)
123 return 0; 134 return 0;
124 135
125 asm("sgdt %0" : "=m"(gdt)); 136 kvm_get_gdt(&gdt);
126 table_base = gdt.base; 137 table_base = gdt.base;
127 138
128 if (selector & 4) { /* from ldt */ 139 if (selector & 4) { /* from ldt */
129 u16 ldt_selector; 140 u16 ldt_selector = kvm_read_ldt();
130 141
131 asm("sldt %0" : "=g"(ldt_selector));
132 table_base = segment_base(ldt_selector); 142 table_base = segment_base(ldt_selector);
133 } 143 }
134 d = (struct desc_struct *)(table_base + (selector & ~7)); 144 d = (struct desc_struct *)(table_base + (selector & ~7));
135 v = d->base0 | ((unsigned long)d->base1 << 16) | 145 v = get_desc_base(d);
136 ((unsigned long)d->base2 << 24);
137#ifdef CONFIG_X86_64 146#ifdef CONFIG_X86_64
138 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 147 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
139 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 148 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
@@ -176,16 +185,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
176 ++vcpu->stat.pf_guest; 185 ++vcpu->stat.pf_guest;
177 186
178 if (vcpu->arch.exception.pending) { 187 if (vcpu->arch.exception.pending) {
179 if (vcpu->arch.exception.nr == PF_VECTOR) { 188 switch(vcpu->arch.exception.nr) {
180 printk(KERN_DEBUG "kvm: inject_page_fault:" 189 case DF_VECTOR:
181 " double fault 0x%lx\n", addr);
182 vcpu->arch.exception.nr = DF_VECTOR;
183 vcpu->arch.exception.error_code = 0;
184 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
185 /* triple fault -> shutdown */ 190 /* triple fault -> shutdown */
186 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
192 return;
193 case PF_VECTOR:
194 vcpu->arch.exception.nr = DF_VECTOR;
195 vcpu->arch.exception.error_code = 0;
196 return;
197 default:
198 /* replace the previous exception with a new one in the hope
199 that instruction re-execution will regenerate the lost
200 exception */
201 vcpu->arch.exception.pending = false;
202 break;
187 } 203 }
188 return;
189 } 204 }
190 vcpu->arch.cr2 = addr; 205 vcpu->arch.cr2 = addr;
191 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 206 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -207,12 +222,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
207} 222}
208EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 223EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
209 224
210static void __queue_exception(struct kvm_vcpu *vcpu) 225/*
226 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
227 * a #GP and return false.
228 */
229bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
211{ 230{
212 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 231 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
213 vcpu->arch.exception.has_error_code, 232 return true;
214 vcpu->arch.exception.error_code); 233 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
234 return false;
215} 235}
236EXPORT_SYMBOL_GPL(kvm_require_cpl);
216 237
217/* 238/*
218 * Load the pae pdptrs. Return true if they are all valid. 239
@@ -232,7 +253,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
232 goto out; 253 goto out;
233 } 254 }
234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 255 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
235 if (is_present_pte(pdpte[i]) && 256 if (is_present_gpte(pdpte[i]) &&
236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 257 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
237 ret = 0; 258 ret = 0;
238 goto out; 259 goto out;
@@ -241,6 +262,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
241 ret = 1; 262 ret = 1;
242 263
243 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 264 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
265 __set_bit(VCPU_EXREG_PDPTR,
266 (unsigned long *)&vcpu->arch.regs_avail);
267 __set_bit(VCPU_EXREG_PDPTR,
268 (unsigned long *)&vcpu->arch.regs_dirty);
244out: 269out:
245 270
246 return ret; 271 return ret;
@@ -256,6 +281,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
256 if (is_long_mode(vcpu) || !is_pae(vcpu)) 281 if (is_long_mode(vcpu) || !is_pae(vcpu))
257 return false; 282 return false;
258 283
284 if (!test_bit(VCPU_EXREG_PDPTR,
285 (unsigned long *)&vcpu->arch.regs_avail))
286 return true;
287
259 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 288 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
260 if (r < 0) 289 if (r < 0)
261 goto out; 290 goto out;
@@ -328,9 +357,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
328void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 357void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
329{ 358{
330 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 359 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
331 KVMTRACE_1D(LMSW, vcpu,
332 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
333 handler);
334} 360}
335EXPORT_SYMBOL_GPL(kvm_lmsw); 361EXPORT_SYMBOL_GPL(kvm_lmsw);
336 362
@@ -466,7 +492,7 @@ static u32 msrs_to_save[] = {
466#ifdef CONFIG_X86_64 492#ifdef CONFIG_X86_64
467 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 493 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
468#endif 494#endif
469 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 495 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
470 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 496 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
471}; 497};
472 498
@@ -644,8 +670,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
644 670
645 /* Keep irq disabled to prevent changes to the clock */ 671 /* Keep irq disabled to prevent changes to the clock */
646 local_irq_save(flags); 672 local_irq_save(flags);
647 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 673 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
648 &vcpu->hv_clock.tsc_timestamp);
649 ktime_get_ts(&ts); 674 ktime_get_ts(&ts);
650 local_irq_restore(flags); 675 local_irq_restore(flags);
651 676
@@ -778,23 +803,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
778 return 0; 803 return 0;
779} 804}
780 805
806static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
807{
808 u64 mcg_cap = vcpu->arch.mcg_cap;
809 unsigned bank_num = mcg_cap & 0xff;
810
811 switch (msr) {
812 case MSR_IA32_MCG_STATUS:
813 vcpu->arch.mcg_status = data;
814 break;
815 case MSR_IA32_MCG_CTL:
816 if (!(mcg_cap & MCG_CTL_P))
817 return 1;
818 if (data != 0 && data != ~(u64)0)
819 return -1;
820 vcpu->arch.mcg_ctl = data;
821 break;
822 default:
823 if (msr >= MSR_IA32_MC0_CTL &&
824 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
825 u32 offset = msr - MSR_IA32_MC0_CTL;
826 /* only 0 or all 1s can be written to IA32_MCi_CTL */
827 if ((offset & 0x3) == 0 &&
828 data != 0 && data != ~(u64)0)
829 return -1;
830 vcpu->arch.mce_banks[offset] = data;
831 break;
832 }
833 return 1;
834 }
835 return 0;
836}
837
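A side note on the bank arithmetic in set_msr_mce() above: each MCE bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC) starting at IA32_MC0_CTL, so offset/4 picks the bank, offset%4 picks the register, and mce_banks[] is indexed by the raw offset. A minimal stand-alone sketch, assuming the conventional IA32_MC0_CTL index of 0x400 (not part of the patch):

#include <stdio.h>

#define MSR_IA32_MC0_CTL 0x400  /* first bank MSR; 4 MSRs per bank */

int main(void)
{
        static const char *reg[] = { "CTL", "STATUS", "ADDR", "MISC" };
        unsigned int msr = 0x40d;       /* example: IA32_MC3_STATUS */
        unsigned int offset = msr - MSR_IA32_MC0_CTL;

        printf("msr 0x%x -> bank %u register %s (mce_banks[%u])\n",
               msr, offset / 4, reg[offset % 4], offset);
        return 0;
}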
781int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 838int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
782{ 839{
783 switch (msr) { 840 switch (msr) {
784 case MSR_EFER: 841 case MSR_EFER:
785 set_efer(vcpu, data); 842 set_efer(vcpu, data);
786 break; 843 break;
787 case MSR_IA32_MC0_STATUS: 844 case MSR_K7_HWCR:
788 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 845 data &= ~(u64)0x40; /* ignore flush filter disable */
789 __func__, data); 846 if (data != 0) {
847 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
848 data);
849 return 1;
850 }
790 break; 851 break;
791 case MSR_IA32_MCG_STATUS: 852 case MSR_FAM10H_MMIO_CONF_BASE:
792 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 853 if (data != 0) {
793 __func__, data); 854 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
855 "0x%llx\n", data);
856 return 1;
857 }
794 break; 858 break;
795 case MSR_IA32_MCG_CTL: 859 case MSR_AMD64_NB_CFG:
796 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
797 __func__, data);
798 break; 860 break;
799 case MSR_IA32_DEBUGCTLMSR: 861 case MSR_IA32_DEBUGCTLMSR:
800 if (!data) { 862 if (!data) {
@@ -811,12 +873,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
811 case MSR_IA32_UCODE_REV: 873 case MSR_IA32_UCODE_REV:
812 case MSR_IA32_UCODE_WRITE: 874 case MSR_IA32_UCODE_WRITE:
813 case MSR_VM_HSAVE_PA: 875 case MSR_VM_HSAVE_PA:
876 case MSR_AMD64_PATCH_LOADER:
814 break; 877 break;
815 case 0x200 ... 0x2ff: 878 case 0x200 ... 0x2ff:
816 return set_msr_mtrr(vcpu, msr, data); 879 return set_msr_mtrr(vcpu, msr, data);
817 case MSR_IA32_APICBASE: 880 case MSR_IA32_APICBASE:
818 kvm_set_apic_base(vcpu, data); 881 kvm_set_apic_base(vcpu, data);
819 break; 882 break;
883 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
884 return kvm_x2apic_msr_write(vcpu, msr, data);
820 case MSR_IA32_MISC_ENABLE: 885 case MSR_IA32_MISC_ENABLE:
821 vcpu->arch.ia32_misc_enable_msr = data; 886 vcpu->arch.ia32_misc_enable_msr = data;
822 break; 887 break;
@@ -850,9 +915,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
850 kvm_request_guest_time_update(vcpu); 915 kvm_request_guest_time_update(vcpu);
851 break; 916 break;
852 } 917 }
918 case MSR_IA32_MCG_CTL:
919 case MSR_IA32_MCG_STATUS:
920 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
921 return set_msr_mce(vcpu, msr, data);
922
923 /* Performance counters are not protected by a CPUID bit,
924 * so we should check all of them in the generic path for the sake of
925 * cross vendor migration.
926 * Writing a zero into the event select MSRs disables them,
927 * which we perfectly emulate ;-). Any other value should at least
928 * be reported; some guests depend on them.
929 */
930 case MSR_P6_EVNTSEL0:
931 case MSR_P6_EVNTSEL1:
932 case MSR_K7_EVNTSEL0:
933 case MSR_K7_EVNTSEL1:
934 case MSR_K7_EVNTSEL2:
935 case MSR_K7_EVNTSEL3:
936 if (data != 0)
937 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
938 "0x%x data 0x%llx\n", msr, data);
939 break;
940 /* at least RHEL 4 unconditionally writes to the perfctr registers,
941 * so we ignore writes to make it happy.
942 */
943 case MSR_P6_PERFCTR0:
944 case MSR_P6_PERFCTR1:
945 case MSR_K7_PERFCTR0:
946 case MSR_K7_PERFCTR1:
947 case MSR_K7_PERFCTR2:
948 case MSR_K7_PERFCTR3:
949 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
950 "0x%x data 0x%llx\n", msr, data);
951 break;
853 default: 952 default:
854 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 953 if (!ignore_msrs) {
855 return 1; 954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
955 msr, data);
956 return 1;
957 } else {
958 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
959 msr, data);
960 break;
961 }
856 } 962 }
857 return 0; 963 return 0;
858} 964}
@@ -905,26 +1011,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
905 return 0; 1011 return 0;
906} 1012}
907 1013
908int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1014static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
909{ 1015{
910 u64 data; 1016 u64 data;
1017 u64 mcg_cap = vcpu->arch.mcg_cap;
1018 unsigned bank_num = mcg_cap & 0xff;
911 1019
912 switch (msr) { 1020 switch (msr) {
913 case 0xc0010010: /* SYSCFG */
914 case 0xc0010015: /* HWCR */
915 case MSR_IA32_PLATFORM_ID:
916 case MSR_IA32_P5_MC_ADDR: 1021 case MSR_IA32_P5_MC_ADDR:
917 case MSR_IA32_P5_MC_TYPE: 1022 case MSR_IA32_P5_MC_TYPE:
918 case MSR_IA32_MC0_CTL: 1023 data = 0;
919 case MSR_IA32_MCG_STATUS: 1024 break;
920 case MSR_IA32_MCG_CAP: 1025 case MSR_IA32_MCG_CAP:
1026 data = vcpu->arch.mcg_cap;
1027 break;
921 case MSR_IA32_MCG_CTL: 1028 case MSR_IA32_MCG_CTL:
922 case MSR_IA32_MC0_MISC: 1029 if (!(mcg_cap & MCG_CTL_P))
923 case MSR_IA32_MC0_MISC+4: 1030 return 1;
924 case MSR_IA32_MC0_MISC+8: 1031 data = vcpu->arch.mcg_ctl;
925 case MSR_IA32_MC0_MISC+12: 1032 break;
926 case MSR_IA32_MC0_MISC+16: 1033 case MSR_IA32_MCG_STATUS:
927 case MSR_IA32_MC0_MISC+20: 1034 data = vcpu->arch.mcg_status;
1035 break;
1036 default:
1037 if (msr >= MSR_IA32_MC0_CTL &&
1038 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1039 u32 offset = msr - MSR_IA32_MC0_CTL;
1040 data = vcpu->arch.mce_banks[offset];
1041 break;
1042 }
1043 return 1;
1044 }
1045 *pdata = data;
1046 return 0;
1047}
1048
1049int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1050{
1051 u64 data;
1052
1053 switch (msr) {
1054 case MSR_IA32_PLATFORM_ID:
928 case MSR_IA32_UCODE_REV: 1055 case MSR_IA32_UCODE_REV:
929 case MSR_IA32_EBL_CR_POWERON: 1056 case MSR_IA32_EBL_CR_POWERON:
930 case MSR_IA32_DEBUGCTLMSR: 1057 case MSR_IA32_DEBUGCTLMSR:
@@ -932,10 +1059,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
932 case MSR_IA32_LASTBRANCHTOIP: 1059 case MSR_IA32_LASTBRANCHTOIP:
933 case MSR_IA32_LASTINTFROMIP: 1060 case MSR_IA32_LASTINTFROMIP:
934 case MSR_IA32_LASTINTTOIP: 1061 case MSR_IA32_LASTINTTOIP:
1062 case MSR_K8_SYSCFG:
1063 case MSR_K7_HWCR:
935 case MSR_VM_HSAVE_PA: 1064 case MSR_VM_HSAVE_PA:
1065 case MSR_P6_PERFCTR0:
1066 case MSR_P6_PERFCTR1:
936 case MSR_P6_EVNTSEL0: 1067 case MSR_P6_EVNTSEL0:
937 case MSR_P6_EVNTSEL1: 1068 case MSR_P6_EVNTSEL1:
938 case MSR_K7_EVNTSEL0: 1069 case MSR_K7_EVNTSEL0:
1070 case MSR_K7_PERFCTR0:
1071 case MSR_K8_INT_PENDING_MSG:
1072 case MSR_AMD64_NB_CFG:
1073 case MSR_FAM10H_MMIO_CONF_BASE:
939 data = 0; 1074 data = 0;
940 break; 1075 break;
941 case MSR_MTRRcap: 1076 case MSR_MTRRcap:
@@ -949,6 +1084,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
949 case MSR_IA32_APICBASE: 1084 case MSR_IA32_APICBASE:
950 data = kvm_get_apic_base(vcpu); 1085 data = kvm_get_apic_base(vcpu);
951 break; 1086 break;
1087 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1088 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1089 break;
952 case MSR_IA32_MISC_ENABLE: 1090 case MSR_IA32_MISC_ENABLE:
953 data = vcpu->arch.ia32_misc_enable_msr; 1091 data = vcpu->arch.ia32_misc_enable_msr;
954 break; 1092 break;
@@ -967,9 +1105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
967 case MSR_KVM_SYSTEM_TIME: 1105 case MSR_KVM_SYSTEM_TIME:
968 data = vcpu->arch.time; 1106 data = vcpu->arch.time;
969 break; 1107 break;
1108 case MSR_IA32_P5_MC_ADDR:
1109 case MSR_IA32_P5_MC_TYPE:
1110 case MSR_IA32_MCG_CAP:
1111 case MSR_IA32_MCG_CTL:
1112 case MSR_IA32_MCG_STATUS:
1113 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1114 return get_msr_mce(vcpu, msr, pdata);
970 default: 1115 default:
971 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1116 if (!ignore_msrs) {
972 return 1; 1117 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1118 return 1;
1119 } else {
1120 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1121 data = 0;
1122 }
1123 break;
973 } 1124 }
974 *pdata = data; 1125 *pdata = data;
975 return 0; 1126 return 0;
@@ -1068,6 +1219,11 @@ int kvm_dev_ioctl_check_extension(long ext)
1068 case KVM_CAP_REINJECT_CONTROL: 1219 case KVM_CAP_REINJECT_CONTROL:
1069 case KVM_CAP_IRQ_INJECT_STATUS: 1220 case KVM_CAP_IRQ_INJECT_STATUS:
1070 case KVM_CAP_ASSIGN_DEV_IRQ: 1221 case KVM_CAP_ASSIGN_DEV_IRQ:
1222 case KVM_CAP_IRQFD:
1223 case KVM_CAP_IOEVENTFD:
1224 case KVM_CAP_PIT2:
1225 case KVM_CAP_PIT_STATE2:
1226 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1071 r = 1; 1227 r = 1;
1072 break; 1228 break;
1073 case KVM_CAP_COALESCED_MMIO: 1229 case KVM_CAP_COALESCED_MMIO:
@@ -1088,6 +1244,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1088 case KVM_CAP_IOMMU: 1244 case KVM_CAP_IOMMU:
1089 r = iommu_found(); 1245 r = iommu_found();
1090 break; 1246 break;
1247 case KVM_CAP_MCE:
1248 r = KVM_MAX_MCE_BANKS;
1249 break;
1091 default: 1250 default:
1092 r = 0; 1251 r = 0;
1093 break; 1252 break;
@@ -1147,6 +1306,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
1147 r = 0; 1306 r = 0;
1148 break; 1307 break;
1149 } 1308 }
1309 case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1310 u64 mce_cap;
1311
1312 mce_cap = KVM_MCE_CAP_SUPPORTED;
1313 r = -EFAULT;
1314 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1315 goto out;
1316 r = 0;
1317 break;
1318 }
1150 default: 1319 default:
1151 r = -EINVAL; 1320 r = -EINVAL;
1152 } 1321 }
@@ -1227,6 +1396,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1227 vcpu->arch.cpuid_nent = cpuid->nent; 1396 vcpu->arch.cpuid_nent = cpuid->nent;
1228 cpuid_fix_nx_cap(vcpu); 1397 cpuid_fix_nx_cap(vcpu);
1229 r = 0; 1398 r = 0;
1399 kvm_apic_set_version(vcpu);
1230 1400
1231out_free: 1401out_free:
1232 vfree(cpuid_entries); 1402 vfree(cpuid_entries);
@@ -1248,6 +1418,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1248 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1418 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1249 goto out; 1419 goto out;
1250 vcpu->arch.cpuid_nent = cpuid->nent; 1420 vcpu->arch.cpuid_nent = cpuid->nent;
1421 kvm_apic_set_version(vcpu);
1251 return 0; 1422 return 0;
1252 1423
1253out: 1424out:
@@ -1290,6 +1461,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1290 u32 index, int *nent, int maxnent) 1461 u32 index, int *nent, int maxnent)
1291{ 1462{
1292 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1463 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1464 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1293#ifdef CONFIG_X86_64 1465#ifdef CONFIG_X86_64
1294 unsigned f_lm = F(LM); 1466 unsigned f_lm = F(LM);
1295#else 1467#else
@@ -1314,7 +1486,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1314 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1486 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1315 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1487 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1316 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1488 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1317 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | 1489 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
1318 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1490 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1319 /* cpuid 1.ecx */ 1491 /* cpuid 1.ecx */
1320 const u32 kvm_supported_word4_x86_features = 1492 const u32 kvm_supported_word4_x86_features =
@@ -1323,7 +1495,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1323 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1495 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1324 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1496 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1325 0 /* Reserved, DCA */ | F(XMM4_1) | 1497 0 /* Reserved, DCA */ | F(XMM4_1) |
1326 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | 1498 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1327 0 /* Reserved, XSAVE, OSXSAVE */; 1499 0 /* Reserved, XSAVE, OSXSAVE */;
1328 /* cpuid 0x80000001.ecx */ 1500 /* cpuid 0x80000001.ecx */
1329 const u32 kvm_supported_word6_x86_features = 1501 const u32 kvm_supported_word6_x86_features =
@@ -1344,6 +1516,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1344 case 1: 1516 case 1:
1345 entry->edx &= kvm_supported_word0_x86_features; 1517 entry->edx &= kvm_supported_word0_x86_features;
1346 entry->ecx &= kvm_supported_word4_x86_features; 1518 entry->ecx &= kvm_supported_word4_x86_features;
1519 /* we support x2apic emulation even if the host does not support
1520 * it, since we emulate x2apic in software */
1521 entry->ecx |= F(X2APIC);
1347 break; 1522 break;
1348 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1523 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1349 * may return different values. This forces us to get_cpu() before 1524 * may return different values. This forces us to get_cpu() before
@@ -1435,6 +1610,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1435 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1610 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1436 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1611 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1437 &nent, cpuid->nent); 1612 &nent, cpuid->nent);
1613 r = -E2BIG;
1614 if (nent >= cpuid->nent)
1615 goto out_free;
1616
1438 r = -EFAULT; 1617 r = -EFAULT;
1439 if (copy_to_user(entries, cpuid_entries, 1618 if (copy_to_user(entries, cpuid_entries,
1440 nent * sizeof(struct kvm_cpuid_entry2))) 1619 nent * sizeof(struct kvm_cpuid_entry2)))
@@ -1464,6 +1643,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1464 vcpu_load(vcpu); 1643 vcpu_load(vcpu);
1465 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1644 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1466 kvm_apic_post_state_restore(vcpu); 1645 kvm_apic_post_state_restore(vcpu);
1646 update_cr8_intercept(vcpu);
1467 vcpu_put(vcpu); 1647 vcpu_put(vcpu);
1468 1648
1469 return 0; 1649 return 0;
@@ -1503,6 +1683,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1503 return 0; 1683 return 0;
1504} 1684}
1505 1685
1686static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1687 u64 mcg_cap)
1688{
1689 int r;
1690 unsigned bank_num = mcg_cap & 0xff, bank;
1691
1692 r = -EINVAL;
1693 if (!bank_num)
1694 goto out;
1695 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1696 goto out;
1697 r = 0;
1698 vcpu->arch.mcg_cap = mcg_cap;
1699 /* Init IA32_MCG_CTL to all 1s */
1700 if (mcg_cap & MCG_CTL_P)
1701 vcpu->arch.mcg_ctl = ~(u64)0;
1702 /* Init IA32_MCi_CTL to all 1s */
1703 for (bank = 0; bank < bank_num; bank++)
1704 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
1705out:
1706 return r;
1707}
1708
1709static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1710 struct kvm_x86_mce *mce)
1711{
1712 u64 mcg_cap = vcpu->arch.mcg_cap;
1713 unsigned bank_num = mcg_cap & 0xff;
1714 u64 *banks = vcpu->arch.mce_banks;
1715
1716 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
1717 return -EINVAL;
1718 /*
1719 * if IA32_MCG_CTL is not all 1s, the uncorrected error
1720 * reporting is disabled
1721 */
1722 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
1723 vcpu->arch.mcg_ctl != ~(u64)0)
1724 return 0;
1725 banks += 4 * mce->bank;
1726 /*
1727 * if IA32_MCi_CTL is not all 1s, the uncorrected error
1728 * reporting is disabled for the bank
1729 */
1730 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
1731 return 0;
1732 if (mce->status & MCI_STATUS_UC) {
1733 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1734 !(vcpu->arch.cr4 & X86_CR4_MCE)) {
1735 printk(KERN_DEBUG "kvm: set_mce: "
1736 "injects mce exception while "
1737 "previous one is in progress!\n");
1738 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1739 return 0;
1740 }
1741 if (banks[1] & MCI_STATUS_VAL)
1742 mce->status |= MCI_STATUS_OVER;
1743 banks[2] = mce->addr;
1744 banks[3] = mce->misc;
1745 vcpu->arch.mcg_status = mce->mcg_status;
1746 banks[1] = mce->status;
1747 kvm_queue_exception(vcpu, MC_VECTOR);
1748 } else if (!(banks[1] & MCI_STATUS_VAL)
1749 || !(banks[1] & MCI_STATUS_UC)) {
1750 if (banks[1] & MCI_STATUS_VAL)
1751 mce->status |= MCI_STATUS_OVER;
1752 banks[2] = mce->addr;
1753 banks[3] = mce->misc;
1754 banks[1] = mce->status;
1755 } else
1756 banks[1] |= MCI_STATUS_OVER;
1757 return 0;
1758}
1759
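For context, a hypothetical user-space sketch of how the two new vcpu ioctls above could be exercised. KVM_X86_SETUP_MCE, KVM_X86_SET_MCE and struct kvm_x86_mce are the interfaces this series adds to linux/kvm.h; the vcpu_fd parameter, the bank count of 10 and the locally defined status bits are illustrative assumptions, and error handling is omitted.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Architectural MCE bits, defined locally so the sketch stands alone. */
#define MCG_CTL_P       (1ULL << 8)     /* IA32_MCG_CTL register present */
#define MCI_STATUS_VAL  (1ULL << 63)    /* error in this bank is valid */

/* vcpu_fd is assumed to come from KVM_CREATE_VCPU elsewhere. */
static void record_sample_mce(int vcpu_fd)
{
        uint64_t mcg_cap = MCG_CTL_P | 10;      /* MCG_CTL present, 10 banks */
        struct kvm_x86_mce mce;

        ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);    /* enable MCE emulation */

        memset(&mce, 0, sizeof(mce));
        mce.bank = 0;
        mce.status = MCI_STATUS_VAL;            /* valid, corrected error */
        mce.addr = 0x1234000;                   /* example physical address */
        ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);  /* record it in bank 0 */
}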
1506long kvm_arch_vcpu_ioctl(struct file *filp, 1760long kvm_arch_vcpu_ioctl(struct file *filp,
1507 unsigned int ioctl, unsigned long arg) 1761 unsigned int ioctl, unsigned long arg)
1508{ 1762{
@@ -1636,6 +1890,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1636 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1890 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1637 break; 1891 break;
1638 } 1892 }
1893 case KVM_X86_SETUP_MCE: {
1894 u64 mcg_cap;
1895
1896 r = -EFAULT;
1897 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
1898 goto out;
1899 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
1900 break;
1901 }
1902 case KVM_X86_SET_MCE: {
1903 struct kvm_x86_mce mce;
1904
1905 r = -EFAULT;
1906 if (copy_from_user(&mce, argp, sizeof mce))
1907 goto out;
1908 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1909 break;
1910 }
1639 default: 1911 default:
1640 r = -EINVAL; 1912 r = -EINVAL;
1641 } 1913 }
@@ -1654,6 +1926,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1654 return ret; 1926 return ret;
1655} 1927}
1656 1928
1929static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
1930 u64 ident_addr)
1931{
1932 kvm->arch.ept_identity_map_addr = ident_addr;
1933 return 0;
1934}
1935
1657static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1936static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1658 u32 kvm_nr_mmu_pages) 1937 u32 kvm_nr_mmu_pages)
1659{ 1938{
@@ -1775,19 +2054,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1775 r = 0; 2054 r = 0;
1776 switch (chip->chip_id) { 2055 switch (chip->chip_id) {
1777 case KVM_IRQCHIP_PIC_MASTER: 2056 case KVM_IRQCHIP_PIC_MASTER:
2057 spin_lock(&pic_irqchip(kvm)->lock);
1778 memcpy(&pic_irqchip(kvm)->pics[0], 2058 memcpy(&pic_irqchip(kvm)->pics[0],
1779 &chip->chip.pic, 2059 &chip->chip.pic,
1780 sizeof(struct kvm_pic_state)); 2060 sizeof(struct kvm_pic_state));
2061 spin_unlock(&pic_irqchip(kvm)->lock);
1781 break; 2062 break;
1782 case KVM_IRQCHIP_PIC_SLAVE: 2063 case KVM_IRQCHIP_PIC_SLAVE:
2064 spin_lock(&pic_irqchip(kvm)->lock);
1783 memcpy(&pic_irqchip(kvm)->pics[1], 2065 memcpy(&pic_irqchip(kvm)->pics[1],
1784 &chip->chip.pic, 2066 &chip->chip.pic,
1785 sizeof(struct kvm_pic_state)); 2067 sizeof(struct kvm_pic_state));
2068 spin_unlock(&pic_irqchip(kvm)->lock);
1786 break; 2069 break;
1787 case KVM_IRQCHIP_IOAPIC: 2070 case KVM_IRQCHIP_IOAPIC:
2071 mutex_lock(&kvm->irq_lock);
1788 memcpy(ioapic_irqchip(kvm), 2072 memcpy(ioapic_irqchip(kvm),
1789 &chip->chip.ioapic, 2073 &chip->chip.ioapic,
1790 sizeof(struct kvm_ioapic_state)); 2074 sizeof(struct kvm_ioapic_state));
2075 mutex_unlock(&kvm->irq_lock);
1791 break; 2076 break;
1792 default: 2077 default:
1793 r = -EINVAL; 2078 r = -EINVAL;
@@ -1801,7 +2086,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1801{ 2086{
1802 int r = 0; 2087 int r = 0;
1803 2088
2089 mutex_lock(&kvm->arch.vpit->pit_state.lock);
1804 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 2090 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2091 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
1805 return r; 2092 return r;
1806} 2093}
1807 2094
@@ -1809,8 +2096,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1809{ 2096{
1810 int r = 0; 2097 int r = 0;
1811 2098
2099 mutex_lock(&kvm->arch.vpit->pit_state.lock);
1812 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 2100 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1813 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 2101 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2102 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2103 return r;
2104}
2105
2106static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2107{
2108 int r = 0;
2109
2110 mutex_lock(&kvm->arch.vpit->pit_state.lock);
2111 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2112 sizeof(ps->channels));
2113 ps->flags = kvm->arch.vpit->pit_state.flags;
2114 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2115 return r;
2116}
2117
2118static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2119{
2120 int r = 0, start = 0;
2121 u32 prev_legacy, cur_legacy;
2122 mutex_lock(&kvm->arch.vpit->pit_state.lock);
2123 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2124 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2125 if (!prev_legacy && cur_legacy)
2126 start = 1;
2127 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2128 sizeof(kvm->arch.vpit->pit_state.channels));
2129 kvm->arch.vpit->pit_state.flags = ps->flags;
2130 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2131 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
1814 return r; 2132 return r;
1815} 2133}
1816 2134
@@ -1819,7 +2137,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1819{ 2137{
1820 if (!kvm->arch.vpit) 2138 if (!kvm->arch.vpit)
1821 return -ENXIO; 2139 return -ENXIO;
2140 mutex_lock(&kvm->arch.vpit->pit_state.lock);
1822 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 2141 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2142 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
1823 return 0; 2143 return 0;
1824} 2144}
1825 2145
@@ -1845,7 +2165,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1845 spin_lock(&kvm->mmu_lock); 2165 spin_lock(&kvm->mmu_lock);
1846 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2166 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1847 spin_unlock(&kvm->mmu_lock); 2167 spin_unlock(&kvm->mmu_lock);
1848 kvm_flush_remote_tlbs(kvm);
1849 memslot = &kvm->memslots[log->slot]; 2168 memslot = &kvm->memslots[log->slot];
1850 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2169 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1851 memset(memslot->dirty_bitmap, 0, n); 2170 memset(memslot->dirty_bitmap, 0, n);
@@ -1869,7 +2188,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
1869 */ 2188 */
1870 union { 2189 union {
1871 struct kvm_pit_state ps; 2190 struct kvm_pit_state ps;
2191 struct kvm_pit_state2 ps2;
1872 struct kvm_memory_alias alias; 2192 struct kvm_memory_alias alias;
2193 struct kvm_pit_config pit_config;
1873 } u; 2194 } u;
1874 2195
1875 switch (ioctl) { 2196 switch (ioctl) {
@@ -1878,6 +2199,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
1878 if (r < 0) 2199 if (r < 0)
1879 goto out; 2200 goto out;
1880 break; 2201 break;
2202 case KVM_SET_IDENTITY_MAP_ADDR: {
2203 u64 ident_addr;
2204
2205 r = -EFAULT;
2206 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
2207 goto out;
2208 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
2209 if (r < 0)
2210 goto out;
2211 break;
2212 }
1881 case KVM_SET_MEMORY_REGION: { 2213 case KVM_SET_MEMORY_REGION: {
1882 struct kvm_memory_region kvm_mem; 2214 struct kvm_memory_region kvm_mem;
1883 struct kvm_userspace_memory_region kvm_userspace_mem; 2215 struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -1930,16 +2262,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
1930 } 2262 }
1931 break; 2263 break;
1932 case KVM_CREATE_PIT: 2264 case KVM_CREATE_PIT:
1933 mutex_lock(&kvm->lock); 2265 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2266 goto create_pit;
2267 case KVM_CREATE_PIT2:
2268 r = -EFAULT;
2269 if (copy_from_user(&u.pit_config, argp,
2270 sizeof(struct kvm_pit_config)))
2271 goto out;
2272 create_pit:
2273 down_write(&kvm->slots_lock);
1934 r = -EEXIST; 2274 r = -EEXIST;
1935 if (kvm->arch.vpit) 2275 if (kvm->arch.vpit)
1936 goto create_pit_unlock; 2276 goto create_pit_unlock;
1937 r = -ENOMEM; 2277 r = -ENOMEM;
1938 kvm->arch.vpit = kvm_create_pit(kvm); 2278 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
1939 if (kvm->arch.vpit) 2279 if (kvm->arch.vpit)
1940 r = 0; 2280 r = 0;
1941 create_pit_unlock: 2281 create_pit_unlock:
1942 mutex_unlock(&kvm->lock); 2282 up_write(&kvm->slots_lock);
1943 break; 2283 break;
1944 case KVM_IRQ_LINE_STATUS: 2284 case KVM_IRQ_LINE_STATUS:
1945 case KVM_IRQ_LINE: { 2285 case KVM_IRQ_LINE: {
@@ -1950,10 +2290,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
1950 goto out; 2290 goto out;
1951 if (irqchip_in_kernel(kvm)) { 2291 if (irqchip_in_kernel(kvm)) {
1952 __s32 status; 2292 __s32 status;
1953 mutex_lock(&kvm->lock); 2293 mutex_lock(&kvm->irq_lock);
1954 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2294 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1955 irq_event.irq, irq_event.level); 2295 irq_event.irq, irq_event.level);
1956 mutex_unlock(&kvm->lock); 2296 mutex_unlock(&kvm->irq_lock);
1957 if (ioctl == KVM_IRQ_LINE_STATUS) { 2297 if (ioctl == KVM_IRQ_LINE_STATUS) {
1958 irq_event.status = status; 2298 irq_event.status = status;
1959 if (copy_to_user(argp, &irq_event, 2299 if (copy_to_user(argp, &irq_event,
@@ -2042,6 +2382,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
2042 r = 0; 2382 r = 0;
2043 break; 2383 break;
2044 } 2384 }
2385 case KVM_GET_PIT2: {
2386 r = -ENXIO;
2387 if (!kvm->arch.vpit)
2388 goto out;
2389 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
2390 if (r)
2391 goto out;
2392 r = -EFAULT;
2393 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
2394 goto out;
2395 r = 0;
2396 break;
2397 }
2398 case KVM_SET_PIT2: {
2399 r = -EFAULT;
2400 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
2401 goto out;
2402 r = -ENXIO;
2403 if (!kvm->arch.vpit)
2404 goto out;
2405 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
2406 if (r)
2407 goto out;
2408 r = 0;
2409 break;
2410 }
2045 case KVM_REINJECT_CONTROL: { 2411 case KVM_REINJECT_CONTROL: {
2046 struct kvm_reinject_control control; 2412 struct kvm_reinject_control control;
2047 r = -EFAULT; 2413 r = -EFAULT;
@@ -2075,35 +2441,23 @@ static void kvm_init_msr_list(void)
2075 num_msrs_to_save = j; 2441 num_msrs_to_save = j;
2076} 2442}
2077 2443
2078/* 2444static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2079 * Only apic need an MMIO device hook, so shortcut now.. 2445 const void *v)
2080 */
2081static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
2082 gpa_t addr, int len,
2083 int is_write)
2084{ 2446{
2085 struct kvm_io_device *dev; 2447 if (vcpu->arch.apic &&
2448 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2449 return 0;
2086 2450
2087 if (vcpu->arch.apic) { 2451 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
2088 dev = &vcpu->arch.apic->dev;
2089 if (dev->in_range(dev, addr, len, is_write))
2090 return dev;
2091 }
2092 return NULL;
2093} 2452}
2094 2453
2095 2454static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2096static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
2097 gpa_t addr, int len,
2098 int is_write)
2099{ 2455{
2100 struct kvm_io_device *dev; 2456 if (vcpu->arch.apic &&
2457 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2458 return 0;
2101 2459
2102 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 2460 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
2103 if (dev == NULL)
2104 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
2105 is_write);
2106 return dev;
2107} 2461}
2108 2462
2109static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2463static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@ -2172,11 +2526,12 @@ static int emulator_read_emulated(unsigned long addr,
2172 unsigned int bytes, 2526 unsigned int bytes,
2173 struct kvm_vcpu *vcpu) 2527 struct kvm_vcpu *vcpu)
2174{ 2528{
2175 struct kvm_io_device *mmio_dev;
2176 gpa_t gpa; 2529 gpa_t gpa;
2177 2530
2178 if (vcpu->mmio_read_completed) { 2531 if (vcpu->mmio_read_completed) {
2179 memcpy(val, vcpu->mmio_data, bytes); 2532 memcpy(val, vcpu->mmio_data, bytes);
2533 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
2534 vcpu->mmio_phys_addr, *(u64 *)val);
2180 vcpu->mmio_read_completed = 0; 2535 vcpu->mmio_read_completed = 0;
2181 return X86EMUL_CONTINUE; 2536 return X86EMUL_CONTINUE;
2182 } 2537 }
@@ -2197,14 +2552,12 @@ mmio:
2197 /* 2552 /*
2198 * Is this MMIO handled locally? 2553 * Is this MMIO handled locally?
2199 */ 2554 */
2200 mutex_lock(&vcpu->kvm->lock); 2555 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2201 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 2556 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
2202 if (mmio_dev) {
2203 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2204 mutex_unlock(&vcpu->kvm->lock);
2205 return X86EMUL_CONTINUE; 2557 return X86EMUL_CONTINUE;
2206 } 2558 }
2207 mutex_unlock(&vcpu->kvm->lock); 2559
2560 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
2208 2561
2209 vcpu->mmio_needed = 1; 2562 vcpu->mmio_needed = 1;
2210 vcpu->mmio_phys_addr = gpa; 2563 vcpu->mmio_phys_addr = gpa;
@@ -2231,7 +2584,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2231 unsigned int bytes, 2584 unsigned int bytes,
2232 struct kvm_vcpu *vcpu) 2585 struct kvm_vcpu *vcpu)
2233{ 2586{
2234 struct kvm_io_device *mmio_dev;
2235 gpa_t gpa; 2587 gpa_t gpa;
2236 2588
2237 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2589 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
@@ -2249,17 +2601,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2249 return X86EMUL_CONTINUE; 2601 return X86EMUL_CONTINUE;
2250 2602
2251mmio: 2603mmio:
2604 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
2252 /* 2605 /*
2253 * Is this MMIO handled locally? 2606 * Is this MMIO handled locally?
2254 */ 2607 */
2255 mutex_lock(&vcpu->kvm->lock); 2608 if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2256 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2257 if (mmio_dev) {
2258 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2259 mutex_unlock(&vcpu->kvm->lock);
2260 return X86EMUL_CONTINUE; 2609 return X86EMUL_CONTINUE;
2261 }
2262 mutex_unlock(&vcpu->kvm->lock);
2263 2610
2264 vcpu->mmio_needed = 1; 2611 vcpu->mmio_needed = 1;
2265 vcpu->mmio_phys_addr = gpa; 2612 vcpu->mmio_phys_addr = gpa;
@@ -2297,12 +2644,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2297 unsigned int bytes, 2644 unsigned int bytes,
2298 struct kvm_vcpu *vcpu) 2645 struct kvm_vcpu *vcpu)
2299{ 2646{
2300 static int reported; 2647 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
2301
2302 if (!reported) {
2303 reported = 1;
2304 printk(KERN_WARNING "kvm: emulating exchange as write\n");
2305 }
2306#ifndef CONFIG_X86_64 2648#ifndef CONFIG_X86_64
2307 /* guests cmpxchg8b have to be emulated atomically */ 2649 /* guests cmpxchg8b have to be emulated atomically */
2308 if (bytes == 8) { 2650 if (bytes == 8) {
@@ -2348,7 +2690,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2348 2690
2349int emulate_clts(struct kvm_vcpu *vcpu) 2691int emulate_clts(struct kvm_vcpu *vcpu)
2350{ 2692{
2351 KVMTRACE_0D(CLTS, vcpu, handler);
2352 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2693 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2353 return X86EMUL_CONTINUE; 2694 return X86EMUL_CONTINUE;
2354} 2695}
@@ -2425,7 +2766,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2425 kvm_clear_exception_queue(vcpu); 2766 kvm_clear_exception_queue(vcpu);
2426 vcpu->arch.mmio_fault_cr2 = cr2; 2767 vcpu->arch.mmio_fault_cr2 = cr2;
2427 /* 2768 /*
2428 * TODO: fix x86_emulate.c to use guest_read/write_register 2769 * TODO: fix emulate.c to use guest_read/write_register
2429 * instead of direct ->regs accesses, can save hundred cycles 2770 * instead of direct ->regs accesses, can save hundred cycles
2430 * on Intel for instructions that don't read/change RSP, for 2771 * on Intel for instructions that don't read/change RSP, for
2431 * for example. 2772 * for example.
@@ -2449,14 +2790,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2449 2790
2450 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2791 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2451 2792
2452 /* Reject the instructions other than VMCALL/VMMCALL when 2793 /* Only allow emulation of specific instructions on #UD
2453 * try to emulate invalid opcode */ 2794 * (namely VMMCALL, sysenter, sysexit, syscall)*/
2454 c = &vcpu->arch.emulate_ctxt.decode; 2795 c = &vcpu->arch.emulate_ctxt.decode;
2455 if ((emulation_type & EMULTYPE_TRAP_UD) && 2796 if (emulation_type & EMULTYPE_TRAP_UD) {
2456 (!(c->twobyte && c->b == 0x01 && 2797 if (!c->twobyte)
2457 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2798 return EMULATE_FAIL;
2458 c->modrm_mod == 3 && c->modrm_rm == 1))) 2799 switch (c->b) {
2459 return EMULATE_FAIL; 2800 case 0x01: /* VMMCALL */
2801 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2802 return EMULATE_FAIL;
2803 break;
2804 case 0x34: /* sysenter */
2805 case 0x35: /* sysexit */
2806 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2807 return EMULATE_FAIL;
2808 break;
2809 case 0x05: /* syscall */
2810 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2811 return EMULATE_FAIL;
2812 break;
2813 default:
2814 return EMULATE_FAIL;
2815 }
2816
2817 if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
2818 return EMULATE_FAIL;
2819 }
2460 2820
2461 ++vcpu->stat.insn_emulation; 2821 ++vcpu->stat.insn_emulation;
2462 if (r) { 2822 if (r) {
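
[editor's note] The reworked #UD path emulates only a short whitelist of two-byte opcodes. A standalone sketch of that decision, reusing the decode field names from the patch; the inputs in main() are made up:

#include <stdio.h>
#include <stdbool.h>

static bool allow_ud_emulation(bool twobyte, unsigned char op,
                               int modrm_mod, int modrm_rm, int modrm_reg)
{
        if (!twobyte)
                return false;
        switch (op) {
        case 0x01:      /* VMMCALL */
                if (modrm_mod != 3 || modrm_rm != 1)
                        return false;
                break;
        case 0x34:      /* sysenter */
        case 0x35:      /* sysexit */
        case 0x05:      /* syscall */
                if (modrm_mod != 0 || modrm_rm != 0)
                        return false;
                break;
        default:
                return false;
        }
        return modrm_reg == 0 || modrm_reg == 3;
}

int main(void)
{
        printf("VMMCALL allowed: %d\n", allow_ud_emulation(true, 0x01, 3, 1, 0));
        printf("random opcode allowed: %d\n", allow_ud_emulation(true, 0x0f, 3, 1, 0));
        return 0;
}
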
@@ -2576,52 +2936,40 @@ int complete_pio(struct kvm_vcpu *vcpu)
2576 return 0; 2936 return 0;
2577} 2937}
2578 2938
2579static void kernel_pio(struct kvm_io_device *pio_dev, 2939static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
2580 struct kvm_vcpu *vcpu,
2581 void *pd)
2582{ 2940{
2583 /* TODO: String I/O for in kernel device */ 2941 /* TODO: String I/O for in kernel device */
2942 int r;
2584 2943
2585 mutex_lock(&vcpu->kvm->lock);
2586 if (vcpu->arch.pio.in) 2944 if (vcpu->arch.pio.in)
2587 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2945 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2588 vcpu->arch.pio.size, 2946 vcpu->arch.pio.size, pd);
2589 pd);
2590 else 2947 else
2591 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2948 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2592 vcpu->arch.pio.size, 2949 vcpu->arch.pio.size, pd);
2593 pd); 2950 return r;
2594 mutex_unlock(&vcpu->kvm->lock);
2595} 2951}
2596 2952
2597static void pio_string_write(struct kvm_io_device *pio_dev, 2953static int pio_string_write(struct kvm_vcpu *vcpu)
2598 struct kvm_vcpu *vcpu)
2599{ 2954{
2600 struct kvm_pio_request *io = &vcpu->arch.pio; 2955 struct kvm_pio_request *io = &vcpu->arch.pio;
2601 void *pd = vcpu->arch.pio_data; 2956 void *pd = vcpu->arch.pio_data;
2602 int i; 2957 int i, r = 0;
2603 2958
2604 mutex_lock(&vcpu->kvm->lock);
2605 for (i = 0; i < io->cur_count; i++) { 2959 for (i = 0; i < io->cur_count; i++) {
2606 kvm_iodevice_write(pio_dev, io->port, 2960 if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
2607 io->size, 2961 io->port, io->size, pd)) {
2608 pd); 2962 r = -EOPNOTSUPP;
2963 break;
2964 }
2609 pd += io->size; 2965 pd += io->size;
2610 } 2966 }
2611 mutex_unlock(&vcpu->kvm->lock); 2967 return r;
2612}
2613
2614static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2615 gpa_t addr, int len,
2616 int is_write)
2617{
2618 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2619} 2968}
2620 2969
2621int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2970int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2622 int size, unsigned port) 2971 int size, unsigned port)
2623{ 2972{
2624 struct kvm_io_device *pio_dev;
2625 unsigned long val; 2973 unsigned long val;
2626 2974
2627 vcpu->run->exit_reason = KVM_EXIT_IO; 2975 vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2635,19 +2983,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2635 vcpu->arch.pio.down = 0; 2983 vcpu->arch.pio.down = 0;
2636 vcpu->arch.pio.rep = 0; 2984 vcpu->arch.pio.rep = 0;
2637 2985
2638 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2986 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
2639 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2987 size, 1);
2640 handler);
2641 else
2642 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2643 handler);
2644 2988
2645 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2989 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2646 memcpy(vcpu->arch.pio_data, &val, 4); 2990 memcpy(vcpu->arch.pio_data, &val, 4);
2647 2991
2648 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2992 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
2649 if (pio_dev) {
2650 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2651 complete_pio(vcpu); 2993 complete_pio(vcpu);
2652 return 1; 2994 return 1;
2653 } 2995 }
@@ -2661,7 +3003,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2661{ 3003{
2662 unsigned now, in_page; 3004 unsigned now, in_page;
2663 int ret = 0; 3005 int ret = 0;
2664 struct kvm_io_device *pio_dev;
2665 3006
2666 vcpu->run->exit_reason = KVM_EXIT_IO; 3007 vcpu->run->exit_reason = KVM_EXIT_IO;
2667 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3008 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2674,12 +3015,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2674 vcpu->arch.pio.down = down; 3015 vcpu->arch.pio.down = down;
2675 vcpu->arch.pio.rep = rep; 3016 vcpu->arch.pio.rep = rep;
2676 3017
2677 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 3018 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
2678 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 3019 size, count);
2679 handler);
2680 else
2681 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2682 handler);
2683 3020
2684 if (!count) { 3021 if (!count) {
2685 kvm_x86_ops->skip_emulated_instruction(vcpu); 3022 kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2709,9 +3046,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2709 3046
2710 vcpu->arch.pio.guest_gva = address; 3047 vcpu->arch.pio.guest_gva = address;
2711 3048
2712 pio_dev = vcpu_find_pio_dev(vcpu, port,
2713 vcpu->arch.pio.cur_count,
2714 !vcpu->arch.pio.in);
2715 if (!vcpu->arch.pio.in) { 3049 if (!vcpu->arch.pio.in) {
2716 /* string PIO write */ 3050 /* string PIO write */
2717 ret = pio_copy_data(vcpu); 3051 ret = pio_copy_data(vcpu);
@@ -2719,16 +3053,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2719 kvm_inject_gp(vcpu, 0); 3053 kvm_inject_gp(vcpu, 0);
2720 return 1; 3054 return 1;
2721 } 3055 }
2722 if (ret == 0 && pio_dev) { 3056 if (ret == 0 && !pio_string_write(vcpu)) {
2723 pio_string_write(pio_dev, vcpu);
2724 complete_pio(vcpu); 3057 complete_pio(vcpu);
2725 if (vcpu->arch.pio.count == 0) 3058 if (vcpu->arch.pio.count == 0)
2726 ret = 1; 3059 ret = 1;
2727 } 3060 }
2728 } else if (pio_dev) 3061 }
2729 pr_unimpl(vcpu, "no string pio read support yet, " 3062 /* no string PIO read support yet */
2730 "port %x size %d count %ld\n",
2731 port, size, count);
2732 3063
2733 return ret; 3064 return ret;
2734} 3065}
@@ -2761,10 +3092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
2761 3092
2762 spin_lock(&kvm_lock); 3093 spin_lock(&kvm_lock);
2763 list_for_each_entry(kvm, &vm_list, vm_list) { 3094 list_for_each_entry(kvm, &vm_list, vm_list) {
2764 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 3095 kvm_for_each_vcpu(i, vcpu, kvm) {
2765 vcpu = kvm->vcpus[i];
2766 if (!vcpu)
2767 continue;
2768 if (vcpu->cpu != freq->cpu) 3096 if (vcpu->cpu != freq->cpu)
2769 continue; 3097 continue;
2770 if (!kvm_request_guest_time_update(vcpu)) 3098 if (!kvm_request_guest_time_update(vcpu))
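
[editor's note] kvm_for_each_vcpu() replaces the loop over every possible vcpu slot with NULL checks. A rough user-space analogue of such an iterator that only visits the vcpus actually created (the struct names and macro below are invented for illustration):

#include <stdio.h>

#define MAX_VCPUS 4

struct vcpu { int id; int cpu; };
struct vm   { int online; struct vcpu *vcpus[MAX_VCPUS]; };

#define vm_for_each_vcpu(idx, vcpup, vm)                                \
        for ((idx) = 0;                                                 \
             (idx) < (vm)->online && ((vcpup) = (vm)->vcpus[(idx)]);    \
             (idx)++)

int main(void)
{
        struct vcpu a = { .id = 0, .cpu = 2 }, b = { .id = 1, .cpu = 5 };
        struct vm vm = { .online = 2, .vcpus = { &a, &b } };
        struct vcpu *v;
        int i;

        vm_for_each_vcpu(i, v, &vm)
                printf("vcpu %d runs on cpu %d\n", v->id, v->cpu);
        return 0;
}
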
@@ -2857,7 +3185,6 @@ void kvm_arch_exit(void)
2857int kvm_emulate_halt(struct kvm_vcpu *vcpu) 3185int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2858{ 3186{
2859 ++vcpu->stat.halt_exits; 3187 ++vcpu->stat.halt_exits;
2860 KVMTRACE_0D(HLT, vcpu, handler);
2861 if (irqchip_in_kernel(vcpu->kvm)) { 3188 if (irqchip_in_kernel(vcpu->kvm)) {
2862 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 3189 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2863 return 1; 3190 return 1;
@@ -2888,7 +3215,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2888 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 3215 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2889 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 3216 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2890 3217
2891 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 3218 trace_kvm_hypercall(nr, a0, a1, a2, a3);
2892 3219
2893 if (!is_long_mode(vcpu)) { 3220 if (!is_long_mode(vcpu)) {
2894 nr &= 0xFFFFFFFF; 3221 nr &= 0xFFFFFFFF;
@@ -2898,6 +3225,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2898 a3 &= 0xFFFFFFFF; 3225 a3 &= 0xFFFFFFFF;
2899 } 3226 }
2900 3227
3228 if (kvm_x86_ops->get_cpl(vcpu) != 0) {
3229 ret = -KVM_EPERM;
3230 goto out;
3231 }
3232
2901 switch (nr) { 3233 switch (nr) {
2902 case KVM_HC_VAPIC_POLL_IRQ: 3234 case KVM_HC_VAPIC_POLL_IRQ:
2903 ret = 0; 3235 ret = 0;
@@ -2909,6 +3241,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2909 ret = -KVM_ENOSYS; 3241 ret = -KVM_ENOSYS;
2910 break; 3242 break;
2911 } 3243 }
3244out:
2912 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 3245 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2913 ++vcpu->stat.hypercalls; 3246 ++vcpu->stat.hypercalls;
2914 return r; 3247 return r;
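
[editor's note] Hypercalls are now rejected up front when the guest is not at CPL 0, with a single exit path that writes the result back. A sketch of that shape, using made-up error constants and hypercall numbers:

#include <stdio.h>

#define ERR_PERM    (-1)        /* placeholder, not the kernel's -KVM_EPERM  */
#define ERR_NOSYS   (-1000)     /* placeholder, not the kernel's -KVM_ENOSYS */
#define HC_NOP      0           /* invented hypercall number */

static long handle_hypercall(int guest_cpl, unsigned long nr)
{
        long ret;

        if (guest_cpl != 0) {
                ret = ERR_PERM;
                goto out;
        }

        switch (nr) {
        case HC_NOP:
                ret = 0;
                break;
        default:
                ret = ERR_NOSYS;
                break;
        }
out:
        return ret;             /* the real code writes this into guest RAX */
}

int main(void)
{
        printf("cpl=3 -> %ld\n", handle_hypercall(3, HC_NOP));
        printf("cpl=0 -> %ld\n", handle_hypercall(0, HC_NOP));
        return 0;
}
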
@@ -2988,8 +3321,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2988 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3321 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2989 return 0; 3322 return 0;
2990 } 3323 }
2991 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2992 (u32)((u64)value >> 32), handler);
2993 3324
2994 return value; 3325 return value;
2995} 3326}
@@ -2997,9 +3328,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2997void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 3328void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2998 unsigned long *rflags) 3329 unsigned long *rflags)
2999{ 3330{
3000 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
3001 (u32)((u64)val >> 32), handler);
3002
3003 switch (cr) { 3331 switch (cr) {
3004 case 0: 3332 case 0:
3005 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3333 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -3109,11 +3437,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3109 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3437 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3110 } 3438 }
3111 kvm_x86_ops->skip_emulated_instruction(vcpu); 3439 kvm_x86_ops->skip_emulated_instruction(vcpu);
3112 KVMTRACE_5D(CPUID, vcpu, function, 3440 trace_kvm_cpuid(function,
3113 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), 3441 kvm_register_read(vcpu, VCPU_REGS_RAX),
3114 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), 3442 kvm_register_read(vcpu, VCPU_REGS_RBX),
3115 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), 3443 kvm_register_read(vcpu, VCPU_REGS_RCX),
3116 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); 3444 kvm_register_read(vcpu, VCPU_REGS_RDX));
3117} 3445}
3118EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3446EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3119 3447
@@ -3179,6 +3507,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3179 if (!kvm_x86_ops->update_cr8_intercept) 3507 if (!kvm_x86_ops->update_cr8_intercept)
3180 return; 3508 return;
3181 3509
3510 if (!vcpu->arch.apic)
3511 return;
3512
3182 if (!vcpu->arch.apic->vapic_addr) 3513 if (!vcpu->arch.apic->vapic_addr)
3183 max_irr = kvm_lapic_find_highest_irr(vcpu); 3514 max_irr = kvm_lapic_find_highest_irr(vcpu);
3184 else 3515 else
@@ -3192,12 +3523,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3192 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3523 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3193} 3524}
3194 3525
3195static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3526static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3196{ 3527{
3197 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3198 kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3199
3200 /* try to reinject previous events if any */ 3528 /* try to reinject previous events if any */
3529 if (vcpu->arch.exception.pending) {
3530 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
3531 vcpu->arch.exception.has_error_code,
3532 vcpu->arch.exception.error_code);
3533 return;
3534 }
3535
3201 if (vcpu->arch.nmi_injected) { 3536 if (vcpu->arch.nmi_injected) {
3202 kvm_x86_ops->set_nmi(vcpu); 3537 kvm_x86_ops->set_nmi(vcpu);
3203 return; 3538 return;
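
[editor's note] inject_pending_event() re-delivers a pending exception first, then a pending NMI, then an interrupt. A toy illustration of that priority order (the struct is invented; the real code calls into kvm_x86_ops):

#include <stdio.h>
#include <stdbool.h>

struct pending_events {
        bool exception;
        bool nmi;
        bool irq;
};

static const char *pick_event(const struct pending_events *e)
{
        if (e->exception)
                return "exception";
        if (e->nmi)
                return "nmi";
        if (e->irq)
                return "irq";
        return "none";
}

int main(void)
{
        struct pending_events e = { .exception = true, .nmi = true, .irq = true };

        printf("injected first: %s\n", pick_event(&e));
        return 0;
}
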
@@ -3271,16 +3606,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3271 smp_mb__after_clear_bit(); 3606 smp_mb__after_clear_bit();
3272 3607
3273 if (vcpu->requests || need_resched() || signal_pending(current)) { 3608 if (vcpu->requests || need_resched() || signal_pending(current)) {
3609 set_bit(KVM_REQ_KICK, &vcpu->requests);
3274 local_irq_enable(); 3610 local_irq_enable();
3275 preempt_enable(); 3611 preempt_enable();
3276 r = 1; 3612 r = 1;
3277 goto out; 3613 goto out;
3278 } 3614 }
3279 3615
3280 if (vcpu->arch.exception.pending) 3616 inject_pending_event(vcpu, kvm_run);
3281 __queue_exception(vcpu);
3282 else
3283 inject_pending_irq(vcpu, kvm_run);
3284 3617
3285 /* enable NMI/IRQ window open exits if needed */ 3618 /* enable NMI/IRQ window open exits if needed */
3286 if (vcpu->arch.nmi_pending) 3619 if (vcpu->arch.nmi_pending)
@@ -3297,14 +3630,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3297 3630
3298 kvm_guest_enter(); 3631 kvm_guest_enter();
3299 3632
3300 get_debugreg(vcpu->arch.host_dr6, 6);
3301 get_debugreg(vcpu->arch.host_dr7, 7);
3302 if (unlikely(vcpu->arch.switch_db_regs)) { 3633 if (unlikely(vcpu->arch.switch_db_regs)) {
3303 get_debugreg(vcpu->arch.host_db[0], 0);
3304 get_debugreg(vcpu->arch.host_db[1], 1);
3305 get_debugreg(vcpu->arch.host_db[2], 2);
3306 get_debugreg(vcpu->arch.host_db[3], 3);
3307
3308 set_debugreg(0, 7); 3634 set_debugreg(0, 7);
3309 set_debugreg(vcpu->arch.eff_db[0], 0); 3635 set_debugreg(vcpu->arch.eff_db[0], 0);
3310 set_debugreg(vcpu->arch.eff_db[1], 1); 3636 set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -3312,18 +3638,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3312 set_debugreg(vcpu->arch.eff_db[3], 3); 3638 set_debugreg(vcpu->arch.eff_db[3], 3);
3313 } 3639 }
3314 3640
3315 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3641 trace_kvm_entry(vcpu->vcpu_id);
3316 kvm_x86_ops->run(vcpu, kvm_run); 3642 kvm_x86_ops->run(vcpu, kvm_run);
3317 3643
3318 if (unlikely(vcpu->arch.switch_db_regs)) { 3644 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
3319 set_debugreg(0, 7); 3645 set_debugreg(current->thread.debugreg0, 0);
3320 set_debugreg(vcpu->arch.host_db[0], 0); 3646 set_debugreg(current->thread.debugreg1, 1);
3321 set_debugreg(vcpu->arch.host_db[1], 1); 3647 set_debugreg(current->thread.debugreg2, 2);
3322 set_debugreg(vcpu->arch.host_db[2], 2); 3648 set_debugreg(current->thread.debugreg3, 3);
3323 set_debugreg(vcpu->arch.host_db[3], 3); 3649 set_debugreg(current->thread.debugreg6, 6);
3650 set_debugreg(current->thread.debugreg7, 7);
3324 } 3651 }
3325 set_debugreg(vcpu->arch.host_dr6, 6);
3326 set_debugreg(vcpu->arch.host_dr7, 7);
3327 3652
3328 set_bit(KVM_REQ_KICK, &vcpu->requests); 3653 set_bit(KVM_REQ_KICK, &vcpu->requests);
3329 local_irq_enable(); 3654 local_irq_enable();
@@ -3653,11 +3978,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu,
3653static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3978static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3654 struct kvm_segment *kvm_desct) 3979 struct kvm_segment *kvm_desct)
3655{ 3980{
3656 kvm_desct->base = seg_desc->base0; 3981 kvm_desct->base = get_desc_base(seg_desc);
3657 kvm_desct->base |= seg_desc->base1 << 16; 3982 kvm_desct->limit = get_desc_limit(seg_desc);
3658 kvm_desct->base |= seg_desc->base2 << 24;
3659 kvm_desct->limit = seg_desc->limit0;
3660 kvm_desct->limit |= seg_desc->limit << 16;
3661 if (seg_desc->g) { 3983 if (seg_desc->g) {
3662 kvm_desct->limit <<= 12; 3984 kvm_desct->limit <<= 12;
3663 kvm_desct->limit |= 0xfff; 3985 kvm_desct->limit |= 0xfff;
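
[editor's note] get_desc_base()/get_desc_limit() replace the open-coded shifting of the split base/limit fields. A standalone sketch of the arithmetic those helpers perform on an x86 segment descriptor (field layout written out explicitly; the values in main() are arbitrary):

#include <stdio.h>
#include <stdint.h>

struct seg_desc {
        uint16_t limit0;
        uint16_t base0;
        uint8_t  base1;
        uint8_t  access;                /* type/s/dpl/p byte, unused here */
        uint8_t  limit1_flags;          /* limit[19:16] in the low nibble */
        uint8_t  base2;
};

static uint32_t desc_base(const struct seg_desc *d)
{
        return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

static uint32_t desc_limit(const struct seg_desc *d)
{
        return d->limit0 | ((uint32_t)(d->limit1_flags & 0x0f) << 16);
}

int main(void)
{
        struct seg_desc d = {
                .limit0 = 0xffff, .base0 = 0x3456,
                .base1  = 0x12,   .limit1_flags = 0x0f, .base2 = 0xab,
        };

        printf("base = 0x%08x, limit = 0x%05x\n", desc_base(&d), desc_limit(&d));
        return 0;
}
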
@@ -3701,7 +4023,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3701static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4023static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3702 struct desc_struct *seg_desc) 4024 struct desc_struct *seg_desc)
3703{ 4025{
3704 gpa_t gpa;
3705 struct descriptor_table dtable; 4026 struct descriptor_table dtable;
3706 u16 index = selector >> 3; 4027 u16 index = selector >> 3;
3707 4028
@@ -3711,16 +4032,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3711 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4032 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3712 return 1; 4033 return 1;
3713 } 4034 }
3714 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 4035 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
3715 gpa += index * 8;
3716 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3717} 4036}
3718 4037
3719/* allowed just for 8 bytes segments */ 4038/* allowed just for 8 bytes segments */
3720static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4039static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3721 struct desc_struct *seg_desc) 4040 struct desc_struct *seg_desc)
3722{ 4041{
3723 gpa_t gpa;
3724 struct descriptor_table dtable; 4042 struct descriptor_table dtable;
3725 u16 index = selector >> 3; 4043 u16 index = selector >> 3;
3726 4044
@@ -3728,19 +4046,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3728 4046
3729 if (dtable.limit < index * 8 + 7) 4047 if (dtable.limit < index * 8 + 7)
3730 return 1; 4048 return 1;
3731 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 4049 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
3732 gpa += index * 8;
3733 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3734} 4050}
3735 4051
3736static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 4052static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3737 struct desc_struct *seg_desc) 4053 struct desc_struct *seg_desc)
3738{ 4054{
3739 u32 base_addr; 4055 u32 base_addr = get_desc_base(seg_desc);
3740
3741 base_addr = seg_desc->base0;
3742 base_addr |= (seg_desc->base1 << 16);
3743 base_addr |= (seg_desc->base2 << 24);
3744 4056
3745 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4057 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3746} 4058}
@@ -3785,12 +4097,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
3785 return 0; 4097 return 0;
3786} 4098}
3787 4099
4100static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4101{
4102 return (seg != VCPU_SREG_LDTR) &&
4103 (seg != VCPU_SREG_TR) &&
4104 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
4105}
4106
3788int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4107int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3789 int type_bits, int seg) 4108 int type_bits, int seg)
3790{ 4109{
3791 struct kvm_segment kvm_seg; 4110 struct kvm_segment kvm_seg;
3792 4111
3793 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 4112 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
3794 return kvm_load_realmode_segment(vcpu, selector, seg); 4113 return kvm_load_realmode_segment(vcpu, selector, seg);
3795 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 4114 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3796 return 1; 4115 return 1;
@@ -4029,7 +4348,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4029 } 4348 }
4030 } 4349 }
4031 4350
4032 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 4351 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
4033 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 4352 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4034 return 1; 4353 return 1;
4035 } 4354 }
@@ -4099,13 +4418,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4099 4418
4100 vcpu->arch.cr2 = sregs->cr2; 4419 vcpu->arch.cr2 = sregs->cr2;
4101 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4420 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4102 4421 vcpu->arch.cr3 = sregs->cr3;
4103 down_read(&vcpu->kvm->slots_lock);
4104 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4105 vcpu->arch.cr3 = sregs->cr3;
4106 else
4107 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4108 up_read(&vcpu->kvm->slots_lock);
4109 4422
4110 kvm_set_cr8(vcpu, sregs->cr8); 4423 kvm_set_cr8(vcpu, sregs->cr8);
4111 4424
@@ -4147,8 +4460,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4147 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4460 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4148 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4461 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4149 4462
4463 update_cr8_intercept(vcpu);
4464
4150 /* Older userspace won't unhalt the vcpu on reset. */ 4465 /* Older userspace won't unhalt the vcpu on reset. */
4151 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 4466 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4152 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 4467 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4153 !(vcpu->arch.cr0 & X86_CR0_PE)) 4468 !(vcpu->arch.cr0 & X86_CR0_PE))
4154 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4469 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4419,7 +4734,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4419 kvm = vcpu->kvm; 4734 kvm = vcpu->kvm;
4420 4735
4421 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4736 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4422 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 4737 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
4423 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4738 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4424 else 4739 else
4425 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 4740 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -4441,6 +4756,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4441 goto fail_mmu_destroy; 4756 goto fail_mmu_destroy;
4442 } 4757 }
4443 4758
4759 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
4760 GFP_KERNEL);
4761 if (!vcpu->arch.mce_banks) {
4762 r = -ENOMEM;
4763 goto fail_mmu_destroy;
4764 }
4765 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4766
4444 return 0; 4767 return 0;
4445 4768
4446fail_mmu_destroy: 4769fail_mmu_destroy:
@@ -4488,20 +4811,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4488static void kvm_free_vcpus(struct kvm *kvm) 4811static void kvm_free_vcpus(struct kvm *kvm)
4489{ 4812{
4490 unsigned int i; 4813 unsigned int i;
4814 struct kvm_vcpu *vcpu;
4491 4815
4492 /* 4816 /*
4493 * Unpin any mmu pages first. 4817 * Unpin any mmu pages first.
4494 */ 4818 */
4495 for (i = 0; i < KVM_MAX_VCPUS; ++i) 4819 kvm_for_each_vcpu(i, vcpu, kvm)
4496 if (kvm->vcpus[i]) 4820 kvm_unload_vcpu_mmu(vcpu);
4497 kvm_unload_vcpu_mmu(kvm->vcpus[i]); 4821 kvm_for_each_vcpu(i, vcpu, kvm)
4498 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 4822 kvm_arch_vcpu_free(vcpu);
4499 if (kvm->vcpus[i]) { 4823
4500 kvm_arch_vcpu_free(kvm->vcpus[i]); 4824 mutex_lock(&kvm->lock);
4501 kvm->vcpus[i] = NULL; 4825 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
4502 } 4826 kvm->vcpus[i] = NULL;
4503 }
4504 4827
4828 atomic_set(&kvm->online_vcpus, 0);
4829 mutex_unlock(&kvm->lock);
4505} 4830}
4506 4831
4507void kvm_arch_sync_events(struct kvm *kvm) 4832void kvm_arch_sync_events(struct kvm *kvm)
@@ -4578,7 +4903,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4578 4903
4579 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4904 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4580 spin_unlock(&kvm->mmu_lock); 4905 spin_unlock(&kvm->mmu_lock);
4581 kvm_flush_remote_tlbs(kvm);
4582 4906
4583 return 0; 4907 return 0;
4584} 4908}
@@ -4592,8 +4916,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
4592int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4916int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4593{ 4917{
4594 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4918 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4595 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 4919 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4596 || vcpu->arch.nmi_pending; 4920 || vcpu->arch.nmi_pending ||
4921 (kvm_arch_interrupt_allowed(vcpu) &&
4922 kvm_cpu_has_interrupt(vcpu));
4597} 4923}
4598 4924
4599void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4925void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -4617,3 +4943,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4617{ 4943{
4618 return kvm_x86_ops->interrupt_allowed(vcpu); 4944 return kvm_x86_ops->interrupt_allowed(vcpu);
4619} 4945}
4946
4947EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4948EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4949EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4950EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 4c8e10af78e8..5eadea585d2a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
31{ 31{
32 return (nr == BP_VECTOR) || (nr == OF_VECTOR); 32 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
33} 33}
34
35struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
36 u32 function, u32 index);
37
34#endif 38#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d677fa9ca650..7e59dc1d3fc2 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1135,11 +1135,6 @@ static struct notifier_block paniced = {
1135/* Setting up memory is fairly easy. */ 1135/* Setting up memory is fairly easy. */
1136static __init char *lguest_memory_setup(void) 1136static __init char *lguest_memory_setup(void)
1137{ 1137{
1138 /* We do this here and not earlier because lockcheck used to barf if we
1139 * did it before start_kernel(). I think we fixed that, so it'd be
1140 * nice to move it back to lguest_init. Patch welcome... */
1141 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
1142
1143 /* 1138 /*
1144 *The Linux bootloader header contains an "e820" memory map: the 1139 *The Linux bootloader header contains an "e820" memory map: the
1145 * Launcher populated the first entry with our memory limit. 1140 * Launcher populated the first entry with our memory limit.
@@ -1262,7 +1257,6 @@ __init void lguest_init(void)
1262 */ 1257 */
1263 1258
1264 /* Interrupt-related operations */ 1259 /* Interrupt-related operations */
1265 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1266 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1260 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1267 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1261 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
1268 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); 1262 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
@@ -1270,7 +1264,6 @@ __init void lguest_init(void)
1270 pv_irq_ops.safe_halt = lguest_safe_halt; 1264 pv_irq_ops.safe_halt = lguest_safe_halt;
1271 1265
1272 /* Setup operations */ 1266 /* Setup operations */
1273 pv_init_ops.memory_setup = lguest_memory_setup;
1274 pv_init_ops.patch = lguest_patch; 1267 pv_init_ops.patch = lguest_patch;
1275 1268
1276 /* Intercepts of various CPU instructions */ 1269 /* Intercepts of various CPU instructions */
@@ -1320,10 +1313,11 @@ __init void lguest_init(void)
1320 set_lguest_basic_apic_ops(); 1313 set_lguest_basic_apic_ops();
1321#endif 1314#endif
1322 1315
1323 /* Time operations */ 1316 x86_init.resources.memory_setup = lguest_memory_setup;
1324 pv_time_ops.get_wallclock = lguest_get_wallclock; 1317 x86_init.irqs.intr_init = lguest_init_IRQ;
1325 pv_time_ops.time_init = lguest_time_init; 1318 x86_init.timers.timer_init = lguest_time_init;
1326 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1319 x86_platform.calibrate_tsc = lguest_tsc_khz;
1320 x86_platform.get_wallclock = lguest_get_wallclock;
1327 1321
1328 /* 1322 /*
1329 * Now is a good time to look at the implementations of these functions 1323 * Now is a good time to look at the implementations of these functions
@@ -1365,10 +1359,13 @@ __init void lguest_init(void)
1365 1359
1366 /* 1360 /*
1367 * If we don't initialize the lock dependency checker now, it crashes 1361 * If we don't initialize the lock dependency checker now, it crashes
1368 * paravirt_disable_iospace. 1362 * atomic_notifier_chain_register, then paravirt_disable_iospace.
1369 */ 1363 */
1370 lockdep_init(); 1364 lockdep_init();
1371 1365
1366 /* Hook in our special panic hypercall code. */
1367 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
1368
1372 /* 1369 /*
1373 * The IDE code spends about 3 seconds probing for disks: if we reserve 1370 * The IDE code spends about 3 seconds probing for disks: if we reserve
1374 * all the I/O ports up front it can't get them and so doesn't probe. 1371 * all the I/O ports up front it can't get them and so doesn't probe.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 07c31899c9c2..9e609206fac9 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -9,6 +9,8 @@ lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o 9lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
11 11
12obj-y += msr-reg.o msr-reg-export.o
13
12ifeq ($(CONFIG_X86_32),y) 14ifeq ($(CONFIG_X86_32),y)
13 obj-y += atomic64_32.o 15 obj-y += atomic64_32.o
14 lib-y += checksum_32.o 16 lib-y += checksum_32.o
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
new file mode 100644
index 000000000000..a311cc59b65d
--- /dev/null
+++ b/arch/x86/lib/msr-reg-export.c
@@ -0,0 +1,5 @@
1#include <linux/module.h>
2#include <asm/msr.h>
3
4EXPORT_SYMBOL(native_rdmsr_safe_regs);
5EXPORT_SYMBOL(native_wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
new file mode 100644
index 000000000000..69fa10623f21
--- /dev/null
+++ b/arch/x86/lib/msr-reg.S
@@ -0,0 +1,102 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <asm/dwarf2.h>
4#include <asm/asm.h>
5#include <asm/msr.h>
6
7#ifdef CONFIG_X86_64
8/*
9 * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
10 *
11 * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
12 *
13 */
14.macro op_safe_regs op
15ENTRY(native_\op\()_safe_regs)
16 CFI_STARTPROC
17 pushq_cfi %rbx
18 pushq_cfi %rbp
19 movq %rdi, %r10 /* Save pointer */
20 xorl %r11d, %r11d /* Return value */
21 movl (%rdi), %eax
22 movl 4(%rdi), %ecx
23 movl 8(%rdi), %edx
24 movl 12(%rdi), %ebx
25 movl 20(%rdi), %ebp
26 movl 24(%rdi), %esi
27 movl 28(%rdi), %edi
28 CFI_REMEMBER_STATE
291: \op
302: movl %eax, (%r10)
31 movl %r11d, %eax /* Return value */
32 movl %ecx, 4(%r10)
33 movl %edx, 8(%r10)
34 movl %ebx, 12(%r10)
35 movl %ebp, 20(%r10)
36 movl %esi, 24(%r10)
37 movl %edi, 28(%r10)
38 popq_cfi %rbp
39 popq_cfi %rbx
40 ret
413:
42 CFI_RESTORE_STATE
43 movl $-EIO, %r11d
44 jmp 2b
45
46 _ASM_EXTABLE(1b, 3b)
47 CFI_ENDPROC
48ENDPROC(native_\op\()_safe_regs)
49.endm
50
51#else /* X86_32 */
52
53.macro op_safe_regs op
54ENTRY(native_\op\()_safe_regs)
55 CFI_STARTPROC
56 pushl_cfi %ebx
57 pushl_cfi %ebp
58 pushl_cfi %esi
59 pushl_cfi %edi
60 pushl_cfi $0 /* Return value */
61 pushl_cfi %eax
62 movl 4(%eax), %ecx
63 movl 8(%eax), %edx
64 movl 12(%eax), %ebx
65 movl 20(%eax), %ebp
66 movl 24(%eax), %esi
67 movl 28(%eax), %edi
68 movl (%eax), %eax
69 CFI_REMEMBER_STATE
701: \op
712: pushl_cfi %eax
72 movl 4(%esp), %eax
73 popl_cfi (%eax)
74 addl $4, %esp
75 CFI_ADJUST_CFA_OFFSET -4
76 movl %ecx, 4(%eax)
77 movl %edx, 8(%eax)
78 movl %ebx, 12(%eax)
79 movl %ebp, 20(%eax)
80 movl %esi, 24(%eax)
81 movl %edi, 28(%eax)
82 popl_cfi %eax
83 popl_cfi %edi
84 popl_cfi %esi
85 popl_cfi %ebp
86 popl_cfi %ebx
87 ret
883:
89 CFI_RESTORE_STATE
90 movl $-EIO, 4(%esp)
91 jmp 2b
92
93 _ASM_EXTABLE(1b, 3b)
94 CFI_ENDPROC
95ENDPROC(native_\op\()_safe_regs)
96.endm
97
98#endif
99
100op_safe_regs rdmsr
101op_safe_regs wrmsr
102
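
[editor's note] The new native_{rdmsr,wrmsr}_safe_regs entry points exchange the whole 32-bit GPR set through a u32 gprs[8] array ordered eax, ecx, edx, ebx, esp, ebp, esi, edi. A sketch of how a caller would lay out that array and combine the edx:eax result; no real RDMSR is executed here and the MSR index is only an example:

#include <stdio.h>
#include <stdint.h>

enum { R_EAX, R_ECX, R_EDX, R_EBX, R_ESP, R_EBP, R_ESI, R_EDI };

static uint64_t combine_edx_eax(const uint32_t *gprs)
{
        return ((uint64_t)gprs[R_EDX] << 32) | gprs[R_EAX];
}

int main(void)
{
        uint32_t gprs[8] = { 0 };

        gprs[R_ECX] = 0x10;             /* MSR index goes in ecx (example) */
        /* ... rdmsr_safe_regs(gprs) would run here inside the kernel ... */
        gprs[R_EAX] = 0xdeadbeef;       /* pretend result, low half  */
        gprs[R_EDX] = 0x00000001;       /* pretend result, high half */

        printf("value = 0x%016llx\n",
               (unsigned long long)combine_edx_eax(gprs));
        return 0;
}
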
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index caa24aca8115..33a1e3ca22d8 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -175,3 +175,52 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
175 return err ? err : rv.err; 175 return err ? err : rv.err;
176} 176}
177EXPORT_SYMBOL(wrmsr_safe_on_cpu); 177EXPORT_SYMBOL(wrmsr_safe_on_cpu);
178
179/*
180 * These variants are significantly slower, but allows control over
181 * the entire 32-bit GPR set.
182 */
183struct msr_regs_info {
184 u32 *regs;
185 int err;
186};
187
188static void __rdmsr_safe_regs_on_cpu(void *info)
189{
190 struct msr_regs_info *rv = info;
191
192 rv->err = rdmsr_safe_regs(rv->regs);
193}
194
195static void __wrmsr_safe_regs_on_cpu(void *info)
196{
197 struct msr_regs_info *rv = info;
198
199 rv->err = wrmsr_safe_regs(rv->regs);
200}
201
202int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
203{
204 int err;
205 struct msr_regs_info rv;
206
207 rv.regs = regs;
208 rv.err = -EIO;
209 err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
210
211 return err ? err : rv.err;
212}
213EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
214
215int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
216{
217 int err;
218 struct msr_regs_info rv;
219
220 rv.regs = regs;
221 rv.err = -EIO;
222 err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
223
224 return err ? err : rv.err;
225}
226EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index eefdeee8a871..9b5a9f59a478 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,9 @@
1obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o gup.o 2 pat.o pgtable.o physaddr.o gup.o
3
4# Make sure __phys_addr has no stackprotector
5nostackp := $(call cc-option, -fno-stack-protector)
6CFLAGS_physaddr.o := $(nostackp)
3 7
4obj-$(CONFIG_SMP) += tlb.o 8obj-$(CONFIG_SMP) += tlb.o
5 9
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index bfae139182ff..f4cee9028cf0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,7 +10,7 @@
10#include <linux/bootmem.h> /* max_low_pfn */ 10#include <linux/bootmem.h> /* max_low_pfn */
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_counter.h> /* perf_swcounter_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14 14
15#include <asm/traps.h> /* dotraplinkage, ... */ 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 16#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
167 info.si_errno = 0; 167 info.si_errno = 0;
168 info.si_code = si_code; 168 info.si_code = si_code;
169 info.si_addr = (void __user *)address; 169 info.si_addr = (void __user *)address;
170 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
170 171
171 force_sig_info(si_signo, &info, tsk); 172 force_sig_info(si_signo, &info, tsk);
172} 173}
@@ -285,26 +286,25 @@ check_v8086_mode(struct pt_regs *regs, unsigned long address,
285 tsk->thread.screen_bitmap |= 1 << bit; 286 tsk->thread.screen_bitmap |= 1 << bit;
286} 287}
287 288
288static void dump_pagetable(unsigned long address) 289static bool low_pfn(unsigned long pfn)
289{ 290{
290 __typeof__(pte_val(__pte(0))) page; 291 return pfn < max_low_pfn;
292}
291 293
292 page = read_cr3(); 294static void dump_pagetable(unsigned long address)
293 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; 295{
296 pgd_t *base = __va(read_cr3());
297 pgd_t *pgd = &base[pgd_index(address)];
298 pmd_t *pmd;
299 pte_t *pte;
294 300
295#ifdef CONFIG_X86_PAE 301#ifdef CONFIG_X86_PAE
296 printk("*pdpt = %016Lx ", page); 302 printk("*pdpt = %016Lx ", pgd_val(*pgd));
297 if ((page >> PAGE_SHIFT) < max_low_pfn 303 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
298 && page & _PAGE_PRESENT) { 304 goto out;
299 page &= PAGE_MASK;
300 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
301 & (PTRS_PER_PMD - 1)];
302 printk(KERN_CONT "*pde = %016Lx ", page);
303 page &= ~_PAGE_NX;
304 }
305#else
306 printk("*pde = %08lx ", page);
307#endif 305#endif
306 pmd = pmd_offset(pud_offset(pgd, address), address);
307 printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
308 308
309 /* 309 /*
310 * We must not directly access the pte in the highpte 310 * We must not directly access the pte in the highpte
@@ -312,16 +312,12 @@ static void dump_pagetable(unsigned long address)
312 * And let's rather not kmap-atomic the pte, just in case 312 * And let's rather not kmap-atomic the pte, just in case
313 * it's allocated already: 313 * it's allocated already:
314 */ 314 */
315 if ((page >> PAGE_SHIFT) < max_low_pfn 315 if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
316 && (page & _PAGE_PRESENT) 316 goto out;
317 && !(page & _PAGE_PSE)) {
318
319 page &= PAGE_MASK;
320 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
321 & (PTRS_PER_PTE - 1)];
322 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
323 }
324 317
318 pte = pte_offset_kernel(pmd, address);
319 printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
320out:
325 printk("\n"); 321 printk("\n");
326} 322}
327 323
@@ -450,16 +446,12 @@ static int bad_address(void *p)
450 446
451static void dump_pagetable(unsigned long address) 447static void dump_pagetable(unsigned long address)
452{ 448{
453 pgd_t *pgd; 449 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
450 pgd_t *pgd = base + pgd_index(address);
454 pud_t *pud; 451 pud_t *pud;
455 pmd_t *pmd; 452 pmd_t *pmd;
456 pte_t *pte; 453 pte_t *pte;
457 454
458 pgd = (pgd_t *)read_cr3();
459
460 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
461
462 pgd += pgd_index(address);
463 if (bad_address(pgd)) 455 if (bad_address(pgd))
464 goto bad; 456 goto bad;
465 457
@@ -799,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
799} 791}
800 792
801static void 793static void
802do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) 794do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
795 unsigned int fault)
803{ 796{
804 struct task_struct *tsk = current; 797 struct task_struct *tsk = current;
805 struct mm_struct *mm = tsk->mm; 798 struct mm_struct *mm = tsk->mm;
799 int code = BUS_ADRERR;
806 800
807 up_read(&mm->mmap_sem); 801 up_read(&mm->mmap_sem);
808 802
@@ -818,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
818 tsk->thread.error_code = error_code; 812 tsk->thread.error_code = error_code;
819 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
820 814
821 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 815#ifdef CONFIG_MEMORY_FAILURE
816 if (fault & VM_FAULT_HWPOISON) {
817 printk(KERN_ERR
818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
819 tsk->comm, tsk->pid, address);
820 code = BUS_MCEERR_AR;
821 }
822#endif
823 force_sig_info_fault(SIGBUS, code, address, tsk);
822} 824}
823 825
824static noinline void 826static noinline void
@@ -828,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
828 if (fault & VM_FAULT_OOM) { 830 if (fault & VM_FAULT_OOM) {
829 out_of_memory(regs, error_code, address); 831 out_of_memory(regs, error_code, address);
830 } else { 832 } else {
831 if (fault & VM_FAULT_SIGBUS) 833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
832 do_sigbus(regs, error_code, address); 834 do_sigbus(regs, error_code, address, fault);
833 else 835 else
834 BUG(); 836 BUG();
835 } 837 }
@@ -1026,7 +1028,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1026 if (unlikely(error_code & PF_RSVD)) 1028 if (unlikely(error_code & PF_RSVD))
1027 pgtable_bad(regs, error_code, address); 1029 pgtable_bad(regs, error_code, address);
1028 1030
1029 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); 1031 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
1030 1032
1031 /* 1033 /*
1032 * If we're in an interrupt, have no user context or are running 1034 * If we're in an interrupt, have no user context or are running
@@ -1123,11 +1125,11 @@ good_area:
1123 1125
1124 if (fault & VM_FAULT_MAJOR) { 1126 if (fault & VM_FAULT_MAJOR) {
1125 tsk->maj_flt++; 1127 tsk->maj_flt++;
1126 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1128 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1127 regs, address); 1129 regs, address);
1128 } else { 1130 } else {
1129 tsk->min_flt++; 1131 tsk->min_flt++;
1130 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1132 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1131 regs, address); 1133 regs, address);
1132 } 1134 }
1133 1135
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 2112ed55e7ea..63a6ba66cbe0 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -24,7 +24,7 @@ void kunmap(struct page *page)
24 * no global lock is needed and because the kmap code must perform a global TLB 24 * no global lock is needed and because the kmap code must perform a global TLB
25 * invalidation when the kmap pool wraps. 25 * invalidation when the kmap pool wraps.
26 * 26 *
27 * However when holding an atomic kmap is is not legal to sleep, so atomic 27 * However when holding an atomic kmap it is not legal to sleep, so atomic
28 * kmaps are appropriate for short, tight code paths only. 28 * kmaps are appropriate for short, tight code paths only.
29 */ 29 */
30void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) 30void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
@@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic);
106EXPORT_SYMBOL(kmap_atomic_prot); 106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page);
107 108
108void __init set_highmem_pages_init(void) 109void __init set_highmem_pages_init(void)
109{ 110{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3cd7711bb949..30938c1d8d5d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
84#ifdef CONFIG_X86_PAE 84#ifdef CONFIG_X86_PAE
85 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 85 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
86 if (after_bootmem) 86 if (after_bootmem)
87 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 87 pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
88 else 88 else
89 pmd_table = (pmd_t *)alloc_low_page(); 89 pmd_table = (pmd_t *)alloc_low_page();
90 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 90 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
@@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
116#endif 116#endif
117 if (!page_table) 117 if (!page_table)
118 page_table = 118 page_table =
119 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 119 (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
120 } else 120 } else
121 page_table = (pte_t *)alloc_low_page(); 121 page_table = (pte_t *)alloc_low_page();
122 122
@@ -857,8 +857,6 @@ static void __init test_wp_bit(void)
857 } 857 }
858} 858}
859 859
860static struct kcore_list kcore_mem, kcore_vmalloc;
861
862void __init mem_init(void) 860void __init mem_init(void)
863{ 861{
864 int codesize, reservedpages, datasize, initsize; 862 int codesize, reservedpages, datasize, initsize;
@@ -886,13 +884,9 @@ void __init mem_init(void)
886 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 884 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
887 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 885 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
888 886
889 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
890 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
891 VMALLOC_END-VMALLOC_START);
892
893 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " 887 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
894 "%dk reserved, %dk data, %dk init, %ldk highmem)\n", 888 "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
895 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 889 nr_free_pages() << (PAGE_SHIFT-10),
896 num_physpages << (PAGE_SHIFT-10), 890 num_physpages << (PAGE_SHIFT-10),
897 codesize >> 10, 891 codesize >> 10,
898 reservedpages << (PAGE_SHIFT-10), 892 reservedpages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ea56b8cbb6a6..5a4398a6006b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -647,8 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
647 647
648#endif /* CONFIG_MEMORY_HOTPLUG */ 648#endif /* CONFIG_MEMORY_HOTPLUG */
649 649
650static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 650static struct kcore_list kcore_vsyscall;
651 kcore_modules, kcore_vsyscall;
652 651
653void __init mem_init(void) 652void __init mem_init(void)
654{ 653{
@@ -677,17 +676,12 @@ void __init mem_init(void)
677 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 676 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
678 677
679 /* Register memory areas for /proc/kcore */ 678 /* Register memory areas for /proc/kcore */
680 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
681 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
682 VMALLOC_END-VMALLOC_START);
683 kclist_add(&kcore_kernel, &_stext, _end - _stext);
684 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
685 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 679 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
686 VSYSCALL_END - VSYSCALL_START); 680 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
687 681
688 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 682 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
689 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", 683 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
690 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 684 nr_free_pages() << (PAGE_SHIFT-10),
691 max_pfn << (PAGE_SHIFT-10), 685 max_pfn << (PAGE_SHIFT-10),
692 codesize >> 10, 686 codesize >> 10,
693 absent_pages << (PAGE_SHIFT-10), 687 absent_pages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index fe6f84ca121e..84e236ce76ba 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -21,7 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23 23
24int is_io_mapping_possible(resource_size_t base, unsigned long size) 24static int is_io_mapping_possible(resource_size_t base, unsigned long size)
25{ 25{
26#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) 26#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
27 /* There is no way to map greater than 1 << 32 address without PAE */ 27 /* There is no way to map greater than 1 << 32 address without PAE */
@@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
30#endif 30#endif
31 return 1; 31 return 1;
32} 32}
33EXPORT_SYMBOL_GPL(is_io_mapping_possible); 33
34int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
35{
36 unsigned long flag = _PAGE_CACHE_WC;
37 int ret;
38
39 if (!is_io_mapping_possible(base, size))
40 return -EINVAL;
41
42 ret = io_reserve_memtype(base, base + size, &flag);
43 if (ret)
44 return ret;
45
46 *prot = __pgprot(__PAGE_KERNEL | flag);
47 return 0;
48}
49EXPORT_SYMBOL_GPL(iomap_create_wc);
50
51void
52iomap_free(resource_size_t base, unsigned long size)
53{
54 io_free_memtype(base, base + size);
55}
56EXPORT_SYMBOL_GPL(iomap_free);
34 57
35void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 58void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
36{ 59{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 8a450930834f..334e63ca7b2b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,77 +22,7 @@
22#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
23#include <asm/pat.h> 23#include <asm/pat.h>
24 24
25static inline int phys_addr_valid(resource_size_t addr) 25#include "physaddr.h"
26{
27#ifdef CONFIG_PHYS_ADDR_T_64BIT
28 return !(addr >> boot_cpu_data.x86_phys_bits);
29#else
30 return 1;
31#endif
32}
33
34#ifdef CONFIG_X86_64
35
36unsigned long __phys_addr(unsigned long x)
37{
38 if (x >= __START_KERNEL_map) {
39 x -= __START_KERNEL_map;
40 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
41 x += phys_base;
42 } else {
43 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
44 x -= PAGE_OFFSET;
45 VIRTUAL_BUG_ON(!phys_addr_valid(x));
46 }
47 return x;
48}
49EXPORT_SYMBOL(__phys_addr);
50
51bool __virt_addr_valid(unsigned long x)
52{
53 if (x >= __START_KERNEL_map) {
54 x -= __START_KERNEL_map;
55 if (x >= KERNEL_IMAGE_SIZE)
56 return false;
57 x += phys_base;
58 } else {
59 if (x < PAGE_OFFSET)
60 return false;
61 x -= PAGE_OFFSET;
62 if (!phys_addr_valid(x))
63 return false;
64 }
65
66 return pfn_valid(x >> PAGE_SHIFT);
67}
68EXPORT_SYMBOL(__virt_addr_valid);
69
70#else
71
72#ifdef CONFIG_DEBUG_VIRTUAL
73unsigned long __phys_addr(unsigned long x)
74{
75 /* VMALLOC_* aren't constants */
76 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
77 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
78 return x - PAGE_OFFSET;
79}
80EXPORT_SYMBOL(__phys_addr);
81#endif
82
83bool __virt_addr_valid(unsigned long x)
84{
85 if (x < PAGE_OFFSET)
86 return false;
87 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
88 return false;
89 if (x >= FIXADDR_START)
90 return false;
91 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
92}
93EXPORT_SYMBOL(__virt_addr_valid);
94
95#endif
96 26
97int page_is_ram(unsigned long pagenr) 27int page_is_ram(unsigned long pagenr)
98{ 28{
@@ -228,24 +158,14 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
228 retval = reserve_memtype(phys_addr, (u64)phys_addr + size, 158 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
229 prot_val, &new_prot_val); 159 prot_val, &new_prot_val);
230 if (retval) { 160 if (retval) {
231 pr_debug("Warning: reserve_memtype returned %d\n", retval); 161 printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
232 return NULL; 162 return NULL;
233 } 163 }
234 164
235 if (prot_val != new_prot_val) { 165 if (prot_val != new_prot_val) {
236 /* 166 if (!is_new_memtype_allowed(phys_addr, size,
237 * Do not fallback to certain memory types with certain 167 prot_val, new_prot_val)) {
238 * requested type: 168 printk(KERN_ERR
239 * - request is uc-, return cannot be write-back
240 * - request is uc-, return cannot be write-combine
241 * - request is write-combine, return cannot be write-back
242 */
243 if ((prot_val == _PAGE_CACHE_UC_MINUS &&
244 (new_prot_val == _PAGE_CACHE_WB ||
245 new_prot_val == _PAGE_CACHE_WC)) ||
246 (prot_val == _PAGE_CACHE_WC &&
247 new_prot_val == _PAGE_CACHE_WB)) {
248 pr_debug(
249 "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", 169 "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
250 (unsigned long long)phys_addr, 170 (unsigned long long)phys_addr,
251 (unsigned long long)(phys_addr + size), 171 (unsigned long long)(phys_addr + size),
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 2c55ed098654..8cc183344140 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs)
225 225
226 BUG_ON(!irqs_disabled()); 226 BUG_ON(!irqs_disabled());
227 227
228 if (data->balance == 0)
229 return;
230
231 if (unlikely(data->balance != 1)) { 228 if (unlikely(data->balance != 1)) {
232 kmemcheck_show_all(); 229 kmemcheck_show_all();
233 kmemcheck_error_save_bug(regs); 230 kmemcheck_error_save_bug(regs);
@@ -331,6 +328,20 @@ static void kmemcheck_read_strict(struct pt_regs *regs,
331 kmemcheck_shadow_set(shadow, size); 328 kmemcheck_shadow_set(shadow, size);
332} 329}
333 330
331bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
332{
333 enum kmemcheck_shadow status;
334 void *shadow;
335
336 shadow = kmemcheck_shadow_lookup(addr);
337 if (!shadow)
338 return true;
339
340 status = kmemcheck_shadow_test(shadow, size);
341
342 return status == KMEMCHECK_SHADOW_INITIALIZED;
343}
344
334/* Access may cross page boundary */ 345/* Access may cross page boundary */
335static void kmemcheck_read(struct pt_regs *regs, 346static void kmemcheck_read(struct pt_regs *regs,
336 unsigned long addr, unsigned int size) 347 unsigned long addr, unsigned int size)
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index e773b6bd0079..3f66b82076a3 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -1,7 +1,6 @@
1#include <linux/kmemcheck.h> 1#include <linux/kmemcheck.h>
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/module.h>
5 4
6#include <asm/page.h> 5#include <asm/page.h>
7#include <asm/pgtable.h> 6#include <asm/pgtable.h>
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 165829600566..c8191defc38a 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -29,13 +29,26 @@
29#include <linux/random.h> 29#include <linux/random.h>
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <asm/elf.h>
33
34static unsigned int stack_maxrandom_size(void)
35{
36 unsigned int max = 0;
37 if ((current->flags & PF_RANDOMIZE) &&
38 !(current->personality & ADDR_NO_RANDOMIZE)) {
39 max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
40 }
41
42 return max;
43}
44
32 45
33/* 46/*
34 * Top of mmap area (just below the process stack). 47 * Top of mmap area (just below the process stack).
35 * 48 *
36 * Leave an at least ~128 MB hole. 49 * Leave an at least ~128 MB hole with possible stack randomization.
37 */ 50 */
38#define MIN_GAP (128*1024*1024) 51#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
39#define MAX_GAP (TASK_SIZE/6*5) 52#define MAX_GAP (TASK_SIZE/6*5)
40 53
41/* 54/*
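
[editor's note] MIN_GAP now grows by the largest offset stack randomization can add. A sketch of the resulting arithmetic, assuming an example STACK_RND_MASK in the 32-bit style:

#include <stdio.h>

#define PAGE_SHIFT      12
#define STACK_RND_MASK  0x7ffUL         /* example value, not authoritative */

static unsigned long stack_maxrandom_size(int randomize)
{
        if (!randomize)
                return 0;
        return STACK_RND_MASK << PAGE_SHIFT;
}

int main(void)
{
        unsigned long min_gap = 128UL * 1024 * 1024 + stack_maxrandom_size(1);

        printf("MIN_GAP with randomization: %lu bytes (~%lu MB)\n",
               min_gap, min_gap >> 20);
        return 0;
}
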
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 5866b28eede1..dd38bfbefd1f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/pfn.h> 14#include <linux/pfn.h>
15#include <linux/percpu.h>
15 16
16#include <asm/e820.h> 17#include <asm/e820.h>
17#include <asm/processor.h> 18#include <asm/processor.h>
@@ -687,7 +688,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
687{ 688{
688 struct cpa_data alias_cpa; 689 struct cpa_data alias_cpa;
689 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); 690 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
690 unsigned long vaddr, remapped; 691 unsigned long vaddr;
691 int ret; 692 int ret;
692 693
693 if (cpa->pfn >= max_pfn_mapped) 694 if (cpa->pfn >= max_pfn_mapped)
@@ -745,24 +746,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
745 } 746 }
746#endif 747#endif
747 748
748 /*
749 * If the PMD page was partially used for per-cpu remapping,
750 * the recycled area needs to be split and modified. Because
751 * the area is always proper subset of a PMD page
752 * cpa->numpages is guaranteed to be 1 for these areas, so
753 * there's no need to loop over and check for further remaps.
754 */
755 remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
756 if (remapped) {
757 WARN_ON(cpa->numpages > 1);
758 alias_cpa = *cpa;
759 alias_cpa.vaddr = &remapped;
760 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
761 ret = __change_page_attr_set_clr(&alias_cpa, 0);
762 if (ret)
763 return ret;
764 }
765
766 return 0; 749 return 0;
767} 750}
768 751
@@ -823,6 +806,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
823{ 806{
824 struct cpa_data cpa; 807 struct cpa_data cpa;
825 int ret, cache, checkalias; 808 int ret, cache, checkalias;
809 unsigned long baddr = 0;
826 810
827 /* 811 /*
828 * Check, if we are requested to change a not supported 812 * Check, if we are requested to change a not supported
@@ -854,6 +838,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
854 */ 838 */
855 WARN_ON_ONCE(1); 839 WARN_ON_ONCE(1);
856 } 840 }
841 /*
842 * Save address for cache flush. *addr is modified in the call
843 * to __change_page_attr_set_clr() below.
844 */
845 baddr = *addr;
857 } 846 }
858 847
859 /* Must avoid aliasing mappings in the highmem code */ 848 /* Must avoid aliasing mappings in the highmem code */
@@ -901,7 +890,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
901 cpa_flush_array(addr, numpages, cache, 890 cpa_flush_array(addr, numpages, cache,
902 cpa.flags, pages); 891 cpa.flags, pages);
903 } else 892 } else
904 cpa_flush_range(*addr, numpages, cache); 893 cpa_flush_range(baddr, numpages, cache);
905 } else 894 } else
906 cpa_flush_all(cache); 895 cpa_flush_all(cache);
907 896
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 352aa9e927e2..7257cf3decf9 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -15,6 +15,7 @@
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/rbtree.h>
18 19
19#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
20#include <asm/processor.h> 21#include <asm/processor.h>
@@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags)
148 * areas). All the aliases have the same cache attributes of course. 149 * areas). All the aliases have the same cache attributes of course.
149 * Zero attributes are represented as holes. 150 * Zero attributes are represented as holes.
150 * 151 *
151 * Currently the data structure is a list because the number of mappings 152 * The data structure is a list that is also organized as an rbtree
152 * are expected to be relatively small. If this should be a problem 153 * sorted on the start address of memtype range.
153 * it could be changed to a rbtree or similar.
154 * 154 *
155 * memtype_lock protects the whole list. 155 * memtype_lock protects both the linear list and rbtree.
156 */ 156 */
157 157
158struct memtype { 158struct memtype {
@@ -160,11 +160,53 @@ struct memtype {
160 u64 end; 160 u64 end;
161 unsigned long type; 161 unsigned long type;
162 struct list_head nd; 162 struct list_head nd;
163 struct rb_node rb;
163}; 164};
164 165
166static struct rb_root memtype_rbroot = RB_ROOT;
165static LIST_HEAD(memtype_list); 167static LIST_HEAD(memtype_list);
166static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 168static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
167 169
170static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
171{
172 struct rb_node *node = root->rb_node;
173 struct memtype *last_lower = NULL;
174
175 while (node) {
176 struct memtype *data = container_of(node, struct memtype, rb);
177
178 if (data->start < start) {
179 last_lower = data;
180 node = node->rb_right;
181 } else if (data->start > start) {
182 node = node->rb_left;
183 } else
184 return data;
185 }
186
187 /* Will return NULL if there is no entry with its start <= start */
188 return last_lower;
189}
190
191static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
192{
193 struct rb_node **new = &(root->rb_node);
194 struct rb_node *parent = NULL;
195
196 while (*new) {
197 struct memtype *this = container_of(*new, struct memtype, rb);
198
199 parent = *new;
200 if (data->start <= this->start)
201 new = &((*new)->rb_left);
202 else if (data->start > this->start)
203 new = &((*new)->rb_right);
204 }
205
206 rb_link_node(&data->rb, parent, new);
207 rb_insert_color(&data->rb, root);
208}
209
168/* 210/*
169 * Does intersection of PAT memory type and MTRR memory type and returns 211 * Does intersection of PAT memory type and MTRR memory type and returns
170 * the resulting memory type as PAT understands it. 212 * the resulting memory type as PAT understands it.
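
memtype_rb_search() above returns either the node whose start equals the key or the last node visited whose start lies below it, which free_memtype() later uses as a starting point for its list walk. A stand-alone sketch of the same "exact match or greatest lower bound" walk over an ordinary binary search tree, with a hand-built three-node tree standing in for the kernel's rb_node machinery:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct node {
	uint64_t start;
	struct node *left, *right;
};

static struct node *search_lower_bound(struct node *root, uint64_t start)
{
	struct node *last_lower = NULL;

	while (root) {
		if (root->start < start) {
			last_lower = root;   /* candidate, keep descending right */
			root = root->right;
		} else if (root->start > start) {
			root = root->left;
		} else {
			return root;         /* exact match */
		}
	}
	return last_lower;                   /* NULL if every start > key */
}

int main(void)
{
	struct node c = { 0x300000, NULL, NULL };
	struct node a = { 0x100000, NULL, NULL };
	struct node b = { 0x200000, &a, &c };    /* b is the root */
	struct node *hit = search_lower_bound(&b, 0x250000);

	/* prints 200000: greatest start <= 0x250000 */
	printf("%llx\n", hit ? (unsigned long long)hit->start : 0ULL);
	return 0;
}

The patch keeps the linear list as well; the rbtree only speeds up the initial lookup in free_memtype() and lookup_memtype().
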
@@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
218 return -EBUSY; 260 return -EBUSY;
219} 261}
220 262
221static struct memtype *cached_entry;
222static u64 cached_start;
223
224static int pat_pagerange_is_ram(unsigned long start, unsigned long end) 263static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
225{ 264{
226 int ram_page = 0, not_rampage = 0; 265 int ram_page = 0, not_rampage = 0;
@@ -249,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
249} 288}
250 289
251/* 290/*
252 * For RAM pages, mark the pages as non WB memory type using 291 * For RAM pages, we use page flags to mark the pages with appropriate type.
 253 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or 292 * Here we do two passes:
254 * set_memory_wc() on a RAM page at a time before marking it as WB again. 293 * - Find the memtype of all the pages in the range, look for any conflicts
255 * This is ok, because only one driver will be owning the page and 294 * - In case of no conflicts, set the new memtype for pages in the range
256 * doing set_memory_*() calls.
257 * 295 *
258 * For now, we use PageNonWB to track that the RAM page is being mapped 296 * Caller must hold memtype_lock for atomicity.
259 * as non WB. In future, we will have to use one more flag
260 * (or some other mechanism in page_struct) to distinguish between
261 * UC and WC mapping.
262 */ 297 */
263static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, 298static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
264 unsigned long *new_type) 299 unsigned long *new_type)
265{ 300{
266 struct page *page; 301 struct page *page;
267 u64 pfn, end_pfn; 302 u64 pfn;
303
304 if (req_type == _PAGE_CACHE_UC) {
305 /* We do not support strong UC */
306 WARN_ON_ONCE(1);
307 req_type = _PAGE_CACHE_UC_MINUS;
308 }
268 309
269 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 310 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
270 page = pfn_to_page(pfn); 311 unsigned long type;
271 if (page_mapped(page) || PageNonWB(page))
272 goto out;
273 312
274 SetPageNonWB(page); 313 page = pfn_to_page(pfn);
314 type = get_page_memtype(page);
315 if (type != -1) {
316 printk(KERN_INFO "reserve_ram_pages_type failed "
317 "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
318 start, end, type, req_type);
319 if (new_type)
320 *new_type = type;
321
322 return -EBUSY;
323 }
275 } 324 }
276 return 0;
277 325
278out: 326 if (new_type)
279 end_pfn = pfn; 327 *new_type = req_type;
280 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { 328
329 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
281 page = pfn_to_page(pfn); 330 page = pfn_to_page(pfn);
282 ClearPageNonWB(page); 331 set_page_memtype(page, req_type);
283 } 332 }
284 333 return 0;
285 return -EINVAL;
286} 334}
287 335
288static int free_ram_pages_type(u64 start, u64 end) 336static int free_ram_pages_type(u64 start, u64 end)
289{ 337{
290 struct page *page; 338 struct page *page;
291 u64 pfn, end_pfn; 339 u64 pfn;
292 340
293 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 341 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
294 page = pfn_to_page(pfn); 342 page = pfn_to_page(pfn);
295 if (page_mapped(page) || !PageNonWB(page)) 343 set_page_memtype(page, -1);
296 goto out;
297
298 ClearPageNonWB(page);
299 } 344 }
300 return 0; 345 return 0;
301
302out:
303 end_pfn = pfn;
304 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
305 page = pfn_to_page(pfn);
306 SetPageNonWB(page);
307 }
308 return -EINVAL;
309} 346}
310 347
311/* 348/*
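
reserve_ram_pages_type() and free_ram_pages_type() above replace the single PageNonWB bit with a per-page memtype: the first pass rejects the request if any page in the range is already tracked, the second pass commits the new type. A stand-alone model of that two-pass scheme, using a plain array in place of struct page flags (0 here models the untracked -1 state, i.e. write-back):

#include <stdio.h>

#define NR_PAGES 8

static long page_type[NR_PAGES];   /* 0 == untracked (WB) */

static int reserve_pages(int first, int last, long req_type)
{
	int pfn;

	for (pfn = first; pfn < last; ++pfn)   /* pass 1: look for conflicts */
		if (page_type[pfn] != 0)
			return -1;             /* -EBUSY in the kernel version */

	for (pfn = first; pfn < last; ++pfn)   /* pass 2: commit the new type */
		page_type[pfn] = req_type;
	return 0;
}

static void free_pages_type(int first, int last)
{
	int pfn;

	for (pfn = first; pfn < last; ++pfn)
		page_type[pfn] = 0;            /* back to untracked == WB */
}

int main(void)
{
	printf("first reserve:  %d\n", reserve_pages(2, 5, 1));  /* 0 */
	printf("second reserve: %d\n", reserve_pages(4, 6, 2));  /* -1, page 4 is taken */
	free_pages_type(2, 5);
	printf("after free:     %d\n", reserve_pages(4, 6, 2));  /* 0 */
	return 0;
}
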
@@ -339,6 +376,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
339 if (new_type) { 376 if (new_type) {
340 if (req_type == -1) 377 if (req_type == -1)
341 *new_type = _PAGE_CACHE_WB; 378 *new_type = _PAGE_CACHE_WB;
379 else if (req_type == _PAGE_CACHE_WC)
380 *new_type = _PAGE_CACHE_UC_MINUS;
342 else 381 else
343 *new_type = req_type & _PAGE_CACHE_MASK; 382 *new_type = req_type & _PAGE_CACHE_MASK;
344 } 383 }
@@ -364,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
364 *new_type = actual_type; 403 *new_type = actual_type;
365 404
366 is_range_ram = pat_pagerange_is_ram(start, end); 405 is_range_ram = pat_pagerange_is_ram(start, end);
367 if (is_range_ram == 1) 406 if (is_range_ram == 1) {
368 return reserve_ram_pages_type(start, end, req_type, 407
369 new_type); 408 spin_lock(&memtype_lock);
370 else if (is_range_ram < 0) 409 err = reserve_ram_pages_type(start, end, req_type, new_type);
410 spin_unlock(&memtype_lock);
411
412 return err;
413 } else if (is_range_ram < 0) {
371 return -EINVAL; 414 return -EINVAL;
415 }
372 416
373 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 417 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
374 if (!new) 418 if (!new)
@@ -380,17 +424,11 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
380 424
381 spin_lock(&memtype_lock); 425 spin_lock(&memtype_lock);
382 426
383 if (cached_entry && start >= cached_start)
384 entry = cached_entry;
385 else
386 entry = list_entry(&memtype_list, struct memtype, nd);
387
388 /* Search for existing mapping that overlaps the current range */ 427 /* Search for existing mapping that overlaps the current range */
389 where = NULL; 428 where = NULL;
390 list_for_each_entry_continue(entry, &memtype_list, nd) { 429 list_for_each_entry(entry, &memtype_list, nd) {
391 if (end <= entry->start) { 430 if (end <= entry->start) {
392 where = entry->nd.prev; 431 where = entry->nd.prev;
393 cached_entry = list_entry(where, struct memtype, nd);
394 break; 432 break;
395 } else if (start <= entry->start) { /* end > entry->start */ 433 } else if (start <= entry->start) { /* end > entry->start */
396 err = chk_conflict(new, entry, new_type); 434 err = chk_conflict(new, entry, new_type);
@@ -398,8 +436,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
398 dprintk("Overlap at 0x%Lx-0x%Lx\n", 436 dprintk("Overlap at 0x%Lx-0x%Lx\n",
399 entry->start, entry->end); 437 entry->start, entry->end);
400 where = entry->nd.prev; 438 where = entry->nd.prev;
401 cached_entry = list_entry(where,
402 struct memtype, nd);
403 } 439 }
404 break; 440 break;
405 } else if (start < entry->end) { /* start > entry->start */ 441 } else if (start < entry->end) { /* start > entry->start */
@@ -407,8 +443,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
407 if (!err) { 443 if (!err) {
408 dprintk("Overlap at 0x%Lx-0x%Lx\n", 444 dprintk("Overlap at 0x%Lx-0x%Lx\n",
409 entry->start, entry->end); 445 entry->start, entry->end);
410 cached_entry = list_entry(entry->nd.prev,
411 struct memtype, nd);
412 446
413 /* 447 /*
414 * Move to right position in the linked 448 * Move to right position in the linked
@@ -436,13 +470,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
436 return err; 470 return err;
437 } 471 }
438 472
439 cached_start = start;
440
441 if (where) 473 if (where)
442 list_add(&new->nd, where); 474 list_add(&new->nd, where);
443 else 475 else
444 list_add_tail(&new->nd, &memtype_list); 476 list_add_tail(&new->nd, &memtype_list);
445 477
478 memtype_rb_insert(&memtype_rbroot, new);
479
446 spin_unlock(&memtype_lock); 480 spin_unlock(&memtype_lock);
447 481
448 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 482 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -454,7 +488,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
454 488
455int free_memtype(u64 start, u64 end) 489int free_memtype(u64 start, u64 end)
456{ 490{
457 struct memtype *entry; 491 struct memtype *entry, *saved_entry;
458 int err = -EINVAL; 492 int err = -EINVAL;
459 int is_range_ram; 493 int is_range_ram;
460 494
@@ -466,23 +500,58 @@ int free_memtype(u64 start, u64 end)
466 return 0; 500 return 0;
467 501
468 is_range_ram = pat_pagerange_is_ram(start, end); 502 is_range_ram = pat_pagerange_is_ram(start, end);
469 if (is_range_ram == 1) 503 if (is_range_ram == 1) {
470 return free_ram_pages_type(start, end); 504
471 else if (is_range_ram < 0) 505 spin_lock(&memtype_lock);
506 err = free_ram_pages_type(start, end);
507 spin_unlock(&memtype_lock);
508
509 return err;
510 } else if (is_range_ram < 0) {
472 return -EINVAL; 511 return -EINVAL;
512 }
473 513
474 spin_lock(&memtype_lock); 514 spin_lock(&memtype_lock);
475 list_for_each_entry(entry, &memtype_list, nd) { 515
516 entry = memtype_rb_search(&memtype_rbroot, start);
517 if (unlikely(entry == NULL))
518 goto unlock_ret;
519
520 /*
521 * Saved entry points to an entry with start same or less than what
522 * we searched for. Now go through the list in both directions to look
 523 * for the entry that matches both start and end; the list is kept
 524 * sorted by start address
525 */
526 saved_entry = entry;
527 list_for_each_entry_from(entry, &memtype_list, nd) {
476 if (entry->start == start && entry->end == end) { 528 if (entry->start == start && entry->end == end) {
477 if (cached_entry == entry || cached_start == start) 529 rb_erase(&entry->rb, &memtype_rbroot);
478 cached_entry = NULL; 530 list_del(&entry->nd);
531 kfree(entry);
532 err = 0;
533 break;
534 } else if (entry->start > start) {
535 break;
536 }
537 }
538
539 if (!err)
540 goto unlock_ret;
479 541
542 entry = saved_entry;
543 list_for_each_entry_reverse(entry, &memtype_list, nd) {
544 if (entry->start == start && entry->end == end) {
545 rb_erase(&entry->rb, &memtype_rbroot);
480 list_del(&entry->nd); 546 list_del(&entry->nd);
481 kfree(entry); 547 kfree(entry);
482 err = 0; 548 err = 0;
483 break; 549 break;
550 } else if (entry->start < start) {
551 break;
484 } 552 }
485 } 553 }
554unlock_ret:
486 spin_unlock(&memtype_lock); 555 spin_unlock(&memtype_lock);
487 556
488 if (err) { 557 if (err) {
@@ -496,6 +565,101 @@ int free_memtype(u64 start, u64 end)
496} 565}
497 566
498 567
568/**
 569 * lookup_memtype - Looks up the memory type for a physical address
570 * @paddr: physical address of which memory type needs to be looked up
571 *
572 * Only to be called when PAT is enabled
573 *
574 * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
575 * _PAGE_CACHE_UC
576 */
577static unsigned long lookup_memtype(u64 paddr)
578{
579 int rettype = _PAGE_CACHE_WB;
580 struct memtype *entry;
581
582 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
583 return rettype;
584
585 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
586 struct page *page;
587 spin_lock(&memtype_lock);
588 page = pfn_to_page(paddr >> PAGE_SHIFT);
589 rettype = get_page_memtype(page);
590 spin_unlock(&memtype_lock);
591 /*
592 * -1 from get_page_memtype() implies RAM page is in its
593 * default state and not reserved, and hence of type WB
594 */
595 if (rettype == -1)
596 rettype = _PAGE_CACHE_WB;
597
598 return rettype;
599 }
600
601 spin_lock(&memtype_lock);
602
603 entry = memtype_rb_search(&memtype_rbroot, paddr);
604 if (entry != NULL)
605 rettype = entry->type;
606 else
607 rettype = _PAGE_CACHE_UC_MINUS;
608
609 spin_unlock(&memtype_lock);
610 return rettype;
611}
612
613/**
614 * io_reserve_memtype - Request a memory type mapping for a region of memory
615 * @start: start (physical address) of the region
616 * @end: end (physical address) of the region
617 * @type: A pointer to memtype, with requested type. On success, requested
618 * or any other compatible type that was available for the region is returned
619 *
620 * On success, returns 0
621 * On failure, returns non-zero
622 */
623int io_reserve_memtype(resource_size_t start, resource_size_t end,
624 unsigned long *type)
625{
626 resource_size_t size = end - start;
627 unsigned long req_type = *type;
628 unsigned long new_type;
629 int ret;
630
631 WARN_ON_ONCE(iomem_map_sanity_check(start, size));
632
633 ret = reserve_memtype(start, end, req_type, &new_type);
634 if (ret)
635 goto out_err;
636
637 if (!is_new_memtype_allowed(start, size, req_type, new_type))
638 goto out_free;
639
640 if (kernel_map_sync_memtype(start, size, new_type) < 0)
641 goto out_free;
642
643 *type = new_type;
644 return 0;
645
646out_free:
647 free_memtype(start, end);
648 ret = -EBUSY;
649out_err:
650 return ret;
651}
652
653/**
654 * io_free_memtype - Release a memory type mapping for a region of memory
655 * @start: start (physical address) of the region
656 * @end: end (physical address) of the region
657 */
658void io_free_memtype(resource_size_t start, resource_size_t end)
659{
660 free_memtype(start, end);
661}
662
499pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 663pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
500 unsigned long size, pgprot_t vma_prot) 664 unsigned long size, pgprot_t vma_prot)
501{ 665{
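
io_reserve_memtype() above follows a reserve / validate / sync ladder and rolls the reservation back whenever a later step fails. A stand-alone model of that error handling; the three helpers are stubs invented for the sketch, not kernel APIs:

#include <stdio.h>

static int  fake_reserve(void) { return 0; }   /* 0 == reservation succeeded */
static int  fake_allowed(void) { return 1; }   /* non-zero == type is compatible */
static int  fake_sync(void)    { return -1; }  /* < 0 == kernel map update failed */
static void fake_free(void)    { puts("rolled back reservation"); }

static int io_reserve_model(void)
{
	int ret;

	ret = fake_reserve();
	if (ret)
		goto out_err;          /* nothing reserved yet, just report */

	if (!fake_allowed())
		goto out_free;         /* reserved, but type not usable */

	if (fake_sync() < 0)
		goto out_free;         /* reserved, but sync of kernel mapping failed */

	return 0;                      /* success: reservation kept */

out_free:
	fake_free();
	ret = -1;                      /* -EBUSY in the kernel version */
out_err:
	return ret;
}

int main(void)
{
	printf("result %d\n", io_reserve_model());
	return 0;
}
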
@@ -577,7 +741,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
577{ 741{
578 unsigned long id_sz; 742 unsigned long id_sz;
579 743
580 if (!pat_enabled || base >= __pa(high_memory)) 744 if (base >= __pa(high_memory))
581 return 0; 745 return 0;
582 746
583 id_sz = (__pa(high_memory) < base + size) ? 747 id_sz = (__pa(high_memory) < base + size) ?
@@ -612,11 +776,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
612 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 776 is_ram = pat_pagerange_is_ram(paddr, paddr + size);
613 777
614 /* 778 /*
615 * reserve_pfn_range() doesn't support RAM pages. Maintain the current 779 * reserve_pfn_range() for RAM pages. We do not refcount to keep
616 * behavior with RAM pages by returning success. 780 * track of number of mappings of RAM pages. We can assert that
781 * the type requested matches the type of first page in the range.
617 */ 782 */
618 if (is_ram != 0) 783 if (is_ram) {
784 if (!pat_enabled)
785 return 0;
786
787 flags = lookup_memtype(paddr);
788 if (want_flags != flags) {
789 printk(KERN_WARNING
790 "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
791 current->comm, current->pid,
792 cattr_name(want_flags),
793 (unsigned long long)paddr,
794 (unsigned long long)(paddr + size),
795 cattr_name(flags));
796 *vma_prot = __pgprot((pgprot_val(*vma_prot) &
797 (~_PAGE_CACHE_MASK)) |
798 flags);
799 }
619 return 0; 800 return 0;
801 }
620 802
621 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); 803 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
622 if (ret) 804 if (ret)
@@ -678,14 +860,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
678 unsigned long vma_size = vma->vm_end - vma->vm_start; 860 unsigned long vma_size = vma->vm_end - vma->vm_start;
679 pgprot_t pgprot; 861 pgprot_t pgprot;
680 862
681 if (!pat_enabled)
682 return 0;
683
684 /*
685 * For now, only handle remap_pfn_range() vmas where
686 * is_linear_pfn_mapping() == TRUE. Handling of
687 * vm_insert_pfn() is TBD.
688 */
689 if (is_linear_pfn_mapping(vma)) { 863 if (is_linear_pfn_mapping(vma)) {
690 /* 864 /*
691 * reserve the whole chunk covered by vma. We need the 865 * reserve the whole chunk covered by vma. We need the
@@ -713,23 +887,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
713int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, 887int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
714 unsigned long pfn, unsigned long size) 888 unsigned long pfn, unsigned long size)
715{ 889{
890 unsigned long flags;
716 resource_size_t paddr; 891 resource_size_t paddr;
717 unsigned long vma_size = vma->vm_end - vma->vm_start; 892 unsigned long vma_size = vma->vm_end - vma->vm_start;
718 893
719 if (!pat_enabled)
720 return 0;
721
722 /*
723 * For now, only handle remap_pfn_range() vmas where
724 * is_linear_pfn_mapping() == TRUE. Handling of
725 * vm_insert_pfn() is TBD.
726 */
727 if (is_linear_pfn_mapping(vma)) { 894 if (is_linear_pfn_mapping(vma)) {
728 /* reserve the whole chunk starting from vm_pgoff */ 895 /* reserve the whole chunk starting from vm_pgoff */
729 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; 896 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
730 return reserve_pfn_range(paddr, vma_size, prot, 0); 897 return reserve_pfn_range(paddr, vma_size, prot, 0);
731 } 898 }
732 899
900 if (!pat_enabled)
901 return 0;
902
903 /* for vm_insert_pfn and friends, we set prot based on lookup */
904 flags = lookup_memtype(pfn << PAGE_SHIFT);
905 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
906 flags);
907
733 return 0; 908 return 0;
734} 909}
735 910
@@ -744,14 +919,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
744 resource_size_t paddr; 919 resource_size_t paddr;
745 unsigned long vma_size = vma->vm_end - vma->vm_start; 920 unsigned long vma_size = vma->vm_end - vma->vm_start;
746 921
747 if (!pat_enabled)
748 return;
749
750 /*
751 * For now, only handle remap_pfn_range() vmas where
752 * is_linear_pfn_mapping() == TRUE. Handling of
753 * vm_insert_pfn() is TBD.
754 */
755 if (is_linear_pfn_mapping(vma)) { 922 if (is_linear_pfn_mapping(vma)) {
756 /* free the whole chunk starting from vm_pgoff */ 923 /* free the whole chunk starting from vm_pgoff */
757 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; 924 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -827,7 +994,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
827 return 0; 994 return 0;
828} 995}
829 996
830static struct seq_operations memtype_seq_ops = { 997static const struct seq_operations memtype_seq_ops = {
831 .start = memtype_seq_start, 998 .start = memtype_seq_start,
832 .next = memtype_seq_next, 999 .next = memtype_seq_next,
833 .stop = memtype_seq_stop, 1000 .stop = memtype_seq_stop,
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
new file mode 100644
index 000000000000..d2e2735327b4
--- /dev/null
+++ b/arch/x86/mm/physaddr.c
@@ -0,0 +1,70 @@
1#include <linux/mmdebug.h>
2#include <linux/module.h>
3#include <linux/mm.h>
4
5#include <asm/page.h>
6
7#include "physaddr.h"
8
9#ifdef CONFIG_X86_64
10
11unsigned long __phys_addr(unsigned long x)
12{
13 if (x >= __START_KERNEL_map) {
14 x -= __START_KERNEL_map;
15 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
16 x += phys_base;
17 } else {
18 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
19 x -= PAGE_OFFSET;
20 VIRTUAL_BUG_ON(!phys_addr_valid(x));
21 }
22 return x;
23}
24EXPORT_SYMBOL(__phys_addr);
25
26bool __virt_addr_valid(unsigned long x)
27{
28 if (x >= __START_KERNEL_map) {
29 x -= __START_KERNEL_map;
30 if (x >= KERNEL_IMAGE_SIZE)
31 return false;
32 x += phys_base;
33 } else {
34 if (x < PAGE_OFFSET)
35 return false;
36 x -= PAGE_OFFSET;
37 if (!phys_addr_valid(x))
38 return false;
39 }
40
41 return pfn_valid(x >> PAGE_SHIFT);
42}
43EXPORT_SYMBOL(__virt_addr_valid);
44
45#else
46
47#ifdef CONFIG_DEBUG_VIRTUAL
48unsigned long __phys_addr(unsigned long x)
49{
50 /* VMALLOC_* aren't constants */
51 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
52 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
53 return x - PAGE_OFFSET;
54}
55EXPORT_SYMBOL(__phys_addr);
56#endif
57
58bool __virt_addr_valid(unsigned long x)
59{
60 if (x < PAGE_OFFSET)
61 return false;
62 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
63 return false;
64 if (x >= FIXADDR_START)
65 return false;
66 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
67}
68EXPORT_SYMBOL(__virt_addr_valid);
69
70#endif /* CONFIG_X86_64 */
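
The new physaddr.c splits the 64-bit virtual-to-physical translation into two cases: addresses inside the kernel-text mapping subtract __START_KERNEL_map and add phys_base, everything else subtracts PAGE_OFFSET. A stand-alone sketch of that arithmetic; the three constants are illustrative assumptions, not values taken from kernel headers:

#include <stdio.h>
#include <stdint.h>

#define START_KERNEL_MAP 0xffffffff80000000ULL   /* assumed */
#define PAGE_OFFSET_64   0xffff880000000000ULL   /* assumed */
static uint64_t phys_base = 0x1000000;           /* assumed relocation offset */

static uint64_t virt_to_phys_model(uint64_t x)
{
	if (x >= START_KERNEL_MAP)
		return x - START_KERNEL_MAP + phys_base;  /* kernel text mapping */
	return x - PAGE_OFFSET_64;                        /* direct mapping */
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)virt_to_phys_model(START_KERNEL_MAP + 0x2000));
	printf("%#llx\n", (unsigned long long)virt_to_phys_model(PAGE_OFFSET_64 + 0x100000));
	return 0;
}

The kernel version adds the VIRTUAL_BUG_ON() range checks around the same arithmetic.
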
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
new file mode 100644
index 000000000000..a3cd5a0c97b3
--- /dev/null
+++ b/arch/x86/mm/physaddr.h
@@ -0,0 +1,10 @@
1#include <asm/processor.h>
2
3static inline int phys_addr_valid(resource_size_t addr)
4{
5#ifdef CONFIG_PHYS_ADDR_T_64BIT
6 return !(addr >> boot_cpu_data.x86_phys_bits);
7#else
8 return 1;
9#endif
10}
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 29a0e37114f8..6f8aa33031c7 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -215,7 +215,7 @@ int __init get_memcfg_from_srat(void)
215 goto out_fail; 215 goto out_fail;
216 216
217 if (num_memory_chunks == 0) { 217 if (num_memory_chunks == 0) {
218 printk(KERN_WARNING 218 printk(KERN_DEBUG
219 "could not find any ACPI SRAT memory areas.\n"); 219 "could not find any ACPI SRAT memory areas.\n");
220 goto out_fail; 220 goto out_fail;
221 } 221 }
@@ -277,7 +277,7 @@ int __init get_memcfg_from_srat(void)
277 } 277 }
278 return 1; 278 return 1;
279out_fail: 279out_fail:
280 printk(KERN_ERR "failed to get NUMA memory information from SRAT" 280 printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
281 " table\n"); 281 " table\n");
282 return 0; 282 return 0;
283} 283}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c814e144a3f0..36fe08eeb5c3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -59,7 +59,8 @@ void leave_mm(int cpu)
59{ 59{
60 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 60 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
61 BUG(); 61 BUG();
62 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); 62 cpumask_clear_cpu(cpu,
63 mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
63 load_cr3(swapper_pg_dir); 64 load_cr3(swapper_pg_dir);
64} 65}
65EXPORT_SYMBOL_GPL(leave_mm); 66EXPORT_SYMBOL_GPL(leave_mm);
@@ -234,8 +235,8 @@ void flush_tlb_current_task(void)
234 preempt_disable(); 235 preempt_disable();
235 236
236 local_flush_tlb(); 237 local_flush_tlb();
237 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) 238 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
238 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); 239 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
239 preempt_enable(); 240 preempt_enable();
240} 241}
241 242
@@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm)
249 else 250 else
250 leave_mm(smp_processor_id()); 251 leave_mm(smp_processor_id());
251 } 252 }
252 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) 253 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
253 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); 254 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
254 255
255 preempt_enable(); 256 preempt_enable();
256} 257}
@@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
268 leave_mm(smp_processor_id()); 269 leave_mm(smp_processor_id());
269 } 270 }
270 271
271 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) 272 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
272 flush_tlb_others(&mm->cpu_vm_mask, mm, va); 273 flush_tlb_others(mm_cpumask(mm), mm, va);
273 274
274 preempt_enable(); 275 preempt_enable();
275} 276}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 89b9a5cd63da..cb88b1a0bd5f 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,11 +1,14 @@
1/** 1/**
2 * @file nmi_int.c 2 * @file nmi_int.c
3 * 3 *
4 * @remark Copyright 2002-2008 OProfile authors 4 * @remark Copyright 2002-2009 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * @author Robert Richter <robert.richter@amd.com> 8 * @author Robert Richter <robert.richter@amd.com>
9 * @author Barry Kasindorf <barry.kasindorf@amd.com>
10 * @author Jason Yeh <jason.yeh@amd.com>
11 * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
9 */ 12 */
10 13
11#include <linux/init.h> 14#include <linux/init.h>
@@ -24,13 +27,35 @@
24#include "op_counter.h" 27#include "op_counter.h"
25#include "op_x86_model.h" 28#include "op_x86_model.h"
26 29
27static struct op_x86_model_spec const *model; 30static struct op_x86_model_spec *model;
28static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); 31static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
29static DEFINE_PER_CPU(unsigned long, saved_lvtpc); 32static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
30 33
31/* 0 == registered but off, 1 == registered and on */ 34/* 0 == registered but off, 1 == registered and on */
32static int nmi_enabled = 0; 35static int nmi_enabled = 0;
33 36
37struct op_counter_config counter_config[OP_MAX_COUNTER];
38
39/* common functions */
40
41u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
42 struct op_counter_config *counter_config)
43{
44 u64 val = 0;
45 u16 event = (u16)counter_config->event;
46
47 val |= ARCH_PERFMON_EVENTSEL_INT;
48 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
49 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
50 val |= (counter_config->unit_mask & 0xFF) << 8;
51 event &= model->event_mask ? model->event_mask : 0xFF;
52 val |= event & 0xFF;
53 val |= (event & 0x0F00) << 24;
54
55 return val;
56}
57
58
34static int profile_exceptions_notify(struct notifier_block *self, 59static int profile_exceptions_notify(struct notifier_block *self,
35 unsigned long val, void *data) 60 unsigned long val, void *data)
36{ 61{
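
op_x86_get_ctrl() above packs one counter's configuration into a 64-bit event-select value. A stand-alone sketch of the same packing with the bit positions written out (USR bit 16, OS bit 17, APIC interrupt enable bit 20, event code in bits 0-7 plus 32-35); the event code used in main() is only an example, and the model-specific event mask is left out:

#include <stdio.h>
#include <stdint.h>

#define EVTSEL_USR (1ULL << 16)
#define EVTSEL_OS  (1ULL << 17)
#define EVTSEL_INT (1ULL << 20)

static uint64_t pack_ctrl(uint16_t event, uint8_t unit_mask, int user, int kernel)
{
	uint64_t val = EVTSEL_INT;                 /* always interrupt on overflow */

	val |= user   ? EVTSEL_USR : 0;
	val |= kernel ? EVTSEL_OS  : 0;
	val |= (uint64_t)unit_mask << 8;
	val |= event & 0xFF;                       /* low 8 event bits */
	val |= (uint64_t)(event & 0x0F00) << 24;   /* high 4 event bits -> bits 32-35 */
	return val;
}

int main(void)
{
	/* example: event 0x76, no unit mask, count in user and kernel mode */
	printf("ctrl = %#llx\n", (unsigned long long)pack_ctrl(0x76, 0, 1, 1));
	return 0;
}
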
@@ -52,36 +77,214 @@ static int profile_exceptions_notify(struct notifier_block *self,
52 77
53static void nmi_cpu_save_registers(struct op_msrs *msrs) 78static void nmi_cpu_save_registers(struct op_msrs *msrs)
54{ 79{
55 unsigned int const nr_ctrs = model->num_counters;
56 unsigned int const nr_ctrls = model->num_controls;
57 struct op_msr *counters = msrs->counters; 80 struct op_msr *counters = msrs->counters;
58 struct op_msr *controls = msrs->controls; 81 struct op_msr *controls = msrs->controls;
59 unsigned int i; 82 unsigned int i;
60 83
61 for (i = 0; i < nr_ctrs; ++i) { 84 for (i = 0; i < model->num_counters; ++i) {
62 if (counters[i].addr) { 85 if (counters[i].addr)
63 rdmsr(counters[i].addr, 86 rdmsrl(counters[i].addr, counters[i].saved);
64 counters[i].saved.low, 87 }
65 counters[i].saved.high); 88
66 } 89 for (i = 0; i < model->num_controls; ++i) {
90 if (controls[i].addr)
91 rdmsrl(controls[i].addr, controls[i].saved);
92 }
93}
94
95static void nmi_cpu_start(void *dummy)
96{
97 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
98 model->start(msrs);
99}
100
101static int nmi_start(void)
102{
103 on_each_cpu(nmi_cpu_start, NULL, 1);
104 return 0;
105}
106
107static void nmi_cpu_stop(void *dummy)
108{
109 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
110 model->stop(msrs);
111}
112
113static void nmi_stop(void)
114{
115 on_each_cpu(nmi_cpu_stop, NULL, 1);
116}
117
118#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
119
120static DEFINE_PER_CPU(int, switch_index);
121
122static inline int has_mux(void)
123{
124 return !!model->switch_ctrl;
125}
126
127inline int op_x86_phys_to_virt(int phys)
128{
129 return __get_cpu_var(switch_index) + phys;
130}
131
132inline int op_x86_virt_to_phys(int virt)
133{
134 return virt % model->num_counters;
135}
136
137static void nmi_shutdown_mux(void)
138{
139 int i;
140
141 if (!has_mux())
142 return;
143
144 for_each_possible_cpu(i) {
145 kfree(per_cpu(cpu_msrs, i).multiplex);
146 per_cpu(cpu_msrs, i).multiplex = NULL;
147 per_cpu(switch_index, i) = 0;
67 } 148 }
149}
150
151static int nmi_setup_mux(void)
152{
153 size_t multiplex_size =
154 sizeof(struct op_msr) * model->num_virt_counters;
155 int i;
156
157 if (!has_mux())
158 return 1;
159
160 for_each_possible_cpu(i) {
161 per_cpu(cpu_msrs, i).multiplex =
162 kmalloc(multiplex_size, GFP_KERNEL);
163 if (!per_cpu(cpu_msrs, i).multiplex)
164 return 0;
165 }
166
167 return 1;
168}
169
170static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
171{
172 int i;
173 struct op_msr *multiplex = msrs->multiplex;
174
175 if (!has_mux())
176 return;
68 177
69 for (i = 0; i < nr_ctrls; ++i) { 178 for (i = 0; i < model->num_virt_counters; ++i) {
70 if (controls[i].addr) { 179 if (counter_config[i].enabled) {
71 rdmsr(controls[i].addr, 180 multiplex[i].saved = -(u64)counter_config[i].count;
72 controls[i].saved.low, 181 } else {
73 controls[i].saved.high); 182 multiplex[i].addr = 0;
183 multiplex[i].saved = 0;
74 } 184 }
75 } 185 }
186
187 per_cpu(switch_index, cpu) = 0;
188}
189
190static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
191{
192 struct op_msr *multiplex = msrs->multiplex;
193 int i;
194
195 for (i = 0; i < model->num_counters; ++i) {
196 int virt = op_x86_phys_to_virt(i);
197 if (multiplex[virt].addr)
198 rdmsrl(multiplex[virt].addr, multiplex[virt].saved);
199 }
200}
201
202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
203{
204 struct op_msr *multiplex = msrs->multiplex;
205 int i;
206
207 for (i = 0; i < model->num_counters; ++i) {
208 int virt = op_x86_phys_to_virt(i);
209 if (multiplex[virt].addr)
210 wrmsrl(multiplex[virt].addr, multiplex[virt].saved);
211 }
76} 212}
77 213
78static void nmi_save_registers(void *dummy) 214static void nmi_cpu_switch(void *dummy)
79{ 215{
80 int cpu = smp_processor_id(); 216 int cpu = smp_processor_id();
217 int si = per_cpu(switch_index, cpu);
81 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); 218 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
82 nmi_cpu_save_registers(msrs); 219
220 nmi_cpu_stop(NULL);
221 nmi_cpu_save_mpx_registers(msrs);
222
223 /* move to next set */
224 si += model->num_counters;
225 if ((si > model->num_virt_counters) || (counter_config[si].count == 0))
226 per_cpu(switch_index, cpu) = 0;
227 else
228 per_cpu(switch_index, cpu) = si;
229
230 model->switch_ctrl(model, msrs);
231 nmi_cpu_restore_mpx_registers(msrs);
232
233 nmi_cpu_start(NULL);
234}
235
236
237/*
238 * Quick check to see if multiplexing is necessary.
239 * The check should be sufficient since counters are used
 240 * in order.
241 */
242static int nmi_multiplex_on(void)
243{
244 return counter_config[model->num_counters].count ? 0 : -EINVAL;
245}
246
247static int nmi_switch_event(void)
248{
249 if (!has_mux())
250 return -ENOSYS; /* not implemented */
251 if (nmi_multiplex_on() < 0)
252 return -EINVAL; /* not necessary */
253
254 on_each_cpu(nmi_cpu_switch, NULL, 1);
255
256 return 0;
257}
258
259static inline void mux_init(struct oprofile_operations *ops)
260{
261 if (has_mux())
262 ops->switch_events = nmi_switch_event;
263}
264
265static void mux_clone(int cpu)
266{
267 if (!has_mux())
268 return;
269
270 memcpy(per_cpu(cpu_msrs, cpu).multiplex,
271 per_cpu(cpu_msrs, 0).multiplex,
272 sizeof(struct op_msr) * model->num_virt_counters);
83} 273}
84 274
275#else
276
277inline int op_x86_phys_to_virt(int phys) { return phys; }
278inline int op_x86_virt_to_phys(int virt) { return virt; }
279static inline void nmi_shutdown_mux(void) { }
280static inline int nmi_setup_mux(void) { return 1; }
281static inline void
282nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
283static inline void mux_init(struct oprofile_operations *ops) { }
284static void mux_clone(int cpu) { }
285
286#endif
287
85static void free_msrs(void) 288static void free_msrs(void)
86{ 289{
87 int i; 290 int i;
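
nmi_cpu_switch() above advances a per-CPU switch_index by the number of physical counters and wraps it back to zero once it runs past the virtual counters or reaches an unused slot. A stand-alone sketch of that round-robin step; counts[] stands in for counter_config[].count, and the wrap test here uses >= so the sketch stays in bounds:

#include <stdio.h>

#define NUM_COUNTERS      4
#define NUM_VIRT_COUNTERS 32

/* 10 enabled events, the rest unused */
static unsigned long counts[NUM_VIRT_COUNTERS] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };

static int next_index(int si)
{
	si += NUM_COUNTERS;                               /* move to the next set */
	if (si >= NUM_VIRT_COUNTERS || counts[si] == 0)
		return 0;                                 /* wrap to the first set */
	return si;
}

int main(void)
{
	int si = 0, i;

	for (i = 0; i < 5; ++i) {
		printf("switch %d -> index %d\n", i, si);
		si = next_index(si);
	}
	return 0;   /* with 10 enabled events: 0, 4, 8, 0, 4 */
}
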
@@ -95,38 +298,32 @@ static void free_msrs(void)
95 298
96static int allocate_msrs(void) 299static int allocate_msrs(void)
97{ 300{
98 int success = 1;
99 size_t controls_size = sizeof(struct op_msr) * model->num_controls; 301 size_t controls_size = sizeof(struct op_msr) * model->num_controls;
100 size_t counters_size = sizeof(struct op_msr) * model->num_counters; 302 size_t counters_size = sizeof(struct op_msr) * model->num_counters;
101 303
102 int i; 304 int i;
103 for_each_possible_cpu(i) { 305 for_each_possible_cpu(i) {
104 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, 306 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size,
105 GFP_KERNEL); 307 GFP_KERNEL);
106 if (!per_cpu(cpu_msrs, i).counters) { 308 if (!per_cpu(cpu_msrs, i).counters)
107 success = 0; 309 return 0;
108 break;
109 }
110 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, 310 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size,
111 GFP_KERNEL); 311 GFP_KERNEL);
112 if (!per_cpu(cpu_msrs, i).controls) { 312 if (!per_cpu(cpu_msrs, i).controls)
113 success = 0; 313 return 0;
114 break;
115 }
116 } 314 }
117 315
118 if (!success) 316 return 1;
119 free_msrs();
120
121 return success;
122} 317}
123 318
124static void nmi_cpu_setup(void *dummy) 319static void nmi_cpu_setup(void *dummy)
125{ 320{
126 int cpu = smp_processor_id(); 321 int cpu = smp_processor_id();
127 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); 322 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
323 nmi_cpu_save_registers(msrs);
128 spin_lock(&oprofilefs_lock); 324 spin_lock(&oprofilefs_lock);
129 model->setup_ctrs(msrs); 325 model->setup_ctrs(model, msrs);
326 nmi_cpu_setup_mux(cpu, msrs);
130 spin_unlock(&oprofilefs_lock); 327 spin_unlock(&oprofilefs_lock);
131 per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); 328 per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
132 apic_write(APIC_LVTPC, APIC_DM_NMI); 329 apic_write(APIC_LVTPC, APIC_DM_NMI);
@@ -144,11 +341,15 @@ static int nmi_setup(void)
144 int cpu; 341 int cpu;
145 342
146 if (!allocate_msrs()) 343 if (!allocate_msrs())
147 return -ENOMEM; 344 err = -ENOMEM;
345 else if (!nmi_setup_mux())
346 err = -ENOMEM;
347 else
348 err = register_die_notifier(&profile_exceptions_nb);
148 349
149 err = register_die_notifier(&profile_exceptions_nb);
150 if (err) { 350 if (err) {
151 free_msrs(); 351 free_msrs();
352 nmi_shutdown_mux();
152 return err; 353 return err;
153 } 354 }
154 355
@@ -159,45 +360,38 @@ static int nmi_setup(void)
159 /* Assume saved/restored counters are the same on all CPUs */ 360 /* Assume saved/restored counters are the same on all CPUs */
160 model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); 361 model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
161 for_each_possible_cpu(cpu) { 362 for_each_possible_cpu(cpu) {
162 if (cpu != 0) { 363 if (!cpu)
163 memcpy(per_cpu(cpu_msrs, cpu).counters, 364 continue;
164 per_cpu(cpu_msrs, 0).counters, 365
165 sizeof(struct op_msr) * model->num_counters); 366 memcpy(per_cpu(cpu_msrs, cpu).counters,
166 367 per_cpu(cpu_msrs, 0).counters,
167 memcpy(per_cpu(cpu_msrs, cpu).controls, 368 sizeof(struct op_msr) * model->num_counters);
168 per_cpu(cpu_msrs, 0).controls, 369
169 sizeof(struct op_msr) * model->num_controls); 370 memcpy(per_cpu(cpu_msrs, cpu).controls,
170 } 371 per_cpu(cpu_msrs, 0).controls,
372 sizeof(struct op_msr) * model->num_controls);
171 373
374 mux_clone(cpu);
172 } 375 }
173 on_each_cpu(nmi_save_registers, NULL, 1);
174 on_each_cpu(nmi_cpu_setup, NULL, 1); 376 on_each_cpu(nmi_cpu_setup, NULL, 1);
175 nmi_enabled = 1; 377 nmi_enabled = 1;
176 return 0; 378 return 0;
177} 379}
178 380
179static void nmi_restore_registers(struct op_msrs *msrs) 381static void nmi_cpu_restore_registers(struct op_msrs *msrs)
180{ 382{
181 unsigned int const nr_ctrs = model->num_counters;
182 unsigned int const nr_ctrls = model->num_controls;
183 struct op_msr *counters = msrs->counters; 383 struct op_msr *counters = msrs->counters;
184 struct op_msr *controls = msrs->controls; 384 struct op_msr *controls = msrs->controls;
185 unsigned int i; 385 unsigned int i;
186 386
187 for (i = 0; i < nr_ctrls; ++i) { 387 for (i = 0; i < model->num_controls; ++i) {
188 if (controls[i].addr) { 388 if (controls[i].addr)
189 wrmsr(controls[i].addr, 389 wrmsrl(controls[i].addr, controls[i].saved);
190 controls[i].saved.low,
191 controls[i].saved.high);
192 }
193 } 390 }
194 391
195 for (i = 0; i < nr_ctrs; ++i) { 392 for (i = 0; i < model->num_counters; ++i) {
196 if (counters[i].addr) { 393 if (counters[i].addr)
197 wrmsr(counters[i].addr, 394 wrmsrl(counters[i].addr, counters[i].saved);
198 counters[i].saved.low,
199 counters[i].saved.high);
200 }
201 } 395 }
202} 396}
203 397
@@ -205,7 +399,7 @@ static void nmi_cpu_shutdown(void *dummy)
205{ 399{
206 unsigned int v; 400 unsigned int v;
207 int cpu = smp_processor_id(); 401 int cpu = smp_processor_id();
208 struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); 402 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
209 403
210 /* restoring APIC_LVTPC can trigger an apic error because the delivery 404 /* restoring APIC_LVTPC can trigger an apic error because the delivery
211 * mode and vector nr combination can be illegal. That's by design: on 405 * mode and vector nr combination can be illegal. That's by design: on
@@ -216,7 +410,7 @@ static void nmi_cpu_shutdown(void *dummy)
216 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); 410 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
217 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); 411 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
218 apic_write(APIC_LVTERR, v); 412 apic_write(APIC_LVTERR, v);
219 nmi_restore_registers(msrs); 413 nmi_cpu_restore_registers(msrs);
220} 414}
221 415
222static void nmi_shutdown(void) 416static void nmi_shutdown(void)
@@ -226,42 +420,18 @@ static void nmi_shutdown(void)
226 nmi_enabled = 0; 420 nmi_enabled = 0;
227 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 421 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
228 unregister_die_notifier(&profile_exceptions_nb); 422 unregister_die_notifier(&profile_exceptions_nb);
423 nmi_shutdown_mux();
229 msrs = &get_cpu_var(cpu_msrs); 424 msrs = &get_cpu_var(cpu_msrs);
230 model->shutdown(msrs); 425 model->shutdown(msrs);
231 free_msrs(); 426 free_msrs();
232 put_cpu_var(cpu_msrs); 427 put_cpu_var(cpu_msrs);
233} 428}
234 429
235static void nmi_cpu_start(void *dummy)
236{
237 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
238 model->start(msrs);
239}
240
241static int nmi_start(void)
242{
243 on_each_cpu(nmi_cpu_start, NULL, 1);
244 return 0;
245}
246
247static void nmi_cpu_stop(void *dummy)
248{
249 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
250 model->stop(msrs);
251}
252
253static void nmi_stop(void)
254{
255 on_each_cpu(nmi_cpu_stop, NULL, 1);
256}
257
258struct op_counter_config counter_config[OP_MAX_COUNTER];
259
260static int nmi_create_files(struct super_block *sb, struct dentry *root) 430static int nmi_create_files(struct super_block *sb, struct dentry *root)
261{ 431{
262 unsigned int i; 432 unsigned int i;
263 433
264 for (i = 0; i < model->num_counters; ++i) { 434 for (i = 0; i < model->num_virt_counters; ++i) {
265 struct dentry *dir; 435 struct dentry *dir;
266 char buf[4]; 436 char buf[4];
267 437
@@ -270,7 +440,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
270 * NOTE: assumes 1:1 mapping here (that counters are organized 440 * NOTE: assumes 1:1 mapping here (that counters are organized
271 * sequentially in their struct assignment). 441 * sequentially in their struct assignment).
272 */ 442 */
273 if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) 443 if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
274 continue; 444 continue;
275 445
276 snprintf(buf, sizeof(buf), "%d", i); 446 snprintf(buf, sizeof(buf), "%d", i);
@@ -402,6 +572,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
402static int __init ppro_init(char **cpu_type) 572static int __init ppro_init(char **cpu_type)
403{ 573{
404 __u8 cpu_model = boot_cpu_data.x86_model; 574 __u8 cpu_model = boot_cpu_data.x86_model;
575 struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
405 576
406 if (force_arch_perfmon && cpu_has_arch_perfmon) 577 if (force_arch_perfmon && cpu_has_arch_perfmon)
407 return 0; 578 return 0;
@@ -428,7 +599,7 @@ static int __init ppro_init(char **cpu_type)
428 *cpu_type = "i386/core_2"; 599 *cpu_type = "i386/core_2";
429 break; 600 break;
430 case 26: 601 case 26:
431 arch_perfmon_setup_counters(); 602 spec = &op_arch_perfmon_spec;
432 *cpu_type = "i386/core_i7"; 603 *cpu_type = "i386/core_i7";
433 break; 604 break;
434 case 28: 605 case 28:
@@ -439,17 +610,7 @@ static int __init ppro_init(char **cpu_type)
439 return 0; 610 return 0;
440 } 611 }
441 612
442 model = &op_ppro_spec; 613 model = spec;
443 return 1;
444}
445
446static int __init arch_perfmon_init(char **cpu_type)
447{
448 if (!cpu_has_arch_perfmon)
449 return 0;
450 *cpu_type = "i386/arch_perfmon";
451 model = &op_arch_perfmon_spec;
452 arch_perfmon_setup_counters();
453 return 1; 614 return 1;
454} 615}
455 616
@@ -471,27 +632,26 @@ int __init op_nmi_init(struct oprofile_operations *ops)
471 /* Needs to be at least an Athlon (or hammer in 32bit mode) */ 632 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
472 633
473 switch (family) { 634 switch (family) {
474 default:
475 return -ENODEV;
476 case 6: 635 case 6:
477 model = &op_amd_spec;
478 cpu_type = "i386/athlon"; 636 cpu_type = "i386/athlon";
479 break; 637 break;
480 case 0xf: 638 case 0xf:
481 model = &op_amd_spec; 639 /*
482 /* Actually it could be i386/hammer too, but give 640 * Actually it could be i386/hammer too, but
 483 user space a consistent name. */ 641 * give user space a consistent name.
642 */
484 cpu_type = "x86-64/hammer"; 643 cpu_type = "x86-64/hammer";
485 break; 644 break;
486 case 0x10: 645 case 0x10:
487 model = &op_amd_spec;
488 cpu_type = "x86-64/family10"; 646 cpu_type = "x86-64/family10";
489 break; 647 break;
490 case 0x11: 648 case 0x11:
491 model = &op_amd_spec;
492 cpu_type = "x86-64/family11h"; 649 cpu_type = "x86-64/family11h";
493 break; 650 break;
651 default:
652 return -ENODEV;
494 } 653 }
654 model = &op_amd_spec;
495 break; 655 break;
496 656
497 case X86_VENDOR_INTEL: 657 case X86_VENDOR_INTEL:
@@ -510,8 +670,15 @@ int __init op_nmi_init(struct oprofile_operations *ops)
510 break; 670 break;
511 } 671 }
512 672
513 if (!cpu_type && !arch_perfmon_init(&cpu_type)) 673 if (cpu_type)
674 break;
675
676 if (!cpu_has_arch_perfmon)
514 return -ENODEV; 677 return -ENODEV;
678
679 /* use arch perfmon as fallback */
680 cpu_type = "i386/arch_perfmon";
681 model = &op_arch_perfmon_spec;
515 break; 682 break;
516 683
517 default: 684 default:
@@ -522,18 +689,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
522 register_cpu_notifier(&oprofile_cpu_nb); 689 register_cpu_notifier(&oprofile_cpu_nb);
523#endif 690#endif
524 /* default values, can be overwritten by model */ 691 /* default values, can be overwritten by model */
525 ops->create_files = nmi_create_files; 692 ops->create_files = nmi_create_files;
526 ops->setup = nmi_setup; 693 ops->setup = nmi_setup;
527 ops->shutdown = nmi_shutdown; 694 ops->shutdown = nmi_shutdown;
528 ops->start = nmi_start; 695 ops->start = nmi_start;
529 ops->stop = nmi_stop; 696 ops->stop = nmi_stop;
530 ops->cpu_type = cpu_type; 697 ops->cpu_type = cpu_type;
531 698
532 if (model->init) 699 if (model->init)
533 ret = model->init(ops); 700 ret = model->init(ops);
534 if (ret) 701 if (ret)
535 return ret; 702 return ret;
536 703
704 if (!model->num_virt_counters)
705 model->num_virt_counters = model->num_counters;
706
707 mux_init(ops);
708
537 init_sysfs(); 709 init_sysfs();
538 using_nmi = 1; 710 using_nmi = 1;
539 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 711 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 91b6a116165e..e28398df0df2 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -10,7 +10,7 @@
10#ifndef OP_COUNTER_H 10#ifndef OP_COUNTER_H
11#define OP_COUNTER_H 11#define OP_COUNTER_H
12 12
13#define OP_MAX_COUNTER 8 13#define OP_MAX_COUNTER 32
14 14
15/* Per-perfctr configuration as set via 15/* Per-perfctr configuration as set via
16 * oprofilefs. 16 * oprofilefs.
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 8fdf06e4edf9..39686c29f03a 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -9,12 +9,15 @@
9 * @author Philippe Elie 9 * @author Philippe Elie
10 * @author Graydon Hoare 10 * @author Graydon Hoare
11 * @author Robert Richter <robert.richter@amd.com> 11 * @author Robert Richter <robert.richter@amd.com>
12 * @author Barry Kasindorf 12 * @author Barry Kasindorf <barry.kasindorf@amd.com>
13 * @author Jason Yeh <jason.yeh@amd.com>
14 * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
13 */ 15 */
14 16
15#include <linux/oprofile.h> 17#include <linux/oprofile.h>
16#include <linux/device.h> 18#include <linux/device.h>
17#include <linux/pci.h> 19#include <linux/pci.h>
20#include <linux/percpu.h>
18 21
19#include <asm/ptrace.h> 22#include <asm/ptrace.h>
20#include <asm/msr.h> 23#include <asm/msr.h>
@@ -25,43 +28,36 @@
25 28
26#define NUM_COUNTERS 4 29#define NUM_COUNTERS 4
27#define NUM_CONTROLS 4 30#define NUM_CONTROLS 4
31#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
32#define NUM_VIRT_COUNTERS 32
33#define NUM_VIRT_CONTROLS 32
34#else
35#define NUM_VIRT_COUNTERS NUM_COUNTERS
36#define NUM_VIRT_CONTROLS NUM_CONTROLS
37#endif
38
39#define OP_EVENT_MASK 0x0FFF
40#define OP_CTR_OVERFLOW (1ULL<<31)
28 41
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) 42#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) 43
31#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) 44static unsigned long reset_value[NUM_VIRT_COUNTERS];
32#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
33
34#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
35#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
36#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
37#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
38#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
39#define CTRL_CLEAR_LO(x) (x &= (1<<21))
40#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
41#define CTRL_SET_ENABLE(val) (val |= 1<<20)
42#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
43#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
44#define CTRL_SET_UM(val, m) (val |= (m << 8))
45#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
46#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
47#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
48#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
49
50static unsigned long reset_value[NUM_COUNTERS];
51 45
52#ifdef CONFIG_OPROFILE_IBS 46#ifdef CONFIG_OPROFILE_IBS
53 47
54/* IbsFetchCtl bits/masks */ 48/* IbsFetchCtl bits/masks */
55#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ 49#define IBS_FETCH_RAND_EN (1ULL<<57)
56#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ 50#define IBS_FETCH_VAL (1ULL<<49)
57#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ 51#define IBS_FETCH_ENABLE (1ULL<<48)
52#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
58 53
59/*IbsOpCtl bits */ 54/*IbsOpCtl bits */
60#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ 55#define IBS_OP_CNT_CTL (1ULL<<19)
61#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ 56#define IBS_OP_VAL (1ULL<<18)
57#define IBS_OP_ENABLE (1ULL<<17)
62 58
63#define IBS_FETCH_SIZE 6 59#define IBS_FETCH_SIZE 6
64#define IBS_OP_SIZE 12 60#define IBS_OP_SIZE 12
65 61
66static int has_ibs; /* AMD Family10h and later */ 62static int has_ibs; /* AMD Family10h and later */
67 63
@@ -78,6 +74,45 @@ static struct op_ibs_config ibs_config;
78 74
79#endif 75#endif
80 76
77#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
78
79static void op_mux_fill_in_addresses(struct op_msrs * const msrs)
80{
81 int i;
82
83 for (i = 0; i < NUM_VIRT_COUNTERS; i++) {
84 int hw_counter = op_x86_virt_to_phys(i);
85 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
86 msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter;
87 else
88 msrs->multiplex[i].addr = 0;
89 }
90}
91
92static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
93 struct op_msrs const * const msrs)
94{
95 u64 val;
96 int i;
97
98 /* enable active counters */
99 for (i = 0; i < NUM_COUNTERS; ++i) {
100 int virt = op_x86_phys_to_virt(i);
101 if (!counter_config[virt].enabled)
102 continue;
103 rdmsrl(msrs->controls[i].addr, val);
104 val &= model->reserved;
105 val |= op_x86_get_ctrl(model, &counter_config[virt]);
106 wrmsrl(msrs->controls[i].addr, val);
107 }
108}
109
110#else
111
112static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
113
114#endif
115
81/* functions for op_amd_spec */ 116/* functions for op_amd_spec */
82 117
83static void op_amd_fill_in_addresses(struct op_msrs * const msrs) 118static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
@@ -97,150 +132,174 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
97 else 132 else
98 msrs->controls[i].addr = 0; 133 msrs->controls[i].addr = 0;
99 } 134 }
100}
101 135
136 op_mux_fill_in_addresses(msrs);
137}
102 138
103static void op_amd_setup_ctrs(struct op_msrs const * const msrs) 139static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
140 struct op_msrs const * const msrs)
104{ 141{
105 unsigned int low, high; 142 u64 val;
106 int i; 143 int i;
107 144
145 /* setup reset_value */
146 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
147 if (counter_config[i].enabled)
148 reset_value[i] = counter_config[i].count;
149 else
150 reset_value[i] = 0;
151 }
152
108 /* clear all counters */ 153 /* clear all counters */
109 for (i = 0 ; i < NUM_CONTROLS; ++i) { 154 for (i = 0; i < NUM_CONTROLS; ++i) {
110 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 155 if (unlikely(!msrs->controls[i].addr))
111 continue; 156 continue;
112 CTRL_READ(low, high, msrs, i); 157 rdmsrl(msrs->controls[i].addr, val);
113 CTRL_CLEAR_LO(low); 158 val &= model->reserved;
114 CTRL_CLEAR_HI(high); 159 wrmsrl(msrs->controls[i].addr, val);
115 CTRL_WRITE(low, high, msrs, i);
116 } 160 }
117 161
118 /* avoid a false detection of ctr overflows in NMI handler */ 162 /* avoid a false detection of ctr overflows in NMI handler */
119 for (i = 0; i < NUM_COUNTERS; ++i) { 163 for (i = 0; i < NUM_COUNTERS; ++i) {
120 if (unlikely(!CTR_IS_RESERVED(msrs, i))) 164 if (unlikely(!msrs->counters[i].addr))
121 continue; 165 continue;
122 CTR_WRITE(1, msrs, i); 166 wrmsrl(msrs->counters[i].addr, -1LL);
123 } 167 }
124 168
125 /* enable active counters */ 169 /* enable active counters */
126 for (i = 0; i < NUM_COUNTERS; ++i) { 170 for (i = 0; i < NUM_COUNTERS; ++i) {
127 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { 171 int virt = op_x86_phys_to_virt(i);
128 reset_value[i] = counter_config[i].count; 172 if (!counter_config[virt].enabled)
173 continue;
174 if (!msrs->counters[i].addr)
175 continue;
129 176
130 CTR_WRITE(counter_config[i].count, msrs, i); 177 /* setup counter registers */
131 178 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
132 CTRL_READ(low, high, msrs, i); 179
133 CTRL_CLEAR_LO(low); 180 /* setup control registers */
134 CTRL_CLEAR_HI(high); 181 rdmsrl(msrs->controls[i].addr, val);
135 CTRL_SET_ENABLE(low); 182 val &= model->reserved;
136 CTRL_SET_USR(low, counter_config[i].user); 183 val |= op_x86_get_ctrl(model, &counter_config[virt]);
137 CTRL_SET_KERN(low, counter_config[i].kernel); 184 wrmsrl(msrs->controls[i].addr, val);
138 CTRL_SET_UM(low, counter_config[i].unit_mask);
139 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
140 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
141 CTRL_SET_HOST_ONLY(high, 0);
142 CTRL_SET_GUEST_ONLY(high, 0);
143
144 CTRL_WRITE(low, high, msrs, i);
145 } else {
146 reset_value[i] = 0;
147 }
148 } 185 }
149} 186}
150 187
151#ifdef CONFIG_OPROFILE_IBS 188#ifdef CONFIG_OPROFILE_IBS
152 189
153static inline int 190static inline void
154op_amd_handle_ibs(struct pt_regs * const regs, 191op_amd_handle_ibs(struct pt_regs * const regs,
155 struct op_msrs const * const msrs) 192 struct op_msrs const * const msrs)
156{ 193{
157 u32 low, high; 194 u64 val, ctl;
158 u64 msr;
159 struct op_entry entry; 195 struct op_entry entry;
160 196
161 if (!has_ibs) 197 if (!has_ibs)
162 return 1; 198 return;
163 199
164 if (ibs_config.fetch_enabled) { 200 if (ibs_config.fetch_enabled) {
165 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); 201 rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
166 if (high & IBS_FETCH_HIGH_VALID_BIT) { 202 if (ctl & IBS_FETCH_VAL) {
167 rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); 203 rdmsrl(MSR_AMD64_IBSFETCHLINAD, val);
168 oprofile_write_reserve(&entry, regs, msr, 204 oprofile_write_reserve(&entry, regs, val,
169 IBS_FETCH_CODE, IBS_FETCH_SIZE); 205 IBS_FETCH_CODE, IBS_FETCH_SIZE);
170 oprofile_add_data(&entry, (u32)msr); 206 oprofile_add_data64(&entry, val);
171 oprofile_add_data(&entry, (u32)(msr >> 32)); 207 oprofile_add_data64(&entry, ctl);
172 oprofile_add_data(&entry, low); 208 rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val);
173 oprofile_add_data(&entry, high); 209 oprofile_add_data64(&entry, val);
174 rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr);
175 oprofile_add_data(&entry, (u32)msr);
176 oprofile_add_data(&entry, (u32)(msr >> 32));
177 oprofile_write_commit(&entry); 210 oprofile_write_commit(&entry);
178 211
179 /* reenable the IRQ */ 212 /* reenable the IRQ */
180 high &= ~IBS_FETCH_HIGH_VALID_BIT; 213 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK);
181 high |= IBS_FETCH_HIGH_ENABLE; 214 ctl |= IBS_FETCH_ENABLE;
182 low &= IBS_FETCH_LOW_MAX_CNT_MASK; 215 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
183 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
184 } 216 }
185 } 217 }
186 218
187 if (ibs_config.op_enabled) { 219 if (ibs_config.op_enabled) {
188 rdmsr(MSR_AMD64_IBSOPCTL, low, high); 220 rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
189 if (low & IBS_OP_LOW_VALID_BIT) { 221 if (ctl & IBS_OP_VAL) {
190 rdmsrl(MSR_AMD64_IBSOPRIP, msr); 222 rdmsrl(MSR_AMD64_IBSOPRIP, val);
191 oprofile_write_reserve(&entry, regs, msr, 223 oprofile_write_reserve(&entry, regs, val,
192 IBS_OP_CODE, IBS_OP_SIZE); 224 IBS_OP_CODE, IBS_OP_SIZE);
193 oprofile_add_data(&entry, (u32)msr); 225 oprofile_add_data64(&entry, val);
194 oprofile_add_data(&entry, (u32)(msr >> 32)); 226 rdmsrl(MSR_AMD64_IBSOPDATA, val);
195 rdmsrl(MSR_AMD64_IBSOPDATA, msr); 227 oprofile_add_data64(&entry, val);
196 oprofile_add_data(&entry, (u32)msr); 228 rdmsrl(MSR_AMD64_IBSOPDATA2, val);
197 oprofile_add_data(&entry, (u32)(msr >> 32)); 229 oprofile_add_data64(&entry, val);
198 rdmsrl(MSR_AMD64_IBSOPDATA2, msr); 230 rdmsrl(MSR_AMD64_IBSOPDATA3, val);
199 oprofile_add_data(&entry, (u32)msr); 231 oprofile_add_data64(&entry, val);
200 oprofile_add_data(&entry, (u32)(msr >> 32)); 232 rdmsrl(MSR_AMD64_IBSDCLINAD, val);
201 rdmsrl(MSR_AMD64_IBSOPDATA3, msr); 233 oprofile_add_data64(&entry, val);
202 oprofile_add_data(&entry, (u32)msr); 234 rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
203 oprofile_add_data(&entry, (u32)(msr >> 32)); 235 oprofile_add_data64(&entry, val);
204 rdmsrl(MSR_AMD64_IBSDCLINAD, msr);
205 oprofile_add_data(&entry, (u32)msr);
206 oprofile_add_data(&entry, (u32)(msr >> 32));
207 rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr);
208 oprofile_add_data(&entry, (u32)msr);
209 oprofile_add_data(&entry, (u32)(msr >> 32));
210 oprofile_write_commit(&entry); 236 oprofile_write_commit(&entry);
211 237
212 /* reenable the IRQ */ 238 /* reenable the IRQ */
213 high = 0; 239 ctl &= ~IBS_OP_VAL & 0xFFFFFFFF;
214 low &= ~IBS_OP_LOW_VALID_BIT; 240 ctl |= IBS_OP_ENABLE;
215 low |= IBS_OP_LOW_ENABLE; 241 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
216 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
217 } 242 }
218 } 243 }
244}
219 245
220 return 1; 246static inline void op_amd_start_ibs(void)
247{
248 u64 val;
249 if (has_ibs && ibs_config.fetch_enabled) {
250 val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
251 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
252 val |= IBS_FETCH_ENABLE;
253 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
254 }
255
256 if (has_ibs && ibs_config.op_enabled) {
257 val = (ibs_config.max_cnt_op >> 4) & 0xFFFF;
258 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
259 val |= IBS_OP_ENABLE;
260 wrmsrl(MSR_AMD64_IBSOPCTL, val);
261 }
262}
263
264static void op_amd_stop_ibs(void)
265{
266 if (has_ibs && ibs_config.fetch_enabled)
267 /* clear max count and enable */
268 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
269
270 if (has_ibs && ibs_config.op_enabled)
271 /* clear max count and enable */
272 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
221} 273}
222 274
275#else
276
277static inline void op_amd_handle_ibs(struct pt_regs * const regs,
278 struct op_msrs const * const msrs) { }
279static inline void op_amd_start_ibs(void) { }
280static inline void op_amd_stop_ibs(void) { }
281
223#endif 282#endif
224 283
225static int op_amd_check_ctrs(struct pt_regs * const regs, 284static int op_amd_check_ctrs(struct pt_regs * const regs,
226 struct op_msrs const * const msrs) 285 struct op_msrs const * const msrs)
227{ 286{
228 unsigned int low, high; 287 u64 val;
229 int i; 288 int i;
230 289
231 for (i = 0 ; i < NUM_COUNTERS; ++i) { 290 for (i = 0; i < NUM_COUNTERS; ++i) {
232 if (!reset_value[i]) 291 int virt = op_x86_phys_to_virt(i);
292 if (!reset_value[virt])
233 continue; 293 continue;
234 CTR_READ(low, high, msrs, i); 294 rdmsrl(msrs->counters[i].addr, val);
235 if (CTR_OVERFLOWED(low)) { 295 /* bit is clear if overflowed: */
236 oprofile_add_sample(regs, i); 296 if (val & OP_CTR_OVERFLOW)
237 CTR_WRITE(reset_value[i], msrs, i); 297 continue;
238 } 298 oprofile_add_sample(regs, virt);
299 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
239 } 300 }
240 301
241#ifdef CONFIG_OPROFILE_IBS
242 op_amd_handle_ibs(regs, msrs); 302 op_amd_handle_ibs(regs, msrs);
243#endif
244 303
245 /* See op_model_ppro.c */ 304 /* See op_model_ppro.c */
246	return 1;					 305	return 1;
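
The op_amd_handle_ibs() path earlier in this file follows a valid-bit handshake: read the control MSR, bail out unless the VAL bit says a sample is pending, dump the data MSRs with oprofile_add_data64(), then clear VAL plus the stale count field and set ENABLE again to re-arm. A toy stand-alone model of that handshake; the DEMO_* bit positions are invented for illustration and are not the real IBS_FETCH_* definitions:

#include <stdint.h>
#include <stdio.h>

/* illustrative bit layout only -- not the real MSR_AMD64_IBSFETCHCTL */
#define DEMO_IBS_ENABLE   (1ULL << 48)
#define DEMO_IBS_VAL      (1ULL << 49)
#define DEMO_IBS_CNT_MASK 0xffffULL

static uint64_t demo_fetch_ctl;                /* stands in for the MSR */

static void handle_ibs_fetch(void)
{
	uint64_t ctl = demo_fetch_ctl;         /* rdmsrl() in real code  */

	if (!(ctl & DEMO_IBS_VAL))
		return;                        /* no sample pending      */

	printf("sample ready: ctl=%#llx\n", (unsigned long long)ctl);
	/* ...real code dumps the IBS data MSRs into an op_entry here... */

	/* re-arm: clear the valid bit and stale count, set enable again */
	ctl &= ~(DEMO_IBS_VAL | DEMO_IBS_CNT_MASK);
	ctl |= DEMO_IBS_ENABLE;
	demo_fetch_ctl = ctl;                  /* wrmsrl() in real code  */
}

int main(void)
{
	demo_fetch_ctl = DEMO_IBS_ENABLE | 0x1000; /* armed, some max count */
	demo_fetch_ctl |= DEMO_IBS_VAL;            /* pretend a sample hit  */
	handle_ibs_fetch();
	printf("re-armed: ctl=%#llx\n", (unsigned long long)demo_fetch_ctl);
	return 0;
}
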
@@ -248,79 +307,50 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
248 307
249static void op_amd_start(struct op_msrs const * const msrs) 308static void op_amd_start(struct op_msrs const * const msrs)
250{ 309{
251 unsigned int low, high; 310 u64 val;
252 int i; 311 int i;
253 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
254 if (reset_value[i]) {
255 CTRL_READ(low, high, msrs, i);
256 CTRL_SET_ACTIVE(low);
257 CTRL_WRITE(low, high, msrs, i);
258 }
259 }
260 312
261#ifdef CONFIG_OPROFILE_IBS 313 for (i = 0; i < NUM_COUNTERS; ++i) {
262 if (has_ibs && ibs_config.fetch_enabled) { 314 if (!reset_value[op_x86_phys_to_virt(i)])
263 low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; 315 continue;
264 high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ 316 rdmsrl(msrs->controls[i].addr, val);
265 + IBS_FETCH_HIGH_ENABLE; 317 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
266 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); 318 wrmsrl(msrs->controls[i].addr, val);
267 } 319 }
268 320
269 if (has_ibs && ibs_config.op_enabled) { 321 op_amd_start_ibs();
270 low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
271 + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
272 + IBS_OP_LOW_ENABLE;
273 high = 0;
274 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
275 }
276#endif
277} 322}
278 323
279
280static void op_amd_stop(struct op_msrs const * const msrs) 324static void op_amd_stop(struct op_msrs const * const msrs)
281{ 325{
282 unsigned int low, high; 326 u64 val;
283 int i; 327 int i;
284 328
285 /* 329 /*
286 * Subtle: stop on all counters to avoid race with setting our 330 * Subtle: stop on all counters to avoid race with setting our
287 * pm callback 331 * pm callback
288 */ 332 */
289 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 333 for (i = 0; i < NUM_COUNTERS; ++i) {
290 if (!reset_value[i]) 334 if (!reset_value[op_x86_phys_to_virt(i)])
291 continue; 335 continue;
292 CTRL_READ(low, high, msrs, i); 336 rdmsrl(msrs->controls[i].addr, val);
293 CTRL_SET_INACTIVE(low); 337 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
294 CTRL_WRITE(low, high, msrs, i); 338 wrmsrl(msrs->controls[i].addr, val);
295 }
296
297#ifdef CONFIG_OPROFILE_IBS
298 if (has_ibs && ibs_config.fetch_enabled) {
299 /* clear max count and enable */
300 low = 0;
301 high = 0;
302 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
303 } 339 }
304 340
305 if (has_ibs && ibs_config.op_enabled) { 341 op_amd_stop_ibs();
306 /* clear max count and enable */
307 low = 0;
308 high = 0;
309 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
310 }
311#endif
312} 342}
313 343
314static void op_amd_shutdown(struct op_msrs const * const msrs) 344static void op_amd_shutdown(struct op_msrs const * const msrs)
315{ 345{
316 int i; 346 int i;
317 347
318 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 348 for (i = 0; i < NUM_COUNTERS; ++i) {
319 if (CTR_IS_RESERVED(msrs, i)) 349 if (msrs->counters[i].addr)
320 release_perfctr_nmi(MSR_K7_PERFCTR0 + i); 350 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
321 } 351 }
322 for (i = 0 ; i < NUM_CONTROLS ; ++i) { 352 for (i = 0; i < NUM_CONTROLS; ++i) {
323 if (CTRL_IS_RESERVED(msrs, i)) 353 if (msrs->controls[i].addr)
324 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); 354 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
325 } 355 }
326} 356}
@@ -490,15 +520,21 @@ static void op_amd_exit(void) {}
490 520
491#endif /* CONFIG_OPROFILE_IBS */ 521#endif /* CONFIG_OPROFILE_IBS */
492 522
493struct op_x86_model_spec const op_amd_spec = { 523struct op_x86_model_spec op_amd_spec = {
494 .init = op_amd_init,
495 .exit = op_amd_exit,
496 .num_counters = NUM_COUNTERS, 524 .num_counters = NUM_COUNTERS,
497 .num_controls = NUM_CONTROLS, 525 .num_controls = NUM_CONTROLS,
526 .num_virt_counters = NUM_VIRT_COUNTERS,
527 .reserved = MSR_AMD_EVENTSEL_RESERVED,
528 .event_mask = OP_EVENT_MASK,
529 .init = op_amd_init,
530 .exit = op_amd_exit,
498 .fill_in_addresses = &op_amd_fill_in_addresses, 531 .fill_in_addresses = &op_amd_fill_in_addresses,
499 .setup_ctrs = &op_amd_setup_ctrs, 532 .setup_ctrs = &op_amd_setup_ctrs,
500 .check_ctrs = &op_amd_check_ctrs, 533 .check_ctrs = &op_amd_check_ctrs,
501 .start = &op_amd_start, 534 .start = &op_amd_start,
502 .stop = &op_amd_stop, 535 .stop = &op_amd_stop,
503 .shutdown = &op_amd_shutdown 536 .shutdown = &op_amd_shutdown,
537#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
538 .switch_ctrl = &op_mux_switch_ctrl,
539#endif
504}; 540};
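
The op_amd_setup_ctrs() rewrite above boils the old CTRL_SET_* macros down to two moves: mask the control MSR with the model's reserved-bit template, OR in the event/unit-mask/enable bits built by op_x86_get_ctrl(), and prime the counter with -(u64)count so it overflows (and raises the NMI) after exactly count events. A minimal stand-alone sketch of that arithmetic; the bit positions, event number and reserved mask below are illustrative, not the real MSR_AMD_EVENTSEL_RESERVED:

#include <stdint.h>
#include <stdio.h>

/* stand-ins for wrmsrl(); real code writes the control/counter MSRs */
static uint64_t demo_ctrl_msr, demo_ctr_msr;

/* sketch of what op_x86_get_ctrl() builds: event select, unit mask,
 * user/kernel bits and the enable bit, roughly the EVNTSEL layout   */
static uint64_t build_ctrl(unsigned event, unsigned umask, int usr, int os)
{
	uint64_t val = 0;

	val |= (uint64_t)(event & 0xff);       /* event select, low byte */
	val |= (uint64_t)(umask & 0xff) << 8;  /* unit mask              */
	val |= (uint64_t)(!!usr) << 16;        /* count in user mode     */
	val |= (uint64_t)(!!os) << 17;         /* count in kernel mode   */
	val |= 1ULL << 22;                     /* enable                 */
	return val;
}

int main(void)
{
	/* illustrative reserved mask, not the real one */
	uint64_t reserved = 0x0000000000200000ULL;
	uint64_t count = 100000;

	demo_ctrl_msr &= reserved;                  /* keep only reserved bits */
	demo_ctrl_msr |= build_ctrl(0x76, 0, 1, 1); /* hypothetical event      */
	demo_ctr_msr = -(uint64_t)count;            /* overflow after 'count'  */

	printf("ctrl=%#llx ctr=%#llx\n",
	       (unsigned long long)demo_ctrl_msr,
	       (unsigned long long)demo_ctr_msr);
	return 0;
}
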
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 819b131fd752..ac6b354becdf 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -32,6 +32,8 @@
32#define NUM_CCCRS_HT2 9 32#define NUM_CCCRS_HT2 9
33#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) 33#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
34 34
35#define OP_CTR_OVERFLOW (1ULL<<31)
36
35static unsigned int num_counters = NUM_COUNTERS_NON_HT; 37static unsigned int num_counters = NUM_COUNTERS_NON_HT;
36static unsigned int num_controls = NUM_CONTROLS_NON_HT; 38static unsigned int num_controls = NUM_CONTROLS_NON_HT;
37 39
@@ -350,8 +352,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
350#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) 352#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
351#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) 353#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
352#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) 354#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
353#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
354#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
355 355
356#define CCCR_RESERVED_BITS 0x38030FFF 356#define CCCR_RESERVED_BITS 0x38030FFF
357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) 357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -361,17 +361,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) 361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) 362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) 363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
364#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
365#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
366#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) 364#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
367#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) 365#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
368 366
369#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
370#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
371#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
372#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
373#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
374
375 367
376/* this assigns a "stagger" to the current CPU, which is used throughout 368/* this assigns a "stagger" to the current CPU, which is used throughout
377 the code in this module as an extra array offset, to select the "even" 369 the code in this module as an extra array offset, to select the "even"
@@ -515,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
515 if (ev->bindings[i].virt_counter & counter_bit) { 507 if (ev->bindings[i].virt_counter & counter_bit) {
516 508
517 /* modify ESCR */ 509 /* modify ESCR */
518 ESCR_READ(escr, high, ev, i); 510 rdmsr(ev->bindings[i].escr_address, escr, high);
519 ESCR_CLEAR(escr); 511 ESCR_CLEAR(escr);
520 if (stag == 0) { 512 if (stag == 0) {
521 ESCR_SET_USR_0(escr, counter_config[ctr].user); 513 ESCR_SET_USR_0(escr, counter_config[ctr].user);
@@ -526,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
526 } 518 }
527 ESCR_SET_EVENT_SELECT(escr, ev->event_select); 519 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
528 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); 520 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
529 ESCR_WRITE(escr, high, ev, i); 521 wrmsr(ev->bindings[i].escr_address, escr, high);
530 522
531 /* modify CCCR */ 523 /* modify CCCR */
532 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); 524 rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
525 cccr, high);
533 CCCR_CLEAR(cccr); 526 CCCR_CLEAR(cccr);
534 CCCR_SET_REQUIRED_BITS(cccr); 527 CCCR_SET_REQUIRED_BITS(cccr);
535 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); 528 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
@@ -537,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
537 CCCR_SET_PMI_OVF_0(cccr); 530 CCCR_SET_PMI_OVF_0(cccr);
538 else 531 else
539 CCCR_SET_PMI_OVF_1(cccr); 532 CCCR_SET_PMI_OVF_1(cccr);
540 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); 533 wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
534 cccr, high);
541 return; 535 return;
542 } 536 }
543 } 537 }
@@ -548,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
548} 542}
549 543
550 544
551static void p4_setup_ctrs(struct op_msrs const * const msrs) 545static void p4_setup_ctrs(struct op_x86_model_spec const *model,
546 struct op_msrs const * const msrs)
552{ 547{
553 unsigned int i; 548 unsigned int i;
554 unsigned int low, high; 549 unsigned int low, high;
@@ -563,8 +558,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
563 } 558 }
564 559
565 /* clear the cccrs we will use */ 560 /* clear the cccrs we will use */
566 for (i = 0 ; i < num_counters ; i++) { 561 for (i = 0; i < num_counters; i++) {
567 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 562 if (unlikely(!msrs->controls[i].addr))
568 continue; 563 continue;
569 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); 564 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
570 CCCR_CLEAR(low); 565 CCCR_CLEAR(low);
@@ -574,17 +569,18 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
574 569
575 /* clear all escrs (including those outside our concern) */ 570 /* clear all escrs (including those outside our concern) */
576 for (i = num_counters; i < num_controls; i++) { 571 for (i = num_counters; i < num_controls; i++) {
577 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 572 if (unlikely(!msrs->controls[i].addr))
578 continue; 573 continue;
579 wrmsr(msrs->controls[i].addr, 0, 0); 574 wrmsr(msrs->controls[i].addr, 0, 0);
580 } 575 }
581 576
582 /* setup all counters */ 577 /* setup all counters */
583 for (i = 0 ; i < num_counters ; ++i) { 578 for (i = 0; i < num_counters; ++i) {
584 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { 579 if (counter_config[i].enabled && msrs->controls[i].addr) {
585 reset_value[i] = counter_config[i].count; 580 reset_value[i] = counter_config[i].count;
586 pmc_setup_one_p4_counter(i); 581 pmc_setup_one_p4_counter(i);
587 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); 582 wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address,
583 -(u64)counter_config[i].count);
588 } else { 584 } else {
589 reset_value[i] = 0; 585 reset_value[i] = 0;
590 } 586 }
@@ -624,14 +620,16 @@ static int p4_check_ctrs(struct pt_regs * const regs,
624 620
625 real = VIRT_CTR(stag, i); 621 real = VIRT_CTR(stag, i);
626 622
627 CCCR_READ(low, high, real); 623 rdmsr(p4_counters[real].cccr_address, low, high);
628 CTR_READ(ctr, high, real); 624 rdmsr(p4_counters[real].counter_address, ctr, high);
629 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { 625 if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) {
630 oprofile_add_sample(regs, i); 626 oprofile_add_sample(regs, i);
631 CTR_WRITE(reset_value[i], real); 627 wrmsrl(p4_counters[real].counter_address,
628 -(u64)reset_value[i]);
632 CCCR_CLEAR_OVF(low); 629 CCCR_CLEAR_OVF(low);
633 CCCR_WRITE(low, high, real); 630 wrmsr(p4_counters[real].cccr_address, low, high);
634 CTR_WRITE(reset_value[i], real); 631 wrmsrl(p4_counters[real].counter_address,
632 -(u64)reset_value[i]);
635 } 633 }
636 } 634 }
637 635
@@ -653,9 +651,9 @@ static void p4_start(struct op_msrs const * const msrs)
653 for (i = 0; i < num_counters; ++i) { 651 for (i = 0; i < num_counters; ++i) {
654 if (!reset_value[i]) 652 if (!reset_value[i])
655 continue; 653 continue;
656 CCCR_READ(low, high, VIRT_CTR(stag, i)); 654 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
657 CCCR_SET_ENABLE(low); 655 CCCR_SET_ENABLE(low);
658 CCCR_WRITE(low, high, VIRT_CTR(stag, i)); 656 wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
659 } 657 }
660} 658}
661 659
@@ -670,9 +668,9 @@ static void p4_stop(struct op_msrs const * const msrs)
670 for (i = 0; i < num_counters; ++i) { 668 for (i = 0; i < num_counters; ++i) {
671 if (!reset_value[i]) 669 if (!reset_value[i])
672 continue; 670 continue;
673 CCCR_READ(low, high, VIRT_CTR(stag, i)); 671 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
674 CCCR_SET_DISABLE(low); 672 CCCR_SET_DISABLE(low);
675 CCCR_WRITE(low, high, VIRT_CTR(stag, i)); 673 wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
676 } 674 }
677} 675}
678 676
@@ -680,8 +678,8 @@ static void p4_shutdown(struct op_msrs const * const msrs)
680{ 678{
681 int i; 679 int i;
682 680
683 for (i = 0 ; i < num_counters ; ++i) { 681 for (i = 0; i < num_counters; ++i) {
684 if (CTR_IS_RESERVED(msrs, i)) 682 if (msrs->counters[i].addr)
685 release_perfctr_nmi(msrs->counters[i].addr); 683 release_perfctr_nmi(msrs->counters[i].addr);
686 } 684 }
687 /* 685 /*
@@ -689,15 +687,15 @@ static void p4_shutdown(struct op_msrs const * const msrs)
689 * conjunction with the counter registers (hence the starting offset). 687 * conjunction with the counter registers (hence the starting offset).
690 * This saves a few bits. 688 * This saves a few bits.
691 */ 689 */
692 for (i = num_counters ; i < num_controls ; ++i) { 690 for (i = num_counters; i < num_controls; ++i) {
693 if (CTRL_IS_RESERVED(msrs, i)) 691 if (msrs->controls[i].addr)
694 release_evntsel_nmi(msrs->controls[i].addr); 692 release_evntsel_nmi(msrs->controls[i].addr);
695 } 693 }
696} 694}
697 695
698 696
699#ifdef CONFIG_SMP 697#ifdef CONFIG_SMP
700struct op_x86_model_spec const op_p4_ht2_spec = { 698struct op_x86_model_spec op_p4_ht2_spec = {
701 .num_counters = NUM_COUNTERS_HT2, 699 .num_counters = NUM_COUNTERS_HT2,
702 .num_controls = NUM_CONTROLS_HT2, 700 .num_controls = NUM_CONTROLS_HT2,
703 .fill_in_addresses = &p4_fill_in_addresses, 701 .fill_in_addresses = &p4_fill_in_addresses,
@@ -709,7 +707,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = {
709}; 707};
710#endif 708#endif
711 709
712struct op_x86_model_spec const op_p4_spec = { 710struct op_x86_model_spec op_p4_spec = {
713 .num_counters = NUM_COUNTERS_NON_HT, 711 .num_counters = NUM_COUNTERS_NON_HT,
714 .num_controls = NUM_CONTROLS_NON_HT, 712 .num_controls = NUM_CONTROLS_NON_HT,
715 .fill_in_addresses = &p4_fill_in_addresses, 713 .fill_in_addresses = &p4_fill_in_addresses,
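
Most of the churn in the P4 hunks is mechanical: the CCCR_READ/ESCR_READ/CTR_WRITE wrappers go away in favour of rdmsr()/wrmsr(), which work on two 32-bit halves, and rdmsrl()/wrmsrl(), which take a single u64. The relationship between the two forms, as a tiny stand-alone model:

#include <stdint.h>
#include <stdio.h>

/* how a 64-bit MSR value maps onto the lo/hi pair used by rdmsr()/wrmsr(),
 * versus the single u64 used by rdmsrl()/wrmsrl()                         */
static void split(uint64_t val, uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)val;
	*hi = (uint32_t)(val >> 32);
}

static uint64_t join(uint32_t lo, uint32_t hi)
{
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint64_t ctr = (uint64_t)-100000;  /* a counter primed to overflow */
	uint32_t lo, hi;

	split(ctr, &lo, &hi);
	printf("lo=%#x hi=%#x rejoined=%#llx\n",
	       lo, hi, (unsigned long long)join(lo, hi));
	return 0;
}
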
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 4da7230b3d17..8eb05878554c 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -10,6 +10,7 @@
10 * @author Philippe Elie 10 * @author Philippe Elie
11 * @author Graydon Hoare 11 * @author Graydon Hoare
12 * @author Andi Kleen 12 * @author Andi Kleen
13 * @author Robert Richter <robert.richter@amd.com>
13 */ 14 */
14 15
15#include <linux/oprofile.h> 16#include <linux/oprofile.h>
@@ -18,7 +19,6 @@
18#include <asm/msr.h> 19#include <asm/msr.h>
19#include <asm/apic.h> 20#include <asm/apic.h>
20#include <asm/nmi.h> 21#include <asm/nmi.h>
21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -26,20 +26,7 @@
26static int num_counters = 2; 26static int num_counters = 2;
27static int counter_width = 32; 27static int counter_width = 32;
28 28
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) 29#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21))
30#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1))))
31
32#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
33#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
34#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
35#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
36#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
37#define CTRL_CLEAR(x) (x &= (1<<21))
38#define CTRL_SET_ENABLE(val) (val |= 1<<20)
39#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
40#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
41#define CTRL_SET_UM(val, m) (val |= (m << 8))
42#define CTRL_SET_EVENT(val, e) (val |= e)
43 30
44static u64 *reset_value; 31static u64 *reset_value;
45 32
@@ -63,9 +50,10 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
63} 50}
64 51
65 52
66static void ppro_setup_ctrs(struct op_msrs const * const msrs) 53static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
54 struct op_msrs const * const msrs)
67{ 55{
68 unsigned int low, high; 56 u64 val;
69 int i; 57 int i;
70 58
71 if (!reset_value) { 59 if (!reset_value) {
@@ -93,36 +81,30 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
93 } 81 }
94 82
95 /* clear all counters */ 83 /* clear all counters */
96 for (i = 0 ; i < num_counters; ++i) { 84 for (i = 0; i < num_counters; ++i) {
97 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 85 if (unlikely(!msrs->controls[i].addr))
98 continue; 86 continue;
99 CTRL_READ(low, high, msrs, i); 87 rdmsrl(msrs->controls[i].addr, val);
100 CTRL_CLEAR(low); 88 val &= model->reserved;
101 CTRL_WRITE(low, high, msrs, i); 89 wrmsrl(msrs->controls[i].addr, val);
102 } 90 }
103 91
104 /* avoid a false detection of ctr overflows in NMI handler */ 92 /* avoid a false detection of ctr overflows in NMI handler */
105 for (i = 0; i < num_counters; ++i) { 93 for (i = 0; i < num_counters; ++i) {
106 if (unlikely(!CTR_IS_RESERVED(msrs, i))) 94 if (unlikely(!msrs->counters[i].addr))
107 continue; 95 continue;
108 wrmsrl(msrs->counters[i].addr, -1LL); 96 wrmsrl(msrs->counters[i].addr, -1LL);
109 } 97 }
110 98
111 /* enable active counters */ 99 /* enable active counters */
112 for (i = 0; i < num_counters; ++i) { 100 for (i = 0; i < num_counters; ++i) {
113 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { 101 if (counter_config[i].enabled && msrs->counters[i].addr) {
114 reset_value[i] = counter_config[i].count; 102 reset_value[i] = counter_config[i].count;
115
116 wrmsrl(msrs->counters[i].addr, -reset_value[i]); 103 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
117 104 rdmsrl(msrs->controls[i].addr, val);
118 CTRL_READ(low, high, msrs, i); 105 val &= model->reserved;
119 CTRL_CLEAR(low); 106 val |= op_x86_get_ctrl(model, &counter_config[i]);
120 CTRL_SET_ENABLE(low); 107 wrmsrl(msrs->controls[i].addr, val);
121 CTRL_SET_USR(low, counter_config[i].user);
122 CTRL_SET_KERN(low, counter_config[i].kernel);
123 CTRL_SET_UM(low, counter_config[i].unit_mask);
124 CTRL_SET_EVENT(low, counter_config[i].event);
125 CTRL_WRITE(low, high, msrs, i);
126 } else { 108 } else {
127 reset_value[i] = 0; 109 reset_value[i] = 0;
128 } 110 }
@@ -143,14 +125,14 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
143 if (unlikely(!reset_value)) 125 if (unlikely(!reset_value))
144 goto out; 126 goto out;
145 127
146 for (i = 0 ; i < num_counters; ++i) { 128 for (i = 0; i < num_counters; ++i) {
147 if (!reset_value[i]) 129 if (!reset_value[i])
148 continue; 130 continue;
149 rdmsrl(msrs->counters[i].addr, val); 131 rdmsrl(msrs->counters[i].addr, val);
150 if (CTR_OVERFLOWED(val)) { 132 if (val & (1ULL << (counter_width - 1)))
151 oprofile_add_sample(regs, i); 133 continue;
152 wrmsrl(msrs->counters[i].addr, -reset_value[i]); 134 oprofile_add_sample(regs, i);
153 } 135 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
154 } 136 }
155 137
156out: 138out:
@@ -171,16 +153,16 @@ out:
171 153
172static void ppro_start(struct op_msrs const * const msrs) 154static void ppro_start(struct op_msrs const * const msrs)
173{ 155{
174 unsigned int low, high; 156 u64 val;
175 int i; 157 int i;
176 158
177 if (!reset_value) 159 if (!reset_value)
178 return; 160 return;
179 for (i = 0; i < num_counters; ++i) { 161 for (i = 0; i < num_counters; ++i) {
180 if (reset_value[i]) { 162 if (reset_value[i]) {
181 CTRL_READ(low, high, msrs, i); 163 rdmsrl(msrs->controls[i].addr, val);
182 CTRL_SET_ACTIVE(low); 164 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
183 CTRL_WRITE(low, high, msrs, i); 165 wrmsrl(msrs->controls[i].addr, val);
184 } 166 }
185 } 167 }
186} 168}
@@ -188,7 +170,7 @@ static void ppro_start(struct op_msrs const * const msrs)
188 170
189static void ppro_stop(struct op_msrs const * const msrs) 171static void ppro_stop(struct op_msrs const * const msrs)
190{ 172{
191 unsigned int low, high; 173 u64 val;
192 int i; 174 int i;
193 175
194 if (!reset_value) 176 if (!reset_value)
@@ -196,9 +178,9 @@ static void ppro_stop(struct op_msrs const * const msrs)
196 for (i = 0; i < num_counters; ++i) { 178 for (i = 0; i < num_counters; ++i) {
197 if (!reset_value[i]) 179 if (!reset_value[i])
198 continue; 180 continue;
199 CTRL_READ(low, high, msrs, i); 181 rdmsrl(msrs->controls[i].addr, val);
200 CTRL_SET_INACTIVE(low); 182 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
201 CTRL_WRITE(low, high, msrs, i); 183 wrmsrl(msrs->controls[i].addr, val);
202 } 184 }
203} 185}
204 186
@@ -206,12 +188,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
206{ 188{
207 int i; 189 int i;
208 190
209 for (i = 0 ; i < num_counters ; ++i) { 191 for (i = 0; i < num_counters; ++i) {
210 if (CTR_IS_RESERVED(msrs, i)) 192 if (msrs->counters[i].addr)
211 release_perfctr_nmi(MSR_P6_PERFCTR0 + i); 193 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
212 } 194 }
213 for (i = 0 ; i < num_counters ; ++i) { 195 for (i = 0; i < num_counters; ++i) {
214 if (CTRL_IS_RESERVED(msrs, i)) 196 if (msrs->controls[i].addr)
215 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); 197 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
216 } 198 }
217 if (reset_value) { 199 if (reset_value) {
@@ -222,8 +204,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
222 204
223 205
224struct op_x86_model_spec op_ppro_spec = { 206struct op_x86_model_spec op_ppro_spec = {
225 .num_counters = 2, /* can be overriden */ 207 .num_counters = 2,
226 .num_controls = 2, /* dito */ 208 .num_controls = 2,
209 .reserved = MSR_PPRO_EVENTSEL_RESERVED,
227 .fill_in_addresses = &ppro_fill_in_addresses, 210 .fill_in_addresses = &ppro_fill_in_addresses,
228 .setup_ctrs = &ppro_setup_ctrs, 211 .setup_ctrs = &ppro_setup_ctrs,
229 .check_ctrs = &ppro_check_ctrs, 212 .check_ctrs = &ppro_check_ctrs,
@@ -241,7 +224,7 @@ struct op_x86_model_spec op_ppro_spec = {
241 * the specific CPU. 224 * the specific CPU.
242 */ 225 */
243 226
244void arch_perfmon_setup_counters(void) 227static void arch_perfmon_setup_counters(void)
245{ 228{
246 union cpuid10_eax eax; 229 union cpuid10_eax eax;
247 230
@@ -251,19 +234,25 @@ void arch_perfmon_setup_counters(void)
251 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && 234 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
252 current_cpu_data.x86_model == 15) { 235 current_cpu_data.x86_model == 15) {
253 eax.split.version_id = 2; 236 eax.split.version_id = 2;
254 eax.split.num_counters = 2; 237 eax.split.num_events = 2;
255 eax.split.bit_width = 40; 238 eax.split.bit_width = 40;
256 } 239 }
257 240
258 num_counters = eax.split.num_counters; 241 num_counters = eax.split.num_events;
259 242
260 op_arch_perfmon_spec.num_counters = num_counters; 243 op_arch_perfmon_spec.num_counters = num_counters;
261 op_arch_perfmon_spec.num_controls = num_counters; 244 op_arch_perfmon_spec.num_controls = num_counters;
262 op_ppro_spec.num_counters = num_counters; 245}
263 op_ppro_spec.num_controls = num_counters; 246
247static int arch_perfmon_init(struct oprofile_operations *ignore)
248{
249 arch_perfmon_setup_counters();
250 return 0;
264} 251}
265 252
266struct op_x86_model_spec op_arch_perfmon_spec = { 253struct op_x86_model_spec op_arch_perfmon_spec = {
254 .reserved = MSR_PPRO_EVENTSEL_RESERVED,
255 .init = &arch_perfmon_init,
267 /* num_counters/num_controls filled in at runtime */ 256 /* num_counters/num_controls filled in at runtime */
268 .fill_in_addresses = &ppro_fill_in_addresses, 257 .fill_in_addresses = &ppro_fill_in_addresses,
269 /* user space does the cpuid check for available events */ 258 /* user space does the cpuid check for available events */
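
ppro_check_ctrs() now spells out the overflow test: the counter was loaded with -reset_value, counts upward, and once it wraps past zero the top implemented bit (counter_width - 1) reads back clear. A small sketch of that convention, assuming a 40-bit counter purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define COUNTER_WIDTH 40   /* e.g. arch perfmon may report 40-bit counters */

/* counter registers wrap within COUNTER_WIDTH bits */
static uint64_t prime(uint64_t reset_value)
{
	return (-reset_value) & ((1ULL << COUNTER_WIDTH) - 1);
}

static int overflowed(uint64_t val)
{
	/* bit is clear once the counter has wrapped past zero */
	return !(val & (1ULL << (COUNTER_WIDTH - 1)));
}

int main(void)
{
	uint64_t val = prime(3);            /* overflow after 3 events */

	for (int i = 0; i < 5; i++) {
		printf("val=%#llx overflowed=%d\n",
		       (unsigned long long)val, overflowed(val));
		val = (val + 1) & ((1ULL << COUNTER_WIDTH) - 1);
	}
	return 0;
}
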
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 825e79064d64..7b8e75d16081 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -6,51 +6,66 @@
6 * @remark Read the file COPYING 6 * @remark Read the file COPYING
7 * 7 *
8 * @author Graydon Hoare 8 * @author Graydon Hoare
9 * @author Robert Richter <robert.richter@amd.com>
9 */ 10 */
10 11
11#ifndef OP_X86_MODEL_H 12#ifndef OP_X86_MODEL_H
12#define OP_X86_MODEL_H 13#define OP_X86_MODEL_H
13 14
14struct op_saved_msr { 15#include <asm/types.h>
15 unsigned int high; 16#include <asm/perf_event.h>
16 unsigned int low;
17};
18 17
19struct op_msr { 18struct op_msr {
20 unsigned long addr; 19 unsigned long addr;
21 struct op_saved_msr saved; 20 u64 saved;
22}; 21};
23 22
24struct op_msrs { 23struct op_msrs {
25 struct op_msr *counters; 24 struct op_msr *counters;
26 struct op_msr *controls; 25 struct op_msr *controls;
26 struct op_msr *multiplex;
27}; 27};
28 28
29struct pt_regs; 29struct pt_regs;
30 30
31struct oprofile_operations;
32
31/* The model vtable abstracts the differences between 33/* The model vtable abstracts the differences between
32 * various x86 CPU models' perfctr support. 34 * various x86 CPU models' perfctr support.
33 */ 35 */
34struct op_x86_model_spec { 36struct op_x86_model_spec {
35 int (*init)(struct oprofile_operations *ops); 37 unsigned int num_counters;
36 void (*exit)(void); 38 unsigned int num_controls;
37 unsigned int num_counters; 39 unsigned int num_virt_counters;
38 unsigned int num_controls; 40 u64 reserved;
39 void (*fill_in_addresses)(struct op_msrs * const msrs); 41 u16 event_mask;
40 void (*setup_ctrs)(struct op_msrs const * const msrs); 42 int (*init)(struct oprofile_operations *ops);
41 int (*check_ctrs)(struct pt_regs * const regs, 43 void (*exit)(void);
42 struct op_msrs const * const msrs); 44 void (*fill_in_addresses)(struct op_msrs * const msrs);
43 void (*start)(struct op_msrs const * const msrs); 45 void (*setup_ctrs)(struct op_x86_model_spec const *model,
44 void (*stop)(struct op_msrs const * const msrs); 46 struct op_msrs const * const msrs);
45 void (*shutdown)(struct op_msrs const * const msrs); 47 int (*check_ctrs)(struct pt_regs * const regs,
48 struct op_msrs const * const msrs);
49 void (*start)(struct op_msrs const * const msrs);
50 void (*stop)(struct op_msrs const * const msrs);
51 void (*shutdown)(struct op_msrs const * const msrs);
52#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
53 void (*switch_ctrl)(struct op_x86_model_spec const *model,
54 struct op_msrs const * const msrs);
55#endif
46}; 56};
47 57
58struct op_counter_config;
59
60extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
61 struct op_counter_config *counter_config);
62extern int op_x86_phys_to_virt(int phys);
63extern int op_x86_virt_to_phys(int virt);
64
48extern struct op_x86_model_spec op_ppro_spec; 65extern struct op_x86_model_spec op_ppro_spec;
49extern struct op_x86_model_spec const op_p4_spec; 66extern struct op_x86_model_spec op_p4_spec;
50extern struct op_x86_model_spec const op_p4_ht2_spec; 67extern struct op_x86_model_spec op_p4_ht2_spec;
51extern struct op_x86_model_spec const op_amd_spec; 68extern struct op_x86_model_spec op_amd_spec;
52extern struct op_x86_model_spec op_arch_perfmon_spec; 69extern struct op_x86_model_spec op_arch_perfmon_spec;
53 70
54extern void arch_perfmon_setup_counters(void);
55
56#endif /* OP_X86_MODEL_H */ 71#endif /* OP_X86_MODEL_H */
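
With the header change above, the per-model data (num_counters, reserved, event_mask) travels in the same struct as the callbacks, and setup_ctrs()/switch_ctrl() receive the model pointer so shared code can apply model->reserved without per-model macros. A hedged sketch of how a generic driver loop might call through such a vtable; the types and names are simplified stand-ins, not the real nmi_int.c:

#include <stdio.h>

struct msrs { int dummy; };

struct model_spec {
	unsigned int num_counters;
	unsigned long long reserved;
	void (*setup_ctrs)(const struct model_spec *model,
			   const struct msrs *msrs);
	void (*start)(const struct msrs *msrs);
	void (*stop)(const struct msrs *msrs);
};

static void demo_setup(const struct model_spec *model, const struct msrs *msrs)
{
	(void)msrs;
	printf("setting up %u counters, reserved mask %#llx\n",
	       model->num_counters, model->reserved);
}

static void demo_start(const struct msrs *msrs) { (void)msrs; puts("start"); }
static void demo_stop(const struct msrs *msrs)  { (void)msrs; puts("stop"); }

static const struct model_spec demo_spec = {
	.num_counters = 4,
	.reserved     = 0x00200000ULL,   /* illustrative only */
	.setup_ctrs   = demo_setup,
	.start        = demo_start,
	.stop         = demo_stop,
};

int main(void)
{
	struct msrs msrs = { 0 };
	const struct model_spec *model = &demo_spec;

	/* generic code needs no per-model macros: it passes the model down
	 * so each callback can consult model->reserved itself             */
	model->setup_ctrs(model, &msrs);
	model->start(&msrs);
	model->stop(&msrs);
	return 0;
}
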
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 3ffa10df20b9..572ee9782f2a 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -15,63 +15,6 @@
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
16 */ 16 */
17 17
18#ifdef CONFIG_NUMA
19
20#define BUS_NR 256
21
22#ifdef CONFIG_X86_64
23
24static int mp_bus_to_node[BUS_NR];
25
26void set_mp_bus_to_node(int busnum, int node)
27{
28 if (busnum >= 0 && busnum < BUS_NR)
29 mp_bus_to_node[busnum] = node;
30}
31
32int get_mp_bus_to_node(int busnum)
33{
34 int node = -1;
35
36 if (busnum < 0 || busnum > (BUS_NR - 1))
37 return node;
38
39 node = mp_bus_to_node[busnum];
40
41 /*
42 * let numa_node_id to decide it later in dma_alloc_pages
43 * if there is no ram on that node
44 */
45 if (node != -1 && !node_online(node))
46 node = -1;
47
48 return node;
49}
50
51#else /* CONFIG_X86_32 */
52
53static unsigned char mp_bus_to_node[BUS_NR];
54
55void set_mp_bus_to_node(int busnum, int node)
56{
57 if (busnum >= 0 && busnum < BUS_NR)
58 mp_bus_to_node[busnum] = (unsigned char) node;
59}
60
61int get_mp_bus_to_node(int busnum)
62{
63 int node;
64
65 if (busnum < 0 || busnum > (BUS_NR - 1))
66 return 0;
67 node = mp_bus_to_node[busnum];
68 return node;
69}
70
71#endif /* CONFIG_X86_32 */
72
73#endif /* CONFIG_NUMA */
74
75#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
76 19
77/* 20/*
@@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void)
301 u64 val; 244 u64 val;
302 u32 address; 245 u32 address;
303 246
304#ifdef CONFIG_NUMA
305 for (i = 0; i < BUS_NR; i++)
306 mp_bus_to_node[i] = -1;
307#endif
308
309 if (!early_pci_allowed()) 247 if (!early_pci_allowed())
310 return -1; 248 return -1;
311 249
@@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void)
346 node = (reg >> 4) & 0x07; 284 node = (reg >> 4) & 0x07;
347#ifdef CONFIG_NUMA 285#ifdef CONFIG_NUMA
348 for (j = min_bus; j <= max_bus; j++) 286 for (j = min_bus; j <= max_bus; j++)
349 mp_bus_to_node[j] = (unsigned char) node; 287 set_mp_bus_to_node(j, node);
350#endif 288#endif
351 link = (reg >> 8) & 0x03; 289 link = (reg >> 8) & 0x03;
352 290
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2202b6257b82..1331fcf26143 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
600{ 600{
601 return pci_scan_bus_on_node(busno, &pci_root_ops, -1); 601 return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
602} 602}
603
604/*
605 * NUMA info for PCI busses
606 *
607 * Early arch code is responsible for filling in reasonable values here.
608 * A node id of "-1" means "use current node". In other words, if a bus
609 * has a -1 node id, it's not tightly coupled to any particular chunk
610 * of memory (as is the case on some Nehalem systems).
611 */
612#ifdef CONFIG_NUMA
613
614#define BUS_NR 256
615
616#ifdef CONFIG_X86_64
617
618static int mp_bus_to_node[BUS_NR] = {
619 [0 ... BUS_NR - 1] = -1
620};
621
622void set_mp_bus_to_node(int busnum, int node)
623{
624 if (busnum >= 0 && busnum < BUS_NR)
625 mp_bus_to_node[busnum] = node;
626}
627
628int get_mp_bus_to_node(int busnum)
629{
630 int node = -1;
631
632 if (busnum < 0 || busnum > (BUS_NR - 1))
633 return node;
634
635 node = mp_bus_to_node[busnum];
636
637 /*
638 * let numa_node_id to decide it later in dma_alloc_pages
639 * if there is no ram on that node
640 */
641 if (node != -1 && !node_online(node))
642 node = -1;
643
644 return node;
645}
646
647#else /* CONFIG_X86_32 */
648
649static int mp_bus_to_node[BUS_NR] = {
650 [0 ... BUS_NR - 1] = -1
651};
652
653void set_mp_bus_to_node(int busnum, int node)
654{
655 if (busnum >= 0 && busnum < BUS_NR)
656 mp_bus_to_node[busnum] = (unsigned char) node;
657}
658
659int get_mp_bus_to_node(int busnum)
660{
661 int node;
662
663 if (busnum < 0 || busnum > (BUS_NR - 1))
664 return 0;
665 node = mp_bus_to_node[busnum];
666 return node;
667}
668
669#endif /* CONFIG_X86_32 */
670
671#endif /* CONFIG_NUMA */
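
The table that replaces the amd_bus.c copy leans on GCC's designated range initializer to default every bus to -1 ("no specific node"); consumers simply translate a bus number to a node before any node-aware allocation. The same pattern, stand-alone (the range initializer is a GCC extension):

#include <stdio.h>

#define BUS_NR 256

/* default every bus to "no specific node" */
static int bus_to_node[BUS_NR] = {
	[0 ... BUS_NR - 1] = -1
};

static void set_bus_to_node(int busnum, int node)
{
	if (busnum >= 0 && busnum < BUS_NR)
		bus_to_node[busnum] = node;
}

static int get_bus_to_node(int busnum)
{
	if (busnum < 0 || busnum >= BUS_NR)
		return -1;
	return bus_to_node[busnum];
}

int main(void)
{
	set_bus_to_node(3, 1);     /* e.g. firmware says bus 3 sits on node 1 */
	printf("bus 3 -> node %d\n", get_bus_to_node(3));
	printf("bus 7 -> node %d\n", get_bus_to_node(7));  /* still -1 */
	return 0;
}
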
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd13c3e4c6db..347d882b3bb3 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -192,13 +192,14 @@ struct pci_raw_ops pci_direct_conf2 = {
192static int __init pci_sanity_check(struct pci_raw_ops *o) 192static int __init pci_sanity_check(struct pci_raw_ops *o)
193{ 193{
194 u32 x = 0; 194 u32 x = 0;
195 int devfn; 195 int year, devfn;
196 196
197 if (pci_probe & PCI_NO_CHECKS) 197 if (pci_probe & PCI_NO_CHECKS)
198 return 1; 198 return 1;
199 /* Assume Type 1 works for newer systems. 199 /* Assume Type 1 works for newer systems.
200 This handles machines that don't have anything on PCI Bus 0. */ 200 This handles machines that don't have anything on PCI Bus 0. */
201 if (dmi_get_year(DMI_BIOS_DATE) >= 2001) 201 dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL);
202 if (year >= 2001)
202 return 1; 203 return 1;
203 204
204 for (devfn = 0; devfn < 0x100; devfn++) { 205 for (devfn = 0; devfn < 0x100; devfn++) {
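
pci_sanity_check() switches from dmi_get_year() to dmi_get_date(), which parses the whole DMI date and lets callers pass NULL for fields they do not need; only the year matters here. A rough user-space model of that "parse once, check the year" logic; the MM/DD/YYYY format and the helper below are illustrative stand-ins, not the dmi core API:

#include <stdio.h>

/* toy stand-in for dmi_get_date(DMI_BIOS_DATE, &year, &month, &day) */
static int parse_bios_date(const char *dmi, int *year, int *month, int *day)
{
	int m = 0, d = 0, y = 0;

	if (sscanf(dmi, "%d/%d/%d", &m, &d, &y) < 3)
		return 0;
	if (year)  *year = y;
	if (month) *month = m;
	if (day)   *day = d;
	return 1;
}

int main(void)
{
	int year = 0;

	parse_bios_date("03/14/2005", &year, NULL, NULL);

	/* mirror of the pci_sanity_check() shortcut: trust Type 1 config
	 * access on anything with a 2001-or-newer BIOS                   */
	printf("BIOS year %d -> %s\n", year,
	       year >= 2001 ? "assume type 1 works" : "probe further");
	return 0;
}
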
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 712443ec6d43..602c172d3bd5 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -13,10 +13,14 @@
13#include <linux/pci.h> 13#include <linux/pci.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/sfi_acpi.h>
16#include <linux/bitmap.h> 17#include <linux/bitmap.h>
17#include <linux/sort.h> 18#include <linux/sort.h>
18#include <asm/e820.h> 19#include <asm/e820.h>
19#include <asm/pci_x86.h> 20#include <asm/pci_x86.h>
21#include <asm/acpi.h>
22
23#define PREFIX "PCI: "
20 24
21/* aperture is up to 256MB but BIOS may reserve less */ 25/* aperture is up to 256MB but BIOS may reserve less */
22#define MMCONFIG_APER_MIN (2 * 1024*1024) 26#define MMCONFIG_APER_MIN (2 * 1024*1024)
@@ -491,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early)
491 (unsigned int)cfg->start_bus_number, 495 (unsigned int)cfg->start_bus_number,
492 (unsigned int)cfg->end_bus_number); 496 (unsigned int)cfg->end_bus_number);
493 497
494 if (!early) 498 if (!early && !acpi_disabled)
495 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); 499 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
496 500
497 if (valid) 501 if (valid)
@@ -606,7 +610,7 @@ static void __init __pci_mmcfg_init(int early)
606 } 610 }
607 611
608 if (!known_bridge) 612 if (!known_bridge)
609 acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); 613 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
610 614
611 pci_mmcfg_reject_broken(early); 615 pci_mmcfg_reject_broken(early);
612 616
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 8b2d561046a3..f10a7e94a84c 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -11,9 +11,9 @@
11 11
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/acpi.h>
15#include <asm/e820.h> 14#include <asm/e820.h>
16#include <asm/pci_x86.h> 15#include <asm/pci_x86.h>
16#include <acpi/acpi.h>
17 17
18/* Assume systems with more busses have correct MCFG */ 18/* Assume systems with more busses have correct MCFG */
19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) 19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index b3d20b9cac63..8aa85f17667e 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -242,11 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
242 fix_processor_context(); 242 fix_processor_context();
243 243
244 do_fpu_end(); 244 do_fpu_end();
245 mtrr_ap_init(); 245 mtrr_bp_restore();
246
247#ifdef CONFIG_X86_OLD_MCE
248 mcheck_init(&boot_cpu_data);
249#endif
250} 246}
251 247
252/* Needed by apm.c */ 248/* Needed by apm.c */
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 88112b49f02c..6b4ffedb93c9 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@
122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ 122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) 123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
124 124
125VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) 125VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
126GCOV_PROFILE := n 126GCOV_PROFILE := n
127 127
128# 128#
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6a40b78b46aa..ee55754cc3c5 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts)
86 return 0; 86 return 0;
87} 87}
88 88
89notrace static noinline int do_realtime_coarse(struct timespec *ts)
90{
91 unsigned long seq;
92 do {
93 seq = read_seqbegin(&gtod->lock);
94 ts->tv_sec = gtod->wall_time_coarse.tv_sec;
95 ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
96 } while (unlikely(read_seqretry(&gtod->lock, seq)));
97 return 0;
98}
99
100notrace static noinline int do_monotonic_coarse(struct timespec *ts)
101{
102 unsigned long seq, ns, secs;
103 do {
104 seq = read_seqbegin(&gtod->lock);
105 secs = gtod->wall_time_coarse.tv_sec;
106 ns = gtod->wall_time_coarse.tv_nsec;
107 secs += gtod->wall_to_monotonic.tv_sec;
108 ns += gtod->wall_to_monotonic.tv_nsec;
109 } while (unlikely(read_seqretry(&gtod->lock, seq)));
110 vset_normalized_timespec(ts, secs, ns);
111 return 0;
112}
113
89notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 114notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
90{ 115{
91 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) 116 if (likely(gtod->sysctl_enabled))
92 switch (clock) { 117 switch (clock) {
93 case CLOCK_REALTIME: 118 case CLOCK_REALTIME:
94 return do_realtime(ts); 119 if (likely(gtod->clock.vread))
120 return do_realtime(ts);
121 break;
95 case CLOCK_MONOTONIC: 122 case CLOCK_MONOTONIC:
96 return do_monotonic(ts); 123 if (likely(gtod->clock.vread))
124 return do_monotonic(ts);
125 break;
126 case CLOCK_REALTIME_COARSE:
127 return do_realtime_coarse(ts);
128 case CLOCK_MONOTONIC_COARSE:
129 return do_monotonic_coarse(ts);
97 } 130 }
98 return vdso_fallback_gettime(clock, ts); 131 return vdso_fallback_gettime(clock, ts);
99} 132}
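
The new coarse vDSO clocks never touch the TSC; they just copy the kernel's last-tick timestamp inside a seqlock read loop and retry if the writer raced with them. A single-threaded toy of that retry protocol; memory barriers and the real gtod->lock plumbing are omitted for brevity:

#include <stdio.h>

/* a toy seqcount-protected timestamp, modelling gtod->wall_time_coarse */
static volatile unsigned seq;
static volatile long coarse_sec, coarse_nsec;

static unsigned read_begin(void)  { return seq; }
static int read_retry(unsigned s) { return (s & 1) || s != seq; }

/* writer side: updated from the timer tick in the real kernel */
static void update_coarse(long sec, long nsec)
{
	seq++;                 /* odd: update in progress */
	coarse_sec  = sec;
	coarse_nsec = nsec;
	seq++;                 /* even again: consistent  */
}

/* reader side: the shape of do_realtime_coarse() */
static void do_realtime_coarse(long *sec, long *nsec)
{
	unsigned s;

	do {
		s = read_begin();
		*sec  = coarse_sec;
		*nsec = coarse_nsec;
	} while (read_retry(s));
}

int main(void)
{
	long sec, nsec;

	update_coarse(1000, 250000000);
	do_realtime_coarse(&sec, &nsec);
	printf("coarse time: %ld.%09ld\n", sec, nsec);
	return 0;
}
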
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 7410640db173..3bb4fc21f4f2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -8,6 +8,7 @@ endif
8# Make sure early boot has no stackprotector 8# Make sure early boot has no stackprotector
9nostackp := $(call cc-option, -fno-stack-protector) 9nostackp := $(call cc-option, -fno-stack-protector)
10CFLAGS_enlighten.o := $(nostackp) 10CFLAGS_enlighten.o := $(nostackp)
11CFLAGS_mmu.o := $(nostackp)
11 12
12obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
13 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
@@ -16,3 +17,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
16obj-$(CONFIG_SMP) += smp.o 17obj-$(CONFIG_SMP) += smp.o
17obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
18obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o 19obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
20
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index eb33aaa8415d..544eb7496531 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -51,6 +51,7 @@
51#include <asm/pgtable.h> 51#include <asm/pgtable.h>
52#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
53#include <asm/reboot.h> 53#include <asm/reboot.h>
54#include <asm/stackprotector.h>
54 55
55#include "xen-ops.h" 56#include "xen-ops.h"
56#include "mmu.h" 57#include "mmu.h"
@@ -330,18 +331,28 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
330 unsigned long frames[pages]; 331 unsigned long frames[pages];
331 int f; 332 int f;
332 333
333 /* A GDT can be up to 64k in size, which corresponds to 8192 334 /*
334 8-byte entries, or 16 4k pages.. */ 335 * A GDT can be up to 64k in size, which corresponds to 8192
336 * 8-byte entries, or 16 4k pages..
337 */
335 338
336 BUG_ON(size > 65536); 339 BUG_ON(size > 65536);
337 BUG_ON(va & ~PAGE_MASK); 340 BUG_ON(va & ~PAGE_MASK);
338 341
339 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 342 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
340 int level; 343 int level;
341 pte_t *ptep = lookup_address(va, &level); 344 pte_t *ptep;
342 unsigned long pfn, mfn; 345 unsigned long pfn, mfn;
343 void *virt; 346 void *virt;
344 347
348 /*
349 * The GDT is per-cpu and is in the percpu data area.
350 * That can be virtually mapped, so we need to do a
351 * page-walk to get the underlying MFN for the
352 * hypercall. The page can also be in the kernel's
353 * linear range, so we need to RO that mapping too.
354 */
355 ptep = lookup_address(va, &level);
345 BUG_ON(ptep == NULL); 356 BUG_ON(ptep == NULL);
346 357
347 pfn = pte_pfn(*ptep); 358 pfn = pte_pfn(*ptep);
@@ -358,6 +369,44 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
358 BUG(); 369 BUG();
359} 370}
360 371
372/*
373 * load_gdt for early boot, when the gdt is only mapped once
374 */
375static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
376{
377 unsigned long va = dtr->address;
378 unsigned int size = dtr->size + 1;
379 unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
380 unsigned long frames[pages];
381 int f;
382
383 /*
384 * A GDT can be up to 64k in size, which corresponds to 8192
385 * 8-byte entries, or 16 4k pages..
386 */
387
388 BUG_ON(size > 65536);
389 BUG_ON(va & ~PAGE_MASK);
390
391 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
392 pte_t pte;
393 unsigned long pfn, mfn;
394
395 pfn = virt_to_pfn(va);
396 mfn = pfn_to_mfn(pfn);
397
398 pte = pfn_pte(pfn, PAGE_KERNEL_RO);
399
400 if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
401 BUG();
402
403 frames[f] = mfn;
404 }
405
406 if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
407 BUG();
408}
409
361static void load_TLS_descriptor(struct thread_struct *t, 410static void load_TLS_descriptor(struct thread_struct *t,
362 unsigned int cpu, unsigned int i) 411 unsigned int cpu, unsigned int i)
363{ 412{
@@ -581,6 +630,29 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
581 preempt_enable(); 630 preempt_enable();
582} 631}
583 632
633/*
634 * Version of write_gdt_entry for use at early boot-time needed to
635 * update an entry as simply as possible.
636 */
637static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
638 const void *desc, int type)
639{
640 switch (type) {
641 case DESC_LDT:
642 case DESC_TSS:
643 /* ignore */
644 break;
645
646 default: {
647 xmaddr_t maddr = virt_to_machine(&dt[entry]);
648
649 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
650 dt[entry] = *(struct desc_struct *)desc;
651 }
652
653 }
654}
655
584static void xen_load_sp0(struct tss_struct *tss, 656static void xen_load_sp0(struct tss_struct *tss,
585 struct thread_struct *thread) 657 struct thread_struct *thread)
586{ 658{
@@ -714,7 +786,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
714 set: 786 set:
715 base = ((u64)high << 32) | low; 787 base = ((u64)high << 32) | low;
716 if (HYPERVISOR_set_segment_base(which, base) != 0) 788 if (HYPERVISOR_set_segment_base(which, base) != 0)
717 ret = -EFAULT; 789 ret = -EIO;
718 break; 790 break;
719#endif 791#endif
720 792
@@ -840,19 +912,9 @@ static const struct pv_info xen_info __initdata = {
840 912
841static const struct pv_init_ops xen_init_ops __initdata = { 913static const struct pv_init_ops xen_init_ops __initdata = {
842 .patch = xen_patch, 914 .patch = xen_patch,
843
844 .banner = xen_banner,
845 .memory_setup = xen_memory_setup,
846 .arch_setup = xen_arch_setup,
847 .post_allocator_init = xen_post_allocator_init,
848}; 915};
849 916
850static const struct pv_time_ops xen_time_ops __initdata = { 917static const struct pv_time_ops xen_time_ops __initdata = {
851 .time_init = xen_time_init,
852
853 .set_wallclock = xen_set_wallclock,
854 .get_wallclock = xen_get_wallclock,
855 .get_tsc_khz = xen_tsc_khz,
856 .sched_clock = xen_sched_clock, 918 .sched_clock = xen_sched_clock,
857}; 919};
858 920
@@ -918,8 +980,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
918 980
919static const struct pv_apic_ops xen_apic_ops __initdata = { 981static const struct pv_apic_ops xen_apic_ops __initdata = {
920#ifdef CONFIG_X86_LOCAL_APIC 982#ifdef CONFIG_X86_LOCAL_APIC
921 .setup_boot_clock = paravirt_nop,
922 .setup_secondary_clock = paravirt_nop,
923 .startup_ipi_hook = paravirt_nop, 983 .startup_ipi_hook = paravirt_nop,
924#endif 984#endif
925}; 985};
@@ -965,6 +1025,23 @@ static const struct machine_ops __initdata xen_machine_ops = {
965 .emergency_restart = xen_emergency_restart, 1025 .emergency_restart = xen_emergency_restart,
966}; 1026};
967 1027
1028/*
1029 * Set up the GDT and segment registers for -fstack-protector. Until
1030 * we do this, we have to be careful not to call any stack-protected
1031 * function, which is most of the kernel.
1032 */
1033static void __init xen_setup_stackprotector(void)
1034{
1035 pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
1036 pv_cpu_ops.load_gdt = xen_load_gdt_boot;
1037
1038 setup_stack_canary_segment(0);
1039 switch_to_new_gdt(0);
1040
1041 pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
1042 pv_cpu_ops.load_gdt = xen_load_gdt;
1043}
1044
968/* First C function to be called on Xen boot */ 1045/* First C function to be called on Xen boot */
969asmlinkage void __init xen_start_kernel(void) 1046asmlinkage void __init xen_start_kernel(void)
970{ 1047{
@@ -981,16 +1058,43 @@ asmlinkage void __init xen_start_kernel(void)
981 pv_time_ops = xen_time_ops; 1058 pv_time_ops = xen_time_ops;
982 pv_cpu_ops = xen_cpu_ops; 1059 pv_cpu_ops = xen_cpu_ops;
983 pv_apic_ops = xen_apic_ops; 1060 pv_apic_ops = xen_apic_ops;
984 pv_mmu_ops = xen_mmu_ops;
985 1061
986#ifdef CONFIG_X86_64 1062 x86_init.resources.memory_setup = xen_memory_setup;
1063 x86_init.oem.arch_setup = xen_arch_setup;
1064 x86_init.oem.banner = xen_banner;
1065
1066 x86_init.timers.timer_init = xen_time_init;
1067 x86_init.timers.setup_percpu_clockev = x86_init_noop;
1068 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
1069
1070 x86_platform.calibrate_tsc = xen_tsc_khz;
1071 x86_platform.get_wallclock = xen_get_wallclock;
1072 x86_platform.set_wallclock = xen_set_wallclock;
1073
987 /* 1074 /*
988 * Setup percpu state. We only need to do this for 64-bit 1075 * Set up some pagetable state before starting to set any ptes.
989 * because 32-bit already has %fs set properly.
990 */ 1076 */
991 load_percpu_segment(0);
992#endif
993 1077
1078 /* Prevent unwanted bits from being set in PTEs. */
1079 __supported_pte_mask &= ~_PAGE_GLOBAL;
1080 if (!xen_initial_domain())
1081 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1082
1083 __supported_pte_mask |= _PAGE_IOMAP;
1084
1085 xen_setup_features();
1086
1087 /* Get mfn list */
1088 if (!xen_feature(XENFEAT_auto_translated_physmap))
1089 xen_build_dynamic_phys_to_machine();
1090
1091 /*
1092 * Set up kernel GDT and segment registers, mainly so that
1093 * -fstack-protector code can be executed.
1094 */
1095 xen_setup_stackprotector();
1096
1097 xen_init_mmu_ops();
994 xen_init_irq_ops(); 1098 xen_init_irq_ops();
995 xen_init_cpuid_mask(); 1099 xen_init_cpuid_mask();
996 1100
@@ -1001,8 +1105,6 @@ asmlinkage void __init xen_start_kernel(void)
1001 set_xen_basic_apic_ops(); 1105 set_xen_basic_apic_ops();
1002#endif 1106#endif
1003 1107
1004 xen_setup_features();
1005
1006 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 1108 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1007 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; 1109 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1008 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; 1110 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1019,17 +1121,8 @@ asmlinkage void __init xen_start_kernel(void)
1019 1121
1020 xen_smp_init(); 1122 xen_smp_init();
1021 1123
1022 /* Get mfn list */
1023 if (!xen_feature(XENFEAT_auto_translated_physmap))
1024 xen_build_dynamic_phys_to_machine();
1025
1026 pgd = (pgd_t *)xen_start_info->pt_base; 1124 pgd = (pgd_t *)xen_start_info->pt_base;
1027 1125
1028 /* Prevent unwanted bits from being set in PTEs. */
1029 __supported_pte_mask &= ~_PAGE_GLOBAL;
1030 if (!xen_initial_domain())
1031 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1032
1033#ifdef CONFIG_X86_64 1126#ifdef CONFIG_X86_64
1034 /* Work out if we support NX */ 1127 /* Work out if we support NX */
1035 check_efer(); 1128 check_efer();
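
The enlighten.c hunks stop routing banner/timer/wallclock setup through pv_init_ops and pv_time_ops and instead overwrite entries in the new x86_init/x86_platform hook tables. A sketch of that "table of default hooks that a platform overrides early in boot" pattern, with made-up hook and function names:

#include <stdio.h>

/* default hooks, called unconditionally by generic boot code */
static void default_banner(void)      { puts("generic banner"); }
static void default_timer_init(void)  { puts("generic timer init"); }
static void default_clockev(void)     { puts("generic per-cpu clockevent"); }
static void noop(void)                { }

struct init_hooks {
	void (*banner)(void);
	void (*timer_init)(void);
	void (*setup_percpu_clockev)(void);
};

static struct init_hooks x86_init_demo = {
	.banner               = default_banner,
	.timer_init           = default_timer_init,
	.setup_percpu_clockev = default_clockev,
};

/* what a guest such as Xen does very early in boot */
static void xen_banner_demo(void)    { puts("Booting paravirtualized kernel"); }
static void xen_time_init_demo(void) { puts("xen timer init"); }

static void guest_override_hooks(void)
{
	x86_init_demo.banner               = xen_banner_demo;
	x86_init_demo.timer_init           = xen_time_init_demo;
	x86_init_demo.setup_percpu_clockev = noop;   /* x86_init_noop analogue */
}

int main(void)
{
	guest_override_hooks();

	/* the generic boot path stays free of #ifdefs and pv-ops indirection */
	x86_init_demo.banner();
	x86_init_demo.timer_init();
	x86_init_demo.setup_percpu_clockev();
	return 0;
}
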
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index cfd17799bd6d..9d30105a0c4a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -1,5 +1,7 @@
1#include <linux/hardirq.h> 1#include <linux/hardirq.h>
2 2
3#include <asm/x86_init.h>
4
3#include <xen/interface/xen.h> 5#include <xen/interface/xen.h>
4#include <xen/interface/sched.h> 6#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h> 7#include <xen/interface/vcpu.h>
@@ -112,8 +114,6 @@ static void xen_halt(void)
112} 114}
113 115
114static const struct pv_irq_ops xen_irq_ops __initdata = { 116static const struct pv_irq_ops xen_irq_ops __initdata = {
115 .init_IRQ = xen_init_IRQ,
116
117 .save_fl = PV_CALLEE_SAVE(xen_save_fl), 117 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), 118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), 119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
@@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
129void __init xen_init_irq_ops() 129void __init xen_init_irq_ops()
130{ 130{
131 pv_irq_ops = xen_irq_ops; 131 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ;
132} 133}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 4ceb28581652..3bf7b1d250ce 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1165 /* Get the "official" set of cpus referring to our pagetable. */ 1165 /* Get the "official" set of cpus referring to our pagetable. */
1166 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1166 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1167 for_each_online_cpu(cpu) { 1167 for_each_online_cpu(cpu) {
1168 if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) 1168 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1169 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1169 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1170 continue; 1170 continue;
1171 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 1171 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1172 } 1172 }
1173 return; 1173 return;
1174 } 1174 }
1175 cpumask_copy(mask, &mm->cpu_vm_mask); 1175 cpumask_copy(mask, mm_cpumask(mm));
1176 1176
1177 /* It's possible that a vcpu may have a stale reference to our 1177 /* It's possible that a vcpu may have a stale reference to our
1178 cr3, because its in lazy mode, and it hasn't yet flushed 1178 cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
1229{ 1229{
1230} 1230}
1231 1231
1232static void xen_post_allocator_init(void);
1233
1232static __init void xen_pagetable_setup_done(pgd_t *base) 1234static __init void xen_pagetable_setup_done(pgd_t *base)
1233{ 1235{
1234 xen_setup_shared_info(); 1236 xen_setup_shared_info();
1237 xen_post_allocator_init();
1235} 1238}
1236 1239
1237static void xen_write_cr2(unsigned long cr2) 1240static void xen_write_cr2(unsigned long cr2)
@@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #endif
 }
 
-__init void xen_post_allocator_init(void)
+static __init void xen_post_allocator_init(void)
 {
         pv_mmu_ops.set_pte = xen_set_pte;
         pv_mmu_ops.set_pmd = xen_set_pmd;
@@ -1875,10 +1878,7 @@ static void xen_leave_lazy_mmu(void)
         preempt_enable();
 }
 
-const struct pv_mmu_ops xen_mmu_ops __initdata = {
-        .pagetable_setup_start = xen_pagetable_setup_start,
-        .pagetable_setup_done = xen_pagetable_setup_done,
-
+static const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .read_cr2 = xen_read_cr2,
         .write_cr2 = xen_write_cr2,
 
@@ -1954,6 +1954,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .set_fixmap = xen_set_fixmap,
 };
 
+void __init xen_init_mmu_ops(void)
+{
+        x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
+        x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
+        pv_mmu_ops = xen_mmu_ops;
+}
 
 #ifdef CONFIG_XEN_DEBUG_FS
 
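Taken together, the mmu.c and mmu.h changes make xen_mmu_ops a private, static table and export only xen_init_mmu_ops(), which installs the ops and registers the pagetable-setup callbacks with the boot-time hook table. Here is a condensed sketch of that encapsulation pattern under invented names; the real kernel structures are richer than this.

#include <stdio.h>

struct mmu_ops {
        void (*set_pte)(unsigned long pte);
};

struct paging_init_hooks {
        void (*pagetable_setup_start)(void);
        void (*pagetable_setup_done)(void);
};

/* The "live" tables the rest of the (pretend) kernel calls through. */
static struct mmu_ops pv_mmu_ops;
static struct paging_init_hooks paging_hooks;

/* Private implementation details, visible only in this file. */
static void guest_set_pte(unsigned long pte) { printf("set_pte %#lx\n", pte); }
static void guest_pt_setup_start(void)       { puts("pt setup start"); }
static void guest_pt_setup_done(void)        { puts("pt setup done"); }

static const struct mmu_ops guest_mmu_ops = {
        .set_pte = guest_set_pte,
};

/* The single exported entry point, in the spirit of xen_init_mmu_ops(). */
void guest_init_mmu_ops(void)
{
        paging_hooks.pagetable_setup_start = guest_pt_setup_start;
        paging_hooks.pagetable_setup_done  = guest_pt_setup_done;
        pv_mmu_ops = guest_mmu_ops;
}

int main(void)
{
        guest_init_mmu_ops();
        paging_hooks.pagetable_setup_start();
        pv_mmu_ops.set_pte(0x1007);
        paging_hooks.pagetable_setup_done();
        return 0;
}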
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index da7302624897..5fe6bc7f5ecf 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 
 unsigned long xen_read_cr2_direct(void);
 
-extern const struct pv_mmu_ops xen_mmu_ops;
+extern void xen_init_mmu_ops(void);
 #endif  /* _XEN_MMU_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 429834ec1687..fe03eeed7b48 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -236,6 +236,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
         ctxt->user_regs.ss = __KERNEL_DS;
 #ifdef CONFIG_X86_32
         ctxt->user_regs.fs = __KERNEL_PERCPU;
+        ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
 #else
         ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
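The smp.c hunk loads the new vCPU's %gs with __KERNEL_STACK_CANARY on 32-bit: with CONFIG_CC_STACKPROTECTOR the compiler fetches the stack canary through a %gs-relative address there, so the segment must be valid before the bring-up path runs any protected C code. A rough user-space illustration of the check that depends on it follows; __thread stands in for the %gs-based per-CPU location, and the sketch mimics only the prologue/epilogue control flow, not the real frame layout gcc emits.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the reference canary the kernel keeps behind %gs. */
static __thread unsigned long reference_canary = 0xdead4ead;

static void protected_func(const char *input)
{
        unsigned long frame_canary = reference_canary;  /* prologue: copy */
        char buf[16];

        strncpy(buf, input, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        printf("copied: %s\n", buf);

        if (frame_canary != reference_canary) {         /* epilogue: compare */
                fputs("stack smashing detected\n", stderr);
                abort();
        }
}

int main(void)
{
        protected_func("hello");
        return 0;
}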
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 5601506f2dd9..36a5141108df 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -187,7 +187,6 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
         struct xen_spinlock *prev;
         int irq = __get_cpu_var(lock_kicker_irq);
         int ret;
-        unsigned long flags;
         u64 start;
 
         /* If kicker interrupts not initialized yet, just spin */
@@ -199,16 +198,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
         /* announce we're spinning */
         prev = spinning_lock(xl);
 
-        flags = __raw_local_save_flags();
-        if (irq_enable) {
-                ADD_STATS(taken_slow_irqenable, 1);
-                raw_local_irq_enable();
-        }
-
         ADD_STATS(taken_slow, 1);
         ADD_STATS(taken_slow_nested, prev != NULL);
 
         do {
+                unsigned long flags;
+
                 /* clear pending */
                 xen_clear_irq_pending(irq);
 
@@ -228,6 +223,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
                         goto out;
                 }
 
+                flags = __raw_local_save_flags();
+                if (irq_enable) {
+                        ADD_STATS(taken_slow_irqenable, 1);
+                        raw_local_irq_enable();
+                }
+
                 /*
                  * Block until irq becomes pending. If we're
                  * interrupted at this point (after the trylock but
@@ -238,13 +239,15 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
                  * pending.
                  */
                 xen_poll_irq(irq);
+
+                raw_local_irq_restore(flags);
+
                 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
         } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
 
         kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
 
 out:
-        raw_local_irq_restore(flags);
         unspinning_lock(xl, prev);
         spin_time_accum_blocked(start);
 
@@ -323,8 +326,13 @@ static void xen_spin_unlock(struct raw_spinlock *lock)
         smp_wmb();              /* make sure no writes get moved after unlock */
         xl->lock = 0;           /* release lock */
 
-        /* make sure unlock happens before kick */
-        barrier();
+        /*
+         * Make sure unlock happens before checking for waiting
+         * spinners.  We need a strong barrier to enforce the
+         * write-read ordering to different memory locations, as the
+         * CPU makes no implied guarantees about their ordering.
+         */
+        mb();
 
         if (unlikely(xl->spinners))
                 xen_spin_unlock_slow(xl);
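The final spinlock.c hunk upgrades a compiler-only barrier() to a full mb() because the unlock path is a store to xl->lock followed by a load of xl->spinners, and the CPU may let that load pass the store unless a full memory barrier sits between them. A user-space sketch of the same ordering requirement, using C11 atomics as a portable stand-in for the kernel primitives; the structure and kick_waiters() helper are simplified inventions.

#include <stdatomic.h>

struct xen_spinlock_sketch {
        atomic_uchar lock;      /* 0 = free, 1 = held */
        atomic_uchar spinners;  /* count of CPUs parked in the slow path */
};

/* Stand-in for kicking blocked waiters via their event channel. */
static void kick_waiters(struct xen_spinlock_sketch *xl) { (void)xl; }

static void sketch_spin_unlock(struct xen_spinlock_sketch *xl)
{
        /* Drop the lock... */
        atomic_store_explicit(&xl->lock, 0, memory_order_relaxed);

        /*
         * ...then order that store before the load below.  This
         * store->load ordering is what a compiler barrier cannot
         * provide and a full fence (mb() in the kernel) can.
         */
        atomic_thread_fence(memory_order_seq_cst);

        if (atomic_load_explicit(&xl->spinners, memory_order_relaxed))
                kick_waiters(xl);
}

int main(void)
{
        struct xen_spinlock_sketch xl = { .lock = 1, .spinners = 0 };
        sketch_spin_unlock(&xl);
        return 0;
}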
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 22494fd4c9b5..355fa6b99c9c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
 
-void xen_post_allocator_init(void);
-
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);