author     H. Peter Anvin <hpa@zytor.com>	2010-04-29 19:53:17 -0400
committer  H. Peter Anvin <hpa@zytor.com>	2010-04-29 19:53:17 -0400
commit     d9c5841e22231e4e49fd0a1004164e6fce59b7a6 (patch)
tree       e1f589c46b3ff79bbe7b1b2469f6362f94576da6 /arch/x86
parent     b701a47ba48b698976fb2fe05fb285b0edc1d26a (diff)
parent     5967ed87ade85a421ef814296c3c7f182b08c225 (diff)
Merge branch 'x86/asm' into x86/atomic
Merge reason: Conflict between LOCK_PREFIX_HERE and relative alternatives
pointers

Resolved Conflicts:
	arch/x86/include/asm/alternative.h
	arch/x86/kernel/alternative.c

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
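The conflict being resolved is mechanical: the two branches touched the same macros, with one side converting the .smp_locks section from absolute, pointer-sized addresses to 32-bit self-relative offsets and the other reworking LOCK_PREFIX around LOCK_PREFIX_HERE. For orientation only (this block is illustrative, not part of the patch), here is the before/after shape of the C-side macro from arch/x86/include/asm/alternative.h:

#if 0	/* before: absolute address, pointer-sized (8 bytes on x86-64) */
#define LOCK_PREFIX_HERE			\
	".section .smp_locks,\"a\"\n"		\
	_ASM_ALIGN "\n"				\
	_ASM_PTR "671f\n"	/* address */	\
	".previous\n"				\
	"671:"
#else	/* after: 32-bit offset relative to the entry itself, so each record
	 * shrinks on 64-bit and needs no runtime relocation */
#define LOCK_PREFIX_HERE			\
	".section .smp_locks,\"a\"\n"		\
	".balign 4\n"				\
	".long 671f - .\n"	/* offset */	\
	".previous\n"				\
	"671:"
#endif

The assembler variant in alternative-asm.h (see the hunk below) gets the same treatment.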
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig55
-rw-r--r--arch/x86/Kconfig.cpu4
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile6
-rw-r--r--arch/x86/boot/compressed/misc.c19
-rw-r--r--arch/x86/boot/mkcpustr.c2
-rw-r--r--arch/x86/boot/video-vga.c9
-rw-r--r--arch/x86/boot/video.c7
-rw-r--r--arch/x86/crypto/fpu.c1
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S10
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S20
-rw-r--r--arch/x86/ia32/ia32_aout.c15
-rw-r--r--arch/x86/ia32/ia32entry.S8
-rw-r--r--arch/x86/ia32/sys_ia32.c77
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/alternative.h13
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h1
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h3
-rw-r--r--arch/x86/include/asm/apb_timer.h70
-rw-r--r--arch/x86/include/asm/compat.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h127
-rw-r--r--arch/x86/include/asm/cpufeature.h4
-rw-r--r--arch/x86/include/asm/debugreg.h3
-rw-r--r--arch/x86/include/asm/e820.h5
-rw-r--r--arch/x86/include/asm/elf.h15
-rw-r--r--arch/x86/include/asm/fb.h4
-rw-r--r--arch/x86/include/asm/fixmap.h6
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/highmem.h4
-rw-r--r--arch/x86/include/asm/hpet.h1
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h8
-rw-r--r--arch/x86/include/asm/hyperv.h186
-rw-r--r--arch/x86/include/asm/i387.h12
-rw-r--r--arch/x86/include/asm/i8259.h21
-rw-r--r--arch/x86/include/asm/io.h155
-rw-r--r--arch/x86/include/asm/io_32.h196
-rw-r--r--arch/x86/include/asm/io_64.h181
-rw-r--r--arch/x86/include/asm/io_apic.h8
-rw-r--r--arch/x86/include/asm/irq_vectors.h48
-rw-r--r--arch/x86/include/asm/kprobes.h31
-rw-r--r--arch/x86/include/asm/kvm.h4
-rw-r--r--arch/x86/include/asm/kvm_emulate.h17
-rw-r--r--arch/x86/include/asm/kvm_host.h60
-rw-r--r--arch/x86/include/asm/kvm_para.h1
-rw-r--r--arch/x86/include/asm/lguest_hcall.h29
-rw-r--r--arch/x86/include/asm/local.h37
-rw-r--r--arch/x86/include/asm/mce.h3
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mmzone_64.h6
-rw-r--r--arch/x86/include/asm/mrst.h19
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/numa_64.h5
-rw-r--r--arch/x86/include/asm/numaq.h5
-rw-r--r--arch/x86/include/asm/olpc.h20
-rw-r--r--arch/x86/include/asm/page_types.h1
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/paravirt_types.h4
-rw-r--r--arch/x86/include/asm/pci.h39
-rw-r--r--arch/x86/include/asm/pci_64.h2
-rw-r--r--arch/x86/include/asm/pci_x86.h23
-rw-r--r--arch/x86/include/asm/percpu.h143
-rw-r--r--arch/x86/include/asm/perf_event.h31
-rw-r--r--arch/x86/include/asm/pgalloc.h5
-rw-r--r--arch/x86/include/asm/pgtable_32.h7
-rw-r--r--arch/x86/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/proto.h10
-rw-r--r--arch/x86/include/asm/ptrace.h11
-rw-r--r--arch/x86/include/asm/rwsem.h59
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/smp.h9
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/svm.h2
-rw-r--r--arch/x86/include/asm/sys_ia32.h11
-rw-r--r--arch/x86/include/asm/syscall.h2
-rw-r--r--arch/x86/include/asm/syscalls.h15
-rw-r--r--arch/x86/include/asm/system.h12
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/uaccess_32.h5
-rw-r--r--arch/x86/include/asm/uaccess_64.h5
-rw-r--r--arch/x86/include/asm/unistd_32.h4
-rw-r--r--arch/x86/include/asm/unistd_64.h3
-rw-r--r--arch/x86/include/asm/user.h58
-rw-r--r--arch/x86/include/asm/uv/bios.h11
-rw-r--r--arch/x86/include/asm/uv/uv.h1
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h101
-rw-r--r--arch/x86/include/asm/visws/cobalt.h2
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/include/asm/x86_init.h17
-rw-r--r--arch/x86/include/asm/xsave.h2
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c170
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/alternative.c122
-rw-r--r--arch/x86/kernel/amd_iommu.c45
-rw-r--r--arch/x86/kernel/amd_iommu_init.c53
-rw-r--r--arch/x86/kernel/apb_timer.c785
-rw-r--r--arch/x86/kernel/aperture_64.c15
-rw-r--r--arch/x86/kernel/apic/apic.c30
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c7
-rw-r--r--arch/x86/kernel/apic/es7000_32.c1
-rw-r--r--arch/x86/kernel/apic/io_apic.c358
-rw-r--r--arch/x86/kernel/apic/nmi.c15
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/probe_32.c29
-rw-r--r--arch/x86/kernel/apic/probe_64.c13
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c132
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/bios_uv.c39
-rw-r--r--arch/x86/kernel/bootflag.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c4
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c688
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c621
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c252
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c21
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c5
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c208
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c11
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h6
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event.c1968
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c422
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c980
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c159
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c13
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/cpuid.c3
-rw-r--r--arch/x86/kernel/crash.c6
-rw-r--r--arch/x86/kernel/crash_dump_32.c1
-rw-r--r--arch/x86/kernel/dumpstack.c14
-rw-r--r--arch/x86/kernel/dumpstack.h24
-rw-r--r--arch/x86/kernel/dumpstack_32.c5
-rw-r--r--arch/x86/kernel/dumpstack_64.c20
-rw-r--r--arch/x86/kernel/e820.c379
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/ftrace.c36
-rw-r--r--arch/x86/kernel/head32.c14
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/hpet.c20
-rw-r--r--arch/x86/kernel/hw_breakpoint.c38
-rw-r--r--arch/x86/kernel/i387.c72
-rw-r--r--arch/x86/kernel/i8259.c95
-rw-r--r--arch/x86/kernel/irqinit.c59
-rw-r--r--arch/x86/kernel/k8.c16
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c222
-rw-r--r--arch/x86/kernel/kprobes.c614
-rw-r--r--arch/x86/kernel/ldt.c1
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/mca_32.c1
-rw-r--r--arch/x86/kernel/microcode_amd.c44
-rw-r--r--arch/x86/kernel/microcode_core.c6
-rw-r--r--arch/x86/kernel/microcode_intel.c2
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c7
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/mpparse.c11
-rw-r--r--arch/x86/kernel/mrst.c216
-rw-r--r--arch/x86/kernel/msr.c3
-rw-r--r--arch/x86/kernel/olpc.c10
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c16
-rw-r--r--arch/x86/kernel/pci-gart_64.c6
-rw-r--r--arch/x86/kernel/pci-nommu.c1
-rw-r--r--arch/x86/kernel/process.c52
-rw-r--r--arch/x86/kernel/process_32.c14
-rw-r--r--arch/x86/kernel/process_64.c36
-rw-r--r--arch/x86/kernel/ptrace.c68
-rw-r--r--arch/x86/kernel/quirks.c13
-rw-r--r--arch/x86/kernel/reboot.c17
-rw-r--r--arch/x86/kernel/setup.c70
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/smp.c1
-rw-r--r--arch/x86/kernel/smpboot.c28
-rw-r--r--arch/x86/kernel/sys_i386_32.c185
-rw-r--r--arch/x86/kernel/sys_x86_64.c12
-rw-r--r--arch/x86/kernel/syscall_table_32.S4
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/tlb_uv.c1
-rw-r--r--arch/x86/kernel/traps.c3
-rw-r--r--arch/x86/kernel/tsc.c6
-rw-r--r--arch/x86/kernel/uv_irq.c1
-rw-r--r--arch/x86/kernel/uv_sysfs.c6
-rw-r--r--arch/x86/kernel/uv_time.c14
-rw-r--r--arch/x86/kernel/visws_quirks.c27
-rw-r--r--arch/x86/kernel/vmi_32.c36
-rw-r--r--arch/x86/kernel/vmiclock_32.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S6
-rw-r--r--arch/x86/kernel/vsyscall_64.c3
-rw-r--r--arch/x86/kernel/x86_init.c11
-rw-r--r--arch/x86/kernel/xsave.c1
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/emulate.c440
-rw-r--r--arch/x86/kvm/i8254.c27
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c47
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h31
-rw-r--r--arch/x86/kvm/lapic.c44
-rw-r--r--arch/x86/kvm/lapic.h8
-rw-r--r--arch/x86/kvm/mmu.c149
-rw-r--r--arch/x86/kvm/mmu.h35
-rw-r--r--arch/x86/kvm/paging_tmpl.h35
-rw-r--r--arch/x86/kvm/svm.c263
-rw-r--r--arch/x86/kvm/trace.h59
-rw-r--r--arch/x86/kvm/vmx.c421
-rw-r--r--arch/x86/kvm/x86.c1166
-rw-r--r--arch/x86/kvm/x86.h30
-rw-r--r--arch/x86/lguest/boot.c61
-rw-r--r--arch/x86/lguest/i386_head.S2
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/cache-smp.c19
-rw-r--r--arch/x86/lib/io_64.c25
-rw-r--r--arch/x86/lib/rwsem_64.S81
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/hugetlbpage.c1
-rw-r--r--arch/x86/mm/init.c40
-rw-r--r--arch/x86/mm/init_32.c20
-rw-r--r--arch/x86/mm/init_64.c29
-rw-r--r--arch/x86/mm/ioremap.c37
-rw-r--r--arch/x86/mm/kmemcheck/error.c19
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c16
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h2
-rw-r--r--arch/x86/mm/kmmio.c8
-rw-r--r--arch/x86/mm/mmap.c4
-rw-r--r--arch/x86/mm/mmio-mod.c1
-rw-r--r--arch/x86/mm/numa_32.c3
-rw-r--r--arch/x86/mm/numa_64.c332
-rw-r--r--arch/x86/mm/pageattr.c27
-rw-r--r--arch/x86/mm/pat.c2
-rw-r--r--arch/x86/mm/pgtable.c32
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/mm/srat_64.c4
-rw-r--r--arch/x86/mm/tlb.c8
-rw-r--r--arch/x86/oprofile/nmi_int.c20
-rw-r--r--arch/x86/oprofile/op_model_amd.c261
-rw-r--r--arch/x86/oprofile/op_model_p4.c6
-rw-r--r--arch/x86/oprofile/op_model_ppro.c21
-rw-r--r--arch/x86/oprofile/op_x86_model.h20
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/acpi.c156
-rw-r--r--arch/x86/pci/amd_bus.c127
-rw-r--r--arch/x86/pci/bus_numa.c28
-rw-r--r--arch/x86/pci/bus_numa.h12
-rw-r--r--arch/x86/pci/common.c10
-rw-r--r--arch/x86/pci/i386.c23
-rw-r--r--arch/x86/pci/init.c8
-rw-r--r--arch/x86/pci/intel_bus.c90
-rw-r--r--arch/x86/pci/irq.c19
-rw-r--r--arch/x86/pci/legacy.c24
-rw-r--r--arch/x86/pci/mmconfig-shared.c18
-rw-r--r--arch/x86/pci/mrst.c262
-rw-r--r--arch/x86/pci/numaq_32.c12
-rw-r--r--arch/x86/pci/olpc.c3
-rw-r--r--arch/x86/pci/pcbios.c1
-rw-r--r--arch/x86/pci/visws.c6
-rw-r--r--arch/x86/power/hibernate_32.c1
-rw-r--r--arch/x86/power/hibernate_64.c1
-rw-r--r--arch/x86/power/hibernate_asm_32.S15
-rw-r--r--arch/x86/tools/chkobjdump.awk16
-rw-r--r--arch/x86/tools/test_get_len.c4
-rw-r--r--arch/x86/vdso/vma.c1
-rw-r--r--arch/x86/xen/debugfs.c1
-rw-r--r--arch/x86/xen/enlighten.c12
-rw-r--r--arch/x86/xen/mmu.c22
-rw-r--r--arch/x86/xen/smp.c3
-rw-r--r--arch/x86/xen/spinlock.c1
-rw-r--r--arch/x86/xen/time.c1
-rw-r--r--arch/x86/xen/xen-asm_32.S4
298 files changed, 10290 insertions, 6602 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55298e891571..9458685902bd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
31 select ARCH_WANT_FRAME_POINTERS 31 select ARCH_WANT_FRAME_POINTERS
32 select HAVE_DMA_ATTRS 32 select HAVE_DMA_ATTRS
33 select HAVE_KRETPROBES 33 select HAVE_KRETPROBES
34 select HAVE_OPTPROBES
34 select HAVE_FTRACE_MCOUNT_RECORD 35 select HAVE_FTRACE_MCOUNT_RECORD
35 select HAVE_DYNAMIC_FTRACE 36 select HAVE_DYNAMIC_FTRACE
36 select HAVE_FUNCTION_TRACER 37 select HAVE_FUNCTION_TRACER
@@ -45,10 +46,12 @@ config X86
45 select HAVE_GENERIC_DMA_COHERENT if X86_32 46 select HAVE_GENERIC_DMA_COHERENT if X86_32
46 select HAVE_EFFICIENT_UNALIGNED_ACCESS 47 select HAVE_EFFICIENT_UNALIGNED_ACCESS
47 select USER_STACKTRACE_SUPPORT 48 select USER_STACKTRACE_SUPPORT
49 select HAVE_REGS_AND_STACK_ACCESS_API
48 select HAVE_DMA_API_DEBUG 50 select HAVE_DMA_API_DEBUG
49 select HAVE_KERNEL_GZIP 51 select HAVE_KERNEL_GZIP
50 select HAVE_KERNEL_BZIP2 52 select HAVE_KERNEL_BZIP2
51 select HAVE_KERNEL_LZMA 53 select HAVE_KERNEL_LZMA
54 select HAVE_KERNEL_LZO
52 select HAVE_HW_BREAKPOINT 55 select HAVE_HW_BREAKPOINT
53 select PERF_EVENTS 56 select PERF_EVENTS
54 select ANON_INODES 57 select ANON_INODES
@@ -99,6 +102,9 @@ config ZONE_DMA
99config SBUS 102config SBUS
100 bool 103 bool
101 104
105config NEED_DMA_MAP_STATE
106 def_bool (X86_64 || DMAR || DMA_API_DEBUG)
107
102config GENERIC_ISA_DMA 108config GENERIC_ISA_DMA
103 def_bool y 109 def_bool y
104 110
@@ -182,6 +188,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
182config ARCH_SUPPORTS_DEBUG_PAGEALLOC 188config ARCH_SUPPORTS_DEBUG_PAGEALLOC
183 def_bool y 189 def_bool y
184 190
191config HAVE_EARLY_RES
192 def_bool y
193
185config HAVE_INTEL_TXT 194config HAVE_INTEL_TXT
186 def_bool y 195 def_bool y
187 depends on EXPERIMENTAL && DMAR && ACPI 196 depends on EXPERIMENTAL && DMAR && ACPI
@@ -387,8 +396,12 @@ config X86_ELAN
387 396
388config X86_MRST 397config X86_MRST
389 bool "Moorestown MID platform" 398 bool "Moorestown MID platform"
399 depends on PCI
400 depends on PCI_GOANY
390 depends on X86_32 401 depends on X86_32
391 depends on X86_EXTENDED_PLATFORM 402 depends on X86_EXTENDED_PLATFORM
403 depends on X86_IO_APIC
404 select APB_TIMER
392 ---help--- 405 ---help---
393 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin 406 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
394 Internet Device(MID) platform. Moorestown consists of two chips: 407 Internet Device(MID) platform. Moorestown consists of two chips:
@@ -423,6 +436,7 @@ config X86_32_NON_STANDARD
423config X86_NUMAQ 436config X86_NUMAQ
424 bool "NUMAQ (IBM/Sequent)" 437 bool "NUMAQ (IBM/Sequent)"
425 depends on X86_32_NON_STANDARD 438 depends on X86_32_NON_STANDARD
439 depends on PCI
426 select NUMA 440 select NUMA
427 select X86_MPPARSE 441 select X86_MPPARSE
428 ---help--- 442 ---help---
@@ -567,6 +581,18 @@ config PARAVIRT_DEBUG
567 Enable to debug paravirt_ops internals. Specifically, BUG if 581 Enable to debug paravirt_ops internals. Specifically, BUG if
568 a paravirt_op is missing when it is called. 582 a paravirt_op is missing when it is called.
569 583
584config NO_BOOTMEM
585 default y
586 bool "Disable Bootmem code"
587 ---help---
588 Use early_res directly instead of bootmem before slab is ready.
589 - allocator (buddy) [generic]
590 - early allocator (bootmem) [generic]
591 - very early allocator (reserve_early*()) [x86]
592 - very very early allocator (early brk model) [x86]
593 So reduce one layer between early allocator to final allocator
594
595
570config MEMTEST 596config MEMTEST
571 bool "Memtest" 597 bool "Memtest"
572 ---help--- 598 ---help---
@@ -611,6 +637,16 @@ config HPET_EMULATE_RTC
611 def_bool y 637 def_bool y
612 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) 638 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
613 639
640config APB_TIMER
641 def_bool y if MRST
642 prompt "Langwell APB Timer Support" if X86_MRST
643 help
644 APB timer is the replacement for 8254, HPET on X86 MID platforms.
645 The APBT provides a stable time base on SMP
646 systems, unlike the TSC, but it is more expensive to access,
647 as it is off-chip. APB timers are always running regardless of CPU
648 C states, they are used as per CPU clockevent device when possible.
649
614# Mark as embedded because too many people got it wrong. 650# Mark as embedded because too many people got it wrong.
615# The code disables itself when not needed. 651# The code disables itself when not needed.
616config DMI 652config DMI
@@ -626,7 +662,7 @@ config GART_IOMMU
626 bool "GART IOMMU support" if EMBEDDED 662 bool "GART IOMMU support" if EMBEDDED
627 default y 663 default y
628 select SWIOTLB 664 select SWIOTLB
629 depends on X86_64 && PCI 665 depends on X86_64 && PCI && K8_NB
630 ---help--- 666 ---help---
631 Support for full DMA access of devices with 32bit memory access only 667 Support for full DMA access of devices with 32bit memory access only
632 on systems with more than 3GB. This is usually needed for USB, 668 on systems with more than 3GB. This is usually needed for USB,
@@ -988,12 +1024,6 @@ config X86_CPUID
988 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to 1024 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
989 /dev/cpu/31/cpuid. 1025 /dev/cpu/31/cpuid.
990 1026
991config X86_CPU_DEBUG
992 tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
993 ---help---
994 If you select this option, this will provide various x86 CPUs
995 information through debugfs.
996
997choice 1027choice
998 prompt "High Memory Support" 1028 prompt "High Memory Support"
999 default HIGHMEM4G if !X86_NUMAQ 1029 default HIGHMEM4G if !X86_NUMAQ
@@ -1186,8 +1216,8 @@ config NUMA_EMU
1186 1216
1187config NODES_SHIFT 1217config NODES_SHIFT
1188 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP 1218 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
1189 range 1 9 1219 range 1 10
1190 default "9" if MAXSMP 1220 default "10" if MAXSMP
1191 default "6" if X86_64 1221 default "6" if X86_64
1192 default "4" if X86_NUMAQ 1222 default "4" if X86_NUMAQ
1193 default "3" 1223 default "3"
@@ -1246,6 +1276,11 @@ config ARCH_MEMORY_PROBE
1246 def_bool X86_64 1276 def_bool X86_64
1247 depends on MEMORY_HOTPLUG 1277 depends on MEMORY_HOTPLUG
1248 1278
1279config ILLEGAL_POINTER_VALUE
1280 hex
1281 default 0 if X86_32
1282 default 0xdead000000000000 if X86_64
1283
1249source "mm/Kconfig" 1284source "mm/Kconfig"
1250 1285
1251config HIGHPTE 1286config HIGHPTE
@@ -2026,7 +2061,7 @@ endif # X86_32
2026 2061
2027config K8_NB 2062config K8_NB
2028 def_bool y 2063 def_bool y
2029 depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) 2064 depends on CPU_SUP_AMD && PCI
2030 2065
2031source "drivers/pcmcia/Kconfig" 2066source "drivers/pcmcia/Kconfig"
2032 2067
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 08e442bc3ab9..a19829374e6a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT
319 319
320config X86_XADD 320config X86_XADD
321 def_bool y 321 def_bool y
322 depends on X86_32 && !M386 322 depends on X86_64 || !M386
323 323
324config X86_PPRO_FENCE 324config X86_PPRO_FENCE
325 bool "PentiumPro memory ordering errata workaround" 325 bool "PentiumPro memory ordering errata workaround"
@@ -396,7 +396,7 @@ config X86_TSC
396 396
397config X86_CMPXCHG64 397config X86_CMPXCHG64
398 def_bool y 398 def_bool y
399 depends on !M386 && !M486 399 depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
400 400
401# this should be set for all -march=.. options where the compiler 401# this should be set for all -march=.. options where the compiler
402# generates cmov. 402# generates cmov.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 78b32be55e9e..0a43dc515e4c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -135,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
135# suspend and hibernation support 135# suspend and hibernation support
136drivers-$(CONFIG_PM) += arch/x86/power/ 136drivers-$(CONFIG_PM) += arch/x86/power/
137 137
138ifeq ($(CONFIG_X86_32),y)
139drivers-$(CONFIG_FB) += arch/x86/video/ 138drivers-$(CONFIG_FB) += arch/x86/video/
140endif
141 139
142#### 140####
143# boot loader support. Several targets are kept for legacy purposes 141# boot loader support. Several targets are kept for legacy purposes
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f8ed0658404c..fbb47daf2459 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,11 +4,12 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
11KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 11KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
12cflags-$(CONFIG_X86_32) := -march=i386
12cflags-$(CONFIG_X86_64) := -mcmodel=small 13cflags-$(CONFIG_X86_64) := -mcmodel=small
13KBUILD_CFLAGS += $(cflags-y) 14KBUILD_CFLAGS += $(cflags-y)
14KBUILD_CFLAGS += $(call cc-option,-ffreestanding) 15KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
@@ -48,10 +49,13 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
48 $(call if_changed,bzip2) 49 $(call if_changed,bzip2)
49$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE 50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
50 $(call if_changed,lzma) 51 $(call if_changed,lzma)
52$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,lzo)
51 54
52suffix-$(CONFIG_KERNEL_GZIP) := gz 55suffix-$(CONFIG_KERNEL_GZIP) := gz
53suffix-$(CONFIG_KERNEL_BZIP2) := bz2 56suffix-$(CONFIG_KERNEL_BZIP2) := bz2
54suffix-$(CONFIG_KERNEL_LZMA) := lzma 57suffix-$(CONFIG_KERNEL_LZMA) := lzma
58suffix-$(CONFIG_KERNEL_LZO) := lzo
55 59
56quiet_cmd_mkpiggy = MKPIGGY $@ 60quiet_cmd_mkpiggy = MKPIGGY $@
57 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) 61 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 842b2a36174a..51e240779a44 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -19,11 +19,6 @@
19#define _ASM_X86_DESC_H 1 19#define _ASM_X86_DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64
23#define _LINUX_STRING_H_ 1
24#define __LINUX_BITMAP_H 1
25#endif
26
27#include <linux/linkage.h> 22#include <linux/linkage.h>
28#include <linux/screen_info.h> 23#include <linux/screen_info.h>
29#include <linux/elf.h> 24#include <linux/elf.h>
@@ -131,8 +126,8 @@ static void error(char *m);
131static struct boot_params *real_mode; /* Pointer to real-mode data */ 126static struct boot_params *real_mode; /* Pointer to real-mode data */
132static int quiet; 127static int quiet;
133 128
134static void *memset(void *s, int c, unsigned n); 129void *memset(void *s, int c, size_t n);
135void *memcpy(void *dest, const void *src, unsigned n); 130void *memcpy(void *dest, const void *src, size_t n);
136 131
137static void __putstr(int, const char *); 132static void __putstr(int, const char *);
138#define putstr(__x) __putstr(0, __x) 133#define putstr(__x) __putstr(0, __x)
@@ -162,6 +157,10 @@ static int lines, cols;
162#include "../../../../lib/decompress_unlzma.c" 157#include "../../../../lib/decompress_unlzma.c"
163#endif 158#endif
164 159
160#ifdef CONFIG_KERNEL_LZO
161#include "../../../../lib/decompress_unlzo.c"
162#endif
163
165static void scroll(void) 164static void scroll(void)
166{ 165{
167 int i; 166 int i;
@@ -181,11 +180,9 @@ static void __putstr(int error, const char *s)
181 return; 180 return;
182#endif 181#endif
183 182
184#ifdef CONFIG_X86_32
185 if (real_mode->screen_info.orig_video_mode == 0 && 183 if (real_mode->screen_info.orig_video_mode == 0 &&
186 lines == 0 && cols == 0) 184 lines == 0 && cols == 0)
187 return; 185 return;
188#endif
189 186
190 x = real_mode->screen_info.orig_x; 187 x = real_mode->screen_info.orig_x;
191 y = real_mode->screen_info.orig_y; 188 y = real_mode->screen_info.orig_y;
@@ -219,7 +216,7 @@ static void __putstr(int error, const char *s)
219 outb(0xff & (pos >> 1), vidport+1); 216 outb(0xff & (pos >> 1), vidport+1);
220} 217}
221 218
222static void *memset(void *s, int c, unsigned n) 219void *memset(void *s, int c, size_t n)
223{ 220{
224 int i; 221 int i;
225 char *ss = s; 222 char *ss = s;
@@ -229,7 +226,7 @@ static void *memset(void *s, int c, unsigned n)
229 return s; 226 return s;
230} 227}
231 228
232void *memcpy(void *dest, const void *src, unsigned n) 229void *memcpy(void *dest, const void *src, size_t n)
233{ 230{
234 int i; 231 int i;
235 const char *s = src; 232 const char *s = src;
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 8ef60f20b371..919257f526f2 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -22,7 +22,7 @@ int main(void)
22 int i, j; 22 int i, j;
23 const char *str; 23 const char *str;
24 24
25 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] =\n");
26 26
27 for (i = 0; i < NCAPINTS; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
28 for (j = 0; j < 32; j++) { 28 for (j = 0; j < 32; j++) {
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 819caa1f2008..ed7aeff786b2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void)
42{ 42{
43 struct biosregs ireg, oreg; 43 struct biosregs ireg, oreg;
44 u16 ax; 44 u16 ax;
45 u8 rows;
46 u8 mode; 45 u8 mode;
47 46
48 initregs(&ireg); 47 initregs(&ireg);
49 48
49 /* Query current mode */
50 ax = 0x0f00; 50 ax = 0x0f00;
51 intcall(0x10, &ireg, &oreg); 51 intcall(0x10, &ireg, &oreg);
52 mode = oreg.al; 52 mode = oreg.al;
53 53
54 set_fs(0);
55 rows = rdfs8(0x484); /* rows minus one */
56
57 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
58 (rows == 0 || rows == 24))
59 return mode;
60
61 if (mode != 3 && mode != 7) 54 if (mode != 3 && mode != 7)
62 mode = 3; 55 mode = 3;
63 56
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index f767164cd5df..43eda284d27f 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -298,11 +298,18 @@ static void restore_screen(void)
298 } 298 }
299 299
300 /* Restore cursor position */ 300 /* Restore cursor position */
301 if (saved.curx >= xs)
302 saved.curx = xs-1;
303 if (saved.cury >= ys)
304 saved.cury = ys-1;
305
301 initregs(&ireg); 306 initregs(&ireg);
302 ireg.ah = 0x02; /* Set cursor position */ 307 ireg.ah = 0x02; /* Set cursor position */
303 ireg.dh = saved.cury; 308 ireg.dh = saved.cury;
304 ireg.dl = saved.curx; 309 ireg.dl = saved.curx;
305 intcall(0x10, &ireg, NULL); 310 intcall(0x10, &ireg, NULL);
311
312 store_cursor_position();
306} 313}
307 314
308void set_video(void) 315void set_video(void)
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index daef6cd2b45d..1a8f8649c035 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/slab.h>
19#include <asm/i387.h> 20#include <asm/i387.h>
20 21
21struct crypto_fpu_ctx { 22struct crypto_fpu_ctx {
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 39b98ed2c1b9..575331cb2a8a 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -22,7 +22,7 @@
22 22
23#include <asm/asm-offsets.h> 23#include <asm/asm-offsets.h>
24 24
25/* return adress at 0 */ 25/* return address at 0 */
26 26
27#define in_blk 12 /* input byte array address parameter*/ 27#define in_blk 12 /* input byte array address parameter*/
28#define out_blk 8 /* output byte array address parameter*/ 28#define out_blk 8 /* output byte array address parameter*/
@@ -230,8 +230,8 @@ twofish_enc_blk:
230 push %edi 230 push %edi
231 231
232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ 232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
233 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ 233 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
234 mov in_blk+16(%esp),%edi /* input adress in edi */ 234 mov in_blk+16(%esp),%edi /* input address in edi */
235 235
236 mov (%edi), %eax 236 mov (%edi), %eax
237 mov b_offset(%edi), %ebx 237 mov b_offset(%edi), %ebx
@@ -286,8 +286,8 @@ twofish_dec_blk:
286 286
287 287
288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ 288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
289 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ 289 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
290 mov in_blk+16(%esp),%edi /* input adress in edi */ 290 mov in_blk+16(%esp),%edi /* input address in edi */
291 291
292 mov (%edi), %eax 292 mov (%edi), %eax
293 mov b_offset(%edi), %ebx 293 mov b_offset(%edi), %ebx
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 35974a586615..573aa102542e 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -221,11 +221,11 @@
221twofish_enc_blk: 221twofish_enc_blk:
222 pushq R1 222 pushq R1
223 223
224 /* %rdi contains the crypto tfm adress */ 224 /* %rdi contains the crypto tfm address */
225 /* %rsi contains the output adress */ 225 /* %rsi contains the output address */
226 /* %rdx contains the input adress */ 226 /* %rdx contains the input address */
227 add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ 227 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
228 /* ctx adress is moved to free one non-rex register 228 /* ctx address is moved to free one non-rex register
229 as target for the 8bit high operations */ 229 as target for the 8bit high operations */
230 mov %rdi, %r11 230 mov %rdi, %r11
231 231
@@ -274,11 +274,11 @@ twofish_enc_blk:
274twofish_dec_blk: 274twofish_dec_blk:
275 pushq R1 275 pushq R1
276 276
277 /* %rdi contains the crypto tfm adress */ 277 /* %rdi contains the crypto tfm address */
278 /* %rsi contains the output adress */ 278 /* %rsi contains the output address */
279 /* %rdx contains the input adress */ 279 /* %rdx contains the input address */
280 add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ 280 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
281 /* ctx adress is moved to free one non-rex register 281 /* ctx address is moved to free one non-rex register
282 as target for the 8bit high operations */ 282 as target for the 8bit high operations */
283 mov %rdi, %r11 283 mov %rdi, %r11
284 284
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2a4d073d2cf1..0350311906ae 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -21,7 +21,6 @@
21#include <linux/fcntl.h> 21#include <linux/fcntl.h>
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/user.h> 23#include <linux/user.h>
24#include <linux/slab.h>
25#include <linux/binfmts.h> 24#include <linux/binfmts.h>
26#include <linux/personality.h> 25#include <linux/personality.h>
27#include <linux/init.h> 26#include <linux/init.h>
@@ -297,7 +296,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
297 * size limits imposed on them by creating programs with large 296 * size limits imposed on them by creating programs with large
298 * arrays in the data or bss. 297 * arrays in the data or bss.
299 */ 298 */
300 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 299 rlim = rlimit(RLIMIT_DATA);
301 if (rlim >= RLIM_INFINITY) 300 if (rlim >= RLIM_INFINITY)
302 rlim = ~0; 301 rlim = ~0;
303 if (ex.a_data + ex.a_bss > rlim) 302 if (ex.a_data + ex.a_bss > rlim)
@@ -308,14 +307,15 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
308 if (retval) 307 if (retval)
309 return retval; 308 return retval;
310 309
311 regs->cs = __USER32_CS;
312 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
313 regs->r13 = regs->r14 = regs->r15 = 0;
314
315 /* OK, This is the point of no return */ 310 /* OK, This is the point of no return */
316 set_personality(PER_LINUX); 311 set_personality(PER_LINUX);
317 set_thread_flag(TIF_IA32); 312 set_thread_flag(TIF_IA32);
318 clear_thread_flag(TIF_ABI_PENDING); 313
314 setup_new_exec(bprm);
315
316 regs->cs = __USER32_CS;
317 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
318 regs->r13 = regs->r14 = regs->r15 = 0;
319 319
320 current->mm->end_code = ex.a_text + 320 current->mm->end_code = ex.a_text +
321 (current->mm->start_code = N_TXTADDR(ex)); 321 (current->mm->start_code = N_TXTADDR(ex));
@@ -326,7 +326,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
326 current->mm->free_area_cache = TASK_UNMAPPED_BASE; 326 current->mm->free_area_cache = TASK_UNMAPPED_BASE;
327 current->mm->cached_hole_size = 0; 327 current->mm->cached_hole_size = 0;
328 328
329 current->mm->mmap = NULL;
330 install_exec_creds(bprm); 329 install_exec_creds(bprm);
331 current->flags &= ~PF_FORKNOEXEC; 330 current->flags &= ~PF_FORKNOEXEC;
332 331
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 53147ad85b96..e790bc1fbfa3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -563,7 +563,7 @@ ia32_sys_call_table:
563 .quad quiet_ni_syscall /* old mpx syscall holder */ 563 .quad quiet_ni_syscall /* old mpx syscall holder */
564 .quad sys_setpgid 564 .quad sys_setpgid
565 .quad quiet_ni_syscall /* old ulimit syscall holder */ 565 .quad quiet_ni_syscall /* old ulimit syscall holder */
566 .quad sys32_olduname 566 .quad sys_olduname
567 .quad sys_umask /* 60 */ 567 .quad sys_umask /* 60 */
568 .quad sys_chroot 568 .quad sys_chroot
569 .quad compat_sys_ustat 569 .quad compat_sys_ustat
@@ -586,7 +586,7 @@ ia32_sys_call_table:
586 .quad compat_sys_settimeofday 586 .quad compat_sys_settimeofday
587 .quad sys_getgroups16 /* 80 */ 587 .quad sys_getgroups16 /* 80 */
588 .quad sys_setgroups16 588 .quad sys_setgroups16
589 .quad sys32_old_select 589 .quad compat_sys_old_select
590 .quad sys_symlink 590 .quad sys_symlink
591 .quad sys_lstat 591 .quad sys_lstat
592 .quad sys_readlink /* 85 */ 592 .quad sys_readlink /* 85 */
@@ -613,7 +613,7 @@ ia32_sys_call_table:
613 .quad compat_sys_newstat 613 .quad compat_sys_newstat
614 .quad compat_sys_newlstat 614 .quad compat_sys_newlstat
615 .quad compat_sys_newfstat 615 .quad compat_sys_newfstat
616 .quad sys32_uname 616 .quad sys_uname
617 .quad stub32_iopl /* 110 */ 617 .quad stub32_iopl /* 110 */
618 .quad sys_vhangup 618 .quad sys_vhangup
619 .quad quiet_ni_syscall /* old "idle" system call */ 619 .quad quiet_ni_syscall /* old "idle" system call */
@@ -626,7 +626,7 @@ ia32_sys_call_table:
626 .quad stub32_sigreturn 626 .quad stub32_sigreturn
627 .quad stub32_clone /* 120 */ 627 .quad stub32_clone /* 120 */
628 .quad sys_setdomainname 628 .quad sys_setdomainname
629 .quad sys_uname 629 .quad sys_newuname
630 .quad sys_modify_ldt 630 .quad sys_modify_ldt
631 .quad compat_sys_adjtimex 631 .quad compat_sys_adjtimex
632 .quad sys32_mprotect /* 125 */ 632 .quad sys32_mprotect /* 125 */
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 422572c77923..626be156d88d 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -40,6 +40,7 @@
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/highuid.h> 41#include <linux/highuid.h>
42#include <linux/sysctl.h> 42#include <linux/sysctl.h>
43#include <linux/slab.h>
43#include <asm/mman.h> 44#include <asm/mman.h>
44#include <asm/types.h> 45#include <asm/types.h>
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
@@ -143,7 +144,7 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
143 * block for parameter passing.. 144 * block for parameter passing..
144 */ 145 */
145 146
146struct mmap_arg_struct { 147struct mmap_arg_struct32 {
147 unsigned int addr; 148 unsigned int addr;
148 unsigned int len; 149 unsigned int len;
149 unsigned int prot; 150 unsigned int prot;
@@ -152,9 +153,9 @@ struct mmap_arg_struct {
152 unsigned int offset; 153 unsigned int offset;
153}; 154};
154 155
155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) 156asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg)
156{ 157{
157 struct mmap_arg_struct a; 158 struct mmap_arg_struct32 a;
158 159
159 if (copy_from_user(&a, arg, sizeof(a))) 160 if (copy_from_user(&a, arg, sizeof(a)))
160 return -EFAULT; 161 return -EFAULT;
@@ -332,24 +333,6 @@ asmlinkage long sys32_alarm(unsigned int seconds)
332 return alarm_setitimer(seconds); 333 return alarm_setitimer(seconds);
333} 334}
334 335
335struct sel_arg_struct {
336 unsigned int n;
337 unsigned int inp;
338 unsigned int outp;
339 unsigned int exp;
340 unsigned int tvp;
341};
342
343asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
344{
345 struct sel_arg_struct a;
346
347 if (copy_from_user(&a, arg, sizeof(a)))
348 return -EFAULT;
349 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
350 compat_ptr(a.exp), compat_ptr(a.tvp));
351}
352
353asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, 336asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
354 int options) 337 int options)
355{ 338{
@@ -466,58 +449,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
466 return ret; 449 return ret;
467} 450}
468 451
469asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
470{
471 char *arch = "x86_64";
472 int err;
473
474 if (!name)
475 return -EFAULT;
476 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
477 return -EFAULT;
478
479 down_read(&uts_sem);
480
481 err = __copy_to_user(&name->sysname, &utsname()->sysname,
482 __OLD_UTS_LEN);
483 err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
484 err |= __copy_to_user(&name->nodename, &utsname()->nodename,
485 __OLD_UTS_LEN);
486 err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
487 err |= __copy_to_user(&name->release, &utsname()->release,
488 __OLD_UTS_LEN);
489 err |= __put_user(0, name->release+__OLD_UTS_LEN);
490 err |= __copy_to_user(&name->version, &utsname()->version,
491 __OLD_UTS_LEN);
492 err |= __put_user(0, name->version+__OLD_UTS_LEN);
493
494 if (personality(current->personality) == PER_LINUX32)
495 arch = "i686";
496
497 err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
498
499 up_read(&uts_sem);
500
501 err = err ? -EFAULT : 0;
502
503 return err;
504}
505
506long sys32_uname(struct old_utsname __user *name)
507{
508 int err;
509
510 if (!name)
511 return -EFAULT;
512 down_read(&uts_sem);
513 err = copy_to_user(name, utsname(), sizeof(*name));
514 up_read(&uts_sem);
515 if (personality(current->personality) == PER_LINUX32)
516 err |= copy_to_user(&name->machine, "i686", 5);
517
518 return err ? -EFAULT : 0;
519}
520
521asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, 452asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
522 compat_uptr_t __user *envp, struct pt_regs *regs) 453 compat_uptr_t __user *envp, struct pt_regs *regs)
523{ 454{
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 9f828f87ca35..493092efaa3b 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -11,6 +11,7 @@ header-y += sigcontext32.h
11header-y += ucontext.h 11header-y += ucontext.h
12header-y += processor-flags.h 12header-y += processor-flags.h
13header-y += hw_breakpoint.h 13header-y += hw_breakpoint.h
14header-y += hyperv.h
14 15
15unifdef-y += e820.h 16unifdef-y += e820.h
16unifdef-y += ist.h 17unifdef-y += ist.h
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index b97f786a48d5..a63a68be1cce 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -6,8 +6,8 @@
6 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
71: lock 71: lock
8 .section .smp_locks,"a" 8 .section .smp_locks,"a"
9 _ASM_ALIGN 9 .balign 4
10 _ASM_PTR 1b 10 .long 1b - .
11 .previous 11 .previous
12 .endm 12 .endm
13#else 13#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index e29a6c9bba00..92a9033c14d1 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -30,8 +30,8 @@
30#ifdef CONFIG_SMP 30#ifdef CONFIG_SMP
31#define LOCK_PREFIX_HERE \ 31#define LOCK_PREFIX_HERE \
32 ".section .smp_locks,\"a\"\n" \ 32 ".section .smp_locks,\"a\"\n" \
33 _ASM_ALIGN "\n" \ 33 ".balign 4\n" \
34 _ASM_PTR "671f\n" /* address */ \ 34 ".long 671f - .\n" /* offset */ \
35 ".previous\n" \ 35 ".previous\n" \
36 "671:" 36 "671:"
37 37
@@ -68,12 +68,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
68 void *text, void *text_end); 68 void *text, void *text_end);
69extern void alternatives_smp_module_del(struct module *mod); 69extern void alternatives_smp_module_del(struct module *mod);
70extern void alternatives_smp_switch(int smp); 70extern void alternatives_smp_switch(int smp);
71extern int alternatives_text_reserved(void *start, void *end);
71#else 72#else
72static inline void alternatives_smp_module_add(struct module *mod, char *name, 73static inline void alternatives_smp_module_add(struct module *mod, char *name,
73 void *locks, void *locks_end, 74 void *locks, void *locks_end,
74 void *text, void *text_end) {} 75 void *text, void *text_end) {}
75static inline void alternatives_smp_module_del(struct module *mod) {} 76static inline void alternatives_smp_module_del(struct module *mod) {}
76static inline void alternatives_smp_switch(int smp) {} 77static inline void alternatives_smp_switch(int smp) {}
78static inline int alternatives_text_reserved(void *start, void *end)
79{
80 return 0;
81}
77#endif /* CONFIG_SMP */ 82#endif /* CONFIG_SMP */
78 83
79/* alternative assembly primitive: */ 84/* alternative assembly primitive: */
@@ -163,10 +168,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
163 * invalid instruction possible) or if the instructions are changed from a 168 * invalid instruction possible) or if the instructions are changed from a
164 * consistent state to another consistent state atomically. 169 * consistent state to another consistent state atomically.
165 * More care must be taken when modifying code in the SMP case because of 170 * More care must be taken when modifying code in the SMP case because of
166 * Intel's errata. 171 * Intel's errata. text_poke_smp() takes care that errata, but still
172 * doesn't support NMI/MCE handler code modifying.
167 * On the local CPU you need to be protected again NMI or MCE handlers seeing an 173 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
168 * inconsistent instruction while you patch. 174 * inconsistent instruction while you patch.
169 */ 175 */
170extern void *text_poke(void *addr, const void *opcode, size_t len); 176extern void *text_poke(void *addr, const void *opcode, size_t len);
177extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
171 178
172#endif /* _ASM_X86_ALTERNATIVE_H */ 179#endif /* _ASM_X86_ALTERNATIVE_H */
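A minimal, hypothetical illustration of the new export's calling convention (only text_poke()/text_poke_smp() come from the header above; the helper name and patch bytes are made up):

/* Replace two bytes of kernel text with UD2 (0x0f 0x0b). Illustrative only;
 * real callers also respect the locking and NMI/MCE caveats described in the
 * comment patched in above. */
static const unsigned char ud2[2] = { 0x0f, 0x0b };

static void example_poke(void *addr)
{
	text_poke_smp(addr, ud2, sizeof(ud2));
}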
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index 4d817f9e6e77..d2544f1d705d 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -31,6 +31,7 @@ extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31extern int amd_iommu_init_devices(void); 31extern int amd_iommu_init_devices(void);
32extern void amd_iommu_uninit_devices(void); 32extern void amd_iommu_uninit_devices(void);
33extern void amd_iommu_init_notifier(void); 33extern void amd_iommu_init_notifier(void);
34extern void amd_iommu_init_api(void);
34#ifndef CONFIG_AMD_IOMMU_STATS 35#ifndef CONFIG_AMD_IOMMU_STATS
35 36
36static inline void amd_iommu_stats_init(void) { } 37static inline void amd_iommu_stats_init(void) { }
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ba19ad4c47d0..86a0ff0aeac7 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -21,6 +21,7 @@
21#define _ASM_X86_AMD_IOMMU_TYPES_H 21#define _ASM_X86_AMD_IOMMU_TYPES_H
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/mutex.h>
24#include <linux/list.h> 25#include <linux/list.h>
25#include <linux/spinlock.h> 26#include <linux/spinlock.h>
26 27
@@ -140,6 +141,7 @@
140 141
141/* constants to configure the command buffer */ 142/* constants to configure the command buffer */
142#define CMD_BUFFER_SIZE 8192 143#define CMD_BUFFER_SIZE 8192
144#define CMD_BUFFER_UNINITIALIZED 1
143#define CMD_BUFFER_ENTRIES 512 145#define CMD_BUFFER_ENTRIES 512
144#define MMIO_CMD_SIZE_SHIFT 56 146#define MMIO_CMD_SIZE_SHIFT 56
145#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) 147#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
@@ -237,6 +239,7 @@ struct protection_domain {
237 struct list_head list; /* for list of all protection domains */ 239 struct list_head list; /* for list of all protection domains */
238 struct list_head dev_list; /* List of all devices in this domain */ 240 struct list_head dev_list; /* List of all devices in this domain */
239 spinlock_t lock; /* mostly used to lock the page table*/ 241 spinlock_t lock; /* mostly used to lock the page table*/
242 struct mutex api_lock; /* protect page tables in the iommu-api path */
240 u16 id; /* the domain id written to the device table */ 243 u16 id; /* the domain id written to the device table */
241 int mode; /* paging mode (0-6 levels) */ 244 int mode; /* paging mode (0-6 levels) */
242 u64 *pt_root; /* page table root pointer */ 245 u64 *pt_root; /* page table root pointer */
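A sketch of how the new api_lock is presumably meant to be used (an assumption drawn from the field comment, not code from this series):

/* Hypothetical helper: serialize page-table updates performed via the
 * IOMMU-API path on the new per-domain mutex. */
static void example_update_pgtable(struct protection_domain *dom)
{
	mutex_lock(&dom->api_lock);
	/* ... walk and modify dom->pt_root here ... */
	mutex_unlock(&dom->api_lock);
}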
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
new file mode 100644
index 000000000000..c74a2eebe570
--- /dev/null
+++ b/arch/x86/include/asm/apb_timer.h
@@ -0,0 +1,70 @@
1/*
2 * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
13 */
14
15#ifndef ASM_X86_APBT_H
16#define ASM_X86_APBT_H
17#include <linux/sfi.h>
18
19#ifdef CONFIG_APB_TIMER
20
21/* Langwell DW APB timer registers */
22#define APBTMR_N_LOAD_COUNT 0x00
23#define APBTMR_N_CURRENT_VALUE 0x04
24#define APBTMR_N_CONTROL 0x08
25#define APBTMR_N_EOI 0x0c
26#define APBTMR_N_INT_STATUS 0x10
27
28#define APBTMRS_INT_STATUS 0xa0
29#define APBTMRS_EOI 0xa4
30#define APBTMRS_RAW_INT_STATUS 0xa8
31#define APBTMRS_COMP_VERSION 0xac
32#define APBTMRS_REG_SIZE 0x14
33
34/* register bits */
35#define APBTMR_CONTROL_ENABLE (1<<0)
36#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */
37#define APBTMR_CONTROL_INT (1<<2)
38
39/* default memory mapped register base */
40#define LNW_SCU_ADDR 0xFF100000
41#define LNW_EXT_TIMER_OFFSET 0x1B800
42#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET)
43#define LNW_EXT_TIMER_PGOFFSET 0x800
44
45/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
46#define APBT_MAX_FREQ 50
47#define APBT_MIN_FREQ 1
48#define APBT_MMAP_SIZE 1024
49
50#define APBT_DEV_USED 1
51
52extern void apbt_time_init(void);
53extern struct clock_event_device *global_clock_event;
54extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id;
58extern int disable_apbt_percpu;
59
60extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
61extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
62extern int sfi_mtimer_num;
63
64#else /* CONFIG_APB_TIMER */
65
66static inline unsigned long apbt_quick_calibrate(void) {return 0; }
67static inline void apbt_time_init(void) {return 0; }
68
69#endif
70#endif /* ASM_X86_APBT_H */
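The register layout above implies per-timer blocks spaced APBTMRS_REG_SIZE bytes apart. A rough sketch of programming timer n for periodic mode, assuming the block at APBT_DEFAULT_BASE has already been ioremapped (illustrative, not taken from the driver):

static void __iomem *apbt_base;	/* assumed: ioremap of APBT_DEFAULT_BASE */

static void example_start_periodic(int n, u32 count)
{
	void __iomem *t = apbt_base + n * APBTMRS_REG_SIZE;

	writel(count, t + APBTMR_N_LOAD_COUNT);
	writel(APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_MODE_PERIODIC,
	       t + APBTMR_N_CONTROL);
}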
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 9a9c7bdc923d..306160e58b48 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -8,7 +8,8 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <asm/user32.h> 9#include <asm/user32.h>
10 10
11#define COMPAT_USER_HZ 100 11#define COMPAT_USER_HZ 100
12#define COMPAT_UTS_MACHINE "i686\0\0"
12 13
13typedef u32 compat_size_t; 14typedef u32 compat_size_t;
14typedef s32 compat_ssize_t; 15typedef s32 compat_ssize_t;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
deleted file mode 100644
index d96c1ee3a95c..000000000000
--- a/arch/x86/include/asm/cpu_debug.h
+++ /dev/null
@@ -1,127 +0,0 @@
1#ifndef _ASM_X86_CPU_DEBUG_H
2#define _ASM_X86_CPU_DEBUG_H
3
4/*
5 * CPU x86 architecture debug
6 *
7 * Copyright(C) 2009 Jaswinder Singh Rajput
8 */
9
10/* Register flags */
11enum cpu_debug_bit {
12/* Model Specific Registers (MSRs) */
13 CPU_MC_BIT, /* Machine Check */
14 CPU_MONITOR_BIT, /* Monitor */
15 CPU_TIME_BIT, /* Time */
16 CPU_PMC_BIT, /* Performance Monitor */
17 CPU_PLATFORM_BIT, /* Platform */
18 CPU_APIC_BIT, /* APIC */
19 CPU_POWERON_BIT, /* Power-on */
20 CPU_CONTROL_BIT, /* Control */
21 CPU_FEATURES_BIT, /* Features control */
22 CPU_LBRANCH_BIT, /* Last Branch */
23 CPU_BIOS_BIT, /* BIOS */
24 CPU_FREQ_BIT, /* Frequency */
25 CPU_MTTR_BIT, /* MTRR */
26 CPU_PERF_BIT, /* Performance */
27 CPU_CACHE_BIT, /* Cache */
28 CPU_SYSENTER_BIT, /* Sysenter */
29 CPU_THERM_BIT, /* Thermal */
30 CPU_MISC_BIT, /* Miscellaneous */
31 CPU_DEBUG_BIT, /* Debug */
32 CPU_PAT_BIT, /* PAT */
33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */
36 CPU_VER_BIT, /* Version ID */
37 CPU_CONF_BIT, /* Configuration */
38 CPU_SMM_BIT, /* System mgmt mode */
39 CPU_SVM_BIT, /*Secure Virtual Machine*/
40 CPU_OSVM_BIT, /* OS-Visible Workaround*/
41/* Standard Registers */
42 CPU_TSS_BIT, /* Task Stack Segment */
43 CPU_CR_BIT, /* Control Registers */
44 CPU_DT_BIT, /* Descriptor Table */
45/* End of Registers flags */
46 CPU_REG_ALL_BIT, /* Select all Registers */
47};
48
49#define CPU_REG_ALL (~0) /* Select all Registers */
50
51#define CPU_MC (1 << CPU_MC_BIT)
52#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
53#define CPU_TIME (1 << CPU_TIME_BIT)
54#define CPU_PMC (1 << CPU_PMC_BIT)
55#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
56#define CPU_APIC (1 << CPU_APIC_BIT)
57#define CPU_POWERON (1 << CPU_POWERON_BIT)
58#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
59#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
60#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
61#define CPU_BIOS (1 << CPU_BIOS_BIT)
62#define CPU_FREQ (1 << CPU_FREQ_BIT)
63#define CPU_MTRR (1 << CPU_MTTR_BIT)
64#define CPU_PERF (1 << CPU_PERF_BIT)
65#define CPU_CACHE (1 << CPU_CACHE_BIT)
66#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
67#define CPU_THERM (1 << CPU_THERM_BIT)
68#define CPU_MISC (1 << CPU_MISC_BIT)
69#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
70#define CPU_PAT (1 << CPU_PAT_BIT)
71#define CPU_VMX (1 << CPU_VMX_BIT)
72#define CPU_CALL (1 << CPU_CALL_BIT)
73#define CPU_BASE (1 << CPU_BASE_BIT)
74#define CPU_VER (1 << CPU_VER_BIT)
75#define CPU_CONF (1 << CPU_CONF_BIT)
76#define CPU_SMM (1 << CPU_SMM_BIT)
77#define CPU_SVM (1 << CPU_SVM_BIT)
78#define CPU_OSVM (1 << CPU_OSVM_BIT)
79#define CPU_TSS (1 << CPU_TSS_BIT)
80#define CPU_CR (1 << CPU_CR_BIT)
81#define CPU_DT (1 << CPU_DT_BIT)
82
83/* Register file flags */
84enum cpu_file_bit {
85 CPU_INDEX_BIT, /* index */
86 CPU_VALUE_BIT, /* value */
87};
88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91#define MAX_CPU_FILES 512
92
93struct cpu_private {
94 unsigned cpu;
95 unsigned type;
96 unsigned reg;
97 unsigned file;
98};
99
100struct cpu_debug_base {
101 char *name; /* Register name */
102 unsigned flag; /* Register flag */
103 unsigned write; /* Register write flag */
104};
105
106/*
107 * Currently it looks similar to cpu_debug_base but once we add more files
108 * cpu_file_base will go in different direction
109 */
110struct cpu_file_base {
111 char *name; /* Register file name */
112 unsigned flag; /* Register file flag */
113 unsigned write; /* Register write flag */
114};
115
116struct cpu_cpuX_base {
117 struct dentry *dentry; /* Register dentry */
118 int init; /* Register index file */
119};
120
121struct cpu_debug_range {
122 unsigned min; /* Register range min */
123 unsigned max; /* Register range max */
124 unsigned flag; /* Supported flags */
125};
126
127#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 637e1ec963c3..0cd82d068613 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -168,6 +168,10 @@
168#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ 168#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
169#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ 169#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
170#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ 170#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
171#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
172#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
173#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
174#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */
171 175
172#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 176#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
173 177
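Once the CPU setup code has populated them, the new word-8 bits are tested like any other flag; a short hedged example (not from the patch):

	if (boot_cpu_has(X86_FEATURE_NPT))
		pr_info("AMD nested paging available\n");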
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 8240f76b531e..b81002f23614 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -14,6 +14,9 @@
14 which debugging register was responsible for the trap. The other bits 14 which debugging register was responsible for the trap. The other bits
15 are either reserved or not of interest to us. */ 15 are either reserved or not of interest to us. */
16 16
17/* Define reserved bits in DR6 which are always set to 1 */
18#define DR6_RESERVED (0xFFFF0FF0)
19
17#define DR_TRAP0 (0x1) /* db0 */ 20#define DR_TRAP0 (0x1) /* db0 */
18#define DR_TRAP1 (0x2) /* db1 */ 21#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */ 22#define DR_TRAP2 (0x4) /* db2 */
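A small illustration of what the new constant is for: masking DR6's always-set bits before looking at the trap flags (assumption, not part of this hunk):

static int example_dr6_pending(void)
{
	unsigned long dr6;

	get_debugreg(dr6, 6);
	dr6 &= ~DR6_RESERVED;	/* drop the always-set reserved bits */
	return dr6 & (DR_TRAP0 | DR_TRAP1 | DR_TRAP2 | DR_TRAP3);
}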
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 761249e396fe..0e22296790d3 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -111,11 +111,8 @@ extern unsigned long end_user_pfn;
111 111
112extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); 112extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
113extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); 113extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
114extern void reserve_early(u64 start, u64 end, char *name);
115extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
116extern void free_early(u64 start, u64 end);
117extern void early_res_to_bootmem(u64 start, u64 end);
118extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); 114extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
115#include <linux/early_res.h>
119 116
120extern unsigned long e820_end_of_ram_pfn(void); 117extern unsigned long e820_end_of_ram_pfn(void);
121extern unsigned long e820_end_of_low_ram_pfn(void); 118extern unsigned long e820_end_of_low_ram_pfn(void);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index b4501ee223ad..f2ad2163109d 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -170,10 +170,7 @@ static inline void elf_common_init(struct thread_struct *t,
170} 170}
171 171
172#define ELF_PLAT_INIT(_r, load_addr) \ 172#define ELF_PLAT_INIT(_r, load_addr) \
173do { \ 173 elf_common_init(&current->thread, _r, 0)
174 elf_common_init(&current->thread, _r, 0); \
175 clear_thread_flag(TIF_IA32); \
176} while (0)
177 174
178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 175#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
179 elf_common_init(&current->thread, regs, __USER_DS) 176 elf_common_init(&current->thread, regs, __USER_DS)
@@ -181,14 +178,8 @@ do { \
181void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); 178void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
182#define compat_start_thread start_thread_ia32 179#define compat_start_thread start_thread_ia32
183 180
184#define COMPAT_SET_PERSONALITY(ex) \ 181void set_personality_ia32(void);
185do { \ 182#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32()
186 if (test_thread_flag(TIF_IA32)) \
187 clear_thread_flag(TIF_ABI_PENDING); \
188 else \
189 set_thread_flag(TIF_ABI_PENDING); \
190 current->personality |= force_personality32; \
191} while (0)
192 183
193#define COMPAT_ELF_PLATFORM ("i686") 184#define COMPAT_ELF_PLATFORM ("i686")
194 185
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
index 53018464aea6..2519d0679d99 100644
--- a/arch/x86/include/asm/fb.h
+++ b/arch/x86/include/asm/fb.h
@@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; 12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
13} 13}
14 14
15#ifdef CONFIG_X86_32
16extern int fb_is_primary_device(struct fb_info *info); 15extern int fb_is_primary_device(struct fb_info *info);
17#else
18static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
19#endif
20 16
21#endif /* _ASM_X86_FB_H */ 17#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 635f03bb4995..d07b44f7d1dc 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -82,6 +82,9 @@ enum fixed_addresses {
82#endif 82#endif
83 FIX_DBGP_BASE, 83 FIX_DBGP_BASE,
84 FIX_EARLYCON_MEM_BASE, 84 FIX_EARLYCON_MEM_BASE,
85#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
86 FIX_OHCI1394_BASE,
87#endif
85#ifdef CONFIG_X86_LOCAL_APIC 88#ifdef CONFIG_X86_LOCAL_APIC
86 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ 89 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
87#endif 90#endif
@@ -132,9 +135,6 @@ enum fixed_addresses {
132 (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) 135 (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
133 : __end_of_permanent_fixed_addresses, 136 : __end_of_permanent_fixed_addresses,
134 FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, 137 FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
135#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
136 FIX_OHCI1394_BASE,
137#endif
138#ifdef CONFIG_X86_32 138#ifdef CONFIG_X86_32
139 FIX_WP_TEST, 139 FIX_WP_TEST,
140#endif 140#endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f8576427cfe..aeab29aee617 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
35 35
36#define __ARCH_IRQ_STAT 36#define __ARCH_IRQ_STAT
37 37
38#define inc_irq_stat(member) percpu_add(irq_stat.member, 1) 38#define inc_irq_stat(member) percpu_inc(irq_stat.member)
39 39
40#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) 40#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
41 41
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45..a726650fc80f 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
68 68
69#ifndef CONFIG_PARAVIRT
70#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
71#endif
72
73#define flush_cache_kmaps() do { } while (0) 69#define flush_cache_kmaps() do { } while (0)
74 70
75extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, 71extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 5d89fd2a3690..1d5c08a1bdfd 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -67,6 +67,7 @@ extern unsigned long hpet_address;
67extern unsigned long force_hpet_address; 67extern unsigned long force_hpet_address;
68extern u8 hpet_blockid; 68extern u8 hpet_blockid;
69extern int hpet_force_user; 69extern int hpet_force_user;
70extern u8 hpet_msi_disable;
70extern int is_hpet_enabled(void); 71extern int is_hpet_enabled(void);
71extern int hpet_enable(void); 72extern int hpet_enable(void);
72extern void hpet_disable(void); 73extern void hpet_disable(void);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 0675a7c4c20e..2a1bd8f4f23a 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -10,7 +10,6 @@
10 * (display/resolving) 10 * (display/resolving)
11 */ 11 */
12struct arch_hw_breakpoint { 12struct arch_hw_breakpoint {
13 char *name; /* Contains name of the symbol to set bkpt */
14 unsigned long address; 13 unsigned long address;
15 u8 len; 14 u8 len;
16 u8 type; 15 u8 type;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eeac829a0f44..46c0fe05f230 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -53,13 +53,6 @@ extern void threshold_interrupt(void);
53extern void call_function_interrupt(void); 53extern void call_function_interrupt(void);
54extern void call_function_single_interrupt(void); 54extern void call_function_single_interrupt(void);
55 55
56/* PIC specific functions */
57extern void disable_8259A_irq(unsigned int irq);
58extern void enable_8259A_irq(unsigned int irq);
59extern int i8259A_irq_pending(unsigned int irq);
60extern void make_8259A_irq(unsigned int irq);
61extern void init_8259A(int aeoi);
62
63/* IOAPIC */ 56/* IOAPIC */
64#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) 57#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
65extern unsigned long io_apic_irqs; 58extern unsigned long io_apic_irqs;
@@ -140,6 +133,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
140 133
141typedef int vector_irq_t[NR_VECTORS]; 134typedef int vector_irq_t[NR_VECTORS];
142DECLARE_PER_CPU(vector_irq_t, vector_irq); 135DECLARE_PER_CPU(vector_irq_t, vector_irq);
136extern void setup_vector_irq(int cpu);
143 137
144#ifdef CONFIG_X86_IO_APIC 138#ifdef CONFIG_X86_IO_APIC
145extern void lock_vector_lock(void); 139extern void lock_vector_lock(void);
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
new file mode 100644
index 000000000000..e153a2b3889a
--- /dev/null
+++ b/arch/x86/include/asm/hyperv.h
@@ -0,0 +1,186 @@
1#ifndef _ASM_X86_KVM_HYPERV_H
2#define _ASM_X86_KVM_HYPERV_H
3
4#include <linux/types.h>
5
6/*
7 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
8 * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
9 */
10#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
11#define HYPERV_CPUID_INTERFACE 0x40000001
12#define HYPERV_CPUID_VERSION 0x40000002
13#define HYPERV_CPUID_FEATURES 0x40000003
14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
16
17/*
18 * Feature identification. EAX indicates which features are available
19 * to the partition based upon the current partition privileges.
20 */
21
22/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
23#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
24/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
25#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
26/*
27 * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
28 * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
29 */
30#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
31/*
32 * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
33 * HV_X64_MSR_STIMER3_COUNT) available
34 */
35#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
36/*
37 * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
38 * are available
39 */
40#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
41/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
42#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
43/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
44#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
45/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
46#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
47 /*
48 * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
49 * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
50 * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
51 */
52#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
53
54/*
55 * Feature identification: EBX indicates which flags were specified at
56 * partition creation. The format is the same as the partition creation
57 * flag structure defined in section Partition Creation Flags.
58 */
59#define HV_X64_CREATE_PARTITIONS (1 << 0)
60#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
61#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
62#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
63#define HV_X64_POST_MESSAGES (1 << 4)
64#define HV_X64_SIGNAL_EVENTS (1 << 5)
65#define HV_X64_CREATE_PORT (1 << 6)
66#define HV_X64_CONNECT_PORT (1 << 7)
67#define HV_X64_ACCESS_STATS (1 << 8)
68#define HV_X64_DEBUGGING (1 << 11)
69#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
70#define HV_X64_CONFIGURE_PROFILER (1 << 13)
71
72/*
73 * Feature identification. EDX indicates which miscellaneous features
74 * are available to the partition.
75 */
76/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
77#define HV_X64_MWAIT_AVAILABLE (1 << 0)
78/* Guest debugging support is available */
79#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
80/* Performance Monitor support is available*/
81#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
82/* Support for physical CPU dynamic partitioning events is available*/
83#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
84/*
85 * Support for passing hypercall input parameter block via XMM
86 * registers is available
87 */
88#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
89/* Support for a virtual guest idle state is available */
90#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
91
92/*
93 * Implementation recommendations. Indicates which behaviors the hypervisor
94 * recommends the OS implement for optimal performance.
95 */
96 /*
97 * Recommend using hypercall for address space switches rather
98 * than MOV to CR3 instruction
99 */
100#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
101/* Recommend using hypercall for local TLB flushes rather
102 * than INVLPG or MOV to CR3 instructions */
103#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
104/*
105 * Recommend using hypercall for remote TLB flushes rather
106 * than inter-processor interrupts
107 */
108#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
109/*
110 * Recommend using MSRs for accessing APIC registers
111 * EOI, ICR and TPR rather than their memory-mapped counterparts
112 */
113#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
114/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
115#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
116/*
117 * Recommend using relaxed timing for this partition. If used,
118 * the VM should disable any watchdog timeouts that rely on the
119 * timely delivery of external interrupts
120 */
121#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
122
123/* MSR used to identify the guest OS. */
124#define HV_X64_MSR_GUEST_OS_ID 0x40000000
125
126/* MSR used to setup pages used to communicate with the hypervisor. */
127#define HV_X64_MSR_HYPERCALL 0x40000001
128
129/* MSR used to provide vcpu index */
130#define HV_X64_MSR_VP_INDEX 0x40000002
131
132/* Define the virtual APIC registers */
133#define HV_X64_MSR_EOI 0x40000070
134#define HV_X64_MSR_ICR 0x40000071
135#define HV_X64_MSR_TPR 0x40000072
136#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
137
138/* Define synthetic interrupt controller model specific registers. */
139#define HV_X64_MSR_SCONTROL 0x40000080
140#define HV_X64_MSR_SVERSION 0x40000081
141#define HV_X64_MSR_SIEFP 0x40000082
142#define HV_X64_MSR_SIMP 0x40000083
143#define HV_X64_MSR_EOM 0x40000084
144#define HV_X64_MSR_SINT0 0x40000090
145#define HV_X64_MSR_SINT1 0x40000091
146#define HV_X64_MSR_SINT2 0x40000092
147#define HV_X64_MSR_SINT3 0x40000093
148#define HV_X64_MSR_SINT4 0x40000094
149#define HV_X64_MSR_SINT5 0x40000095
150#define HV_X64_MSR_SINT6 0x40000096
151#define HV_X64_MSR_SINT7 0x40000097
152#define HV_X64_MSR_SINT8 0x40000098
153#define HV_X64_MSR_SINT9 0x40000099
154#define HV_X64_MSR_SINT10 0x4000009A
155#define HV_X64_MSR_SINT11 0x4000009B
156#define HV_X64_MSR_SINT12 0x4000009C
157#define HV_X64_MSR_SINT13 0x4000009D
158#define HV_X64_MSR_SINT14 0x4000009E
159#define HV_X64_MSR_SINT15 0x4000009F
160
161
162#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
163#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
164#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
165 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
166
167/* Declare the various hypercall operations. */
168#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
169
170#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
171#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
172#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
173 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
174
175#define HV_PROCESSOR_POWER_STATE_C0 0
176#define HV_PROCESSOR_POWER_STATE_C1 1
177#define HV_PROCESSOR_POWER_STATE_C2 2
178#define HV_PROCESSOR_POWER_STATE_C3 3
179
180/* hypercall status code */
181#define HV_STATUS_SUCCESS 0
182#define HV_STATUS_INVALID_HYPERCALL_CODE 2
183#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
184#define HV_STATUS_INVALID_ALIGNMENT 4
185
186#endif
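asm/hyperv.h gives both KVM, which emulates these MSRs, and guests a single set of Hyper-V constants. As an illustration of how the pieces above fit together, the sketch below probes the feature leaf and decodes a hypothetical HV_X64_MSR_HYPERCALL value; it is not part of the patch and assumes GCC's <cpuid.h> plus a hypervisor that exposes the 0x4000000x leaves (on bare metal the max-leaf check simply fails):

#include <stdio.h>
#include <stdint.h>
#include <cpuid.h>      /* GCC's __cpuid() macro */

#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS   0x40000000
#define HYPERV_CPUID_FEATURES                   0x40000003
#define HV_X64_MSR_HYPERCALL_AVAILABLE          (1 << 5)

#define HV_X64_MSR_HYPERCALL_ENABLE             0x00000001
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
        (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 0x40000000: EAX holds the highest hypervisor CPUID leaf. */
        __cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, eax, ebx, ecx, edx);
        if (eax >= HYPERV_CPUID_FEATURES) {
                /* Leaf 0x40000003: EAX advertises the synthetic MSRs. */
                __cpuid(HYPERV_CPUID_FEATURES, eax, ebx, ecx, edx);
                if (eax & HV_X64_MSR_HYPERCALL_AVAILABLE)
                        printf("HV_X64_MSR_HYPERCALL is implemented\n");
        }

        /* A made-up HV_X64_MSR_HYPERCALL value: hypercall page GPA plus enable bit. */
        uint64_t msr = 0x12345000ULL | HV_X64_MSR_HYPERCALL_ENABLE;
        printf("hypercall page gpa 0x%llx, enabled %d\n",
               (unsigned long long)(msr & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK),
               (int)(msr & HV_X64_MSR_HYPERCALL_ENABLE));
        return 0;
}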
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ebfb8a9e11f7..da2930924501 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -33,8 +33,16 @@ extern void init_thread_xstate(void);
33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
34 34
35extern user_regset_active_fn fpregs_active, xfpregs_active; 35extern user_regset_active_fn fpregs_active, xfpregs_active;
36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; 36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
37extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; 37 xstateregs_get;
38extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
39 xstateregs_set;
40
41/*
42 * xstateregs_active == fpregs_active. Please refer to the comment
43 * at the definition of fpregs_active.
44 */
45#define xstateregs_active fpregs_active
38 46
39extern struct _fpx_sw_bytes fx_sw_reserved; 47extern struct _fpx_sw_bytes fx_sw_reserved;
40#ifdef CONFIG_IA32_EMULATION 48#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 58d7091eeb1f..1655147646aa 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -24,12 +24,7 @@ extern unsigned int cached_irq_mask;
24#define SLAVE_ICW4_DEFAULT 0x01 24#define SLAVE_ICW4_DEFAULT 0x01
25#define PIC_ICW4_AEOI 2 25#define PIC_ICW4_AEOI 2
26 26
27extern spinlock_t i8259A_lock; 27extern raw_spinlock_t i8259A_lock;
28
29extern void init_8259A(int auto_eoi);
30extern void enable_8259A_irq(unsigned int irq);
31extern void disable_8259A_irq(unsigned int irq);
32extern unsigned int startup_8259A_irq(unsigned int irq);
33 28
34/* the PIC may need a careful delay on some platforms, hence specific calls */ 29/* the PIC may need a careful delay on some platforms, hence specific calls */
35static inline unsigned char inb_pic(unsigned int port) 30static inline unsigned char inb_pic(unsigned int port)
@@ -57,7 +52,17 @@ static inline void outb_pic(unsigned char value, unsigned int port)
57 52
58extern struct irq_chip i8259A_chip; 53extern struct irq_chip i8259A_chip;
59 54
60extern void mask_8259A(void); 55struct legacy_pic {
61extern void unmask_8259A(void); 56 int nr_legacy_irqs;
57 struct irq_chip *chip;
58 void (*mask_all)(void);
59 void (*restore_mask)(void);
60 void (*init)(int auto_eoi);
61 int (*irq_pending)(unsigned int irq);
62 void (*make_irq)(unsigned int irq);
63};
64
65extern struct legacy_pic *legacy_pic;
66extern struct legacy_pic null_legacy_pic;
62 67
63#endif /* _ASM_X86_I8259_H */ 68#endif /* _ASM_X86_I8259_H */
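Together with the hw_irq.h hunk above, which drops the direct 8259A prototypes, this introduces a small ops table so callers no longer hard-code the PIC: PCs register the real i8259 driver, while PIC-less platforms can install null_legacy_pic. A hedged sketch of how a caller migrates; kernel context only, and the function name is made up:

/* Hypothetical caller, kernel context: uses only fields declared above. */
static void probe_legacy_timer(void)
{
        legacy_pic->init(0);                    /* was: init_8259A(0)         */

        if (legacy_pic->irq_pending(0))         /* was: i8259A_irq_pending(0) */
                pr_info("timer IRQ pending; %d legacy IRQs on this platform\n",
                        legacy_pic->nr_legacy_irqs);
}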
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 73739322b6d0..a1dcfa3ab17d 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -1,8 +1,42 @@
1#ifndef _ASM_X86_IO_H 1#ifndef _ASM_X86_IO_H
2#define _ASM_X86_IO_H 2#define _ASM_X86_IO_H
3 3
4/*
5 * This file contains the definitions for the x86 IO instructions
6 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
7 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
8 * versions of the single-IO instructions (inb_p/inw_p/..).
9 *
10 * This file is not meant to be obfuscating: it's just complicated
11 * to (a) handle it all in a way that makes gcc able to optimize it
12 * as well as possible and (b) trying to avoid writing the same thing
13 * over and over again with slight variations and possibly making a
14 * mistake somewhere.
15 */
16
17/*
18 * Thanks to James van Artsdalen for a better timing-fix than
19 * the two short jumps: using outb's to a nonexistent port seems
20 * to guarantee better timings even on fast machines.
21 *
22 * On the other hand, I'd like to be sure of a non-existent port:
23 * I feel a bit unsafe about using 0x80 (should be safe, though)
24 *
25 * Linus
26 */
27
28 /*
29 * Bit simplified and optimized by Jan Hubicka
30 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
31 *
32 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
33 * isa_read[wl] and isa_write[wl] fixed
34 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
35 */
36
4#define ARCH_HAS_IOREMAP_WC 37#define ARCH_HAS_IOREMAP_WC
5 38
39#include <linux/string.h>
6#include <linux/compiler.h> 40#include <linux/compiler.h>
7#include <asm-generic/int-ll64.h> 41#include <asm-generic/int-ll64.h>
8#include <asm/page.h> 42#include <asm/page.h>
@@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
173extern void iounmap(volatile void __iomem *addr); 207extern void iounmap(volatile void __iomem *addr);
174 208
175 209
176#ifdef CONFIG_X86_32 210#ifdef __KERNEL__
177# include "io_32.h" 211
212#include <asm-generic/iomap.h>
213
214#include <linux/vmalloc.h>
215
216/*
217 * Convert a virtual cached pointer to an uncached pointer
218 */
219#define xlate_dev_kmem_ptr(p) p
220
221static inline void
222memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
223{
224 memset((void __force *)addr, val, count);
225}
226
227static inline void
228memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
229{
230 memcpy(dst, (const void __force *)src, count);
231}
232
233static inline void
234memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
235{
236 memcpy((void __force *)dst, src, count);
237}
238
239/*
240 * ISA space is 'always mapped' on a typical x86 system, no need to
241 * explicitly ioremap() it. The fact that the ISA IO space is mapped
242 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
243 * are physical addresses. The following constant pointer can be
244 * used as the IO-area pointer (it can be iounmapped as well, so the
245 * analogy with PCI is quite large):
246 */
247#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
248
249/*
250 * Cache management
251 *
252 * This is needed for two cases
253 * 1. Out of order aware processors
254 * 2. Accidentally out of order processors (PPro errata #51)
255 */
256
257static inline void flush_write_buffers(void)
258{
259#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
260 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
261#endif
262}
263
264#endif /* __KERNEL__ */
265
266extern void native_io_delay(void);
267
268extern int io_delay_type;
269extern void io_delay_init(void);
270
271#if defined(CONFIG_PARAVIRT)
272#include <asm/paravirt.h>
178#else 273#else
179# include "io_64.h" 274
275static inline void slow_down_io(void)
276{
277 native_io_delay();
278#ifdef REALLY_SLOW_IO
279 native_io_delay();
280 native_io_delay();
281 native_io_delay();
180#endif 282#endif
283}
284
285#endif
286
287#define BUILDIO(bwl, bw, type) \
288static inline void out##bwl(unsigned type value, int port) \
289{ \
290 asm volatile("out" #bwl " %" #bw "0, %w1" \
291 : : "a"(value), "Nd"(port)); \
292} \
293 \
294static inline unsigned type in##bwl(int port) \
295{ \
296 unsigned type value; \
297 asm volatile("in" #bwl " %w1, %" #bw "0" \
298 : "=a"(value) : "Nd"(port)); \
299 return value; \
300} \
301 \
302static inline void out##bwl##_p(unsigned type value, int port) \
303{ \
304 out##bwl(value, port); \
305 slow_down_io(); \
306} \
307 \
308static inline unsigned type in##bwl##_p(int port) \
309{ \
310 unsigned type value = in##bwl(port); \
311 slow_down_io(); \
312 return value; \
313} \
314 \
315static inline void outs##bwl(int port, const void *addr, unsigned long count) \
316{ \
317 asm volatile("rep; outs" #bwl \
318 : "+S"(addr), "+c"(count) : "d"(port)); \
319} \
320 \
321static inline void ins##bwl(int port, void *addr, unsigned long count) \
322{ \
323 asm volatile("rep; ins" #bwl \
324 : "+D"(addr), "+c"(count) : "d"(port)); \
325}
326
327BUILDIO(b, b, char)
328BUILDIO(w, w, short)
329BUILDIO(l, , int)
181 330
182extern void *xlate_dev_mem_ptr(unsigned long phys); 331extern void *xlate_dev_mem_ptr(unsigned long phys);
183extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); 332extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
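With io_32.h and io_64.h folded into this file (both are deleted below), the port accessors come from a single BUILDIO macro. Expanding BUILDIO(b, b, char) by hand yields the familiar outb()/inb() pair; the sketch below shows that expansion plus an illustrative use (kernel context, since raw port access needs I/O privilege):

/* BUILDIO(b, b, char) expands roughly to: */
static inline void outb(unsigned char value, int port)
{
        asm volatile("outb %b0, %w1" : : "a"(value), "Nd"(port));
}

static inline unsigned char inb(int port)
{
        unsigned char value;
        asm volatile("inb %w1, %b0" : "=a"(value) : "Nd"(port));
        return value;
}

/* ...plus outb_p()/inb_p(), which add slow_down_io(), and the rep-prefixed
 * string forms outsb()/insb().  Illustrative use, reading the RTC seconds
 * register through the classic 0x70/0x71 index/data pair: */
static inline unsigned char cmos_read_seconds(void)
{
        outb(0x00, 0x70);       /* select CMOS register 0 (seconds) */
        return inb(0x71);       /* read it back from the data port  */
}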
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
deleted file mode 100644
index a299900f5920..000000000000
--- a/arch/x86/include/asm/io_32.h
+++ /dev/null
@@ -1,196 +0,0 @@
1#ifndef _ASM_X86_IO_32_H
2#define _ASM_X86_IO_32_H
3
4#include <linux/string.h>
5#include <linux/compiler.h>
6
7/*
8 * This file contains the definitions for the x86 IO instructions
9 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
10 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
11 * versions of the single-IO instructions (inb_p/inw_p/..).
12 *
13 * This file is not meant to be obfuscating: it's just complicated
14 * to (a) handle it all in a way that makes gcc able to optimize it
15 * as well as possible and (b) trying to avoid writing the same thing
16 * over and over again with slight variations and possibly making a
17 * mistake somewhere.
18 */
19
20/*
21 * Thanks to James van Artsdalen for a better timing-fix than
22 * the two short jumps: using outb's to a nonexistent port seems
23 * to guarantee better timings even on fast machines.
24 *
25 * On the other hand, I'd like to be sure of a non-existent port:
26 * I feel a bit unsafe about using 0x80 (should be safe, though)
27 *
28 * Linus
29 */
30
31 /*
32 * Bit simplified and optimized by Jan Hubicka
33 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
34 *
35 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
36 * isa_read[wl] and isa_write[wl] fixed
37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
38 */
39
40#define XQUAD_PORTIO_BASE 0xfe400000
41#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
42
43#ifdef __KERNEL__
44
45#include <asm-generic/iomap.h>
46
47#include <linux/vmalloc.h>
48
49/*
50 * Convert a virtual cached pointer to an uncached pointer
51 */
52#define xlate_dev_kmem_ptr(p) p
53
54static inline void
55memset_io(volatile void __iomem *addr, unsigned char val, int count)
56{
57 memset((void __force *)addr, val, count);
58}
59
60static inline void
61memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
62{
63 __memcpy(dst, (const void __force *)src, count);
64}
65
66static inline void
67memcpy_toio(volatile void __iomem *dst, const void *src, int count)
68{
69 __memcpy((void __force *)dst, src, count);
70}
71
72/*
73 * ISA space is 'always mapped' on a typical x86 system, no need to
74 * explicitly ioremap() it. The fact that the ISA IO space is mapped
75 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
76 * are physical addresses. The following constant pointer can be
77 * used as the IO-area pointer (it can be iounmapped as well, so the
78 * analogy with PCI is quite large):
79 */
80#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
81
82/*
83 * Cache management
84 *
85 * This needed for two cases
86 * 1. Out of order aware processors
87 * 2. Accidentally out of order processors (PPro errata #51)
88 */
89
90#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
91
92static inline void flush_write_buffers(void)
93{
94 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
95}
96
97#else
98
99#define flush_write_buffers() do { } while (0)
100
101#endif
102
103#endif /* __KERNEL__ */
104
105extern void native_io_delay(void);
106
107extern int io_delay_type;
108extern void io_delay_init(void);
109
110#if defined(CONFIG_PARAVIRT)
111#include <asm/paravirt.h>
112#else
113
114static inline void slow_down_io(void)
115{
116 native_io_delay();
117#ifdef REALLY_SLOW_IO
118 native_io_delay();
119 native_io_delay();
120 native_io_delay();
121#endif
122}
123
124#endif
125
126#define __BUILDIO(bwl, bw, type) \
127static inline void out##bwl(unsigned type value, int port) \
128{ \
129 out##bwl##_local(value, port); \
130} \
131 \
132static inline unsigned type in##bwl(int port) \
133{ \
134 return in##bwl##_local(port); \
135}
136
137#define BUILDIO(bwl, bw, type) \
138static inline void out##bwl##_local(unsigned type value, int port) \
139{ \
140 asm volatile("out" #bwl " %" #bw "0, %w1" \
141 : : "a"(value), "Nd"(port)); \
142} \
143 \
144static inline unsigned type in##bwl##_local(int port) \
145{ \
146 unsigned type value; \
147 asm volatile("in" #bwl " %w1, %" #bw "0" \
148 : "=a"(value) : "Nd"(port)); \
149 return value; \
150} \
151 \
152static inline void out##bwl##_local_p(unsigned type value, int port) \
153{ \
154 out##bwl##_local(value, port); \
155 slow_down_io(); \
156} \
157 \
158static inline unsigned type in##bwl##_local_p(int port) \
159{ \
160 unsigned type value = in##bwl##_local(port); \
161 slow_down_io(); \
162 return value; \
163} \
164 \
165__BUILDIO(bwl, bw, type) \
166 \
167static inline void out##bwl##_p(unsigned type value, int port) \
168{ \
169 out##bwl(value, port); \
170 slow_down_io(); \
171} \
172 \
173static inline unsigned type in##bwl##_p(int port) \
174{ \
175 unsigned type value = in##bwl(port); \
176 slow_down_io(); \
177 return value; \
178} \
179 \
180static inline void outs##bwl(int port, const void *addr, unsigned long count) \
181{ \
182 asm volatile("rep; outs" #bwl \
183 : "+S"(addr), "+c"(count) : "d"(port)); \
184} \
185 \
186static inline void ins##bwl(int port, void *addr, unsigned long count) \
187{ \
188 asm volatile("rep; ins" #bwl \
189 : "+D"(addr), "+c"(count) : "d"(port)); \
190}
191
192BUILDIO(b, b, char)
193BUILDIO(w, w, short)
194BUILDIO(l, , int)
195
196#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
deleted file mode 100644
index 244067893af4..000000000000
--- a/arch/x86/include/asm/io_64.h
+++ /dev/null
@@ -1,181 +0,0 @@
1#ifndef _ASM_X86_IO_64_H
2#define _ASM_X86_IO_64_H
3
4
5/*
6 * This file contains the definitions for the x86 IO instructions
7 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
8 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
9 * versions of the single-IO instructions (inb_p/inw_p/..).
10 *
11 * This file is not meant to be obfuscating: it's just complicated
12 * to (a) handle it all in a way that makes gcc able to optimize it
13 * as well as possible and (b) trying to avoid writing the same thing
14 * over and over again with slight variations and possibly making a
15 * mistake somewhere.
16 */
17
18/*
19 * Thanks to James van Artsdalen for a better timing-fix than
20 * the two short jumps: using outb's to a nonexistent port seems
21 * to guarantee better timings even on fast machines.
22 *
23 * On the other hand, I'd like to be sure of a non-existent port:
24 * I feel a bit unsafe about using 0x80 (should be safe, though)
25 *
26 * Linus
27 */
28
29 /*
30 * Bit simplified and optimized by Jan Hubicka
31 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
32 *
33 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
34 * isa_read[wl] and isa_write[wl] fixed
35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
36 */
37
38extern void native_io_delay(void);
39
40extern int io_delay_type;
41extern void io_delay_init(void);
42
43#if defined(CONFIG_PARAVIRT)
44#include <asm/paravirt.h>
45#else
46
47static inline void slow_down_io(void)
48{
49 native_io_delay();
50#ifdef REALLY_SLOW_IO
51 native_io_delay();
52 native_io_delay();
53 native_io_delay();
54#endif
55}
56#endif
57
58/*
59 * Talk about misusing macros..
60 */
61#define __OUT1(s, x) \
62static inline void out##s(unsigned x value, unsigned short port) {
63
64#define __OUT2(s, s1, s2) \
65asm volatile ("out" #s " %" s1 "0,%" s2 "1"
66
67#ifndef REALLY_SLOW_IO
68#define REALLY_SLOW_IO
69#define UNSET_REALLY_SLOW_IO
70#endif
71
72#define __OUT(s, s1, x) \
73 __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
74 } \
75 __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
76 slow_down_io(); \
77}
78
79#define __IN1(s) \
80static inline RETURN_TYPE in##s(unsigned short port) \
81{ \
82 RETURN_TYPE _v;
83
84#define __IN2(s, s1, s2) \
85 asm volatile ("in" #s " %" s2 "1,%" s1 "0"
86
87#define __IN(s, s1, i...) \
88 __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
89 return _v; \
90 } \
91 __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
92 slow_down_io(); \
93 return _v; }
94
95#ifdef UNSET_REALLY_SLOW_IO
96#undef REALLY_SLOW_IO
97#endif
98
99#define __INS(s) \
100static inline void ins##s(unsigned short port, void *addr, \
101 unsigned long count) \
102{ \
103 asm volatile ("rep ; ins" #s \
104 : "=D" (addr), "=c" (count) \
105 : "d" (port), "0" (addr), "1" (count)); \
106}
107
108#define __OUTS(s) \
109static inline void outs##s(unsigned short port, const void *addr, \
110 unsigned long count) \
111{ \
112 asm volatile ("rep ; outs" #s \
113 : "=S" (addr), "=c" (count) \
114 : "d" (port), "0" (addr), "1" (count)); \
115}
116
117#define RETURN_TYPE unsigned char
118__IN(b, "")
119#undef RETURN_TYPE
120#define RETURN_TYPE unsigned short
121__IN(w, "")
122#undef RETURN_TYPE
123#define RETURN_TYPE unsigned int
124__IN(l, "")
125#undef RETURN_TYPE
126
127__OUT(b, "b", char)
128__OUT(w, "w", short)
129__OUT(l, , int)
130
131__INS(b)
132__INS(w)
133__INS(l)
134
135__OUTS(b)
136__OUTS(w)
137__OUTS(l)
138
139#if defined(__KERNEL__) && defined(__x86_64__)
140
141#include <linux/vmalloc.h>
142
143#include <asm-generic/iomap.h>
144
145void __memcpy_fromio(void *, unsigned long, unsigned);
146void __memcpy_toio(unsigned long, const void *, unsigned);
147
148static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
149 unsigned len)
150{
151 __memcpy_fromio(to, (unsigned long)from, len);
152}
153
154static inline void memcpy_toio(volatile void __iomem *to, const void *from,
155 unsigned len)
156{
157 __memcpy_toio((unsigned long)to, from, len);
158}
159
160void memset_io(volatile void __iomem *a, int b, size_t c);
161
162/*
163 * ISA space is 'always mapped' on a typical x86 system, no need to
164 * explicitly ioremap() it. The fact that the ISA IO space is mapped
165 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
166 * are physical addresses. The following constant pointer can be
167 * used as the IO-area pointer (it can be iounmapped as well, so the
168 * analogy with PCI is quite large):
169 */
170#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
171
172#define flush_write_buffers()
173
174/*
175 * Convert a virtual cached pointer to an uncached pointer
176 */
177#define xlate_dev_kmem_ptr(p) p
178
179#endif /* __KERNEL__ */
180
181#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7c7c16cde1f8..35832a03a515 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,8 +143,6 @@ extern int noioapicreroute;
143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
144extern int timer_through_8259; 144extern int timer_through_8259;
145 145
146extern void io_apic_disable_legacy(void);
147
148/* 146/*
149 * If we use the IO-APIC for IRQ routing, disable automatic 147 * If we use the IO-APIC for IRQ routing, disable automatic
150 * assignment of PCI IRQ's. 148 * assignment of PCI IRQ's.
@@ -160,6 +158,7 @@ extern int io_apic_get_redir_entries(int ioapic);
160struct io_apic_irq_attr; 158struct io_apic_irq_attr;
161extern int io_apic_set_pci_routing(struct device *dev, int irq, 159extern int io_apic_set_pci_routing(struct device *dev, int irq,
162 struct io_apic_irq_attr *irq_attr); 160 struct io_apic_irq_attr *irq_attr);
161void setup_IO_APIC_irq_extra(u32 gsi);
163extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
164extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
165extern void ioapic_insert_resources(void); 164extern void ioapic_insert_resources(void);
@@ -188,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[];
188int mp_find_ioapic(int gsi); 187int mp_find_ioapic(int gsi);
189int mp_find_ioapic_pin(int ioapic, int gsi); 188int mp_find_ioapic_pin(int ioapic, int gsi);
190void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
190extern void __init pre_init_apic_IRQ0(void);
191 191
192#else /* !CONFIG_X86_IO_APIC */ 192#else /* !CONFIG_X86_IO_APIC */
193 193
@@ -197,7 +197,11 @@ static const int timer_through_8259 = 0;
197static inline void ioapic_init_mappings(void) { } 197static inline void ioapic_init_mappings(void) { }
198static inline void ioapic_insert_resources(void) { } 198static inline void ioapic_insert_resources(void) { }
199static inline void probe_nr_irqs_gsi(void) { } 199static inline void probe_nr_irqs_gsi(void) { }
200static inline int mp_find_ioapic(int gsi) { return 0; }
200 201
202struct io_apic_irq_attr;
203static inline int io_apic_set_pci_routing(struct device *dev, int irq,
204 struct io_apic_irq_attr *irq_attr) { return 0; }
201#endif 205#endif
202 206
203#endif /* _ASM_X86_IO_APIC_H */ 207#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4611f085cd43..8767d99c4f64 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -28,28 +28,33 @@
28#define MCE_VECTOR 0x12 28#define MCE_VECTOR 0x12
29 29
30/* 30/*
31 * IDT vectors usable for external interrupt sources start 31 * IDT vectors usable for external interrupt sources start at 0x20.
32 * at 0x20: 32 * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
33 */ 33 */
34#define FIRST_EXTERNAL_VECTOR 0x20 34#define FIRST_EXTERNAL_VECTOR 0x20
35 35/*
36#ifdef CONFIG_X86_32 36 * We start allocating at 0x21 to spread out vectors evenly between
37# define SYSCALL_VECTOR 0x80 37 * priority levels. (0x80 is the syscall vector)
38# define IA32_SYSCALL_VECTOR 0x80 38 */
39#else 39#define VECTOR_OFFSET_START 1
40# define IA32_SYSCALL_VECTOR 0x80
41#endif
42 40
43/* 41/*
44 * Reserve the lowest usable priority level 0x20 - 0x2f for triggering 42 * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
45 * cleanup after irq migration. 43 * triggering cleanup after irq migration. 0x21-0x2f will still be used
44 * for device interrupts.
46 */ 45 */
47#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR 46#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
48 47
48#define IA32_SYSCALL_VECTOR 0x80
49#ifdef CONFIG_X86_32
50# define SYSCALL_VECTOR 0x80
51#endif
52
49/* 53/*
50 * Vectors 0x30-0x3f are used for ISA interrupts. 54 * Vectors 0x30-0x3f are used for ISA interrupts.
55 * round up to the next 16-vector boundary
51 */ 56 */
52#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) 57#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
53 58
54#define IRQ1_VECTOR (IRQ0_VECTOR + 1) 59#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
55#define IRQ2_VECTOR (IRQ0_VECTOR + 2) 60#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
@@ -120,13 +125,6 @@
120 */ 125 */
121#define MCE_SELF_VECTOR 0xeb 126#define MCE_SELF_VECTOR 0xeb
122 127
123/*
124 * First APIC vector available to drivers: (vectors 0x30-0xee) we
125 * start at 0x31(0x41) to spread out vectors evenly between priority
126 * levels. (0x80 is the syscall vector)
127 */
128#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
129
130#define NR_VECTORS 256 128#define NR_VECTORS 256
131 129
132#define FPU_IRQ 13 130#define FPU_IRQ 13
@@ -154,21 +152,21 @@ static inline int invalid_vm86_irq(int irq)
154 152
155#define NR_IRQS_LEGACY 16 153#define NR_IRQS_LEGACY 16
156 154
157#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
158#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) 155#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
159 156
160#ifdef CONFIG_X86_IO_APIC 157#ifdef CONFIG_X86_IO_APIC
161# ifdef CONFIG_SPARSE_IRQ 158# ifdef CONFIG_SPARSE_IRQ
159# define CPU_VECTOR_LIMIT (64 * NR_CPUS)
162# define NR_IRQS \ 160# define NR_IRQS \
163 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ 161 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
164 (NR_VECTORS + CPU_VECTOR_LIMIT) : \ 162 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
165 (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) 163 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
166# else 164# else
167# if NR_CPUS < MAX_IO_APICS 165# define CPU_VECTOR_LIMIT (32 * NR_CPUS)
168# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) 166# define NR_IRQS \
169# else 167 (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \
170# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) 168 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
171# endif 169 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
172# endif 170# endif
173#else /* !CONFIG_X86_IO_APIC: */ 171#else /* !CONFIG_X86_IO_APIC: */
174# define NR_IRQS NR_IRQS_LEGACY 172# define NR_IRQS NR_IRQS_LEGACY
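The reshuffle above keeps ISA IRQ0 at vector 0x30 (the rounding expression just makes the 16-vector alignment explicit) and sizes NR_IRQS from the CPU-based and IO-APIC-based limits: the larger of the two with sparse IRQs, the smaller without. A standalone check of the arithmetic, using illustrative NR_CPUS/MAX_IO_APICS values rather than any particular config:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR   0x20
#define NR_VECTORS              256
#define NR_CPUS                 8       /* illustrative config values */
#define MAX_IO_APICS            64

#define IRQ0_VECTOR             ((FIRST_EXTERNAL_VECTOR + 16) & ~15)

#define IO_APIC_VECTOR_LIMIT    (32 * MAX_IO_APICS)
#define CPU_VECTOR_LIMIT        (64 * NR_CPUS)          /* sparse-IRQ case */
#define NR_IRQS                 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
                                 (NR_VECTORS + CPU_VECTOR_LIMIT) :         \
                                 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))

int main(void)
{
        /* (0x20 + 16) & ~15 == 0x30, i.e. ISA IRQ0 still lands at vector 0x30 */
        printf("IRQ0_VECTOR = 0x%x\n", IRQ0_VECTOR);
        printf("NR_IRQS     = %d (cpu limit %d vs ioapic limit %d)\n",
               NR_IRQS, CPU_VECTOR_LIMIT, IO_APIC_VECTOR_LIMIT);
        return 0;
}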
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e76..4ffa345a8ccb 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
32 32
33typedef u8 kprobe_opcode_t; 33typedef u8 kprobe_opcode_t;
34#define BREAKPOINT_INSTRUCTION 0xcc 34#define BREAKPOINT_INSTRUCTION 0xcc
35#define RELATIVEJUMP_INSTRUCTION 0xe9 35#define RELATIVEJUMP_OPCODE 0xe9
36#define RELATIVEJUMP_SIZE 5
37#define RELATIVECALL_OPCODE 0xe8
38#define RELATIVE_ADDR_SIZE 4
36#define MAX_INSN_SIZE 16 39#define MAX_INSN_SIZE 16
37#define MAX_STACK_SIZE 64 40#define MAX_STACK_SIZE 64
38#define MIN_STACK_SIZE(ADDR) \ 41#define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
44 47
45#define flush_insn_slot(p) do { } while (0) 48#define flush_insn_slot(p) do { } while (0)
46 49
50/* optinsn template addresses */
51extern kprobe_opcode_t optprobe_template_entry;
52extern kprobe_opcode_t optprobe_template_val;
53extern kprobe_opcode_t optprobe_template_call;
54extern kprobe_opcode_t optprobe_template_end;
55#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
56#define MAX_OPTINSN_SIZE \
57 (((unsigned long)&optprobe_template_end - \
58 (unsigned long)&optprobe_template_entry) + \
59 MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
60
47extern const int kretprobe_blacklist_size; 61extern const int kretprobe_blacklist_size;
48 62
49void arch_remove_kprobe(struct kprobe *p); 63void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
64 int boostable; 78 int boostable;
65}; 79};
66 80
81struct arch_optimized_insn {
82 /* copy of the original instructions */
83 kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
84 /* detour code buffer */
85 kprobe_opcode_t *insn;
86 /* the size of instructions copied to detour code buffer */
87 size_t size;
88};
89
90/* Return true (!0) if optinsn is prepared for optimization. */
91static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
92{
93 return optinsn->size;
94}
95
67struct prev_kprobe { 96struct prev_kprobe {
68 struct kprobe *kp; 97 struct kprobe *kp;
69 unsigned long status; 98 unsigned long status;
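The new opcode and size constants exist because an optimized kprobe overwrites the probed instruction(s) with a 5-byte jmp rel32 into a detour buffer, and the buffer jumps back the same way, hence the RELATIVEJUMP_SIZE term in MAX_OPTINSN_SIZE. A standalone sketch of the displacement encoding those constants describe, with made-up addresses:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define RELATIVEJUMP_OPCODE     0xe9
#define RELATIVEJUMP_SIZE       5
#define RELATIVE_ADDR_SIZE      4

/* Encode "jmp rel32" at address src targeting dest; x86 relative jump
 * displacements are computed from the end of the 5-byte instruction. */
static void encode_reljump(uint8_t insn[RELATIVEJUMP_SIZE],
                           uint32_t src, uint32_t dest)
{
        int32_t rel = (int32_t)(dest - (src + RELATIVEJUMP_SIZE));

        insn[0] = RELATIVEJUMP_OPCODE;
        memcpy(&insn[1], &rel, RELATIVE_ADDR_SIZE);
}

int main(void)
{
        uint8_t insn[RELATIVEJUMP_SIZE];

        encode_reljump(insn, 0x1000, 0x2000);   /* hypothetical addresses */
        printf("jmp: %02x %02x %02x %02x %02x\n",
               insn[0], insn[1], insn[2], insn[3], insn[4]);
        return 0;
}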
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 950df434763f..f46b79f6c16c 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -254,6 +254,10 @@ struct kvm_reinject_control {
254 __u8 reserved[31]; 254 __u8 reserved[31];
255}; 255};
256 256
257/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
258#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
259#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
260
257/* for KVM_GET/SET_VCPU_EVENTS */ 261/* for KVM_GET/SET_VCPU_EVENTS */
258struct kvm_vcpu_events { 262struct kvm_vcpu_events {
259 struct { 263 struct {
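The two VALID bits let userspace tell KVM_SET_VCPU_EVENTS which optional fields of struct kvm_vcpu_events it actually filled in, instead of having every call clobber all of them. A hedged userspace sketch, assuming a vcpu fd already created through /dev/kvm and the kernel's installed linux/kvm.h:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: update only the NMI-pending state of an existing vcpu. */
static int set_nmi_pending(int vcpu_fd, int pending)
{
        struct kvm_vcpu_events events;

        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                return -1;

        events.nmi.pending = pending;
        /* Only the NMI-pending field should be taken over by the kernel. */
        events.flags = KVM_VCPUEVENT_VALID_NMI_PENDING;

        return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}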
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7c18e1230f54..7a6f54fa13ba 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -54,13 +54,23 @@ struct x86_emulate_ctxt;
54struct x86_emulate_ops { 54struct x86_emulate_ops {
55 /* 55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory. 56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others. 57 * Used for descriptor reading.
58 * @addr: [IN ] Linear address from which to read. 58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory. 60 * @bytes: [IN ] Number of bytes to read from memory.
61 */ 61 */
62 int (*read_std)(unsigned long addr, void *val, 62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu); 63 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64
65 /*
66 * fetch: Read bytes of standard (non-emulated/special) memory.
67 * Used for instruction fetch.
68 * @addr: [IN ] Linear address from which to read.
69 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
70 * @bytes: [IN ] Number of bytes to read from memory.
71 */
72 int (*fetch)(unsigned long addr, void *val,
73 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64 74
65 /* 75 /*
66 * read_emulated: Read bytes from emulated/special memory area. 76 * read_emulated: Read bytes from emulated/special memory area.
@@ -74,7 +84,7 @@ struct x86_emulate_ops {
74 struct kvm_vcpu *vcpu); 84 struct kvm_vcpu *vcpu);
75 85
76 /* 86 /*
77 * write_emulated: Read bytes from emulated/special memory area. 87 * write_emulated: Write bytes to emulated/special memory area.
78 * @addr: [IN ] Linear address to which to write. 88 * @addr: [IN ] Linear address to which to write.
79 * @val: [IN ] Value to write to memory (low-order bytes used as 89 * @val: [IN ] Value to write to memory (low-order bytes used as
80 * required). 90 * required).
@@ -168,6 +178,7 @@ struct x86_emulate_ctxt {
168 178
169/* Execution mode, passed to the emulator. */ 179/* Execution mode, passed to the emulator. */
170#define X86EMUL_MODE_REAL 0 /* Real mode. */ 180#define X86EMUL_MODE_REAL 0 /* Real mode. */
181#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
171#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ 182#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
172#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ 183#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
173#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 184#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
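Splitting fetch out of read_std lets instruction fetch and descriptor reads take different translation paths, with the page-fault error code now handed back through the new u32 *error argument rather than injected by the callback itself. A sketch of what a fetch callback can look like; kernel context only, the function name is made up, and it leans on the kvm_mmu_gva_to_gpa_fetch() helper declared in the kvm_host.h hunk further down:

static int example_fetch(unsigned long addr, void *val,
                         unsigned int bytes, struct kvm_vcpu *vcpu,
                         u32 *error)
{
        /* Translate with the "fetch" access type; *error is filled on fault. */
        gpa_t gpa = kvm_mmu_gva_to_gpa_fetch(vcpu, addr, error);

        if (gpa == UNMAPPED_GVA)
                return X86EMUL_PROPAGATE_FAULT;
        if (kvm_read_guest(vcpu->kvm, gpa, val, bytes))
                return X86EMUL_UNHANDLEABLE;
        return X86EMUL_CONTINUE;
}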
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4f865e8b8540..06d9e79ca37d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,7 +25,7 @@
25#include <asm/mtrr.h> 25#include <asm/mtrr.h>
26#include <asm/msr-index.h> 26#include <asm/msr-index.h>
27 27
28#define KVM_MAX_VCPUS 16 28#define KVM_MAX_VCPUS 64
29#define KVM_MEMORY_SLOTS 32 29#define KVM_MEMORY_SLOTS 32
30/* memory slots that are not exposed to userspace */ 30/* memory slots that are not exposed to userspace */
31#define KVM_PRIVATE_MEM_SLOTS 4 31#define KVM_PRIVATE_MEM_SLOTS 4
@@ -38,19 +38,6 @@
38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
39 0xFFFFFF0000000000ULL) 39 0xFFFFFF0000000000ULL)
40 40
41#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
42 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
43#define KVM_GUEST_CR0_MASK \
44 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
46 (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
47#define KVM_VM_CR0_ALWAYS_ON \
48 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
49#define KVM_GUEST_CR4_MASK \
50 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
51#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
52#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
53
54#define INVALID_PAGE (~(hpa_t)0) 41#define INVALID_PAGE (~(hpa_t)0)
55#define UNMAPPED_GVA (~(gpa_t)0) 42#define UNMAPPED_GVA (~(gpa_t)0)
56 43
@@ -256,7 +243,8 @@ struct kvm_mmu {
256 void (*new_cr3)(struct kvm_vcpu *vcpu); 243 void (*new_cr3)(struct kvm_vcpu *vcpu);
257 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 244 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
258 void (*free)(struct kvm_vcpu *vcpu); 245 void (*free)(struct kvm_vcpu *vcpu);
259 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 246 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
247 u32 *error);
260 void (*prefetch_page)(struct kvm_vcpu *vcpu, 248 void (*prefetch_page)(struct kvm_vcpu *vcpu,
261 struct kvm_mmu_page *page); 249 struct kvm_mmu_page *page);
262 int (*sync_page)(struct kvm_vcpu *vcpu, 250 int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -282,13 +270,15 @@ struct kvm_vcpu_arch {
282 u32 regs_dirty; 270 u32 regs_dirty;
283 271
284 unsigned long cr0; 272 unsigned long cr0;
273 unsigned long cr0_guest_owned_bits;
285 unsigned long cr2; 274 unsigned long cr2;
286 unsigned long cr3; 275 unsigned long cr3;
287 unsigned long cr4; 276 unsigned long cr4;
277 unsigned long cr4_guest_owned_bits;
288 unsigned long cr8; 278 unsigned long cr8;
289 u32 hflags; 279 u32 hflags;
290 u64 pdptrs[4]; /* pae */ 280 u64 pdptrs[4]; /* pae */
291 u64 shadow_efer; 281 u64 efer;
292 u64 apic_base; 282 u64 apic_base;
293 struct kvm_lapic *apic; /* kernel irqchip context */ 283 struct kvm_lapic *apic; /* kernel irqchip context */
294 int32_t apic_arb_prio; 284 int32_t apic_arb_prio;
@@ -374,17 +364,27 @@ struct kvm_vcpu_arch {
374 /* used for guest single stepping over the given code position */ 364 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs; 365 u16 singlestep_cs;
376 unsigned long singlestep_rip; 366 unsigned long singlestep_rip;
367 /* fields used by HYPER-V emulation */
368 u64 hv_vapic;
377}; 369};
378 370
379struct kvm_mem_alias { 371struct kvm_mem_alias {
380 gfn_t base_gfn; 372 gfn_t base_gfn;
381 unsigned long npages; 373 unsigned long npages;
382 gfn_t target_gfn; 374 gfn_t target_gfn;
375#define KVM_ALIAS_INVALID 1UL
376 unsigned long flags;
383}; 377};
384 378
385struct kvm_arch{ 379#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
386 int naliases; 380
381struct kvm_mem_aliases {
387 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 382 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
383 int naliases;
384};
385
386struct kvm_arch {
387 struct kvm_mem_aliases *aliases;
388 388
389 unsigned int n_free_mmu_pages; 389 unsigned int n_free_mmu_pages;
390 unsigned int n_requested_mmu_pages; 390 unsigned int n_requested_mmu_pages;
@@ -416,6 +416,10 @@ struct kvm_arch{
416 s64 kvmclock_offset; 416 s64 kvmclock_offset;
417 417
418 struct kvm_xen_hvm_config xen_hvm_config; 418 struct kvm_xen_hvm_config xen_hvm_config;
419
420 /* fields used by HYPER-V emulation */
421 u64 hv_guest_os_id;
422 u64 hv_hypercall;
419}; 423};
420 424
421struct kvm_vm_stat { 425struct kvm_vm_stat {
@@ -471,6 +475,7 @@ struct kvm_x86_ops {
471 int (*hardware_setup)(void); /* __init */ 475 int (*hardware_setup)(void); /* __init */
472 void (*hardware_unsetup)(void); /* __exit */ 476 void (*hardware_unsetup)(void); /* __exit */
473 bool (*cpu_has_accelerated_tpr)(void); 477 bool (*cpu_has_accelerated_tpr)(void);
478 void (*cpuid_update)(struct kvm_vcpu *vcpu);
474 479
475 /* Create, but do not attach this VCPU */ 480 /* Create, but do not attach this VCPU */
476 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 481 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
@@ -492,6 +497,7 @@ struct kvm_x86_ops {
492 void (*set_segment)(struct kvm_vcpu *vcpu, 497 void (*set_segment)(struct kvm_vcpu *vcpu,
493 struct kvm_segment *var, int seg); 498 struct kvm_segment *var, int seg);
494 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 499 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
500 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
495 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 501 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
496 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 502 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
497 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 503 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -501,12 +507,13 @@ struct kvm_x86_ops {
501 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 507 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
502 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 508 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
503 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 509 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
504 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); 510 int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
505 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, 511 int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
506 int *exception);
507 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 512 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
508 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 513 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
509 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 514 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
515 void (*fpu_activate)(struct kvm_vcpu *vcpu);
516 void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
510 517
511 void (*tlb_flush)(struct kvm_vcpu *vcpu); 518 void (*tlb_flush)(struct kvm_vcpu *vcpu);
512 519
@@ -531,7 +538,8 @@ struct kvm_x86_ops {
531 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 538 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
532 int (*get_tdp_level)(void); 539 int (*get_tdp_level)(void);
533 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 540 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
534 bool (*gb_page_enable)(void); 541 int (*get_lpage_level)(void);
542 bool (*rdtscp_supported)(void);
535 543
536 const struct trace_print_flags *exit_reasons_str; 544 const struct trace_print_flags *exit_reasons_str;
537}; 545};
@@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
606 unsigned long value); 614 unsigned long value);
607 615
608void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 616void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
609int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 617int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
610 int type_bits, int seg);
611 618
612int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); 619int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
613 620
@@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
653int kvm_mmu_load(struct kvm_vcpu *vcpu); 660int kvm_mmu_load(struct kvm_vcpu *vcpu);
654void kvm_mmu_unload(struct kvm_vcpu *vcpu); 661void kvm_mmu_unload(struct kvm_vcpu *vcpu);
655void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 662void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
663gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
664gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
665gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
666gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
656 667
657int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 668int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
658 669
@@ -666,6 +677,7 @@ void kvm_disable_tdp(void);
666 677
667int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 678int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
668int complete_pio(struct kvm_vcpu *vcpu); 679int complete_pio(struct kvm_vcpu *vcpu);
680bool kvm_check_iopl(struct kvm_vcpu *vcpu);
669 681
670struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); 682struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
671 683
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076a47f4..ffae1420e7d7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_KVM_PARA_H 2#define _ASM_X86_KVM_PARA_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/hyperv.h>
5 6
6/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It 7/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
7 * should be used to determine that a VM is running under KVM. 8 * should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index ba0eed8aa1a6..b60f2924c413 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -28,22 +28,39 @@
28 28
29#ifndef __ASSEMBLY__ 29#ifndef __ASSEMBLY__
30#include <asm/hw_irq.h> 30#include <asm/hw_irq.h>
31#include <asm/kvm_para.h>
32 31
33/*G:030 32/*G:030
34 * But first, how does our Guest contact the Host to ask for privileged 33 * But first, how does our Guest contact the Host to ask for privileged
35 * operations? There are two ways: the direct way is to make a "hypercall", 34 * operations? There are two ways: the direct way is to make a "hypercall",
36 * to make requests of the Host Itself. 35 * to make requests of the Host Itself.
37 * 36 *
38 * We use the KVM hypercall mechanism, though completely different hypercall 37 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
39 * numbers. Seventeen hypercalls are available: the hypercall number is put in 38 * above are used by real hardware interrupts). Seventeen hypercalls are
40 * the %eax register, and the arguments (when required) are placed in %ebx, 39 * available: the hypercall number is put in the %eax register, and the
41 * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. 40 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
41 * If a return value makes sense, it's returned in %eax.
42 * 42 *
43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
44 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
45 * definition of a gentleman: "someone who is only rude intentionally". 45 * definition of a gentleman: "someone who is only rude intentionally".
46:*/ 46 */
47static inline unsigned long
48hcall(unsigned long call,
49 unsigned long arg1, unsigned long arg2, unsigned long arg3,
50 unsigned long arg4)
51{
52 /* "int" is the Intel instruction to trigger a trap. */
53 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
54 /* The call in %eax (aka "a") might be overwritten */
55 : "=a"(call)
56 /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
57 : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
58 /* "memory" means this might write somewhere in memory.
59 * This isn't true for all calls, but it's safe to tell
60 * gcc that it might happen so it doesn't get clever. */
61 : "memory");
62 return call;
63}
47 64
48/* Can't use our min() macro here: needs to be a constant */ 65/* Can't use our min() macro here: needs to be a constant */
49#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 66#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
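The hcall() helper added above is the guest side of the trap-based ABI described in the comment: the call number goes in %eax, up to four arguments in %ebx/%ecx/%edx/%esi, and the result comes back in %eax. A minimal sketch of a caller; the LHCALL_NOTIFY number and its argument meaning are taken as assumptions from the LHCALL_* list earlier in this header:

/* Sketch only: ask the Host to look at an address we hand it.
 * LHCALL_NOTIFY is assumed to be one of the LHCALL_* numbers defined
 * near the top of this header; unused arguments are simply zero. */
static inline void example_notify_host(unsigned long addr)
{
	hcall(LHCALL_NOTIFY, addr, 0, 0, 0);	/* return value ignored here */
}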
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 47b9b6f19057..2e9972468a5d 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l)
195#define __local_add(i, l) local_add((i), (l)) 195#define __local_add(i, l) local_add((i), (l))
196#define __local_sub(i, l) local_sub((i), (l)) 196#define __local_sub(i, l) local_sub((i), (l))
197 197
198/* Use these for per-cpu local_t variables: on some archs they are
199 * much more efficient than these naive implementations. Note they take
200 * a variable, not an address.
201 *
202 * X86_64: This could be done better if we moved the per cpu data directly
203 * after GS.
204 */
205
206/* Need to disable preemption for the cpu local counters otherwise we could
207 still access a variable of a previous CPU in a non atomic way. */
208#define cpu_local_wrap_v(l) \
209({ \
210 local_t res__; \
211 preempt_disable(); \
212 res__ = (l); \
213 preempt_enable(); \
214 res__; \
215})
216#define cpu_local_wrap(l) \
217({ \
218 preempt_disable(); \
219 (l); \
220 preempt_enable(); \
221}) \
222
223#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
224#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
225#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
226#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
227#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
228#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
229
230#define __cpu_local_inc(l) cpu_local_inc((l))
231#define __cpu_local_dec(l) cpu_local_dec((l))
232#define __cpu_local_add(i, l) cpu_local_add((i), (l))
233#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
234
235#endif /* _ASM_X86_LOCAL_H */ 198#endif /* _ASM_X86_LOCAL_H */
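The cpu_local_*() wrappers deleted here had to bracket every access with preempt_disable()/preempt_enable(), because __get_cpu_var() first computes this CPU's address and a preemption in between could leave the operation touching another CPU's copy. Their role is taken over by the this_cpu_*()/percpu_*() operations (see percpu.h further down), which are single segment-relative instructions and need no preemption games. A small sketch of the replacement pattern, using a hypothetical counter:

/* Hypothetical per-cpu counter (needs <linux/percpu.h>). */
DEFINE_PER_CPU(long, demo_events);

static inline void demo_count_event(void)
{
	/* One %gs-relative inc; safe without preempt_disable(). */
	this_cpu_inc(demo_events);
}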
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 858baa061cfc..6c3fdd631ed3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -108,10 +108,11 @@ struct mce_log {
108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) 108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) 109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
110 110
111extern struct atomic_notifier_head x86_mce_decoder_chain;
112 111
113#ifdef __KERNEL__ 112#ifdef __KERNEL__
114 113
114extern struct atomic_notifier_head x86_mce_decoder_chain;
115
115#include <linux/percpu.h> 116#include <linux/percpu.h>
116#include <linux/init.h> 117#include <linux/init.h>
117#include <asm/atomic.h> 118#include <asm/atomic.h>
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c24ca9a56458..ef51b501e22a 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -12,8 +12,6 @@ struct device;
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; 12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13 13
14struct microcode_ops { 14struct microcode_ops {
15 void (*init)(struct device *device);
16 void (*fini)(void);
17 enum ucode_state (*request_microcode_user) (int cpu, 15 enum ucode_state (*request_microcode_user) (int cpu,
18 const void __user *buf, size_t size); 16 const void __user *buf, size_t size);
19 17
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ 40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages) 41 NODE_DATA(nid)->node_spanned_pages)
42
43#ifdef CONFIG_NUMA_EMU
44#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
45#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
46#endif
47
48#endif 42#endif
49#endif /* _ASM_X86_MMZONE_64_H */ 43#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
new file mode 100644
index 000000000000..451d30e7f62d
--- /dev/null
+++ b/arch/x86/include/asm/mrst.h
@@ -0,0 +1,19 @@
1/*
2 * mrst.h: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2009 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11#ifndef _ASM_X86_MRST_H
12#define _ASM_X86_MRST_H
13extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table);
15
16#define SFI_MTMR_MAX_NUM 8
17#define SFI_MRTC_MAX 8
18
19#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1cd58cdbc03f..4604e6a54d36 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -105,6 +105,8 @@
105#define MSR_AMD64_PATCH_LEVEL 0x0000008b 105#define MSR_AMD64_PATCH_LEVEL 0x0000008b
106#define MSR_AMD64_NB_CFG 0xc001001f 106#define MSR_AMD64_NB_CFG 0xc001001f
107#define MSR_AMD64_PATCH_LOADER 0xc0010020 107#define MSR_AMD64_PATCH_LOADER 0xc0010020
108#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
109#define MSR_AMD64_OSVW_STATUS 0xc0010141
108#define MSR_AMD64_IBSFETCHCTL 0xc0011030 110#define MSR_AMD64_IBSFETCHCTL 0xc0011030
109#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 111#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
110#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 112#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 139d4c1a33a7..93da9c3f3341 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void); 19extern int check_nmi_watchdog(void);
20extern int nmi_watchdog_enabled; 20extern int nmi_watchdog_enabled;
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int avail_to_resrv_perfctr_nmi(unsigned int);
23extern int reserve_perfctr_nmi(unsigned int); 22extern int reserve_perfctr_nmi(unsigned int);
24extern void release_perfctr_nmi(unsigned int); 23extern void release_perfctr_nmi(unsigned int);
25extern int reserve_evntsel_nmi(unsigned int); 24extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
36extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 37extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu); 38extern void __cpuinit numa_remove_cpu(int cpu);
39
40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43#endif /* CONFIG_NUMA_EMU */
39#else 44#else
40static inline void init_cpu_to_node(void) { } 45static inline void init_cpu_to_node(void) { }
41static inline void numa_set_node(int cpu, int node) { } 46static inline void numa_set_node(int cpu, int node) { }
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 9f0a5f5d29ec..37c516545ec8 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -30,9 +30,14 @@
30 30
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int get_memcfg_numaq(void);
33extern int pci_numaq_init(void);
33 34
34extern void *xquad_portio; 35extern void *xquad_portio;
35 36
37#define XQUAD_PORTIO_BASE 0xfe400000
38#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
39#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
40
36/* 41/*
37 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the 42 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
38 */ 43 */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 3a57385d9fa7..101229b0d8ed 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -13,7 +13,6 @@ struct olpc_platform_t {
13 13
14#define OLPC_F_PRESENT 0x01 14#define OLPC_F_PRESENT 0x01
15#define OLPC_F_DCON 0x02 15#define OLPC_F_DCON 0x02
16#define OLPC_F_VSA 0x04
17 16
18#ifdef CONFIG_OLPC 17#ifdef CONFIG_OLPC
19 18
@@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void)
51} 50}
52 51
53/* 52/*
54 * The VSA is software from AMD that typical Geode bioses will include.
55 * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
56 * not include the VSA; instead, PCI is emulated by the kernel.
57 *
58 * The VSA is described further in arch/x86/pci/olpc.c.
59 */
60static inline int olpc_has_vsa(void)
61{
62 return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
63}
64
65/*
66 * The "Mass Production" version of OLPC's XO is identified as being model 53 * The "Mass Production" version of OLPC's XO is identified as being model
67 * C2. During the prototype phase, the following models (in chronological 54 * C2. During the prototype phase, the following models (in chronological
68 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models 55 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
@@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void)
87 return 0; 74 return 0;
88} 75}
89 76
90static inline int olpc_has_vsa(void)
91{
92 return 0;
93}
94
95#endif 77#endif
96 78
79extern int pci_olpc_init(void);
80
97/* EC related functions */ 81/* EC related functions */
98 82
99extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, 83extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 642fe34b36a2..a667f24c7254 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,7 +40,6 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43extern int page_is_ram(unsigned long pagenr);
44extern int devmem_is_allowed(unsigned long pagenr); 43extern int devmem_is_allowed(unsigned long pagenr);
45 44
46extern unsigned long max_low_pfn_mapped; 45extern unsigned long max_low_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dd59a85a918f..5653f43d90e5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn)
435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); 435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
436} 436}
437 437
438#ifdef CONFIG_HIGHPTE
439static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
440{
441 unsigned long ret;
442 ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
443 return (void *)ret;
444}
445#endif
446
447static inline void pte_update(struct mm_struct *mm, unsigned long addr, 438static inline void pte_update(struct mm_struct *mm, unsigned long addr,
448 pte_t *ptep) 439 pte_t *ptep)
449{ 440{
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b1e70d51e40c..db9ef5532341 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -304,10 +304,6 @@ struct pv_mmu_ops {
304#endif /* PAGETABLE_LEVELS == 4 */ 304#endif /* PAGETABLE_LEVELS == 4 */
305#endif /* PAGETABLE_LEVELS >= 3 */ 305#endif /* PAGETABLE_LEVELS >= 3 */
306 306
307#ifdef CONFIG_HIGHPTE
308 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
309#endif
310
311 struct pv_lazy_ops lazy_mode; 307 struct pv_lazy_ops lazy_mode;
312 308
313 /* dom0 ops */ 309 /* dom0 ops */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c201d513..404a880ea325 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus)
45 45
46#ifdef CONFIG_PCI 46#ifdef CONFIG_PCI
47extern unsigned int pcibios_assign_all_busses(void); 47extern unsigned int pcibios_assign_all_busses(void);
48extern int pci_legacy_init(void);
49# ifdef CONFIG_ACPI
50# define x86_default_pci_init pci_acpi_init
51# else
52# define x86_default_pci_init pci_legacy_init
53# endif
48#else 54#else
49#define pcibios_assign_all_busses() 0 55# define pcibios_assign_all_busses() 0
56# define x86_default_pci_init NULL
50#endif 57#endif
51 58
52extern unsigned long pci_mem_start; 59extern unsigned long pci_mem_start;
@@ -90,40 +97,14 @@ extern void pci_iommu_alloc(void);
90 97
91#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 98#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
92 99
93#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
94
95#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
96 dma_addr_t ADDR_NAME;
97#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
98 __u32 LEN_NAME;
99#define pci_unmap_addr(PTR, ADDR_NAME) \
100 ((PTR)->ADDR_NAME)
101#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
102 (((PTR)->ADDR_NAME) = (VAL))
103#define pci_unmap_len(PTR, LEN_NAME) \
104 ((PTR)->LEN_NAME)
105#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
106 (((PTR)->LEN_NAME) = (VAL))
107
108#else
109
110#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
111#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
112#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
113#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
114 do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
115#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
116#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
117 do { break; } while (pci_unmap_len(PTR, LEN_NAME))
118
119#endif
120
121#endif /* __KERNEL__ */ 100#endif /* __KERNEL__ */
122 101
123#ifdef CONFIG_X86_64 102#ifdef CONFIG_X86_64
124#include "pci_64.h" 103#include "pci_64.h"
125#endif 104#endif
126 105
106void dma32_reserve_bootmem(void);
107
127/* implement the pci_ DMA API in terms of the generic device dma_ one */ 108/* implement the pci_ DMA API in terms of the generic device dma_ one */
128#include <asm-generic/pci-dma-compat.h> 109#include <asm-generic/pci-dma-compat.h>
129 110
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index ae5e40f67daf..fe15cfb21b9b 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
22extern int (*pci_config_write)(int seg, int bus, int dev, int fn, 22extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void);
26
27#endif /* __KERNEL__ */ 25#endif /* __KERNEL__ */
28 26
29#endif /* _ASM_X86_PCI_64_H */ 27#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b4bf9a942ed0..1a0422348d6d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -29,6 +29,7 @@
29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000 31#define PCI_NOASSIGN_ROMS 0x80000
32#define PCI_ROOT_NO_CRS 0x100000
32 33
33extern unsigned int pci_probe; 34extern unsigned int pci_probe;
34extern unsigned long pirq_table_addr; 35extern unsigned long pirq_table_addr;
@@ -82,7 +83,6 @@ struct irq_routing_table {
82 83
83extern unsigned int pcibios_irq_mask; 84extern unsigned int pcibios_irq_mask;
84 85
85extern int pcibios_scanned;
86extern spinlock_t pci_config_lock; 86extern spinlock_t pci_config_lock;
87 87
88extern int (*pcibios_enable_irq)(struct pci_dev *dev); 88extern int (*pcibios_enable_irq)(struct pci_dev *dev);
@@ -105,16 +105,15 @@ extern bool port_cf9_safe;
105extern int pci_direct_probe(void); 105extern int pci_direct_probe(void);
106extern void pci_direct_init(int type); 106extern void pci_direct_init(int type);
107extern void pci_pcbios_init(void); 107extern void pci_pcbios_init(void);
108extern int pci_olpc_init(void);
109extern void __init dmi_check_pciprobe(void); 108extern void __init dmi_check_pciprobe(void);
110extern void __init dmi_check_skip_isa_align(void); 109extern void __init dmi_check_skip_isa_align(void);
111 110
112/* some common used subsys_initcalls */ 111/* some common used subsys_initcalls */
113extern int __init pci_acpi_init(void); 112extern int __init pci_acpi_init(void);
114extern int __init pcibios_irq_init(void); 113extern void __init pcibios_irq_init(void);
115extern int __init pci_visws_init(void);
116extern int __init pci_numaq_init(void);
117extern int __init pcibios_init(void); 114extern int __init pcibios_init(void);
115extern int pci_legacy_init(void);
116extern void pcibios_fixup_irqs(void);
118 117
119/* pci-mmconfig.c */ 118/* pci-mmconfig.c */
120 119
@@ -182,3 +181,17 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
182{ 181{
183 asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); 182 asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
184} 183}
184
185#ifdef CONFIG_PCI
186# ifdef CONFIG_ACPI
187# define x86_default_pci_init pci_acpi_init
188# else
189# define x86_default_pci_init pci_legacy_init
190# endif
191# define x86_default_pci_init_irq pcibios_irq_init
192# define x86_default_pci_fixup_irqs pcibios_fixup_irqs
193#else
194# define x86_default_pci_init NULL
195# define x86_default_pci_init_irq NULL
196# define x86_default_pci_fixup_irqs NULL
197#endif
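The block appended above picks compile-time defaults for the three PCI hooks: pci_acpi_init() when ACPI is built in, the legacy bus scan otherwise, and NULL stubs when PCI is disabled entirely. The point is to seed a platform init table that subarchitectures visible elsewhere in this patch (NUMAQ, OLPC, Moorestown) can override instead of relying on link order. A sketch of such a table; the structure layout itself is an assumption here, only the default macros come from this header:

/* Assumed shape of the platform PCI hooks seeded by the macros above;
 * a platform would override individual members, e.g. pci_mrst_init(). */
struct example_pci_init_ops {
	int  (*init)(void);		/* bus enumeration */
	void (*init_irq)(void);		/* IRQ routing setup */
	void (*fixup_irqs)(void);	/* post-scan IRQ fixups */
};

static struct example_pci_init_ops example_pci_ops = {
	.init		= x86_default_pci_init,
	.init_irq	= x86_default_pci_init_irq,
	.fixup_irqs	= x86_default_pci_fixup_irqs,
};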
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0c44196b78ac..0ec6d12d84e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -25,19 +25,18 @@
25 */ 25 */
26#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
27#define PER_CPU(var, reg) \ 27#define PER_CPU(var, reg) \
28 __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ 28 __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \
29 lea per_cpu__##var(reg), reg 29 lea var(reg), reg
30#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var 30#define PER_CPU_VAR(var) %__percpu_seg:var
31#else /* ! SMP */ 31#else /* ! SMP */
32#define PER_CPU(var, reg) \ 32#define PER_CPU(var, reg) __percpu_mov_op $var, reg
33 __percpu_mov_op $per_cpu__##var, reg 33#define PER_CPU_VAR(var) var
34#define PER_CPU_VAR(var) per_cpu__##var
35#endif /* SMP */ 34#endif /* SMP */
36 35
37#ifdef CONFIG_X86_64_SMP 36#ifdef CONFIG_X86_64_SMP
38#define INIT_PER_CPU_VAR(var) init_per_cpu__##var 37#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
39#else 38#else
40#define INIT_PER_CPU_VAR(var) per_cpu__##var 39#define INIT_PER_CPU_VAR(var) var
41#endif 40#endif
42 41
43#else /* ...!ASSEMBLY */ 42#else /* ...!ASSEMBLY */
@@ -60,12 +59,12 @@
60 * There also must be an entry in vmlinux_64.lds.S 59 * There also must be an entry in vmlinux_64.lds.S
61 */ 60 */
62#define DECLARE_INIT_PER_CPU(var) \ 61#define DECLARE_INIT_PER_CPU(var) \
63 extern typeof(per_cpu_var(var)) init_per_cpu_var(var) 62 extern typeof(var) init_per_cpu_var(var)
64 63
65#ifdef CONFIG_X86_64_SMP 64#ifdef CONFIG_X86_64_SMP
66#define init_per_cpu_var(var) init_per_cpu__##var 65#define init_per_cpu_var(var) init_per_cpu__##var
67#else 66#else
68#define init_per_cpu_var(var) per_cpu_var(var) 67#define init_per_cpu_var(var) var
69#endif 68#endif
70 69
71/* For arch-specific code, we can use direct single-insn ops (they 70/* For arch-specific code, we can use direct single-insn ops (they
@@ -104,6 +103,64 @@ do { \
104 } \ 103 } \
105} while (0) 104} while (0)
106 105
106/*
107 * Generate a percpu add to memory instruction and optimize code
108 * if a one is added or subtracted.
109 */
110#define percpu_add_op(var, val) \
111do { \
112 typedef typeof(var) pao_T__; \
113 const int pao_ID__ = (__builtin_constant_p(val) && \
114 ((val) == 1 || (val) == -1)) ? (val) : 0; \
115 if (0) { \
116 pao_T__ pao_tmp__; \
117 pao_tmp__ = (val); \
118 } \
119 switch (sizeof(var)) { \
120 case 1: \
121 if (pao_ID__ == 1) \
122 asm("incb "__percpu_arg(0) : "+m" (var)); \
123 else if (pao_ID__ == -1) \
124 asm("decb "__percpu_arg(0) : "+m" (var)); \
125 else \
126 asm("addb %1, "__percpu_arg(0) \
127 : "+m" (var) \
128 : "qi" ((pao_T__)(val))); \
129 break; \
130 case 2: \
131 if (pao_ID__ == 1) \
132 asm("incw "__percpu_arg(0) : "+m" (var)); \
133 else if (pao_ID__ == -1) \
134 asm("decw "__percpu_arg(0) : "+m" (var)); \
135 else \
136 asm("addw %1, "__percpu_arg(0) \
137 : "+m" (var) \
138 : "ri" ((pao_T__)(val))); \
139 break; \
140 case 4: \
141 if (pao_ID__ == 1) \
142 asm("incl "__percpu_arg(0) : "+m" (var)); \
143 else if (pao_ID__ == -1) \
144 asm("decl "__percpu_arg(0) : "+m" (var)); \
145 else \
146 asm("addl %1, "__percpu_arg(0) \
147 : "+m" (var) \
148 : "ri" ((pao_T__)(val))); \
149 break; \
150 case 8: \
151 if (pao_ID__ == 1) \
152 asm("incq "__percpu_arg(0) : "+m" (var)); \
153 else if (pao_ID__ == -1) \
154 asm("decq "__percpu_arg(0) : "+m" (var)); \
155 else \
156 asm("addq %1, "__percpu_arg(0) \
157 : "+m" (var) \
158 : "re" ((pao_T__)(val))); \
159 break; \
160 default: __bad_percpu_size(); \
161 } \
162} while (0)
163
107#define percpu_from_op(op, var, constraint) \ 164#define percpu_from_op(op, var, constraint) \
108({ \ 165({ \
109 typeof(var) pfo_ret__; \ 166 typeof(var) pfo_ret__; \
@@ -133,6 +190,29 @@ do { \
133 pfo_ret__; \ 190 pfo_ret__; \
134}) 191})
135 192
193#define percpu_unary_op(op, var) \
194({ \
195 switch (sizeof(var)) { \
196 case 1: \
197 asm(op "b "__percpu_arg(0) \
198 : "+m" (var)); \
199 break; \
200 case 2: \
201 asm(op "w "__percpu_arg(0) \
202 : "+m" (var)); \
203 break; \
204 case 4: \
205 asm(op "l "__percpu_arg(0) \
206 : "+m" (var)); \
207 break; \
208 case 8: \
209 asm(op "q "__percpu_arg(0) \
210 : "+m" (var)); \
211 break; \
212 default: __bad_percpu_size(); \
213 } \
214})
215
136/* 216/*
137 * percpu_read() makes gcc load the percpu variable every time it is 217 * percpu_read() makes gcc load the percpu variable every time it is
138 * accessed while percpu_read_stable() allows the value to be cached. 218 * accessed while percpu_read_stable() allows the value to be cached.
@@ -142,16 +222,15 @@ do { \
142 * per-thread variables implemented as per-cpu variables and thus 222 * per-thread variables implemented as per-cpu variables and thus
143 * stable for the duration of the respective task. 223 * stable for the duration of the respective task.
144 */ 224 */
145#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ 225#define percpu_read(var) percpu_from_op("mov", var, "m" (var))
146 "m" (per_cpu__##var)) 226#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var)))
147#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ 227#define percpu_write(var, val) percpu_to_op("mov", var, val)
148 "p" (&per_cpu__##var)) 228#define percpu_add(var, val) percpu_add_op(var, val)
149#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) 229#define percpu_sub(var, val) percpu_add_op(var, -(val))
150#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) 230#define percpu_and(var, val) percpu_to_op("and", var, val)
151#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) 231#define percpu_or(var, val) percpu_to_op("or", var, val)
152#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) 232#define percpu_xor(var, val) percpu_to_op("xor", var, val)
153#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) 233#define percpu_inc(var) percpu_unary_op("inc", var)
154#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
155 234
156#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 235#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
157#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 236#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -160,9 +239,9 @@ do { \
160#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 239#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
161#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 240#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
162#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) 241#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
163#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 242#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
164#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 243#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
165#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 244#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
166#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 245#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
167#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 246#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
168#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 247#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -179,9 +258,9 @@ do { \
179#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 258#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
180#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 259#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
181#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) 260#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
182#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 261#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
183#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 262#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
184#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 263#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
185#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 264#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
186#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 265#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
187#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 266#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -192,9 +271,9 @@ do { \
192#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 271#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
193#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 272#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
194 273
195#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 274#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
196#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 275#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
197#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 276#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
198#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 277#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
199#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 278#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
200#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 279#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -212,19 +291,19 @@ do { \
212#ifdef CONFIG_X86_64 291#ifdef CONFIG_X86_64
213#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 292#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
214#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 293#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
215#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 294#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
216#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 295#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
217#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 296#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
218#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 297#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
219 298
220#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 299#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
221#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 300#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
222#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 301#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
223#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 302#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
224#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 303#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
225#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 304#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
226 305
227#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 306#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
228#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 307#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
229#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 308#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
230#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 309#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
@@ -236,7 +315,7 @@ do { \
236({ \ 315({ \
237 int old__; \ 316 int old__; \
238 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ 317 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
239 : "=r" (old__), "+m" (per_cpu__##var) \ 318 : "=r" (old__), "+m" (var) \
240 : "dIr" (bit)); \ 319 : "dIr" (bit)); \
241 old__; \ 320 old__; \
242}) 321})
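percpu_add_op() added above inspects its value argument at compile time: a literal +1 or -1 becomes an inc/dec of the right operand width, anything else becomes an add, and the dead "if (0)" assignment exists only so the compiler type-checks val against the variable. The constant-detection idiom on its own, outside the kernel macro:

/* Stand-alone illustration of the selector used by percpu_add_op()
 * (not kernel code): 1 selects inc, -1 selects dec, 0 the generic add. */
#define pick_add_form(val)						\
	((__builtin_constant_p(val) &&					\
	  ((val) == 1 || (val) == -1)) ? (int)(val) : 0)

/* pick_add_form(1) == 1, pick_add_form(-1) == -1,
 * pick_add_form(7) == 0, and pick_add_form(n) == 0 for a runtime n. */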
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8d9f8548a870..db6109a885a7 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -18,7 +18,8 @@
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20 20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) 21#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) 23#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) 24#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) 25#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
@@ -26,7 +27,14 @@
26/* 27/*
27 * Includes eventsel and unit mask as well: 28 * Includes eventsel and unit mask as well:
28 */ 29 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff 30
31
32#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
33#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
34#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
35#define INTEL_ARCH_INV_MASK 0x00800000ULL
36#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
37#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
30 38
31/* 39/*
32 * filter mask to validate fixed counter events. 40 * filter mask to validate fixed counter events.
@@ -37,7 +45,12 @@
37 * The other filters are supported by fixed counters. 45 * The other filters are supported by fixed counters.
38 * The any-thread option is supported starting with v3. 46 * The any-thread option is supported starting with v3.
39 */ 47 */
40#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 48#define INTEL_ARCH_FIXED_MASK \
49 (INTEL_ARCH_CNT_MASK| \
50 INTEL_ARCH_INV_MASK| \
51 INTEL_ARCH_EDGE_MASK|\
52 INTEL_ARCH_UNIT_MASK|\
53 INTEL_ARCH_EVENT_MASK)
41 54
42#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
43#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 56#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -104,6 +117,18 @@ union cpuid10_edx {
104 */ 117 */
105#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 118#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
106 119
120/* IbsFetchCtl bits/masks */
121#define IBS_FETCH_RAND_EN (1ULL<<57)
122#define IBS_FETCH_VAL (1ULL<<49)
123#define IBS_FETCH_ENABLE (1ULL<<48)
124#define IBS_FETCH_CNT 0xFFFF0000ULL
125#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
126
127/* IbsOpCtl bits */
128#define IBS_OP_CNT_CTL (1ULL<<19)
129#define IBS_OP_VAL (1ULL<<18)
130#define IBS_OP_ENABLE (1ULL<<17)
131#define IBS_OP_MAX_CNT 0x0000FFFFULL
107 132
108#ifdef CONFIG_PERF_EVENTS 133#ifdef CONFIG_PERF_EVENTS
109extern void init_hw_perf_events(void); 134extern void init_hw_perf_events(void);
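The INTEL_ARCH_* masks introduced above carve the architectural event-select word into its fields: event code in bits 0-7, unit mask in bits 8-15, edge detect at bit 18, invert at bit 23, counter mask in bits 24-31; per the comment, INTEL_ARCH_FIXED_MASK is the filter applied when checking whether a raw event can use a fixed counter. A sketch of decoding a raw config with them; the function and its use are illustrative only:

/* Illustrative decode of a raw event-select value. */
static inline void example_decode_evtsel(u64 config)
{
	u64 event = config & INTEL_ARCH_EVTSEL_MASK;		/* bits 0-7   */
	u64 umask = (config & INTEL_ARCH_UNIT_MASK) >> 8;	/* bits 8-15  */
	int edge  = !!(config & INTEL_ARCH_EDGE_MASK);		/* bit 18     */
	int inv   = !!(config & INTEL_ARCH_INV_MASK);		/* bit 23     */
	u64 cmask = (config & INTEL_ARCH_CNT_MASK) >> 24;	/* bits 24-31 */

	(void)event; (void)umask; (void)edge; (void)inv; (void)cmask;
}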
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 0e8c2a0fd922..271de94c3810 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
23#endif 23#endif
24 24
25/* 25/*
26 * Flags to use when allocating a user page table page.
27 */
28extern gfp_t __userpte_alloc_gfp;
29
30/*
26 * Allocate and free page tables. 31 * Allocate and free page tables.
27 */ 32 */
28extern pgd_t *pgd_alloc(struct mm_struct *); 33extern pgd_t *pgd_alloc(struct mm_struct *);
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323..2984a25ff383 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -19,7 +19,6 @@
19#include <asm/paravirt.h> 19#include <asm/paravirt.h>
20 20
21#include <linux/bitops.h> 21#include <linux/bitops.h>
22#include <linux/slab.h>
23#include <linux/list.h> 22#include <linux/list.h>
24#include <linux/spinlock.h> 23#include <linux/spinlock.h>
25 24
@@ -54,10 +53,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
54 in_irq() ? KM_IRQ_PTE : \ 53 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0) 54 KM_PTE0)
56#define pte_offset_map(dir, address) \ 55#define pte_offset_map(dir, address) \
57 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ 56 ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
58 pte_index((address))) 57 pte_index((address)))
59#define pte_offset_map_nested(dir, address) \ 58#define pte_offset_map_nested(dir, address) \
60 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ 59 ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
61 pte_index((address))) 60 pte_index((address)))
62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) 61#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) 62#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
@@ -80,7 +79,7 @@ do { \
80 * The i386 doesn't have any external MMU info: the kernel page 79 * The i386 doesn't have any external MMU info: the kernel page
81 * tables contain all the necessary information. 80 * tables contain all the necessary information.
82 */ 81 */
83#define update_mmu_cache(vma, address, pte) do { } while (0) 82#define update_mmu_cache(vma, address, ptep) do { } while (0)
84 83
85#endif /* !__ASSEMBLY__ */ 84#endif /* !__ASSEMBLY__ */
86 85
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a30117149..181be528c612 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -129,7 +129,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
129#define pte_unmap(pte) /* NOP */ 129#define pte_unmap(pte) /* NOP */
130#define pte_unmap_nested(pte) /* NOP */ 130#define pte_unmap_nested(pte) /* NOP */
131 131
132#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 133
134/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fc801bab1b3b..b753ea59703a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -450,6 +450,8 @@ struct thread_struct {
450 struct perf_event *ptrace_bps[HBP_NUM]; 450 struct perf_event *ptrace_bps[HBP_NUM];
451 /* Debug status used for traps, single steps, etc... */ 451 /* Debug status used for traps, single steps, etc... */
452 unsigned long debugreg6; 452 unsigned long debugreg6;
453 /* Keep track of the exact dr7 value set by the user */
454 unsigned long ptrace_dr7;
453 /* Fault info: */ 455 /* Fault info: */
454 unsigned long cr2; 456 unsigned long cr2;
455 unsigned long trap_no; 457 unsigned long trap_no;
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 4009f6534f52..6f414ed88620 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -23,14 +23,4 @@ extern int reboot_force;
23 23
24long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 24long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
25 25
26/*
27 * This looks more complex than it should be. But we need to
28 * get the type for the ~ right in round_down (it needs to be
29 * as wide as the result!), and we want to evaluate the macro
30 * arguments just once each.
31 */
32#define __round_mask(x,y) ((__typeof__(x))((y)-1))
33#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
34#define round_down(x,y) ((x) & ~__round_mask(x,y))
35
36#endif /* _ASM_X86_PROTO_H */ 26#endif /* _ASM_X86_PROTO_H */
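The round_up()/round_down() helpers removed above are not x86-specific: they only require the boundary y to be a power of two, and the __round_mask() cast keeps the ~ in round_down() as wide as x, exactly as the deleted comment explains. The arithmetic, worked through for y = 16:

/*
 * __round_mask(x, 16) == 15
 * round_up(37, 16)   == ((37 - 1) | 15) + 1 == (36 | 15) + 1 == 47 + 1 == 48
 * round_down(37, 16) ==   37 & ~15 == 32
 * Aligned values are stable: round_up(48, 16) == 48, round_down(32, 16) == 32.
 */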
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 9d369f680321..69a686a7dff0 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -274,18 +274,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
274 return 0; 274 return 0;
275} 275}
276 276
277/* Get Nth argument at function call */
278extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
279 unsigned int n);
280
281/*
282 * These are defined as per linux/ptrace.h, which see.
283 */
284#define arch_has_single_step() (1) 277#define arch_has_single_step() (1)
285extern void user_enable_single_step(struct task_struct *);
286extern void user_disable_single_step(struct task_struct *);
287
288extern void user_enable_block_step(struct task_struct *);
289#ifdef CONFIG_X86_DEBUGCTLMSR 278#ifdef CONFIG_X86_DEBUGCTLMSR
290#define arch_has_block_step() (1) 279#define arch_has_block_step() (1)
291#else 280#else
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 413620024768..606ede126972 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -41,6 +41,7 @@
41#include <linux/list.h> 41#include <linux/list.h>
42#include <linux/spinlock.h> 42#include <linux/spinlock.h>
43#include <linux/lockdep.h> 43#include <linux/lockdep.h>
44#include <asm/asm.h>
44 45
45struct rwsem_waiter; 46struct rwsem_waiter;
46 47
@@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore *
55 56
56/* 57/*
57 * the semaphore definition 58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits.
58 */ 63 */
59 64
60#define RWSEM_UNLOCKED_VALUE 0x00000000 65#ifdef CONFIG_X86_64
61#define RWSEM_ACTIVE_BIAS 0x00000001 66# define RWSEM_ACTIVE_MASK 0xffffffffL
62#define RWSEM_ACTIVE_MASK 0x0000ffff 67#else
63#define RWSEM_WAITING_BIAS (-0x00010000) 68# define RWSEM_ACTIVE_MASK 0x0000ffffL
69#endif
70
71#define RWSEM_UNLOCKED_VALUE 0x00000000L
72#define RWSEM_ACTIVE_BIAS 0x00000001L
73#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
64#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
65#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
66 76
77typedef signed long rwsem_count_t;
78
67struct rw_semaphore { 79struct rw_semaphore {
68 signed long count; 80 rwsem_count_t count;
69 spinlock_t wait_lock; 81 spinlock_t wait_lock;
70 struct list_head wait_list; 82 struct list_head wait_list;
71#ifdef CONFIG_DEBUG_LOCK_ALLOC 83#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -105,7 +117,7 @@ do { \
105static inline void __down_read(struct rw_semaphore *sem) 117static inline void __down_read(struct rw_semaphore *sem)
106{ 118{
107 asm volatile("# beginning down_read\n\t" 119 asm volatile("# beginning down_read\n\t"
108 LOCK_PREFIX " inc%z0 (%1)\n\t" 120 LOCK_PREFIX _ASM_INC "(%1)\n\t"
109 /* adds 0x00000001, returns the old value */ 121 /* adds 0x00000001, returns the old value */
110 " jns 1f\n" 122 " jns 1f\n"
111 " call call_rwsem_down_read_failed\n" 123 " call call_rwsem_down_read_failed\n"
@@ -121,7 +133,7 @@ static inline void __down_read(struct rw_semaphore *sem)
121 */ 133 */
122static inline int __down_read_trylock(struct rw_semaphore *sem) 134static inline int __down_read_trylock(struct rw_semaphore *sem)
123{ 135{
124 __s32 result, tmp; 136 rwsem_count_t result, tmp;
125 asm volatile("# beginning __down_read_trylock\n\t" 137 asm volatile("# beginning __down_read_trylock\n\t"
126 " mov %0,%1\n\t" 138 " mov %0,%1\n\t"
127 "1:\n\t" 139 "1:\n\t"
@@ -143,7 +155,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
143 */ 155 */
144static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
145{ 157{
146 int tmp; 158 rwsem_count_t tmp;
147 159
148 tmp = RWSEM_ACTIVE_WRITE_BIAS; 160 tmp = RWSEM_ACTIVE_WRITE_BIAS;
149 asm volatile("# beginning down_write\n\t" 161 asm volatile("# beginning down_write\n\t"
@@ -170,9 +182,9 @@ static inline void __down_write(struct rw_semaphore *sem)
170 */ 182 */
171static inline int __down_write_trylock(struct rw_semaphore *sem) 183static inline int __down_write_trylock(struct rw_semaphore *sem)
172{ 184{
173 signed long ret = cmpxchg(&sem->count, 185 rwsem_count_t ret = cmpxchg(&sem->count,
174 RWSEM_UNLOCKED_VALUE, 186 RWSEM_UNLOCKED_VALUE,
175 RWSEM_ACTIVE_WRITE_BIAS); 187 RWSEM_ACTIVE_WRITE_BIAS);
176 if (ret == RWSEM_UNLOCKED_VALUE) 188 if (ret == RWSEM_UNLOCKED_VALUE)
177 return 1; 189 return 1;
178 return 0; 190 return 0;
@@ -183,7 +195,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
183 */ 195 */
184static inline void __up_read(struct rw_semaphore *sem) 196static inline void __up_read(struct rw_semaphore *sem)
185{ 197{
186 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; 198 rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS;
187 asm volatile("# beginning __up_read\n\t" 199 asm volatile("# beginning __up_read\n\t"
188 LOCK_PREFIX " xadd %1,(%2)\n\t" 200 LOCK_PREFIX " xadd %1,(%2)\n\t"
189 /* subtracts 1, returns the old value */ 201 /* subtracts 1, returns the old value */
@@ -201,7 +213,7 @@ static inline void __up_read(struct rw_semaphore *sem)
201 */ 213 */
202static inline void __up_write(struct rw_semaphore *sem) 214static inline void __up_write(struct rw_semaphore *sem)
203{ 215{
204 unsigned long tmp; 216 rwsem_count_t tmp;
205 asm volatile("# beginning __up_write\n\t" 217 asm volatile("# beginning __up_write\n\t"
206 LOCK_PREFIX " xadd %1,(%2)\n\t" 218 LOCK_PREFIX " xadd %1,(%2)\n\t"
207 /* tries to transition 219 /* tries to transition
@@ -221,33 +233,38 @@ static inline void __up_write(struct rw_semaphore *sem)
221static inline void __downgrade_write(struct rw_semaphore *sem) 233static inline void __downgrade_write(struct rw_semaphore *sem)
222{ 234{
223 asm volatile("# beginning __downgrade_write\n\t" 235 asm volatile("# beginning __downgrade_write\n\t"
224 LOCK_PREFIX " add%z0 %2,(%1)\n\t" 236 LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
225 /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ 237 /*
238 * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
239 * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
240 */
226 " jns 1f\n\t" 241 " jns 1f\n\t"
227 " call call_rwsem_downgrade_wake\n" 242 " call call_rwsem_downgrade_wake\n"
228 "1:\n\t" 243 "1:\n\t"
229 "# ending __downgrade_write\n" 244 "# ending __downgrade_write\n"
230 : "+m" (sem->count) 245 : "+m" (sem->count)
231 : "a" (sem), "i" (-RWSEM_WAITING_BIAS) 246 : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
232 : "memory", "cc"); 247 : "memory", "cc");
233} 248}
234 249
235/* 250/*
236 * implement atomic add functionality 251 * implement atomic add functionality
237 */ 252 */
238static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) 253static inline void rwsem_atomic_add(rwsem_count_t delta,
254 struct rw_semaphore *sem)
239{ 255{
240 asm volatile(LOCK_PREFIX "add%z0 %1,%0" 256 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
241 : "+m" (sem->count) 257 : "+m" (sem->count)
242 : "ir" (delta)); 258 : "er" (delta));
243} 259}
244 260
245/* 261/*
246 * implement exchange and add functionality 262 * implement exchange and add functionality
247 */ 263 */
248static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) 264static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
265 struct rw_semaphore *sem)
249{ 266{
250 int tmp = delta; 267 rwsem_count_t tmp = delta;
251 268
252 asm volatile(LOCK_PREFIX "xadd %0,%1" 269 asm volatile(LOCK_PREFIX "xadd %0,%1"
253 : "+r" (tmp), "+m" (sem->count) 270 : "+r" (tmp), "+m" (sem->count)
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 18e496c98ff0..86b1506f4179 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -37,10 +37,8 @@ void setup_bios_corruption_check(void);
37 37
38#ifdef CONFIG_X86_VISWS 38#ifdef CONFIG_X86_VISWS
39extern void visws_early_detect(void); 39extern void visws_early_detect(void);
40extern int is_visws_box(void);
41#else 40#else
42static inline void visws_early_detect(void) { } 41static inline void visws_early_detect(void) { }
43static inline int is_visws_box(void) { return 0; }
44#endif 42#endif
45 43
46extern unsigned long saved_video_mode; 44extern unsigned long saved_video_mode;
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1e796782cd7b..4cfc90824068 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,8 @@ int native_cpu_disable(void);
135void native_cpu_die(unsigned int cpu); 135void native_cpu_die(unsigned int cpu);
136void native_play_dead(void); 136void native_play_dead(void);
137void play_dead_common(void); 137void play_dead_common(void);
138void wbinvd_on_cpu(int cpu);
139int wbinvd_on_all_cpus(void);
138 140
139void native_send_call_func_ipi(const struct cpumask *mask); 141void native_send_call_func_ipi(const struct cpumask *mask);
140void native_send_call_func_single_ipi(int cpu); 142void native_send_call_func_single_ipi(int cpu);
@@ -147,6 +149,13 @@ static inline int num_booting_cpus(void)
147{ 149{
148 return cpumask_weight(cpu_callout_mask); 150 return cpumask_weight(cpu_callout_mask);
149} 151}
152#else /* !CONFIG_SMP */
153#define wbinvd_on_cpu(cpu) wbinvd()
154static inline int wbinvd_on_all_cpus(void)
155{
156 wbinvd();
157 return 0;
158}
150#endif /* CONFIG_SMP */ 159#endif /* CONFIG_SMP */
151 160
152extern unsigned disabled_cpus __cpuinitdata; 161extern unsigned disabled_cpus __cpuinitdata;
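wbinvd_on_cpu()/wbinvd_on_all_cpus() declared above give callers a cache write-back-and-invalidate that reads the same on SMP and UP kernels; the !CONFIG_SMP branch simply collapses both to a local wbinvd(). A sketch of a caller; the scenario is illustrative, not taken from this patch:

/* Illustrative only: flush every CPU's caches before handing memory
 * to an agent that bypasses cache coherency. */
static void example_flush_for_noncoherent_device(void)
{
	wbinvd_on_all_cpus();	/* single wbinvd() on !CONFIG_SMP */
}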
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 35e89122a42f..4dab78edbad9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,8 +3,6 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name);
7
8struct thread_info; 6struct thread_info;
9struct stacktrace_ops; 7struct stacktrace_ops;
10 8
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1fecb7e61130..38638cd2fa4c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {
313 313
314#define SVM_EXIT_ERR -1 314#define SVM_EXIT_ERR -1
315 315
316#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ 316#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
317 317
318#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" 318#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
319#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" 319#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index d5f69045c100..3ad421784ae7 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -26,8 +26,8 @@ asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); 26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
27asmlinkage long sys32_fstatat(unsigned int, char __user *, 27asmlinkage long sys32_fstatat(unsigned int, char __user *,
28 struct stat64 __user *, int); 28 struct stat64 __user *, int);
29struct mmap_arg_struct; 29struct mmap_arg_struct32;
30asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); 30asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); 31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
32 32
33struct sigaction32; 33struct sigaction32;
@@ -40,8 +40,6 @@ asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
40 compat_sigset_t __user *, unsigned int); 40 compat_sigset_t __user *, unsigned int);
41asmlinkage long sys32_alarm(unsigned int); 41asmlinkage long sys32_alarm(unsigned int);
42 42
43struct sel_arg_struct;
44asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
45asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); 43asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
46asmlinkage long sys32_sysfs(int, u32, u32); 44asmlinkage long sys32_sysfs(int, u32, u32);
47 45
@@ -56,11 +54,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
56asmlinkage long sys32_personality(unsigned long); 54asmlinkage long sys32_personality(unsigned long);
57asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); 55asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
58 56
59struct oldold_utsname;
60struct old_utsname;
61asmlinkage long sys32_olduname(struct oldold_utsname __user *);
62long sys32_uname(struct old_utsname __user *);
63
64asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, 57asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
65 compat_uptr_t __user *, struct pt_regs *); 58 compat_uptr_t __user *, struct pt_regs *);
66asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); 59asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 8d33bc5462d1..c4a348f7bd43 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,6 +16,8 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18 18
19extern const unsigned long sys_call_table[];
20
19/* 21/*
20 * Only the low 32 bits of orig_ax are meaningful, so we return int. 22 * Only the low 32 bits of orig_ax are meaningful, so we return int.
21 * This importantly ignores the high bits on 64-bit, so comparisons 23 * This importantly ignores the high bits on 64-bit, so comparisons
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 8868b9420b0e..5c044b43e9a7 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -50,18 +50,6 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
50 struct old_sigaction __user *); 50 struct old_sigaction __user *);
51unsigned long sys_sigreturn(struct pt_regs *); 51unsigned long sys_sigreturn(struct pt_regs *);
52 52
53/* kernel/sys_i386_32.c */
54struct mmap_arg_struct;
55struct sel_arg_struct;
56struct oldold_utsname;
57struct old_utsname;
58
59asmlinkage int old_mmap(struct mmap_arg_struct __user *);
60asmlinkage int old_select(struct sel_arg_struct __user *);
61asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
62asmlinkage int sys_uname(struct old_utsname __user *);
63asmlinkage int sys_olduname(struct oldold_utsname __user *);
64
65/* kernel/vm86_32.c */ 53/* kernel/vm86_32.c */
66int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); 54int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);
67int sys_vm86(unsigned long, unsigned long, struct pt_regs *); 55int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
@@ -73,11 +61,8 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
73long sys_arch_prctl(int, unsigned long); 61long sys_arch_prctl(int, unsigned long);
74 62
75/* kernel/sys_x86_64.c */ 63/* kernel/sys_x86_64.c */
76struct new_utsname;
77
78asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 64asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
79 unsigned long, unsigned long, unsigned long); 65 unsigned long, unsigned long, unsigned long);
80asmlinkage long sys_uname(struct new_utsname __user *);
81 66
82#endif /* CONFIG_X86_32 */ 67#endif /* CONFIG_X86_32 */
83#endif /* _ASM_X86_SYSCALLS_H */ 68#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index ecb544e65382..b8fe48ee2ed9 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -11,9 +11,9 @@
11#include <linux/irqflags.h> 11#include <linux/irqflags.h>
12 12
13/* entries in ARCH_DLINFO: */ 13/* entries in ARCH_DLINFO: */
14#ifdef CONFIG_IA32_EMULATION 14#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
15# define AT_VECTOR_SIZE_ARCH 2 15# define AT_VECTOR_SIZE_ARCH 2
16#else 16#else /* else it's non-compat x86-64 */
17# define AT_VECTOR_SIZE_ARCH 1 17# define AT_VECTOR_SIZE_ARCH 1
18#endif 18#endif
19 19
@@ -32,7 +32,7 @@ extern void show_regs_common(void);
32 "movl %P[task_canary](%[next]), %%ebx\n\t" \ 32 "movl %P[task_canary](%[next]), %%ebx\n\t" \
33 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" 33 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
34#define __switch_canary_oparam \ 34#define __switch_canary_oparam \
35 , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) 35 , [stack_canary] "=m" (stack_canary.canary)
36#define __switch_canary_iparam \ 36#define __switch_canary_iparam \
37 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 37 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
38#else /* CC_STACKPROTECTOR */ 38#else /* CC_STACKPROTECTOR */
@@ -114,7 +114,7 @@ do { \
114 "movq %P[task_canary](%%rsi),%%r8\n\t" \ 114 "movq %P[task_canary](%%rsi),%%r8\n\t" \
115 "movq %%r8,"__percpu_arg([gs_canary])"\n\t" 115 "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
116#define __switch_canary_oparam \ 116#define __switch_canary_oparam \
117 , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) 117 , [gs_canary] "=m" (irq_stack_union.stack_canary)
118#define __switch_canary_iparam \ 118#define __switch_canary_iparam \
119 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 119 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
120#else /* CC_STACKPROTECTOR */ 120#else /* CC_STACKPROTECTOR */
@@ -133,7 +133,7 @@ do { \
133 __switch_canary \ 133 __switch_canary \
134 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 134 "movq %P[thread_info](%%rsi),%%r8\n\t" \
135 "movq %%rax,%%rdi\n\t" \ 135 "movq %%rax,%%rdi\n\t" \
136 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ 136 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
137 "jnz ret_from_fork\n\t" \ 137 "jnz ret_from_fork\n\t" \
138 RESTORE_CONTEXT \ 138 RESTORE_CONTEXT \
139 : "=a" (last) \ 139 : "=a" (last) \
@@ -143,7 +143,7 @@ do { \
143 [ti_flags] "i" (offsetof(struct thread_info, flags)), \ 143 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
144 [_tif_fork] "i" (_TIF_FORK), \ 144 [_tif_fork] "i" (_TIF_FORK), \
145 [thread_info] "i" (offsetof(struct task_struct, stack)), \ 145 [thread_info] "i" (offsetof(struct task_struct, stack)), \
146 [current_task] "m" (per_cpu_var(current_task)) \ 146 [current_task] "m" (current_task) \
147 __switch_canary_iparam \ 147 __switch_canary_iparam \
148 : "memory", "cc" __EXTRA_CLOBBER) 148 : "memory", "cc" __EXTRA_CLOBBER)
149#endif 149#endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 375c917c37d2..e0d28901e969 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,7 +87,6 @@ struct thread_info {
87#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
88#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
89#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
90#define TIF_ABI_PENDING 19
91#define TIF_MEMDIE 20 90#define TIF_MEMDIE 20
92#define TIF_DEBUG 21 /* uses debug registers */ 91#define TIF_DEBUG 21 /* uses debug registers */
93#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -112,7 +111,6 @@ struct thread_info {
112#define _TIF_NOTSC (1 << TIF_NOTSC) 111#define _TIF_NOTSC (1 << TIF_NOTSC)
113#define _TIF_IA32 (1 << TIF_IA32) 112#define _TIF_IA32 (1 << TIF_IA32)
114#define _TIF_FORK (1 << TIF_FORK) 113#define _TIF_FORK (1 << TIF_FORK)
115#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
116#define _TIF_DEBUG (1 << TIF_DEBUG) 114#define _TIF_DEBUG (1 << TIF_DEBUG)
117#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 115#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
118#define _TIF_FREEZE (1 << TIF_FREEZE) 116#define _TIF_FREEZE (1 << TIF_FREEZE)
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 0c9825e97f36..088d09fb1615 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -205,14 +205,13 @@ static inline unsigned long __must_check copy_from_user(void *to,
205 unsigned long n) 205 unsigned long n)
206{ 206{
207 int sz = __compiletime_object_size(to); 207 int sz = __compiletime_object_size(to);
208 int ret = -EFAULT;
209 208
210 if (likely(sz == -1 || sz >= n)) 209 if (likely(sz == -1 || sz >= n))
211 ret = _copy_from_user(to, from, n); 210 n = _copy_from_user(to, from, n);
212 else 211 else
213 copy_from_user_overflow(); 212 copy_from_user_overflow();
214 213
215 return ret; 214 return n;
216} 215}
217 216
218long __must_check strncpy_from_user(char *dst, const char __user *src, 217long __must_check strncpy_from_user(char *dst, const char __user *src,
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index a78c40305447..316708d5af92 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -49,16 +49,15 @@ static inline unsigned long __must_check copy_from_user(void *to,
49 unsigned long n) 49 unsigned long n)
50{ 50{
51 int sz = __compiletime_object_size(to); 51 int sz = __compiletime_object_size(to);
52 int ret = -EFAULT;
53 52
54 might_fault(); 53 might_fault();
55 if (likely(sz == -1 || sz >= n)) 54 if (likely(sz == -1 || sz >= n))
56 ret = _copy_from_user(to, from, n); 55 n = _copy_from_user(to, from, n);
57#ifdef CONFIG_DEBUG_VM 56#ifdef CONFIG_DEBUG_VM
58 else 57 else
59 WARN(1, "Buffer overflow detected!\n"); 58 WARN(1, "Buffer overflow detected!\n");
60#endif 59#endif
61 return ret; 60 return n;
62} 61}
63 62
64static __always_inline __must_check 63static __always_inline __must_check
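A hedged caller sketch for the two hunks above: copy_from_user() now reports
the number of bytes left uncopied instead of -EFAULT, so callers convert any
non-zero result into -EFAULT themselves. The wrapper name is illustrative.

static long example_copy_in(void *kbuf, const void __user *ubuf,
			    unsigned long len)
{
	if (copy_from_user(kbuf, ubuf, len))
		return -EFAULT;	/* some bytes were left uncopied */
	return 0;
}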
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 3baf379fa840..beb9b5f8f8a4 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -354,6 +354,7 @@
354#define __ARCH_WANT_STAT64 354#define __ARCH_WANT_STAT64
355#define __ARCH_WANT_SYS_ALARM 355#define __ARCH_WANT_SYS_ALARM
356#define __ARCH_WANT_SYS_GETHOSTNAME 356#define __ARCH_WANT_SYS_GETHOSTNAME
357#define __ARCH_WANT_SYS_IPC
357#define __ARCH_WANT_SYS_PAUSE 358#define __ARCH_WANT_SYS_PAUSE
358#define __ARCH_WANT_SYS_SGETMASK 359#define __ARCH_WANT_SYS_SGETMASK
359#define __ARCH_WANT_SYS_SIGNAL 360#define __ARCH_WANT_SYS_SIGNAL
@@ -366,6 +367,9 @@
366#define __ARCH_WANT_SYS_LLSEEK 367#define __ARCH_WANT_SYS_LLSEEK
367#define __ARCH_WANT_SYS_NICE 368#define __ARCH_WANT_SYS_NICE
368#define __ARCH_WANT_SYS_OLD_GETRLIMIT 369#define __ARCH_WANT_SYS_OLD_GETRLIMIT
370#define __ARCH_WANT_SYS_OLD_UNAME
371#define __ARCH_WANT_SYS_OLD_MMAP
372#define __ARCH_WANT_SYS_OLD_SELECT
369#define __ARCH_WANT_SYS_OLDUMOUNT 373#define __ARCH_WANT_SYS_OLDUMOUNT
370#define __ARCH_WANT_SYS_SIGPENDING 374#define __ARCH_WANT_SYS_SIGPENDING
371#define __ARCH_WANT_SYS_SIGPROCMASK 375#define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4843f7ba754a..ff4307b0e81e 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4)
146#define __NR_kill 62 146#define __NR_kill 62
147__SYSCALL(__NR_kill, sys_kill) 147__SYSCALL(__NR_kill, sys_kill)
148#define __NR_uname 63 148#define __NR_uname 63
149__SYSCALL(__NR_uname, sys_uname) 149__SYSCALL(__NR_uname, sys_newuname)
150 150
151#define __NR_semget 64 151#define __NR_semget 64
152__SYSCALL(__NR_semget, sys_semget) 152__SYSCALL(__NR_semget, sys_semget)
@@ -680,6 +680,7 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
680#define __ARCH_WANT_SYS_LLSEEK 680#define __ARCH_WANT_SYS_LLSEEK
681#define __ARCH_WANT_SYS_NICE 681#define __ARCH_WANT_SYS_NICE
682#define __ARCH_WANT_SYS_OLD_GETRLIMIT 682#define __ARCH_WANT_SYS_OLD_GETRLIMIT
683#define __ARCH_WANT_SYS_OLD_UNAME
683#define __ARCH_WANT_SYS_OLDUMOUNT 684#define __ARCH_WANT_SYS_OLDUMOUNT
684#define __ARCH_WANT_SYS_SIGPENDING 685#define __ARCH_WANT_SYS_SIGPENDING
685#define __ARCH_WANT_SYS_SIGPROCMASK 686#define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index 999873b22e7f..24532c7da3d6 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -1,5 +1,63 @@
1#ifndef _ASM_X86_USER_H
2#define _ASM_X86_USER_H
3
1#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
2# include "user_32.h" 5# include "user_32.h"
3#else 6#else
4# include "user_64.h" 7# include "user_64.h"
5#endif 8#endif
9
10#include <asm/types.h>
11
12struct user_ymmh_regs {
13 /* 16 * 16 bytes for each YMMH-reg */
14 __u32 ymmh_space[64];
15};
16
17struct user_xsave_hdr {
18 __u64 xstate_bv;
19 __u64 reserved1[2];
20 __u64 reserved2[5];
21};
22
23/*
24 * The structure layout of user_xstateregs, used for exporting the
25 * extended register state through ptrace and core-dump (NT_X86_XSTATE note)
26 * interfaces, will be the same as the memory layout of xsave used by the processor
27 * (except for the bytes 464..511, which can be used by the software) and hence
28 * the size of this structure varies depending on the features supported by the
29 * processor and OS. The size of the structure that users need to use can be
30 * obtained by doing:
31 * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx);
32 * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that users (debuggers, etc.)
33 * need to use.
34 *
35 * For now, only the first 8 bytes of the software usable bytes[464..471] will
36 * be used and will be set to the OS-enabled xstate mask (which is the same as
37 * the 64-bit mask returned by xgetbv's XCR0). Users (analyzing core dumps
38 * remotely, etc.) can use this mask as well as the mask saved in the
39 * xstate_hdr bytes and interpret what states the processor/OS supports
40 * and what states are in modified/initialized conditions for the
41 * particular process/thread.
42 *
43 * Also, when the user modifies certain state (FP/SSE/etc.) through the
44 * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
45 * bytes[512..519] of the memory layout are updated correspondingly.
46 * For example, when FP state is modified to a non-init state,
47 * xsave_hdr.xstate_bv's bit 0 must be set to '1'; when SSE is modified to a
48 * non-init state, xsave_hdr.xstate_bv's bit 1 must be set to '1', and so on.
49 */
50#define USER_XSTATE_FX_SW_WORDS 6
51#define USER_XSTATE_XCR0_WORD 0
52
53struct user_xstateregs {
54 struct {
55 __u64 fpx_space[58];
56 __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
57 } i387;
58 struct user_xsave_hdr xsave_hdr;
59 struct user_ymmh_regs ymmh;
60 /* further processor state extensions go here */
61};
62
63#endif /* _ASM_X86_USER_H */
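A self-contained userspace sketch of the CPUID probe described in the comment
above, assuming only GCC's <cpuid.h>: CPUID.(EAX=0xD, ECX=0).EBX reports the
xstate buffer size that debuggers should use for the NT_X86_XSTATE regset.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* leaf 0xd, subleaf 0: EBX = size of the enabled xstate area */
	__cpuid_count(0xd, 0, eax, ebx, ecx, edx);
	printf("NT_X86_XSTATE regset size: %u bytes\n", ebx);
	return 0;
}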
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 2751f3075d8b..71605c7d5c5c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -18,8 +18,8 @@
18 * along with this program; if not, write to the Free Software 18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * 20 *
21 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 21 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
22 * Copyright (c) Russ Anderson 22 * Copyright (c) Russ Anderson <rja@sgi.com>
23 */ 23 */
24 24
25#include <linux/rtc.h> 25#include <linux/rtc.h>
@@ -36,7 +36,8 @@ enum uv_bios_cmd {
36 UV_BIOS_WATCHLIST_ALLOC, 36 UV_BIOS_WATCHLIST_ALLOC,
37 UV_BIOS_WATCHLIST_FREE, 37 UV_BIOS_WATCHLIST_FREE,
38 UV_BIOS_MEMPROTECT, 38 UV_BIOS_MEMPROTECT,
39 UV_BIOS_GET_PARTITION_ADDR 39 UV_BIOS_GET_PARTITION_ADDR,
40 UV_BIOS_SET_LEGACY_VGA_TARGET
40}; 41};
41 42
42/* 43/*
@@ -89,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
89extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); 90extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
90extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); 91extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
91 92
92extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); 93extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
93extern s64 uv_bios_freq_base(u64, u64 *); 94extern s64 uv_bios_freq_base(u64, u64 *);
94extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, 95extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
95 unsigned long *); 96 unsigned long *);
96extern int uv_bios_mq_watchlist_free(int, int); 97extern int uv_bios_mq_watchlist_free(int, int);
97extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); 98extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
98extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); 99extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
100extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
99 101
100extern void uv_bios_init(void); 102extern void uv_bios_init(void);
101 103
@@ -104,6 +106,7 @@ extern int uv_type;
104extern long sn_partition_id; 106extern long sn_partition_id;
105extern long sn_coherency_id; 107extern long sn_coherency_id;
106extern long sn_region_size; 108extern long sn_region_size;
109extern long system_serial_number;
107#define partition_coherence_id() (sn_coherency_id) 110#define partition_coherence_id() (sn_coherency_id)
108 111
109extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ 112extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index c0a01b5d985b..3bb9491b7659 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -11,6 +11,7 @@ struct mm_struct;
11extern enum uv_system_type get_uv_system_type(void); 11extern enum uv_system_type get_uv_system_type(void);
12extern int is_uv_system(void); 12extern int is_uv_system(void);
13extern void uv_cpu_init(void); 13extern void uv_cpu_init(void);
14extern void uv_nmi_init(void);
14extern void uv_system_init(void); 15extern void uv_system_init(void);
15extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
16 struct mm_struct *mm, 17 struct mm_struct *mm,
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 811bfabc80b7..14cc74ba5d23 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -31,20 +31,20 @@
31 * contiguous (although various IO spaces may punch holes in 31 * contiguous (although various IO spaces may punch holes in
32 * it). 32 * it).
33 * 33 *
34 * N - Number of bits in the node portion of a socket physical 34 * N - Number of bits in the node portion of a socket physical
35 * address. 35 * address.
36 * 36 *
37 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of 37 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
38 * routers always have low bit of 1, C/MBricks have low bit 38 * routers always have low bit of 1, C/MBricks have low bit
39 * equal to 0. Most addressing macros that target UV hub chips 39 * equal to 0. Most addressing macros that target UV hub chips
40 * right shift the NASID by 1 to exclude the always-zero bit. 40 * right shift the NASID by 1 to exclude the always-zero bit.
41 * NASIDs contain up to 15 bits. 41 * NASIDs contain up to 15 bits.
42 * 42 *
43 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead 43 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
44 * of nasids. 44 * of nasids.
45 * 45 *
46 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant 46 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
47 * of the nasid for socket usage. 47 * of the nasid for socket usage.
48 * 48 *
49 * 49 *
50 * NumaLink Global Physical Address Format: 50 * NumaLink Global Physical Address Format:
@@ -71,12 +71,12 @@
71 * 71 *
72 * 72 *
73 * APICID format 73 * APICID format
74 * NOTE!!!!!! This is the current format of the APICID. However, code 74 * NOTE!!!!!! This is the current format of the APICID. However, code
75 * should assume that this will change in the future. Use functions 75 * should assume that this will change in the future. Use functions
76 * in this file for all APICID bit manipulations and conversion. 76 * in this file for all APICID bit manipulations and conversion.
77 * 77 *
78 * 1111110000000000 78 * 1111110000000000
79 * 5432109876543210 79 * 5432109876543210
80 * pppppppppplc0cch 80 * pppppppppplc0cch
81 * sssssssssss 81 * sssssssssss
82 * 82 *
@@ -89,9 +89,9 @@
89 * Note: Processor only supports 12 bits in the APICID register. The ACPI 89 * Note: Processor only supports 12 bits in the APICID register. The ACPI
90 * tables hold all 16 bits. Software needs to be aware of this. 90 * tables hold all 16 bits. Software needs to be aware of this.
91 * 91 *
92 * Unless otherwise specified, all references to APICID refer to 92 * Unless otherwise specified, all references to APICID refer to
93 * the FULL value contained in ACPI tables, not the subset in the 93 * the FULL value contained in ACPI tables, not the subset in the
94 * processor APICID register. 94 * processor APICID register.
95 */ 95 */
96 96
97 97
@@ -151,16 +151,16 @@ struct uv_hub_info_s {
151}; 151};
152 152
153DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 153DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
154#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) 154#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
155#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) 155#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
156 156
157/* 157/*
158 * Local & Global MMR space macros. 158 * Local & Global MMR space macros.
159 * Note: macros are intended to be used ONLY by inline functions 159 * Note: macros are intended to be used ONLY by inline functions
160 * in this file - not by other kernel code. 160 * in this file - not by other kernel code.
161 * n - NASID (full 15-bit global nasid) 161 * n - NASID (full 15-bit global nasid)
162 * g - GNODE (full 15-bit global nasid, right shifted 1) 162 * g - GNODE (full 15-bit global nasid, right shifted 1)
163 * p - PNODE (local part of nasids, right shifted 1) 163 * p - PNODE (local part of nasids, right shifted 1)
164 */ 164 */
165#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) 165#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
166#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) 166#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
@@ -215,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
215/* 215/*
216 * Macros for converting between kernel virtual addresses, socket local physical 216 * Macros for converting between kernel virtual addresses, socket local physical
217 * addresses, and UV global physical addresses. 217 * addresses, and UV global physical addresses.
218 * Note: use the standard __pa() & __va() macros for converting 218 * Note: use the standard __pa() & __va() macros for converting
219 * between socket virtual and socket physical addresses. 219 * between socket virtual and socket physical addresses.
220 */ 220 */
221 221
222/* socket phys RAM --> UV global physical address */ 222/* socket phys RAM --> UV global physical address */
@@ -287,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid)
287 * Access global MMRs using the low memory MMR32 space. This region supports 287 * Access global MMRs using the low memory MMR32 space. This region supports
288 * faster MMR access but not all MMRs are accessible in this space. 288 * faster MMR access but not all MMRs are accessible in this space.
289 */ 289 */
290static inline unsigned long *uv_global_mmr32_address(int pnode, 290static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset)
291 unsigned long offset)
292{ 291{
293 return __va(UV_GLOBAL_MMR32_BASE | 292 return __va(UV_GLOBAL_MMR32_BASE |
294 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); 293 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
295} 294}
296 295
297static inline void uv_write_global_mmr32(int pnode, unsigned long offset, 296static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val)
298 unsigned long val)
299{ 297{
300 writeq(val, uv_global_mmr32_address(pnode, offset)); 298 writeq(val, uv_global_mmr32_address(pnode, offset));
301} 299}
302 300
303static inline unsigned long uv_read_global_mmr32(int pnode, 301static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset)
304 unsigned long offset)
305{ 302{
306 return readq(uv_global_mmr32_address(pnode, offset)); 303 return readq(uv_global_mmr32_address(pnode, offset));
307} 304}
@@ -310,21 +307,18 @@ static inline unsigned long uv_read_global_mmr32(int pnode,
310 * Access Global MMR space using the MMR space located at the top of physical 307 * Access Global MMR space using the MMR space located at the top of physical
311 * memory. 308 * memory.
312 */ 309 */
313static inline unsigned long *uv_global_mmr64_address(int pnode, 310static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
314 unsigned long offset)
315{ 311{
316 return __va(UV_GLOBAL_MMR64_BASE | 312 return __va(UV_GLOBAL_MMR64_BASE |
317 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); 313 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
318} 314}
319 315
320static inline void uv_write_global_mmr64(int pnode, unsigned long offset, 316static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val)
321 unsigned long val)
322{ 317{
323 writeq(val, uv_global_mmr64_address(pnode, offset)); 318 writeq(val, uv_global_mmr64_address(pnode, offset));
324} 319}
325 320
326static inline unsigned long uv_read_global_mmr64(int pnode, 321static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset)
327 unsigned long offset)
328{ 322{
329 return readq(uv_global_mmr64_address(pnode, offset)); 323 return readq(uv_global_mmr64_address(pnode, offset));
330} 324}
@@ -335,7 +329,18 @@ static inline unsigned long uv_read_global_mmr64(int pnode,
335 */ 329 */
336static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) 330static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset)
337{ 331{
338 return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); 332 return UV_GLOBAL_GRU_MMR_BASE | offset |
333 ((unsigned long)pnode << uv_hub_info->m_val);
334}
335
336static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
337{
338 writeb(val, uv_global_mmr64_address(pnode, offset));
339}
340
341static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset)
342{
343 return readb(uv_global_mmr64_address(pnode, offset));
339} 344}
340 345
341/* 346/*
@@ -457,11 +462,17 @@ static inline void uv_set_scir_bits(unsigned char value)
457 } 462 }
458} 463}
459 464
465static inline unsigned long uv_scir_offset(int apicid)
466{
467 return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f);
468}
469
460static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) 470static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
461{ 471{
462 if (uv_cpu_hub_info(cpu)->scir.state != value) { 472 if (uv_cpu_hub_info(cpu)->scir.state != value) {
473 uv_write_global_mmr8(uv_cpu_to_pnode(cpu),
474 uv_cpu_hub_info(cpu)->scir.offset, value);
463 uv_cpu_hub_info(cpu)->scir.state = value; 475 uv_cpu_hub_info(cpu)->scir.state = value;
464 uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
465 } 476 }
466} 477}
467 478
@@ -485,5 +496,17 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
485 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 496 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
486} 497}
487 498
499/*
500 * Get the minimum revision number of the hub chips within the partition.
501 * 1 - initial rev 1.0 silicon
502 * 2 - rev 2.0 production silicon
503 */
504static inline int uv_get_min_hub_revision_id(void)
505{
506 extern int uv_min_hub_revision_id;
507
508 return uv_min_hub_revision_id;
509}
510
488#endif /* CONFIG_X86_64 */ 511#endif /* CONFIG_X86_64 */
489#endif /* _ASM_X86_UV_UV_HUB_H */ 512#endif /* _ASM_X86_UV_UV_HUB_H */
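A hedged usage sketch for the helper added above, following the revision
numbering in its comment (1 = rev 1.0 silicon); the caller name is
illustrative, not taken from the patch.

static int example_need_rev1_workaround(void)
{
	/* apply a quirk only on initial rev 1.0 hub silicon */
	return uv_get_min_hub_revision_id() == 1;
}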
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
index 166adf61e770..2edb37637ead 100644
--- a/arch/x86/include/asm/visws/cobalt.h
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -122,4 +122,6 @@ extern char visws_board_type;
122 122
123extern char visws_board_rev; 123extern char visws_board_rev;
124 124
125extern int pci_visws_init(void);
126
125#endif /* _ASM_X86_VISWS_COBALT_H */ 127#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b4945419a84..fb9a080740ec 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -53,6 +53,7 @@
53 */ 53 */
54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
56#define SECONDARY_EXEC_RDTSCP 0x00000008
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 57#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 58#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 59#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
@@ -251,6 +252,7 @@ enum vmcs_field {
251#define EXIT_REASON_MSR_READ 31 252#define EXIT_REASON_MSR_READ 31
252#define EXIT_REASON_MSR_WRITE 32 253#define EXIT_REASON_MSR_WRITE 32
253#define EXIT_REASON_MWAIT_INSTRUCTION 36 254#define EXIT_REASON_MWAIT_INSTRUCTION 36
255#define EXIT_REASON_MONITOR_INSTRUCTION 39
254#define EXIT_REASON_PAUSE_INSTRUCTION 40 256#define EXIT_REASON_PAUSE_INSTRUCTION 40
255#define EXIT_REASON_MCE_DURING_VMENTRY 41 257#define EXIT_REASON_MCE_DURING_VMENTRY 41
256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 258#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
@@ -362,6 +364,7 @@ enum vmcs_field {
362#define VMX_EPTP_UC_BIT (1ull << 8) 364#define VMX_EPTP_UC_BIT (1ull << 8)
363#define VMX_EPTP_WB_BIT (1ull << 14) 365#define VMX_EPTP_WB_BIT (1ull << 14)
364#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 366#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
367#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
365#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 368#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
366#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 369#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
367#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 370#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -374,7 +377,7 @@ enum vmcs_field {
374#define VMX_EPT_READABLE_MASK 0x1ull 377#define VMX_EPT_READABLE_MASK 0x1ull
375#define VMX_EPT_WRITABLE_MASK 0x2ull 378#define VMX_EPT_WRITABLE_MASK 0x2ull
376#define VMX_EPT_EXECUTABLE_MASK 0x4ull 379#define VMX_EPT_EXECUTABLE_MASK 0x4ull
377#define VMX_EPT_IGMT_BIT (1ull << 6) 380#define VMX_EPT_IPAT_BIT (1ull << 6)
378 381
379#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 382#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
380 383
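A minimal sketch, assuming only the bit definitions above: capability bits such
as the new VMX_EPT_1GB_PAGE_BIT are typically tested against the value read
from the IA32_VMX_EPT_VPID_CAP MSR (the parameter name here is illustrative).

static inline int example_ept_1gb_pages_supported(u64 ept_vpid_cap_msr)
{
	return (ept_vpid_cap_msr & VMX_EPT_1GB_PAGE_BIT) != 0;
}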
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index ea0e8ea15e15..519b54327d75 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -99,6 +99,20 @@ struct x86_init_iommu {
99}; 99};
100 100
101/** 101/**
102 * struct x86_init_pci - platform specific pci init functions
103 * @arch_init: platform specific pci arch init call
104 * @init: platform specific pci subsystem init
105 * @init_irq: platform specific pci irq init
106 * @fixup_irqs: platform specific pci irq fixup
107 */
108struct x86_init_pci {
109 int (*arch_init)(void);
110 int (*init)(void);
111 void (*init_irq)(void);
112 void (*fixup_irqs)(void);
113};
114
115/**
102 * struct x86_init_ops - functions for platform specific setup 116 * struct x86_init_ops - functions for platform specific setup
103 * 117 *
104 */ 118 */
@@ -110,6 +124,7 @@ struct x86_init_ops {
110 struct x86_init_paging paging; 124 struct x86_init_paging paging;
111 struct x86_init_timers timers; 125 struct x86_init_timers timers;
112 struct x86_init_iommu iommu; 126 struct x86_init_iommu iommu;
127 struct x86_init_pci pci;
113}; 128};
114 129
115/** 130/**
@@ -126,6 +141,7 @@ struct x86_cpuinit_ops {
126 * @get_wallclock: get time from HW clock like RTC etc. 141 * @get_wallclock: get time from HW clock like RTC etc.
127 * @set_wallclock: set time back to HW clock 142 * @set_wallclock: set time back to HW clock
128 * @is_untracked_pat_range exclude from PAT logic 143 * @is_untracked_pat_range exclude from PAT logic
144 * @nmi_init enable NMI on cpus
129 */ 145 */
130struct x86_platform_ops { 146struct x86_platform_ops {
131 unsigned long (*calibrate_tsc)(void); 147 unsigned long (*calibrate_tsc)(void);
@@ -133,6 +149,7 @@ struct x86_platform_ops {
133 int (*set_wallclock)(unsigned long nowtime); 149 int (*set_wallclock)(unsigned long nowtime);
134 void (*iommu_shutdown)(void); 150 void (*iommu_shutdown)(void);
135 bool (*is_untracked_pat_range)(u64 start, u64 end); 151 bool (*is_untracked_pat_range)(u64 start, u64 end);
152 void (*nmi_init)(void);
136}; 153};
137 154
138extern struct x86_init_ops x86_init; 155extern struct x86_init_ops x86_init;
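A hedged sketch of how a platform hooks the new x86_init.pci ops; it mirrors
the pci_acpi_init assignment made later in this patch, but the platform
function names here are illustrative.

static int __init example_platform_pci_init(void)
{
	/* platform-specific PCI bring-up would go here */
	return 0;
}

static void __init example_platform_setup(void)
{
	/* override the default PCI subsystem init for this platform */
	x86_init.pci.init = example_platform_pci_init;
}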
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 727acc152344..ddc04ccad03b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -27,9 +27,11 @@
27extern unsigned int xstate_size; 27extern unsigned int xstate_size;
28extern u64 pcntxt_mask; 28extern u64 pcntxt_mask;
29extern struct xsave_struct *init_xstate_buf; 29extern struct xsave_struct *init_xstate_buf;
30extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
30 31
31extern void xsave_cntxt_init(void); 32extern void xsave_cntxt_init(void);
32extern void xsave_init(void); 33extern void xsave_init(void);
34extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
33extern int init_fpu(struct task_struct *child); 35extern int init_fpu(struct task_struct *child);
34extern int check_for_xstate(struct i387_fxsave_struct __user *buf, 36extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
35 void __user *fpstate, 37 void __user *fpstate,
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d87f09bc5a52..4c58352209e0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o
87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
88 88
89obj-$(CONFIG_HPET_TIMER) += hpet.o 89obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o
90 91
91obj-$(CONFIG_K8_NB) += k8.o 92obj-$(CONFIG_K8_NB) += k8.o
92obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fb1035cd9a6a..cd40aba6aa95 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,10 +31,12 @@
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/dmi.h> 32#include <linux/dmi.h>
33#include <linux/irq.h> 33#include <linux/irq.h>
34#include <linux/slab.h>
34#include <linux/bootmem.h> 35#include <linux/bootmem.h>
35#include <linux/ioport.h> 36#include <linux/ioport.h>
36#include <linux/pci.h> 37#include <linux/pci.h>
37 38
39#include <asm/pci_x86.h>
38#include <asm/pgtable.h> 40#include <asm/pgtable.h>
39#include <asm/io_apic.h> 41#include <asm/io_apic.h>
40#include <asm/apic.h> 42#include <asm/apic.h>
@@ -49,6 +51,7 @@ EXPORT_SYMBOL(acpi_disabled);
49 51
50#ifdef CONFIG_X86_64 52#ifdef CONFIG_X86_64
51# include <asm/proto.h> 53# include <asm/proto.h>
54# include <asm/numa_64.h>
52#endif /* X86 */ 55#endif /* X86 */
53 56
54#define BAD_MADT_ENTRY(entry, end) ( \ 57#define BAD_MADT_ENTRY(entry, end) ( \
@@ -446,6 +449,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
446int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) 449int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
447{ 450{
448 *irq = gsi; 451 *irq = gsi;
452
453#ifdef CONFIG_X86_IO_APIC
454 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
455 setup_IO_APIC_irq_extra(gsi);
456#endif
457
449 return 0; 458 return 0;
450} 459}
451 460
@@ -473,7 +482,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
473 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); 482 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
474 } 483 }
475#endif 484#endif
476 acpi_gsi_to_irq(plat_gsi, &irq); 485 irq = plat_gsi;
486
477 return irq; 487 return irq;
478} 488}
479 489
@@ -481,6 +491,26 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
481 * ACPI based hotplug support for CPU 491 * ACPI based hotplug support for CPU
482 */ 492 */
483#ifdef CONFIG_ACPI_HOTPLUG_CPU 493#ifdef CONFIG_ACPI_HOTPLUG_CPU
494#include <acpi/processor.h>
495
496static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
497{
498#ifdef CONFIG_ACPI_NUMA
499 int nid;
500
501 nid = acpi_get_node(handle);
502 if (nid == -1 || !node_online(nid))
503 return;
504#ifdef CONFIG_X86_64
505 apicid_to_node[physid] = nid;
506 numa_set_node(cpu, nid);
507#else /* CONFIG_X86_32 */
508 apicid_2_node[physid] = nid;
509 cpu_to_node_map[cpu] = nid;
510#endif
511
512#endif
513}
484 514
485static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) 515static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
486{ 516{
@@ -539,7 +569,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
539 goto free_new_map; 569 goto free_new_map;
540 } 570 }
541 571
572 acpi_processor_set_pdc(handle);
573
542 cpu = cpumask_first(new_map); 574 cpu = cpumask_first(new_map);
575 acpi_map_cpu2node(handle, cpu, physid);
543 576
544 *pcpu = cpu; 577 *pcpu = cpu;
545 retval = 0; 578 retval = 0;
@@ -1185,9 +1218,6 @@ static void __init acpi_process_madt(void)
1185 if (!error) { 1218 if (!error) {
1186 acpi_lapic = 1; 1219 acpi_lapic = 1;
1187 1220
1188#ifdef CONFIG_X86_BIGSMP
1189 generic_bigsmp_probe();
1190#endif
1191 /* 1221 /*
1192 * Parse MADT IO-APIC entries 1222 * Parse MADT IO-APIC entries
1193 */ 1223 */
@@ -1197,8 +1227,6 @@ static void __init acpi_process_madt(void)
1197 acpi_ioapic = 1; 1227 acpi_ioapic = 1;
1198 1228
1199 smp_found_config = 1; 1229 smp_found_config = 1;
1200 if (apic->setup_apic_routing)
1201 apic->setup_apic_routing();
1202 } 1230 }
1203 } 1231 }
1204 if (error == -EINVAL) { 1232 if (error == -EINVAL) {
@@ -1269,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
1269} 1297}
1270 1298
1271/* 1299/*
1272 * Limit ACPI to CPU enumeration for HT
1273 */
1274static int __init force_acpi_ht(const struct dmi_system_id *d)
1275{
1276 if (!acpi_force) {
1277 printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1278 d->ident);
1279 disable_acpi();
1280 acpi_ht = 1;
1281 } else {
1282 printk(KERN_NOTICE
1283 "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1284 }
1285 return 0;
1286}
1287
1288/*
1289 * Force ignoring BIOS IRQ0 pin2 override 1300 * Force ignoring BIOS IRQ0 pin2 override
1290 */ 1301 */
1291static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1302static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
@@ -1321,90 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1321 }, 1332 },
1322 1333
1323 /* 1334 /*
1324 * Boxes that need acpi=ht
1325 */
1326 {
1327 .callback = force_acpi_ht,
1328 .ident = "FSC Primergy T850",
1329 .matches = {
1330 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1331 DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1332 },
1333 },
1334 {
1335 .callback = force_acpi_ht,
1336 .ident = "HP VISUALIZE NT Workstation",
1337 .matches = {
1338 DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1339 DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1340 },
1341 },
1342 {
1343 .callback = force_acpi_ht,
1344 .ident = "Compaq Workstation W8000",
1345 .matches = {
1346 DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1347 DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1348 },
1349 },
1350 {
1351 .callback = force_acpi_ht,
1352 .ident = "ASUS P2B-DS",
1353 .matches = {
1354 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1355 DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1356 },
1357 },
1358 {
1359 .callback = force_acpi_ht,
1360 .ident = "ASUS CUR-DLS",
1361 .matches = {
1362 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1363 DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1364 },
1365 },
1366 {
1367 .callback = force_acpi_ht,
1368 .ident = "ABIT i440BX-W83977",
1369 .matches = {
1370 DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1371 DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1372 },
1373 },
1374 {
1375 .callback = force_acpi_ht,
1376 .ident = "IBM Bladecenter",
1377 .matches = {
1378 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1379 DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1380 },
1381 },
1382 {
1383 .callback = force_acpi_ht,
1384 .ident = "IBM eServer xSeries 360",
1385 .matches = {
1386 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1387 DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1388 },
1389 },
1390 {
1391 .callback = force_acpi_ht,
1392 .ident = "IBM eserver xSeries 330",
1393 .matches = {
1394 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1395 DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1396 },
1397 },
1398 {
1399 .callback = force_acpi_ht,
1400 .ident = "IBM eserver xSeries 440",
1401 .matches = {
1402 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1403 DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1404 },
1405 },
1406
1407 /*
1408 * Boxes that need ACPI PCI IRQ routing disabled 1335 * Boxes that need ACPI PCI IRQ routing disabled
1409 */ 1336 */
1410 { 1337 {
@@ -1529,16 +1456,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1529 * if acpi_blacklisted() acpi_disabled = 1; 1456 * if acpi_blacklisted() acpi_disabled = 1;
1530 * acpi_irq_model=... 1457 * acpi_irq_model=...
1531 * ... 1458 * ...
1532 *
1533 * return value: (currently ignored)
1534 * 0: success
1535 * !0: failure
1536 */ 1459 */
1537 1460
1538int __init acpi_boot_table_init(void) 1461void __init acpi_boot_table_init(void)
1539{ 1462{
1540 int error;
1541
1542 dmi_check_system(acpi_dmi_table); 1463 dmi_check_system(acpi_dmi_table);
1543 1464
1544 /* 1465 /*
@@ -1546,15 +1467,14 @@ int __init acpi_boot_table_init(void)
1546 * One exception: acpi=ht continues far enough to enumerate LAPICs 1467 * One exception: acpi=ht continues far enough to enumerate LAPICs
1547 */ 1468 */
1548 if (acpi_disabled && !acpi_ht) 1469 if (acpi_disabled && !acpi_ht)
1549 return 1; 1470 return;
1550 1471
1551 /* 1472 /*
1552 * Initialize the ACPI boot-time table parser. 1473 * Initialize the ACPI boot-time table parser.
1553 */ 1474 */
1554 error = acpi_table_init(); 1475 if (acpi_table_init()) {
1555 if (error) {
1556 disable_acpi(); 1476 disable_acpi();
1557 return error; 1477 return;
1558 } 1478 }
1559 1479
1560 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); 1480 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1562,18 +1482,15 @@ int __init acpi_boot_table_init(void)
1562 /* 1482 /*
1563 * blacklist may disable ACPI entirely 1483 * blacklist may disable ACPI entirely
1564 */ 1484 */
1565 error = acpi_blacklisted(); 1485 if (acpi_blacklisted()) {
1566 if (error) {
1567 if (acpi_force) { 1486 if (acpi_force) {
1568 printk(KERN_WARNING PREFIX "acpi=force override\n"); 1487 printk(KERN_WARNING PREFIX "acpi=force override\n");
1569 } else { 1488 } else {
1570 printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); 1489 printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1571 disable_acpi(); 1490 disable_acpi();
1572 return error; 1491 return;
1573 } 1492 }
1574 } 1493 }
1575
1576 return 0;
1577} 1494}
1578 1495
1579int __init early_acpi_boot_init(void) 1496int __init early_acpi_boot_init(void)
@@ -1619,6 +1536,9 @@ int __init acpi_boot_init(void)
1619 1536
1620 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); 1537 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
1621 1538
1539 if (!acpi_noirq)
1540 x86_init.pci.init = pci_acpi_init;
1541
1622 return 0; 1542 return 0;
1623} 1543}
1624 1544
@@ -1643,8 +1563,10 @@ static int __init parse_acpi(char *arg)
1643 } 1563 }
1644 /* Limit ACPI just to boot-time to enable HT */ 1564 /* Limit ACPI just to boot-time to enable HT */
1645 else if (strcmp(arg, "ht") == 0) { 1565 else if (strcmp(arg, "ht") == 0) {
1646 if (!acpi_force) 1566 if (!acpi_force) {
1567 printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
1647 disable_acpi(); 1568 disable_acpi();
1569 }
1648 acpi_ht = 1; 1570 acpi_ht = 1;
1649 } 1571 }
1650 /* acpi=rsdt use RSDT instead of XSDT */ 1572 /* acpi=rsdt use RSDT instead of XSDT */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 82e508677b91..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
162#endif 162#endif
163 if (strncmp(str, "old_ordering", 12) == 0) 163 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 164 acpi_old_suspend_ordering();
165 if (strncmp(str, "sci_force_enable", 16) == 0)
166 acpi_set_sci_en_on_resume();
165 str = strchr(str, ','); 167 str = strchr(str, ',');
166 if (str != NULL) 168 if (str != NULL)
167 str += strspn(str, ", \t"); 169 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 80b222ea4cf6..70237732a6c7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,8 @@
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
9#include <linux/memory.h> 9#include <linux/memory.h>
10#include <linux/stop_machine.h>
11#include <linux/slab.h>
10#include <asm/alternative.h> 12#include <asm/alternative.h>
11#include <asm/sections.h> 13#include <asm/sections.h>
12#include <asm/pgtable.h> 14#include <asm/pgtable.h>
@@ -192,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
192} 194}
193 195
194extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 196extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
195extern u8 *__smp_locks[], *__smp_locks_end[]; 197extern s32 __smp_locks[], __smp_locks_end[];
196static void *text_poke_early(void *addr, const void *opcode, size_t len); 198static void *text_poke_early(void *addr, const void *opcode, size_t len);
197 199
198/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
@@ -233,39 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
233 235
234#ifdef CONFIG_SMP 236#ifdef CONFIG_SMP
235 237
236static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) 238static void alternatives_smp_lock(const s32 *start, const s32 *end,
239 u8 *text, u8 *text_end)
237{ 240{
238 u8 **ptr; 241 const s32 *poff;
239 242
240 mutex_lock(&text_mutex); 243 mutex_lock(&text_mutex);
241 for (ptr = start; ptr < end; ptr++) { 244 for (poff = start; poff < end; poff++) {
242 if (*ptr < text) 245 u8 *ptr = (u8 *)poff + *poff;
243 continue; 246
244 if (*ptr > text_end) 247 if (!*poff || ptr < text || ptr >= text_end)
245 continue; 248 continue;
246 /* turn DS segment override prefix into lock prefix */ 249 /* turn DS segment override prefix into lock prefix */
247 if (**ptr == 0x3e) 250 if (*ptr == 0x3e)
248 text_poke(*ptr, ((unsigned char []){0xf0}), 1); 251 text_poke(ptr, ((unsigned char []){0xf0}), 1);
249 }; 252 };
250 mutex_unlock(&text_mutex); 253 mutex_unlock(&text_mutex);
251} 254}
252 255
253static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 256static void alternatives_smp_unlock(const s32 *start, const s32 *end,
257 u8 *text, u8 *text_end)
254{ 258{
255 u8 **ptr; 259 const s32 *poff;
256 260
257 if (noreplace_smp) 261 if (noreplace_smp)
258 return; 262 return;
259 263
260 mutex_lock(&text_mutex); 264 mutex_lock(&text_mutex);
261 for (ptr = start; ptr < end; ptr++) { 265 for (poff = start; poff < end; poff++) {
262 if (*ptr < text) 266 u8 *ptr = (u8 *)poff + *poff;
263 continue; 267
264 if (*ptr > text_end) 268 if (!*poff || ptr < text || ptr >= text_end)
265 continue; 269 continue;
266 /* turn lock prefix into DS segment override prefix */ 270 /* turn lock prefix into DS segment override prefix */
267 if (**ptr == 0xf0) 271 if (*ptr == 0xf0)
268 text_poke(*ptr, ((unsigned char []){0x3E}), 1); 272 text_poke(ptr, ((unsigned char []){0x3E}), 1);
269 }; 273 };
270 mutex_unlock(&text_mutex); 274 mutex_unlock(&text_mutex);
271} 275}
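A minimal sketch of the address recovery the two loops above now perform: the
.smp_locks section stores s32 offsets relative to each entry's own location,
so the absolute address of the lock prefix is the entry address plus its value
(the helper name is illustrative).

static inline u8 *example_smp_lock_ptr(const s32 *poff)
{
	/* same computation as "u8 *ptr = (u8 *)poff + *poff;" above */
	return (u8 *)poff + *poff;
}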
@@ -276,8 +280,8 @@ struct smp_alt_module {
276 char *name; 280 char *name;
277 281
278 /* ptrs to lock prefixes */ 282 /* ptrs to lock prefixes */
279 u8 **locks; 283 const s32 *locks;
280 u8 **locks_end; 284 const s32 *locks_end;
281 285
282 /* .text segment, needed to avoid patching init code ;) */ 286 /* .text segment, needed to avoid patching init code ;) */
283 u8 *text; 287 u8 *text;
@@ -394,6 +398,27 @@ void alternatives_smp_switch(int smp)
394 mutex_unlock(&smp_alt); 398 mutex_unlock(&smp_alt);
395} 399}
396 400
401/* Return 1 if the address range is reserved for smp-alternatives */
402int alternatives_text_reserved(void *start, void *end)
403{
404 struct smp_alt_module *mod;
405 const s32 *poff;
406 u8 *text_start = start;
407 u8 *text_end = end;
408
409 list_for_each_entry(mod, &smp_alt_modules, next) {
410 if (mod->text > text_end || mod->text_end < text_start)
411 continue;
412 for (poff = mod->locks; poff < mod->locks_end; poff++) {
413 const u8 *ptr = (const u8 *)poff + *poff;
414
415 if (text_start <= ptr && text_end > ptr)
416 return 1;
417 }
418 }
419
420 return 0;
421}
397#endif 422#endif
398 423
399#ifdef CONFIG_PARAVIRT 424#ifdef CONFIG_PARAVIRT
@@ -556,3 +581,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
556 local_irq_restore(flags); 581 local_irq_restore(flags);
557 return addr; 582 return addr;
558} 583}
584
585/*
586 * Cross-modifying kernel text with stop_machine().
587 * This code originally comes from immediate value.
588 */
589static atomic_t stop_machine_first;
590static int wrote_text;
591
592struct text_poke_params {
593 void *addr;
594 const void *opcode;
595 size_t len;
596};
597
598static int __kprobes stop_machine_text_poke(void *data)
599{
600 struct text_poke_params *tpp = data;
601
602 if (atomic_dec_and_test(&stop_machine_first)) {
603 text_poke(tpp->addr, tpp->opcode, tpp->len);
604 smp_wmb(); /* Make sure other cpus see that this has run */
605 wrote_text = 1;
606 } else {
607 while (!wrote_text)
608 cpu_relax();
609 smp_mb(); /* Load wrote_text before following execution */
610 }
611
612 flush_icache_range((unsigned long)tpp->addr,
613 (unsigned long)tpp->addr + tpp->len);
614 return 0;
615}
616
617/**
618 * text_poke_smp - Update instructions on a live kernel on SMP
619 * @addr: address to modify
620 * @opcode: source of the copy
621 * @len: length to copy
622 *
623 * Modify a multi-byte instruction by using stop_machine() on SMP. This allows
624 * the caller to poke/set multi-byte text on SMP. Only code that is never run
625 * from NMI/MCE handlers should be modified this way, since stop_machine() does
626 * _not_ protect code against NMI and MCE.
627 *
628 * Note: Must be called under get_online_cpus() and text_mutex.
629 */
630void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
631{
632 struct text_poke_params tpp;
633
634 tpp.addr = addr;
635 tpp.opcode = opcode;
636 tpp.len = len;
637 atomic_set(&stop_machine_first, 1);
638 wrote_text = 0;
639 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
640 return addr;
641}
642
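A hedged caller sketch for text_poke_smp(), following the locking rule stated
in its comment above (must be called under get_online_cpus() and text_mutex);
the wrapper name is illustrative.

static void example_patch_text(void *addr, const void *new_insn, size_t len)
{
	get_online_cpus();
	mutex_lock(&text_mutex);
	text_poke_smp(addr, new_insn, len);
	mutex_unlock(&text_mutex);
	put_online_cpus();
}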
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 23824fef789c..f854d89b7edf 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -18,8 +18,8 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h>
22#include <linux/bitmap.h> 21#include <linux/bitmap.h>
22#include <linux/slab.h>
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
@@ -118,7 +118,7 @@ static bool check_device(struct device *dev)
118 return false; 118 return false;
119 119
120 /* No device or no PCI device */ 120 /* No device or no PCI device */
121 if (!dev || dev->bus != &pci_bus_type) 121 if (dev->bus != &pci_bus_type)
122 return false; 122 return false;
123 123
124 devid = get_device_id(dev); 124 devid = get_device_id(dev);
@@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
392 u32 tail, head; 392 u32 tail, head;
393 u8 *target; 393 u8 *target;
394 394
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
395 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
396 target = iommu->cmd_buf + tail; 397 target = iommu->cmd_buf + tail;
397 memcpy_toio(target, cmd, sizeof(*cmd)); 398 memcpy_toio(target, cmd, sizeof(*cmd));
@@ -980,7 +981,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
980{ 981{
981 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 982 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
982 struct amd_iommu *iommu; 983 struct amd_iommu *iommu;
983 int i; 984 unsigned long i;
984 985
985#ifdef CONFIG_IOMMU_STRESS 986#ifdef CONFIG_IOMMU_STRESS
986 populate = false; 987 populate = false;
@@ -1489,11 +1490,14 @@ static void __detach_device(struct device *dev)
1489{ 1490{
1490 struct iommu_dev_data *dev_data = get_dev_data(dev); 1491 struct iommu_dev_data *dev_data = get_dev_data(dev);
1491 struct iommu_dev_data *alias_data; 1492 struct iommu_dev_data *alias_data;
1493 struct protection_domain *domain;
1492 unsigned long flags; 1494 unsigned long flags;
1493 1495
1494 BUG_ON(!dev_data->domain); 1496 BUG_ON(!dev_data->domain);
1495 1497
1496 spin_lock_irqsave(&dev_data->domain->lock, flags); 1498 domain = dev_data->domain;
1499
1500 spin_lock_irqsave(&domain->lock, flags);
1497 1501
1498 if (dev_data->alias != dev) { 1502 if (dev_data->alias != dev) {
1499 alias_data = get_dev_data(dev_data->alias); 1503 alias_data = get_dev_data(dev_data->alias);
@@ -1504,13 +1508,15 @@ static void __detach_device(struct device *dev)
1504 if (atomic_dec_and_test(&dev_data->bind)) 1508 if (atomic_dec_and_test(&dev_data->bind))
1505 do_detach(dev); 1509 do_detach(dev);
1506 1510
1507 spin_unlock_irqrestore(&dev_data->domain->lock, flags); 1511 spin_unlock_irqrestore(&domain->lock, flags);
1508 1512
1509 /* 1513 /*
1510 * If we run in passthrough mode the device must be assigned to the 1514 * If we run in passthrough mode the device must be assigned to the
1511 * passthrough domain if it is detached from any other domain 1515 * passthrough domain if it is detached from any other domain.
1516 * Make sure we can deassign from the pt_domain itself.
1512 */ 1517 */
1513 if (iommu_pass_through && dev_data->domain == NULL) 1518 if (iommu_pass_through &&
1519 (dev_data->domain == NULL && domain != pt_domain))
1514 __attach_device(dev, pt_domain); 1520 __attach_device(dev, pt_domain);
1515} 1521}
1516 1522
@@ -2181,7 +2187,7 @@ static void prealloc_protection_domains(void)
2181 struct dma_ops_domain *dma_dom; 2187 struct dma_ops_domain *dma_dom;
2182 u16 devid; 2188 u16 devid;
2183 2189
2184 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2190 for_each_pci_dev(dev) {
2185 2191
2186 /* Do we handle this device? */ 2192 /* Do we handle this device? */
2187 if (!check_device(&dev->dev)) 2193 if (!check_device(&dev->dev))
@@ -2218,6 +2224,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2218/* 2224/*
2219 * The function which clues the AMD IOMMU driver into dma_ops. 2225 * The function which clues the AMD IOMMU driver into dma_ops.
2220 */ 2226 */
2227
2228void __init amd_iommu_init_api(void)
2229{
2230 register_iommu(&amd_iommu_ops);
2231}
2232
2221int __init amd_iommu_init_dma_ops(void) 2233int __init amd_iommu_init_dma_ops(void)
2222{ 2234{
2223 struct amd_iommu *iommu; 2235 struct amd_iommu *iommu;
@@ -2253,8 +2265,6 @@ int __init amd_iommu_init_dma_ops(void)
2253 /* Make the driver finally visible to the drivers */ 2265 /* Make the driver finally visible to the drivers */
2254 dma_ops = &amd_iommu_dma_ops; 2266 dma_ops = &amd_iommu_dma_ops;
2255 2267
2256 register_iommu(&amd_iommu_ops);
2257
2258 amd_iommu_stats_init(); 2268 amd_iommu_stats_init();
2259 2269
2260 return 0; 2270 return 0;
@@ -2289,7 +2299,7 @@ static void cleanup_domain(struct protection_domain *domain)
2289 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { 2299 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2290 struct device *dev = dev_data->dev; 2300 struct device *dev = dev_data->dev;
2291 2301
2292 do_detach(dev); 2302 __detach_device(dev);
2293 atomic_set(&dev_data->bind, 0); 2303 atomic_set(&dev_data->bind, 0);
2294 } 2304 }
2295 2305
@@ -2318,6 +2328,7 @@ static struct protection_domain *protection_domain_alloc(void)
2318 return NULL; 2328 return NULL;
2319 2329
2320 spin_lock_init(&domain->lock); 2330 spin_lock_init(&domain->lock);
2331 mutex_init(&domain->api_lock);
2321 domain->id = domain_id_alloc(); 2332 domain->id = domain_id_alloc();
2322 if (!domain->id) 2333 if (!domain->id)
2323 goto out_err; 2334 goto out_err;
@@ -2370,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2370 2381
2371 free_pagetable(domain); 2382 free_pagetable(domain);
2372 2383
2373 domain_id_free(domain->id); 2384 protection_domain_free(domain);
2374
2375 kfree(domain);
2376 2385
2377 dom->priv = NULL; 2386 dom->priv = NULL;
2378} 2387}
@@ -2447,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2447 iova &= PAGE_MASK; 2456 iova &= PAGE_MASK;
2448 paddr &= PAGE_MASK; 2457 paddr &= PAGE_MASK;
2449 2458
2459 mutex_lock(&domain->api_lock);
2460
2450 for (i = 0; i < npages; ++i) { 2461 for (i = 0; i < npages; ++i) {
2451 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); 2462 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2452 if (ret) 2463 if (ret)
@@ -2456,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2456 paddr += PAGE_SIZE; 2467 paddr += PAGE_SIZE;
2457 } 2468 }
2458 2469
2470 mutex_unlock(&domain->api_lock);
2471
2459 return 0; 2472 return 0;
2460} 2473}
2461 2474
@@ -2468,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2468 2481
2469 iova &= PAGE_MASK; 2482 iova &= PAGE_MASK;
2470 2483
2484 mutex_lock(&domain->api_lock);
2485
2471 for (i = 0; i < npages; ++i) { 2486 for (i = 0; i < npages; ++i) {
2472 iommu_unmap_page(domain, iova, PM_MAP_4k); 2487 iommu_unmap_page(domain, iova, PM_MAP_4k);
2473 iova += PAGE_SIZE; 2488 iova += PAGE_SIZE;
2474 } 2489 }
2475 2490
2476 iommu_flush_tlb_pde(domain); 2491 iommu_flush_tlb_pde(domain);
2492
2493 mutex_unlock(&domain->api_lock);
2477} 2494}
2478 2495
2479static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2496static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 1dca9c34eaeb..6360abf993d4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -19,8 +19,8 @@
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/gfp.h>
23#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
@@ -138,6 +138,11 @@ int amd_iommus_present;
138bool amd_iommu_np_cache __read_mostly; 138bool amd_iommu_np_cache __read_mostly;
139 139
140/* 140/*
141 * The ACPI table parsing functions set this variable on an error
142 */
143static int __initdata amd_iommu_init_err;
144
145/*
141 * List of protection domains - used during resume 146 * List of protection domains - used during resume
142 */ 147 */
143LIST_HEAD(amd_iommu_pd_list); 148LIST_HEAD(amd_iommu_pd_list);
@@ -386,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
386 */ 391 */
387 for (i = 0; i < table->length; ++i) 392 for (i = 0; i < table->length; ++i)
388 checksum += p[i]; 393 checksum += p[i];
389 if (checksum != 0) 394 if (checksum != 0) {
390 /* ACPI table corrupt */ 395 /* ACPI table corrupt */
391 return -ENODEV; 396 amd_iommu_init_err = -ENODEV;
397 return 0;
398 }
392 399
393 p += IVRS_HEADER_LENGTH; 400 p += IVRS_HEADER_LENGTH;
394 401
@@ -431,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
431 if (cmd_buf == NULL) 438 if (cmd_buf == NULL)
432 return NULL; 439 return NULL;
433 440
434 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 441 iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
435 442
436 return cmd_buf; 443 return cmd_buf;
437} 444}
@@ -467,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
467 &entry, sizeof(entry)); 474 &entry, sizeof(entry));
468 475
469 amd_iommu_reset_cmd_buffer(iommu); 476 amd_iommu_reset_cmd_buffer(iommu);
477 iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
470} 478}
471 479
472static void __init free_command_buffer(struct amd_iommu *iommu) 480static void __init free_command_buffer(struct amd_iommu *iommu)
473{ 481{
474 free_pages((unsigned long)iommu->cmd_buf, 482 free_pages((unsigned long)iommu->cmd_buf,
475 get_order(iommu->cmd_buf_size)); 483 get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
476} 484}
477 485
478/* allocates the memory where the IOMMU will log its events to */ 486/* allocates the memory where the IOMMU will log its events to */
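
The CMD_BUFFER_UNINITIALIZED change above packs a state bit into the same field that holds the command-buffer size, so every consumer of iommu->cmd_buf_size has to mask the flag off, exactly as free_command_buffer() now does. A small sketch of that encode/decode, assuming the flag is a single bit above the size range (the real constant lives in amd_iommu_types.h and is not shown in this diff; helper names are hypothetical):

/* assumption for illustration only; the real value is defined elsewhere */
#define CMD_BUFFER_UNINITIALIZED	(1UL << 24)

static inline unsigned long cmd_buf_bytes(struct amd_iommu *iommu)
{
	return iommu->cmd_buf_size & ~CMD_BUFFER_UNINITIALIZED;
}

static inline bool cmd_buf_initialized(struct amd_iommu *iommu)
{
	return !(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
}
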
@@ -915,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
915 h->mmio_phys); 923 h->mmio_phys);
916 924
917 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 925 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
918 if (iommu == NULL) 926 if (iommu == NULL) {
919 return -ENOMEM; 927 amd_iommu_init_err = -ENOMEM;
928 return 0;
929 }
930
920 ret = init_iommu_one(iommu, h); 931 ret = init_iommu_one(iommu, h);
921 if (ret) 932 if (ret) {
922 return ret; 933 amd_iommu_init_err = ret;
934 return 0;
935 }
923 break; 936 break;
924 default: 937 default:
925 break; 938 break;
@@ -1204,6 +1217,10 @@ static int __init amd_iommu_init(void)
1204 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 1217 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1205 return -ENODEV; 1218 return -ENODEV;
1206 1219
1220 ret = amd_iommu_init_err;
1221 if (ret)
1222 goto out;
1223
1207 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); 1224 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1208 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); 1225 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1209 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); 1226 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1263,9 +1280,19 @@ static int __init amd_iommu_init(void)
1263 if (acpi_table_parse("IVRS", init_iommu_all) != 0) 1280 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1264 goto free; 1281 goto free;
1265 1282
1283 if (amd_iommu_init_err) {
1284 ret = amd_iommu_init_err;
1285 goto free;
1286 }
1287
1266 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1288 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1267 goto free; 1289 goto free;
1268 1290
1291 if (amd_iommu_init_err) {
1292 ret = amd_iommu_init_err;
1293 goto free;
1294 }
1295
1269 ret = sysdev_class_register(&amd_iommu_sysdev_class); 1296 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1270 if (ret) 1297 if (ret)
1271 goto free; 1298 goto free;
@@ -1278,16 +1305,19 @@ static int __init amd_iommu_init(void)
1278 if (ret) 1305 if (ret)
1279 goto free; 1306 goto free;
1280 1307
1308 enable_iommus();
1309
1281 if (iommu_pass_through) 1310 if (iommu_pass_through)
1282 ret = amd_iommu_init_passthrough(); 1311 ret = amd_iommu_init_passthrough();
1283 else 1312 else
1284 ret = amd_iommu_init_dma_ops(); 1313 ret = amd_iommu_init_dma_ops();
1314
1285 if (ret) 1315 if (ret)
1286 goto free; 1316 goto free;
1287 1317
1288 amd_iommu_init_notifier(); 1318 amd_iommu_init_api();
1289 1319
1290 enable_iommus(); 1320 amd_iommu_init_notifier();
1291 1321
1292 if (iommu_pass_through) 1322 if (iommu_pass_through)
1293 goto out; 1323 goto out;
@@ -1302,6 +1332,7 @@ out:
1302 return ret; 1332 return ret;
1303 1333
1304free: 1334free:
1335 disable_iommus();
1305 1336
1306 amd_iommu_uninit_devices(); 1337 amd_iommu_uninit_devices();
1307 1338
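
The error handling added throughout this file works around the fact that the IVRS parsing callbacks run under acpi_table_parse(), whose return value only distinguishes "table handled" from "table missing". The callbacks therefore stash the real errno in the static amd_iommu_init_err and return 0, and amd_iommu_init() re-checks the variable after each parse pass. A compressed sketch of that flow; the callback body and function names are hypothetical, the caller side mirrors the hunks above:

static int __initdata amd_iommu_init_err;

static int __init ivrs_parse_cb(struct acpi_table_header *table)
{
	if (/* some allocation or consistency check fails */ 0) {
		amd_iommu_init_err = -ENOMEM;	/* remember the real reason */
		return 0;			/* don't confuse the table walk */
	}
	return 0;
}

static int __init iommu_init_sketch(void)
{
	int ret;

	if (acpi_table_parse("IVRS", ivrs_parse_cb) != 0)
		return -ENODEV;			/* table really missing */

	ret = amd_iommu_init_err;		/* error recorded by the callback? */
	if (ret)
		return ret;

	return 0;
}
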
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 000000000000..ff469e470059
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,785 @@
1/*
2 * apb_timer.c: Driver for Langwell APB timers
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
 13 * Langwell is the south complex of the Intel Moorestown MID platform. There
 14 * are eight external timers in total that can be used by the operating system.
 15 * The timer information, such as frequency and addresses, is provided to the
 16 * OS via SFI tables.
 17 * Timer interrupts are routed through the FW/HW-emulated IOAPIC, each with its
 18 * own redirection table entry (RTE).
 19 * Unlike the HPET, there is no master counter, so one of the timers is used as
 20 * the clocksource. The overall allocation looks like:
 21 * - timers 0 .. NR_CPUS-1 for the per-cpu clockevent timers
 22 * - one timer for the clocksource
 23 * - one timer for the watchdog driver.
 24 * It is also worth noting that the APB timer does not support a true one-shot
 25 * mode; free-running mode is used here to emulate one-shot behaviour.
 26 * The APB timer can also be used as a broadcast timer along with the per-cpu
 27 * local APIC timer, but by default the APB timer has a higher rating.
28 */
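/*
 * A compact restatement of the allocation described above; illustrative only,
 * the driver's real indices are the APBT_*_NUM constants defined below:
 *
 *	timer 0 .. NR_CPUS-1	-> per-cpu clockevent devices
 *	one further timer	-> free-running clocksource
 *	one further timer	-> watchdog driver
 *
 * One-shot behaviour is emulated by running a timer in free-running mode and
 * re-arming the load count from apbt_next_event() on every event.
 */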
29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h>
33#include <linux/errno.h>
34#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/slab.h>
37#include <linux/pm.h>
38#include <linux/pci.h>
39#include <linux/sfi.h>
40#include <linux/interrupt.h>
41#include <linux/cpu.h>
42#include <linux/irq.h>
43
44#include <asm/fixmap.h>
45#include <asm/apb_timer.h>
46
47#define APBT_MASK CLOCKSOURCE_MASK(32)
48#define APBT_SHIFT 22
49#define APBT_CLOCKEVENT_RATING 150
50#define APBT_CLOCKSOURCE_RATING 250
51#define APBT_MIN_DELTA_USEC 200
52
53#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
54#define APBT_CLOCKEVENT0_NUM (0)
55#define APBT_CLOCKEVENT1_NUM (1)
56#define APBT_CLOCKSOURCE_NUM (2)
57
58static unsigned long apbt_address;
59static int apb_timer_block_enabled;
60static void __iomem *apbt_virt_address;
61static int phy_cs_timer_id;
62
63/*
64 * Common DW APB timer info
65 */
66static uint64_t apbt_freq;
67
68static void apbt_set_mode(enum clock_event_mode mode,
69 struct clock_event_device *evt);
70static int apbt_next_event(unsigned long delta,
71 struct clock_event_device *evt);
72static cycle_t apbt_read_clocksource(struct clocksource *cs);
73static void apbt_restart_clocksource(struct clocksource *cs);
74
75struct apbt_dev {
76 struct clock_event_device evt;
77 unsigned int num;
78 int cpu;
79 unsigned int irq;
80 unsigned int tick;
81 unsigned int count;
82 unsigned int flags;
83 char name[10];
84};
85
86int disable_apbt_percpu __cpuinitdata;
87
88static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
89
90#ifdef CONFIG_SMP
91static unsigned int apbt_num_timers_used;
92static struct apbt_dev *apbt_devs;
93#endif
94
95static inline unsigned long apbt_readl_reg(unsigned long a)
96{
97 return readl(apbt_virt_address + a);
98}
99
100static inline void apbt_writel_reg(unsigned long d, unsigned long a)
101{
102 writel(d, apbt_virt_address + a);
103}
104
105static inline unsigned long apbt_readl(int n, unsigned long a)
106{
107 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
108}
109
110static inline void apbt_writel(int n, unsigned long d, unsigned long a)
111{
112 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
113}
114
115static inline void apbt_set_mapping(void)
116{
117 struct sfi_timer_table_entry *mtmr;
118
119 if (apbt_virt_address) {
120 pr_debug("APBT base already mapped\n");
121 return;
122 }
123 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
124 if (mtmr == NULL) {
125 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
126 APBT_CLOCKEVENT0_NUM);
127 return;
128 }
129 apbt_address = (unsigned long)mtmr->phys_addr;
130 if (!apbt_address) {
131 printk(KERN_WARNING "No timer base from SFI, use default\n");
132 apbt_address = APBT_DEFAULT_BASE;
133 }
134 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
135 if (apbt_virt_address) {
 136 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",
 137 (void *)apbt_address, (void *)apbt_virt_address);
 138 } else {
 139 pr_debug("Failed mapping APBT phy address at %p\n",
140 (void *)apbt_address);
141 goto panic_noapbt;
142 }
143 apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
144 sfi_free_mtmr(mtmr);
145
146 /* Now figure out the physical timer id for clocksource device */
147 mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
148 if (mtmr == NULL)
149 goto panic_noapbt;
150
151 /* Now figure out the physical timer id */
152 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
153 / APBTMRS_REG_SIZE;
154 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
155 return;
156
157panic_noapbt:
158 panic("Failed to setup APB system timer\n");
159
160}
161
162static inline void apbt_clear_mapping(void)
163{
164 iounmap(apbt_virt_address);
165 apbt_virt_address = NULL;
166}
167
168/*
169 * APBT timer interrupt enable / disable
170 */
171static inline int is_apbt_capable(void)
172{
173 return apbt_virt_address ? 1 : 0;
174}
175
176static struct clocksource clocksource_apbt = {
177 .name = "apbt",
178 .rating = APBT_CLOCKSOURCE_RATING,
179 .read = apbt_read_clocksource,
180 .mask = APBT_MASK,
181 .shift = APBT_SHIFT,
182 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
183 .resume = apbt_restart_clocksource,
184};
185
186/* boot APB clock event device */
187static struct clock_event_device apbt_clockevent = {
188 .name = "apbt0",
189 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
190 .set_mode = apbt_set_mode,
191 .set_next_event = apbt_next_event,
192 .shift = APBT_SHIFT,
193 .irq = 0,
194 .rating = APBT_CLOCKEVENT_RATING,
195};
196
197/*
 198 * If the user does not want to use the per-CPU APB timer, just give it a
 199 * lower rating than the local APIC timer and skip the late per-cpu timer init.
200 */
201static inline int __init setup_x86_mrst_timer(char *arg)
202{
203 if (!arg)
204 return -EINVAL;
205
206 if (strcmp("apbt_only", arg) == 0)
207 disable_apbt_percpu = 0;
208 else if (strcmp("lapic_and_apbt", arg) == 0)
209 disable_apbt_percpu = 1;
210 else {
 211 pr_warning("X86 MRST timer option %s not recognised,"
 212 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
213 arg);
214 return -EINVAL;
215 }
216 return 0;
217}
218__setup("x86_mrst_timer=", setup_x86_mrst_timer);
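/*
 * Usage sketch for the option parsed above (kernel command line); the two
 * strings are the ones matched in setup_x86_mrst_timer():
 *
 *	x86_mrst_timer=apbt_only	- use the per-cpu APB timers
 *	x86_mrst_timer=lapic_and_apbt	- use local APIC timers per cpu and the
 *					  APB timer as the global clockevent
 */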
219
220/*
221 * start count down from 0xffff_ffff. this is done by toggling the enable bit
222 * then load initial load count to ~0.
223 */
224static void apbt_start_counter(int n)
225{
226 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
227
228 ctrl &= ~APBTMR_CONTROL_ENABLE;
229 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
230 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
231 /* enable, mask interrupt */
232 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
233 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
234 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
235 /* read it once to get cached counter value initialized */
236 apbt_read_clocksource(&clocksource_apbt);
237}
238
239static irqreturn_t apbt_interrupt_handler(int irq, void *data)
240{
241 struct apbt_dev *dev = (struct apbt_dev *)data;
242 struct clock_event_device *aevt = &dev->evt;
243
244 if (!aevt->event_handler) {
245 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
246 dev->num);
247 return IRQ_NONE;
248 }
249 aevt->event_handler(aevt);
250 return IRQ_HANDLED;
251}
252
253static void apbt_restart_clocksource(struct clocksource *cs)
254{
255 apbt_start_counter(phy_cs_timer_id);
256}
257
258/* Setup IRQ routing via IOAPIC */
259#ifdef CONFIG_SMP
260static void apbt_setup_irq(struct apbt_dev *adev)
261{
262 struct irq_chip *chip;
263 struct irq_desc *desc;
264
265 /* timer0 irq has been setup early */
266 if (adev->irq == 0)
267 return;
268 desc = irq_to_desc(adev->irq);
269 chip = get_irq_chip(adev->irq);
270 disable_irq(adev->irq);
271 desc->status |= IRQ_MOVE_PCNTXT;
272 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 273 /* APB timer irqs are set up as mp_irqs, timer is edge triggered */
274 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
275 enable_irq(adev->irq);
276 if (system_state == SYSTEM_BOOTING)
277 if (request_irq(adev->irq, apbt_interrupt_handler,
278 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
279 adev->name, adev)) {
280 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
281 adev->num);
282 }
283}
284#endif
285
286static void apbt_enable_int(int n)
287{
288 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
289 /* clear pending intr */
290 apbt_readl(n, APBTMR_N_EOI);
291 ctrl &= ~APBTMR_CONTROL_INT;
292 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
293}
294
295static void apbt_disable_int(int n)
296{
297 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
298
299 ctrl |= APBTMR_CONTROL_INT;
300 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
301}
302
303
304static int __init apbt_clockevent_register(void)
305{
306 struct sfi_timer_table_entry *mtmr;
307 struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
308
309 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
310 if (mtmr == NULL) {
311 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
312 APBT_CLOCKEVENT0_NUM);
313 return -ENODEV;
314 }
315
316 /*
317 * We need to calculate the scaled math multiplication factor for
318 * nanosecond to apbt tick conversion.
319 * mult = (nsec/cycle)*2^APBT_SHIFT
320 */
321 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
322 , NSEC_PER_SEC, APBT_SHIFT);
323
324 /* Calculate the min / max delta */
325 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
326 &apbt_clockevent);
327 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
328 APBT_MIN_DELTA_USEC*apbt_freq,
329 &apbt_clockevent);
330 /*
331 * Start apbt with the boot cpu mask and make it
332 * global if not used for per cpu timer.
333 */
334 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
335 adev->num = smp_processor_id();
336 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
337
338 if (disable_apbt_percpu) {
339 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
340 global_clock_event = &adev->evt;
341 printk(KERN_DEBUG "%s clockevent registered as global\n",
342 global_clock_event->name);
343 }
344
345 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
346 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
347 apbt_clockevent.name, adev)) {
348 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
349 apbt_clockevent.irq);
350 }
351
352 clockevents_register_device(&adev->evt);
353 /* Start APBT 0 interrupts */
354 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
355
356 sfi_free_mtmr(mtmr);
357 return 0;
358}
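/*
 * Worked example of the scaled math above, for a hypothetical 1 MHz timer
 * (mtmr->freq_hz = 1,000,000) and APBT_SHIFT = 22:
 *
 *	mult = div_sc(1000000, NSEC_PER_SEC, 22)
 *	     = (1000000 << 22) / 1000000000  ~= 4194
 *
 * Converting one HZ=100 period back, as apbt_set_mode() does for periodic
 * mode: (10,000,000 ns * 4194) >> 22 ~= 10,000 timer ticks, i.e. 10 ms at
 * 1 MHz, as expected.
 */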
359
360#ifdef CONFIG_SMP
 361/* Should be called once per CPU (it runs on the CPU being brought up) */
362void apbt_setup_secondary_clock(void)
363{
364 struct apbt_dev *adev;
365 struct clock_event_device *aevt;
366 int cpu;
367
368 /* Don't register boot CPU clockevent */
369 cpu = smp_processor_id();
370 if (cpu == boot_cpu_id)
371 return;
372 /*
373 * We need to calculate the scaled math multiplication factor for
374 * nanosecond to apbt tick conversion.
375 * mult = (nsec/cycle)*2^APBT_SHIFT
376 */
377 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
378 adev = &per_cpu(cpu_apbt_dev, cpu);
379 aevt = &adev->evt;
380
381 memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
382 aevt->cpumask = cpumask_of(cpu);
383 aevt->name = adev->name;
384 aevt->mode = CLOCK_EVT_MODE_UNUSED;
385
386 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
387 cpu, aevt->name, *(u32 *)aevt->cpumask);
388
389 apbt_setup_irq(adev);
390
391 clockevents_register_device(aevt);
392
393 apbt_enable_int(cpu);
394
395 return;
396}
397
398/*
 399 * This notifier handler processes CPU hotplug events. In the S0i3 case,
 400 * nonboot cpus are disabled/enabled frequently; for performance reasons we
 401 * keep the per-cpu timer irq registered so that we do not need to do
 402 * free_irq/request_irq on every transition.
 403 * TODO: it might be more reliable to directly disable the percpu clockevent
 404 * device without the notifier chain. Currently, cpu 0 may get interrupts from
 405 * other cpu timers during the offline process due to the ordering of
 406 * notification. The extra interrupt is harmless.
407 */
408static int apbt_cpuhp_notify(struct notifier_block *n,
409 unsigned long action, void *hcpu)
410{
411 unsigned long cpu = (unsigned long)hcpu;
412 struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
413
414 switch (action & 0xf) {
415 case CPU_DEAD:
416 apbt_disable_int(cpu);
417 if (system_state == SYSTEM_RUNNING)
418 pr_debug("skipping APBT CPU %lu offline\n", cpu);
419 else if (adev) {
420 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
421 free_irq(adev->irq, adev);
422 }
423 break;
424 default:
 425 pr_debug("APBT notified %lu, no action\n", action);
426 }
427 return NOTIFY_OK;
428}
429
430static __init int apbt_late_init(void)
431{
432 if (disable_apbt_percpu)
433 return 0;
434 /* This notifier should be called after workqueue is ready */
435 hotcpu_notifier(apbt_cpuhp_notify, -20);
436 return 0;
437}
438fs_initcall(apbt_late_init);
439#else
440
441void apbt_setup_secondary_clock(void) {}
442
443#endif /* CONFIG_SMP */
444
445static void apbt_set_mode(enum clock_event_mode mode,
446 struct clock_event_device *evt)
447{
448 unsigned long ctrl;
449 uint64_t delta;
450 int timer_num;
451 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
452
453 timer_num = adev->num;
454 pr_debug("%s CPU %d timer %d mode=%d\n",
455 __func__, first_cpu(*evt->cpumask), timer_num, mode);
456
457 switch (mode) {
458 case CLOCK_EVT_MODE_PERIODIC:
459 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
460 delta >>= apbt_clockevent.shift;
461 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
462 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
463 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
464 /*
 465 * Per the DW APB databook p. 46, the timer has to be disabled before
 466 * loading the counter; otherwise it may cause a sync problem.
467 */
468 ctrl &= ~APBTMR_CONTROL_ENABLE;
469 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
470 udelay(1);
471 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
472 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
473 ctrl |= APBTMR_CONTROL_ENABLE;
474 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
475 break;
476 /* APB timer does not have one-shot mode, use free running mode */
477 case CLOCK_EVT_MODE_ONESHOT:
478 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
479 /*
 480 * Set free-running mode. This mode lets the timer reload the maximum
 481 * timeout, which gives enough time (about 3 min on a 25 MHz clock) to
 482 * rearm the next event and therefore emulates one-shot mode.
483 */
484 ctrl &= ~APBTMR_CONTROL_ENABLE;
485 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
486
487 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
488 /* write again to set free running mode */
489 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
490
491 /*
492 * DW APB p. 46, load counter with all 1s before starting free
493 * running mode.
494 */
495 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
496 ctrl &= ~APBTMR_CONTROL_INT;
497 ctrl |= APBTMR_CONTROL_ENABLE;
498 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
499 break;
500
501 case CLOCK_EVT_MODE_UNUSED:
502 case CLOCK_EVT_MODE_SHUTDOWN:
503 apbt_disable_int(timer_num);
504 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
505 ctrl &= ~APBTMR_CONTROL_ENABLE;
506 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
507 break;
508
509 case CLOCK_EVT_MODE_RESUME:
510 apbt_enable_int(timer_num);
511 break;
512 }
513}
514
515static int apbt_next_event(unsigned long delta,
516 struct clock_event_device *evt)
517{
518 unsigned long ctrl;
519 int timer_num;
520
521 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
522
523 timer_num = adev->num;
524 /* Disable timer */
525 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
526 ctrl &= ~APBTMR_CONTROL_ENABLE;
527 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
528 /* write new count */
529 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
530 ctrl |= APBTMR_CONTROL_ENABLE;
531 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
532 return 0;
533}
534
535/*
 536 * The APB timer clock is not in sync with pclk on Langwell, which translates
 537 * to unreliable read values caused by sampling error. The error does not add
 538 * up over time and only happens when a 0 is sampled as a 1 by mistake, so the
 539 * time would appear to go backwards. The following code tries to prevent time
 540 * from traveling backwards; it is a little bit paranoid.
541 */
542static cycle_t apbt_read_clocksource(struct clocksource *cs)
543{
544 unsigned long t0, t1, t2;
545 static unsigned long last_read;
546
547bad_count:
548 t1 = apbt_readl(phy_cs_timer_id,
549 APBTMR_N_CURRENT_VALUE);
550 t2 = apbt_readl(phy_cs_timer_id,
551 APBTMR_N_CURRENT_VALUE);
552 if (unlikely(t1 < t2)) {
553 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
554 t1, t2, t2 - t1);
555 goto bad_count;
556 }
557 /*
 558 * Check against the cached last read to make sure time does not go back.
 559 * It could be a normal rollover, but we do a triple check anyway.
560 */
561 if (unlikely(t2 > last_read)) {
562 /* check if we have a normal rollover */
563 unsigned long raw_intr_status =
564 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
565 /*
566 * cs timer interrupt is masked but raw intr bit is set if
567 * rollover occurs. then we read EOI reg to clear it.
568 */
569 if (raw_intr_status & (1 << phy_cs_timer_id)) {
570 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
571 goto out;
572 }
573 pr_debug("APB CS going back %lx:%lx:%lx ",
574 t2, last_read, t2 - last_read);
575bad_count_x3:
 576 pr_debug("triple check enforced\n");
577 t0 = apbt_readl(phy_cs_timer_id,
578 APBTMR_N_CURRENT_VALUE);
579 udelay(1);
580 t1 = apbt_readl(phy_cs_timer_id,
581 APBTMR_N_CURRENT_VALUE);
582 udelay(1);
583 t2 = apbt_readl(phy_cs_timer_id,
584 APBTMR_N_CURRENT_VALUE);
585 if ((t2 > t1) || (t1 > t0)) {
 586 printk(KERN_ERR "Error: APB CS triple check failed\n");
587 goto bad_count_x3;
588 }
589 }
590out:
591 last_read = t2;
592 return (cycle_t)~t2;
593}
594
595static int apbt_clocksource_register(void)
596{
597 u64 start, now;
598 cycle_t t1;
599
600 /* Start the counter, use timer 2 as source, timer 0/1 for event */
601 apbt_start_counter(phy_cs_timer_id);
602
603 /* Verify whether apbt counter works */
604 t1 = apbt_read_clocksource(&clocksource_apbt);
605 rdtscll(start);
606
607 /*
608 * We don't know the TSC frequency yet, but waiting for
609 * 200000 TSC cycles is safe:
610 * 4 GHz == 50us
611 * 1 GHz == 200us
612 */
613 do {
614 rep_nop();
615 rdtscll(now);
616 } while ((now - start) < 200000UL);
617
618 /* APBT is the only always on clocksource, it has to work! */
619 if (t1 == apbt_read_clocksource(&clocksource_apbt))
620 panic("APBT counter not counting. APBT disabled\n");
621
622 /*
 623 * Initialize and register the APBT clocksource; convert the timer
 624 * frequency to ns per clock cycle:
 625 * mult = (ns/cycle) * 2^APBT_SHIFT
626 */
627 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
628 (unsigned long) apbt_freq, APBT_SHIFT);
629 clocksource_register(&clocksource_apbt);
630
631 return 0;
632}
633
634/*
 635 * Early setup of the APBT timer: only timer 0 is used for booting, then we
 636 * switch to per-CPU timers if possible. The function returns void;
 637 * apb_timer_block_enabled records whether the block was set up, and per-cpu
 638 * use depends on disable_apbt_percpu.
 639 * Panics if setup fails, since this is the only platform timer on Moorestown.
640 */
641void __init apbt_time_init(void)
642{
643#ifdef CONFIG_SMP
644 int i;
645 struct sfi_timer_table_entry *p_mtmr;
646 unsigned int percpu_timer;
647 struct apbt_dev *adev;
648#endif
649
650 if (apb_timer_block_enabled)
651 return;
652 apbt_set_mapping();
653 if (apbt_virt_address) {
 654 pr_debug("Found APBT version 0x%lx\n",
655 apbt_readl_reg(APBTMRS_COMP_VERSION));
656 } else
657 goto out_noapbt;
658 /*
659 * Read the frequency and check for a sane value, for ESL model
660 * we extend the possible clock range to allow time scaling.
661 */
662
663 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
664 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
665 goto out_noapbt;
666 }
667 if (apbt_clocksource_register()) {
668 pr_debug("APBT has failed to register clocksource\n");
669 goto out_noapbt;
670 }
671 if (!apbt_clockevent_register())
672 apb_timer_block_enabled = 1;
673 else {
674 pr_debug("APBT has failed to register clockevent\n");
675 goto out_noapbt;
676 }
677#ifdef CONFIG_SMP
 678 /* kernel cmdline disabled the per-cpu apb timer, so we will use lapic timers */
679 if (disable_apbt_percpu) {
680 printk(KERN_INFO "apbt: disabled per cpu timer\n");
681 return;
682 }
683 pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
684 if (num_possible_cpus() <= sfi_mtimer_num) {
685 percpu_timer = 1;
686 apbt_num_timers_used = num_possible_cpus();
687 } else {
688 percpu_timer = 0;
689 apbt_num_timers_used = 1;
690 adev = &per_cpu(cpu_apbt_dev, 0);
691 adev->flags &= ~APBT_DEV_USED;
692 }
693 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
694
695 /* here we set up per CPU timer data structure */
696 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
697 GFP_KERNEL);
698 if (!apbt_devs) {
699 printk(KERN_ERR "Failed to allocate APB timer devices\n");
700 return;
701 }
702 for (i = 0; i < apbt_num_timers_used; i++) {
703 adev = &per_cpu(cpu_apbt_dev, i);
704 adev->num = i;
705 adev->cpu = i;
706 p_mtmr = sfi_get_mtmr(i);
707 if (p_mtmr) {
708 adev->tick = p_mtmr->freq_hz;
709 adev->irq = p_mtmr->irq;
710 } else
711 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
712 adev->count = 0;
713 sprintf(adev->name, "apbt%d", i);
714 }
715#endif
716
717 return;
718
719out_noapbt:
720 apbt_clear_mapping();
721 apb_timer_block_enabled = 0;
722 panic("failed to enable APB timer\n");
723}
724
725static inline void apbt_disable(int n)
726{
727 if (is_apbt_capable()) {
728 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
729 ctrl &= ~APBTMR_CONTROL_ENABLE;
730 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
731 }
732}
733
734/* called before apb_timer_enable, use early map */
735unsigned long apbt_quick_calibrate()
736{
737 int i, scale;
738 u64 old, new;
739 cycle_t t1, t2;
740 unsigned long khz = 0;
741 u32 loop, shift;
742
743 apbt_set_mapping();
744 apbt_start_counter(phy_cs_timer_id);
745
746 /* check if the timer can count down, otherwise return */
747 old = apbt_read_clocksource(&clocksource_apbt);
748 i = 10000;
749 while (--i) {
750 if (old != apbt_read_clocksource(&clocksource_apbt))
751 break;
752 }
753 if (!i)
754 goto failed;
755
756 /* count 16 ms */
757 loop = (apbt_freq * 1000) << 4;
758
759 /* restart the timer to ensure it won't get to 0 in the calibration */
760 apbt_start_counter(phy_cs_timer_id);
761
762 old = apbt_read_clocksource(&clocksource_apbt);
763 old += loop;
764
765 t1 = __native_read_tsc();
766
767 do {
768 new = apbt_read_clocksource(&clocksource_apbt);
769 } while (new < old);
770
771 t2 = __native_read_tsc();
772
773 shift = 5;
774 if (unlikely(loop >> shift == 0)) {
775 printk(KERN_INFO
776 "APBT TSC calibration failed, not enough resolution\n");
777 return 0;
778 }
779 scale = (int)div_u64((t2 - t1), loop >> shift);
780 khz = (scale * apbt_freq * 1000) >> shift;
781 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
782 return khz;
783failed:
784 return 0;
785}
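
apbt_quick_calibrate() above uses the fixed-frequency APB timer as a yardstick for the TSC. With apbt_freq in MHz (freq_hz / USEC_PER_SEC, as set in apbt_set_mapping()) and shift = 5, the arithmetic works out as follows; this is only a sketch of the derivation, not a second implementation:

/*
 *	loop  = (apbt_freq * 1000) << 4          APB ticks in 16 ms
 *	scale = (t2 - t1) / (loop >> 5)          TSC cycles per (loop / 32) ticks
 *	khz   = (scale * apbt_freq * 1000) >> 5
 *	      ~= ((t2 - t1) / loop) * apbt_freq * 1000
 *	      = (TSC cycles per APB tick) * (APB ticks per ms)
 *	      = TSC cycles per millisecond, i.e. the TSC frequency in kHz
 */
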
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3704997e8b25..b5d8b0bcf235 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void)
393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
394 int bus; 394 int bus;
395 int dev_base, dev_limit; 395 int dev_base, dev_limit;
396 u32 ctl;
396 397
397 bus = bus_dev_ranges[i].bus; 398 bus = bus_dev_ranges[i].bus;
398 dev_base = bus_dev_ranges[i].dev_base; 399 dev_base = bus_dev_ranges[i].dev_base;
@@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void)
406 gart_iommu_aperture = 1; 407 gart_iommu_aperture = 1;
407 x86_init.iommu.iommu_init = gart_iommu_init; 408 x86_init.iommu.iommu_init = gart_iommu_init;
408 409
409 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 410 ctl = read_pci_config(bus, slot, 3,
411 AMD64_GARTAPERTURECTL);
412
413 /*
414 * Before we do anything else disable the GART. It may
415 * still be enabled if we boot into a crash-kernel here.
416 * Reconfiguring the GART while it is enabled could have
417 * unknown side-effects.
418 */
419 ctl &= ~GARTEN;
420 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
421
422 aper_order = (ctl >> 1) & 7;
410 aper_size = (32 * 1024 * 1024) << aper_order; 423 aper_size = (32 * 1024 * 1024) << aper_order;
411 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 424 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
412 aper_base <<= 25; 425 aper_base <<= 25;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index aa57c079c98f..e5a4a1e01618 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
61 61
62/* 62/*
63 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
64 *
65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
66 * are in the 0 ... 7 range, then we can use logical addressing which
67 * has some performance advantages (better broadcasting).
68 *
69 * If there's an APIC ID above 8, we use physical addressing.
70 */ 64 */
71unsigned int max_physical_apicid; 65unsigned int max_physical_apicid;
72 66
@@ -587,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
587 res = (((u64)(*deltatsc)) * pm_100ms); 581 res = (((u64)(*deltatsc)) * pm_100ms);
588 do_div(res, deltapm); 582 do_div(res, deltapm);
589 apic_printk(APIC_VERBOSE, "TSC delta adjusted to " 583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
590 "PM-Timer: %lu (%ld) \n", 584 "PM-Timer: %lu (%ld)\n",
591 (unsigned long)res, *deltatsc); 585 (unsigned long)res, *deltatsc);
592 *deltatsc = (long)res; 586 *deltatsc = (long)res;
593 } 587 }
@@ -1396,7 +1390,7 @@ void __init enable_IR_x2apic(void)
1396 } 1390 }
1397 1391
1398 local_irq_save(flags); 1392 local_irq_save(flags);
1399 mask_8259A(); 1393 legacy_pic->mask_all();
1400 mask_IO_APIC_setup(ioapic_entries); 1394 mask_IO_APIC_setup(ioapic_entries);
1401 1395
1402 if (dmar_table_init_ret) 1396 if (dmar_table_init_ret)
@@ -1428,7 +1422,7 @@ void __init enable_IR_x2apic(void)
1428nox2apic: 1422nox2apic:
1429 if (!ret) /* IR enabling failed */ 1423 if (!ret) /* IR enabling failed */
1430 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1431 unmask_8259A(); 1425 legacy_pic->restore_mask();
1432 local_irq_restore(flags); 1426 local_irq_restore(flags);
1433 1427
1434out: 1428out:
@@ -1646,8 +1640,8 @@ int __init APIC_init_uniprocessor(void)
1646 } 1640 }
1647#endif 1641#endif
1648 1642
1643#ifndef CONFIG_SMP
1649 enable_IR_x2apic(); 1644 enable_IR_x2apic();
1650#ifdef CONFIG_X86_64
1651 default_setup_apic_routing(); 1645 default_setup_apic_routing();
1652#endif 1646#endif
1653 1647
@@ -1897,18 +1891,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1897 if (apicid > max_physical_apicid) 1891 if (apicid > max_physical_apicid)
1898 max_physical_apicid = apicid; 1892 max_physical_apicid = apicid;
1899 1893
1900#ifdef CONFIG_X86_32
1901 switch (boot_cpu_data.x86_vendor) {
1902 case X86_VENDOR_INTEL:
1903 if (num_processors > 8)
1904 def_to_bigsmp = 1;
1905 break;
1906 case X86_VENDOR_AMD:
1907 if (max_physical_apicid >= 8)
1908 def_to_bigsmp = 1;
1909 }
1910#endif
1911
1912#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) 1894#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
1913 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1895 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1914 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1896 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
@@ -2038,7 +2020,7 @@ static int lapic_resume(struct sys_device *dev)
2038 } 2020 }
2039 2021
2040 mask_IO_APIC_setup(ioapic_entries); 2022 mask_IO_APIC_setup(ioapic_entries);
2041 mask_8259A(); 2023 legacy_pic->mask_all();
2042 } 2024 }
2043 2025
2044 if (x2apic_mode) 2026 if (x2apic_mode)
@@ -2082,7 +2064,7 @@ static int lapic_resume(struct sys_device *dev)
2082 2064
2083 if (intr_remapping_enabled) { 2065 if (intr_remapping_enabled) {
2084 reenable_intr_remapping(x2apic_mode); 2066 reenable_intr_remapping(x2apic_mode);
2085 unmask_8259A(); 2067 legacy_pic->restore_mask();
2086 restore_IO_APIC_setup(ioapic_entries); 2068 restore_IO_APIC_setup(ioapic_entries);
2087 free_ioapic_entries(ioapic_entries); 2069 free_ioapic_entries(ioapic_entries);
2088 } 2070 }
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index eacbd2b31d27..09d3b17ce0c2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
223}; 223};
224 224
225/* 225/*
226 * Physflat mode is used when there are more than 8 CPUs on a AMD system. 226 * Physflat mode is used when there are more than 8 CPUs on a system.
227 * We cannot use logical delivery in this case because the mask 227 * We cannot use logical delivery in this case because the mask
228 * overflows, so use physical mode. 228 * overflows, so use physical mode.
229 */ 229 */
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
242 } 242 }
243
244 if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
245 printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
246 return 1;
247 }
243#endif 248#endif
244 249
245 return 0; 250 return 0;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index dd2b5f264643..03ba1b895f5e 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -42,6 +42,7 @@
42#include <linux/errno.h> 42#include <linux/errno.h>
43#include <linux/acpi.h> 43#include <linux/acpi.h>
44#include <linux/init.h> 44#include <linux/init.h>
45#include <linux/gfp.h>
45#include <linux/nmi.h> 46#include <linux/nmi.h>
46#include <linux/smp.h> 47#include <linux/smp.h>
47#include <linux/io.h> 48#include <linux/io.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index de00c4619a55..127b8718abfb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -36,6 +36,7 @@
36#include <linux/freezer.h> 36#include <linux/freezer.h>
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */ 38#include <linux/jiffies.h> /* time_after() */
39#include <linux/slab.h>
39#ifdef CONFIG_ACPI 40#ifdef CONFIG_ACPI
40#include <acpi/acpi_bus.h> 41#include <acpi/acpi_bus.h>
41#endif 42#endif
@@ -73,8 +74,8 @@
73 */ 74 */
74int sis_apic_bug = -1; 75int sis_apic_bug = -1;
75 76
76static DEFINE_SPINLOCK(ioapic_lock); 77static DEFINE_RAW_SPINLOCK(ioapic_lock);
77static DEFINE_SPINLOCK(vector_lock); 78static DEFINE_RAW_SPINLOCK(vector_lock);
78 79
79/* 80/*
80 * # of IRQ routing registers 81 * # of IRQ routing registers
@@ -94,8 +95,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
94/* # of MP IRQ source entries */ 95/* # of MP IRQ source entries */
95int mp_irq_entries; 96int mp_irq_entries;
96 97
97/* Number of legacy interrupts */
98static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
99/* GSI interrupts */ 98/* GSI interrupts */
100static int nr_irqs_gsi = NR_IRQS_LEGACY; 99static int nr_irqs_gsi = NR_IRQS_LEGACY;
101 100
@@ -140,33 +139,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 139
141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 140/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
142#ifdef CONFIG_SPARSE_IRQ 141#ifdef CONFIG_SPARSE_IRQ
143static struct irq_cfg irq_cfgx[] = { 142static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
144#else 143#else
145static struct irq_cfg irq_cfgx[NR_IRQS] = { 144static struct irq_cfg irq_cfgx[NR_IRQS];
146#endif 145#endif
147 [0] = { .vector = IRQ0_VECTOR, },
148 [1] = { .vector = IRQ1_VECTOR, },
149 [2] = { .vector = IRQ2_VECTOR, },
150 [3] = { .vector = IRQ3_VECTOR, },
151 [4] = { .vector = IRQ4_VECTOR, },
152 [5] = { .vector = IRQ5_VECTOR, },
153 [6] = { .vector = IRQ6_VECTOR, },
154 [7] = { .vector = IRQ7_VECTOR, },
155 [8] = { .vector = IRQ8_VECTOR, },
156 [9] = { .vector = IRQ9_VECTOR, },
157 [10] = { .vector = IRQ10_VECTOR, },
158 [11] = { .vector = IRQ11_VECTOR, },
159 [12] = { .vector = IRQ12_VECTOR, },
160 [13] = { .vector = IRQ13_VECTOR, },
161 [14] = { .vector = IRQ14_VECTOR, },
162 [15] = { .vector = IRQ15_VECTOR, },
163};
164
165void __init io_apic_disable_legacy(void)
166{
167 nr_legacy_irqs = 0;
168 nr_irqs_gsi = 0;
169}
170 146
171int __init arch_early_irq_init(void) 147int __init arch_early_irq_init(void)
172{ 148{
@@ -176,6 +152,11 @@ int __init arch_early_irq_init(void)
176 int node; 152 int node;
177 int i; 153 int i;
178 154
155 if (!legacy_pic->nr_legacy_irqs) {
156 nr_irqs_gsi = 0;
157 io_apic_irqs = ~0UL;
158 }
159
179 cfg = irq_cfgx; 160 cfg = irq_cfgx;
180 count = ARRAY_SIZE(irq_cfgx); 161 count = ARRAY_SIZE(irq_cfgx);
181 node= cpu_to_node(boot_cpu_id); 162 node= cpu_to_node(boot_cpu_id);
@@ -185,8 +166,14 @@ int __init arch_early_irq_init(void)
185 desc->chip_data = &cfg[i]; 166 desc->chip_data = &cfg[i];
186 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 167 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
187 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 168 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
188 if (i < nr_legacy_irqs) 169 /*
 189 cpumask_setall(cfg[i].domain); 170 * For legacy IRQs, start with assigning irq0 to irq15 to
171 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
172 */
173 if (i < legacy_pic->nr_legacy_irqs) {
174 cfg[i].vector = IRQ0_VECTOR + i;
175 cpumask_set_cpu(0, cfg[i].domain);
176 }
190 } 177 }
191 178
192 return 0; 179 return 0;
@@ -406,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
406 struct irq_pin_list *entry; 393 struct irq_pin_list *entry;
407 unsigned long flags; 394 unsigned long flags;
408 395
409 spin_lock_irqsave(&ioapic_lock, flags); 396 raw_spin_lock_irqsave(&ioapic_lock, flags);
410 for_each_irq_pin(entry, cfg->irq_2_pin) { 397 for_each_irq_pin(entry, cfg->irq_2_pin) {
411 unsigned int reg; 398 unsigned int reg;
412 int pin; 399 int pin;
@@ -415,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
415 reg = io_apic_read(entry->apic, 0x10 + pin*2); 402 reg = io_apic_read(entry->apic, 0x10 + pin*2);
416 /* Is the remote IRR bit set? */ 403 /* Is the remote IRR bit set? */
417 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 404 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
418 spin_unlock_irqrestore(&ioapic_lock, flags); 405 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
419 return true; 406 return true;
420 } 407 }
421 } 408 }
422 spin_unlock_irqrestore(&ioapic_lock, flags); 409 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
423 410
424 return false; 411 return false;
425} 412}
@@ -433,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
433{ 420{
434 union entry_union eu; 421 union entry_union eu;
435 unsigned long flags; 422 unsigned long flags;
436 spin_lock_irqsave(&ioapic_lock, flags); 423 raw_spin_lock_irqsave(&ioapic_lock, flags);
437 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); 424 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
438 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); 425 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
439 spin_unlock_irqrestore(&ioapic_lock, flags); 426 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
440 return eu.entry; 427 return eu.entry;
441} 428}
442 429
@@ -459,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
459void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 446void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
460{ 447{
461 unsigned long flags; 448 unsigned long flags;
462 spin_lock_irqsave(&ioapic_lock, flags); 449 raw_spin_lock_irqsave(&ioapic_lock, flags);
463 __ioapic_write_entry(apic, pin, e); 450 __ioapic_write_entry(apic, pin, e);
464 spin_unlock_irqrestore(&ioapic_lock, flags); 451 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
465} 452}
466 453
467/* 454/*
@@ -474,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin)
474 unsigned long flags; 461 unsigned long flags;
475 union entry_union eu = { .entry.mask = 1 }; 462 union entry_union eu = { .entry.mask = 1 };
476 463
477 spin_lock_irqsave(&ioapic_lock, flags); 464 raw_spin_lock_irqsave(&ioapic_lock, flags);
478 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 465 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
479 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 466 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
480 spin_unlock_irqrestore(&ioapic_lock, flags); 467 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
481} 468}
482 469
483/* 470/*
@@ -604,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
604 591
605 BUG_ON(!cfg); 592 BUG_ON(!cfg);
606 593
607 spin_lock_irqsave(&ioapic_lock, flags); 594 raw_spin_lock_irqsave(&ioapic_lock, flags);
608 __mask_IO_APIC_irq(cfg); 595 __mask_IO_APIC_irq(cfg);
609 spin_unlock_irqrestore(&ioapic_lock, flags); 596 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
610} 597}
611 598
612static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 599static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -614,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
614 struct irq_cfg *cfg = desc->chip_data; 601 struct irq_cfg *cfg = desc->chip_data;
615 unsigned long flags; 602 unsigned long flags;
616 603
617 spin_lock_irqsave(&ioapic_lock, flags); 604 raw_spin_lock_irqsave(&ioapic_lock, flags);
618 __unmask_IO_APIC_irq(cfg); 605 __unmask_IO_APIC_irq(cfg);
619 spin_unlock_irqrestore(&ioapic_lock, flags); 606 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
620} 607}
621 608
622static void mask_IO_APIC_irq(unsigned int irq) 609static void mask_IO_APIC_irq(unsigned int irq)
@@ -865,7 +852,7 @@ static int __init find_isa_irq_apic(int irq, int type)
865 */ 852 */
866static int EISA_ELCR(unsigned int irq) 853static int EISA_ELCR(unsigned int irq)
867{ 854{
868 if (irq < nr_legacy_irqs) { 855 if (irq < legacy_pic->nr_legacy_irqs) {
869 unsigned int port = 0x4d0 + (irq >> 3); 856 unsigned int port = 0x4d0 + (irq >> 3);
870 return (inb(port) >> (irq & 7)) & 1; 857 return (inb(port) >> (irq & 7)) & 1;
871 } 858 }
@@ -1140,12 +1127,12 @@ void lock_vector_lock(void)
 1140 /* Used so that the online set of cpus does not change 1127 /* Used so that the online set of cpus does not change
1141 * during assign_irq_vector. 1128 * during assign_irq_vector.
1142 */ 1129 */
1143 spin_lock(&vector_lock); 1130 raw_spin_lock(&vector_lock);
1144} 1131}
1145 1132
1146void unlock_vector_lock(void) 1133void unlock_vector_lock(void)
1147{ 1134{
1148 spin_unlock(&vector_lock); 1135 raw_spin_unlock(&vector_lock);
1149} 1136}
1150 1137
1151static int 1138static int
@@ -1162,7 +1149,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1162 * Also, we've got to be careful not to trash gate 1149 * Also, we've got to be careful not to trash gate
1163 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1150 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1164 */ 1151 */
1165 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1152 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1153 static int current_offset = VECTOR_OFFSET_START % 8;
1166 unsigned int old_vector; 1154 unsigned int old_vector;
1167 int cpu, err; 1155 int cpu, err;
1168 cpumask_var_t tmp_mask; 1156 cpumask_var_t tmp_mask;
@@ -1198,7 +1186,7 @@ next:
1198 if (vector >= first_system_vector) { 1186 if (vector >= first_system_vector) {
1199 /* If out of vectors on large boxen, must share them. */ 1187 /* If out of vectors on large boxen, must share them. */
1200 offset = (offset + 1) % 8; 1188 offset = (offset + 1) % 8;
1201 vector = FIRST_DEVICE_VECTOR + offset; 1189 vector = FIRST_EXTERNAL_VECTOR + offset;
1202 } 1190 }
1203 if (unlikely(current_vector == vector)) 1191 if (unlikely(current_vector == vector))
1204 continue; 1192 continue;
@@ -1232,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1232 int err; 1220 int err;
1233 unsigned long flags; 1221 unsigned long flags;
1234 1222
1235 spin_lock_irqsave(&vector_lock, flags); 1223 raw_spin_lock_irqsave(&vector_lock, flags);
1236 err = __assign_irq_vector(irq, cfg, mask); 1224 err = __assign_irq_vector(irq, cfg, mask);
1237 spin_unlock_irqrestore(&vector_lock, flags); 1225 raw_spin_unlock_irqrestore(&vector_lock, flags);
1238 return err; 1226 return err;
1239} 1227}
1240 1228
@@ -1268,14 +1256,27 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1268void __setup_vector_irq(int cpu) 1256void __setup_vector_irq(int cpu)
1269{ 1257{
1270 /* Initialize vector_irq on a new cpu */ 1258 /* Initialize vector_irq on a new cpu */
1271 /* This function must be called with vector_lock held */
1272 int irq, vector; 1259 int irq, vector;
1273 struct irq_cfg *cfg; 1260 struct irq_cfg *cfg;
1274 struct irq_desc *desc; 1261 struct irq_desc *desc;
1275 1262
1263 /*
1264 * vector_lock will make sure that we don't run into irq vector
1265 * assignments that might be happening on another cpu in parallel,
1266 * while we setup our initial vector to irq mappings.
1267 */
1268 raw_spin_lock(&vector_lock);
1276 /* Mark the inuse vectors */ 1269 /* Mark the inuse vectors */
1277 for_each_irq_desc(irq, desc) { 1270 for_each_irq_desc(irq, desc) {
1278 cfg = desc->chip_data; 1271 cfg = desc->chip_data;
1272
1273 /*
1274 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1275 * will be part of the irq_cfg's domain.
1276 */
1277 if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
1278 cpumask_set_cpu(cpu, cfg->domain);
1279
1279 if (!cpumask_test_cpu(cpu, cfg->domain)) 1280 if (!cpumask_test_cpu(cpu, cfg->domain))
1280 continue; 1281 continue;
1281 vector = cfg->vector; 1282 vector = cfg->vector;
@@ -1291,6 +1292,7 @@ void __setup_vector_irq(int cpu)
1291 if (!cpumask_test_cpu(cpu, cfg->domain)) 1292 if (!cpumask_test_cpu(cpu, cfg->domain))
1292 per_cpu(vector_irq, cpu)[vector] = -1; 1293 per_cpu(vector_irq, cpu)[vector] = -1;
1293 } 1294 }
1295 raw_spin_unlock(&vector_lock);
1294} 1296}
1295 1297
1296static struct irq_chip ioapic_chip; 1298static struct irq_chip ioapic_chip;
@@ -1440,6 +1442,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1440 1442
1441 cfg = desc->chip_data; 1443 cfg = desc->chip_data;
1442 1444
1445 /*
1446 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1447 * controllers like 8259. Now that IO-APIC can handle this irq, update
1448 * the cfg->domain.
1449 */
1450 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1451 apic->vector_allocation_domain(0, cfg->domain);
1452
1443 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1453 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1444 return; 1454 return;
1445 1455
@@ -1461,8 +1471,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1461 } 1471 }
1462 1472
1463 ioapic_register_intr(irq, desc, trigger); 1473 ioapic_register_intr(irq, desc, trigger);
1464 if (irq < nr_legacy_irqs) 1474 if (irq < legacy_pic->nr_legacy_irqs)
1465 disable_8259A_irq(irq); 1475 legacy_pic->chip->mask(irq);
1466 1476
1467 ioapic_write_entry(apic_id, pin, entry); 1477 ioapic_write_entry(apic_id, pin, entry);
1468} 1478}
@@ -1473,7 +1483,7 @@ static struct {
1473 1483
1474static void __init setup_IO_APIC_irqs(void) 1484static void __init setup_IO_APIC_irqs(void)
1475{ 1485{
1476 int apic_id = 0, pin, idx, irq; 1486 int apic_id, pin, idx, irq;
1477 int notcon = 0; 1487 int notcon = 0;
1478 struct irq_desc *desc; 1488 struct irq_desc *desc;
1479 struct irq_cfg *cfg; 1489 struct irq_cfg *cfg;
@@ -1481,14 +1491,7 @@ static void __init setup_IO_APIC_irqs(void)
1481 1491
1482 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1492 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1483 1493
1484#ifdef CONFIG_ACPI 1494 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1485 if (!acpi_disabled && acpi_ioapic) {
1486 apic_id = mp_find_ioapic(0);
1487 if (apic_id < 0)
1488 apic_id = 0;
1489 }
1490#endif
1491
1492 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1495 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1493 idx = find_irq_entry(apic_id, pin, mp_INT); 1496 idx = find_irq_entry(apic_id, pin, mp_INT);
1494 if (idx == -1) { 1497 if (idx == -1) {
@@ -1510,6 +1513,9 @@ static void __init setup_IO_APIC_irqs(void)
1510 1513
1511 irq = pin_2_irq(idx, apic_id, pin); 1514 irq = pin_2_irq(idx, apic_id, pin);
1512 1515
1516 if ((apic_id > 0) && (irq > 16))
1517 continue;
1518
1513 /* 1519 /*
1514 * Skip the timer IRQ if there's a quirk handler 1520 * Skip the timer IRQ if there's a quirk handler
1515 * installed and if it returns 1: 1521 * installed and if it returns 1:
@@ -1539,6 +1545,56 @@ static void __init setup_IO_APIC_irqs(void)
1539} 1545}
1540 1546
1541/* 1547/*
 1548 * For GSIs that are not on the first ioapic but could not be
 1549 * registered via acpi_register_gsi(),
 1550 * like the special SCI on the IBM x3330.
1551 */
1552void setup_IO_APIC_irq_extra(u32 gsi)
1553{
1554 int apic_id = 0, pin, idx, irq;
1555 int node = cpu_to_node(boot_cpu_id);
1556 struct irq_desc *desc;
1557 struct irq_cfg *cfg;
1558
1559 /*
1560 * Convert 'gsi' to 'ioapic.pin'.
1561 */
1562 apic_id = mp_find_ioapic(gsi);
1563 if (apic_id < 0)
1564 return;
1565
1566 pin = mp_find_ioapic_pin(apic_id, gsi);
1567 idx = find_irq_entry(apic_id, pin, mp_INT);
1568 if (idx == -1)
1569 return;
1570
1571 irq = pin_2_irq(idx, apic_id, pin);
1572#ifdef CONFIG_SPARSE_IRQ
1573 desc = irq_to_desc(irq);
1574 if (desc)
1575 return;
1576#endif
1577 desc = irq_to_desc_alloc_node(irq, node);
1578 if (!desc) {
1579 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1580 return;
1581 }
1582
1583 cfg = desc->chip_data;
1584 add_pin_to_irq_node(cfg, node, apic_id, pin);
1585
1586 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1587 pr_debug("Pin %d-%d already programmed\n",
1588 mp_ioapics[apic_id].apicid, pin);
1589 return;
1590 }
1591 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1592
1593 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1594 irq_trigger(idx), irq_polarity(idx));
1595}
1596
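/*
 * setup_IO_APIC_irq_extra() above is meant to be invoked from the platform's
 * GSI registration path when a GSI resolves to an IO-APIC other than the
 * first; the call site is outside this hunk, so the caller below is purely
 * hypothetical and only shows the intended shape.
 */
static void __init program_extra_gsi(u32 gsi)
{
	/* GSIs behind a secondary IO-APIC are skipped by setup_IO_APIC_irqs(),
	 * so hand them to the helper instead. */
	if (mp_find_ioapic(gsi) > 0)
		setup_IO_APIC_irq_extra(gsi);
}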
1597/*
1542 * Set up the timer pin, possibly with the 8259A-master behind. 1598 * Set up the timer pin, possibly with the 8259A-master behind.
1543 */ 1599 */
1544static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, 1600static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1601,14 +1657,14 @@ __apicdebuginit(void) print_IO_APIC(void)
1601 1657
1602 for (apic = 0; apic < nr_ioapics; apic++) { 1658 for (apic = 0; apic < nr_ioapics; apic++) {
1603 1659
1604 spin_lock_irqsave(&ioapic_lock, flags); 1660 raw_spin_lock_irqsave(&ioapic_lock, flags);
1605 reg_00.raw = io_apic_read(apic, 0); 1661 reg_00.raw = io_apic_read(apic, 0);
1606 reg_01.raw = io_apic_read(apic, 1); 1662 reg_01.raw = io_apic_read(apic, 1);
1607 if (reg_01.bits.version >= 0x10) 1663 if (reg_01.bits.version >= 0x10)
1608 reg_02.raw = io_apic_read(apic, 2); 1664 reg_02.raw = io_apic_read(apic, 2);
1609 if (reg_01.bits.version >= 0x20) 1665 if (reg_01.bits.version >= 0x20)
1610 reg_03.raw = io_apic_read(apic, 3); 1666 reg_03.raw = io_apic_read(apic, 3);
1611 spin_unlock_irqrestore(&ioapic_lock, flags); 1667 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1612 1668
1613 printk("\n"); 1669 printk("\n");
1614 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1670 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1647,7 +1703,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1647 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1703 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1648 1704
1649 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1705 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1650 " Stat Dmod Deli Vect: \n"); 1706 " Stat Dmod Deli Vect:\n");
1651 1707
1652 for (i = 0; i <= reg_01.bits.entries; i++) { 1708 for (i = 0; i <= reg_01.bits.entries; i++) {
1653 struct IO_APIC_route_entry entry; 1709 struct IO_APIC_route_entry entry;
@@ -1825,12 +1881,12 @@ __apicdebuginit(void) print_PIC(void)
1825 unsigned int v; 1881 unsigned int v;
1826 unsigned long flags; 1882 unsigned long flags;
1827 1883
1828 if (!nr_legacy_irqs) 1884 if (!legacy_pic->nr_legacy_irqs)
1829 return; 1885 return;
1830 1886
1831 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1887 printk(KERN_DEBUG "\nprinting PIC contents\n");
1832 1888
1833 spin_lock_irqsave(&i8259A_lock, flags); 1889 raw_spin_lock_irqsave(&i8259A_lock, flags);
1834 1890
1835 v = inb(0xa1) << 8 | inb(0x21); 1891 v = inb(0xa1) << 8 | inb(0x21);
1836 printk(KERN_DEBUG "... PIC IMR: %04x\n", v); 1892 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1844,7 +1900,7 @@ __apicdebuginit(void) print_PIC(void)
1844 outb(0x0a,0xa0); 1900 outb(0x0a,0xa0);
1845 outb(0x0a,0x20); 1901 outb(0x0a,0x20);
1846 1902
1847 spin_unlock_irqrestore(&i8259A_lock, flags); 1903 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
1848 1904
1849 printk(KERN_DEBUG "... PIC ISR: %04x\n", v); 1905 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1850 1906
@@ -1903,13 +1959,13 @@ void __init enable_IO_APIC(void)
1903 * The number of IO-APIC IRQ registers (== #pins): 1959 * The number of IO-APIC IRQ registers (== #pins):
1904 */ 1960 */
1905 for (apic = 0; apic < nr_ioapics; apic++) { 1961 for (apic = 0; apic < nr_ioapics; apic++) {
1906 spin_lock_irqsave(&ioapic_lock, flags); 1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1907 reg_01.raw = io_apic_read(apic, 1); 1963 reg_01.raw = io_apic_read(apic, 1);
1908 spin_unlock_irqrestore(&ioapic_lock, flags); 1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1909 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1910 } 1966 }
1911 1967
1912 if (!nr_legacy_irqs) 1968 if (!legacy_pic->nr_legacy_irqs)
1913 return; 1969 return;
1914 1970
1915 for(apic = 0; apic < nr_ioapics; apic++) { 1971 for(apic = 0; apic < nr_ioapics; apic++) {
@@ -1966,7 +2022,7 @@ void disable_IO_APIC(void)
1966 */ 2022 */
1967 clear_IO_APIC(); 2023 clear_IO_APIC();
1968 2024
1969 if (!nr_legacy_irqs) 2025 if (!legacy_pic->nr_legacy_irqs)
1970 return; 2026 return;
1971 2027
1972 /* 2028 /*
@@ -2045,9 +2101,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2045 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 2101 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2046 2102
2047 /* Read the register 0 value */ 2103 /* Read the register 0 value */
2048 spin_lock_irqsave(&ioapic_lock, flags); 2104 raw_spin_lock_irqsave(&ioapic_lock, flags);
2049 reg_00.raw = io_apic_read(apic_id, 0); 2105 reg_00.raw = io_apic_read(apic_id, 0);
2050 spin_unlock_irqrestore(&ioapic_lock, flags); 2106 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2051 2107
2052 old_id = mp_ioapics[apic_id].apicid; 2108 old_id = mp_ioapics[apic_id].apicid;
2053 2109
@@ -2106,16 +2162,16 @@ void __init setup_ioapic_ids_from_mpc(void)
2106 mp_ioapics[apic_id].apicid); 2162 mp_ioapics[apic_id].apicid);
2107 2163
2108 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 2164 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2109 spin_lock_irqsave(&ioapic_lock, flags); 2165 raw_spin_lock_irqsave(&ioapic_lock, flags);
2110 io_apic_write(apic_id, 0, reg_00.raw); 2166 io_apic_write(apic_id, 0, reg_00.raw);
2111 spin_unlock_irqrestore(&ioapic_lock, flags); 2167 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2112 2168
2113 /* 2169 /*
2114 * Sanity check 2170 * Sanity check
2115 */ 2171 */
2116 spin_lock_irqsave(&ioapic_lock, flags); 2172 raw_spin_lock_irqsave(&ioapic_lock, flags);
2117 reg_00.raw = io_apic_read(apic_id, 0); 2173 reg_00.raw = io_apic_read(apic_id, 0);
2118 spin_unlock_irqrestore(&ioapic_lock, flags); 2174 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2119 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2175 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2120 printk("could not set ID!\n"); 2176 printk("could not set ID!\n");
2121 else 2177 else
@@ -2198,15 +2254,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2198 unsigned long flags; 2254 unsigned long flags;
2199 struct irq_cfg *cfg; 2255 struct irq_cfg *cfg;
2200 2256
2201 spin_lock_irqsave(&ioapic_lock, flags); 2257 raw_spin_lock_irqsave(&ioapic_lock, flags);
2202 if (irq < nr_legacy_irqs) { 2258 if (irq < legacy_pic->nr_legacy_irqs) {
2203 disable_8259A_irq(irq); 2259 legacy_pic->chip->mask(irq);
2204 if (i8259A_irq_pending(irq)) 2260 if (legacy_pic->irq_pending(irq))
2205 was_pending = 1; 2261 was_pending = 1;
2206 } 2262 }
2207 cfg = irq_cfg(irq); 2263 cfg = irq_cfg(irq);
2208 __unmask_IO_APIC_irq(cfg); 2264 __unmask_IO_APIC_irq(cfg);
2209 spin_unlock_irqrestore(&ioapic_lock, flags); 2265 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2210 2266
2211 return was_pending; 2267 return was_pending;
2212} 2268}
@@ -2217,9 +2273,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
2217 struct irq_cfg *cfg = irq_cfg(irq); 2273 struct irq_cfg *cfg = irq_cfg(irq);
2218 unsigned long flags; 2274 unsigned long flags;
2219 2275
2220 spin_lock_irqsave(&vector_lock, flags); 2276 raw_spin_lock_irqsave(&vector_lock, flags);
2221 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2277 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2222 spin_unlock_irqrestore(&vector_lock, flags); 2278 raw_spin_unlock_irqrestore(&vector_lock, flags);
2223 2279
2224 return 1; 2280 return 1;
2225} 2281}
@@ -2312,14 +2368,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2312 irq = desc->irq; 2368 irq = desc->irq;
2313 cfg = desc->chip_data; 2369 cfg = desc->chip_data;
2314 2370
2315 spin_lock_irqsave(&ioapic_lock, flags); 2371 raw_spin_lock_irqsave(&ioapic_lock, flags);
2316 ret = set_desc_affinity(desc, mask, &dest); 2372 ret = set_desc_affinity(desc, mask, &dest);
2317 if (!ret) { 2373 if (!ret) {
2318 /* Only the high 8 bits are valid. */ 2374 /* Only the high 8 bits are valid. */
2319 dest = SET_APIC_LOGICAL_ID(dest); 2375 dest = SET_APIC_LOGICAL_ID(dest);
2320 __target_IO_APIC_irq(irq, dest, cfg); 2376 __target_IO_APIC_irq(irq, dest, cfg);
2321 } 2377 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2378 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2379
2324 return ret; 2380 return ret;
2325} 2381}
@@ -2434,6 +2490,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2434 cfg = irq_cfg(irq); 2490 cfg = irq_cfg(irq);
2435 raw_spin_lock(&desc->lock); 2491 raw_spin_lock(&desc->lock);
2436 2492
2493 /*
2494 * Check if the irq migration is in progress. If so, we
2495 * haven't received the cleanup request yet for this irq.
2496 */
2497 if (cfg->move_in_progress)
2498 goto unlock;
2499
2437 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2500 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2438 goto unlock; 2501 goto unlock;
2439 2502
@@ -2547,9 +2610,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
2547 irq = desc->irq; 2610 irq = desc->irq;
2548 cfg = desc->chip_data; 2611 cfg = desc->chip_data;
2549 2612
2550 spin_lock_irqsave(&ioapic_lock, flags); 2613 raw_spin_lock_irqsave(&ioapic_lock, flags);
2551 __eoi_ioapic_irq(irq, cfg); 2614 __eoi_ioapic_irq(irq, cfg);
2552 spin_unlock_irqrestore(&ioapic_lock, flags); 2615 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2553} 2616}
2554 2617
2555static void ack_apic_level(unsigned int irq) 2618static void ack_apic_level(unsigned int irq)
@@ -2727,8 +2790,8 @@ static inline void init_IO_APIC_traps(void)
2727 * so default to an old-fashioned 8259 2790 * so default to an old-fashioned 8259
2728 * interrupt if we can.. 2791 * interrupt if we can..
2729 */ 2792 */
2730 if (irq < nr_legacy_irqs) 2793 if (irq < legacy_pic->nr_legacy_irqs)
2731 make_8259A_irq(irq); 2794 legacy_pic->make_irq(irq);
2732 else 2795 else
2733 /* Strange. Oh, well.. */ 2796 /* Strange. Oh, well.. */
2734 desc->chip = &no_irq_chip; 2797 desc->chip = &no_irq_chip;
@@ -2885,7 +2948,7 @@ static inline void __init check_timer(void)
2885 /* 2948 /*
2886 * get/set the timer IRQ vector: 2949 * get/set the timer IRQ vector:
2887 */ 2950 */
2888 disable_8259A_irq(0); 2951 legacy_pic->chip->mask(0);
2889 assign_irq_vector(0, cfg, apic->target_cpus()); 2952 assign_irq_vector(0, cfg, apic->target_cpus());
2890 2953
2891 /* 2954 /*
@@ -2898,7 +2961,7 @@ static inline void __init check_timer(void)
2898 * automatically. 2961 * automatically.
2899 */ 2962 */
2900 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2963 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2901 init_8259A(1); 2964 legacy_pic->init(1);
2902#ifdef CONFIG_X86_32 2965#ifdef CONFIG_X86_32
2903 { 2966 {
2904 unsigned int ver; 2967 unsigned int ver;
@@ -2957,7 +3020,7 @@ static inline void __init check_timer(void)
2957 if (timer_irq_works()) { 3020 if (timer_irq_works()) {
2958 if (nmi_watchdog == NMI_IO_APIC) { 3021 if (nmi_watchdog == NMI_IO_APIC) {
2959 setup_nmi(); 3022 setup_nmi();
2960 enable_8259A_irq(0); 3023 legacy_pic->chip->unmask(0);
2961 } 3024 }
2962 if (disable_timer_pin_1 > 0) 3025 if (disable_timer_pin_1 > 0)
2963 clear_IO_APIC_pin(0, pin1); 3026 clear_IO_APIC_pin(0, pin1);
@@ -2980,14 +3043,14 @@ static inline void __init check_timer(void)
2980 */ 3043 */
2981 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 3044 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
2982 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 3045 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2983 enable_8259A_irq(0); 3046 legacy_pic->chip->unmask(0);
2984 if (timer_irq_works()) { 3047 if (timer_irq_works()) {
2985 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 3048 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2986 timer_through_8259 = 1; 3049 timer_through_8259 = 1;
2987 if (nmi_watchdog == NMI_IO_APIC) { 3050 if (nmi_watchdog == NMI_IO_APIC) {
2988 disable_8259A_irq(0); 3051 legacy_pic->chip->mask(0);
2989 setup_nmi(); 3052 setup_nmi();
2990 enable_8259A_irq(0); 3053 legacy_pic->chip->unmask(0);
2991 } 3054 }
2992 goto out; 3055 goto out;
2993 } 3056 }
@@ -2995,7 +3058,7 @@ static inline void __init check_timer(void)
2995 * Cleanup, just in case ... 3058 * Cleanup, just in case ...
2996 */ 3059 */
2997 local_irq_disable(); 3060 local_irq_disable();
2998 disable_8259A_irq(0); 3061 legacy_pic->chip->mask(0);
2999 clear_IO_APIC_pin(apic2, pin2); 3062 clear_IO_APIC_pin(apic2, pin2);
3000 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 3063 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3001 } 3064 }
@@ -3014,22 +3077,22 @@ static inline void __init check_timer(void)
3014 3077
3015 lapic_register_intr(0, desc); 3078 lapic_register_intr(0, desc);
3016 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 3079 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3017 enable_8259A_irq(0); 3080 legacy_pic->chip->unmask(0);
3018 3081
3019 if (timer_irq_works()) { 3082 if (timer_irq_works()) {
3020 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3083 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3021 goto out; 3084 goto out;
3022 } 3085 }
3023 local_irq_disable(); 3086 local_irq_disable();
3024 disable_8259A_irq(0); 3087 legacy_pic->chip->mask(0);
3025 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3088 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3026 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3089 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3027 3090
3028 apic_printk(APIC_QUIET, KERN_INFO 3091 apic_printk(APIC_QUIET, KERN_INFO
3029 "...trying to set up timer as ExtINT IRQ...\n"); 3092 "...trying to set up timer as ExtINT IRQ...\n");
3030 3093
3031 init_8259A(0); 3094 legacy_pic->init(0);
3032 make_8259A_irq(0); 3095 legacy_pic->make_irq(0);
3033 apic_write(APIC_LVT0, APIC_DM_EXTINT); 3096 apic_write(APIC_LVT0, APIC_DM_EXTINT);
3034 3097
3035 unlock_ExtINT_logic(); 3098 unlock_ExtINT_logic();
@@ -3071,7 +3134,7 @@ void __init setup_IO_APIC(void)
3071 /* 3134 /*
3072 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3135 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3073 */ 3136 */
3074 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; 3137 io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3075 3138
3076 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3139 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3077 /* 3140 /*
@@ -3082,7 +3145,7 @@ void __init setup_IO_APIC(void)
3082 sync_Arb_IDs(); 3145 sync_Arb_IDs();
3083 setup_IO_APIC_irqs(); 3146 setup_IO_APIC_irqs();
3084 init_IO_APIC_traps(); 3147 init_IO_APIC_traps();
3085 if (nr_legacy_irqs) 3148 if (legacy_pic->nr_legacy_irqs)
3086 check_timer(); 3149 check_timer();
3087} 3150}
3088 3151
@@ -3131,13 +3194,13 @@ static int ioapic_resume(struct sys_device *dev)
3131 data = container_of(dev, struct sysfs_ioapic_data, dev); 3194 data = container_of(dev, struct sysfs_ioapic_data, dev);
3132 entry = data->entry; 3195 entry = data->entry;
3133 3196
3134 spin_lock_irqsave(&ioapic_lock, flags); 3197 raw_spin_lock_irqsave(&ioapic_lock, flags);
3135 reg_00.raw = io_apic_read(dev->id, 0); 3198 reg_00.raw = io_apic_read(dev->id, 0);
3136 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 3199 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3137 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 3200 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3138 io_apic_write(dev->id, 0, reg_00.raw); 3201 io_apic_write(dev->id, 0, reg_00.raw);
3139 } 3202 }
3140 spin_unlock_irqrestore(&ioapic_lock, flags); 3203 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3141 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 3204 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
3142 ioapic_write_entry(dev->id, i, entry[i]); 3205 ioapic_write_entry(dev->id, i, entry[i]);
3143 3206
@@ -3200,7 +3263,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3200 if (irq_want < nr_irqs_gsi) 3263 if (irq_want < nr_irqs_gsi)
3201 irq_want = nr_irqs_gsi; 3264 irq_want = nr_irqs_gsi;
3202 3265
3203 spin_lock_irqsave(&vector_lock, flags); 3266 raw_spin_lock_irqsave(&vector_lock, flags);
3204 for (new = irq_want; new < nr_irqs; new++) { 3267 for (new = irq_want; new < nr_irqs; new++) {
3205 desc_new = irq_to_desc_alloc_node(new, node); 3268 desc_new = irq_to_desc_alloc_node(new, node);
3206 if (!desc_new) { 3269 if (!desc_new) {
@@ -3219,14 +3282,11 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3219 irq = new; 3282 irq = new;
3220 break; 3283 break;
3221 } 3284 }
3222 spin_unlock_irqrestore(&vector_lock, flags); 3285 raw_spin_unlock_irqrestore(&vector_lock, flags);
3286
3287 if (irq > 0)
3288 dynamic_irq_init_keep_chip_data(irq);
3223 3289
3224 if (irq > 0) {
3225 dynamic_irq_init(irq);
3226 /* restore it, in case dynamic_irq_init clear it */
3227 if (desc_new)
3228 desc_new->chip_data = cfg_new;
3229 }
3230 return irq; 3290 return irq;
3231} 3291}
3232 3292
@@ -3248,20 +3308,13 @@ int create_irq(void)
3248void destroy_irq(unsigned int irq) 3308void destroy_irq(unsigned int irq)
3249{ 3309{
3250 unsigned long flags; 3310 unsigned long flags;
3251 struct irq_cfg *cfg;
3252 struct irq_desc *desc;
3253 3311
3254 /* store it, in case dynamic_irq_cleanup clear it */ 3312 dynamic_irq_cleanup_keep_chip_data(irq);
3255 desc = irq_to_desc(irq);
3256 cfg = desc->chip_data;
3257 dynamic_irq_cleanup(irq);
3258 /* connect back irq_cfg */
3259 desc->chip_data = cfg;
3260 3313
3261 free_irte(irq); 3314 free_irte(irq);
3262 spin_lock_irqsave(&vector_lock, flags); 3315 raw_spin_lock_irqsave(&vector_lock, flags);
3263 __clear_irq_vector(irq, cfg); 3316 __clear_irq_vector(irq, get_irq_chip_data(irq));
3264 spin_unlock_irqrestore(&vector_lock, flags); 3317 raw_spin_unlock_irqrestore(&vector_lock, flags);
3265} 3318}
3266 3319
3267/* 3320/*
@@ -3798,9 +3851,9 @@ int __init io_apic_get_redir_entries (int ioapic)
3798 union IO_APIC_reg_01 reg_01; 3851 union IO_APIC_reg_01 reg_01;
3799 unsigned long flags; 3852 unsigned long flags;
3800 3853
3801 spin_lock_irqsave(&ioapic_lock, flags); 3854 raw_spin_lock_irqsave(&ioapic_lock, flags);
3802 reg_01.raw = io_apic_read(ioapic, 1); 3855 reg_01.raw = io_apic_read(ioapic, 1);
3803 spin_unlock_irqrestore(&ioapic_lock, flags); 3856 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3804 3857
3805 return reg_01.bits.entries; 3858 return reg_01.bits.entries;
3806} 3859}
@@ -3883,7 +3936,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3883 /* 3936 /*
3884 * IRQs < 16 are already in the irq_2_pin[] map 3937 * IRQs < 16 are already in the irq_2_pin[] map
3885 */ 3938 */
3886 if (irq >= nr_legacy_irqs) { 3939 if (irq >= legacy_pic->nr_legacy_irqs) {
3887 cfg = desc->chip_data; 3940 cfg = desc->chip_data;
3888 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { 3941 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3889 printk(KERN_INFO "can not add pin %d for irq %d\n", 3942 printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -3962,9 +4015,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3962 if (physids_empty(apic_id_map)) 4015 if (physids_empty(apic_id_map))
3963 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); 4016 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3964 4017
3965 spin_lock_irqsave(&ioapic_lock, flags); 4018 raw_spin_lock_irqsave(&ioapic_lock, flags);
3966 reg_00.raw = io_apic_read(ioapic, 0); 4019 reg_00.raw = io_apic_read(ioapic, 0);
3967 spin_unlock_irqrestore(&ioapic_lock, flags); 4020 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3968 4021
3969 if (apic_id >= get_physical_broadcast()) { 4022 if (apic_id >= get_physical_broadcast()) {
3970 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " 4023 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -3998,10 +4051,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3998 if (reg_00.bits.ID != apic_id) { 4051 if (reg_00.bits.ID != apic_id) {
3999 reg_00.bits.ID = apic_id; 4052 reg_00.bits.ID = apic_id;
4000 4053
4001 spin_lock_irqsave(&ioapic_lock, flags); 4054 raw_spin_lock_irqsave(&ioapic_lock, flags);
4002 io_apic_write(ioapic, 0, reg_00.raw); 4055 io_apic_write(ioapic, 0, reg_00.raw);
4003 reg_00.raw = io_apic_read(ioapic, 0); 4056 reg_00.raw = io_apic_read(ioapic, 0);
4004 spin_unlock_irqrestore(&ioapic_lock, flags); 4057 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4005 4058
4006 /* Sanity check */ 4059 /* Sanity check */
4007 if (reg_00.bits.ID != apic_id) { 4060 if (reg_00.bits.ID != apic_id) {
@@ -4022,9 +4075,9 @@ int __init io_apic_get_version(int ioapic)
4022 union IO_APIC_reg_01 reg_01; 4075 union IO_APIC_reg_01 reg_01;
4023 unsigned long flags; 4076 unsigned long flags;
4024 4077
4025 spin_lock_irqsave(&ioapic_lock, flags); 4078 raw_spin_lock_irqsave(&ioapic_lock, flags);
4026 reg_01.raw = io_apic_read(ioapic, 1); 4079 reg_01.raw = io_apic_read(ioapic, 1);
4027 spin_unlock_irqrestore(&ioapic_lock, flags); 4080 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4028 4081
4029 return reg_01.bits.version; 4082 return reg_01.bits.version;
4030} 4083}
@@ -4056,27 +4109,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4056#ifdef CONFIG_SMP 4109#ifdef CONFIG_SMP
4057void __init setup_ioapic_dest(void) 4110void __init setup_ioapic_dest(void)
4058{ 4111{
4059 int pin, ioapic = 0, irq, irq_entry; 4112 int pin, ioapic, irq, irq_entry;
4060 struct irq_desc *desc; 4113 struct irq_desc *desc;
4061 const struct cpumask *mask; 4114 const struct cpumask *mask;
4062 4115
4063 if (skip_ioapic_setup == 1) 4116 if (skip_ioapic_setup == 1)
4064 return; 4117 return;
4065 4118
4066#ifdef CONFIG_ACPI 4119 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4067 if (!acpi_disabled && acpi_ioapic) {
4068 ioapic = mp_find_ioapic(0);
4069 if (ioapic < 0)
4070 ioapic = 0;
4071 }
4072#endif
4073
4074 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4120 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4075 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4121 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4076 if (irq_entry == -1) 4122 if (irq_entry == -1)
4077 continue; 4123 continue;
4078 irq = pin_2_irq(irq_entry, ioapic, pin); 4124 irq = pin_2_irq(irq_entry, ioapic, pin);
4079 4125
4126 if ((ioapic > 0) && (irq > 16))
4127 continue;
4128
4080 desc = irq_to_desc(irq); 4129 desc = irq_to_desc(irq);
4081 4130
4082 /* 4131 /*
@@ -4261,3 +4310,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4261 4310
4262 nr_ioapics++; 4311 nr_ioapics++;
4263} 4312}
4313
4314/* Enable IOAPIC early just for system timer */
4315void __init pre_init_apic_IRQ0(void)
4316{
4317 struct irq_cfg *cfg;
4318 struct irq_desc *desc;
4319
4320 printk(KERN_INFO "Early APIC setup for system timer0\n");
4321#ifndef CONFIG_SMP
4322 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
4323#endif
4324 desc = irq_to_desc_alloc_node(0, 0);
4325
4326 setup_local_APIC();
4327
4328 cfg = irq_cfg(0);
4329 add_pin_to_irq_node(cfg, 0, 0, 0);
4330 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
4331
4332 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4333}
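
The io_apic.c hunks above route every 8259A operation (disable_8259A_irq(), enable_8259A_irq(), init_8259A(), make_8259A_irq(), i8259A_irq_pending(), nr_legacy_irqs) through a legacy_pic operations table, so the IO-APIC code keeps working when no i8259 is actually driven, and they convert the ioapic/vector/i8259A locks to raw spinlocks. The stand-alone C sketch below shows only the ops-table indirection; the type and instance names (pic_chip, legacy_pic_ops, i8259_pic, null_pic) are placeholders for illustration, not the kernel's own definitions.

#include <stdio.h>

struct pic_chip {
	void (*mask)(unsigned int irq);
	void (*unmask)(unsigned int irq);
};

struct legacy_pic_ops {
	int nr_legacy_irqs;
	struct pic_chip *chip;
	void (*init)(int auto_eoi);
	void (*make_irq)(unsigned int irq);
	int (*irq_pending)(unsigned int irq);
};

/* A backend that talks to a real i8259-style PIC (stubbed with printf). */
static void i8259_mask(unsigned int irq)     { printf("mask 8259 irq %u\n", irq); }
static void i8259_unmask(unsigned int irq)   { printf("unmask 8259 irq %u\n", irq); }
static void i8259_init(int auto_eoi)         { printf("init 8259 (auto_eoi=%d)\n", auto_eoi); }
static void i8259_make_irq(unsigned int irq) { printf("route irq %u to 8259\n", irq); }
static int  i8259_pending(unsigned int irq)  { (void)irq; return 0; }

static struct pic_chip i8259_chip = { i8259_mask, i8259_unmask };

static struct legacy_pic_ops i8259_pic = {
	.nr_legacy_irqs	= 16,
	.chip		= &i8259_chip,
	.init		= i8259_init,
	.make_irq	= i8259_make_irq,
	.irq_pending	= i8259_pending,
};

/* A do-nothing backend for platforms without a legacy PIC. */
static void nop_mask(unsigned int irq)     { (void)irq; }
static void nop_init(int auto_eoi)         { (void)auto_eoi; }
static void nop_make_irq(unsigned int irq) { (void)irq; }
static int  nop_pending(unsigned int irq)  { (void)irq; return 0; }

static struct pic_chip nop_chip = { nop_mask, nop_mask };

static struct legacy_pic_ops null_pic = {
	.nr_legacy_irqs	= 0,
	.chip		= &nop_chip,
	.init		= nop_init,
	.make_irq	= nop_make_irq,
	.irq_pending	= nop_pending,
};

static struct legacy_pic_ops *legacy_pic = &i8259_pic;

int main(void)
{
	unsigned int irq = 0;

	/* Call sites check nr_legacy_irqs and go through the ops table,
	 * much as the converted io_apic.c code does. */
	if (irq < (unsigned int)legacy_pic->nr_legacy_irqs)
		legacy_pic->chip->mask(irq);		/* masks via the 8259 backend */

	legacy_pic = &null_pic;				/* e.g. a PIC-less platform */
	if (irq < (unsigned int)legacy_pic->nr_legacy_irqs)
		legacy_pic->chip->mask(irq);		/* skipped: nr_legacy_irqs == 0 */

	return 0;
}
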
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 0159a69396cb..1edaf15c0b8e 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/slab.h>
21#include <linux/sysdev.h> 22#include <linux/sysdev.h>
22#include <linux/sysctl.h> 23#include <linux/sysctl.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -177,7 +178,7 @@ int __init check_nmi_watchdog(void)
177error: 178error:
178 if (nmi_watchdog == NMI_IO_APIC) { 179 if (nmi_watchdog == NMI_IO_APIC) {
179 if (!timer_through_8259) 180 if (!timer_through_8259)
180 disable_8259A_irq(0); 181 legacy_pic->chip->mask(0);
181 on_each_cpu(__acpi_nmi_disable, NULL, 1); 182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
182 } 183 }
183 184
@@ -416,13 +417,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
416 417
417 /* We can be called before check_nmi_watchdog, hence NULL check. */ 418 /* We can be called before check_nmi_watchdog, hence NULL check. */
418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 419 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 420 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
420 421
421 spin_lock(&lock); 422 raw_spin_lock(&lock);
422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 423 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
423 show_regs(regs); 424 show_regs(regs);
424 dump_stack(); 425 dump_stack();
425 spin_unlock(&lock); 426 raw_spin_unlock(&lock);
426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 427 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
427 428
428 rc = 1; 429 rc = 1;
@@ -438,8 +439,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
438 * Ayiee, looks like this CPU is stuck ... 439 * Ayiee, looks like this CPU is stuck ...
439 * wait a few IRQs (5 seconds) before doing the oops ... 440 * wait a few IRQs (5 seconds) before doing the oops ...
440 */ 441 */
441 __this_cpu_inc(per_cpu_var(alert_counter)); 442 __this_cpu_inc(alert_counter);
442 if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) 443 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
443 /* 444 /*
444 * die_nmi will return ONLY if NOTIFY_STOP happens.. 445 * die_nmi will return ONLY if NOTIFY_STOP happens..
445 */ 446 */
@@ -447,7 +448,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
447 regs, panic_on_timeout); 448 regs, panic_on_timeout);
448 } else { 449 } else {
449 __get_cpu_var(last_irq_sum) = sum; 450 __get_cpu_var(last_irq_sum) = sum;
450 __this_cpu_write(per_cpu_var(alert_counter), 0); 451 __this_cpu_write(alert_counter, 0);
451 } 452 }
452 453
453 /* see if the nmi watchdog went off */ 454 /* see if the nmi watchdog went off */
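
Besides converting the backtrace lock to a raw spinlock and dropping per_cpu_var() from the __this_cpu accessors, the nmi.c hunk keeps the stuck-CPU heuristic visible above: a per-CPU alert counter is bumped on every watchdog tick while the interrupt count stands still, and die_nmi() fires once it reaches 5 * nmi_hz. A reduced user-space sketch of just that counter logic follows; struct cpu_state, watchdog_tick() and the sample numbers are invented for illustration.

#include <stdio.h>

struct cpu_state {
	unsigned int last_irq_sum;
	unsigned int alert_counter;
};

static const unsigned int nmi_hz = 1;	/* watchdog ticks per second, assumed */

static int watchdog_tick(struct cpu_state *st, unsigned int irq_sum)
{
	if (irq_sum == st->last_irq_sum) {
		/* No interrupt progress since the last tick. */
		st->alert_counter++;
		if (st->alert_counter == 5 * nmi_hz)
			return 1;	/* the kernel would call die_nmi() here */
	} else {
		st->last_irq_sum = irq_sum;
		st->alert_counter = 0;
	}
	return 0;
}

int main(void)
{
	struct cpu_state st = { 42, 0 };
	unsigned int tick;

	/* Interrupt count frozen at 42: the watchdog trips on the 5th tick. */
	for (tick = 1; tick <= 5; tick++)
		if (watchdog_tick(&st, 42))
			printf("stuck CPU detected at tick %u\n", tick);
	return 0;
}
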
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 98c4665f251c..3e28401f161c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
225 225
226 mpc_record = 0; 226 mpc_record = 0;
227 printk(KERN_INFO 227 printk(KERN_INFO
228 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); 228 "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
229 229
230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { 230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
231 printk(KERN_WARNING 231 printk(KERN_WARNING
@@ -277,6 +277,7 @@ static __init void early_check_numaq(void)
277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; 277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; 278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
279 x86_init.timers.tsc_pre_init = numaq_tsc_init; 279 x86_init.timers.tsc_pre_init = numaq_tsc_init;
280 x86_init.pci.init = pci_numaq_init;
280 } 281 }
281} 282}
282 283
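
The one-line numaq_32.c change plugs the platform's PCI setup (pci_numaq_init) into the x86_init hook table during early probing, so the default PCI init is replaced before it ever runs. A minimal sketch of that override pattern; the hook-table and function names below are stand-ins, not the kernel's x86_init definitions.

#include <stdio.h>

struct platform_hooks {
	int (*pci_init)(void);
};

static int default_pci_init(void) { printf("default PCI init\n"); return 0; }
static int numaq_pci_init(void)   { printf("NUMAQ PCI init\n");   return 0; }

/* Hook table starts out pointing at the generic implementation. */
static struct platform_hooks x86_init_sketch = {
	.pci_init = default_pci_init,
};

static void early_check_platform(int is_numaq)
{
	if (is_numaq)
		x86_init_sketch.pci_init = numaq_pci_init;	/* override, as above */
}

int main(void)
{
	early_check_platform(1);
	return x86_init_sketch.pci_init();	/* runs the platform-specific hook */
}
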
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1a6559f6768c..99d2fe016084 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,7 +52,32 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
56{
57 int version = apic_version[boot_cpu_physical_apicid];
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78}
79
80static void setup_apic_flat_routing(void)
56{ 81{
57#ifdef CONFIG_X86_IO_APIC 82#ifdef CONFIG_X86_IO_APIC
58 printk(KERN_INFO 83 printk(KERN_INFO
@@ -103,7 +128,7 @@ struct apic apic_default = {
103 .init_apic_ldr = default_init_apic_ldr, 128 .init_apic_ldr = default_init_apic_ldr,
104 129
105 .ioapic_phys_id_map = default_ioapic_phys_id_map, 130 .ioapic_phys_id_map = default_ioapic_phys_id_map,
106 .setup_apic_routing = default_setup_apic_routing, 131 .setup_apic_routing = setup_apic_flat_routing,
107 .multi_timer_check = NULL, 132 .multi_timer_check = NULL,
108 .apicid_to_node = default_apicid_to_node, 133 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1c..83e9be4778e2 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
67 } 67 }
68#endif 68#endif
69 69
70 if (apic == &apic_flat) { 70 if (apic == &apic_flat && num_possible_cpus() > 8)
71 switch (boot_cpu_data.x86_vendor) { 71 apic = &apic_physflat;
72 case X86_VENDOR_INTEL:
73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
80 }
81 72
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83 74
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d56b0efb2057..c085d52dbaf2 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -17,9 +17,12 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/timer.h> 19#include <linux/timer.h>
20#include <linux/slab.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h>
25#include <linux/kdebug.h>
23 26
24#include <asm/uv/uv_mmrs.h> 27#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h> 28#include <asm/uv/uv_hub.h>
@@ -34,8 +37,13 @@
34 37
35DEFINE_PER_CPU(int, x2apic_extra_bits); 38DEFINE_PER_CPU(int, x2apic_extra_bits);
36 39
40#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
41
37static enum uv_system_type uv_system_type; 42static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr; 43static u64 gru_start_paddr, gru_end_paddr;
44int uv_min_hub_revision_id;
45EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
46static DEFINE_SPINLOCK(uv_nmi_lock);
39 47
40static inline bool is_GRU_range(u64 start, u64 end) 48static inline bool is_GRU_range(u64 start, u64 end)
41{ 49{
@@ -55,20 +63,28 @@ static int early_get_nodeid(void)
55 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); 63 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
56 node_id.v = *mmr; 64 node_id.v = *mmr;
57 early_iounmap(mmr, sizeof(*mmr)); 65 early_iounmap(mmr, sizeof(*mmr));
66
67 /* Currently, all blades have same revision number */
68 uv_min_hub_revision_id = node_id.s.revision;
69
58 return node_id.s.node_id; 70 return node_id.s.node_id;
59} 71}
60 72
61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 73static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
62{ 74{
75 int nodeid;
76
63 if (!strcmp(oem_id, "SGI")) { 77 if (!strcmp(oem_id, "SGI")) {
78 nodeid = early_get_nodeid();
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 79 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
80 x86_platform.nmi_init = uv_nmi_init;
65 if (!strcmp(oem_table_id, "UVL")) 81 if (!strcmp(oem_table_id, "UVL"))
66 uv_system_type = UV_LEGACY_APIC; 82 uv_system_type = UV_LEGACY_APIC;
67 else if (!strcmp(oem_table_id, "UVX")) 83 else if (!strcmp(oem_table_id, "UVX"))
68 uv_system_type = UV_X2APIC; 84 uv_system_type = UV_X2APIC;
69 else if (!strcmp(oem_table_id, "UVH")) { 85 else if (!strcmp(oem_table_id, "UVH")) {
70 __get_cpu_var(x2apic_extra_bits) = 86 __get_cpu_var(x2apic_extra_bits) =
71 early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); 87 nodeid << (UV_APIC_PNODE_SHIFT - 1);
72 uv_system_type = UV_NON_UNIQUE_APIC; 88 uv_system_type = UV_NON_UNIQUE_APIC;
73 return 1; 89 return 1;
74 } 90 }
@@ -105,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
105unsigned long sn_rtc_cycles_per_second; 121unsigned long sn_rtc_cycles_per_second;
106EXPORT_SYMBOL(sn_rtc_cycles_per_second); 122EXPORT_SYMBOL(sn_rtc_cycles_per_second);
107 123
108/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
109
110static const struct cpumask *uv_target_cpus(void) 124static const struct cpumask *uv_target_cpus(void)
111{ 125{
112 return cpumask_of(0); 126 return cpu_online_mask;
113} 127}
114 128
115static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) 129static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -374,13 +388,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
374 388
375enum map_type {map_wb, map_uc}; 389enum map_type {map_wb, map_uc};
376 390
377static __init void map_high(char *id, unsigned long base, int shift, 391static __init void map_high(char *id, unsigned long base, int pshift,
378 int max_pnode, enum map_type map_type) 392 int bshift, int max_pnode, enum map_type map_type)
379{ 393{
380 unsigned long bytes, paddr; 394 unsigned long bytes, paddr;
381 395
382 paddr = base << shift; 396 paddr = base << pshift;
383 bytes = (1UL << shift) * (max_pnode + 1); 397 bytes = (1UL << bshift) * (max_pnode + 1);
384 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 398 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
385 paddr + bytes); 399 paddr + bytes);
386 if (map_type == map_uc) 400 if (map_type == map_uc)
@@ -396,7 +410,7 @@ static __init void map_gru_high(int max_pnode)
396 410
397 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 411 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
398 if (gru.s.enable) { 412 if (gru.s.enable) {
399 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 413 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
400 gru_start_paddr = ((u64)gru.s.base << shift); 414 gru_start_paddr = ((u64)gru.s.base << shift);
401 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); 415 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
402 416
@@ -410,7 +424,7 @@ static __init void map_mmr_high(int max_pnode)
410 424
411 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 425 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
412 if (mmr.s.enable) 426 if (mmr.s.enable)
413 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); 427 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
414} 428}
415 429
416static __init void map_mmioh_high(int max_pnode) 430static __init void map_mmioh_high(int max_pnode)
@@ -420,7 +434,8 @@ static __init void map_mmioh_high(int max_pnode)
420 434
421 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 435 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
422 if (mmioh.s.enable) 436 if (mmioh.s.enable)
423 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 437 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
438 max_pnode, map_uc);
424} 439}
425 440
426static __init void map_low_mmrs(void) 441static __init void map_low_mmrs(void)
@@ -472,7 +487,7 @@ static void uv_heartbeat(unsigned long ignored)
472 487
473static void __cpuinit uv_heartbeat_enable(int cpu) 488static void __cpuinit uv_heartbeat_enable(int cpu)
474{ 489{
475 if (!uv_cpu_hub_info(cpu)->scir.enabled) { 490 while (!uv_cpu_hub_info(cpu)->scir.enabled) {
476 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; 491 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
477 492
478 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); 493 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -480,11 +495,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
480 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; 495 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
481 add_timer_on(timer, cpu); 496 add_timer_on(timer, cpu);
482 uv_cpu_hub_info(cpu)->scir.enabled = 1; 497 uv_cpu_hub_info(cpu)->scir.enabled = 1;
483 }
484 498
485 /* check boot cpu */ 499 /* also ensure that boot cpu is enabled */
486 if (!uv_cpu_hub_info(0)->scir.enabled) 500 cpu = 0;
487 uv_heartbeat_enable(0); 501 }
488} 502}
489 503
490#ifdef CONFIG_HOTPLUG_CPU 504#ifdef CONFIG_HOTPLUG_CPU
@@ -543,6 +557,30 @@ late_initcall(uv_init_heartbeat);
543 557
544#endif /* !CONFIG_HOTPLUG_CPU */ 558#endif /* !CONFIG_HOTPLUG_CPU */
545 559
560/* Direct Legacy VGA I/O traffic to designated IOH */
561int uv_set_vga_state(struct pci_dev *pdev, bool decode,
562 unsigned int command_bits, bool change_bridge)
563{
564 int domain, bus, rc;
565
566 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
567 pdev->devfn, decode, command_bits, change_bridge);
568
569 if (!change_bridge)
570 return 0;
571
572 if ((command_bits & PCI_COMMAND_IO) == 0)
573 return 0;
574
575 domain = pci_domain_nr(pdev->bus);
576 bus = pdev->bus->number;
577
578 rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
579 PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
580
581 return rc;
582}
583
546/* 584/*
547 * Called on each cpu to initialize the per_cpu UV data area. 585 * Called on each cpu to initialize the per_cpu UV data area.
548 * FIXME: hotplug not supported yet 586 * FIXME: hotplug not supported yet
@@ -559,6 +597,46 @@ void __cpuinit uv_cpu_init(void)
559 set_x2apic_extra_bits(uv_hub_info->pnode); 597 set_x2apic_extra_bits(uv_hub_info->pnode);
560} 598}
561 599
600/*
601 * When NMI is received, print a stack trace.
602 */
603int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{
605 if (reason != DIE_NMI_IPI)
606 return NOTIFY_OK;
607 /*
608 * Use a lock so only one cpu prints at a time
609 * to prevent intermixed output.
610 */
611 spin_lock(&uv_nmi_lock);
612 pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
613 dump_stack();
614 spin_unlock(&uv_nmi_lock);
615
616 return NOTIFY_STOP;
617}
618
619static struct notifier_block uv_dump_stack_nmi_nb = {
620 .notifier_call = uv_handle_nmi
621};
622
623void uv_register_nmi_notifier(void)
624{
625 if (register_die_notifier(&uv_dump_stack_nmi_nb))
626 printk(KERN_WARNING "UV NMI handler failed to register\n");
627}
628
629void uv_nmi_init(void)
630{
631 unsigned int value;
632
633 /*
634 * Unmask NMI on all cpus
635 */
636 value = apic_read(APIC_LVT1) | APIC_DM_NMI;
637 value &= ~APIC_LVT_MASKED;
638 apic_write(APIC_LVT1, value);
639}
562 640
563void __init uv_system_init(void) 641void __init uv_system_init(void)
564{ 642{
@@ -624,13 +702,15 @@ void __init uv_system_init(void)
624 } 702 }
625 703
626 uv_bios_init(); 704 uv_bios_init();
627 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 705 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
628 &sn_coherency_id, &sn_region_size); 706 &sn_region_size, &system_serial_number);
629 uv_rtc_init(); 707 uv_rtc_init();
630 708
631 for_each_present_cpu(cpu) { 709 for_each_present_cpu(cpu) {
710 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
711
632 nid = cpu_to_node(cpu); 712 nid = cpu_to_node(cpu);
633 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 713 pnode = uv_apicid_to_pnode(apicid);
634 blade = boot_pnode_to_blade(pnode); 714 blade = boot_pnode_to_blade(pnode);
635 lcpu = uv_blade_info[blade].nr_possible_cpus; 715 lcpu = uv_blade_info[blade].nr_possible_cpus;
636 uv_blade_info[blade].nr_possible_cpus++; 716 uv_blade_info[blade].nr_possible_cpus++;
@@ -651,15 +731,13 @@ void __init uv_system_init(void)
651 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 731 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
652 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 732 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
653 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 733 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
654 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 734 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
655 uv_node_to_blade[nid] = blade; 735 uv_node_to_blade[nid] = blade;
656 uv_cpu_to_blade[cpu] = blade; 736 uv_cpu_to_blade[cpu] = blade;
657 max_pnode = max(pnode, max_pnode); 737 max_pnode = max(pnode, max_pnode);
658 738
659 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 739 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
660 "lcpu %d, blade %d\n", 740 cpu, apicid, pnode, nid, lcpu, blade);
661 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
662 lcpu, blade);
663 } 741 }
664 742
665 /* Add blade/pnode info for nodes without cpus */ 743 /* Add blade/pnode info for nodes without cpus */
@@ -680,5 +758,9 @@ void __init uv_system_init(void)
680 758
681 uv_cpu_init(); 759 uv_cpu_init();
682 uv_scir_register_cpu_notifier(); 760 uv_scir_register_cpu_notifier();
761 uv_register_nmi_notifier();
683 proc_mkdir("sgi_uv", NULL); 762 proc_mkdir("sgi_uv", NULL);
763
764 /* register Legacy VGA I/O redirection handler */
765 pci_register_set_vga_state(uv_set_vga_state);
684} 766}
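
Among other things, the x2apic_uv_x.c hunks split map_high()'s single shift into a base shift (pshift) and a size shift (bshift), so the MMIOH window can be sized from mmioh.s.m_io while GRU and MMR keep passing the same shift for both. The arithmetic is shown below with invented input values.

#include <stdio.h>

static void map_high(const char *id, unsigned long base, int pshift,
		     int bshift, int max_pnode)
{
	unsigned long paddr = base << pshift;			/* window base */
	unsigned long bytes = (1UL << bshift) * (max_pnode + 1);	/* window size */

	printf("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes);
}

int main(void)
{
	int shift = 20, m_io = 16, max_pnode = 3;	/* illustrative only */

	map_high("GRU",   0x1UL, shift, shift, max_pnode);	/* same shift twice */
	map_high("MMIOH", 0x2UL, shift, m_io,  max_pnode);	/* smaller window */
	return 0;
}
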
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index b5b6b23bce53..031aa887b0eb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1992 apm_info.disabled = 1; 1992 apm_info.disabled = 1;
1993 printk(KERN_INFO "%s machine detected. " 1993 printk(KERN_INFO "%s machine detected. "
1994 "Disabling APM.\n", d->ident); 1994 "Disabling APM.\n", d->ident);
1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
1996 printk(KERN_INFO "download from support.intel.com \n"); 1996 printk(KERN_INFO "download from support.intel.com\n");
1997 } 1997 }
1998 return 0; 1998 return 0;
1999} 1999}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index b0206a211b09..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 * 17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson 19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */ 20 */
21 21
22#include <linux/efi.h> 22#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
32 struct uv_systab *tab = &uv_systab; 32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
33 34
34 if (!tab->function) 35 if (!tab->function)
35 /* 36 /*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
37 */ 38 */
38 return BIOS_STATUS_UNIMPLEMENTED; 39 return BIOS_STATUS_UNIMPLEMENTED;
39 40
40 return efi_call6((void *)__va(tab->function), 41 ret = efi_call6((void *)__va(tab->function), (u64)which,
41 (u64)which, a1, a2, a3, a4, a5); 42 a1, a2, a3, a4, a5);
43 return ret;
42} 44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
43 46
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, 47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5) 48 u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
73EXPORT_SYMBOL_GPL(sn_coherency_id); 76EXPORT_SYMBOL_GPL(sn_coherency_id);
74long sn_region_size; 77long sn_region_size;
75EXPORT_SYMBOL_GPL(sn_region_size); 78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
76int uv_type; 81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
77 83
78 84
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, 85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region) 86 long *region, long *ssn)
81{ 87{
82 s64 ret; 88 s64 ret;
83 u64 v0, v1; 89 u64 v0, v1;
@@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
97 *coher = part.coherence_id; 103 *coher = part.coherence_id;
98 if (region) 104 if (region)
99 *region = part.region_size; 105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
100 return ret; 108 return ret;
101} 109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
102 111
103int 112int
104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, 113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
@@ -154,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
154} 163}
155EXPORT_SYMBOL_GPL(uv_bios_freq_base); 164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
156 165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
157 185
158#ifdef CONFIG_EFI 186#ifdef CONFIG_EFI
159void uv_bios_init(void) 187void uv_bios_init(void)
@@ -185,4 +213,3 @@ void uv_bios_init(void)
185 213
186void uv_bios_init(void) { } 214void uv_bios_init(void) { }
187#endif 215#endif
188
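
The new uv_bios_set_legacy_vga_target() above is a thin, documented wrapper that packs its typed arguments into the generic uv_bios_call() entry point. A stand-alone sketch of that shape follows, with an invented command number and a stubbed firmware call standing in for the EFI callout.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

enum fw_cmd { FW_SET_LEGACY_VGA_TARGET = 7 };	/* made-up command number */

/* Generic entry point: one command word plus opaque 64-bit arguments. */
static int64_t fw_call(enum fw_cmd which, uint64_t a1, uint64_t a2, uint64_t a3)
{
	printf("fw_call(cmd=%d, %llu, %llu, %llu)\n", which,
	       (unsigned long long)a1, (unsigned long long)a2,
	       (unsigned long long)a3);
	return 0;	/* pretend the firmware succeeded */
}

/* Typed wrapper with a documented contract, like the one added above. */
static int set_legacy_vga_target(bool decode, int domain, int bus)
{
	return (int)fw_call(FW_SET_LEGACY_VGA_TARGET,
			    (uint64_t)decode, (uint64_t)domain, (uint64_t)bus);
}

int main(void)
{
	return set_legacy_vga_target(true, 0, 1);
}
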
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 30f25a75fe28..5de7f4c56971 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -5,7 +5,6 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/acpi.h> 9#include <linux/acpi.h>
11#include <asm/io.h> 10#include <asm/io.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 1d2cb383410e..c202b62f3671 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
21 21
22obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
23
24obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
25obj-$(CONFIG_CPU_SUP_AMD) += amd.o 23obj-$(CONFIG_CPU_SUP_AMD) += amd.o
26obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 24obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 468489b57aae..97ad79cdf688 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
35 { 0, 0, 0, 0 } 39 { 0, 0, 0, 0 }
36 }; 40 };
37 41
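
The four entries added to the scattered-feature table above pull the SVM capability bits (NPT, LBRV, SVML, NRIPS) out of CPUID leaf 0x8000000a, EDX bits 0-3. The same bits can be probed from user space on an x86 host built with GCC or Clang's <cpuid.h>:

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang CPUID helpers */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	static const char *const names[4] = { "NPT", "LBRV", "SVML", "NRIPS" };
	int bit;

	/* __get_cpuid() returns 0 if the requested leaf is not supported. */
	if (!__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx)) {
		printf("CPUID leaf 0x8000000a not available\n");
		return 0;
	}

	for (bit = 0; bit < 4; bit++)
		printf("%-5s: %s\n", names[bit],
		       (edx >> bit) & 1 ? "yes" : "no");
	return 0;
}
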
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index b368cd862997..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr);
34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr);
35static DEFINE_PER_CPU(int, cpud_priv_count);
36
37static DEFINE_MUTEX(cpu_debug_lock);
38
39static struct dentry *cpu_debugfs_dir;
40
41static struct cpu_debug_base cpu_base[] = {
42 { "mc", CPU_MC, 0 },
43 { "monitor", CPU_MONITOR, 0 },
44 { "time", CPU_TIME, 0 },
45 { "pmc", CPU_PMC, 1 },
46 { "platform", CPU_PLATFORM, 0 },
47 { "apic", CPU_APIC, 0 },
48 { "poweron", CPU_POWERON, 0 },
49 { "control", CPU_CONTROL, 0 },
50 { "features", CPU_FEATURES, 0 },
51 { "lastbranch", CPU_LBRANCH, 0 },
52 { "bios", CPU_BIOS, 0 },
53 { "freq", CPU_FREQ, 0 },
54 { "mtrr", CPU_MTRR, 0 },
55 { "perf", CPU_PERF, 0 },
56 { "cache", CPU_CACHE, 0 },
57 { "sysenter", CPU_SYSENTER, 0 },
58 { "therm", CPU_THERM, 0 },
59 { "misc", CPU_MISC, 0 },
60 { "debug", CPU_DEBUG, 0 },
61 { "pat", CPU_PAT, 0 },
62 { "vmx", CPU_VMX, 0 },
63 { "call", CPU_CALL, 0 },
64 { "base", CPU_BASE, 0 },
65 { "ver", CPU_VER, 0 },
66 { "conf", CPU_CONF, 0 },
67 { "smm", CPU_SMM, 0 },
68 { "svm", CPU_SVM, 0 },
69 { "osvm", CPU_OSVM, 0 },
70 { "tss", CPU_TSS, 0 },
71 { "cr", CPU_CR, 0 },
72 { "dt", CPU_DT, 0 },
73 { "registers", CPU_REG_ALL, 0 },
74};
75
76static struct cpu_file_base cpu_file[] = {
77 { "index", CPU_REG_ALL, 0 },
78 { "value", CPU_REG_ALL, 1 },
79};
80
81/* CPU Registers Range */
82static struct cpu_debug_range cpu_reg_range[] = {
83 { 0x00000000, 0x00000001, CPU_MC, },
84 { 0x00000006, 0x00000007, CPU_MONITOR, },
85 { 0x00000010, 0x00000010, CPU_TIME, },
86 { 0x00000011, 0x00000013, CPU_PMC, },
87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
88 { 0x0000001B, 0x0000001B, CPU_APIC, },
89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
94 { 0x00000079, 0x00000079, CPU_BIOS, },
95 { 0x00000088, 0x0000008A, CPU_CACHE, },
96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
98 { 0x000000C1, 0x000000C4, CPU_PMC, },
99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
100 { 0x000000E7, 0x000000E8, CPU_PERF, },
101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
102
103 { 0x00000116, 0x0000011E, CPU_CACHE, },
104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
105 { 0x00000179, 0x0000017B, CPU_MC, },
106 { 0x00000186, 0x00000189, CPU_PMC, },
107 { 0x00000198, 0x00000199, CPU_PERF, },
108 { 0x0000019A, 0x0000019A, CPU_TIME, },
109 { 0x0000019B, 0x0000019D, CPU_THERM, },
110 { 0x000001A0, 0x000001A0, CPU_MISC, },
111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
115
116 { 0x00000200, 0x0000020F, CPU_MTRR, },
117 { 0x00000250, 0x00000250, CPU_MTRR, },
118 { 0x00000258, 0x00000259, CPU_MTRR, },
119 { 0x00000268, 0x0000026F, CPU_MTRR, },
120 { 0x00000277, 0x00000277, CPU_PAT, },
121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
122
123 { 0x00000300, 0x00000311, CPU_PMC, },
124 { 0x00000345, 0x00000345, CPU_PMC, },
125 { 0x00000360, 0x00000371, CPU_PMC, },
126 { 0x0000038D, 0x00000390, CPU_PMC, },
127 { 0x000003A0, 0x000003BE, CPU_PMC, },
128 { 0x000003C0, 0x000003CD, CPU_PMC, },
129 { 0x000003E0, 0x000003E1, CPU_PMC, },
130 { 0x000003F0, 0x000003F2, CPU_PMC, },
131
132 { 0x00000400, 0x00000417, CPU_MC, },
133 { 0x00000480, 0x0000048B, CPU_VMX, },
134
135 { 0x00000600, 0x00000600, CPU_DEBUG, },
136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
138
139 { 0x000107CC, 0x000107D3, CPU_PMC, },
140
141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
142 { 0xC0000081, 0xC0000084, CPU_CALL, },
143 { 0xC0000100, 0xC0000102, CPU_BASE, },
144 { 0xC0000103, 0xC0000103, CPU_TIME, },
145
146 { 0xC0010000, 0xC0010007, CPU_PMC, },
147 { 0xC0010010, 0xC0010010, CPU_CONF, },
148 { 0xC0010015, 0xC0010015, CPU_CONF, },
149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
151 { 0xC001001F, 0xC001001F, CPU_CONF, },
152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
153 { 0xC0010044, 0xC0010048, CPU_MC, },
154 { 0xC0010050, 0xC0010056, CPU_SMM, },
155 { 0xC0010058, 0xC0010058, CPU_CONF, },
156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
157 { 0xC0010061, 0xC0010068, CPU_SMM, },
158 { 0xC0010069, 0xC001006B, CPU_SMM, },
159 { 0xC0010070, 0xC0010071, CPU_SMM, },
160 { 0xC0010111, 0xC0010113, CPU_SMM, },
161 { 0xC0010114, 0xC0010118, CPU_SVM, },
162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
163 { 0xC0011022, 0xC0011023, CPU_CONF, },
164};
165
166static int is_typeflag_valid(unsigned cpu, unsigned flag)
167{
168 int i;
169
170 /* Standard Registers should be always valid */
171 if (flag >= CPU_TSS)
172 return 1;
173
174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
175 if (cpu_reg_range[i].flag == flag)
176 return 1;
177 }
178
179 /* Invalid */
180 return 0;
181}
182
183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
184 int index, unsigned flag)
185{
186 if (cpu_reg_range[index].flag == flag) {
187 *min = cpu_reg_range[index].min;
188 *max = cpu_reg_range[index].max;
189 } else
190 *max = 0;
191
192 return *max;
193}
194
195/* This function can also be called with seq = NULL for printk */
196static void print_cpu_data(struct seq_file *seq, unsigned type,
197 u32 low, u32 high)
198{
199 struct cpu_private *priv;
200 u64 val = high;
201
202 if (seq) {
203 priv = seq->private;
204 if (priv->file) {
205 val = (val << 32) | low;
206 seq_printf(seq, "0x%llx\n", val);
207 } else
208 seq_printf(seq, " %08x: %08x_%08x\n",
209 type, high, low);
210 } else
211 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
212}
213
214/* This function can also be called with seq = NULL for printk */
215static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
216{
217 unsigned msr, msr_min, msr_max;
218 struct cpu_private *priv;
219 u32 low, high;
220 int i;
221
222 if (seq) {
223 priv = seq->private;
224 if (priv->file) {
225 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
226 &low, &high))
227 print_cpu_data(seq, priv->reg, low, high);
228 return;
229 }
230 }
231
232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
234 continue;
235
236 for (msr = msr_min; msr <= msr_max; msr++) {
237 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
238 continue;
239 print_cpu_data(seq, msr, low, high);
240 }
241 }
242}
243
244static void print_tss(void *arg)
245{
246 struct pt_regs *regs = task_pt_regs(current);
247 struct seq_file *seq = arg;
248 unsigned int seg;
249
250 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
251 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
252 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
253 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
254
255 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
256 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
257 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
258 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
259
260#ifdef CONFIG_X86_64
261 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
262 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
263 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
264 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
265 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
266 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
267 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
268 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
269#endif
270
271 asm("movl %%cs,%0" : "=r" (seg));
272 seq_printf(seq, " CS\t: %04x\n", seg);
273 asm("movl %%ds,%0" : "=r" (seg));
274 seq_printf(seq, " DS\t: %04x\n", seg);
275 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
276 asm("movl %%es,%0" : "=r" (seg));
277 seq_printf(seq, " ES\t: %04x\n", seg);
278 asm("movl %%fs,%0" : "=r" (seg));
279 seq_printf(seq, " FS\t: %04x\n", seg);
280 asm("movl %%gs,%0" : "=r" (seg));
281 seq_printf(seq, " GS\t: %04x\n", seg);
282
283 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
284
285 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
286}
287
288static void print_cr(void *arg)
289{
290 struct seq_file *seq = arg;
291
292 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
293 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
294 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
295 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
296#ifdef CONFIG_X86_64
297 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
298#endif
299}
300
301static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
302{
303 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
304}
305
306static void print_dt(void *seq)
307{
308 struct desc_ptr dt;
309 unsigned long ldt;
310
311 /* IDT */
312 store_idt((struct desc_ptr *)&dt);
313 print_desc_ptr("IDT", seq, dt);
314
315 /* GDT */
316 store_gdt((struct desc_ptr *)&dt);
317 print_desc_ptr("GDT", seq, dt);
318
319 /* LDT */
320 store_ldt(ldt);
321 seq_printf(seq, " LDT\t: %016lx\n", ldt);
322
323 /* TR */
324 store_tr(ldt);
325 seq_printf(seq, " TR\t: %016lx\n", ldt);
326}
327
328static void print_dr(void *arg)
329{
330 struct seq_file *seq = arg;
331 unsigned long dr;
332 int i;
333
334 for (i = 0; i < 8; i++) {
335 /* Ignore db4, db5 */
336 if ((i == 4) || (i == 5))
337 continue;
338 get_debugreg(dr, i);
339 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
340 }
341
342 seq_printf(seq, "\n MSR\t:\n");
343}
344
345static void print_apic(void *arg)
346{
347 struct seq_file *seq = arg;
348
349#ifdef CONFIG_X86_LOCAL_APIC
350 seq_printf(seq, " LAPIC\t:\n");
351 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
352 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
353 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
354 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
355 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
356 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
357 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
358 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
359 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
360 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
361 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
362 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
363 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
364 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
365 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
366 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
367 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
368 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
386 seq_printf(seq, "\n MSR\t:\n");
387}
388
389static int cpu_seq_show(struct seq_file *seq, void *v)
390{
391 struct cpu_private *priv = seq->private;
392
393 if (priv == NULL)
394 return -EINVAL;
395
396 switch (cpu_base[priv->type].flag) {
397 case CPU_TSS:
398 smp_call_function_single(priv->cpu, print_tss, seq, 1);
399 break;
400 case CPU_CR:
401 smp_call_function_single(priv->cpu, print_cr, seq, 1);
402 break;
403 case CPU_DT:
404 smp_call_function_single(priv->cpu, print_dt, seq, 1);
405 break;
406 case CPU_DEBUG:
407 if (priv->file == CPU_INDEX_BIT)
408 smp_call_function_single(priv->cpu, print_dr, seq, 1);
409 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
410 break;
411 case CPU_APIC:
412 if (priv->file == CPU_INDEX_BIT)
413 smp_call_function_single(priv->cpu, print_apic, seq, 1);
414 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
415 break;
416
417 default:
418 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
419 break;
420 }
421 seq_printf(seq, "\n");
422
423 return 0;
424}
425
426static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
427{
428 if (*pos == 0) /* One time is enough ;-) */
429 return seq;
430
431 return NULL;
432}
433
434static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435{
436 (*pos)++;
437
438 return cpu_seq_start(seq, pos);
439}
440
441static void cpu_seq_stop(struct seq_file *seq, void *v)
442{
443}
444
445static const struct seq_operations cpu_seq_ops = {
446 .start = cpu_seq_start,
447 .next = cpu_seq_next,
448 .stop = cpu_seq_stop,
449 .show = cpu_seq_show,
450};
451
452static int cpu_seq_open(struct inode *inode, struct file *file)
453{
454 struct cpu_private *priv = inode->i_private;
455 struct seq_file *seq;
456 int err;
457
458 err = seq_open(file, &cpu_seq_ops);
459 if (!err) {
460 seq = file->private_data;
461 seq->private = priv;
462 }
463
464 return err;
465}
466
467static int write_msr(struct cpu_private *priv, u64 val)
468{
469 u32 low, high;
470
471 high = (val >> 32) & 0xffffffff;
472 low = val & 0xffffffff;
473
474 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
475 return 0;
476
477 return -EPERM;
478}
479
480static int write_cpu_register(struct cpu_private *priv, const char *buf)
481{
482 int ret = -EPERM;
483 u64 val;
484
485 ret = strict_strtoull(buf, 0, &val);
486 if (ret < 0)
487 return ret;
488
489 /* Supporting only MSRs */
490 if (priv->type < CPU_TSS_BIT)
491 return write_msr(priv, val);
492
493 return ret;
494}
495
496static ssize_t cpu_write(struct file *file, const char __user *ubuf,
497 size_t count, loff_t *off)
498{
499 struct seq_file *seq = file->private_data;
500 struct cpu_private *priv = seq->private;
501 char buf[19];
502
503 if ((priv == NULL) || (count >= sizeof(buf)))
504 return -EINVAL;
505
506 if (copy_from_user(&buf, ubuf, count))
507 return -EFAULT;
508
509 buf[count] = 0;
510
511 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
512 if (!write_cpu_register(priv, buf))
513 return count;
514
515 return -EACCES;
516}
517
518static const struct file_operations cpu_fops = {
519 .owner = THIS_MODULE,
520 .open = cpu_seq_open,
521 .read = seq_read,
522 .write = cpu_write,
523 .llseek = seq_lseek,
524 .release = seq_release,
525};
526
527static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
528 unsigned file, struct dentry *dentry)
529{
530 struct cpu_private *priv = NULL;
531
 532	/* Already initialized */
533 if (file == CPU_INDEX_BIT)
534 if (per_cpu(cpud_arr[type].init, cpu))
535 return 0;
536
537 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
538 if (priv == NULL)
539 return -ENOMEM;
540
541 priv->cpu = cpu;
542 priv->type = type;
543 priv->reg = reg;
544 priv->file = file;
545 mutex_lock(&cpu_debug_lock);
546 per_cpu(cpud_priv_arr[type], cpu) = priv;
547 per_cpu(cpud_priv_count, cpu)++;
548 mutex_unlock(&cpu_debug_lock);
549
550 if (file)
551 debugfs_create_file(cpu_file[file].name, S_IRUGO,
552 dentry, (void *)priv, &cpu_fops);
553 else {
554 debugfs_create_file(cpu_base[type].name, S_IRUGO,
555 per_cpu(cpud_arr[type].dentry, cpu),
556 (void *)priv, &cpu_fops);
557 mutex_lock(&cpu_debug_lock);
558 per_cpu(cpud_arr[type].init, cpu) = 1;
559 mutex_unlock(&cpu_debug_lock);
560 }
561
562 return 0;
563}
564
565static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
566 struct dentry *dentry)
567{
568 unsigned file;
569 int err = 0;
570
571 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
572 err = cpu_create_file(cpu, type, reg, file, dentry);
573 if (err)
574 return err;
575 }
576
577 return err;
578}
579
580static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
581{
582 struct dentry *cpu_dentry = NULL;
583 unsigned reg, reg_min, reg_max;
584 int i, err = 0;
585 char reg_dir[12];
586 u32 low, high;
587
588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
590 cpu_base[type].flag))
591 continue;
592
593 for (reg = reg_min; reg <= reg_max; reg++) {
594 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
595 continue;
596
597 sprintf(reg_dir, "0x%x", reg);
598 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
599 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
600 if (err)
601 return err;
602 }
603 }
604
605 return err;
606}
607
608static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
609{
610 struct dentry *cpu_dentry = NULL;
611 unsigned type;
612 int err = 0;
613
614 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
615 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
616 continue;
617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
618 per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry;
619
620 if (type < CPU_TSS_BIT)
621 err = cpu_init_msr(cpu, type, cpu_dentry);
622 else
623 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
624 cpu_dentry);
625 if (err)
626 return err;
627 }
628
629 return err;
630}
631
632static int cpu_init_cpu(void)
633{
634 struct dentry *cpu_dentry = NULL;
635 struct cpuinfo_x86 *cpui;
636 char cpu_dir[12];
637 unsigned cpu;
638 int err = 0;
639
640 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
641 cpui = &cpu_data(cpu);
642 if (!cpu_has(cpui, X86_FEATURE_MSR))
643 continue;
644
645 sprintf(cpu_dir, "cpu%d", cpu);
646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
647 err = cpu_init_allreg(cpu, cpu_dentry);
648
649 pr_info("cpu%d(%d) debug files %d\n",
650 cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu));
651 if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) {
652 pr_err("Register files count %d exceeds limit %d\n",
653 per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES);
654 per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES;
655 err = -ENFILE;
656 }
657 if (err)
658 return err;
659 }
660
661 return err;
662}
663
664static int __init cpu_debug_init(void)
665{
666 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
667
668 return cpu_init_cpu();
669}
670
671static void __exit cpu_debug_exit(void)
672{
673 int i, cpu;
674
675 if (cpu_debugfs_dir)
676 debugfs_remove_recursive(cpu_debugfs_dir);
677
678 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
679 for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++)
680 kfree(per_cpu(cpud_priv_arr[i], cpu));
681}
682
683module_init(cpu_debug_init);
684module_exit(cpu_debug_exit);
685
686MODULE_AUTHOR("Jaswinder Singh Rajput");
687MODULE_DESCRIPTION("CPU Debug module");
688MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
 17	  This driver adds support for the Processor Clocking Control (PCC) interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..1840c0a5170b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c80..459168083b77 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h>
36#include <trace/events/power.h> 37#include <trace/events/power.h>
37 38
38#include <linux/acpi.h> 39#include <linux/acpi.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 006b278b0d5d..c587db472a75 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -20,7 +20,6 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22 22
23#include <linux/slab.h>
24#include <linux/delay.h> 23#include <linux/delay.h>
25#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
26 25
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index ac27ec2264d5..16e3483be9e3 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -80,6 +80,7 @@
80#include <linux/cpufreq.h> 80#include <linux/cpufreq.h>
81#include <linux/pci.h> 81#include <linux/pci.h>
82#include <linux/errno.h> 82#include <linux/errno.h>
83#include <linux/slab.h>
83 84
84#include <asm/processor-cyrix.h> 85#include <asm/processor-cyrix.h>
85 86
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index da5f70fcb766..e7b559d74c52 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -9,7 +9,6 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h> 12#include <linux/cpufreq.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15 14
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 869615193720..7b8a8ba67b07 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h> 28#include <linux/cpumask.h>
30#include <linux/timex.h> 29#include <linux/timex.h>
31 30
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..ce7cde713e71
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,621 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
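	/*
	 * Ring the doorbell: keep the bits the firmware asked us to
	 * preserve, OR in the write value, then poll the PCCH status word
	 * for CMD_COMPLETE, giving up silently after POLL_LOOPS iterations.
	 */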
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
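	/*
	 * The low byte of the output buffer appears to hold the current
	 * frequency as a percentage of the nominal (MHz) frequency; scale
	 * it to the kHz units cpufreq expects. For example, a 2800 MHz
	 * nominal frequency and an output byte of 50 yield
	 * 2800 * 50 / 100 * 1000 = 1400000 kHz.
	 */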
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
 182		"0x%x, contains a value of: 0x%x. Speed is: %d kHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
 188		dprintk("get: frequency for cpu %d is being temporarily"
 189			" capped; current frequency is %d kHz\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return -EINVAL;
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
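	/*
	 * Build the request: byte 1 carries the target as a percentage of
	 * the nominal frequency (target_freq is in kHz, nominal in MHz),
	 * and bit 0 appears to mark the buffer as holding a valid command,
	 * mirroring the GET_FREQ path above.
	 */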
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
 275	}
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
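	/*
	 * The first _OSC evaluation, with OSC_QUERY_ENABLE set, only asked
	 * whether PCC control is supported; re-evaluate with the query bit
	 * clear to actually request control of the interface.
	 */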
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle);
410 if (ret)
411 dprintk("probe: _OSC evaluation did not succeed\n");
412 /* Firmware's use of _OSC is optional */
413 ret = 0;
414 }
415
416 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
417 if (ACPI_FAILURE(status))
418 return -ENODEV;
419
420 out_obj = output.pointer;
421 if (out_obj->type != ACPI_TYPE_PACKAGE) {
422 ret = -ENODEV;
423 goto out_free;
424 }
425
426 member = &out_obj->package.elements[0];
427 if (member->type != ACPI_TYPE_BUFFER) {
428 ret = -ENODEV;
429 goto out_free;
430 }
431
432 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
433
434 dprintk("probe: mem_resource descriptor: 0x%x,"
435 " length: %d, space_id: %d, resource_usage: %d,"
436 " type_specific: %d, granularity: 0x%llx,"
437 " minimum: 0x%llx, maximum: 0x%llx,"
438 " translation_offset: 0x%llx, address_length: 0x%llx\n",
439 mem_resource->descriptor, mem_resource->length,
440 mem_resource->space_id, mem_resource->resource_usage,
441 mem_resource->type_specific, mem_resource->granularity,
442 mem_resource->minimum, mem_resource->maximum,
443 mem_resource->translation_offset,
444 mem_resource->address_length);
445
446 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
447 ret = -ENODEV;
448 goto out_free;
449 }
450
451 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
452 mem_resource->address_length);
453 if (pcch_virt_addr == NULL) {
454 dprintk("probe: could not map shared mem region\n");
455 goto out_free;
456 }
457 pcch_hdr = pcch_virt_addr;
458
459 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
460 dprintk("probe: PCCH header is at physical address: 0x%llx,"
461 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
462 " supported features: 0x%x, command field: 0x%x,"
463 " status field: 0x%x, nominal latency: %d us\n",
464 mem_resource->minimum, ioread32(&pcch_hdr->signature),
465 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
466 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
467 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
468 ioread32(&pcch_hdr->latency));
469
470 dprintk("probe: min time between commands: %d us,"
471 " max time between commands: %d us,"
472 " nominal CPU frequency: %d MHz,"
473 " minimum CPU frequency: %d MHz,"
474 " minimum CPU frequency without throttling: %d MHz\n",
475 ioread32(&pcch_hdr->minimum_time),
476 ioread32(&pcch_hdr->maximum_time),
477 ioread32(&pcch_hdr->nominal),
478 ioread32(&pcch_hdr->throttled_frequency),
479 ioread32(&pcch_hdr->minimum_frequency));
480
481 member = &out_obj->package.elements[1];
482 if (member->type != ACPI_TYPE_BUFFER) {
483 ret = -ENODEV;
484 goto pcch_free;
485 }
486
487 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
488
489 doorbell.space_id = reg_resource->space_id;
490 doorbell.bit_width = reg_resource->bit_width;
491 doorbell.bit_offset = reg_resource->bit_offset;
492 doorbell.access_width = 64;
493 doorbell.address = reg_resource->address;
494
495 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
496 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
497 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
498 doorbell.access_width, reg_resource->address);
499
500 member = &out_obj->package.elements[2];
501 if (member->type != ACPI_TYPE_INTEGER) {
502 ret = -ENODEV;
503 goto pcch_free;
504 }
505
506 doorbell_preserve = member->integer.value;
507
508 member = &out_obj->package.elements[3];
509 if (member->type != ACPI_TYPE_INTEGER) {
510 ret = -ENODEV;
511 goto pcch_free;
512 }
513
514 doorbell_write = member->integer.value;
515
516 dprintk("probe: doorbell_preserve: 0x%llx,"
517 " doorbell_write: 0x%llx\n",
518 doorbell_preserve, doorbell_write);
519
520 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
521 if (!pcc_cpu_info) {
522 ret = -ENOMEM;
523 goto pcch_free;
524 }
525
526 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
527 " limits: %d MHz, %d MHz\n", PCC_VERSION,
528 ioread32(&pcch_hdr->minimum_frequency),
529 ioread32(&pcch_hdr->nominal));
530 kfree(output.pointer);
531 return ret;
532pcch_free:
533 pcc_clear_mapping();
534out_free:
535 kfree(output.pointer);
536 return ret;
537}
538
539static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
540{
541 unsigned int cpu = policy->cpu;
542 unsigned int result = 0;
543
544 if (!pcch_virt_addr) {
545 result = -1;
546 goto pcch_null;
547 }
548
549 result = pcc_get_offset(cpu);
550 if (result) {
551 dprintk("init: PCCP evaluation failed\n");
552 goto free;
553 }
554
555 policy->max = policy->cpuinfo.max_freq =
556 ioread32(&pcch_hdr->nominal) * 1000;
557 policy->min = policy->cpuinfo.min_freq =
558 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu);
560
561 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min);
563
564 return 0;
565free:
566 pcc_clear_mapping();
567 free_percpu(pcc_cpu_info);
568pcch_null:
569 return result;
570}
571
572static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
573{
574 return 0;
575}
576
577static struct cpufreq_driver pcc_cpufreq_driver = {
578 .flags = CPUFREQ_CONST_LOOPS,
579 .get = pcc_get_freq,
580 .verify = pcc_cpufreq_verify,
581 .target = pcc_cpufreq_target,
582 .init = pcc_cpufreq_cpu_init,
583 .exit = pcc_cpufreq_cpu_exit,
584 .name = "pcc-cpufreq",
585 .owner = THIS_MODULE,
586};
587
588static int __init pcc_cpufreq_init(void)
589{
590 int ret;
591
592 if (acpi_disabled)
593 return 0;
594
595 ret = pcc_cpufreq_probe();
596 if (ret) {
597 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
598 return ret;
599 }
600
601 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
602
603 return ret;
604}
605
606static void __exit pcc_cpufreq_exit(void)
607{
608 cpufreq_unregister_driver(&pcc_cpufreq_driver);
609
610 pcc_clear_mapping();
611
612 free_percpu(pcc_cpu_info);
613}
614
615MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
616MODULE_VERSION(PCC_VERSION);
617MODULE_DESCRIPTION("Processor Clocking Control interface driver");
618MODULE_LICENSE("GPL");
619
620late_initcall(pcc_cpufreq_init);
621module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index cb01dac267d3..b3379d6a5c57 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -13,7 +13,6 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/ioport.h> 15#include <linux/ioport.h>
16#include <linux/slab.h>
17#include <linux/timex.h> 16#include <linux/timex.h>
18#include <linux/io.h> 17#include <linux/io.h>
19 18
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f125e5c551c0..d360b56e9825 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 807 unsigned int index)
808{ 808{
809 acpi_integer control; 809 u64 control;
810 810
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 812 return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 824{
825 struct cpufreq_frequency_table *powernow_table; 825 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 826 int ret_val = -ENODEV;
827 acpi_integer control, status; 827 u64 control, status;
828 828
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 830 dprintk("register performance failed: bad ACPI data\n");
@@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 948 u32 fid;
949 u32 vid; 949 u32 vid;
950 u32 freq, index; 950 u32 freq, index;
951 acpi_integer status, control; 951 u64 status, control;
952 952
953 if (data->exttype) { 953 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 954 status = data->acpi_data.states[i].status;
@@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1356 1356
1357 kfree(data->powernow_table); 1357 kfree(data->powernow_table);
1358 kfree(data); 1358 kfree(data);
1359 per_cpu(powernow_data, pol->cpu) = NULL;
1359 1360
1360 return 0; 1361 return 0;
1361} 1362}
@@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1375 int err; 1376 int err;
1376 1377
1377 if (!data) 1378 if (!data)
1378 return -EINVAL; 1379 return 0;
1379 1380
1380 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1381 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1381 if (err) 1382 if (err)
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 8d672ef162ce..9b1ff37de46a 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> /* current */ 20#include <linux/sched.h> /* current */
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/gfp.h>
23 24
24#include <asm/msr.h> 25#include <asm/msr.h>
25#include <asm/processor.h> 26#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2ce8e0b5cc54..561758e95180 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
25#include <linux/pci.h> 25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h> 26#include <linux/sched.h>
28 27
29#include "speedstep-lib.h" 28#include "speedstep-lib.h"
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index ad0083abfa23..a94ec6be69fa 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -13,7 +13,6 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/cpufreq.h> 15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17 16
18#include <asm/msr.h> 17#include <asm/msr.h>
19#include <asm/tsc.h> 18#include <asm/tsc.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 04d73c114e49..8abd869baabf 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -17,7 +17,6 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h> 20#include <linux/delay.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <asm/ist.h> 22#include <asm/ist.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 879666f4d871..7e1cca13af35 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 sched_clock_stable = 1; 73 if (!check_tsc_unstable())
74 sched_clock_stable = 1;
74 } 75 }
75 76
76 /* 77 /*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c2b722d5a722..b3eeb66c0a51 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21#include <asm/smp.h>
21 22
22#define LVL_1_INST 1 23#define LVL_1_INST 1
23#define LVL_1_DATA 2 24#define LVL_1_DATA 2
@@ -152,7 +153,8 @@ struct _cpuid4_info {
152 union _cpuid4_leaf_ebx ebx; 153 union _cpuid4_leaf_ebx ebx;
153 union _cpuid4_leaf_ecx ecx; 154 union _cpuid4_leaf_ecx ecx;
154 unsigned long size; 155 unsigned long size;
155 unsigned long can_disable; 156 bool can_disable;
157 unsigned int l3_indices;
156 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
157}; 159};
158 160
@@ -162,7 +164,8 @@ struct _cpuid4_info_regs {
162 union _cpuid4_leaf_ebx ebx; 164 union _cpuid4_leaf_ebx ebx;
163 union _cpuid4_leaf_ecx ecx; 165 union _cpuid4_leaf_ecx ecx;
164 unsigned long size; 166 unsigned long size;
165 unsigned long can_disable; 167 bool can_disable;
168 unsigned int l3_indices;
166}; 169};
167 170
168unsigned short num_cache_leaves; 171unsigned short num_cache_leaves;
@@ -292,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
292 (ebx->split.ways_of_associativity + 1) - 1; 295 (ebx->split.ways_of_associativity + 1) - 1;
293} 296}
294 297
298struct _cache_attr {
299 struct attribute attr;
300 ssize_t (*show)(struct _cpuid4_info *, char *);
301 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
302};
303
304#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void)
306{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0;
316
317 pci_read_config_dword(dev, 0x1C4, &val);
318
319 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13));
324
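	/*
	 * Each scN counts how many portions of subcache N are enabled (a set
	 * bit in register 0x1C4 seemingly disables a portion); the largest
	 * count times 1024, minus one, is the highest valid disable index
	 * returned below.
	 */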
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326}
327
295static void __cpuinit 328static void __cpuinit
296amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
297{ 330{
@@ -301,12 +334,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
301 if (boot_cpu_data.x86 == 0x11) 334 if (boot_cpu_data.x86 == 0x11)
302 return; 335 return;
303 336
304 /* see erratum #382 */ 337 /* see errata #382 and #388 */
305 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) 338 if ((boot_cpu_data.x86 == 0x10) &&
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
306 return; 341 return;
307 342
308 this_leaf->can_disable = 1; 343 this_leaf->can_disable = true;
344 this_leaf->l3_indices = amd_calc_l3_indices();
345}
346
347static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
348 unsigned int index)
349{
350 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
351 int node = amd_get_nb_id(cpu);
352 struct pci_dev *dev = node_to_k8_nb_misc(node);
353 unsigned int reg = 0;
354
355 if (!this_leaf->can_disable)
356 return -EINVAL;
357
358 if (!dev)
359 return -EINVAL;
360
361 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
362 return sprintf(buf, "0x%08x\n", reg);
363}
364
365#define SHOW_CACHE_DISABLE(index) \
366static ssize_t \
367show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
368{ \
369 return show_cache_disable(this_leaf, buf, index); \
370}
371SHOW_CACHE_DISABLE(0)
372SHOW_CACHE_DISABLE(1)
373
374static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
375 const char *buf, size_t count, unsigned int index)
376{
377 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
378 int node = amd_get_nb_id(cpu);
379 struct pci_dev *dev = node_to_k8_nb_misc(node);
380 unsigned long val = 0;
381
382#define SUBCACHE_MASK (3UL << 20)
383#define SUBCACHE_INDEX 0xfff
384
385 if (!this_leaf->can_disable)
386 return -EINVAL;
387
388 if (!capable(CAP_SYS_ADMIN))
389 return -EPERM;
390
391 if (!dev)
392 return -EINVAL;
393
394 if (strict_strtoul(buf, 10, &val) < 0)
395 return -EINVAL;
396
397 /* do not allow writes outside of allowed bits */
398 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
399 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
400 return -EINVAL;
401
402 val |= BIT(30);
403 pci_write_config_dword(dev, 0x1BC + index * 4, val);
404 /*
 405	 * We need to WBINVD on a core on the node containing the L3 cache whose
 406	 * indices we are disabling; a simple wbinvd() is not sufficient.
407 */
408 wbinvd_on_cpu(cpu);
409 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
410 return count;
411}
412
413#define STORE_CACHE_DISABLE(index) \
414static ssize_t \
415store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
416 const char *buf, size_t count) \
417{ \
418 return store_cache_disable(this_leaf, buf, count, index); \
309} 419}
420STORE_CACHE_DISABLE(0)
421STORE_CACHE_DISABLE(1)
422
423static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
424 show_cache_disable_0, store_cache_disable_0);
425static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
426 show_cache_disable_1, store_cache_disable_1);
427
428#else /* CONFIG_CPU_SUP_AMD */
429static void __cpuinit
430amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
431{
 432}
433#endif /* CONFIG_CPU_SUP_AMD */
310 434
311static int 435static int
312__cpuinit cpuid4_cache_lookup_regs(int index, 436__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -713,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
713#define to_object(k) container_of(k, struct _index_kobject, kobj) 837#define to_object(k) container_of(k, struct _index_kobject, kobj)
714#define to_attr(a) container_of(a, struct _cache_attr, attr) 838#define to_attr(a) container_of(a, struct _cache_attr, attr)
715 839
716static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
717 unsigned int index)
718{
719 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
720 int node = cpu_to_node(cpu);
721 struct pci_dev *dev = node_to_k8_nb_misc(node);
722 unsigned int reg = 0;
723
724 if (!this_leaf->can_disable)
725 return -EINVAL;
726
727 if (!dev)
728 return -EINVAL;
729
730 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
731 return sprintf(buf, "%x\n", reg);
732}
733
734#define SHOW_CACHE_DISABLE(index) \
735static ssize_t \
736show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
737{ \
738 return show_cache_disable(this_leaf, buf, index); \
739}
740SHOW_CACHE_DISABLE(0)
741SHOW_CACHE_DISABLE(1)
742
743static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
744 const char *buf, size_t count, unsigned int index)
745{
746 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
747 int node = cpu_to_node(cpu);
748 struct pci_dev *dev = node_to_k8_nb_misc(node);
749 unsigned long val = 0;
750 unsigned int scrubber = 0;
751
752 if (!this_leaf->can_disable)
753 return -EINVAL;
754
755 if (!capable(CAP_SYS_ADMIN))
756 return -EPERM;
757
758 if (!dev)
759 return -EINVAL;
760
761 if (strict_strtoul(buf, 10, &val) < 0)
762 return -EINVAL;
763
764 val |= 0xc0000000;
765
766 pci_read_config_dword(dev, 0x58, &scrubber);
767 scrubber &= ~0x1f000000;
768 pci_write_config_dword(dev, 0x58, scrubber);
769
770 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
771 wbinvd();
772 pci_write_config_dword(dev, 0x1BC + index * 4, val);
773 return count;
774}
775
776#define STORE_CACHE_DISABLE(index) \
777static ssize_t \
778store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
779 const char *buf, size_t count) \
780{ \
781 return store_cache_disable(this_leaf, buf, count, index); \
782}
783STORE_CACHE_DISABLE(0)
784STORE_CACHE_DISABLE(1)
785
786struct _cache_attr {
787 struct attribute attr;
788 ssize_t (*show)(struct _cpuid4_info *, char *);
789 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
790};
791
792#define define_one_ro(_name) \ 840#define define_one_ro(_name) \
793static struct _cache_attr _name = \ 841static struct _cache_attr _name = \
794 __ATTR(_name, 0444, show_##_name, NULL) 842 __ATTR(_name, 0444, show_##_name, NULL)
@@ -803,23 +851,28 @@ define_one_ro(size);
803define_one_ro(shared_cpu_map); 851define_one_ro(shared_cpu_map);
804define_one_ro(shared_cpu_list); 852define_one_ro(shared_cpu_list);
805 853
806static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 854#define DEFAULT_SYSFS_CACHE_ATTRS \
807 show_cache_disable_0, store_cache_disable_0); 855 &type.attr, \
808static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 856 &level.attr, \
809 show_cache_disable_1, store_cache_disable_1); 857 &coherency_line_size.attr, \
858 &physical_line_partition.attr, \
859 &ways_of_associativity.attr, \
860 &number_of_sets.attr, \
861 &size.attr, \
862 &shared_cpu_map.attr, \
863 &shared_cpu_list.attr
810 864
811static struct attribute *default_attrs[] = { 865static struct attribute *default_attrs[] = {
812 &type.attr, 866 DEFAULT_SYSFS_CACHE_ATTRS,
813 &level.attr, 867 NULL
814 &coherency_line_size.attr, 868};
815 &physical_line_partition.attr, 869
816 &ways_of_associativity.attr, 870static struct attribute *default_l3_attrs[] = {
817 &number_of_sets.attr, 871 DEFAULT_SYSFS_CACHE_ATTRS,
818 &size.attr, 872#ifdef CONFIG_CPU_SUP_AMD
819 &shared_cpu_map.attr,
820 &shared_cpu_list.attr,
821 &cache_disable_0.attr, 873 &cache_disable_0.attr,
822 &cache_disable_1.attr, 874 &cache_disable_1.attr,
875#endif
823 NULL 876 NULL
824}; 877};
825 878
@@ -850,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
850 return ret; 903 return ret;
851} 904}
852 905
853static struct sysfs_ops sysfs_ops = { 906static const struct sysfs_ops sysfs_ops = {
854 .show = show, 907 .show = show,
855 .store = store, 908 .store = store,
856}; 909};
@@ -910,6 +963,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
910 unsigned int cpu = sys_dev->id; 963 unsigned int cpu = sys_dev->id;
911 unsigned long i, j; 964 unsigned long i, j;
912 struct _index_kobject *this_object; 965 struct _index_kobject *this_object;
966 struct _cpuid4_info *this_leaf;
913 int retval; 967 int retval;
914 968
915 retval = cpuid4_cache_sysfs_init(cpu); 969 retval = cpuid4_cache_sysfs_init(cpu);
@@ -928,6 +982,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
928 this_object = INDEX_KOBJECT_PTR(cpu, i); 982 this_object = INDEX_KOBJECT_PTR(cpu, i);
929 this_object->cpu = cpu; 983 this_object->cpu = cpu;
930 this_object->index = i; 984 this_object->index = i;
985
986 this_leaf = CPUID4_INFO_IDX(cpu, i);
987
988 if (this_leaf->can_disable)
989 ktype_cache.default_attrs = default_l3_attrs;
990 else
991 ktype_cache.default_attrs = default_attrs;
992
931 retval = kobject_init_and_add(&(this_object->kobj), 993 retval = kobject_init_and_add(&(this_object->kobj),
932 &ktype_cache, 994 &ktype_cache,
933 per_cpu(ici_cache_kobject, cpu), 995 per_cpu(ici_cache_kobject, cpu),
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 73734baa50f2..e7dbde7bfedb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -22,6 +22,7 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/gfp.h>
25#include <asm/mce.h> 26#include <asm/mce.h>
26#include <asm/apic.h> 27#include <asm/apic.h>
27 28
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a8aacd4b513c..7a355ddcc64b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/sysfs.h> 27#include <linux/sysfs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
29#include <linux/init.h> 30#include <linux/init.h>
30#include <linux/kmod.h> 31#include <linux/kmod.h>
31#include <linux/poll.h> 32#include <linux/poll.h>
@@ -46,6 +47,13 @@
46 47
47#include "mce-internal.h" 48#include "mce-internal.h"
48 49
50static DEFINE_MUTEX(mce_read_mutex);
51
52#define rcu_dereference_check_mce(p) \
53 rcu_dereference_check((p), \
54 rcu_read_lock_sched_held() || \
55 lockdep_is_held(&mce_read_mutex))
56
49#define CREATE_TRACE_POINTS 57#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h> 58#include <trace/events/mce.h>
51 59
@@ -158,7 +166,7 @@ void mce_log(struct mce *mce)
158 mce->finished = 0; 166 mce->finished = 0;
159 wmb(); 167 wmb();
160 for (;;) { 168 for (;;) {
161 entry = rcu_dereference(mcelog.next); 169 entry = rcu_dereference_check_mce(mcelog.next);
162 for (;;) { 170 for (;;) {
163 /* 171 /*
164 * When the buffer fills up discard new entries. 172 * When the buffer fills up discard new entries.
@@ -531,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
531 struct mce m; 539 struct mce m;
532 int i; 540 int i;
533 541
534 __get_cpu_var(mce_poll_count)++; 542 percpu_inc(mce_poll_count);
535 543
536 mce_setup(&m); 544 mce_setup(&m);
537 545
@@ -926,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
926 934
927 atomic_inc(&mce_entry); 935 atomic_inc(&mce_entry);
928 936
929 __get_cpu_var(mce_exception_count)++; 937 percpu_inc(mce_exception_count);
930 938
931 if (notify_die(DIE_NMI, "machine check", regs, error_code, 939 if (notify_die(DIE_NMI, "machine check", regs, error_code,
932 18, SIGKILL) == NOTIFY_STOP) 940 18, SIGKILL) == NOTIFY_STOP)
@@ -1485,8 +1493,6 @@ static void collect_tscs(void *data)
1485 rdtscll(cpu_tsc[smp_processor_id()]); 1493 rdtscll(cpu_tsc[smp_processor_id()]);
1486} 1494}
1487 1495
1488static DEFINE_MUTEX(mce_read_mutex);
1489
1490static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1491 loff_t *off) 1497 loff_t *off)
1492{ 1498{
@@ -1500,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1500 return -ENOMEM; 1506 return -ENOMEM;
1501 1507
1502 mutex_lock(&mce_read_mutex); 1508 mutex_lock(&mce_read_mutex);
1503 next = rcu_dereference(mcelog.next); 1509 next = rcu_dereference_check_mce(mcelog.next);
1504 1510
1505 /* Only supports full reads right now */ 1511 /* Only supports full reads right now */
1506 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1565,7 +1571,7 @@ timeout:
1565static unsigned int mce_poll(struct file *file, poll_table *wait) 1571static unsigned int mce_poll(struct file *file, poll_table *wait)
1566{ 1572{
1567 poll_wait(file, &mce_wait, wait); 1573 poll_wait(file, &mce_wait, wait);
1568 if (rcu_dereference(mcelog.next)) 1574 if (rcu_dereference_check_mce(mcelog.next))
1569 return POLLIN | POLLRDNORM; 1575 return POLLIN | POLLRDNORM;
1570 return 0; 1576 return 0;
1571} 1577}
@@ -2044,6 +2050,7 @@ static __init void mce_init_banks(void)
2044 struct mce_bank *b = &mce_banks[i]; 2050 struct mce_bank *b = &mce_banks[i];
2045 struct sysdev_attribute *a = &b->attr; 2051 struct sysdev_attribute *a = &b->attr;
2046 2052
2053 sysfs_attr_init(&a->attr);
2047 a->attr.name = b->attrname; 2054 a->attr.name = b->attrname;
2048 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2055 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2049 2056
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efca..224392d8fe8c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -21,6 +21,7 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/smp.h> 27#include <linux/smp.h>
@@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
388 return ret; 389 return ret;
389} 390}
390 391
391static struct sysfs_ops threshold_ops = { 392static const struct sysfs_ops threshold_ops = {
392 .show = show, 393 .show = show,
393 .store = store, 394 .store = store,
394}; 395};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 7c785634af2b..62b48e40920a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -5,6 +5,7 @@
5 * Author: Andi Kleen 5 * Author: Andi Kleen
6 */ 6 */
7 7
8#include <linux/gfp.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/percpu.h> 11#include <linux/percpu.h>
@@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot)
95 96
96 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
97 if (val & CMCI_EN) { 98 if (val & CMCI_EN) {
98 if (test_and_clear_bit(i, owned) || boot) 99 if (test_and_clear_bit(i, owned) && !boot)
99 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
100 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
101 continue; 102 continue;
@@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot)
107 108
108 /* Did the enable bit stick? -- the bank supports CMCI */ 109 /* Did the enable bit stick? -- the bank supports CMCI */
109 if (val & CMCI_EN) { 110 if (val & CMCI_EN) {
110 if (!test_and_set_bit(i, owned) || boot) 111 if (!test_and_set_bit(i, owned) && !boot)
111 print_update("CMCI", &hdr, i); 112 print_update("CMCI", &hdr, i);
112 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 113 __clear_bit(i, __get_cpu_var(mce_poll_banks));
113 } else { 114 } else {
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o cleanup.o 1obj-y := main.o if.o generic.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
108 return 0; 108 return 0;
109} 109}
110 110
111static struct mtrr_ops amd_mtrr_ops = { 111static const struct mtrr_ops amd_mtrr_ops = {
112 .vendor = X86_VENDOR_AMD, 112 .vendor = X86_VENDOR_AMD,
113 .set = amd_set_mtrr, 113 .set = amd_set_mtrr,
114 .get = amd_get_mtrr, 114 .get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
110 return 0; 110 return 0;
111} 111}
112 112
113static struct mtrr_ops centaur_mtrr_ops = { 113static const struct mtrr_ops centaur_mtrr_ops = {
114 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
115 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
116 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e0466..06130b52f012 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
153 printk(KERN_ERR "run of slot in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
 #define BIOS_BUG_MSG KERN_WARNING \
 	"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
 
 static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
 		       unsigned long extra_remove_base,
 		       unsigned long extra_remove_size)
 {
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 			continue;
 		base = range_state[i].base_pfn;
 		size = range_state[i].size_pfn;
-		nr_range = add_range_with_merge(range, nr_range, base,
-						base + size - 1);
+		nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+						base, base + size);
 	}
 	if (debug_print) {
 		printk(KERN_DEBUG "After WB checking\n");
 		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				range[i].start, range[i].end + 1);
+			printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+				range[i].start, range[i].end);
 	}
 
 	/* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 			size -= (1<<(20-PAGE_SHIFT)) - base;
 			base = 1<<(20-PAGE_SHIFT);
 		}
-		subtract_range(range, base, base + size - 1);
+		subtract_range(range, RANGE_NUM, base, base + size);
 	}
 	if (extra_remove_size)
-		subtract_range(range, extra_remove_base,
-			       extra_remove_base + extra_remove_size - 1);
+		subtract_range(range, RANGE_NUM, extra_remove_base,
+			       extra_remove_base + extra_remove_size);
 
 	if (debug_print) {
 		printk(KERN_DEBUG "After UC checking\n");
 		for (i = 0; i < RANGE_NUM; i++) {
 			if (!range[i].end)
 				continue;
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				range[i].start, range[i].end + 1);
+			printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+				range[i].start, range[i].end);
 		}
 	}
 
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 	if (debug_print) {
 		printk(KERN_DEBUG "After sorting\n");
 		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				range[i].start, range[i].end + 1);
+			printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+				range[i].start, range[i].end);
 	}
 
-	/* clear those is not used */
-	for (i = nr_range; i < RANGE_NUM; i++)
-		memset(&range[i], 0, sizeof(range[i]));
-
 	return nr_range;
 }
 
 #ifdef CONFIG_MTRR_SANITIZER
 
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
 {
 	unsigned long sum = 0;
 	int i;
 
 	for (i = 0; i < nr_range; i++)
-		sum += range[i].end + 1 - range[i].start;
+		sum += range[i].end - range[i].start;
 
 	return sum;
 }
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
 early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
 
 static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+x86_setup_var_mtrrs(struct range *range, int nr_range,
 		    u64 chunk_size, u64 gran_size)
 {
 	struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
 	/* Write the range: */
 	for (i = 0; i < nr_range; i++) {
 		set_var_mtrr_range(&var_state, range[i].start,
-				   range[i].end - range[i].start + 1);
+				   range[i].end - range[i].start);
 	}
 
 	/* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
 		     unsigned long x_remove_base,
 		     unsigned long x_remove_size, int i)
 {
-	static struct res_range range_new[RANGE_NUM];
+	static struct range range_new[RANGE_NUM];
 	unsigned long range_sums_new;
 	static int nr_range_new;
 	int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
 	 * [0, 1M) should always be covered by var mtrr with WB
 	 * and fixed mtrrs should take effect before var mtrr for it:
 	 */
-	nr_range = add_range_with_merge(range, nr_range, 0,
-					(1ULL<<(20 - PAGE_SHIFT)) - 1);
+	nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
+					1ULL<<(20 - PAGE_SHIFT));
 	/* Sort the ranges: */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	sort_range(range, nr_range);
 
 	range_sums = sum_ranges(range, nr_range);
 	printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	nr_range = 0;
 	if (mtrr_tom2) {
 		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
-		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
-		if (highest_pfn < range[nr_range].end + 1)
-			highest_pfn = range[nr_range].end + 1;
+		range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+		if (highest_pfn < range[nr_range].end)
+			highest_pfn = range[nr_range].end;
 		nr_range++;
 	}
 	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 
 	/* Check the holes: */
 	for (i = 0; i < nr_range - 1; i++) {
-		if (range[i].end + 1 < range[i+1].start)
-			total_trim_size += real_trim_memory(range[i].end + 1,
+		if (range[i].end < range[i+1].start)
+			total_trim_size += real_trim_memory(range[i].end,
 							    range[i+1].start);
 	}
 
 	/* Check the top: */
 	i = nr_range - 1;
-	if (range[i].end + 1 < end_pfn)
-		total_trim_size += real_trim_memory(range[i].end + 1,
+	if (range[i].end < end_pfn)
+		total_trim_size += real_trim_memory(range[i].end,
 						    end_pfn);
 
 	if (total_trim_size) {
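
The cleanup.c changes above replace the file-local res_range (with its inclusive end and hand-rolled add/subtract/sort helpers) by the shared struct range helpers from linux/range.h, and in this code the end field is treated as exclusive, which is why the scattered "+ 1"/"- 1" adjustments disappear. A minimal stand-alone sketch of that half-open convention follows; it is illustrative only (range_sketch, range_size and ranges_adjacent are made-up names, not the kernel API):

#include <stdio.h>

/* Half-open [start, end) range, as the rewritten cleanup.c arithmetic assumes. */
struct range_sketch {
	unsigned long start;
	unsigned long end;	/* exclusive */
};

static unsigned long range_size(const struct range_sketch *r)
{
	return r->end - r->start;	/* matches the new sum_ranges() arithmetic */
}

static int ranges_adjacent(const struct range_sketch *a, const struct range_sketch *b)
{
	/* with an exclusive end, a hole exists only when a->end < b->start */
	return a->end == b->start;
}

int main(void)
{
	struct range_sketch a = { 0, 1UL << 8 }, b = { 1UL << 8, 1UL << 10 };

	printf("size(a)=%lu adjacent=%d\n", range_size(&a), ranges_adjacent(&a, &b));
	return 0;
}
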
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
 	post_set();
 }
 
-static struct mtrr_ops cyrix_mtrr_ops = {
+static const struct mtrr_ops cyrix_mtrr_ops = {
 	.vendor            = X86_VENDOR_CYRIX,
 	.set_all           = cyrix_set_all,
 	.set               = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..fd31a441c61c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,7 +6,6 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 
@@ -464,7 +463,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 		tmp |= ~((1<<(hi - 1)) - 1);
 
 		if (tmp != mask_lo) {
-			WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
+			printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
 			mask_lo = tmp;
 		}
 	}
@@ -570,7 +569,7 @@ static unsigned long set_mtrr_state(void)
 
 
 static unsigned long cr4;
-static DEFINE_SPINLOCK(set_atomicity_lock);
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
 
 /*
  * Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +589,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
  * changes to the way the kernel boots
  */
 
-	spin_lock(&set_atomicity_lock);
+	raw_spin_lock(&set_atomicity_lock);
 
 	/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
 	cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +626,7 @@ static void post_set(void) __releases(set_atomicity_lock)
 	/* Restore value of CR4 */
 	if (cpu_has_pge)
 		write_cr4(cr4);
-	spin_unlock(&set_atomicity_lock);
+	raw_spin_unlock(&set_atomicity_lock);
 }
 
 static void generic_set_all(void)
@@ -752,7 +751,7 @@ int positive_have_wrcomb(void)
 /*
  * Generic structure...
  */
-struct mtrr_ops generic_mtrr_ops = {
+const struct mtrr_ops generic_mtrr_ops = {
 	.use_intel_if      = 1,
 	.set_all           = generic_set_all,
 	.get               = generic_get_mtrr,
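
The generic.c hunks above also convert set_atomicity_lock to a raw spinlock, so the cache-disable plus MTRR-reprogramming critical section stays a genuinely spinning, non-sleeping lock even on configurations where ordinary spinlocks can become sleeping locks. The kernel-style fragment below only illustrates the shape of that change; it compiles in-tree rather than stand-alone, and every name except the locking primitives is made up:

#include <linux/spinlock.h>

/* Illustration only: example_* names are hypothetical, not the mtrr code. */
static DEFINE_RAW_SPINLOCK(example_atomicity_lock);

static void example_prepare_set(void)
{
	raw_spin_lock(&example_atomicity_lock);
	/* caches are disabled and MTRR MSRs rewritten while this is held */
}

static void example_post_set(void)
{
	/* CR0/CR4 state restored before releasing */
	raw_spin_unlock(&example_atomicity_lock);
}
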
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index e006e56f699c..79289632cb27 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 
 #define LINE_SIZE 80
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
 u64 size_or_mask, size_and_mask;
 static bool mtrr_aps_delayed_init;
 
-static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
 
-struct mtrr_ops *mtrr_if;
+const struct mtrr_ops *mtrr_if;
 
 static void set_mtrr(unsigned int reg, unsigned long base,
 		     unsigned long size, mtrr_type type);
 
-void set_mtrr_ops(struct mtrr_ops *ops)
+void set_mtrr_ops(const struct mtrr_ops *ops)
 {
 	if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
 		mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
 
 /**
  * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * @info: pointer to mtrr configuration data
  *
  * Returns nothing.
  */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
 extern int generic_validate_add_page(unsigned long base, unsigned long size,
 				     unsigned int type);
 
-extern struct mtrr_ops generic_mtrr_ops;
+extern const struct mtrr_ops generic_mtrr_ops;
 
 extern int positive_have_wrcomb(void);
 
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
 			 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
 void get_mtrr_state(void);
 
-extern void set_mtrr_ops(struct mtrr_ops *ops);
+extern void set_mtrr_ops(const struct mtrr_ops *ops);
 
 extern u64 size_or_mask, size_and_mask;
-extern struct mtrr_ops *mtrr_if;
+extern const struct mtrr_ops *mtrr_if;
 
 #define is_cpu(vnd)	(mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
 #define use_intel()	(mtrr_if && mtrr_if->use_intel_if == 1)
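
Taken together, the amd.c, centaur.c, cyrix.c, generic.c, main.c and mtrr.h hunks thread const through the per-vendor mtrr_ops tables, mtrr_if and set_mtrr_ops(), so the method tables can live in read-only data. A small self-contained sketch of that registration pattern follows; the demo_* names are invented for illustration and do not mirror the kernel symbols:

#include <stdio.h>

#define DEMO_VENDOR_NUM 3

struct demo_ops {
	unsigned int vendor;
	void (*set)(unsigned int reg);
};

static void demo_amd_set(unsigned int reg) { printf("set reg %u (amd-style)\n", reg); }

/* const: the table itself is never written after initialization */
static const struct demo_ops demo_amd_ops = {
	.vendor	= 1,
	.set	= demo_amd_set,
};

static const struct demo_ops *demo_ops_table[DEMO_VENDOR_NUM];

static void demo_set_ops(const struct demo_ops *ops)
{
	if (ops->vendor && ops->vendor < DEMO_VENDOR_NUM)
		demo_ops_table[ops->vendor] = ops;	/* mirrors set_mtrr_ops() */
}

int main(void)
{
	demo_set_ops(&demo_amd_ops);
	demo_ops_table[1]->set(0);
	return 0;
}
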
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
1#include <linux/init.h>
2#include <linux/io.h>
3#include <linux/mm.h>
4
5#include <asm/processor-cyrix.h>
6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
10#include "mtrr.h"
11
12/* Put the processor into a state where MTRRs can be safely set */
13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
14{
15 unsigned int cr0;
16
17 /* Disable interrupts locally */
18 local_irq_save(ctxt->flags);
19
20 if (use_intel() || is_cpu(CYRIX)) {
21
22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
23 if (cpu_has_pge) {
24 ctxt->cr4val = read_cr4();
25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
26 }
27
28 /*
29 * Disable and flush caches. Note that wbinvd flushes the TLBs
30 * as a side-effect
31 */
32 cr0 = read_cr0() | X86_CR0_CD;
33 wbinvd();
34 write_cr0(cr0);
35 wbinvd();
36
37 if (use_intel()) {
38 /* Save MTRR state */
39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
40 } else {
41 /*
42 * Cyrix ARRs -
43 * everything else were excluded at the top
44 */
45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
47 }
48}
49
50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
51{
52 if (use_intel()) {
53 /* Disable MTRRs, and set the default type to uncached */
54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
55 ctxt->deftype_hi);
56 } else {
57 if (is_cpu(CYRIX)) {
58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
62}
63
64/* Restore the processor after a set_mtrr_prepare */
65void set_mtrr_done(struct set_mtrr_context *ctxt)
66{
67 if (use_intel() || is_cpu(CYRIX)) {
68
69 /* Flush caches and TLBs */
70 wbinvd();
71
72 /* Restore MTRRdefType */
73 if (use_intel()) {
74 /* Intel (P6) standard MTRRs */
75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
76 ctxt->deftype_hi);
77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
84
85 /* Enable caches */
86 write_cr0(read_cr0() & 0xbfffffff);
87
88 /* Restore value of CR4 */
89 if (cpu_has_pge)
90 write_cr4(ctxt->cr4val);
91 }
92 /* Re-enable interrupts locally (if enabled previously) */
93 local_irq_restore(ctxt->flags);
94}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c223b7e895d9..db5bdc8addf8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
  * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
  * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
  *
  * For licencing details see kernel-base/COPYING
  */
@@ -20,12 +21,15 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/cpu.h>
+#include <linux/bitops.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
+#include <asm/compat.h>
 
 static u64 perf_event_mask __read_mostly;
 
@@ -68,26 +72,59 @@ struct debug_store {
68 u64 pebs_event_reset[MAX_PEBS_EVENTS]; 72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 73};
70 74
75struct event_constraint {
76 union {
77 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
78 u64 idxmsk64;
79 };
80 u64 code;
81 u64 cmask;
82 int weight;
83};
84
85struct amd_nb {
86 int nb_id; /* NorthBridge id */
87 int refcnt; /* reference count */
88 struct perf_event *owners[X86_PMC_IDX_MAX];
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90};
91
71struct cpu_hw_events { 92struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX]; 93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 95 unsigned long interrupts;
76 int enabled; 96 int enabled;
77 struct debug_store *ds; 97 struct debug_store *ds;
78};
79 98
80struct event_constraint { 99 int n_events;
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 100 int n_added;
82 int code; 101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
104 struct amd_nb *amd_nb;
83}; 105};
84 106
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } 107#define __EVENT_CONSTRAINT(c, n, m, w) {\
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } 108 { .idxmsk64 = (n) }, \
109 .code = (c), \
110 .cmask = (m), \
111 .weight = (w), \
112}
113
114#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
87 116
88#define for_each_event_constraint(e, c) \ 117#define INTEL_EVENT_CONSTRAINT(c, n) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++) 118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
90 119
120#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
122
123#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0)
125
126#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++)
91 128
92/* 129/*
93 * struct x86_pmu - generic x86 pmu 130 * struct x86_pmu - generic x86 pmu
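
The hunk above introduces struct event_constraint with a per-event counter bitmask plus a precomputed weight, where EVENT_CONSTRAINT() derives the weight as the population count of the mask via HWEIGHT(); the scheduler later uses that weight to place the most constrained events first. The tiny user-space sketch below only models what the weight means; constraint_sketch and CONSTRAINT_SKETCH are hypothetical names, not the kernel macros:

#include <stdio.h>
#include <stdint.h>

struct constraint_sketch {
	uint64_t idxmsk;	/* which counters may host the event */
	int	 weight;	/* popcount of idxmsk */
};

#define CONSTRAINT_SKETCH(mask) \
	{ .idxmsk = (mask), .weight = __builtin_popcountll(mask) }

int main(void)
{
	/* e.g. an event restricted to counters 0 and 1 */
	struct constraint_sketch c = CONSTRAINT_SKETCH(0x3ULL);

	printf("weight=%d\n", c.weight);	/* prints 2 */
	return 0;
}
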
@@ -98,8 +135,8 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *);
 	void		(*disable_all)(void);
 	void		(*enable_all)(void);
-	void		(*enable)(struct hw_perf_event *, int);
-	void		(*disable)(struct hw_perf_event *, int);
+	void		(*enable)(struct perf_event *);
+	void		(*disable)(struct perf_event *);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
@@ -114,121 +151,28 @@ struct x86_pmu {
114 u64 intel_ctrl; 151 u64 intel_ctrl;
115 void (*enable_bts)(u64 config); 152 void (*enable_bts)(u64 config);
116 void (*disable_bts)(void); 153 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
119};
120 154
121static struct x86_pmu x86_pmu __read_mostly; 155 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event);
122 158
123static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 159 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
124 .enabled = 1, 160 struct perf_event *event);
125}; 161 struct event_constraint *event_constraints;
126 162
127static const struct event_constraint *event_constraints; 163 int (*cpu_prepare)(int cpu);
128 164 void (*cpu_starting)(int cpu);
129/* 165 void (*cpu_dying)(int cpu);
130 * Not sure about some of these 166 void (*cpu_dead)(int cpu);
131 */
132static const u64 p6_perfmon_event_map[] =
133{
134 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
135 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
136 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
137 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
138 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
139 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
140 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
141};
142
143static u64 p6_pmu_event_map(int hw_event)
144{
145 return p6_perfmon_event_map[hw_event];
146}
147
148/*
149 * Event setting that is specified not to count anything.
150 * We use this to effectively disable a counter.
151 *
152 * L2_RQSTS with 0 MESI unit mask.
153 */
154#define P6_NOP_EVENT 0x0000002EULL
155
156static u64 p6_pmu_raw_event(u64 hw_event)
157{
158#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
159#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
160#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
161#define P6_EVNTSEL_INV_MASK 0x00800000ULL
162#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
163
164#define P6_EVNTSEL_MASK \
165 (P6_EVNTSEL_EVENT_MASK | \
166 P6_EVNTSEL_UNIT_MASK | \
167 P6_EVNTSEL_EDGE_MASK | \
168 P6_EVNTSEL_INV_MASK | \
169 P6_EVNTSEL_REG_MASK)
170
171 return hw_event & P6_EVNTSEL_MASK;
172}
173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
184
185/*
186 * Intel PerfMon v3. Used on Core2 and later.
187 */
188static const u64 intel_perfmon_event_map[] =
189{
190 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
191 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
192 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
193 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
194 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
195 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
197}; 167};
198 168
199static const struct event_constraint intel_core_event_constraints[] = 169static struct x86_pmu x86_pmu __read_mostly;
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212 170
213static const struct event_constraint intel_nehalem_event_constraints[] = 171static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
214{ 172 .enabled = 1,
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226}; 173};
227 174
228static u64 intel_pmu_event_map(int hw_event) 175static int x86_perf_event_set_period(struct perf_event *event);
229{
230 return intel_perfmon_event_map[hw_event];
231}
232 176
233/* 177/*
234 * Generalized hw caching related hw_event table, filled 178 * Generalized hw caching related hw_event table, filled
@@ -245,435 +189,18 @@ static u64 __read_mostly hw_cache_event_ids
245 [PERF_COUNT_HW_CACHE_OP_MAX] 189 [PERF_COUNT_HW_CACHE_OP_MAX]
246 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 190 [PERF_COUNT_HW_CACHE_RESULT_MAX];
247 191
248static __initconst u64 nehalem_hw_cache_event_ids
249 [PERF_COUNT_HW_CACHE_MAX]
250 [PERF_COUNT_HW_CACHE_OP_MAX]
251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
252{
253 [ C(L1D) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
256 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
260 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
264 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
265 },
266 },
267 [ C(L1I ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
270 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = 0x0,
278 [ C(RESULT_MISS) ] = 0x0,
279 },
280 },
281 [ C(LL ) ] = {
282 [ C(OP_READ) ] = {
283 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
284 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
285 },
286 [ C(OP_WRITE) ] = {
287 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
288 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
289 },
290 [ C(OP_PREFETCH) ] = {
291 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
292 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
293 },
294 },
295 [ C(DTLB) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
298 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
299 },
300 [ C(OP_WRITE) ] = {
301 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
302 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
303 },
304 [ C(OP_PREFETCH) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0,
306 [ C(RESULT_MISS) ] = 0x0,
307 },
308 },
309 [ C(ITLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
312 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
313 },
314 [ C(OP_WRITE) ] = {
315 [ C(RESULT_ACCESS) ] = -1,
316 [ C(RESULT_MISS) ] = -1,
317 },
318 [ C(OP_PREFETCH) ] = {
319 [ C(RESULT_ACCESS) ] = -1,
320 [ C(RESULT_MISS) ] = -1,
321 },
322 },
323 [ C(BPU ) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
326 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static __initconst u64 core2_hw_cache_event_ids
340 [PERF_COUNT_HW_CACHE_MAX]
341 [PERF_COUNT_HW_CACHE_OP_MAX]
342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
343{
344 [ C(L1D) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
347 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
351 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
355 [ C(RESULT_MISS) ] = 0,
356 },
357 },
358 [ C(L1I ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
361 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = 0,
369 [ C(RESULT_MISS) ] = 0,
370 },
371 },
372 [ C(LL ) ] = {
373 [ C(OP_READ) ] = {
374 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
375 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
376 },
377 [ C(OP_WRITE) ] = {
378 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
379 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
380 },
381 [ C(OP_PREFETCH) ] = {
382 [ C(RESULT_ACCESS) ] = 0,
383 [ C(RESULT_MISS) ] = 0,
384 },
385 },
386 [ C(DTLB) ] = {
387 [ C(OP_READ) ] = {
388 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
389 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
390 },
391 [ C(OP_WRITE) ] = {
392 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
393 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
394 },
395 [ C(OP_PREFETCH) ] = {
396 [ C(RESULT_ACCESS) ] = 0,
397 [ C(RESULT_MISS) ] = 0,
398 },
399 },
400 [ C(ITLB) ] = {
401 [ C(OP_READ) ] = {
402 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
403 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
404 },
405 [ C(OP_WRITE) ] = {
406 [ C(RESULT_ACCESS) ] = -1,
407 [ C(RESULT_MISS) ] = -1,
408 },
409 [ C(OP_PREFETCH) ] = {
410 [ C(RESULT_ACCESS) ] = -1,
411 [ C(RESULT_MISS) ] = -1,
412 },
413 },
414 [ C(BPU ) ] = {
415 [ C(OP_READ) ] = {
416 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
417 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
418 },
419 [ C(OP_WRITE) ] = {
420 [ C(RESULT_ACCESS) ] = -1,
421 [ C(RESULT_MISS) ] = -1,
422 },
423 [ C(OP_PREFETCH) ] = {
424 [ C(RESULT_ACCESS) ] = -1,
425 [ C(RESULT_MISS) ] = -1,
426 },
427 },
428};
429
430static __initconst u64 atom_hw_cache_event_ids
431 [PERF_COUNT_HW_CACHE_MAX]
432 [PERF_COUNT_HW_CACHE_OP_MAX]
433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
434{
435 [ C(L1D) ] = {
436 [ C(OP_READ) ] = {
437 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
438 [ C(RESULT_MISS) ] = 0,
439 },
440 [ C(OP_WRITE) ] = {
441 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_PREFETCH) ] = {
445 [ C(RESULT_ACCESS) ] = 0x0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 },
449 [ C(L1I ) ] = {
450 [ C(OP_READ) ] = {
451 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
452 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
453 },
454 [ C(OP_WRITE) ] = {
455 [ C(RESULT_ACCESS) ] = -1,
456 [ C(RESULT_MISS) ] = -1,
457 },
458 [ C(OP_PREFETCH) ] = {
459 [ C(RESULT_ACCESS) ] = 0,
460 [ C(RESULT_MISS) ] = 0,
461 },
462 },
463 [ C(LL ) ] = {
464 [ C(OP_READ) ] = {
465 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
466 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
467 },
468 [ C(OP_WRITE) ] = {
469 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
470 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
471 },
472 [ C(OP_PREFETCH) ] = {
473 [ C(RESULT_ACCESS) ] = 0,
474 [ C(RESULT_MISS) ] = 0,
475 },
476 },
477 [ C(DTLB) ] = {
478 [ C(OP_READ) ] = {
479 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
480 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
481 },
482 [ C(OP_WRITE) ] = {
483 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
484 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
485 },
486 [ C(OP_PREFETCH) ] = {
487 [ C(RESULT_ACCESS) ] = 0,
488 [ C(RESULT_MISS) ] = 0,
489 },
490 },
491 [ C(ITLB) ] = {
492 [ C(OP_READ) ] = {
493 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
494 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
495 },
496 [ C(OP_WRITE) ] = {
497 [ C(RESULT_ACCESS) ] = -1,
498 [ C(RESULT_MISS) ] = -1,
499 },
500 [ C(OP_PREFETCH) ] = {
501 [ C(RESULT_ACCESS) ] = -1,
502 [ C(RESULT_MISS) ] = -1,
503 },
504 },
505 [ C(BPU ) ] = {
506 [ C(OP_READ) ] = {
507 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
508 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
509 },
510 [ C(OP_WRITE) ] = {
511 [ C(RESULT_ACCESS) ] = -1,
512 [ C(RESULT_MISS) ] = -1,
513 },
514 [ C(OP_PREFETCH) ] = {
515 [ C(RESULT_ACCESS) ] = -1,
516 [ C(RESULT_MISS) ] = -1,
517 },
518 },
519};
520
521static u64 intel_pmu_raw_event(u64 hw_event)
522{
523#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
528
529#define CORE_EVNTSEL_MASK \
530 (CORE_EVNTSEL_EVENT_MASK | \
531 CORE_EVNTSEL_UNIT_MASK | \
532 CORE_EVNTSEL_EDGE_MASK | \
533 CORE_EVNTSEL_INV_MASK | \
534 CORE_EVNTSEL_REG_MASK)
535
536 return hw_event & CORE_EVNTSEL_MASK;
537}
538
539static __initconst u64 amd_hw_cache_event_ids
540 [PERF_COUNT_HW_CACHE_MAX]
541 [PERF_COUNT_HW_CACHE_OP_MAX]
542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
543{
544 [ C(L1D) ] = {
545 [ C(OP_READ) ] = {
546 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
547 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
548 },
549 [ C(OP_WRITE) ] = {
550 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
551 [ C(RESULT_MISS) ] = 0,
552 },
553 [ C(OP_PREFETCH) ] = {
554 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
555 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
556 },
557 },
558 [ C(L1I ) ] = {
559 [ C(OP_READ) ] = {
560 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
561 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
562 },
563 [ C(OP_WRITE) ] = {
564 [ C(RESULT_ACCESS) ] = -1,
565 [ C(RESULT_MISS) ] = -1,
566 },
567 [ C(OP_PREFETCH) ] = {
568 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
569 [ C(RESULT_MISS) ] = 0,
570 },
571 },
572 [ C(LL ) ] = {
573 [ C(OP_READ) ] = {
574 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
575 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
576 },
577 [ C(OP_WRITE) ] = {
578 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
579 [ C(RESULT_MISS) ] = 0,
580 },
581 [ C(OP_PREFETCH) ] = {
582 [ C(RESULT_ACCESS) ] = 0,
583 [ C(RESULT_MISS) ] = 0,
584 },
585 },
586 [ C(DTLB) ] = {
587 [ C(OP_READ) ] = {
588 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
589 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
590 },
591 [ C(OP_WRITE) ] = {
592 [ C(RESULT_ACCESS) ] = 0,
593 [ C(RESULT_MISS) ] = 0,
594 },
595 [ C(OP_PREFETCH) ] = {
596 [ C(RESULT_ACCESS) ] = 0,
597 [ C(RESULT_MISS) ] = 0,
598 },
599 },
600 [ C(ITLB) ] = {
601 [ C(OP_READ) ] = {
602 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
603 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
604 },
605 [ C(OP_WRITE) ] = {
606 [ C(RESULT_ACCESS) ] = -1,
607 [ C(RESULT_MISS) ] = -1,
608 },
609 [ C(OP_PREFETCH) ] = {
610 [ C(RESULT_ACCESS) ] = -1,
611 [ C(RESULT_MISS) ] = -1,
612 },
613 },
614 [ C(BPU ) ] = {
615 [ C(OP_READ) ] = {
616 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
617 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
618 },
619 [ C(OP_WRITE) ] = {
620 [ C(RESULT_ACCESS) ] = -1,
621 [ C(RESULT_MISS) ] = -1,
622 },
623 [ C(OP_PREFETCH) ] = {
624 [ C(RESULT_ACCESS) ] = -1,
625 [ C(RESULT_MISS) ] = -1,
626 },
627 },
628};
629
630/*
631 * AMD Performance Monitor K7 and later.
632 */
633static const u64 amd_perfmon_event_map[] =
634{
635 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
636 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
637 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
638 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
639 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
640 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
641};
642
643static u64 amd_pmu_event_map(int hw_event)
644{
645 return amd_perfmon_event_map[hw_event];
646}
647
648static u64 amd_pmu_raw_event(u64 hw_event)
649{
650#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
651#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
652#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
653#define K7_EVNTSEL_INV_MASK 0x000800000ULL
654#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
655
656#define K7_EVNTSEL_MASK \
657 (K7_EVNTSEL_EVENT_MASK | \
658 K7_EVNTSEL_UNIT_MASK | \
659 K7_EVNTSEL_EDGE_MASK | \
660 K7_EVNTSEL_INV_MASK | \
661 K7_EVNTSEL_REG_MASK)
662
663 return hw_event & K7_EVNTSEL_MASK;
664}
665
 /*
  * Propagate event elapsed time into the generic event.
  * Can only be executed on the CPU where the event is active.
  * Returns the delta events processed.
  */
 static u64
-x86_perf_event_update(struct perf_event *event,
-		      struct hw_perf_event *hwc, int idx)
+x86_perf_event_update(struct perf_event *event)
 {
+	struct hw_perf_event *hwc = &event->hw;
 	int shift = 64 - x86_pmu.event_bits;
 	u64 prev_raw_count, new_raw_count;
+	int idx = hwc->idx;
 	s64 delta;
 
 	if (idx == X86_PMC_IDX_FIXED_BTS)
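
x86_perf_event_update() now takes only the event and derives the hardware index from it. The counter arithmetic in the unchanged remainder of the function (not shown in this hunk) conventionally shifts both raw readings up by 64 - event_bits and back down so a partial-width counter difference wraps and sign-extends correctly; the stand-alone sketch below demonstrates that trick under that assumption and is not the kernel function:

#include <stdio.h>
#include <stdint.h>

static int64_t counter_delta(uint64_t prev, uint64_t now, int width)
{
	int shift = 64 - width;
	int64_t delta = (now << shift) - (prev << shift);

	return delta >> shift;	/* shift back restores the counter's scale */
}

int main(void)
{
	/* a 40-bit counter that wrapped from near-max back past zero */
	uint64_t prev = (1ULL << 40) - 5, now = 7;

	printf("delta=%lld\n", (long long)counter_delta(prev, now, 40));	/* 12 */
	return 0;
}
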
@@ -773,7 +300,7 @@ static inline bool bts_available(void)
 	return x86_pmu.enable_bts != NULL;
 }
 
-static inline void init_debug_store_on_cpu(int cpu)
+static void init_debug_store_on_cpu(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 
@@ -785,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu)
 			     (u32)((u64)(unsigned long)ds >> 32));
 }
 
-static inline void fini_debug_store_on_cpu(int cpu)
+static void fini_debug_store_on_cpu(int cpu)
 {
 	if (!per_cpu(cpu_hw_events, cpu).ds)
 		return;
@@ -914,42 +441,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
914 return 0; 441 return 0;
915} 442}
916 443
917static void intel_pmu_enable_bts(u64 config)
918{
919 unsigned long debugctlmsr;
920
921 debugctlmsr = get_debugctlmsr();
922
923 debugctlmsr |= X86_DEBUGCTL_TR;
924 debugctlmsr |= X86_DEBUGCTL_BTS;
925 debugctlmsr |= X86_DEBUGCTL_BTINT;
926
927 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
928 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
929
930 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
931 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
932
933 update_debugctlmsr(debugctlmsr);
934}
935
936static void intel_pmu_disable_bts(void)
937{
938 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
939 unsigned long debugctlmsr;
940
941 if (!cpuc->ds)
942 return;
943
944 debugctlmsr = get_debugctlmsr();
945
946 debugctlmsr &=
947 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
948 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
949
950 update_debugctlmsr(debugctlmsr);
951}
952
953/* 444/*
954 * Setup the hardware configuration for a given attr_type 445 * Setup the hardware configuration for a given attr_type
955 */ 446 */
@@ -988,6 +479,8 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
 
 	hwc->idx = -1;
+	hwc->last_cpu = -1;
+	hwc->last_tag = ~0ULL;
 
 	/*
 	 * Count user and OS events unless requested not to.
@@ -1017,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event)
 	 */
 	if (attr->type == PERF_TYPE_RAW) {
 		hwc->config |= x86_pmu.raw_event(attr->config);
+		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
+		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
 		return 0;
 	}
 
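
The hunk above adds a guard on raw event configs: if the "count on any HT thread" bit is set, the request is refused for unprivileged callers under the CPU-paranoia setting. The user-space model below only mimics the shape of that check; the bit position and helper names here are assumptions for illustration, not the kernel definitions:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <errno.h>

#define EVENTSEL_ANY_BIT	(1ULL << 21)	/* assumed AnyThread bit in the event select */

static int validate_raw_config(uint64_t config, bool paranoid, bool is_admin)
{
	/* models the perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN) test */
	if ((config & EVENTSEL_ANY_BIT) && paranoid && !is_admin)
		return -EACCES;
	return 0;
}

int main(void)
{
	printf("%d\n", validate_raw_config(EVENTSEL_ANY_BIT, true, false));	/* -13 */
	return 0;
}
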
@@ -1056,216 +552,314 @@ static int __hw_perf_event_init(struct perf_event *event)
1056 return 0; 552 return 0;
1057} 553}
1058 554
1059static void p6_pmu_disable_all(void) 555static void x86_pmu_disable_all(void)
1060{ 556{
1061 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1062 u64 val; 558 int idx;
1063
1064 if (!cpuc->enabled)
1065 return;
1066 559
1067 cpuc->enabled = 0; 560 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1068 barrier(); 561 u64 val;
1069 562
1070 /* p6 only has one enable register */ 563 if (!test_bit(idx, cpuc->active_mask))
1071 rdmsrl(MSR_P6_EVNTSEL0, val); 564 continue;
1072 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 565 rdmsrl(x86_pmu.eventsel + idx, val);
1073 wrmsrl(MSR_P6_EVNTSEL0, val); 566 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
567 continue;
568 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
569 wrmsrl(x86_pmu.eventsel + idx, val);
570 }
1074} 571}
1075 572
1076static void intel_pmu_disable_all(void) 573void hw_perf_disable(void)
1077{ 574{
1078 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 575 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1079 576
577 if (!x86_pmu_initialized())
578 return;
579
1080 if (!cpuc->enabled) 580 if (!cpuc->enabled)
1081 return; 581 return;
1082 582
583 cpuc->n_added = 0;
1083 cpuc->enabled = 0; 584 cpuc->enabled = 0;
1084 barrier(); 585 barrier();
1085 586
1086 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 587 x86_pmu.disable_all();
1087
1088 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1089 intel_pmu_disable_bts();
1090} 588}
1091 589
1092static void amd_pmu_disable_all(void) 590static void x86_pmu_enable_all(void)
1093{ 591{
1094 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1095 int idx; 593 int idx;
1096 594
1097 if (!cpuc->enabled)
1098 return;
1099
1100 cpuc->enabled = 0;
1101 /*
1102 * ensure we write the disable before we start disabling the
1103 * events proper, so that amd_pmu_enable_event() does the
1104 * right thing.
1105 */
1106 barrier();
1107
1108 for (idx = 0; idx < x86_pmu.num_events; idx++) { 595 for (idx = 0; idx < x86_pmu.num_events; idx++) {
596 struct perf_event *event = cpuc->events[idx];
1109 u64 val; 597 u64 val;
1110 598
1111 if (!test_bit(idx, cpuc->active_mask)) 599 if (!test_bit(idx, cpuc->active_mask))
1112 continue; 600 continue;
1113 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 601
1114 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 602 val = event->hw.config;
1115 continue; 603 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
1116 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 604 wrmsrl(x86_pmu.eventsel + idx, val);
1117 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1118 } 605 }
1119} 606}
1120 607
1121void hw_perf_disable(void) 608static const struct pmu pmu;
609
610static inline int is_x86_event(struct perf_event *event)
1122{ 611{
1123 if (!x86_pmu_initialized()) 612 return event->pmu == &pmu;
1124 return;
1125 return x86_pmu.disable_all();
1126} 613}
1127 614
1128static void p6_pmu_enable_all(void) 615static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1129{ 616{
1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 617 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
1131 unsigned long val; 618 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
619 int i, j, w, wmax, num = 0;
620 struct hw_perf_event *hwc;
1132 621
1133 if (cpuc->enabled) 622 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1134 return;
1135 623
1136 cpuc->enabled = 1; 624 for (i = 0; i < n; i++) {
1137 barrier(); 625 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
626 constraints[i] = c;
627 }
1138 628
1139 /* p6 only has one enable register */ 629 /*
1140 rdmsrl(MSR_P6_EVNTSEL0, val); 630 * fastpath, try to reuse previous register
1141 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 631 */
1142 wrmsrl(MSR_P6_EVNTSEL0, val); 632 for (i = 0; i < n; i++) {
1143} 633 hwc = &cpuc->event_list[i]->hw;
634 c = constraints[i];
1144 635
1145static void intel_pmu_enable_all(void) 636 /* never assigned */
1146{ 637 if (hwc->idx == -1)
1147 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 638 break;
1148 639
1149 if (cpuc->enabled) 640 /* constraint still honored */
1150 return; 641 if (!test_bit(hwc->idx, c->idxmsk))
642 break;
1151 643
1152 cpuc->enabled = 1; 644 /* not already used */
1153 barrier(); 645 if (test_bit(hwc->idx, used_mask))
646 break;
1154 647
1155 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 648 __set_bit(hwc->idx, used_mask);
649 if (assign)
650 assign[i] = hwc->idx;
651 }
652 if (i == n)
653 goto done;
1156 654
1157 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 655 /*
1158 struct perf_event *event = 656 * begin slow path
1159 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 657 */
1160 658
1161 if (WARN_ON_ONCE(!event)) 659 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1162 return;
1163 660
1164 intel_pmu_enable_bts(event->hw.config); 661 /*
1165 } 662 * weight = number of possible counters
1166} 663 *
664 * 1 = most constrained, only works on one counter
665 * wmax = least constrained, works on any counter
666 *
667 * assign events to counters starting with most
668 * constrained events.
669 */
670 wmax = x86_pmu.num_events;
1167 671
1168static void amd_pmu_enable_all(void) 672 /*
1169{ 673 * when fixed event counters are present,
1170 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 674 * wmax is incremented by 1 to account
1171 int idx; 675 * for one more choice
676 */
677 if (x86_pmu.num_events_fixed)
678 wmax++;
1172 679
1173 if (cpuc->enabled) 680 for (w = 1, num = n; num && w <= wmax; w++) {
1174 return; 681 /* for each event */
682 for (i = 0; num && i < n; i++) {
683 c = constraints[i];
684 hwc = &cpuc->event_list[i]->hw;
1175 685
1176 cpuc->enabled = 1; 686 if (c->weight != w)
1177 barrier(); 687 continue;
1178 688
1179 for (idx = 0; idx < x86_pmu.num_events; idx++) { 689 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
1180 struct perf_event *event = cpuc->events[idx]; 690 if (!test_bit(j, used_mask))
1181 u64 val; 691 break;
692 }
1182 693
1183 if (!test_bit(idx, cpuc->active_mask)) 694 if (j == X86_PMC_IDX_MAX)
1184 continue; 695 break;
1185 696
1186 val = event->hw.config; 697 __set_bit(j, used_mask);
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 698
1188 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 699 if (assign)
700 assign[i] = j;
701 num--;
702 }
1189 } 703 }
704done:
705 /*
706 * scheduling failed or is just a simulation,
707 * free resources if necessary
708 */
709 if (!assign || num) {
710 for (i = 0; i < n; i++) {
711 if (x86_pmu.put_event_constraints)
712 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
713 }
714 }
715 return num ? -ENOSPC : 0;
1190} 716}
1191 717
1192void hw_perf_enable(void) 718/*
719 * dogrp: true if must collect siblings events (group)
720 * returns total number of events and error code
721 */
722static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1193{ 723{
1194 if (!x86_pmu_initialized()) 724 struct perf_event *event;
1195 return; 725 int n, max_count;
1196 x86_pmu.enable_all();
1197}
1198 726
1199static inline u64 intel_pmu_get_status(void) 727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
1200{
1201 u64 status;
1202 728
1203 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 729 /* current number of events already accepted */
730 n = cpuc->n_events;
1204 731
1205 return status; 732 if (is_x86_event(leader)) {
1206} 733 if (n >= max_count)
734 return -ENOSPC;
735 cpuc->event_list[n] = leader;
736 n++;
737 }
738 if (!dogrp)
739 return n;
1207 740
1208static inline void intel_pmu_ack_status(u64 ack) 741 list_for_each_entry(event, &leader->sibling_list, group_entry) {
1209{ 742 if (!is_x86_event(event) ||
1210 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 743 event->state <= PERF_EVENT_STATE_OFF)
1211} 744 continue;
1212 745
1213static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 746 if (n >= max_count)
1214{ 747 return -ENOSPC;
1215 (void)checking_wrmsrl(hwc->config_base + idx,
1216 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1217}
1218 748
1219static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 749 cpuc->event_list[n] = event;
1220{ 750 n++;
1221 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 751 }
752 return n;
1222} 753}
1223 754
1224static inline void 755static inline void x86_assign_hw_event(struct perf_event *event,
1225intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) 756 struct cpu_hw_events *cpuc, int i)
1226{ 757{
1227 int idx = __idx - X86_PMC_IDX_FIXED; 758 struct hw_perf_event *hwc = &event->hw;
1228 u64 ctrl_val, mask;
1229 759
1230 mask = 0xfULL << (idx * 4); 760 hwc->idx = cpuc->assign[i];
761 hwc->last_cpu = smp_processor_id();
762 hwc->last_tag = ++cpuc->tags[i];
1231 763
1232 rdmsrl(hwc->config_base, ctrl_val); 764 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
1233 ctrl_val &= ~mask; 765 hwc->config_base = 0;
1234 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 766 hwc->event_base = 0;
767 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
768 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
769 /*
770 * We set it so that event_base + idx in wrmsr/rdmsr maps to
771 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
772 */
773 hwc->event_base =
774 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
775 } else {
776 hwc->config_base = x86_pmu.eventsel;
777 hwc->event_base = x86_pmu.perfctr;
778 }
1235} 779}
1236 780
1237static inline void 781static inline int match_prev_assignment(struct hw_perf_event *hwc,
1238p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) 782 struct cpu_hw_events *cpuc,
783 int i)
1239{ 784{
1240 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 785 return hwc->idx == cpuc->assign[i] &&
1241 u64 val = P6_NOP_EVENT; 786 hwc->last_cpu == smp_processor_id() &&
1242 787 hwc->last_tag == cpuc->tags[i];
1243 if (cpuc->enabled)
1244 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1245
1246 (void)checking_wrmsrl(hwc->config_base + idx, val);
1247} 788}
1248 789
1249static inline void 790static int x86_pmu_start(struct perf_event *event);
1250intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) 791static void x86_pmu_stop(struct perf_event *event);
792
793void hw_perf_enable(void)
1251{ 794{
1252 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1253 intel_pmu_disable_bts(); 796 struct perf_event *event;
797 struct hw_perf_event *hwc;
798 int i;
799
800 if (!x86_pmu_initialized())
1254 return; 801 return;
1255 }
1256 802
1257 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 803 if (cpuc->enabled)
1258 intel_pmu_disable_fixed(hwc, idx);
1259 return; 804 return;
805
806 if (cpuc->n_added) {
807 int n_running = cpuc->n_events - cpuc->n_added;
808 /*
809 * apply assignment obtained either from
810 * hw_perf_group_sched_in() or x86_pmu_enable()
811 *
812 * step1: save events moving to new counters
813 * step2: reprogram moved events into new counters
814 */
815 for (i = 0; i < n_running; i++) {
816 event = cpuc->event_list[i];
817 hwc = &event->hw;
818
819 /*
820 * we can avoid reprogramming counter if:
821 * - assigned same counter as last time
822 * - running on same CPU as last time
823 * - no other event has used the counter since
824 */
825 if (hwc->idx == -1 ||
826 match_prev_assignment(hwc, cpuc, i))
827 continue;
828
829 x86_pmu_stop(event);
830 }
831
832 for (i = 0; i < cpuc->n_events; i++) {
833 event = cpuc->event_list[i];
834 hwc = &event->hw;
835
836 if (!match_prev_assignment(hwc, cpuc, i))
837 x86_assign_hw_event(event, cpuc, i);
838 else if (i < n_running)
839 continue;
840
841 x86_pmu_start(event);
842 }
843 cpuc->n_added = 0;
844 perf_events_lapic_init();
1260 } 845 }
1261 846
1262 x86_pmu_disable_event(hwc, idx); 847 cpuc->enabled = 1;
848 barrier();
849
850 x86_pmu.enable_all();
1263} 851}
1264 852
1265static inline void 853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
1266amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1267{ 854{
1268 x86_pmu_disable_event(hwc, idx); 855 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857}
858
859static inline void x86_pmu_disable_event(struct perf_event *event)
860{
861 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
1269} 863}
1270 864
1271static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
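
The replacement scheduler added in the hunk above (x86_schedule_events() together with collect_events()) first tries to keep each event on its previously assigned counter and then falls back to a greedy pass that places events in order of increasing constraint weight, i.e. the events that fit the fewest counters go first. The compact stand-alone sketch below captures only that weight-ordered greedy idea; names are hypothetical and the kernel version additionally handles fixed counters, tags and constraint release:

#include <stdio.h>
#include <stdint.h>

#define NCOUNTERS 4

struct ev { uint32_t allowed; int assigned; };	/* bitmask of usable counters */

static int schedule_events(struct ev *ev, int n)
{
	uint32_t used = 0;
	int w, i, c, left = n;

	for (i = 0; i < n; i++)
		ev[i].assigned = -1;

	for (w = 1; left && w <= NCOUNTERS; w++) {	/* most constrained first */
		for (i = 0; left && i < n; i++) {
			if (__builtin_popcount(ev[i].allowed) != w || ev[i].assigned >= 0)
				continue;
			for (c = 0; c < NCOUNTERS; c++) {
				if ((ev[i].allowed & (1u << c)) && !(used & (1u << c))) {
					used |= 1u << c;
					ev[i].assigned = c;
					left--;
					break;
				}
			}
			if (ev[i].assigned < 0)
				return -1;	/* this group cannot be scheduled */
		}
	}
	return left ? -1 : 0;
}

int main(void)
{
	struct ev ev[] = { { 0xf, -1 }, { 0x1, -1 }, { 0x3, -1 } };
	int i;

	if (schedule_events(ev, 3) == 0)
		for (i = 0; i < 3; i++)
			printf("event %d -> counter %d\n", i, ev[i].assigned);
	return 0;
}
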
@@ -1275,12 +869,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
  * To be called with the event disabled in hw:
  */
 static int
-x86_perf_event_set_period(struct perf_event *event,
-			  struct hw_perf_event *hwc, int idx)
+x86_perf_event_set_period(struct perf_event *event)
 {
+	struct hw_perf_event *hwc = &event->hw;
 	s64 left = atomic64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
-	int err, ret = 0;
+	int err, ret = 0, idx = hwc->idx;
 
 	if (idx == X86_PMC_IDX_FIXED_BTS)
 		return 0;
@@ -1326,212 +920,63 @@ x86_perf_event_set_period(struct perf_event *event,
1326 return ret; 920 return ret;
1327} 921}
1328 922
1329static inline void 923static void x86_pmu_enable_event(struct perf_event *event)
1330intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1331{
1332 int idx = __idx - X86_PMC_IDX_FIXED;
1333 u64 ctrl_val, bits, mask;
1334 int err;
1335
1336 /*
1337 * Enable IRQ generation (0x8),
1338 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
1339 * if requested:
1340 */
1341 bits = 0x8ULL;
1342 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
1343 bits |= 0x2;
1344 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1345 bits |= 0x1;
1346 bits <<= (idx * 4);
1347 mask = 0xfULL << (idx * 4);
1348
1349 rdmsrl(hwc->config_base, ctrl_val);
1350 ctrl_val &= ~mask;
1351 ctrl_val |= bits;
1352 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1353}
1354
1355static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1356{ 924{
1357 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1358 u64 val;
1359
1360 val = hwc->config;
1361 if (cpuc->enabled) 926 if (cpuc->enabled)
1362 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 927 __x86_pmu_enable_event(&event->hw);
1363
1364 (void)checking_wrmsrl(hwc->config_base + idx, val);
1365} 928}
1366 929
1367 930/*
1368static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) 931 * activate a single event
1369{ 932 *
1370 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 933 * The event is added to the group of enabled events
1371		if (!__get_cpu_var(cpu_hw_events).enabled) 934 * but only if it can be scheduled with existing events.
1372 return; 935 *
1373  936 * Called with the PMU disabled. If successful (return value 1),
1374		intel_pmu_enable_bts(hwc->config); 937 * the caller is then guaranteed to call perf_enable() and hw_perf_enable()
1375 return; 938 */
1376 } 939static int x86_pmu_enable(struct perf_event *event)
1377
1378 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1379 intel_pmu_enable_fixed(hwc, idx);
1380 return;
1381 }
1382
1383 x86_pmu_enable_event(hwc, idx);
1384}
1385
1386static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1387{ 940{
1388 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 941 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
942 struct hw_perf_event *hwc;
943 int assign[X86_PMC_IDX_MAX];
944 int n, n0, ret;
1389 945
1390 if (cpuc->enabled) 946 hwc = &event->hw;
1391 x86_pmu_enable_event(hwc, idx);
1392}
1393
1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1395{
1396 unsigned int hw_event;
1397
1398 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1399
1400 if (unlikely((hw_event ==
1401 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1402 (hwc->sample_period == 1)))
1403 return X86_PMC_IDX_FIXED_BTS;
1404 947
1405 if (!x86_pmu.num_events_fixed) 948 n0 = cpuc->n_events;
1406 return -1; 949 n = collect_events(cpuc, event, false);
950 if (n < 0)
951 return n;
1407 952
953 ret = x86_schedule_events(cpuc, n, assign);
954 if (ret)
955 return ret;
1408 /* 956 /*
1409	 * fixed counters do not take all possible filters 957 * copy the new assignment now that we know it is possible;
	 958 * it will be used by hw_perf_enable()
1410 */ 959 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) 960 memcpy(cpuc->assign, assign, n*sizeof(int));
1412 return -1;
1413
1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1417 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1418 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1419 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1420
1421 return -1;
1422}
1423
1424/*
1425 * generic counter allocator: get next free counter
1426 */
1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1429{
1430 int idx;
1431
1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435 961
1436/* 962 cpuc->n_events = n;
1437 * intel-specific counter allocator: check event constraints 963 cpuc->n_added += n - n0;
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444 964
1445 if (!event_constraints) 965 return 0;
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1465{
1466 int idx;
1467
1468 idx = fixed_mode_idx(hwc);
1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1470 /* BTS is already occupied. */
1471 if (test_and_set_bit(idx, cpuc->used_mask))
1472 return -EAGAIN;
1473
1474 hwc->config_base = 0;
1475 hwc->event_base = 0;
1476 hwc->idx = idx;
1477 } else if (idx >= 0) {
1478 /*
1479 * Try to get the fixed event, if that is already taken
1480 * then try to get a generic event:
1481 */
1482 if (test_and_set_bit(idx, cpuc->used_mask))
1483 goto try_generic;
1484
1485 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1486 /*
1487 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1488 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1489 */
1490 hwc->event_base =
1491 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1492 hwc->idx = idx;
1493 } else {
1494 idx = hwc->idx;
1495 /* Try to get the previous generic event again */
1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1497try_generic:
1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1499 if (idx == -1)
1500 return -EAGAIN;
1501
1502 set_bit(idx, cpuc->used_mask);
1503 hwc->idx = idx;
1504 }
1505 hwc->config_base = x86_pmu.eventsel;
1506 hwc->event_base = x86_pmu.perfctr;
1507 }
1508
1509 return idx;
1510} 966}
1511 967
1512/* 968static int x86_pmu_start(struct perf_event *event)
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{ 969{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 970 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw; 971 int idx = event->hw.idx;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1525 perf_events_lapic_init();
1526 972
1527 x86_pmu.disable(hwc, idx); 973 if (idx == -1)
974 return -EAGAIN;
1528 975
976 x86_perf_event_set_period(event);
1529 cpuc->events[idx] = event; 977 cpuc->events[idx] = event;
1530 set_bit(idx, cpuc->active_mask); 978 __set_bit(idx, cpuc->active_mask);
1531 979 x86_pmu.enable(event);
1532 x86_perf_event_set_period(event, hwc, idx);
1533 x86_pmu.enable(hwc, idx);
1534
1535 perf_event_update_userpage(event); 980 perf_event_update_userpage(event);
1536 981
1537 return 0; 982 return 0;
@@ -1539,14 +984,8 @@ static int x86_pmu_enable(struct perf_event *event)
1539 984
1540static void x86_pmu_unthrottle(struct perf_event *event) 985static void x86_pmu_unthrottle(struct perf_event *event)
1541{ 986{
1542 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 987 int ret = x86_pmu_start(event);
1543 struct hw_perf_event *hwc = &event->hw; 988 WARN_ON_ONCE(ret);
1544
1545 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1546 cpuc->events[hwc->idx] != event))
1547 return;
1548
1549 x86_pmu.enable(hwc, hwc->idx);
1550} 989}
1551 990
1552void perf_event_print_debug(void) 991void perf_event_print_debug(void)
@@ -1576,7 +1015,7 @@ void perf_event_print_debug(void)
1576 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1577 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1578 } 1017 }
1579 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1580 1019
1581 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1020 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1582 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1600,257 +1039,50 @@ void perf_event_print_debug(void)
1600 local_irq_restore(flags); 1039 local_irq_restore(flags);
1601} 1040}
1602 1041
1603static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) 1042static void x86_pmu_stop(struct perf_event *event)
1604{
1605 struct debug_store *ds = cpuc->ds;
1606 struct bts_record {
1607 u64 from;
1608 u64 to;
1609 u64 flags;
1610 };
1611 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1612 struct bts_record *at, *top;
1613 struct perf_output_handle handle;
1614 struct perf_event_header header;
1615 struct perf_sample_data data;
1616 struct pt_regs regs;
1617
1618 if (!event)
1619 return;
1620
1621 if (!ds)
1622 return;
1623
1624 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1625 top = (struct bts_record *)(unsigned long)ds->bts_index;
1626
1627 if (top <= at)
1628 return;
1629
1630 ds->bts_index = ds->bts_buffer_base;
1631
1632
1633 data.period = event->hw.last_period;
1634 data.addr = 0;
1635 data.raw = NULL;
1636 regs.ip = 0;
1637
1638 /*
1639 * Prepare a generic sample, i.e. fill in the invariant fields.
1640 * We will overwrite the from and to address before we output
1641 * the sample.
1642 */
1643 perf_prepare_sample(&header, &data, event, &regs);
1644
1645 if (perf_output_begin(&handle, event,
1646 header.size * (top - at), 1, 1))
1647 return;
1648
1649 for (; at < top; at++) {
1650 data.ip = at->from;
1651 data.addr = at->to;
1652
1653 perf_output_sample(&handle, &header, &data, event);
1654 }
1655
1656 perf_output_end(&handle);
1657
1658 /* There's new data available. */
1659 event->hw.interrupts++;
1660 event->pending_kill = POLL_IN;
1661}
1662
1663static void x86_pmu_disable(struct perf_event *event)
1664{ 1043{
1665 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1044 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1666 struct hw_perf_event *hwc = &event->hw; 1045 struct hw_perf_event *hwc = &event->hw;
1667 int idx = hwc->idx; 1046 int idx = hwc->idx;
1668 1047
1669 /* 1048 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1670 * Must be done before we disable, otherwise the nmi handler 1049 return;
1671 * could reenable again:
1672 */
1673 clear_bit(idx, cpuc->active_mask);
1674 x86_pmu.disable(hwc, idx);
1675 1050
1676 /* 1051 x86_pmu.disable(event);
1677 * Make sure the cleared pointer becomes visible before we
1678 * (potentially) free the event:
1679 */
1680 barrier();
1681 1052
1682 /* 1053 /*
1683	 * Drain the remaining delta count out of an event 1054 * Drain the remaining delta count out of an event
1684 * that we are disabling: 1055 * that we are disabling:
1685 */ 1056 */
1686 x86_perf_event_update(event, hwc, idx); 1057 x86_perf_event_update(event);
1687
1688 /* Drain the remaining BTS records. */
1689 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1690 intel_pmu_drain_bts_buffer(cpuc);
1691 1058
1692 cpuc->events[idx] = NULL; 1059 cpuc->events[idx] = NULL;
1693 clear_bit(idx, cpuc->used_mask);
1694
1695 perf_event_update_userpage(event);
1696}
1697
1698/*
1699 * Save and restart an expired event. Called by NMI contexts,
1700 * so it has to be careful about preempting normal event ops:
1701 */
1702static int intel_pmu_save_and_restart(struct perf_event *event)
1703{
1704 struct hw_perf_event *hwc = &event->hw;
1705 int idx = hwc->idx;
1706 int ret;
1707
1708 x86_perf_event_update(event, hwc, idx);
1709 ret = x86_perf_event_set_period(event, hwc, idx);
1710
1711 if (event->state == PERF_EVENT_STATE_ACTIVE)
1712 intel_pmu_enable_event(hwc, idx);
1713
1714 return ret;
1715}
1716
1717static void intel_pmu_reset(void)
1718{
1719 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1720 unsigned long flags;
1721 int idx;
1722
1723 if (!x86_pmu.num_events)
1724 return;
1725
1726 local_irq_save(flags);
1727
1728 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1729
1730 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1731 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1732 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1733 }
1734 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1735 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1736 }
1737 if (ds)
1738 ds->bts_index = ds->bts_buffer_base;
1739
1740 local_irq_restore(flags);
1741}
1742
1743static int p6_pmu_handle_irq(struct pt_regs *regs)
1744{
1745 struct perf_sample_data data;
1746 struct cpu_hw_events *cpuc;
1747 struct perf_event *event;
1748 struct hw_perf_event *hwc;
1749 int idx, handled = 0;
1750 u64 val;
1751
1752 data.addr = 0;
1753 data.raw = NULL;
1754
1755 cpuc = &__get_cpu_var(cpu_hw_events);
1756
1757 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1758 if (!test_bit(idx, cpuc->active_mask))
1759 continue;
1760
1761 event = cpuc->events[idx];
1762 hwc = &event->hw;
1763
1764 val = x86_perf_event_update(event, hwc, idx);
1765 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1766 continue;
1767
1768 /*
1769 * event overflow
1770 */
1771 handled = 1;
1772 data.period = event->hw.last_period;
1773
1774 if (!x86_perf_event_set_period(event, hwc, idx))
1775 continue;
1776
1777 if (perf_event_overflow(event, 1, &data, regs))
1778 p6_pmu_disable_event(hwc, idx);
1779 }
1780
1781 if (handled)
1782 inc_irq_stat(apic_perf_irqs);
1783
1784 return handled;
1785} 1060}
1786 1061
1787/* 1062static void x86_pmu_disable(struct perf_event *event)
1788 * This handler is triggered by the local APIC, so the APIC IRQ handling
1789 * rules apply:
1790 */
1791static int intel_pmu_handle_irq(struct pt_regs *regs)
1792{ 1063{
1793 struct perf_sample_data data; 1064 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1794 struct cpu_hw_events *cpuc; 1065 int i;
1795 int bit, loops;
1796 u64 ack, status;
1797
1798 data.addr = 0;
1799 data.raw = NULL;
1800
1801 cpuc = &__get_cpu_var(cpu_hw_events);
1802 1066
1803 perf_disable(); 1067 x86_pmu_stop(event);
1804 intel_pmu_drain_bts_buffer(cpuc);
1805 status = intel_pmu_get_status();
1806 if (!status) {
1807 perf_enable();
1808 return 0;
1809 }
1810 1068
1811 loops = 0; 1069 for (i = 0; i < cpuc->n_events; i++) {
1812again: 1070 if (event == cpuc->event_list[i]) {
1813 if (++loops > 100) {
1814 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1815 perf_event_print_debug();
1816 intel_pmu_reset();
1817 perf_enable();
1818 return 1;
1819 }
1820
1821 inc_irq_stat(apic_perf_irqs);
1822 ack = status;
1823 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1824 struct perf_event *event = cpuc->events[bit];
1825 1071
1826 clear_bit(bit, (unsigned long *) &status); 1072 if (x86_pmu.put_event_constraints)
1827 if (!test_bit(bit, cpuc->active_mask)) 1073 x86_pmu.put_event_constraints(cpuc, event);
1828 continue;
1829 1074
1830 if (!intel_pmu_save_and_restart(event)) 1075 while (++i < cpuc->n_events)
1831 continue; 1076 cpuc->event_list[i-1] = cpuc->event_list[i];
1832 1077
1833 data.period = event->hw.last_period; 1078 --cpuc->n_events;
1834 1079 break;
1835 if (perf_event_overflow(event, 1, &data, regs)) 1080 }
1836 intel_pmu_disable_event(&event->hw, bit);
1837 } 1081 }
1838 1082 perf_event_update_userpage(event);
1839 intel_pmu_ack_status(ack);
1840
1841 /*
1842 * Repeat if there is more work to be done:
1843 */
1844 status = intel_pmu_get_status();
1845 if (status)
1846 goto again;
1847
1848 perf_enable();
1849
1850 return 1;
1851} 1083}
1852 1084
1853static int amd_pmu_handle_irq(struct pt_regs *regs) 1085static int x86_pmu_handle_irq(struct pt_regs *regs)
1854{ 1086{
1855 struct perf_sample_data data; 1087 struct perf_sample_data data;
1856 struct cpu_hw_events *cpuc; 1088 struct cpu_hw_events *cpuc;
@@ -1859,8 +1091,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1859 int idx, handled = 0; 1091 int idx, handled = 0;
1860 u64 val; 1092 u64 val;
1861 1093
1862 data.addr = 0; 1094 perf_sample_data_init(&data, 0);
1863 data.raw = NULL;
1864 1095
1865 cpuc = &__get_cpu_var(cpu_hw_events); 1096 cpuc = &__get_cpu_var(cpu_hw_events);
1866 1097
@@ -1871,7 +1102,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1871 event = cpuc->events[idx]; 1102 event = cpuc->events[idx];
1872 hwc = &event->hw; 1103 hwc = &event->hw;
1873 1104
1874 val = x86_perf_event_update(event, hwc, idx); 1105 val = x86_perf_event_update(event);
1875 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1106 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1876 continue; 1107 continue;
1877 1108
@@ -1881,11 +1112,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1881 handled = 1; 1112 handled = 1;
1882 data.period = event->hw.last_period; 1113 data.period = event->hw.last_period;
1883 1114
1884 if (!x86_perf_event_set_period(event, hwc, idx)) 1115 if (!x86_perf_event_set_period(event))
1885 continue; 1116 continue;
1886 1117
1887 if (perf_event_overflow(event, 1, &data, regs)) 1118 if (perf_event_overflow(event, 1, &data, regs))
1888 amd_pmu_disable_event(hwc, idx); 1119 x86_pmu_stop(event);
1889 } 1120 }
1890 1121
1891 if (handled) 1122 if (handled)
@@ -1968,193 +1199,171 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1968 .priority = 1 1199 .priority = 1
1969}; 1200};
1970 1201
1971static __initconst struct x86_pmu p6_pmu = { 1202static struct event_constraint unconstrained;
1972 .name = "p6", 1203static struct event_constraint emptyconstraint;
1973 .handle_irq = p6_pmu_handle_irq,
1974 .disable_all = p6_pmu_disable_all,
1975 .enable_all = p6_pmu_enable_all,
1976 .enable = p6_pmu_enable_event,
1977 .disable = p6_pmu_disable_event,
1978 .eventsel = MSR_P6_EVNTSEL0,
1979 .perfctr = MSR_P6_PERFCTR0,
1980 .event_map = p6_pmu_event_map,
1981 .raw_event = p6_pmu_raw_event,
1982 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1983 .apic = 1,
1984 .max_period = (1ULL << 31) - 1,
1985 .version = 0,
1986 .num_events = 2,
1987 /*
1988 * Events have 40 bits implemented. However they are designed such
1989 * that bits [32-39] are sign extensions of bit 31. As such the
1990	 * effective width of an event for a P6-like PMU is 32 bits only.
1991 *
1992 * See IA-32 Intel Architecture Software developer manual Vol 3B
1993 */
1994 .event_bits = 32,
1995 .event_mask = (1ULL << 32) - 1,
1996 .get_event_idx = intel_get_event_idx,
1997};
1998 1204
1999static __initconst struct x86_pmu intel_pmu = { 1205static struct event_constraint *
2000 .name = "Intel", 1206x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
2001 .handle_irq = intel_pmu_handle_irq, 1207{
2002 .disable_all = intel_pmu_disable_all, 1208 struct event_constraint *c;
2003 .enable_all = intel_pmu_enable_all,
2004 .enable = intel_pmu_enable_event,
2005 .disable = intel_pmu_disable_event,
2006 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
2007 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
2008 .event_map = intel_pmu_event_map,
2009 .raw_event = intel_pmu_raw_event,
2010 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
2011 .apic = 1,
2012 /*
2013 * Intel PMCs cannot be accessed sanely above 32 bit width,
2014 * so we install an artificial 1<<31 period regardless of
2015 * the generic event period:
2016 */
2017 .max_period = (1ULL << 31) - 1,
2018 .enable_bts = intel_pmu_enable_bts,
2019 .disable_bts = intel_pmu_disable_bts,
2020 .get_event_idx = intel_get_event_idx,
2021};
2022 1209
2023static __initconst struct x86_pmu amd_pmu = { 1210 if (x86_pmu.event_constraints) {
2024 .name = "AMD", 1211 for_each_event_constraint(c, x86_pmu.event_constraints) {
2025 .handle_irq = amd_pmu_handle_irq, 1212 if ((event->hw.config & c->cmask) == c->code)
2026 .disable_all = amd_pmu_disable_all, 1213 return c;
2027 .enable_all = amd_pmu_enable_all, 1214 }
2028 .enable = amd_pmu_enable_event, 1215 }
2029 .disable = amd_pmu_disable_event, 1216
2030 .eventsel = MSR_K7_EVNTSEL0, 1217 return &unconstrained;
2031 .perfctr = MSR_K7_PERFCTR0, 1218}
2032 .event_map = amd_pmu_event_map,
2033 .raw_event = amd_pmu_raw_event,
2034 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
2035 .num_events = 4,
2036 .event_bits = 48,
2037 .event_mask = (1ULL << 48) - 1,
2038 .apic = 1,
2039 /* use highest bit to detect overflow */
2040 .max_period = (1ULL << 47) - 1,
2041 .get_event_idx = gen_get_event_idx,
2042};
2043 1219
2044static __init int p6_pmu_init(void) 1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
2045{ 1222{
2046 switch (boot_cpu_data.x86_model) { 1223 int ret = 0;
2047 case 1:
2048 case 3: /* Pentium Pro */
2049 case 5:
2050 case 6: /* Pentium II */
2051 case 7:
2052 case 8:
2053 case 11: /* Pentium III */
2054 event_constraints = intel_p6_event_constraints;
2055 break;
2056 case 9:
2057 case 13:
2058 /* Pentium M */
2059 event_constraints = intel_p6_event_constraints;
2060 break;
2061 default:
2062 pr_cont("unsupported p6 CPU model %d ",
2063 boot_cpu_data.x86_model);
2064 return -ENODEV;
2065 }
2066 1224
2067 x86_pmu = p6_pmu; 1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
2068 1228
2069 return 0; 1229 if (!is_x86_event(event))
1230 ret = event->pmu->enable(event);
1231
1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
2070} 1239}
2071 1240
2072static __init int intel_pmu_init(void) 1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
2073{ 1243{
2074 union cpuid10_edx edx; 1244 event->state = PERF_EVENT_STATE_INACTIVE;
2075 union cpuid10_eax eax; 1245 event->oncpu = -1;
2076 unsigned int unused;
2077 unsigned int ebx;
2078 int version;
2079
2080 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
2081 /* check for P6 processor family */
2082 if (boot_cpu_data.x86 == 6) {
2083 return p6_pmu_init();
2084 } else {
2085 return -ENODEV;
2086 }
2087 }
2088 1246
2089 /* 1247 if (!is_x86_event(event))
2090 * Check whether the Architectural PerfMon supports 1248 event->pmu->disable(event);
2091 * Branch Misses Retired hw_event or not.
2092 */
2093 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
2094 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
2095 return -ENODEV;
2096 1249
2097 version = eax.split.version_id; 1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
2098 if (version < 2)
2099 return -ENODEV;
2100 1251
2101 x86_pmu = intel_pmu; 1252 if (!is_software_event(event))
2102 x86_pmu.version = version; 1253 cpuctx->active_oncpu--;
2103 x86_pmu.num_events = eax.split.num_events;
2104 x86_pmu.event_bits = eax.split.bit_width;
2105 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
2106 1254
1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1256 cpuctx->exclusive = 0;
1257}
1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
 1265 * Called with the PMU disabled. If successful (return value 1),
 1266 * the caller is then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
2107 /* 1299 /*
2108	 * Quirk: v2 perfmon does not report fixed-purpose events, so 1300 * copy the new assignment now that we know it is possible;
2109	 * assume at least 3 events: 1301 * it will be used by hw_perf_enable()
2110 */ 1302 */
2111 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
2112 1308
2113 /* 1309 /*
2114 * Install the hw-cache-events table: 1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
 1313 * this way we ensure the caller won't try to enable
1314 * individual events
2115 */ 1315 */
2116 switch (boot_cpu_data.x86_model) { 1316 return 1;
2117 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 1317undo:
2118 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 1318 x86_event_sched_out(leader, cpuctx);
2119 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 1319 n0 = 1;
2120 case 29: /* six-core 45 nm xeon "Dunnington" */ 1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2121 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
2122 sizeof(hw_cache_event_ids)); 1322 x86_event_sched_out(sub, cpuctx);
2123 1323 if (++n0 == n1)
2124 pr_cont("Core2 events, "); 1324 break;
2125 event_constraints = intel_core_event_constraints; 1325 }
2126 break; 1326 }
2127 default: 1327 return ret;
2128 case 26: 1328}
2129 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1329
2130 sizeof(hw_cache_event_ids)); 1330#include "perf_event_amd.c"
1331#include "perf_event_p6.c"
1332#include "perf_event_intel.c"
2131 1333
2132 event_constraints = intel_nehalem_event_constraints; 1334static int __cpuinit
2133 pr_cont("Nehalem/Corei7 events, "); 1335x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1336{
1337 unsigned int cpu = (long)hcpu;
1338 int ret = NOTIFY_OK;
1339
1340 switch (action & ~CPU_TASKS_FROZEN) {
1341 case CPU_UP_PREPARE:
1342 if (x86_pmu.cpu_prepare)
1343 ret = x86_pmu.cpu_prepare(cpu);
2134 break; 1344 break;
2135 case 28:
2136 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2137 sizeof(hw_cache_event_ids));
2138 1345
2139 pr_cont("Atom events, "); 1346 case CPU_STARTING:
1347 if (x86_pmu.cpu_starting)
1348 x86_pmu.cpu_starting(cpu);
2140 break; 1349 break;
2141 }
2142 return 0;
2143}
2144 1350
2145static __init int amd_pmu_init(void) 1351 case CPU_DYING:
2146{ 1352 if (x86_pmu.cpu_dying)
2147 /* Performance-monitoring supported from K7 and later: */ 1353 x86_pmu.cpu_dying(cpu);
2148 if (boot_cpu_data.x86 < 6) 1354 break;
2149 return -ENODEV;
2150 1355
2151 x86_pmu = amd_pmu; 1356 case CPU_UP_CANCELED:
1357 case CPU_DEAD:
1358 if (x86_pmu.cpu_dead)
1359 x86_pmu.cpu_dead(cpu);
1360 break;
2152 1361
2153 /* Events are common for all AMDs */ 1362 default:
2154 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 1363 break;
2155 sizeof(hw_cache_event_ids)); 1364 }
2156 1365
2157 return 0; 1366 return ret;
2158} 1367}
2159 1368
2160static void __init pmu_check_apic(void) 1369static void __init pmu_check_apic(void)
@@ -2169,6 +1378,7 @@ static void __init pmu_check_apic(void)
2169 1378
2170void __init init_hw_perf_events(void) 1379void __init init_hw_perf_events(void)
2171{ 1380{
1381 struct event_constraint *c;
2172 int err; 1382 int err;
2173 1383
2174 pr_info("Performance Events: "); 1384 pr_info("Performance Events: ");
@@ -2213,6 +1423,20 @@ void __init init_hw_perf_events(void)
2213 perf_events_lapic_init(); 1423 perf_events_lapic_init();
2214 register_die_notifier(&perf_event_nmi_notifier); 1424 register_die_notifier(&perf_event_nmi_notifier);
2215 1425
1426 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
1428 0, x86_pmu.num_events);
1429
1430 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK)
1433 continue;
1434
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
1436 c->weight += x86_pmu.num_events;
1437 }
1438 }
1439
2216 pr_info("... version: %d\n", x86_pmu.version); 1440 pr_info("... version: %d\n", x86_pmu.version);
2217 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1441 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2218 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1442 pr_info("... generic registers: %d\n", x86_pmu.num_events);
@@ -2220,60 +1444,91 @@ void __init init_hw_perf_events(void)
2220 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2221 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
2222 pr_info("... event mask: %016Lx\n", perf_event_mask); 1446 pr_info("... event mask: %016Lx\n", perf_event_mask);
1447
1448 perf_cpu_notifier(x86_pmu_notifier);
2223} 1449}
2224 1450
2225static inline void x86_pmu_read(struct perf_event *event) 1451static inline void x86_pmu_read(struct perf_event *event)
2226{ 1452{
2227 x86_perf_event_update(event, &event->hw, event->hw.idx); 1453 x86_perf_event_update(event);
2228} 1454}
2229 1455
2230static const struct pmu pmu = { 1456static const struct pmu pmu = {
2231 .enable = x86_pmu_enable, 1457 .enable = x86_pmu_enable,
2232 .disable = x86_pmu_disable, 1458 .disable = x86_pmu_disable,
1459 .start = x86_pmu_start,
1460 .stop = x86_pmu_stop,
2233 .read = x86_pmu_read, 1461 .read = x86_pmu_read,
2234 .unthrottle = x86_pmu_unthrottle, 1462 .unthrottle = x86_pmu_unthrottle,
2235}; 1463};
2236 1464
2237static int 1465/*
2238validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) 1466 * validate a single event group
2239{ 1467 *
2240	 struct hw_perf_event fake_event = event->hw; 1468 * validation includes:
2241 1469 * - check events are compatible with each other
2242 if (event->pmu && event->pmu != &pmu) 1470 * - events do not compete for the same counter
2243 return 0; 1471 * - number of events <= number of counters
2244 1472 *
2245 return x86_schedule_event(cpuc, &fake_event) >= 0; 1473 * validation ensures the group can be loaded onto the
2246} 1474 * PMU if it was the only group available.
2247 1475 */
2248static int validate_group(struct perf_event *event) 1476static int validate_group(struct perf_event *event)
2249{ 1477{
2250 struct perf_event *sibling, *leader = event->group_leader; 1478 struct perf_event *leader = event->group_leader;
2251 struct cpu_hw_events fake_pmu; 1479 struct cpu_hw_events *fake_cpuc;
1480 int ret, n;
2252 1481
2253 memset(&fake_pmu, 0, sizeof(fake_pmu)); 1482 ret = -ENOMEM;
1483 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1484 if (!fake_cpuc)
1485 goto out;
2254 1486
2255 if (!validate_event(&fake_pmu, leader)) 1487 /*
2256 return -ENOSPC; 1488 * the event is not yet connected with its
 1489 * siblings; therefore we must first collect
1490 * existing siblings, then add the new event
1491 * before we can simulate the scheduling
1492 */
1493 ret = -ENOSPC;
1494 n = collect_events(fake_cpuc, leader, true);
1495 if (n < 0)
1496 goto out_free;
2257 1497
2258 list_for_each_entry(sibling, &leader->sibling_list, group_entry) { 1498 fake_cpuc->n_events = n;
2259 if (!validate_event(&fake_pmu, sibling)) 1499 n = collect_events(fake_cpuc, event, false);
2260 return -ENOSPC; 1500 if (n < 0)
2261 } 1501 goto out_free;
2262 1502
2263 if (!validate_event(&fake_pmu, event)) 1503 fake_cpuc->n_events = n;
2264 return -ENOSPC;
2265 1504
2266 return 0; 1505 ret = x86_schedule_events(fake_cpuc, n, NULL);
1506
1507out_free:
1508 kfree(fake_cpuc);
1509out:
1510 return ret;
2267} 1511}
2268 1512
2269const struct pmu *hw_perf_event_init(struct perf_event *event) 1513const struct pmu *hw_perf_event_init(struct perf_event *event)
2270{ 1514{
1515 const struct pmu *tmp;
2271 int err; 1516 int err;
2272 1517
2273 err = __hw_perf_event_init(event); 1518 err = __hw_perf_event_init(event);
2274 if (!err) { 1519 if (!err) {
1520 /*
1521 * we temporarily connect event to its pmu
1522 * such that validate_group() can classify
1523 * it as an x86 event using is_x86_event()
1524 */
1525 tmp = event->pmu;
1526 event->pmu = &pmu;
1527
2275 if (event->group_leader != event) 1528 if (event->group_leader != event)
2276 err = validate_group(event); 1529 err = validate_group(event);
1530
1531 event->pmu = tmp;
2277 } 1532 }
2278 if (err) { 1533 if (err) {
2279 if (event->destroy) 1534 if (event->destroy)
@@ -2297,7 +1552,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2297 1552
2298static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1553static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2299static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1554static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2300static DEFINE_PER_CPU(int, in_ignored_frame);
2301 1555
2302 1556
2303static void 1557static void
@@ -2313,10 +1567,6 @@ static void backtrace_warning(void *data, char *msg)
2313 1567
2314static int backtrace_stack(void *data, char *name) 1568static int backtrace_stack(void *data, char *name)
2315{ 1569{
2316 per_cpu(in_ignored_frame, smp_processor_id()) =
2317 x86_is_stack_id(NMI_STACK, name) ||
2318 x86_is_stack_id(DEBUG_STACK, name);
2319
2320 return 0; 1570 return 0;
2321} 1571}
2322 1572
@@ -2324,9 +1574,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2324{ 1574{
2325 struct perf_callchain_entry *entry = data; 1575 struct perf_callchain_entry *entry = data;
2326 1576
2327 if (per_cpu(in_ignored_frame, smp_processor_id()))
2328 return;
2329
2330 if (reliable) 1577 if (reliable)
2331 callchain_store(entry, addr); 1578 callchain_store(entry, addr);
2332} 1579}
@@ -2347,7 +1594,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2347 callchain_store(entry, PERF_CONTEXT_KERNEL); 1594 callchain_store(entry, PERF_CONTEXT_KERNEL);
2348 callchain_store(entry, regs->ip); 1595 callchain_store(entry, regs->ip);
2349 1596
2350 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2351} 1598}
2352 1599
2353/* 1600/*
@@ -2385,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
2385 return len; 1632 return len;
2386} 1633}
2387 1634
2388static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 1635#ifdef CONFIG_COMPAT
1636static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
2389{ 1638{
2390 unsigned long bytes; 1639 /* 32-bit process in 64-bit kernel. */
1640 struct stack_frame_ia32 frame;
1641 const void __user *fp;
1642
1643 if (!test_thread_flag(TIF_IA32))
1644 return 0;
1645
1646 fp = compat_ptr(regs->bp);
1647 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1648 unsigned long bytes;
1649 frame.next_frame = 0;
1650 frame.return_address = 0;
2391 1651
2392 bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); 1652 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1653 if (bytes != sizeof(frame))
1654 break;
1655
1656 if (fp < compat_ptr(regs->sp))
1657 break;
2393 1658
2394 return bytes == sizeof(*frame); 1659 callchain_store(entry, frame.return_address);
1660 fp = compat_ptr(frame.next_frame);
1661 }
1662 return 1;
1663}
1664#else
1665static inline int
1666perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1667{
1668 return 0;
2395} 1669}
1670#endif
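
A sketch under stated assumptions: the walk in perf_callchain_user32() above reads one compat frame at a time through copy_from_user_nmi(); the frame layout is assumed here to be two 32-bit words (the caller's saved frame pointer, then the return address). Everything below is illustrative only: the toy_* names and the synthetic stack are invented, and the simplified "must move upwards" check stands in for the kernel's comparison against the user stack pointer.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* assumed layout of one saved ia32 frame: saved frame pointer, then return address */
struct toy_frame32 {
	uint32_t next_frame;
	uint32_t return_address;
};

#define TOY_MAX_DEPTH 8

/* walk a chain of frames stored in 'stack', starting at byte offset 'fp' */
static int toy_walk(const unsigned char *stack, size_t size, uint32_t fp)
{
	int depth = 0;

	while (depth < TOY_MAX_DEPTH && fp + sizeof(struct toy_frame32) <= size) {
		struct toy_frame32 frame;

		memcpy(&frame, stack + fp, sizeof(frame));
		printf("return address: %#lx\n", (unsigned long)frame.return_address);
		depth++;

		/* simplified sanity check: the chain must move strictly upwards */
		if (frame.next_frame <= fp)
			break;
		fp = frame.next_frame;
	}
	return depth;
}

int main(void)
{
	/* two fake frames: the frame at offset 0 links to the one at offset 16 */
	unsigned char stack[32] = { 0 };
	struct toy_frame32 f0 = { 16, 0x80481234 }, f1 = { 0, 0x80485678 };

	memcpy(stack +  0, &f0, sizeof(f0));
	memcpy(stack + 16, &f1, sizeof(f1));
	printf("walked %d frames\n", toy_walk(stack, sizeof(stack), 0));
	return 0;
}
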
2396 1671
2397static void 1672static void
2398perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1673perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -2408,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
2408 callchain_store(entry, PERF_CONTEXT_USER); 1683 callchain_store(entry, PERF_CONTEXT_USER);
2409 callchain_store(entry, regs->ip); 1684 callchain_store(entry, regs->ip);
2410 1685
1686 if (perf_callchain_user32(regs, entry))
1687 return;
1688
2411 while (entry->nr < PERF_MAX_STACK_DEPTH) { 1689 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1690 unsigned long bytes;
2412 frame.next_frame = NULL; 1691 frame.next_frame = NULL;
2413 frame.return_address = 0; 1692 frame.return_address = 0;
2414 1693
2415 if (!copy_stack_frame(fp, &frame)) 1694 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1695 if (bytes != sizeof(frame))
2416 break; 1696 break;
2417 1697
2418 if ((unsigned long)fp < regs->sp) 1698 if ((unsigned long)fp < regs->sp)
@@ -2433,9 +1713,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
2433 1713
2434 is_user = user_mode(regs); 1714 is_user = user_mode(regs);
2435 1715
2436 if (!current || current->pid == 0)
2437 return;
2438
2439 if (is_user && current->state != TASK_RUNNING) 1716 if (is_user && current->state != TASK_RUNNING)
2440 return; 1717 return;
2441 1718
@@ -2462,7 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2462 return entry; 1739 return entry;
2463} 1740}
2464 1741
2465void hw_perf_event_setup_online(int cpu) 1742void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2466{ 1743{
2467 init_debug_store_on_cpu(cpu); 1744 regs->ip = ip;
1745 /*
 1746 * perf_arch_fetch_caller_regs adds another call, so we need to increment
1747 * the skip level
1748 */
1749 regs->bp = rewind_frame_pointer(skip + 1);
1750 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags);
2468} 1752}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..db6f7d4056e1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,422 @@
1#ifdef CONFIG_CPU_SUP_AMD
2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
9{
10 [ C(L1D) ] = {
11 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
14 },
15 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
17 [ C(RESULT_MISS) ] = 0,
18 },
19 [ C(OP_PREFETCH) ] = {
20 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
21 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
22 },
23 },
24 [ C(L1I ) ] = {
25 [ C(OP_READ) ] = {
26 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
27 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
28 },
29 [ C(OP_WRITE) ] = {
30 [ C(RESULT_ACCESS) ] = -1,
31 [ C(RESULT_MISS) ] = -1,
32 },
33 [ C(OP_PREFETCH) ] = {
34 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
35 [ C(RESULT_MISS) ] = 0,
36 },
37 },
38 [ C(LL ) ] = {
39 [ C(OP_READ) ] = {
40 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
41 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
42 },
43 [ C(OP_WRITE) ] = {
44 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
45 [ C(RESULT_MISS) ] = 0,
46 },
47 [ C(OP_PREFETCH) ] = {
48 [ C(RESULT_ACCESS) ] = 0,
49 [ C(RESULT_MISS) ] = 0,
50 },
51 },
52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
56 },
57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0,
60 },
61 [ C(OP_PREFETCH) ] = {
62 [ C(RESULT_ACCESS) ] = 0,
63 [ C(RESULT_MISS) ] = 0,
64 },
65 },
66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = {
 68		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
70 },
71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1,
73 [ C(RESULT_MISS) ] = -1,
74 },
75 [ C(OP_PREFETCH) ] = {
76 [ C(RESULT_ACCESS) ] = -1,
77 [ C(RESULT_MISS) ] = -1,
78 },
79 },
80 [ C(BPU ) ] = {
81 [ C(OP_READ) ] = {
82 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
83 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
84 },
85 [ C(OP_WRITE) ] = {
86 [ C(RESULT_ACCESS) ] = -1,
87 [ C(RESULT_MISS) ] = -1,
88 },
89 [ C(OP_PREFETCH) ] = {
90 [ C(RESULT_ACCESS) ] = -1,
91 [ C(RESULT_MISS) ] = -1,
92 },
93 },
94};
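
A sketch only, with made-up toy_* names and a couple of codes copied from the table above: this is the shape of lookup that consumers of a [cache][op][result] table like amd_hw_cache_event_ids perform (the consuming code lives elsewhere in perf_event.c and is not part of this hunk). By convention in these tables, 0 is read as "not supported" and -1 as "invalid combination".

#include <stdint.h>
#include <stdio.h>

enum { TOY_L1D, TOY_L1I, TOY_CACHE_MAX };
enum { TOY_OP_READ, TOY_OP_WRITE, TOY_OP_MAX };
enum { TOY_ACCESS, TOY_MISS, TOY_RESULT_MAX };

static const int64_t toy_cache_event_ids[TOY_CACHE_MAX][TOY_OP_MAX][TOY_RESULT_MAX] = {
	[TOY_L1D] = {
		[TOY_OP_READ]  = { [TOY_ACCESS] = 0x0040, [TOY_MISS] = 0x0041 },
		[TOY_OP_WRITE] = { [TOY_ACCESS] = 0x0142, [TOY_MISS] = 0      },
	},
	[TOY_L1I] = {
		[TOY_OP_READ]  = { [TOY_ACCESS] = 0x0080, [TOY_MISS] = 0x0081 },
		[TOY_OP_WRITE] = { [TOY_ACCESS] = -1,     [TOY_MISS] = -1     },
	},
};

/* returns the raw event code, 0 if unsupported, or -1 if the combination is invalid */
static int64_t toy_cache_event(int cache, int op, int result)
{
	if (cache < 0 || cache >= TOY_CACHE_MAX ||
	    op < 0 || op >= TOY_OP_MAX ||
	    result < 0 || result >= TOY_RESULT_MAX)
		return -1;
	return toy_cache_event_ids[cache][op][result];
}

int main(void)
{
	printf("L1D read miss -> config %#llx\n",
	       (unsigned long long)toy_cache_event(TOY_L1D, TOY_OP_READ, TOY_MISS));
	return 0;
}
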
95
96/*
97 * AMD Performance Monitor K7 and later.
98 */
99static const u64 amd_perfmon_event_map[] =
100{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
107};
108
109static u64 amd_pmu_event_map(int hw_event)
110{
111 return amd_perfmon_event_map[hw_event];
112}
113
114static u64 amd_pmu_raw_event(u64 hw_event)
115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
121
122#define K7_EVNTSEL_MASK \
123 (K7_EVNTSEL_EVENT_MASK | \
124 K7_EVNTSEL_UNIT_MASK | \
125 K7_EVNTSEL_EDGE_MASK | \
126 K7_EVNTSEL_INV_MASK | \
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130}
131
132/*
133 * AMD64 events are detected based on their event codes.
134 */
135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
136{
137 return (hwc->config & 0xe0) == 0xe0;
138}
139
140static inline int amd_has_nb(struct cpu_hw_events *cpuc)
141{
142 struct amd_nb *nb = cpuc->amd_nb;
143
144 return nb && nb->nb_id != -1;
145}
146
147static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
148 struct perf_event *event)
149{
150 struct hw_perf_event *hwc = &event->hw;
151 struct amd_nb *nb = cpuc->amd_nb;
152 int i;
153
154 /*
155 * only care about NB events
156 */
157 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
158 return;
159
160 /*
161 * need to scan whole list because event may not have
162 * been assigned during scheduling
163 *
164 * no race condition possible because event can only
165 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here
167 */
168 for (i = 0; i < x86_pmu.num_events; i++) {
169 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL);
171 break;
172 }
173 }
174}
175
176 /*
177 * AMD64 NorthBridge events need special treatment because
178 * counter access needs to be synchronized across all cores
179 * of a package. Refer to BKDG section 3.12
180 *
 181 * NB events are events measuring L3 cache, HyperTransport
 182 * traffic. They are identified by an event code >= 0xe00.
 183 * They measure events on the NorthBridge, which is shared
184 * by all cores on a package. NB events are counted on a
185 * shared set of counters. When a NB event is programmed
186 * in a counter, the data actually comes from a shared
187 * counter. Thus, access to those counters needs to be
188 * synchronized.
189 *
190 * We implement the synchronization such that no two cores
191 * can be measuring NB events using the same counters. Thus,
192 * we maintain a per-NB allocation table. The available slot
193 * is propagated using the event_constraint structure.
194 *
195 * We provide only one choice for each NB event based on
196 * the fact that only NB events have restrictions. Consequently,
197 * if a counter is available, there is a guarantee the NB event
198 * will be assigned to it. If no slot is available, an empty
199 * constraint is returned and scheduling will eventually fail
200 * for this event.
201 *
 202 * Note that all cores attached to the same NB compete for the same
203 * counters to host NB events, this is why we use atomic ops. Some
204 * multi-chip CPUs may have more than one NB.
205 *
206 * Given that resources are allocated (cmpxchg), they must be
207 * eventually freed for others to use. This is accomplished by
208 * calling amd_put_event_constraints().
209 *
210 * Non NB events are not impacted by this restriction.
211 */
212static struct event_constraint *
213amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
214{
215 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events;
219 int i, j, k = -1;
220
221 /*
222 * if not NB event or no NB, then no constraints
223 */
224 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
225 return &unconstrained;
226
227 /*
228 * detect if already present, if so reuse
229 *
230 * cannot merge with actual allocation
231 * because of possible holes
232 *
233 * event can already be present yet not assigned (in hwc->idx)
234 * because of successive calls to x86_schedule_events() from
235 * hw_perf_group_sched_in() without hw_perf_enable()
236 */
237 for (i = 0; i < max; i++) {
238 /*
239 * keep track of first free slot
240 */
241 if (k == -1 && !nb->owners[i])
242 k = i;
243
244 /* already present, reuse */
245 if (nb->owners[i] == event)
246 goto done;
247 }
248 /*
249 * not present, so grab a new slot
250 * starting either at:
251 */
252 if (hwc->idx != -1) {
253 /* previous assignment */
254 i = hwc->idx;
255 } else if (k != -1) {
256 /* start from free slot found */
257 i = k;
258 } else {
259 /*
260 * event not found, no slot found in
261 * first pass, try again from the
262 * beginning
263 */
264 i = 0;
265 }
266 j = i;
267 do {
268 old = cmpxchg(nb->owners+i, NULL, event);
269 if (!old)
270 break;
271 if (++i == max)
272 i = 0;
273 } while (i != j);
274done:
275 if (!old)
276 return &nb->event_constraints[i];
277
278 return &emptyconstraint;
279}
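
A sketch, assuming nothing beyond what the comment block above states: the core of the per-NB allocation is a round-robin compare-and-swap claim over the shared owners[] array. The toy below compresses the kernel's "reuse if already owned, else grab a free slot" logic into one loop, using C11 atomics in place of the kernel's cmpxchg(); all toy_* names are invented, and the release helper mirrors what amd_put_event_constraints() above does.

#include <stdatomic.h>
#include <stdio.h>

#define TOY_NUM_COUNTERS 4

struct toy_nb {
	_Atomic(void *) owners[TOY_NUM_COUNTERS];
};

/* claim a counter slot for 'event'; returns the slot, or -1 (empty constraint) */
static int toy_claim_slot(struct toy_nb *nb, void *event, int start)
{
	int i = (start >= 0 && start < TOY_NUM_COUNTERS) ? start : 0;
	int j = i;

	do {
		void *expected = NULL;

		/* reuse a slot this event already owns */
		if (atomic_load(&nb->owners[i]) == event)
			return i;
		/* otherwise try to grab a free slot atomically */
		if (atomic_compare_exchange_strong(&nb->owners[i], &expected, event))
			return i;
		if (++i == TOY_NUM_COUNTERS)
			i = 0;
	} while (i != j);

	return -1;
}

/* give the slot back so another core can claim it */
static void toy_release_slot(struct toy_nb *nb, void *event)
{
	int i;

	for (i = 0; i < TOY_NUM_COUNTERS; i++) {
		void *expected = event;

		if (atomic_compare_exchange_strong(&nb->owners[i], &expected, NULL))
			break;
	}
}

int main(void)
{
	static struct toy_nb nb;	/* zero-initialized: all slots free */
	int a = 0, b = 0;		/* stand-ins for two events */
	int sa = toy_claim_slot(&nb, &a, -1);
	int sb = toy_claim_slot(&nb, &b, -1);

	printf("event a -> slot %d, event b -> slot %d\n", sa, sb);
	toy_release_slot(&nb, &a);
	toy_release_slot(&nb, &b);
	return 0;
}
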
280
281static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
282{
283 struct amd_nb *nb;
284 int i;
285
286 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
287 if (!nb)
288 return NULL;
289
290 memset(nb, 0, sizeof(*nb));
291 nb->nb_id = nb_id;
292
293 /*
294 * initialize all possible NB constraints
295 */
296 for (i = 0; i < x86_pmu.num_events; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1;
299 }
300 return nb;
301}
302
303static int amd_pmu_cpu_prepare(int cpu)
304{
305 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
306
307 WARN_ON_ONCE(cpuc->amd_nb);
308
309 if (boot_cpu_data.x86_max_cores < 2)
310 return NOTIFY_OK;
311
312 cpuc->amd_nb = amd_alloc_nb(cpu, -1);
313 if (!cpuc->amd_nb)
314 return NOTIFY_BAD;
315
316 return NOTIFY_OK;
317}
318
319static void amd_pmu_cpu_starting(int cpu)
320{
321 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
322 struct amd_nb *nb;
323 int i, nb_id;
324
325 if (boot_cpu_data.x86_max_cores < 2)
326 return;
327
328 nb_id = amd_get_nb_id(cpu);
329 WARN_ON_ONCE(nb_id == BAD_APICID);
330
331 raw_spin_lock(&amd_nb_lock);
332
333 for_each_online_cpu(i) {
334 nb = per_cpu(cpu_hw_events, i).amd_nb;
335 if (WARN_ON_ONCE(!nb))
336 continue;
337
338 if (nb->nb_id == nb_id) {
339 kfree(cpuc->amd_nb);
340 cpuc->amd_nb = nb;
341 break;
342 }
343 }
344
345 cpuc->amd_nb->nb_id = nb_id;
346 cpuc->amd_nb->refcnt++;
347
348 raw_spin_unlock(&amd_nb_lock);
349}
350
351static void amd_pmu_cpu_dead(int cpu)
352{
353 struct cpu_hw_events *cpuhw;
354
355 if (boot_cpu_data.x86_max_cores < 2)
356 return;
357
358 cpuhw = &per_cpu(cpu_hw_events, cpu);
359
360 raw_spin_lock(&amd_nb_lock);
361
362 if (cpuhw->amd_nb) {
363 struct amd_nb *nb = cpuhw->amd_nb;
364
365 if (nb->nb_id == -1 || --nb->refcnt == 0)
366 kfree(nb);
367
368 cpuhw->amd_nb = NULL;
369 }
370
371 raw_spin_unlock(&amd_nb_lock);
372}
373
374static __initconst struct x86_pmu amd_pmu = {
375 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event,
381 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4,
387 .event_bits = 48,
388 .event_mask = (1ULL << 48) - 1,
389 .apic = 1,
390 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1,
392 .get_event_constraints = amd_get_event_constraints,
393 .put_event_constraints = amd_put_event_constraints,
394
395 .cpu_prepare = amd_pmu_cpu_prepare,
396 .cpu_starting = amd_pmu_cpu_starting,
397 .cpu_dead = amd_pmu_cpu_dead,
398};
399
400static __init int amd_pmu_init(void)
401{
402 /* Performance-monitoring supported from K7 and later: */
403 if (boot_cpu_data.x86 < 6)
404 return -ENODEV;
405
406 x86_pmu = amd_pmu;
407
408 /* Events are common for all AMDs */
409 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
410 sizeof(hw_cache_event_ids));
411
412 return 0;
413}
414
415#else /* CONFIG_CPU_SUP_AMD */
416
417static int amd_pmu_init(void)
418{
419 return 0;
420}
421
422#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..9c794ac87837
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,980 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Intel PerfMon, used on Core and later.
5 */
6static const u64 intel_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15};
16
17static struct event_constraint intel_core_event_constraints[] =
18{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
21 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
22 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
23 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
24 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
25 EVENT_CONSTRAINT_END
26};
27
28static struct event_constraint intel_core2_event_constraints[] =
29{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
32 /*
33 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
34 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
35 * ratio between these counters.
36 */
37 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
38 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
39 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
40 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
41 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
42 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
43 INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
44 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
45 INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
46 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
47 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
48 EVENT_CONSTRAINT_END
49};
50
51static struct event_constraint intel_nehalem_event_constraints[] =
52{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
55 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
56 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
57 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
58 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
59 INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
60 INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
61 INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
62 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
63 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
64 EVENT_CONSTRAINT_END
65};
66
67static struct event_constraint intel_westmere_event_constraints[] =
68{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 EVENT_CONSTRAINT_END
76};
77
78static struct event_constraint intel_gen_event_constraints[] =
79{
80 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
81 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
82 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
83 EVENT_CONSTRAINT_END
84};
85
86static u64 intel_pmu_event_map(int hw_event)
87{
88 return intel_perfmon_event_map[hw_event];
89}
90
91static __initconst u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
95{
96 [ C(L1D) ] = {
97 [ C(OP_READ) ] = {
98 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
99 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
100 },
101 [ C(OP_WRITE) ] = {
 102 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
103 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
104 },
105 [ C(OP_PREFETCH) ] = {
106 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
107 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
108 },
109 },
110 [ C(L1I ) ] = {
111 [ C(OP_READ) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
113 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
114 },
115 [ C(OP_WRITE) ] = {
116 [ C(RESULT_ACCESS) ] = -1,
117 [ C(RESULT_MISS) ] = -1,
118 },
119 [ C(OP_PREFETCH) ] = {
120 [ C(RESULT_ACCESS) ] = 0x0,
121 [ C(RESULT_MISS) ] = 0x0,
122 },
123 },
124 [ C(LL ) ] = {
125 [ C(OP_READ) ] = {
126 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
127 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
128 },
129 [ C(OP_WRITE) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
131 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
132 },
133 [ C(OP_PREFETCH) ] = {
134 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
135 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
136 },
137 },
138 [ C(DTLB) ] = {
139 [ C(OP_READ) ] = {
140 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
141 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
142 },
143 [ C(OP_WRITE) ] = {
 144 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
145 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
146 },
147 [ C(OP_PREFETCH) ] = {
148 [ C(RESULT_ACCESS) ] = 0x0,
149 [ C(RESULT_MISS) ] = 0x0,
150 },
151 },
152 [ C(ITLB) ] = {
153 [ C(OP_READ) ] = {
154 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
155 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
156 },
157 [ C(OP_WRITE) ] = {
158 [ C(RESULT_ACCESS) ] = -1,
159 [ C(RESULT_MISS) ] = -1,
160 },
161 [ C(OP_PREFETCH) ] = {
162 [ C(RESULT_ACCESS) ] = -1,
163 [ C(RESULT_MISS) ] = -1,
164 },
165 },
166 [ C(BPU ) ] = {
167 [ C(OP_READ) ] = {
168 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
169 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
170 },
171 [ C(OP_WRITE) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 [ C(OP_PREFETCH) ] = {
176 [ C(RESULT_ACCESS) ] = -1,
177 [ C(RESULT_MISS) ] = -1,
178 },
179 },
180};
181
182static __initconst u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
186{
187 [ C(L1D) ] = {
188 [ C(OP_READ) ] = {
189 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
190 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
191 },
192 [ C(OP_WRITE) ] = {
193 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
194 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
195 },
196 [ C(OP_PREFETCH) ] = {
197 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
198 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
199 },
200 },
201 [ C(L1I ) ] = {
202 [ C(OP_READ) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
204 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
205 },
206 [ C(OP_WRITE) ] = {
207 [ C(RESULT_ACCESS) ] = -1,
208 [ C(RESULT_MISS) ] = -1,
209 },
210 [ C(OP_PREFETCH) ] = {
211 [ C(RESULT_ACCESS) ] = 0x0,
212 [ C(RESULT_MISS) ] = 0x0,
213 },
214 },
215 [ C(LL ) ] = {
216 [ C(OP_READ) ] = {
217 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
218 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
219 },
220 [ C(OP_WRITE) ] = {
221 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
222 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
223 },
224 [ C(OP_PREFETCH) ] = {
225 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
226 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
227 },
228 },
229 [ C(DTLB) ] = {
230 [ C(OP_READ) ] = {
231 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
232 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
233 },
234 [ C(OP_WRITE) ] = {
235 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
236 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
237 },
238 [ C(OP_PREFETCH) ] = {
239 [ C(RESULT_ACCESS) ] = 0x0,
240 [ C(RESULT_MISS) ] = 0x0,
241 },
242 },
243 [ C(ITLB) ] = {
244 [ C(OP_READ) ] = {
245 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
246 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
247 },
248 [ C(OP_WRITE) ] = {
249 [ C(RESULT_ACCESS) ] = -1,
250 [ C(RESULT_MISS) ] = -1,
251 },
252 [ C(OP_PREFETCH) ] = {
253 [ C(RESULT_ACCESS) ] = -1,
254 [ C(RESULT_MISS) ] = -1,
255 },
256 },
257 [ C(BPU ) ] = {
258 [ C(OP_READ) ] = {
259 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
260 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
261 },
262 [ C(OP_WRITE) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 [ C(OP_PREFETCH) ] = {
267 [ C(RESULT_ACCESS) ] = -1,
268 [ C(RESULT_MISS) ] = -1,
269 },
270 },
271};
272
273static __initconst u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
277{
278 [ C(L1D) ] = {
279 [ C(OP_READ) ] = {
280 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
281 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
282 },
283 [ C(OP_WRITE) ] = {
284 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
285 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
286 },
287 [ C(OP_PREFETCH) ] = {
288 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
289 [ C(RESULT_MISS) ] = 0,
290 },
291 },
292 [ C(L1I ) ] = {
293 [ C(OP_READ) ] = {
294 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
295 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
296 },
297 [ C(OP_WRITE) ] = {
298 [ C(RESULT_ACCESS) ] = -1,
299 [ C(RESULT_MISS) ] = -1,
300 },
301 [ C(OP_PREFETCH) ] = {
302 [ C(RESULT_ACCESS) ] = 0,
303 [ C(RESULT_MISS) ] = 0,
304 },
305 },
306 [ C(LL ) ] = {
307 [ C(OP_READ) ] = {
308 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
309 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
310 },
311 [ C(OP_WRITE) ] = {
312 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
313 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
314 },
315 [ C(OP_PREFETCH) ] = {
316 [ C(RESULT_ACCESS) ] = 0,
317 [ C(RESULT_MISS) ] = 0,
318 },
319 },
320 [ C(DTLB) ] = {
321 [ C(OP_READ) ] = {
322 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
323 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
324 },
325 [ C(OP_WRITE) ] = {
326 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
327 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
328 },
329 [ C(OP_PREFETCH) ] = {
330 [ C(RESULT_ACCESS) ] = 0,
331 [ C(RESULT_MISS) ] = 0,
332 },
333 },
334 [ C(ITLB) ] = {
335 [ C(OP_READ) ] = {
336 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
337 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
338 },
339 [ C(OP_WRITE) ] = {
340 [ C(RESULT_ACCESS) ] = -1,
341 [ C(RESULT_MISS) ] = -1,
342 },
343 [ C(OP_PREFETCH) ] = {
344 [ C(RESULT_ACCESS) ] = -1,
345 [ C(RESULT_MISS) ] = -1,
346 },
347 },
348 [ C(BPU ) ] = {
349 [ C(OP_READ) ] = {
350 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
351 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
352 },
353 [ C(OP_WRITE) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 [ C(OP_PREFETCH) ] = {
358 [ C(RESULT_ACCESS) ] = -1,
359 [ C(RESULT_MISS) ] = -1,
360 },
361 },
362};
363
364static __initconst u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
368{
369 [ C(L1D) ] = {
370 [ C(OP_READ) ] = {
371 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
372 [ C(RESULT_MISS) ] = 0,
373 },
374 [ C(OP_WRITE) ] = {
375 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
376 [ C(RESULT_MISS) ] = 0,
377 },
378 [ C(OP_PREFETCH) ] = {
379 [ C(RESULT_ACCESS) ] = 0x0,
380 [ C(RESULT_MISS) ] = 0,
381 },
382 },
383 [ C(L1I ) ] = {
384 [ C(OP_READ) ] = {
385 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
386 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
387 },
388 [ C(OP_WRITE) ] = {
389 [ C(RESULT_ACCESS) ] = -1,
390 [ C(RESULT_MISS) ] = -1,
391 },
392 [ C(OP_PREFETCH) ] = {
393 [ C(RESULT_ACCESS) ] = 0,
394 [ C(RESULT_MISS) ] = 0,
395 },
396 },
397 [ C(LL ) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
400 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
404 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(DTLB) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
414 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
418 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(ITLB) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
428 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = -1,
432 [ C(RESULT_MISS) ] = -1,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = -1,
436 [ C(RESULT_MISS) ] = -1,
437 },
438 },
439 [ C(BPU ) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
442 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = -1,
446 [ C(RESULT_MISS) ] = -1,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = -1,
450 [ C(RESULT_MISS) ] = -1,
451 },
452 },
453};
454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void)
510{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
512
513 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
514
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts();
517}
518
519static void intel_pmu_enable_all(void)
520{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
526 struct perf_event *event =
527 cpuc->events[X86_PMC_IDX_FIXED_BTS];
528
529 if (WARN_ON_ONCE(!event))
530 return;
531
532 intel_pmu_enable_bts(event->hw.config);
533 }
534}
535
536static inline u64 intel_pmu_get_status(void)
537{
538 u64 status;
539
540 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
541
542 return status;
543}
544
545static inline void intel_pmu_ack_status(u64 ack)
546{
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548}
549
550static inline void
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask;
555
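	/*
	 * Each fixed counter owns a 4-bit control field in the fixed-counter
	 * control MSR (hwc->config_base); clear only this counter's field.
	 */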
556 mask = 0xfULL << (idx * 4);
557
558 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
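	/* mirrors the hardware BTS record layout: branch-from, branch-to, flags, 64 bits each */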
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621}
622
623static inline void
624intel_pmu_disable_event(struct perf_event *event)
625{
626 struct hw_perf_event *hwc = &event->hw;
627
628 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
629 intel_pmu_disable_bts();
630 intel_pmu_drain_bts_buffer();
631 return;
632 }
633
634 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
635 intel_pmu_disable_fixed(hwc);
636 return;
637 }
638
639 x86_pmu_disable_event(event);
640}
641
642static inline void
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask;
647 int err;
648
649 /*
650 * Enable IRQ generation (0x8),
651 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
652 * if requested:
653 */
654 bits = 0x8ULL;
655 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
656 bits |= 0x2;
657 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
658 bits |= 0x1;
659
660 /*
661 * ANY bit is supported in v3 and up
662 */
663 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
664 bits |= 0x4;
665
666 bits <<= (idx * 4);
667 mask = 0xfULL << (idx * 4);
668
669 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask;
671 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val);
673}
674
675static void intel_pmu_enable_event(struct perf_event *event)
676{
677 struct hw_perf_event *hwc = &event->hw;
678
679 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
680 if (!__get_cpu_var(cpu_hw_events).enabled)
681 return;
682
683 intel_pmu_enable_bts(hwc->config);
684 return;
685 }
686
687 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
688 intel_pmu_enable_fixed(hwc);
689 return;
690 }
691
692 __x86_pmu_enable_event(hwc);
693}
694
695/*
696 * Save and restart an expired event. Called by NMI contexts,
697 * so it has to be careful about preempting normal event ops:
698 */
699static int intel_pmu_save_and_restart(struct perf_event *event)
700{
701 x86_perf_event_update(event);
702 return x86_perf_event_set_period(event);
703}
704
705static void intel_pmu_reset(void)
706{
707 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
708 unsigned long flags;
709 int idx;
710
711 if (!x86_pmu.num_events)
712 return;
713
714 local_irq_save(flags);
715
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717
718 for (idx = 0; idx < x86_pmu.num_events; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 }
725 if (ds)
726 ds->bts_index = ds->bts_buffer_base;
727
728 local_irq_restore(flags);
729}
730
731/*
732 * This handler is triggered by the local APIC, so the APIC IRQ handling
733 * rules apply:
734 */
735static int intel_pmu_handle_irq(struct pt_regs *regs)
736{
737 struct perf_sample_data data;
738 struct cpu_hw_events *cpuc;
739 int bit, loops;
740 u64 ack, status;
741
742 perf_sample_data_init(&data, 0);
743
744 cpuc = &__get_cpu_var(cpu_hw_events);
745
746 intel_pmu_disable_all();
747 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status();
749 if (!status) {
750 intel_pmu_enable_all();
751 return 0;
752 }
753
754 loops = 0;
755again:
756 if (++loops > 100) {
757 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
758 perf_event_print_debug();
759 intel_pmu_reset();
760 goto done;
761 }
762
763 inc_irq_stat(apic_perf_irqs);
764 ack = status;
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit];
767
768 if (!test_bit(bit, cpuc->active_mask))
769 continue;
770
771 if (!intel_pmu_save_and_restart(event))
772 continue;
773
774 data.period = event->hw.last_period;
775
776 if (perf_event_overflow(event, 1, &data, regs))
777 x86_pmu_stop(event);
778 }
779
780 intel_pmu_ack_status(ack);
781
782 /*
783 * Repeat if there is more work to be done:
784 */
785 status = intel_pmu_get_status();
786 if (status)
787 goto again;
788
789done:
790 intel_pmu_enable_all();
791 return 1;
792}
793
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint *
798intel_special_constraints(struct perf_event *event)
799{
800 unsigned int hw_event;
801
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803
804 if (unlikely((hw_event ==
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
806 (event->hw.sample_period == 1))) {
807
808 return &bts_constraint;
809 }
810 return NULL;
811}
812
813static struct event_constraint *
814intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
815{
816 struct event_constraint *c;
817
818 c = intel_special_constraints(event);
819 if (c)
820 return c;
821
822 return x86_get_event_constraints(cpuc, event);
823}
824
825static __initconst struct x86_pmu core_pmu = {
826 .name = "core",
827 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1,
838 /*
839 * Intel PMCs cannot be accessed sanely above 32 bit width,
840 * so we install an artificial 1<<31 period regardless of
841 * the generic event period:
842 */
843 .max_period = (1ULL << 31) - 1,
844 .get_event_constraints = intel_get_event_constraints,
845 .event_constraints = intel_core_event_constraints,
846};
847
848static __initconst struct x86_pmu intel_pmu = {
849 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1,
861 /*
862 * Intel PMCs cannot be accessed sanely above 32 bit width,
863 * so we install an artificial 1<<31 period regardless of
864 * the generic event period:
865 */
866 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints,
870
871 .cpu_starting = init_debug_store_on_cpu,
872 .cpu_dying = fini_debug_store_on_cpu,
873};
874
875static __init int intel_pmu_init(void)
876{
877 union cpuid10_edx edx;
878 union cpuid10_eax eax;
879 unsigned int unused;
880 unsigned int ebx;
881 int version;
882
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */
885 if (boot_cpu_data.x86 == 6) {
886 return p6_pmu_init();
887 } else {
888 return -ENODEV;
889 }
890 }
891
892 /*
893 * Check whether the Architectural PerfMon supports
894 * Branch Misses Retired hw_event or not.
895 */
896 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
897 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
898 return -ENODEV;
899
900 version = eax.split.version_id;
901 if (version < 2)
902 x86_pmu = core_pmu;
903 else
904 x86_pmu = intel_pmu;
905
906 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events;
908 x86_pmu.event_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
910
911 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events:
914 */
915 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
917
918 /*
919 * Install the hw-cache-events table:
920 */
921 switch (boot_cpu_data.x86_model) {
922 case 14: /* 65 nm core solo/duo, "Yonah" */
923 pr_cont("Core events, ");
924 break;
925
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids));
932
933 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, ");
935 break;
936
937 case 26: /* 45 nm nehalem, "Bloomfield" */
938 case 30: /* 45 nm nehalem, "Lynnfield" */
939 case 46: /* 45 nm nehalem-ex, "Beckton" */
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids));
942
943 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, ");
945 break;
946 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids));
949
950 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, ");
952 break;
953
954 case 37: /* 32 nm nehalem, "Clarkdale" */
955 case 44: /* 32 nm nehalem, "Gulftown" */
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids));
958
959 x86_pmu.event_constraints = intel_westmere_event_constraints;
960 pr_cont("Westmere events, ");
961 break;
962
963 default:
964 /*
965 * default constraints for v2 and up
966 */
967 x86_pmu.event_constraints = intel_gen_event_constraints;
968 pr_cont("generic architected perfmon, ");
969 }
970 return 0;
971}
972
973#else /* CONFIG_CPU_SUP_INTEL */
974
975static int intel_pmu_init(void)
976{
977 return 0;
978}
979
980#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..a330485d14da
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,159 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Not sure about some of these
5 */
6static const u64 p6_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
15};
16
17static u64 p6_pmu_event_map(int hw_event)
18{
19 return p6_perfmon_event_map[hw_event];
20}
21
22/*
23 * Event setting that is specified not to count anything.
24 * We use this to effectively disable a counter.
25 *
26 * L2_RQSTS with 0 MESI unit mask.
27 */
28#define P6_NOP_EVENT 0x0000002EULL
29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] =
49{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
51 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
52 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
53 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
54 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
55 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
56 EVENT_CONSTRAINT_END
57};
58
59static void p6_pmu_disable_all(void)
60{
61 u64 val;
62
63 /* p6 only has one enable register */
64 rdmsrl(MSR_P6_EVNTSEL0, val);
65 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
66 wrmsrl(MSR_P6_EVNTSEL0, val);
67}
68
69static void p6_pmu_enable_all(void)
70{
71 unsigned long val;
72
73 /* p6 only has one enable register */
74 rdmsrl(MSR_P6_EVNTSEL0, val);
75 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
76 wrmsrl(MSR_P6_EVNTSEL0, val);
77}
78
79static inline void
80p6_pmu_disable_event(struct perf_event *event)
81{
82 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
83 struct hw_perf_event *hwc = &event->hw;
84 u64 val = P6_NOP_EVENT;
85
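	/*
	 * The enable bit lives only in EVNTSEL0 and gates all P6 counters, so
	 * an individual event is stopped by programming the non-counting
	 * P6_NOP_EVENT while leaving the global enable state untouched.
	 */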
86 if (cpuc->enabled)
87 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
88
89 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
90}
91
92static void p6_pmu_enable_event(struct perf_event *event)
93{
94 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
95 struct hw_perf_event *hwc = &event->hw;
96 u64 val;
97
98 val = hwc->config;
99 if (cpuc->enabled)
100 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
101
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103}
104
105static __initconst struct x86_pmu p6_pmu = {
106 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event,
112 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1,
118 .max_period = (1ULL << 31) - 1,
119 .version = 0,
120 .num_events = 2,
121 /*
122	 * Events have 40 bits implemented. However, they are designed such
123	 * that bits [32-39] are sign extensions of bit 31. As such the
124	 * effective width of an event for a P6-like PMU is 32 bits only.
125	 *
126	 * See the IA-32 Intel Architecture Software Developer's Manual, Vol 3B.
127 */
128 .event_bits = 32,
129 .event_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints,
132};
133
134static __init int p6_pmu_init(void)
135{
136 switch (boot_cpu_data.x86_model) {
137 case 1:
138 case 3: /* Pentium Pro */
139 case 5:
140 case 6: /* Pentium II */
141 case 7:
142 case 8:
143 case 11: /* Pentium III */
144 case 9:
145 case 13:
146 /* Pentium M */
147 break;
148 default:
149 pr_cont("unsupported p6 CPU model %d ",
150 boot_cpu_data.x86_model);
151 return -ENODEV;
152 }
153
154 x86_pmu = p6_pmu;
155
156 return 0;
157}
158
159#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 898df9719afb..fb329e9f8494 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
115 115
116 return !test_bit(counter, perfctr_nmi_owner); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118
119/* checks an msr for availability */
120int avail_to_resrv_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 return !test_bit(counter, perfctr_nmi_owner);
128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 118EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 119
131int reserve_perfctr_nmi(unsigned int msr) 120int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
691 cpu_nmi_set_wd_enabled(); 680 cpu_nmi_set_wd_enabled();
692 681
693 apic_write(APIC_LVTPC, APIC_DM_NMI); 682 apic_write(APIC_LVTPC, APIC_DM_NMI);
694 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
695 wrmsr(evntsel_msr, evntsel, 0); 684 wrmsr(evntsel_msr, evntsel, 0);
696 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
697 return 1; 686 return 1;
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59cf..dfdb4dba2320 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include <asm/vmware.h> 27#include <asm/vmware.h>
27#include <asm/x86_init.h> 28#include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)
101 102
102 return 0; 103 return 0;
103} 104}
105EXPORT_SYMBOL(vmware_platform);
104 106
105/* 107/*
106 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 108 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index cb27fd6136c9..8b862d5900fe 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/uaccess.h> 42#include <linux/uaccess.h>
43#include <linux/gfp.h>
43 44
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/msr.h> 46#include <asm/msr.h>
@@ -229,7 +230,7 @@ static void __exit cpuid_exit(void)
229 for_each_online_cpu(cpu) 230 for_each_online_cpu(cpu)
230 cpuid_device_destroy(cpu); 231 cpuid_device_destroy(cpu);
231 class_destroy(cpuid_class); 232 class_destroy(cpuid_class);
232 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 233 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
233 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 234 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
234} 235}
235 236
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a4849c10a77e..ebd4c51d096a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,7 +27,6 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/x86_init.h>
31 30
32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 31#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
33 32
@@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
103#ifdef CONFIG_HPET_TIMER 102#ifdef CONFIG_HPET_TIMER
104 hpet_disable(); 103 hpet_disable();
105#endif 104#endif
106
107#ifdef CONFIG_X86_64
108 x86_platform.iommu_shutdown();
109#endif
110
111 crash_save_cpu(regs, safe_smp_processor_id()); 105 crash_save_cpu(regs, safe_smp_processor_id());
112} 106}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index cd97ce18c29d..67414550c3cc 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -5,6 +5,7 @@
5 * Copyright (C) IBM Corporation, 2004. All rights reserved 5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */ 6 */
7 7
8#include <linux/slab.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/highmem.h> 10#include <linux/highmem.h>
10#include <linux/crash_dump.h> 11#include <linux/crash_dump.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c56bc2873030..6d817554780a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -123,13 +123,15 @@ print_context_stack_bp(struct thread_info *tinfo,
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { 123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr; 124 unsigned long addr = *ret_addr;
125 125
126 if (__kernel_text_address(addr)) { 126 if (!__kernel_text_address(addr))
127 ops->address(data, addr, 1); 127 break;
128 frame = frame->next_frame; 128
129 ret_addr = &frame->return_address; 129 ops->address(data, addr, 1);
130 print_ftrace_graph_addr(addr, data, ops, tinfo, graph); 130 frame = frame->next_frame;
131 } 131 ret_addr = &frame->return_address;
132 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
132 } 133 }
134
133 return (unsigned long)frame; 135 return (unsigned long)frame;
134} 136}
135EXPORT_SYMBOL_GPL(print_context_stack_bp); 137EXPORT_SYMBOL_GPL(print_context_stack_bp);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 4fd1420faffa..e1a93be4fd44 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,6 +14,8 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17#include <linux/uaccess.h>
18
17extern void 19extern void
18show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
19 unsigned long *stack, unsigned long bp, char *log_lvl); 21 unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -29,4 +31,26 @@ struct stack_frame {
29 struct stack_frame *next_frame; 31 struct stack_frame *next_frame;
30 unsigned long return_address; 32 unsigned long return_address;
31}; 33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
32#endif 51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index ae775ca47b25..11540a189d93 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -18,11 +18,6 @@
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21/* Just a stub for now */
22int x86_is_stack_id(int id, char *name)
23{
24 return 0;
25}
26 21
27void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
28 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 0ad9597073f5..272c9f1f05f3 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = {
33#endif 33#endif
34}; 34};
35 35
36int x86_is_stack_id(int id, char *name)
37{
38 return x86_stack_ids[id - 1] == name;
39}
40
41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 36static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
42 unsigned *usedp, char **idp) 37 unsigned *usedp, char **idp)
43{ 38{
@@ -125,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
125{ 120{
126#ifdef CONFIG_FRAME_POINTER 121#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp; 122 struct stack_frame *frame = (struct stack_frame *)bp;
123 unsigned long next;
128 124
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) 125 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
130 return (unsigned long)frame->next_frame; 126 if (!probe_kernel_address(&frame->next_frame, next))
127 return next;
128 else
129 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
130 "callchain\n", &frame->next_frame);
131 }
131#endif 132#endif
132 return bp; 133 return bp;
133} 134}
@@ -207,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
207 if (in_irq_stack(stack, irq_stack, irq_stack_end)) { 208 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
208 if (ops->stack(data, "IRQ") < 0) 209 if (ops->stack(data, "IRQ") < 0)
209 break; 210 break;
210 bp = print_context_stack(tinfo, stack, bp, 211 bp = ops->walk_stack(tinfo, stack, bp,
211 ops, data, irq_stack_end, &graph); 212 ops, data, irq_stack_end, &graph);
212 /* 213 /*
213 * We link to the next stack (which would be 214 * We link to the next stack (which would be
@@ -228,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
228 /* 229 /*
229 * This handles the process stack: 230 * This handles the process stack:
230 */ 231 */
231 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); 232 bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
232 put_cpu(); 233 put_cpu();
233} 234}
234EXPORT_SYMBOL(dump_trace); 235EXPORT_SYMBOL(dump_trace);
@@ -291,6 +292,7 @@ void show_registers(struct pt_regs *regs)
291 292
292 sp = regs->sp; 293 sp = regs->sp;
293 printk("CPU %d ", cpu); 294 printk("CPU %d ", cpu);
295 print_modules();
294 __show_regs(regs, 1); 296 __show_regs(regs, 1);
295 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 297 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
296 cur->comm, cur->pid, task_thread_info(cur), cur); 298 cur->comm, cur->pid, task_thread_info(cur), cur);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 05ed7ab2ca48..7bca3c6a02fb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h> 15#include <linux/pfn.h>
21#include <linux/suspend.h> 16#include <linux/suspend.h>
22#include <linux/firmware-map.h> 17#include <linux/firmware-map.h>
23 18
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h> 19#include <asm/e820.h>
27#include <asm/proto.h> 20#include <asm/proto.h>
28#include <asm/setup.h> 21#include <asm/setup.h>
29#include <asm/trampoline.h>
30 22
31/* 23/*
32 * The e820 map is the map that gets modified e.g. with command line parameters 24 * The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,31 +509,55 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
517 int checktype) 509 int checktype)
518{ 510{
519 int i; 511 int i;
512 u64 end;
520 u64 real_removed_size = 0; 513 u64 real_removed_size = 0;
521 514
522 if (size > (ULLONG_MAX - start)) 515 if (size > (ULLONG_MAX - start))
523 size = ULLONG_MAX - start; 516 size = ULLONG_MAX - start;
524 517
518 end = start + size;
519 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
520 (unsigned long long) start,
521 (unsigned long long) end);
522 if (checktype)
523 e820_print_type(old_type);
524 printk(KERN_CONT "\n");
525
525 for (i = 0; i < e820.nr_map; i++) { 526 for (i = 0; i < e820.nr_map; i++) {
526 struct e820entry *ei = &e820.map[i]; 527 struct e820entry *ei = &e820.map[i];
527 u64 final_start, final_end; 528 u64 final_start, final_end;
529 u64 ei_end;
528 530
529 if (checktype && ei->type != old_type) 531 if (checktype && ei->type != old_type)
530 continue; 532 continue;
533
534 ei_end = ei->addr + ei->size;
531 /* totally covered? */ 535 /* totally covered? */
532 if (ei->addr >= start && 536 if (ei->addr >= start && ei_end <= end) {
533 (ei->addr + ei->size) <= (start + size)) {
534 real_removed_size += ei->size; 537 real_removed_size += ei->size;
535 memset(ei, 0, sizeof(struct e820entry)); 538 memset(ei, 0, sizeof(struct e820entry));
536 continue; 539 continue;
537 } 540 }
541
542 /* new range is totally covered? */
543 if (ei->addr < start && ei_end > end) {
544 e820_add_region(end, ei_end - end, ei->type);
545 ei->size = start - ei->addr;
546 real_removed_size += size;
547 continue;
548 }
549
538 /* partially covered */ 550 /* partially covered */
539 final_start = max(start, ei->addr); 551 final_start = max(start, ei->addr);
540 final_end = min(start + size, ei->addr + ei->size); 552 final_end = min(end, ei_end);
541 if (final_start >= final_end) 553 if (final_start >= final_end)
542 continue; 554 continue;
543 real_removed_size += final_end - final_start; 555 real_removed_size += final_end - final_start;
544 556
557 /*
558 * left range could be head or tail, so need to update
559 * size at first.
560 */
545 ei->size -= final_end - final_start; 561 ei->size -= final_end - final_start;
546 if (ei->addr < final_start) 562 if (ei->addr < final_start)
547 continue; 563 continue;
@@ -722,319 +738,44 @@ core_initcall(e820_mark_nvs_memory);
722#endif 738#endif
723 739
724/* 740/*
725 * Early reserved memory areas. 741 * Find a free area with specified alignment in a specific range.
726 */
727#define MAX_EARLY_RES 32
728
729struct early_res {
730 u64 start, end;
731 char name[16];
732 char overlap_ok;
733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
736#ifdef CONFIG_X86_32
737 /*
738 * But first pinch a few for the stack/trampoline stuff
739 * FIXME: Don't need the extra page at 4K, but need to fix
740 * trampoline before removing it. (see the GDT stuff)
741 */
742 { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 },
743#endif
744
745 {}
746};
747
748static int __init find_overlapped_early(u64 start, u64 end)
749{
750 int i;
751 struct early_res *r;
752
753 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
754 r = &early_res[i];
755 if (end > r->start && start < r->end)
756 break;
757 }
758
759 return i;
760}
761
762/*
763 * Drop the i-th range from the early reservation map,
764 * by copying any higher ranges down one over it, and
765 * clearing what had been the last slot.
766 */
767static void __init drop_range(int i)
768{
769 int j;
770
771 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
772 ;
773
774 memmove(&early_res[i], &early_res[i + 1],
775 (j - 1 - i) * sizeof(struct early_res));
776
777 early_res[j - 1].end = 0;
778}
779
780/*
781 * Split any existing ranges that:
782 * 1) are marked 'overlap_ok', and
783 * 2) overlap with the stated range [start, end)
784 * into whatever portion (if any) of the existing range is entirely
785 * below or entirely above the stated range. Drop the portion
786 * of the existing range that overlaps with the stated range,
787 * which will allow the caller of this routine to then add that
788 * stated range without conflicting with any existing range.
789 */ 742 */
790static void __init drop_overlaps_that_are_ok(u64 start, u64 end) 743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
791{ 744{
792 int i; 745 int i;
793 struct early_res *r;
794 u64 lower_start, lower_end;
795 u64 upper_start, upper_end;
796 char name[16];
797 746
798 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { 747 for (i = 0; i < e820.nr_map; i++) {
799 r = &early_res[i]; 748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
800 751
801 /* Continue past non-overlapping ranges */ 752 if (ei->type != E820_RAM)
802 if (end <= r->start || start >= r->end)
803 continue; 753 continue;
804 754
805 /* 755 ei_last = ei->addr + ei->size;
806 * Leave non-ok overlaps as is; let caller 756 ei_start = ei->addr;
807 * panic "Overlapping early reservations" 757 addr = find_early_area(ei_start, ei_last, start, end,
808 * when it hits this overlap. 758 size, align);
809 */
810 if (!r->overlap_ok)
811 return;
812
813 /*
814 * We have an ok overlap. We will drop it from the early
815 * reservation map, and add back in any non-overlapping
816 * portions (lower or upper) as separate, overlap_ok,
817 * non-overlapping ranges.
818 */
819
820 /* 1. Note any non-overlapping (lower or upper) ranges. */
821 strncpy(name, r->name, sizeof(name) - 1);
822
823 lower_start = lower_end = 0;
824 upper_start = upper_end = 0;
825 if (r->start < start) {
826 lower_start = r->start;
827 lower_end = start;
828 }
829 if (r->end > end) {
830 upper_start = end;
831 upper_end = r->end;
832 }
833
834 /* 2. Drop the original ok overlapping range */
835 drop_range(i);
836
837 i--; /* resume for-loop on copied down entry */
838
839 /* 3. Add back in any non-overlapping ranges. */
840 if (lower_end)
841 reserve_early_overlap_ok(lower_start, lower_end, name);
842 if (upper_end)
843 reserve_early_overlap_ok(upper_start, upper_end, name);
844 }
845}
846
847static void __init __reserve_early(u64 start, u64 end, char *name,
848 int overlap_ok)
849{
850 int i;
851 struct early_res *r;
852
853 i = find_overlapped_early(start, end);
854 if (i >= MAX_EARLY_RES)
855 panic("Too many early reservations");
856 r = &early_res[i];
857 if (r->end)
858 panic("Overlapping early reservations "
859 "%llx-%llx %s to %llx-%llx %s\n",
860 start, end - 1, name?name:"", r->start,
861 r->end - 1, r->name);
862 r->start = start;
863 r->end = end;
864 r->overlap_ok = overlap_ok;
865 if (name)
866 strncpy(r->name, name, sizeof(r->name) - 1);
867}
868
869/*
870 * A few early reservations come here.
871 *
872 * The 'overlap_ok' in the name of this routine does -not- mean it
873 * is ok for these reservations to overlap an earlier reservation.
874 * Rather it means that it is ok for subsequent reservations to
875 * overlap this one.
876 *
877 * Use this entry point to reserve early ranges when you are doing
878 * so out of "Paranoia", reserving perhaps more memory than you need,
879 * just in case, and don't mind a subsequent overlapping reservation
880 * that is known to be needed.
881 *
882 * The drop_overlaps_that_are_ok() call here isn't really needed.
883 * It would be needed if we had two colliding 'overlap_ok'
884 * reservations, so that the second such would not panic on the
885 * overlap with the first. We don't have any such as of this
886 * writing, but might as well tolerate such if it happens in
887 * the future.
888 */
889void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
890{
891 drop_overlaps_that_are_ok(start, end);
892 __reserve_early(start, end, name, 1);
893}
894
895/*
896 * Most early reservations come here.
897 *
898 * We first have drop_overlaps_that_are_ok() drop any pre-existing
899 * 'overlap_ok' ranges, so that we can then reserve this memory
900 * range without risk of panic'ing on an overlapping overlap_ok
901 * early reservation.
902 */
903void __init reserve_early(u64 start, u64 end, char *name)
904{
905 if (start >= end)
906 return;
907
908 drop_overlaps_that_are_ok(start, end);
909 __reserve_early(start, end, name, 0);
910}
911
912void __init free_early(u64 start, u64 end)
913{
914 struct early_res *r;
915 int i;
916
917 i = find_overlapped_early(start, end);
918 r = &early_res[i];
919 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
920 panic("free_early on not reserved area: %llx-%llx!",
921 start, end - 1);
922
923 drop_range(i);
924}
925
926void __init early_res_to_bootmem(u64 start, u64 end)
927{
928 int i, count;
929 u64 final_start, final_end;
930
931 count = 0;
932 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
933 count++;
934
935 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
936 count, start, end);
937 for (i = 0; i < count; i++) {
938 struct early_res *r = &early_res[i];
939 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
940 r->start, r->end, r->name);
941 final_start = max(start, r->start);
942 final_end = min(end, r->end);
943 if (final_start >= final_end) {
944 printk(KERN_CONT "\n");
945 continue;
946 }
947 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
948 final_start, final_end);
949 reserve_bootmem_generic(final_start, final_end - final_start,
950 BOOTMEM_DEFAULT);
951 }
952}
953 759
954/* Check for already reserved areas */ 760 if (addr != -1ULL)
955static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) 761 return addr;
956{
957 int i;
958 u64 addr = *addrp;
959 int changed = 0;
960 struct early_res *r;
961again:
962 i = find_overlapped_early(addr, addr + size);
963 r = &early_res[i];
964 if (i < MAX_EARLY_RES && r->end) {
965 *addrp = addr = round_up(r->end, align);
966 changed = 1;
967 goto again;
968 } 762 }
969 return changed; 763 return -1ULL;
970} 764}
971 765
972/* Check for already reserved areas */ 766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
973static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
974{ 767{
975 int i; 768 return find_e820_area(start, end, size, align);
976 u64 addr = *addrp, last;
977 u64 size = *sizep;
978 int changed = 0;
979again:
980 last = addr + size;
981 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
982 struct early_res *r = &early_res[i];
983 if (last > r->start && addr < r->start) {
984 size = r->start - addr;
985 changed = 1;
986 goto again;
987 }
988 if (last > r->end && addr < r->end) {
989 addr = round_up(r->end, align);
990 size = last - addr;
991 changed = 1;
992 goto again;
993 }
994 if (last <= r->end && addr >= r->start) {
995 (*sizep)++;
996 return 0;
997 }
998 }
999 if (changed) {
1000 *addrp = addr;
1001 *sizep = size;
1002 }
1003 return changed;
1004} 769}
1005 770
1006/* 771u64 __init get_max_mapped(void)
1007 * Find a free area with specified alignment in a specific range.
1008 */
1009u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
1010{ 772{
1011 int i; 773 u64 end = max_pfn_mapped;
1012 774
1013 for (i = 0; i < e820.nr_map; i++) { 775 end <<= PAGE_SHIFT;
1014 struct e820entry *ei = &e820.map[i];
1015 u64 addr, last;
1016 u64 ei_last;
1017 776
1018 if (ei->type != E820_RAM) 777 return end;
1019 continue;
1020 addr = round_up(ei->addr, align);
1021 ei_last = ei->addr + ei->size;
1022 if (addr < start)
1023 addr = round_up(start, align);
1024 if (addr >= ei_last)
1025 continue;
1026 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1027 ;
1028 last = addr + size;
1029 if (last > ei_last)
1030 continue;
1031 if (last > end)
1032 continue;
1033 return addr;
1034 }
1035 return -1ULL;
1036} 778}
1037
1038/* 779/*
1039 * Find next free range after *start 780 * Find next free range after *start
1040 */ 781 */
@@ -1044,25 +785,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1044 785
1045 for (i = 0; i < e820.nr_map; i++) { 786 for (i = 0; i < e820.nr_map; i++) {
1046 struct e820entry *ei = &e820.map[i]; 787 struct e820entry *ei = &e820.map[i];
1047 u64 addr, last; 788 u64 addr;
1048 u64 ei_last; 789 u64 ei_start, ei_last;
1049 790
1050 if (ei->type != E820_RAM) 791 if (ei->type != E820_RAM)
1051 continue; 792 continue;
1052 addr = round_up(ei->addr, align); 793
1053 ei_last = ei->addr + ei->size; 794 ei_last = ei->addr + ei->size;
1054 if (addr < start) 795 ei_start = ei->addr;
1055 addr = round_up(start, align); 796 addr = find_early_area_size(ei_start, ei_last, start,
1056 if (addr >= ei_last) 797 sizep, align);
1057 continue; 798
1058 *sizep = ei_last - addr; 799 if (addr != -1ULL)
1059 while (bad_addr_size(&addr, sizep, align) && 800 return addr;
1060 addr + *sizep <= ei_last)
1061 ;
1062 last = addr + *sizep;
1063 if (last > ei_last)
1064 continue;
1065 return addr;
1066 } 801 }
1067 802
1068 return -1ULL; 803 return -1ULL;
@@ -1421,6 +1156,8 @@ void __init e820_reserve_resources_late(void)
1421 end = MAX_RESOURCE_SIZE; 1156 end = MAX_RESOURCE_SIZE;
1422 if (start >= end) 1157 if (start >= end)
1423 continue; 1158 continue;
1159 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
1160 start, end);
1424 reserve_region_with_split(&iomem_resource, start, end, 1161 reserve_region_with_split(&iomem_resource, start, end,
1425 "RAM buffer"); 1162 "RAM buffer");
1426 } 1163 }
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index cdcfb122f256..c2fa9b8b497e 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); 362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2); 363 early_iounmap(tmp, 2);
364 364
365 printk(KERN_INFO "EFI v%u.%.02u by %s \n", 365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16, 366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor); 367 efi.systab->hdr.revision & 0xffff, vendor);
368 368
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 309689245431..cd37469b54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -30,14 +30,32 @@
30 30
31#ifdef CONFIG_DYNAMIC_FTRACE 31#ifdef CONFIG_DYNAMIC_FTRACE
32 32
33/*
34 * modifying_code is set to notify NMIs that they need to use
35 * memory barriers when entering or exiting. But we don't want
36 * to burden NMIs with unnecessary memory barriers when code
37 * modification is not being done (which is most of the time).
38 *
39 * A mutex is already held when ftrace_arch_code_modify_prepare
40 * and post_process are called. No locks need to be taken here.
41 *
42 * Stop machine will make sure currently running NMIs are done
43 * and new NMIs will see the updated variable before we need
44 * to worry about NMIs doing memory barriers.
45 */
46static int modifying_code __read_mostly;
47static DEFINE_PER_CPU(int, save_modifying_code);
48
33int ftrace_arch_code_modify_prepare(void) 49int ftrace_arch_code_modify_prepare(void)
34{ 50{
35 set_kernel_text_rw(); 51 set_kernel_text_rw();
52 modifying_code = 1;
36 return 0; 53 return 0;
37} 54}
38 55
39int ftrace_arch_code_modify_post_process(void) 56int ftrace_arch_code_modify_post_process(void)
40{ 57{
58 modifying_code = 0;
41 set_kernel_text_ro(); 59 set_kernel_text_ro();
42 return 0; 60 return 0;
43} 61}
@@ -149,6 +167,11 @@ static void ftrace_mod_code(void)
149 167
150void ftrace_nmi_enter(void) 168void ftrace_nmi_enter(void)
151{ 169{
170 __get_cpu_var(save_modifying_code) = modifying_code;
171
172 if (!__get_cpu_var(save_modifying_code))
173 return;
174
152 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
153 smp_rmb(); 176 smp_rmb();
154 ftrace_mod_code(); 177 ftrace_mod_code();
@@ -160,6 +183,9 @@ void ftrace_nmi_enter(void)
160 183
161void ftrace_nmi_exit(void) 184void ftrace_nmi_exit(void)
162{ 185{
186 if (!__get_cpu_var(save_modifying_code))
187 return;
188
163 /* Finish all executions before clearing nmi_running */ 189 /* Finish all executions before clearing nmi_running */
164 smp_mb(); 190 smp_mb();
165 atomic_dec(&nmi_running); 191 atomic_dec(&nmi_running);
@@ -484,13 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
484 } 510 }
485} 511}
486#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
487
488#ifdef CONFIG_FTRACE_SYSCALLS
489
490extern unsigned long *sys_call_table;
491
492unsigned long __init arch_syscall_addr(int nr)
493{
494 return (unsigned long)(&sys_call_table)[nr];
495}
496#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94c9069..b2e246037392 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h>
10 11
11#include <asm/setup.h> 12#include <asm/setup.h>
12#include <asm/sections.h> 13#include <asm/sections.h>
@@ -29,14 +30,25 @@ static void __init i386_default_early_setup(void)
29 30
30void __init i386_start_kernel(void) 31void __init i386_start_kernel(void)
31{ 32{
33#ifdef CONFIG_X86_TRAMPOLINE
34 /*
35 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff)
38 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
40 "EX TRAMPOLINE");
41#endif
42
32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
33 44
34#ifdef CONFIG_BLK_DEV_INITRD 45#ifdef CONFIG_BLK_DEV_INITRD
35 /* Reserve INITRD */ 46 /* Reserve INITRD */
36 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 47 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
48 /* Assume only end is not page aligned */
37 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 49 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
38 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 50 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
39 u64 ramdisk_end = ramdisk_image + ramdisk_size; 51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
40 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
41 } 53 }
42#endif 54#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5a9896ca1e7..7147143fd614 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data)
103#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */ 104 /* Reserve INITRD */
105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
106 /* Assume only end is not page aligned */
106 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
107 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
108 unsigned long ramdisk_end = ramdisk_image + ramdisk_size; 109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
109 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
110 } 111 }
111#endif 112#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7fd318bac59c..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -442,8 +442,8 @@ is386: movl $2,%ecx # set MP
442 */ 442 */
443 cmpb $0,ready 443 cmpb $0,ready
444 jne 1f 444 jne 1f
445 movl $per_cpu__gdt_page,%eax 445 movl $gdt_page,%eax
446 movl $per_cpu__stack_canary,%ecx 446 movl $stack_canary,%ecx
447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
448 shrl $16, %ecx 448 shrl $16, %ecx
449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -706,7 +706,7 @@ idt_descr:
706 .word 0 # 32 bit align gdt_desc.address 706 .word 0 # 32 bit align gdt_desc.address
707ENTRY(early_gdt_descr) 707ENTRY(early_gdt_descr)
708 .word GDT_ENTRIES*8-1 708 .word GDT_ENTRIES*8-1
709 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ 709 .long gdt_page /* Overwritten for secondary CPUs */
710 710
711/* 711/*
712 * The boot_gdt must mirror the equivalent in setup.S and is 712 * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2d8b5035371c..3d1e6f16b7a6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
27#define GET_CR2_INTO_RCX movq %cr2, %rcx 27#define GET_CR2_INTO_RCX movq %cr2, %rcx
28#endif 28#endif
29 29
30/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 30/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
31 * because we need identity-mapped pages. 31 * because we need identity-mapped pages.
32 * 32 *
33 */ 33 */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba6e65884603..23b4ecdffa9b 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
4#include <linux/sysdev.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/slab.h>
7#include <linux/hpet.h> 8#include <linux/hpet.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/cpu.h> 10#include <linux/cpu.h>
@@ -34,6 +35,8 @@
34 */ 35 */
35unsigned long hpet_address; 36unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */ 37u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable;
39
37#ifdef CONFIG_PCI_MSI 40#ifdef CONFIG_PCI_MSI
38static unsigned long hpet_num_timers; 41static unsigned long hpet_num_timers;
39#endif 42#endif
@@ -264,7 +267,7 @@ static void hpet_resume_device(void)
264 force_hpet_resume(); 267 force_hpet_resume();
265} 268}
266 269
267static void hpet_resume_counter(void) 270static void hpet_resume_counter(struct clocksource *cs)
268{ 271{
269 hpet_resume_device(); 272 hpet_resume_device();
270 hpet_restart_counter(); 273 hpet_restart_counter();
@@ -397,9 +400,15 @@ static int hpet_next_event(unsigned long delta,
397 * then we might have a real hardware problem. We can not do 400 * then we might have a real hardware problem. We can not do
398 * much about it here, but at least alert the user/admin with 401 * much about it here, but at least alert the user/admin with
399 * a prominent warning. 402 * a prominent warning.
 403 * An erratum on some chipsets (ICH9, ...) results in the comparator read
 404 * immediately following a write returning the old value. The workaround
 405 * is to read the value a second time when the first
 406 * read returns the old value.
400 */ 407 */
401 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, 408 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
409 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
402 KERN_WARNING "hpet: compare register read back failed.\n"); 410 KERN_WARNING "hpet: compare register read back failed.\n");
411 }
403 412
404 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 413 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
405} 414}
@@ -596,6 +605,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
596 unsigned int num_timers_used = 0; 605 unsigned int num_timers_used = 0;
597 int i; 606 int i;
598 607
608 if (hpet_msi_disable)
609 return;
610
599 if (boot_cpu_has(X86_FEATURE_ARAT)) 611 if (boot_cpu_has(X86_FEATURE_ARAT))
600 return; 612 return;
601 id = hpet_readl(HPET_ID); 613 id = hpet_readl(HPET_ID);
@@ -928,6 +940,9 @@ static __init int hpet_late_init(void)
928 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 940 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
929 hpet_print_config(); 941 hpet_print_config();
930 942
943 if (hpet_msi_disable)
944 return 0;
945
931 if (boot_cpu_has(X86_FEATURE_ARAT)) 946 if (boot_cpu_has(X86_FEATURE_ARAT))
932 return 0; 947 return 0;
933 948
@@ -1135,6 +1150,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1135 do_div(clc, freq); 1150 do_div(clc, freq);
1136 clc >>= hpet_clockevent.shift; 1151 clc >>= hpet_clockevent.shift;
1137 hpet_pie_delta = clc; 1152 hpet_pie_delta = clc;
1153 hpet_pie_limit = 0;
1138 } 1154 }
1139 return 1; 1155 return 1;
1140} 1156}
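
The comparator change above amounts to "read back twice before warning": on chipsets with the erratum, the first read after a write can return the stale value. A standalone sketch of that pattern, with a callback standing in for hpet_readl(HPET_Tn_CMP(timer)) and a fake register that misbehaves on the first read (illustrative only, not real HPET behaviour):

#include <stdio.h>
#include <stdint.h>

/* Warn only if a second read of the comparator still disagrees. */
static int cmp_readback_failed(uint32_t (*read_cmp)(void), uint32_t written)
{
	if (read_cmp() == written)
		return 0;			/* common case: first read matches */
	return read_cmp() != written;		/* erratum case: trust the re-read */
}

/* Fake comparator: the first read after a write returns the stale value. */
static uint32_t values[2] = { 100, 250 };	/* stale, then real */
static int reads;
static uint32_t fake_read(void)
{
	return values[reads < 1 ? reads++ : 1];
}

int main(void)
{
	if (cmp_readback_failed(fake_read, 250))
		printf("hpet: compare register read back failed.\n");
	else
		printf("read-back ok (second read matched)\n");
	return 0;
}
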
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 05d5fec64a94..d6cc065f519f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 213}
214 214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type, 215int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type) 216 int *gen_len, int *gen_type)
236{ 217{
@@ -362,10 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
362 return ret; 343 return ret;
363 } 344 }
364 345
365 ret = arch_store_info(bp);
366
367 if (ret < 0)
368 return ret;
369 /* 346 /*
370 * Check that the low-order bits of the address are appropriate 347 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len. 348 * for the alignment implied by len.
@@ -502,8 +479,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
502 rcu_read_lock(); 479 rcu_read_lock();
503 480
504 bp = per_cpu(bp_per_reg[i], cpu); 481 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /* 482 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of 483 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling 484 * exception handling
@@ -522,7 +497,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
522 497
523 rcu_read_unlock(); 498 rcu_read_unlock();
524 } 499 }
525 if (dr6 & (~DR_TRAP_BITS)) 500 /*
501 * Further processing in do_debug() is needed for a) user-space
502 * breakpoints (to generate signals) and b) when the system has
503 * taken exception due to multiple causes
504 */
505 if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
506 (dr6 & (~DR_TRAP_BITS)))
526 rc = NOTIFY_DONE; 507 rc = NOTIFY_DONE;
527 508
528 set_debugreg(dr7, 7); 509 set_debugreg(dr7, 7);
@@ -547,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
547{ 528{
548 /* TODO */ 529 /* TODO */
549} 530}
550
551void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
552{
553 /* TODO */
554}
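
A standalone sketch of the notifier decision the hunk above introduces: the handler only swallows the exception when every pending DR6 trap bit belonged to a kernel breakpoint and no other debug cause is set; otherwise do_debug() still gets to run. DR_TRAP_BITS matches the kernel's 0xf, while the NOTIFY_* values here are simplified stand-ins rather than the kernel's real constants:

#include <stdio.h>

#define DR_TRAP_BITS	0xf
#define NOTIFY_DONE	0	/* let do_debug() continue processing */
#define NOTIFY_STOP	1	/* exception fully handled here */

static int hw_breakpoint_verdict(unsigned long dr6, unsigned long user_pending)
{
	if ((user_pending & DR_TRAP_BITS) || (dr6 & ~DR_TRAP_BITS))
		return NOTIFY_DONE;
	return NOTIFY_STOP;
}

int main(void)
{
	/* kernel breakpoint on dr0 only: handled entirely by the notifier */
	printf("%d\n", hw_breakpoint_verdict(0x1, 0x0));
	/* a user-space breakpoint bit is still pending: defer to do_debug() */
	printf("%d\n", hw_breakpoint_verdict(0x1, 0x2));
	return 0;
}
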
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3d..54c31c285488 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/regset.h> 9#include <linux/regset.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
11 12
12#include <asm/sigcontext.h> 13#include <asm/sigcontext.h>
13#include <asm/processor.h> 14#include <asm/processor.h>
@@ -164,6 +165,11 @@ int init_fpu(struct task_struct *tsk)
164 return 0; 165 return 0;
165} 166}
166 167
168/*
169 * The xstateregs_active() routine is the same as the fpregs_active() routine,
170 * as the "regset->n" for the xstate regset will be updated based on the feature
 171 * capabilities supported by xsave.
172 */
167int fpregs_active(struct task_struct *target, const struct user_regset *regset) 173int fpregs_active(struct task_struct *target, const struct user_regset *regset)
168{ 174{
169 return tsk_used_math(target) ? regset->n : 0; 175 return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +210,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
204 if (ret) 210 if (ret)
205 return ret; 211 return ret;
206 212
207 set_stopped_child_used_math(target);
208
209 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
210 &target->thread.xstate->fxsave, 0, -1); 214 &target->thread.xstate->fxsave, 0, -1);
211 215
@@ -224,6 +228,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
224 return ret; 228 return ret;
225} 229}
226 230
231int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
232 unsigned int pos, unsigned int count,
233 void *kbuf, void __user *ubuf)
234{
235 int ret;
236
237 if (!cpu_has_xsave)
238 return -ENODEV;
239
240 ret = init_fpu(target);
241 if (ret)
242 return ret;
243
244 /*
 245 * Copy the 48 bytes defined by the software first into the xstate
246 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout().
248 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251
252 /*
253 * Copy the xstate memory layout.
254 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1);
257 return ret;
258}
259
260int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
261 unsigned int pos, unsigned int count,
262 const void *kbuf, const void __user *ubuf)
263{
264 int ret;
265 struct xsave_hdr_struct *xsave_hdr;
266
267 if (!cpu_has_xsave)
268 return -ENODEV;
269
270 ret = init_fpu(target);
271 if (ret)
272 return ret;
273
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1);
276
277 /*
278 * mxcsr reserved bits must be masked to zero for security reasons.
279 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
281
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
283
284 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /*
286 * These bits must be zero.
287 */
288 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
289
290 return ret;
291}
292
227#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 293#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
228 294
229/* 295/*
@@ -404,8 +470,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
404 if (ret) 470 if (ret)
405 return ret; 471 return ret;
406 472
407 set_stopped_child_used_math(target);
408
409 if (!HAVE_HWFP) 473 if (!HAVE_HWFP)
410 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 474 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
411 475
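
A standalone sketch of the sanitising xstateregs_set() performs above on user-supplied state: reserved MXCSR bits are masked against the feature mask and xstate_bv is clipped to the features the CPU advertises. The mask values below are illustrative, not what a particular CPU reports:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t mxcsr_feature_mask = 0xffbf;	/* e.g. probed via fxsave */
	uint64_t pcntxt_mask        = 0x3;	/* e.g. FP | SSE supported */

	uint32_t mxcsr     = 0xdeadffff;	/* user-supplied, untrusted */
	uint64_t xstate_bv = 0xff;		/* user-supplied, untrusted */

	mxcsr     &= mxcsr_feature_mask;	/* clear reserved MXCSR bits */
	xstate_bv &= pcntxt_mask;		/* drop unsupported features */

	printf("mxcsr=%#x xstate_bv=%#llx\n",
	       mxcsr, (unsigned long long)xstate_bv);
	return 0;
}
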
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef80..7c9f02c130f3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/init.h> 9#include <linux/init.h>
11#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
@@ -32,8 +31,14 @@
32 */ 31 */
33 32
34static int i8259A_auto_eoi; 33static int i8259A_auto_eoi;
35DEFINE_SPINLOCK(i8259A_lock); 34DEFINE_RAW_SPINLOCK(i8259A_lock);
36static void mask_and_ack_8259A(unsigned int); 35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
37 42
38struct irq_chip i8259A_chip = { 43struct irq_chip i8259A_chip = {
39 .name = "XT-PIC", 44 .name = "XT-PIC",
@@ -63,51 +68,51 @@ unsigned int cached_irq_mask = 0xffff;
63 */ 68 */
64unsigned long io_apic_irqs; 69unsigned long io_apic_irqs;
65 70
66void disable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(unsigned int irq)
67{ 72{
68 unsigned int mask = 1 << irq; 73 unsigned int mask = 1 << irq;
69 unsigned long flags; 74 unsigned long flags;
70 75
71 spin_lock_irqsave(&i8259A_lock, flags); 76 raw_spin_lock_irqsave(&i8259A_lock, flags);
72 cached_irq_mask |= mask; 77 cached_irq_mask |= mask;
73 if (irq & 8) 78 if (irq & 8)
74 outb(cached_slave_mask, PIC_SLAVE_IMR); 79 outb(cached_slave_mask, PIC_SLAVE_IMR);
75 else 80 else
76 outb(cached_master_mask, PIC_MASTER_IMR); 81 outb(cached_master_mask, PIC_MASTER_IMR);
77 spin_unlock_irqrestore(&i8259A_lock, flags); 82 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
78} 83}
79 84
80void enable_8259A_irq(unsigned int irq) 85static void enable_8259A_irq(unsigned int irq)
81{ 86{
82 unsigned int mask = ~(1 << irq); 87 unsigned int mask = ~(1 << irq);
83 unsigned long flags; 88 unsigned long flags;
84 89
85 spin_lock_irqsave(&i8259A_lock, flags); 90 raw_spin_lock_irqsave(&i8259A_lock, flags);
86 cached_irq_mask &= mask; 91 cached_irq_mask &= mask;
87 if (irq & 8) 92 if (irq & 8)
88 outb(cached_slave_mask, PIC_SLAVE_IMR); 93 outb(cached_slave_mask, PIC_SLAVE_IMR);
89 else 94 else
90 outb(cached_master_mask, PIC_MASTER_IMR); 95 outb(cached_master_mask, PIC_MASTER_IMR);
91 spin_unlock_irqrestore(&i8259A_lock, flags); 96 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
92} 97}
93 98
94int i8259A_irq_pending(unsigned int irq) 99static int i8259A_irq_pending(unsigned int irq)
95{ 100{
96 unsigned int mask = 1<<irq; 101 unsigned int mask = 1<<irq;
97 unsigned long flags; 102 unsigned long flags;
98 int ret; 103 int ret;
99 104
100 spin_lock_irqsave(&i8259A_lock, flags); 105 raw_spin_lock_irqsave(&i8259A_lock, flags);
101 if (irq < 8) 106 if (irq < 8)
102 ret = inb(PIC_MASTER_CMD) & mask; 107 ret = inb(PIC_MASTER_CMD) & mask;
103 else 108 else
104 ret = inb(PIC_SLAVE_CMD) & (mask >> 8); 109 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
105 spin_unlock_irqrestore(&i8259A_lock, flags); 110 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
106 111
107 return ret; 112 return ret;
108} 113}
109 114
110void make_8259A_irq(unsigned int irq) 115static void make_8259A_irq(unsigned int irq)
111{ 116{
112 disable_irq_nosync(irq); 117 disable_irq_nosync(irq);
113 io_apic_irqs &= ~(1<<irq); 118 io_apic_irqs &= ~(1<<irq);
@@ -150,7 +155,7 @@ static void mask_and_ack_8259A(unsigned int irq)
150 unsigned int irqmask = 1 << irq; 155 unsigned int irqmask = 1 << irq;
151 unsigned long flags; 156 unsigned long flags;
152 157
153 spin_lock_irqsave(&i8259A_lock, flags); 158 raw_spin_lock_irqsave(&i8259A_lock, flags);
154 /* 159 /*
155 * Lightweight spurious IRQ detection. We do not want 160 * Lightweight spurious IRQ detection. We do not want
156 * to overdo spurious IRQ handling - it's usually a sign 161 * to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +188,7 @@ handle_real_irq:
183 outb(cached_master_mask, PIC_MASTER_IMR); 188 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ 189 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
185 } 190 }
186 spin_unlock_irqrestore(&i8259A_lock, flags); 191 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
187 return; 192 return;
188 193
189spurious_8259A_irq: 194spurious_8259A_irq:
@@ -281,37 +286,37 @@ static int __init i8259A_init_sysfs(void)
281 286
282device_initcall(i8259A_init_sysfs); 287device_initcall(i8259A_init_sysfs);
283 288
284void mask_8259A(void) 289static void mask_8259A(void)
285{ 290{
286 unsigned long flags; 291 unsigned long flags;
287 292
288 spin_lock_irqsave(&i8259A_lock, flags); 293 raw_spin_lock_irqsave(&i8259A_lock, flags);
289 294
290 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 295 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 296 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 297
293 spin_unlock_irqrestore(&i8259A_lock, flags); 298 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
294} 299}
295 300
296void unmask_8259A(void) 301static void unmask_8259A(void)
297{ 302{
298 unsigned long flags; 303 unsigned long flags;
299 304
300 spin_lock_irqsave(&i8259A_lock, flags); 305 raw_spin_lock_irqsave(&i8259A_lock, flags);
301 306
302 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 307 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
303 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 308 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
304 309
305 spin_unlock_irqrestore(&i8259A_lock, flags); 310 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
306} 311}
307 312
308void init_8259A(int auto_eoi) 313static void init_8259A(int auto_eoi)
309{ 314{
310 unsigned long flags; 315 unsigned long flags;
311 316
312 i8259A_auto_eoi = auto_eoi; 317 i8259A_auto_eoi = auto_eoi;
313 318
314 spin_lock_irqsave(&i8259A_lock, flags); 319 raw_spin_lock_irqsave(&i8259A_lock, flags);
315 320
316 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 321 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
317 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 322 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +361,49 @@ void init_8259A(int auto_eoi)
356 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 361 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
357 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 362 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
358 363
359 spin_unlock_irqrestore(&i8259A_lock, flags); 364 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
360} 365}
366
367/*
368 * Make the i8259 a driver so that we can select PIC functions at run time. The
369 * goal is to make the x86 binary compatible across PC-compatible and
370 * non-PC-compatible platforms, such as x86 MID.
371 */
372
373static void legacy_pic_noop(void) { };
374static void legacy_pic_uint_noop(unsigned int unused) { };
375static void legacy_pic_int_noop(int unused) { };
376
377static struct irq_chip dummy_pic_chip = {
378 .name = "dummy pic",
379 .mask = legacy_pic_uint_noop,
380 .unmask = legacy_pic_uint_noop,
381 .disable = legacy_pic_uint_noop,
382 .mask_ack = legacy_pic_uint_noop,
383};
384static int legacy_pic_irq_pending_noop(unsigned int irq)
385{
386 return 0;
387}
388
389struct legacy_pic null_legacy_pic = {
390 .nr_legacy_irqs = 0,
391 .chip = &dummy_pic_chip,
392 .mask_all = legacy_pic_noop,
393 .restore_mask = legacy_pic_noop,
394 .init = legacy_pic_int_noop,
395 .irq_pending = legacy_pic_irq_pending_noop,
396 .make_irq = legacy_pic_uint_noop,
397};
398
399struct legacy_pic default_legacy_pic = {
400 .nr_legacy_irqs = NR_IRQS_LEGACY,
401 .chip = &i8259A_chip,
402 .mask_all = mask_8259A,
403 .restore_mask = unmask_8259A,
404 .init = init_8259A,
405 .irq_pending = i8259A_irq_pending,
406 .make_irq = make_8259A_irq,
407};
408
409struct legacy_pic *legacy_pic = &default_legacy_pic;
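
The legacy_pic structure added above is a table of function pointers selected once at boot, so call sites no longer care whether an i8259 exists. A standalone sketch of that pattern (the struct and function names here are illustrative, not the kernel's):

#include <stdio.h>

struct pic_ops {
	int  nr_legacy_irqs;
	void (*init)(int auto_eoi);
	void (*mask_all)(void);
};

static void real_init(int auto_eoi) { printf("8259A init, auto_eoi=%d\n", auto_eoi); }
static void real_mask_all(void)     { printf("8259A mask all\n"); }
static void noop_init(int unused)   { (void)unused; }
static void noop_mask_all(void)     { }

static struct pic_ops default_pic = { 16, real_init, real_mask_all };
static struct pic_ops null_pic    = {  0, noop_init, noop_mask_all };

static struct pic_ops *pic = &default_pic;	/* platform code may repoint this */

int main(int argc, char **argv)
{
	if (argc > 1)			/* pretend we detected a PIC-less platform */
		pic = &null_pic;

	pic->init(0);			/* identical call sites either way */
	for (int i = 0; i < pic->nr_legacy_irqs; i++)
		printf("set up legacy irq %d\n", i);
	pic->mask_all();
	return 0;
}

The intended use, per the comment above, is for a PIC-less platform (the x86 MID case) to point legacy_pic at null_legacy_pic early in boot, after which the loops over nr_legacy_irqs simply do nothing.
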
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index d5932226614f..0ed2d300cd46 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/kprobes.h> 9#include <linux/kprobes.h>
11#include <linux/init.h> 10#include <linux/init.h>
@@ -84,24 +83,7 @@ static struct irqaction irq2 = {
84}; 83};
85 84
86DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 85DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
87 [0 ... IRQ0_VECTOR - 1] = -1, 86 [0 ... NR_VECTORS - 1] = -1,
88 [IRQ0_VECTOR] = 0,
89 [IRQ1_VECTOR] = 1,
90 [IRQ2_VECTOR] = 2,
91 [IRQ3_VECTOR] = 3,
92 [IRQ4_VECTOR] = 4,
93 [IRQ5_VECTOR] = 5,
94 [IRQ6_VECTOR] = 6,
95 [IRQ7_VECTOR] = 7,
96 [IRQ8_VECTOR] = 8,
97 [IRQ9_VECTOR] = 9,
98 [IRQ10_VECTOR] = 10,
99 [IRQ11_VECTOR] = 11,
100 [IRQ12_VECTOR] = 12,
101 [IRQ13_VECTOR] = 13,
102 [IRQ14_VECTOR] = 14,
103 [IRQ15_VECTOR] = 15,
104 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
105}; 87};
106 88
107int vector_used_by_percpu_irq(unsigned int vector) 89int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +105,12 @@ void __init init_ISA_irqs(void)
123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
124 init_bsp_APIC(); 106 init_bsp_APIC();
125#endif 107#endif
126 init_8259A(0); 108 legacy_pic->init(0);
127 109
128 /* 110 /*
129 * 16 old-style INTA-cycle interrupts: 111 * 16 old-style INTA-cycle interrupts:
130 */ 112 */
131 for (i = 0; i < NR_IRQS_LEGACY; i++) { 113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
132 struct irq_desc *desc = irq_to_desc(i); 114 struct irq_desc *desc = irq_to_desc(i);
133 115
134 desc->status = IRQ_DISABLED; 116 desc->status = IRQ_DISABLED;
@@ -142,9 +124,44 @@ void __init init_ISA_irqs(void)
142 124
143void __init init_IRQ(void) 125void __init init_IRQ(void)
144{ 126{
127 int i;
128
129 /*
 130 * On CPU 0, assign IRQ0_VECTOR..IRQ15_VECTOR to IRQs 0..15.
 131 * If these IRQs are handled by legacy interrupt controllers like the PIC,
 132 * then this configuration will likely be static after boot. If
 133 * these IRQs are handled by more modern controllers like the IO-APIC,
 134 * then this vector space can be freed and re-used dynamically as the
 135 * IRQs migrate etc.
136 */
137 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
138 per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
139
145 x86_init.irqs.intr_init(); 140 x86_init.irqs.intr_init();
146} 141}
147 142
143/*
144 * Setup the vector to irq mappings.
145 */
146void setup_vector_irq(int cpu)
147{
148#ifndef CONFIG_X86_IO_APIC
149 int irq;
150
151 /*
 152 * On most platforms the legacy PIC delivers interrupts on the
 153 * boot CPU. But there are certain platforms where PIC interrupts are
 154 * delivered to multiple CPUs. If the legacy IRQ is handled by the
 155 * legacy PIC, set up the static legacy vector-to-irq mapping for the
 156 * new CPU that is coming online:
157 */
158 for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
159 per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
160#endif
161
162 __setup_vector_irq(cpu);
163}
164
148static void __init smp_intr_init(void) 165static void __init smp_intr_init(void)
149{ 166{
150#ifdef CONFIG_SMP 167#ifdef CONFIG_SMP
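
A standalone sketch of the per-CPU vector_irq handling above: the table now starts out fully unassigned and only the legacy slots receive a fixed mapping, both on the boot CPU and on CPUs that come online while the PIC still owns those IRQs. The table sizes and the vector base below are illustrative:

#include <stdio.h>

#define NR_VECTORS	256
#define IRQ0_VECTOR	0x30	/* illustrative base for the legacy block */
#define NR_LEGACY_IRQS	16

static int vector_irq[2][NR_VECTORS];	/* [cpu][vector]; pretend two CPUs */

static void setup_vector_irq(int cpu, int nr_legacy)
{
	int v, irq;

	for (v = 0; v < NR_VECTORS; v++)
		vector_irq[cpu][v] = -1;	/* unassigned by default */
	for (irq = 0; irq < nr_legacy; irq++)
		vector_irq[cpu][IRQ0_VECTOR + irq] = irq;
}

int main(void)
{
	setup_vector_irq(0, NR_LEGACY_IRQS);	/* boot CPU */
	setup_vector_irq(1, NR_LEGACY_IRQS);	/* CPU coming online, PIC case */
	printf("cpu0: vector %#x -> irq %d\n",
	       IRQ0_VECTOR + 3, vector_irq[0][IRQ0_VECTOR + 3]);
	return 0;
}
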
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index cbc4332a77b2..0f7bc20cfcde 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -2,8 +2,8 @@
2 * Shared support code for AMD K8 northbridges and derivates. 2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5#include <linux/gfp.h>
6#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/module.h> 9#include <linux/module.h>
@@ -121,3 +121,17 @@ void k8_flush_garts(void)
121} 121}
122EXPORT_SYMBOL_GPL(k8_flush_garts); 122EXPORT_SYMBOL_GPL(k8_flush_garts);
123 123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index e444357375ce..8afd9f321f10 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/stat.h> 14#include <linux/stat.h>
14#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index dd74fe7273b1..b2258ca91003 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,6 +42,7 @@
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h>
45 46
46#include <asm/debugreg.h> 47#include <asm/debugreg.h>
47#include <asm/apicdef.h> 48#include <asm/apicdef.h>
@@ -204,40 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
204 205
205static struct hw_breakpoint { 206static struct hw_breakpoint {
206 unsigned enabled; 207 unsigned enabled;
207 unsigned type;
208 unsigned len;
209 unsigned long addr; 208 unsigned long addr;
209 int len;
210 int type;
211 struct perf_event **pev;
210} breakinfo[4]; 212} breakinfo[4];
211 213
212static void kgdb_correct_hw_break(void) 214static void kgdb_correct_hw_break(void)
213{ 215{
214 unsigned long dr7;
215 int correctit = 0;
216 int breakbit;
217 int breakno; 216 int breakno;
218 217
219 get_debugreg(dr7, 7);
220 for (breakno = 0; breakno < 4; breakno++) { 218 for (breakno = 0; breakno < 4; breakno++) {
221 breakbit = 2 << (breakno << 1); 219 struct perf_event *bp;
222 if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { 220 struct arch_hw_breakpoint *info;
223 correctit = 1; 221 int val;
224 dr7 |= breakbit; 222 int cpu = raw_smp_processor_id();
225 dr7 &= ~(0xf0000 << (breakno << 2)); 223 if (!breakinfo[breakno].enabled)
226 dr7 |= ((breakinfo[breakno].len << 2) | 224 continue;
227 breakinfo[breakno].type) << 225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
228 ((breakno << 2) + 16); 226 info = counter_arch_bp(bp);
229 set_debugreg(breakinfo[breakno].addr, breakno); 227 if (bp->attr.disabled != 1)
230 228 continue;
231 } else { 229 bp->attr.bp_addr = breakinfo[breakno].addr;
232 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 230 bp->attr.bp_len = breakinfo[breakno].len;
233 correctit = 1; 231 bp->attr.bp_type = breakinfo[breakno].type;
234 dr7 &= ~breakbit; 232 info->address = breakinfo[breakno].addr;
235 dr7 &= ~(0xf0000 << (breakno << 2)); 233 info->len = breakinfo[breakno].len;
236 } 234 info->type = breakinfo[breakno].type;
237 } 235 val = arch_install_hw_breakpoint(bp);
236 if (!val)
237 bp->attr.disabled = 0;
238 }
239 hw_breakpoint_restore();
240}
241
242static int hw_break_reserve_slot(int breakno)
243{
244 int cpu;
245 int cnt = 0;
246 struct perf_event **pevent;
247
248 for_each_online_cpu(cpu) {
249 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
251 if (dbg_reserve_bp_slot(*pevent))
252 goto fail;
253 }
254
255 return 0;
256
257fail:
258 for_each_online_cpu(cpu) {
259 cnt--;
260 if (!cnt)
261 break;
262 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
263 dbg_release_bp_slot(*pevent);
238 } 264 }
239 if (correctit) 265 return -1;
240 set_debugreg(dr7, 7); 266}
267
268static int hw_break_release_slot(int breakno)
269{
270 struct perf_event **pevent;
271 int cpu;
272
273 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent))
276 /*
 277 * The debugger is responsible for handling the retry on
278 * remove failure.
279 */
280 return -1;
281 }
282 return 0;
241} 283}
242 284
243static int 285static int
@@ -251,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
251 if (i == 4) 293 if (i == 4)
252 return -1; 294 return -1;
253 295
296 if (hw_break_release_slot(i)) {
297 printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
298 return -1;
299 }
254 breakinfo[i].enabled = 0; 300 breakinfo[i].enabled = 0;
255 301
256 return 0; 302 return 0;
@@ -259,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
259static void kgdb_remove_all_hw_break(void) 305static void kgdb_remove_all_hw_break(void)
260{ 306{
261 int i; 307 int i;
308 int cpu = raw_smp_processor_id();
309 struct perf_event *bp;
262 310
263 for (i = 0; i < 4; i++) 311 for (i = 0; i < 4; i++) {
264 memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); 312 if (!breakinfo[i].enabled)
313 continue;
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1)
316 continue;
317 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1;
319 }
265} 320}
266 321
267static int 322static int
268kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) 323kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
269{ 324{
270 unsigned type;
271 int i; 325 int i;
272 326
273 for (i = 0; i < 4; i++) 327 for (i = 0; i < 4; i++)
@@ -278,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
278 332
279 switch (bptype) { 333 switch (bptype) {
280 case BP_HARDWARE_BREAKPOINT: 334 case BP_HARDWARE_BREAKPOINT:
281 type = 0; 335 len = 1;
282 len = 1; 336 breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
283 break; 337 break;
284 case BP_WRITE_WATCHPOINT: 338 case BP_WRITE_WATCHPOINT:
285 type = 1; 339 breakinfo[i].type = X86_BREAKPOINT_WRITE;
286 break; 340 break;
287 case BP_ACCESS_WATCHPOINT: 341 case BP_ACCESS_WATCHPOINT:
288 type = 3; 342 breakinfo[i].type = X86_BREAKPOINT_RW;
289 break; 343 break;
290 default: 344 default:
291 return -1; 345 return -1;
292 } 346 }
293 347 switch (len) {
294 if (len == 1 || len == 2 || len == 4) 348 case 1:
295 breakinfo[i].len = len - 1; 349 breakinfo[i].len = X86_BREAKPOINT_LEN_1;
296 else 350 break;
351 case 2:
352 breakinfo[i].len = X86_BREAKPOINT_LEN_2;
353 break;
354 case 4:
355 breakinfo[i].len = X86_BREAKPOINT_LEN_4;
356 break;
357#ifdef CONFIG_X86_64
358 case 8:
359 breakinfo[i].len = X86_BREAKPOINT_LEN_8;
360 break;
361#endif
362 default:
297 return -1; 363 return -1;
298 364 }
299 breakinfo[i].enabled = 1;
300 breakinfo[i].addr = addr; 365 breakinfo[i].addr = addr;
301 breakinfo[i].type = type; 366 if (hw_break_reserve_slot(i)) {
367 breakinfo[i].addr = 0;
368 return -1;
369 }
370 breakinfo[i].enabled = 1;
302 371
303 return 0; 372 return 0;
304} 373}
@@ -313,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
313 */ 382 */
314void kgdb_disable_hw_debug(struct pt_regs *regs) 383void kgdb_disable_hw_debug(struct pt_regs *regs)
315{ 384{
385 int i;
386 int cpu = raw_smp_processor_id();
387 struct perf_event *bp;
388
316 /* Disable hardware debugging while we are in kgdb: */ 389 /* Disable hardware debugging while we are in kgdb: */
317 set_debugreg(0UL, 7); 390 set_debugreg(0UL, 7);
391 for (i = 0; i < 4; i++) {
392 if (!breakinfo[i].enabled)
393 continue;
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1)
396 continue;
397 arch_uninstall_hw_breakpoint(bp);
398 bp->attr.disabled = 1;
399 }
318} 400}
319 401
320/** 402/**
@@ -378,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 struct pt_regs *linux_regs) 460 struct pt_regs *linux_regs)
379{ 461{
380 unsigned long addr; 462 unsigned long addr;
381 unsigned long dr6;
382 char *ptr; 463 char *ptr;
383 int newPC; 464 int newPC;
384 465
@@ -404,20 +485,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
404 raw_smp_processor_id()); 485 raw_smp_processor_id());
405 } 486 }
406 487
407 get_debugreg(dr6, 6);
408 if (!(dr6 & 0x4000)) {
409 int breakno;
410
411 for (breakno = 0; breakno < 4; breakno++) {
412 if (dr6 & (1 << breakno) &&
413 breakinfo[breakno].type == 0) {
414 /* Set restore flag: */
415 linux_regs->flags |= X86_EFLAGS_RF;
416 break;
417 }
418 }
419 }
420 set_debugreg(0UL, 6);
421 kgdb_correct_hw_break(); 488 kgdb_correct_hw_break();
422 489
423 return 0; 490 return 0;
@@ -485,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
485 break; 552 break;
486 553
487 case DIE_DEBUG: 554 case DIE_DEBUG:
488 if (atomic_read(&kgdb_cpu_doing_single_step) == 555 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
489 raw_smp_processor_id()) {
490 if (user_mode(regs)) 556 if (user_mode(regs))
491 return single_step_cont(regs, args); 557 return single_step_cont(regs, args);
492 break; 558 break;
@@ -539,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
539 */ 605 */
540int kgdb_arch_init(void) 606int kgdb_arch_init(void)
541{ 607{
542 return register_die_notifier(&kgdb_notifier); 608 int i, cpu;
609 int ret;
610 struct perf_event_attr attr;
611 struct perf_event **pevent;
612
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /*
 617 * Pre-allocate the hw breakpoint structures in the non-atomic
 618 * portion of kgdb because this operation requires mutexes to
619 * complete.
620 */
621 hw_breakpoint_init(&attr);
622 attr.bp_addr = (unsigned long)kgdb_arch_init;
623 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1;
626 for (i = 0; i < 4; i++) {
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
630 breakinfo[i].pev = NULL;
631 kgdb_arch_exit();
632 return -1;
633 }
634 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
636 pevent[0]->hw.sample_period = 1;
637 if (pevent[0]->destroy != NULL) {
638 pevent[0]->destroy = NULL;
639 release_bp_slot(*pevent);
640 }
641 }
642 }
643 return ret;
543} 644}
544 645
545/** 646/**
@@ -550,6 +651,13 @@ int kgdb_arch_init(void)
550 */ 651 */
551void kgdb_arch_exit(void) 652void kgdb_arch_exit(void)
552{ 653{
654 int i;
655 for (i = 0; i < 4; i++) {
656 if (breakinfo[i].pev) {
657 unregister_wide_hw_breakpoint(breakinfo[i].pev);
658 breakinfo[i].pev = NULL;
659 }
660 }
553 unregister_die_notifier(&kgdb_notifier); 661 unregister_die_notifier(&kgdb_notifier);
554} 662}
555 663
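
kgdb now allocates its four slots through the generic hardware-breakpoint layer instead of poking the debug registers directly. As a rough sketch of that same API in a simpler setting, here is a kernel-module style example of watching writes to one variable, modeled on the samples/hw_breakpoint example shipped with kernels of this era (assumed era-specific API; not standalone user-space code and not part of this patch):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static struct perf_event **wp;	/* one event per CPU, like breakinfo[i].pev */
static int watched;

static void wp_handler(struct perf_event *bp, int nmi,
		       struct perf_sample_data *data, struct pt_regs *regs)
{
	printk(KERN_INFO "watched variable was written to\n");
}

static int __init wp_init(void)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = (unsigned long)&watched;
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	wp = register_wide_hw_breakpoint(&attr, wp_handler);
	if (IS_ERR((void *)wp))
		return PTR_ERR((void *)wp);
	return 0;
}

static void __exit wp_exit(void)
{
	unregister_wide_hw_breakpoint(wp);
}

module_init(wp_init);
module_exit(wp_exit);
MODULE_LICENSE("GPL");
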
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5b8c7505b3bc..b43bbaebe2c0 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
52#include <linux/ftrace.h>
52 53
53#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
106}; 107};
107const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 108const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
108 109
109/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 110static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
110static void __kprobes set_jmp_op(void *from, void *to)
111{ 111{
112 struct __arch_jmp_op { 112 struct __arch_relative_insn {
113 char op; 113 u8 op;
114 s32 raddr; 114 s32 raddr;
115 } __attribute__((packed)) * jop; 115 } __attribute__((packed)) *insn;
116 jop = (struct __arch_jmp_op *)from; 116
117 jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); 117 insn = (struct __arch_relative_insn *)from;
118 jop->op = RELATIVEJUMP_INSTRUCTION; 118 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
119 insn->op = op;
120}
121
122/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
123static void __kprobes synthesize_reljump(void *from, void *to)
124{
125 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
119} 126}
120 127
121/* 128/*
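
A standalone sketch of the encoding __synthesize_relative_insn() produces above: a 5-byte relative JMP or CALL is one opcode byte followed by a 32-bit displacement measured from the end of the instruction. Here the bytes are only written into a local buffer, not patched into live text:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define RELATIVEJUMP_OPCODE	0xe9
#define RELATIVECALL_OPCODE	0xe8

static void synthesize_relative_insn(uint8_t *from, const uint8_t *to, uint8_t op)
{
	int32_t rel = (int32_t)((intptr_t)to - ((intptr_t)from + 5));

	from[0] = op;
	memcpy(from + 1, &rel, sizeof(rel));	/* little-endian displacement */
}

int main(void)
{
	uint8_t text[16] = { 0 };
	int32_t rel;

	/* jump from &text[0] to &text[7]: displacement is 7 - 5 = 2 */
	synthesize_relative_insn(&text[0], &text[7], RELATIVEJUMP_OPCODE);
	memcpy(&rel, &text[1], sizeof(rel));
	printf("opcode=%#x rel32=%d\n", text[0], rel);
	return 0;
}
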
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
202 /* 209 /*
203 * Basically, kp->ainsn.insn has an original instruction. 210 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping 211 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of 212 * at different place, __copy_instruction() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction 213 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn. 214 * from the kp->ainsn.insn.
208 * 215 *
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
284} 291}
285 292
286/* 293/*
287 * Adjust the displacement if the instruction uses the %rip-relative 294 * Copy an instruction and adjust the displacement if the instruction
288 * addressing mode. 295 * uses the %rip-relative addressing mode.
289 * If it does, Return the address of the 32-bit displacement word. 296 * If it does, Return the address of the 32-bit displacement word.
290 * If not, return null. 297 * If not, return null.
291 * Only applicable to 64-bit x86. 298 * Only applicable to 64-bit x86.
292 */ 299 */
293static void __kprobes fix_riprel(struct kprobe *p) 300static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
294{ 301{
295#ifdef CONFIG_X86_64
296 struct insn insn; 302 struct insn insn;
297 kernel_insn_init(&insn, p->ainsn.insn); 303 int ret;
304 kprobe_opcode_t buf[MAX_INSN_SIZE];
298 305
306 kernel_insn_init(&insn, src);
307 if (recover) {
308 insn_get_opcode(&insn);
309 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
310 ret = recover_probed_instruction(buf,
311 (unsigned long)src);
312 if (ret)
313 return 0;
314 kernel_insn_init(&insn, buf);
315 }
316 }
317 insn_get_length(&insn);
318 memcpy(dest, insn.kaddr, insn.length);
319
320#ifdef CONFIG_X86_64
299 if (insn_rip_relative(&insn)) { 321 if (insn_rip_relative(&insn)) {
300 s64 newdisp; 322 s64 newdisp;
301 u8 *disp; 323 u8 *disp;
324 kernel_insn_init(&insn, dest);
302 insn_get_displacement(&insn); 325 insn_get_displacement(&insn);
303 /* 326 /*
304 * The copied instruction uses the %rip-relative addressing 327 * The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
312 * extension of the original signed 32-bit displacement would 335 * extension of the original signed 32-bit displacement would
313 * have given. 336 * have given.
314 */ 337 */
315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value - 338 newdisp = (u8 *) src + (s64) insn.displacement.value -
316 (u8 *) p->ainsn.insn; 339 (u8 *) dest;
317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 340 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); 341 disp = (u8 *) dest + insn_offset_displacement(&insn);
319 *(s32 *) disp = (s32) newdisp; 342 *(s32 *) disp = (s32) newdisp;
320 } 343 }
321#endif 344#endif
345 return insn.length;
322} 346}
323 347
324static void __kprobes arch_copy_kprobe(struct kprobe *p) 348static void __kprobes arch_copy_kprobe(struct kprobe *p)
325{ 349{
326 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 350 /*
327 351 * Copy an instruction without recovering int3, because it will be
328 fix_riprel(p); 352 * put by another subsystem.
353 */
354 __copy_instruction(p->ainsn.insn, p->addr, 0);
329 355
330 if (can_boost(p->addr)) 356 if (can_boost(p->addr))
331 p->ainsn.boostable = 0; 357 p->ainsn.boostable = 0;
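
A standalone sketch of the displacement rebasing done in the hunk above: when a RIP-relative instruction is copied from src to dest, the new displacement must be newdisp = src + disp - dest so that the absolute target, next_ip + displacement, is preserved (the instruction length cancels out). The addresses and displacement below are made-up example values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t src  = 0xffffffff81000000ULL;	/* original instruction address */
	uint64_t dest = 0xffffffffa0002000ULL;	/* out-of-line copy address */
	int32_t  disp = 0x1234;			/* displacement in the original */

	int64_t newdisp = (int64_t)(src + disp) - (int64_t)dest;

	printf("old disp %#x -> new disp %lld (fits in s32: %d)\n",
	       disp, (long long)newdisp, newdisp == (int32_t)newdisp);
	return 0;
}
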
@@ -337,6 +363,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
337 363
338int __kprobes arch_prepare_kprobe(struct kprobe *p) 364int __kprobes arch_prepare_kprobe(struct kprobe *p)
339{ 365{
366 if (alternatives_text_reserved(p->addr, p->addr))
367 return -EINVAL;
368
340 if (!can_probe((unsigned long)p->addr)) 369 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ; 370 return -EILSEQ;
342 /* insn: must be on special executable page on x86. */ 371 /* insn: must be on special executable page on x86. */
@@ -403,18 +432,6 @@ static void __kprobes restore_btf(void)
403 update_debugctlmsr(current->thread.debugctlmsr); 432 update_debugctlmsr(current->thread.debugctlmsr);
404} 433}
405 434
406static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
407{
408 clear_btf();
409 regs->flags |= X86_EFLAGS_TF;
410 regs->flags &= ~X86_EFLAGS_IF;
411 /* single step inline if the instruction is an int3 */
412 if (p->opcode == BREAKPOINT_INSTRUCTION)
413 regs->ip = (unsigned long)p->addr;
414 else
415 regs->ip = (unsigned long)p->ainsn.insn;
416}
417
418void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
419 struct pt_regs *regs) 436 struct pt_regs *regs)
420{ 437{
@@ -426,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
426 *sara = (unsigned long) &kretprobe_trampoline; 443 *sara = (unsigned long) &kretprobe_trampoline;
427} 444}
428 445
446#ifdef CONFIG_OPTPROBES
447static int __kprobes setup_detour_execution(struct kprobe *p,
448 struct pt_regs *regs,
449 int reenter);
450#else
451#define setup_detour_execution(p, regs, reenter) (0)
452#endif
453
429static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 454static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
430 struct kprobe_ctlblk *kcb) 455 struct kprobe_ctlblk *kcb, int reenter)
431{ 456{
432#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) 457 if (setup_detour_execution(p, regs, reenter))
458 return;
459
460#if !defined(CONFIG_PREEMPT)
433 if (p->ainsn.boostable == 1 && !p->post_handler) { 461 if (p->ainsn.boostable == 1 && !p->post_handler) {
434 /* Boost up -- we can execute copied instructions directly */ 462 /* Boost up -- we can execute copied instructions directly */
435 reset_current_kprobe(); 463 if (!reenter)
464 reset_current_kprobe();
465 /*
466 * Reentering boosted probe doesn't reset current_kprobe,
467 * nor set current_kprobe, because it doesn't use single
468 * stepping.
469 */
436 regs->ip = (unsigned long)p->ainsn.insn; 470 regs->ip = (unsigned long)p->ainsn.insn;
437 preempt_enable_no_resched(); 471 preempt_enable_no_resched();
438 return; 472 return;
439 } 473 }
440#endif 474#endif
441 prepare_singlestep(p, regs); 475 if (reenter) {
442 kcb->kprobe_status = KPROBE_HIT_SS; 476 save_previous_kprobe(kcb);
477 set_current_kprobe(p, regs, kcb);
478 kcb->kprobe_status = KPROBE_REENTER;
479 } else
480 kcb->kprobe_status = KPROBE_HIT_SS;
481 /* Prepare real single stepping */
482 clear_btf();
483 regs->flags |= X86_EFLAGS_TF;
484 regs->flags &= ~X86_EFLAGS_IF;
485 /* single step inline if the instruction is an int3 */
486 if (p->opcode == BREAKPOINT_INSTRUCTION)
487 regs->ip = (unsigned long)p->addr;
488 else
489 regs->ip = (unsigned long)p->ainsn.insn;
443} 490}
444 491
445/* 492/*
@@ -453,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
453 switch (kcb->kprobe_status) { 500 switch (kcb->kprobe_status) {
454 case KPROBE_HIT_SSDONE: 501 case KPROBE_HIT_SSDONE:
455 case KPROBE_HIT_ACTIVE: 502 case KPROBE_HIT_ACTIVE:
456 save_previous_kprobe(kcb);
457 set_current_kprobe(p, regs, kcb);
458 kprobes_inc_nmissed_count(p); 503 kprobes_inc_nmissed_count(p);
459 prepare_singlestep(p, regs); 504 setup_singlestep(p, regs, kcb, 1);
460 kcb->kprobe_status = KPROBE_REENTER;
461 break; 505 break;
462 case KPROBE_HIT_SS: 506 case KPROBE_HIT_SS:
463 /* A probe has been hit in the codepath leading up to, or just 507 /* A probe has been hit in the codepath leading up to, or just
@@ -532,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
532 * more here. 576 * more here.
533 */ 577 */
534 if (!p->pre_handler || !p->pre_handler(p, regs)) 578 if (!p->pre_handler || !p->pre_handler(p, regs))
535 setup_singlestep(p, regs, kcb); 579 setup_singlestep(p, regs, kcb, 0);
536 return 1; 580 return 1;
537 } 581 }
538 } else if (kprobe_running()) { 582 } else if (kprobe_running()) {
539 p = __get_cpu_var(current_kprobe); 583 p = __get_cpu_var(current_kprobe);
540 if (p->break_handler && p->break_handler(p, regs)) { 584 if (p->break_handler && p->break_handler(p, regs)) {
541 setup_singlestep(p, regs, kcb); 585 setup_singlestep(p, regs, kcb, 0);
542 return 1; 586 return 1;
543 } 587 }
544 } /* else: not a kprobe fault; let the kernel handle it */ 588 } /* else: not a kprobe fault; let the kernel handle it */
@@ -547,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
547 return 0; 591 return 0;
548} 592}
549 593
594#ifdef CONFIG_X86_64
595#define SAVE_REGS_STRING \
596 /* Skip cs, ip, orig_ax. */ \
597 " subq $24, %rsp\n" \
598 " pushq %rdi\n" \
599 " pushq %rsi\n" \
600 " pushq %rdx\n" \
601 " pushq %rcx\n" \
602 " pushq %rax\n" \
603 " pushq %r8\n" \
604 " pushq %r9\n" \
605 " pushq %r10\n" \
606 " pushq %r11\n" \
607 " pushq %rbx\n" \
608 " pushq %rbp\n" \
609 " pushq %r12\n" \
610 " pushq %r13\n" \
611 " pushq %r14\n" \
612 " pushq %r15\n"
613#define RESTORE_REGS_STRING \
614 " popq %r15\n" \
615 " popq %r14\n" \
616 " popq %r13\n" \
617 " popq %r12\n" \
618 " popq %rbp\n" \
619 " popq %rbx\n" \
620 " popq %r11\n" \
621 " popq %r10\n" \
622 " popq %r9\n" \
623 " popq %r8\n" \
624 " popq %rax\n" \
625 " popq %rcx\n" \
626 " popq %rdx\n" \
627 " popq %rsi\n" \
628 " popq %rdi\n" \
629 /* Skip orig_ax, ip, cs */ \
630 " addq $24, %rsp\n"
631#else
632#define SAVE_REGS_STRING \
633 /* Skip cs, ip, orig_ax and gs. */ \
634 " subl $16, %esp\n" \
635 " pushl %fs\n" \
636 " pushl %ds\n" \
637 " pushl %es\n" \
638 " pushl %eax\n" \
639 " pushl %ebp\n" \
640 " pushl %edi\n" \
641 " pushl %esi\n" \
642 " pushl %edx\n" \
643 " pushl %ecx\n" \
644 " pushl %ebx\n"
645#define RESTORE_REGS_STRING \
646 " popl %ebx\n" \
647 " popl %ecx\n" \
648 " popl %edx\n" \
649 " popl %esi\n" \
650 " popl %edi\n" \
651 " popl %ebp\n" \
652 " popl %eax\n" \
653 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
654 " addl $24, %esp\n"
655#endif
656
550/* 657/*
551 * When a retprobed function returns, this code saves registers and 658 * When a retprobed function returns, this code saves registers and
552 * calls trampoline_handler() runs, which calls the kretprobe's handler. 659 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -560,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
560 /* We don't bother saving the ss register */ 667 /* We don't bother saving the ss register */
561 " pushq %rsp\n" 668 " pushq %rsp\n"
562 " pushfq\n" 669 " pushfq\n"
563 /* 670 SAVE_REGS_STRING
564 * Skip cs, ip, orig_ax.
565 * trampoline_handler() will plug in these values
566 */
567 " subq $24, %rsp\n"
568 " pushq %rdi\n"
569 " pushq %rsi\n"
570 " pushq %rdx\n"
571 " pushq %rcx\n"
572 " pushq %rax\n"
573 " pushq %r8\n"
574 " pushq %r9\n"
575 " pushq %r10\n"
576 " pushq %r11\n"
577 " pushq %rbx\n"
578 " pushq %rbp\n"
579 " pushq %r12\n"
580 " pushq %r13\n"
581 " pushq %r14\n"
582 " pushq %r15\n"
583 " movq %rsp, %rdi\n" 671 " movq %rsp, %rdi\n"
584 " call trampoline_handler\n" 672 " call trampoline_handler\n"
585 /* Replace saved sp with true return address. */ 673 /* Replace saved sp with true return address. */
586 " movq %rax, 152(%rsp)\n" 674 " movq %rax, 152(%rsp)\n"
587 " popq %r15\n" 675 RESTORE_REGS_STRING
588 " popq %r14\n"
589 " popq %r13\n"
590 " popq %r12\n"
591 " popq %rbp\n"
592 " popq %rbx\n"
593 " popq %r11\n"
594 " popq %r10\n"
595 " popq %r9\n"
596 " popq %r8\n"
597 " popq %rax\n"
598 " popq %rcx\n"
599 " popq %rdx\n"
600 " popq %rsi\n"
601 " popq %rdi\n"
602 /* Skip orig_ax, ip, cs */
603 " addq $24, %rsp\n"
604 " popfq\n" 676 " popfq\n"
605#else 677#else
606 " pushf\n" 678 " pushf\n"
607 /* 679 SAVE_REGS_STRING
608 * Skip cs, ip, orig_ax and gs.
609 * trampoline_handler() will plug in these values
610 */
611 " subl $16, %esp\n"
612 " pushl %fs\n"
613 " pushl %es\n"
614 " pushl %ds\n"
615 " pushl %eax\n"
616 " pushl %ebp\n"
617 " pushl %edi\n"
618 " pushl %esi\n"
619 " pushl %edx\n"
620 " pushl %ecx\n"
621 " pushl %ebx\n"
622 " movl %esp, %eax\n" 680 " movl %esp, %eax\n"
623 " call trampoline_handler\n" 681 " call trampoline_handler\n"
624 /* Move flags to cs */ 682 /* Move flags to cs */
@@ -626,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
626 " movl %edx, 52(%esp)\n" 684 " movl %edx, 52(%esp)\n"
627 /* Replace saved flags with true return address. */ 685 /* Replace saved flags with true return address. */
628 " movl %eax, 56(%esp)\n" 686 " movl %eax, 56(%esp)\n"
629 " popl %ebx\n" 687 RESTORE_REGS_STRING
630 " popl %ecx\n"
631 " popl %edx\n"
632 " popl %esi\n"
633 " popl %edi\n"
634 " popl %ebp\n"
635 " popl %eax\n"
636 /* Skip ds, es, fs, gs, orig_ax and ip */
637 " addl $24, %esp\n"
638 " popf\n" 688 " popf\n"
639#endif 689#endif
640 " ret\n"); 690 " ret\n");
@@ -802,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
802 * These instructions can be executed directly if it 852 * These instructions can be executed directly if it
803 * jumps back to correct address. 853 * jumps back to correct address.
804 */ 854 */
805 set_jmp_op((void *)regs->ip, 855 synthesize_reljump((void *)regs->ip,
806 (void *)orig_ip + (regs->ip - copy_ip)); 856 (void *)orig_ip + (regs->ip - copy_ip));
807 p->ainsn.boostable = 1; 857 p->ainsn.boostable = 1;
808 } else { 858 } else {
809 p->ainsn.boostable = -1; 859 p->ainsn.boostable = -1;
@@ -1030,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1030 return 0; 1080 return 0;
1031} 1081}
1032 1082
1083
1084#ifdef CONFIG_OPTPROBES
1085
1086/* Insert a call instruction at address 'from', which calls address 'to'.*/
1087static void __kprobes synthesize_relcall(void *from, void *to)
1088{
1089 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1090}
1091
1092/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1093static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1094 unsigned long val)
1095{
1096#ifdef CONFIG_X86_64
1097 *addr++ = 0x48;
1098 *addr++ = 0xbf;
1099#else
1100 *addr++ = 0xb8;
1101#endif
1102 *(unsigned long *)addr = val;
1103}
1104
1105void __kprobes kprobes_optinsn_template_holder(void)
1106{
1107 asm volatile (
1108 ".global optprobe_template_entry\n"
1109 "optprobe_template_entry: \n"
1110#ifdef CONFIG_X86_64
1111 /* We don't bother saving the ss register */
1112 " pushq %rsp\n"
1113 " pushfq\n"
1114 SAVE_REGS_STRING
1115 " movq %rsp, %rsi\n"
1116 ".global optprobe_template_val\n"
1117 "optprobe_template_val: \n"
1118 ASM_NOP5
1119 ASM_NOP5
1120 ".global optprobe_template_call\n"
1121 "optprobe_template_call: \n"
1122 ASM_NOP5
1123 /* Move flags to rsp */
1124 " movq 144(%rsp), %rdx\n"
1125 " movq %rdx, 152(%rsp)\n"
1126 RESTORE_REGS_STRING
1127 /* Skip flags entry */
1128 " addq $8, %rsp\n"
1129 " popfq\n"
1130#else /* CONFIG_X86_32 */
1131 " pushf\n"
1132 SAVE_REGS_STRING
1133 " movl %esp, %edx\n"
1134 ".global optprobe_template_val\n"
1135 "optprobe_template_val: \n"
1136 ASM_NOP5
1137 ".global optprobe_template_call\n"
1138 "optprobe_template_call: \n"
1139 ASM_NOP5
1140 RESTORE_REGS_STRING
1141 " addl $4, %esp\n" /* skip cs */
1142 " popf\n"
1143#endif
1144 ".global optprobe_template_end\n"
1145 "optprobe_template_end: \n");
1146}
1147
1148#define TMPL_MOVE_IDX \
1149 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1150#define TMPL_CALL_IDX \
1151 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1152#define TMPL_END_IDX \
1153 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1154
1155#define INT3_SIZE sizeof(kprobe_opcode_t)
1156
1157/* Optimized kprobe call back function: called from optinsn */
1158static void __kprobes optimized_callback(struct optimized_kprobe *op,
1159 struct pt_regs *regs)
1160{
1161 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1162
1163 preempt_disable();
1164 if (kprobe_running()) {
1165 kprobes_inc_nmissed_count(&op->kp);
1166 } else {
1167 /* Save skipped registers */
1168#ifdef CONFIG_X86_64
1169 regs->cs = __KERNEL_CS;
1170#else
1171 regs->cs = __KERNEL_CS | get_kernel_rpl();
1172 regs->gs = 0;
1173#endif
1174 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1175 regs->orig_ax = ~0UL;
1176
1177 __get_cpu_var(current_kprobe) = &op->kp;
1178 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1179 opt_pre_handler(&op->kp, regs);
1180 __get_cpu_var(current_kprobe) = NULL;
1181 }
1182 preempt_enable_no_resched();
1183}
1184
1185static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1186{
1187 int len = 0, ret;
1188
1189 while (len < RELATIVEJUMP_SIZE) {
1190 ret = __copy_instruction(dest + len, src + len, 1);
1191 if (!ret || !can_boost(dest + len))
1192 return -EINVAL;
1193 len += ret;
1194 }
1195 /* Check whether the address range is reserved */
1196 if (ftrace_text_reserved(src, src + len - 1) ||
1197 alternatives_text_reserved(src, src + len - 1))
1198 return -EBUSY;
1199
1200 return len;
1201}
1202
1203/* Check whether insn is indirect jump */
1204static int __kprobes insn_is_indirect_jump(struct insn *insn)
1205{
1206 return ((insn->opcode.bytes[0] == 0xff &&
1207 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1208 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1209}
1210
1211/* Check whether insn jumps into specified address range */
1212static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1213{
1214 unsigned long target = 0;
1215
1216 switch (insn->opcode.bytes[0]) {
1217 case 0xe0: /* loopne */
1218 case 0xe1: /* loope */
1219 case 0xe2: /* loop */
1220 case 0xe3: /* jcxz */
1221 case 0xe9: /* near relative jump */
1222 case 0xeb: /* short relative jump */
1223 break;
1224 case 0x0f:
1225 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1226 break;
1227 return 0;
1228 default:
1229 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1230 break;
1231 return 0;
1232 }
1233 target = (unsigned long)insn->next_byte + insn->immediate.value;
1234
1235 return (start <= target && target <= start + len);
1236}
1237
1238/* Decode whole function to ensure any instructions don't jump into target */
1239static int __kprobes can_optimize(unsigned long paddr)
1240{
1241 int ret;
1242 unsigned long addr, size = 0, offset = 0;
1243 struct insn insn;
1244 kprobe_opcode_t buf[MAX_INSN_SIZE];
1245 /* Dummy buffers for lookup_symbol_attrs */
1246 static char __dummy_buf[KSYM_NAME_LEN];
1247
1248 /* Lookup symbol including addr */
1249 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
1250 return 0;
1251
1252 /* Check there is enough space for a relative jump. */
1253 if (size - offset < RELATIVEJUMP_SIZE)
1254 return 0;
1255
1256 /* Decode instructions */
1257 addr = paddr - offset;
1258 while (addr < paddr - offset + size) { /* Decode until function end */
1259 if (search_exception_tables(addr))
1260 /*
1261 * Since some fixup code will jumps into this function,
1262 * we can't optimize kprobe in this function.
1263 */
1264 return 0;
1265 kernel_insn_init(&insn, (void *)addr);
1266 insn_get_opcode(&insn);
1267 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1268 ret = recover_probed_instruction(buf, addr);
1269 if (ret)
1270 return 0;
1271 kernel_insn_init(&insn, buf);
1272 }
1273 insn_get_length(&insn);
1274 /* Recover address */
1275 insn.kaddr = (void *)addr;
1276 insn.next_byte = (void *)(addr + insn.length);
1277 /* Check any instructions don't jump into target */
1278 if (insn_is_indirect_jump(&insn) ||
1279 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1280 RELATIVE_ADDR_SIZE))
1281 return 0;
1282 addr += insn.length;
1283 }
1284
1285 return 1;
1286}
1287
1288/* Check optimized_kprobe can actually be optimized. */
1289int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1290{
1291 int i;
1292 struct kprobe *p;
1293
1294 for (i = 1; i < op->optinsn.size; i++) {
1295 p = get_kprobe(op->kp.addr + i);
1296 if (p && !kprobe_disabled(p))
1297 return -EEXIST;
1298 }
1299
1300 return 0;
1301}
1302
1303/* Check the addr is within the optimized instructions. */
1304int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1305 unsigned long addr)
1306{
1307 return ((unsigned long)op->kp.addr <= addr &&
1308 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1309}
1310
1311/* Free optimized instruction slot */
1312static __kprobes
1313void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1314{
1315 if (op->optinsn.insn) {
1316 free_optinsn_slot(op->optinsn.insn, dirty);
1317 op->optinsn.insn = NULL;
1318 op->optinsn.size = 0;
1319 }
1320}
1321
1322void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1323{
1324 __arch_remove_optimized_kprobe(op, 1);
1325}
1326
1327/*
1328 * Copy the target instructions that will be replaced
1329 * Target instructions MUST be relocatable (checked inside)
1330 */
1331int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1332{
1333 u8 *buf;
1334 int ret;
1335 long rel;
1336
1337 if (!can_optimize((unsigned long)op->kp.addr))
1338 return -EILSEQ;
1339
1340 op->optinsn.insn = get_optinsn_slot();
1341 if (!op->optinsn.insn)
1342 return -ENOMEM;
1343
1344 /*
1345	 * Verify that the address gap is within the +/-2GB range reachable
1346	 * by a relative jump.
1347 */
1348 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1349 if (abs(rel) > 0x7fffffff)
1350 return -ERANGE;
1351
1352 buf = (u8 *)op->optinsn.insn;
1353
1354 /* Copy instructions into the out-of-line buffer */
1355 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1356 if (ret < 0) {
1357 __arch_remove_optimized_kprobe(op, 0);
1358 return ret;
1359 }
1360 op->optinsn.size = ret;
1361
1362 /* Copy arch-dep-instance from template */
1363 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1364
1365 /* Set probe information */
1366 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1367
1368 /* Set probe function call */
1369 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1370
1371 /* Set returning jmp instruction at the tail of out-of-line buffer */
1372 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1373 (u8 *)op->kp.addr + op->optinsn.size);
1374
1375 flush_icache_range((unsigned long) buf,
1376 (unsigned long) buf + TMPL_END_IDX +
1377 op->optinsn.size + RELATIVEJUMP_SIZE);
1378 return 0;
1379}
1380
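
The range test above (abs(rel) > 0x7fffffff) exists because the detour buffer is reached with a single jmp rel32, whose signed 32-bit displacement is taken from the end of the 5-byte jump. A standalone sketch of the same reachability test; RELJMP_SIZE here simply mirrors RELATIVEJUMP_SIZE and is an assumption of this sketch:

#include <stdbool.h>
#include <stdint.h>

#define RELJMP_SIZE 5	/* opcode 0xe9 + rel32 */

/* Can a jmp rel32 placed at 'from' reach 'to'? */
static bool reljmp_reachable(uint64_t from, uint64_t to)
{
	int64_t rel = (int64_t)to - (int64_t)(from + RELJMP_SIZE);

	return rel >= INT32_MIN && rel <= INT32_MAX;
}

int main(void)
{
	/* a 1GB gap is fine; a 3GB gap would not be */
	return reljmp_reachable(0x400000, 0x400000 + (1ULL << 30)) ? 0 : 1;
}
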
1381/* Replace a breakpoint (int3) with a relative jump. */
1382int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1383{
1384 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1385 s32 rel = (s32)((long)op->optinsn.insn -
1386 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1387
1388	/* Back up the instruction bytes that will be overwritten by the jump's relative address */
1389 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1390 RELATIVE_ADDR_SIZE);
1391
1392 jmp_code[0] = RELATIVEJUMP_OPCODE;
1393 *(s32 *)(&jmp_code[1]) = rel;
1394
1395 /*
1396	 * text_poke_smp doesn't support modifying code that may run in
1397	 * NMI/MCE context.  However, since kprobes itself also doesn't
1398	 * support probing NMI/MCE code, this is not a problem.
1399 */
1400 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
1401 return 0;
1402}
1403
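
For reference, the 5-byte sequence installed above is nothing more than the near-jump opcode 0xe9 followed by a little-endian rel32. A standalone sketch of that encoding; it deliberately does not model text_poke_smp() or any of the cross-CPU synchronization the kernel needs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void make_reljump(uint8_t buf[5], uint64_t addr, uint64_t target)
{
	/* the displacement is relative to the byte after the 5-byte jump */
	int32_t rel = (int32_t)(target - (addr + 5));

	buf[0] = 0xe9;				/* jmp rel32 */
	memcpy(&buf[1], &rel, sizeof(rel));	/* x86 is little-endian */
}

int main(void)
{
	uint8_t buf[5];
	int i;

	make_reljump(buf, 0x1000, 0x1020);	/* rel32 = 0x1b */
	for (i = 0; i < 5; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}
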
1404/* Replace a relative jump with a breakpoint (int3). */
1405void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1406{
1407 u8 buf[RELATIVEJUMP_SIZE];
1408
1409 /* Set int3 to first byte for kprobes */
1410 buf[0] = BREAKPOINT_INSTRUCTION;
1411 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1412 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1413}
1414
1415static int __kprobes setup_detour_execution(struct kprobe *p,
1416 struct pt_regs *regs,
1417 int reenter)
1418{
1419 struct optimized_kprobe *op;
1420
1421 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1422		/* This kprobe can really take the optimized path. */
1423 op = container_of(p, struct optimized_kprobe, kp);
1424 /* Detour through copied instructions */
1425 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1426 if (!reenter)
1427 reset_current_kprobe();
1428 preempt_enable_no_resched();
1429 return 1;
1430 }
1431 return 0;
1432}
1433#endif
1434
1033int __init arch_init_kprobes(void) 1435int __init arch_init_kprobes(void)
1034{ 1436{
1035 return 0; 1437 return 0;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60cbd17..ea697263b373 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/gfp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/string.h> 12#include <linux/string.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4a8bb82248ae..035c8c529181 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/gfp.h>
12#include <linux/reboot.h> 13#include <linux/reboot.h>
13#include <linux/numa.h> 14#include <linux/numa.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 845d80ce1ef1..63eaf6596233 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -42,6 +42,7 @@
42#include <linux/kernel.h> 42#include <linux/kernel.h>
43#include <linux/mca.h> 43#include <linux/mca.h>
44#include <linux/kprobes.h> 44#include <linux/kprobes.h>
45#include <linux/slab.h>
45#include <asm/system.h> 46#include <asm/system.h>
46#include <asm/io.h> 47#include <asm/io.h>
47#include <linux/proc_fs.h> 48#include <linux/proc_fs.h>
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 37542b67c57e..e1af7c055c7d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -36,9 +36,6 @@ MODULE_LICENSE("GPL v2");
36#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 36#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
37#define UCODE_UCODE_TYPE 0x00000001 37#define UCODE_UCODE_TYPE 0x00000001
38 38
39const struct firmware *firmware;
40static int supported_cpu;
41
42struct equiv_cpu_entry { 39struct equiv_cpu_entry {
43 u32 installed_cpu; 40 u32 installed_cpu;
44 u32 fixed_errata_mask; 41 u32 fixed_errata_mask;
@@ -77,12 +74,15 @@ static struct equiv_cpu_entry *equiv_cpu_table;
77 74
78static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 75static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
79{ 76{
77 struct cpuinfo_x86 *c = &cpu_data(cpu);
80 u32 dummy; 78 u32 dummy;
81 79
82 if (!supported_cpu)
83 return -1;
84
85 memset(csig, 0, sizeof(*csig)); 80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
83 "supported\n", cpu, c->x86);
84 return -1;
85 }
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
88 return 0; 88 return 0;
@@ -294,10 +294,14 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
294 294
295static enum ucode_state request_microcode_fw(int cpu, struct device *device) 295static enum ucode_state request_microcode_fw(int cpu, struct device *device)
296{ 296{
297 const char *fw_name = "amd-ucode/microcode_amd.bin";
298 const struct firmware *firmware;
297 enum ucode_state ret; 299 enum ucode_state ret;
298 300
299 if (firmware == NULL) 301 if (request_firmware(&firmware, fw_name, device)) {
302 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
300 return UCODE_NFOUND; 303 return UCODE_NFOUND;
304 }
301 305
302 if (*(u32 *)firmware->data != UCODE_MAGIC) { 306 if (*(u32 *)firmware->data != UCODE_MAGIC) {
303 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 307 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
@@ -307,6 +311,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
307 311
308 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 312 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
309 313
314 release_firmware(firmware);
315
310 return ret; 316 return ret;
311} 317}
312 318
@@ -325,31 +331,7 @@ static void microcode_fini_cpu_amd(int cpu)
325 uci->mc = NULL; 331 uci->mc = NULL;
326} 332}
327 333
328void init_microcode_amd(struct device *device)
329{
330 const char *fw_name = "amd-ucode/microcode_amd.bin";
331 struct cpuinfo_x86 *c = &boot_cpu_data;
332
333 WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
334
335 if (c->x86 < 0x10) {
336 pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
337 return;
338 }
339 supported_cpu = 1;
340
341 if (request_firmware(&firmware, fw_name, device))
342 pr_err("failed to load file %s\n", fw_name);
343}
344
345void fini_microcode_amd(void)
346{
347 release_firmware(firmware);
348}
349
350static struct microcode_ops microcode_amd_ops = { 334static struct microcode_ops microcode_amd_ops = {
351 .init = init_microcode_amd,
352 .fini = fini_microcode_amd,
353 .request_microcode_user = request_microcode_user, 335 .request_microcode_user = request_microcode_user,
354 .request_microcode_fw = request_microcode_fw, 336 .request_microcode_fw = request_microcode_fw,
355 .collect_cpu_info = collect_cpu_info_amd, 337 .collect_cpu_info = collect_cpu_info_amd,
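
The rewritten request_microcode_fw() above moves to the usual on-demand firmware pattern: request the blob, consume it, release it, with no long-lived global handle. A minimal kernel-style sketch of that pattern, assuming only <linux/firmware.h>; consume_blob() is a hypothetical stand-in for generic_load_microcode():

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/firmware.h>

/* hypothetical consumer, standing in for generic_load_microcode() */
static int consume_blob(const u8 *data, size_t size)
{
	return (data && size) ? 0 : -EINVAL;
}

static int load_blob(struct device *dev, const char *name)
{
	const struct firmware *fw;
	int err;

	err = request_firmware(&fw, name, dev);	/* may sleep */
	if (err)
		return err;

	err = consume_blob(fw->data, fw->size);

	release_firmware(fw);	/* always drop the reference */
	return err;
}
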
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 0c8632433090..cceb5bc3c3c2 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -521,9 +521,6 @@ static int __init microcode_init(void)
521 return PTR_ERR(microcode_pdev); 521 return PTR_ERR(microcode_pdev);
522 } 522 }
523 523
524 if (microcode_ops->init)
525 microcode_ops->init(&microcode_pdev->dev);
526
527 get_online_cpus(); 524 get_online_cpus();
528 mutex_lock(&microcode_mutex); 525 mutex_lock(&microcode_mutex);
529 526
@@ -566,9 +563,6 @@ static void __exit microcode_exit(void)
566 563
567 platform_device_unregister(microcode_pdev); 564 platform_device_unregister(microcode_pdev);
568 565
569 if (microcode_ops->fini)
570 microcode_ops->fini();
571
572 microcode_ops = NULL; 566 microcode_ops = NULL;
573 567
574 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index ebd193e476ca..85a343e28937 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -328,7 +328,7 @@ static int apply_microcode(int cpu)
328 cpu_num, mc_intel->hdr.rev); 328 cpu_num, mc_intel->hdr.rev);
329 return -1; 329 return -1;
330 } 330 }
331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
332 cpu_num, val[1], 332 cpu_num, val[1],
333 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
334 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..71825806cd44 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/dmi.h> 9#include <linux/dmi.h>
10#include <linux/range.h>
11
10#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
11#include <linux/sort.h> 13#include <linux/sort.h>
12#include <asm/io.h> 14#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
30 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 32 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
31}; 33};
32 34
33struct range {
34 u64 start;
35 u64 end;
36};
37
38static int __cpuinit cmp_range(const void *x1, const void *x2) 35static int __cpuinit cmp_range(const void *x1, const void *x2)
39{ 36{
40 const struct range *r1 = x1; 37 const struct range *r1 = x1;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 89f386f044e4..e0bc186d7501 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -23,6 +23,7 @@
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h>
26 27
27#include <asm/system.h> 28#include <asm/system.h>
28#include <asm/page.h> 29#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 40b54ceb68b5..e81030f71a8f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
359 x86_init.mpparse.mpc_record(1); 359 x86_init.mpparse.mpc_record(1);
360 } 360 }
361 361
362#ifdef CONFIG_X86_BIGSMP
363 generic_bigsmp_probe();
364#endif
365
366 if (apic->setup_apic_routing)
367 apic->setup_apic_routing();
368
369 if (!num_processors) 362 if (!num_processors)
370 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 363 printk(KERN_ERR "MPTABLE: no processors registered!\n");
371 return num_processors; 364 return num_processors;
@@ -671,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 664{
672 unsigned long size = get_mpc_size(mpf->physptr); 665 unsigned long size = get_mpc_size(mpf->physptr);
673 666
674 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); 667 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
675} 668}
676 669
677static int __init smp_scan_config(unsigned long base, unsigned long length) 670static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -700,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
700 mpf, (u64)virt_to_phys(mpf)); 693 mpf, (u64)virt_to_phys(mpf));
701 694
702 mem = virt_to_phys(mpf); 695 mem = virt_to_phys(mpf);
703 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); 696 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
704 if (mpf->physptr) 697 if (mpf->physptr)
705 smp_reserve_memory(mpf); 698 smp_reserve_memory(mpf);
706 699
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc871..0aad8670858e 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
10 * of the License. 10 * of the License.
11 */ 11 */
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
13 17
14#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
30int sfi_mtimer_num;
31
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
33EXPORT_SYMBOL_GPL(sfi_mrtc_array);
34int sfi_mrtc_num;
35
36static inline void assign_to_mp_irq(struct mpc_intsrc *m,
37 struct mpc_intsrc *mp_irq)
38{
39 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
40}
41
42static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
43 struct mpc_intsrc *m)
44{
45 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
46}
47
48static void save_mp_irq(struct mpc_intsrc *m)
49{
50 int i;
51
52 for (i = 0; i < mp_irq_entries; i++) {
53 if (!mp_irq_cmp(&mp_irqs[i], m))
54 return;
55 }
56
57 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
58 if (++mp_irq_entries == MAX_IRQ_SOURCES)
59 panic("Max # of irq sources exceeded!!\n");
60}
61
62/* parse all the mtimer info to a static mtimer array */
63static int __init sfi_parse_mtmr(struct sfi_table_header *table)
64{
65 struct sfi_table_simple *sb;
66 struct sfi_timer_table_entry *pentry;
67 struct mpc_intsrc mp_irq;
68 int totallen;
69
70 sb = (struct sfi_table_simple *)table;
71 if (!sfi_mtimer_num) {
72 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
73 struct sfi_timer_table_entry);
74 pentry = (struct sfi_timer_table_entry *) sb->pentry;
75 totallen = sfi_mtimer_num * sizeof(*pentry);
76 memcpy(sfi_mtimer_array, pentry, totallen);
77 }
78
79 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
80 pentry = sfi_mtimer_array;
81 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
82 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
83 " irq = %d\n", totallen, (u32)pentry->phys_addr,
84 pentry->freq_hz, pentry->irq);
85 if (!pentry->irq)
86 continue;
87 mp_irq.type = MP_IOAPIC;
88 mp_irq.irqtype = mp_INT;
89/* trigger mode: edge (bits 2-3); polarity: active high (bits 0-1) */
90 mp_irq.irqflag = 5;
91 mp_irq.srcbus = 0;
92 mp_irq.srcbusirq = pentry->irq; /* IRQ */
93 mp_irq.dstapic = MP_APIC_ALL;
94 mp_irq.dstirq = pentry->irq;
95 save_mp_irq(&mp_irq);
96 }
97
98 return 0;
99}
100
101struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
102{
103 int i;
104 if (hint < sfi_mtimer_num) {
105 if (!sfi_mtimer_usage[hint]) {
106			pr_debug("hint taken for timer %d irq %d\n",
107				 hint, sfi_mtimer_array[hint].irq);
108 sfi_mtimer_usage[hint] = 1;
109 return &sfi_mtimer_array[hint];
110 }
111 }
112 /* take the first timer available */
113 for (i = 0; i < sfi_mtimer_num;) {
114 if (!sfi_mtimer_usage[i]) {
115 sfi_mtimer_usage[i] = 1;
116 return &sfi_mtimer_array[i];
117 }
118 i++;
119 }
120 return NULL;
121}
122
123void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
124{
125 int i;
126 for (i = 0; i < sfi_mtimer_num;) {
127 if (mtmr->irq == sfi_mtimer_array[i].irq) {
128 sfi_mtimer_usage[i] = 0;
129 return;
130 }
131 i++;
132 }
133}
134
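
sfi_get_mtmr() and sfi_free_mtmr() above implement a small hint-then-first-free slot allocator over the sfi_mtimer_usage[] array. The same idea in standalone form; NSLOTS and the names here are illustrative, not the SFI ones:

#define NSLOTS 8

static int slot_used[NSLOTS];

/* Claim 'hint' if it is free, otherwise the first free slot; -1 if none. */
static int claim_slot(int hint)
{
	int i;

	if (hint >= 0 && hint < NSLOTS && !slot_used[hint]) {
		slot_used[hint] = 1;
		return hint;
	}
	for (i = 0; i < NSLOTS; i++) {
		if (!slot_used[i]) {
			slot_used[i] = 1;
			return i;
		}
	}
	return -1;
}

static void release_slot(int slot)
{
	if (slot >= 0 && slot < NSLOTS)
		slot_used[slot] = 0;
}

int main(void)
{
	int a = claim_slot(3);	/* takes slot 3 */
	int b = claim_slot(3);	/* slot 3 busy, falls back to slot 0 */

	release_slot(b);
	release_slot(a);
	return 0;
}
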
135/* parse all the mrtc info to a global mrtc array */
136int __init sfi_parse_mrtc(struct sfi_table_header *table)
137{
138 struct sfi_table_simple *sb;
139 struct sfi_rtc_table_entry *pentry;
140 struct mpc_intsrc mp_irq;
141
142 int totallen;
143
144 sb = (struct sfi_table_simple *)table;
145 if (!sfi_mrtc_num) {
146 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
147 struct sfi_rtc_table_entry);
148 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
149 totallen = sfi_mrtc_num * sizeof(*pentry);
150 memcpy(sfi_mrtc_array, pentry, totallen);
151 }
152
153 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
154 pentry = sfi_mrtc_array;
155 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
156 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
157 totallen, (u32)pentry->phys_addr, pentry->irq);
158 mp_irq.type = MP_IOAPIC;
159 mp_irq.irqtype = mp_INT;
160 mp_irq.irqflag = 0;
161 mp_irq.srcbus = 0;
162 mp_irq.srcbusirq = pentry->irq; /* IRQ */
163 mp_irq.dstapic = MP_APIC_ALL;
164 mp_irq.dstirq = pentry->irq;
165 save_mp_irq(&mp_irq);
166 }
167 return 0;
168}
169
170/*
171 * The secondary clock in Moorestown can be the APBT or the LAPIC clock;
172 * it defaults to APBT, but a cmdline option can override that.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void)
183{
184 unsigned long flags, fast_calibrate;
185
186 local_irq_save(flags);
187 fast_calibrate = apbt_quick_calibrate();
188 local_irq_restore(flags);
189
190 if (fast_calibrate)
191 return fast_calibrate;
192
193 return 0;
194}
195
196void __init mrst_time_init(void)
197{
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0();
200 apbt_time_init();
201}
202
203void __init mrst_rtc_init(void)
204{
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206}
207
208/*
209 * If we use the per-cpu APB timer, the boot clock is already set up; if we
210 * use the LAPIC timer plus one APBT timer for broadcast, we need to set up the LAPIC boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{
214	pr_info("%s: per cpu apbt flag %d\n", __func__, disable_apbt_percpu);
215 if (disable_apbt_percpu)
216 setup_boot_APIC_clock();
217}
15 218
16/* 219/*
17 * Moorestown specific x86_init function overrides and early setup 220 * Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
21{ 224{
22 x86_init.resources.probe_roms = x86_init_noop; 225 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop; 226 x86_init.resources.reserve_resources = x86_init_noop;
227
228 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
230
231 x86_init.irqs.pre_vector_init = x86_init_noop;
232
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
234
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
236 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop;
238
239 legacy_pic = &null_legacy_pic;
24} 240}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4bd93c9b2b27..4d4468e9f47c 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -37,6 +37,7 @@
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/uaccess.h> 39#include <linux/uaccess.h>
40#include <linux/gfp.h>
40 41
41#include <asm/processor.h> 42#include <asm/processor.h>
42#include <asm/msr.h> 43#include <asm/msr.h>
@@ -285,7 +286,7 @@ static void __exit msr_exit(void)
285 for_each_online_cpu(cpu) 286 for_each_online_cpu(cpu)
286 msr_device_destroy(cpu); 287 msr_device_destroy(cpu);
287 class_destroy(msr_class); 288 class_destroy(msr_class);
288 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 289 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
289 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 290 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
290} 291}
291 292
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 9d1d263f786f..8297160c41b3 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20
20#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h>
21#include <asm/olpc.h> 23#include <asm/olpc.h>
22 24
23#ifdef CONFIG_OPEN_FIRMWARE 25#ifdef CONFIG_OPEN_FIRMWARE
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
243 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, 245 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
244 (unsigned char *) &olpc_platform_info.ecver, 1); 246 (unsigned char *) &olpc_platform_info.ecver, 1);
245 247
246 /* check to see if the VSA exists */ 248#ifdef CONFIG_PCI_OLPC
247 if (cs5535_has_vsa2()) 249 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
248 olpc_platform_info.flags |= OLPC_F_VSA; 250 if (!cs5535_has_vsa2())
251 x86_init.pci.arch_init = pci_olpc_init;
252#endif
249 253
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 254 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", 255 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d16310..1db183ed7c01 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 428 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 429 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
430 430
431#ifdef CONFIG_HIGHPTE
432 .kmap_atomic_pte = kmap_atomic,
433#endif
434
435#if PAGETABLE_LEVELS >= 3 431#if PAGETABLE_LEVELS >= 3
436#ifdef CONFIG_X86_PAE 432#ifdef CONFIG_X86_PAE
437 .set_pte_atomic = native_set_pte_atomic, 433 .set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 2bbde6078143..fb99f7edb341 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1309/* 1309/*
1310 * get_tce_space_from_tar(): 1310 * get_tce_space_from_tar():
1311 * Function for kdump case. Get the tce tables from first kernel 1311 * Function for kdump case. Get the tce tables from first kernel
1312 * by reading the contents of the base adress register of calgary iommu 1312 * by reading the contents of the base address register of calgary iommu
1313 */ 1313 */
1314static void __init get_tce_space_from_tar(void) 1314static void __init get_tce_space_from_tar(void)
1315{ 1315{
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e21f61a..4b7e3d8b01dd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -2,6 +2,7 @@
2#include <linux/dma-debug.h> 2#include <linux/dma-debug.h>
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/gfp.h>
5#include <linux/pci.h> 6#include <linux/pci.h>
6#include <linux/kmemleak.h> 7#include <linux/kmemleak.h>
7 8
@@ -38,7 +39,7 @@ int iommu_detected __read_mostly = 0;
38 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 39 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
39 * If this variable is 1, IOMMU implementations do no DMA translation for 40 * If this variable is 1, IOMMU implementations do no DMA translation for
40 * devices and allow every device to access to whole physical memory. This is 41 * devices and allow every device to access to whole physical memory. This is
41 * useful if a user want to use an IOMMU only for KVM device assignment to 42 * useful if a user wants to use an IOMMU only for KVM device assignment to
42 * guests and not for driver dma translation. 43 * guests and not for driver dma translation.
43 */ 44 */
44int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
@@ -65,7 +66,7 @@ int dma_set_mask(struct device *dev, u64 mask)
65} 66}
66EXPORT_SYMBOL(dma_set_mask); 67EXPORT_SYMBOL(dma_set_mask);
67 68
68#ifdef CONFIG_X86_64 69#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
69static __initdata void *dma32_bootmem_ptr; 70static __initdata void *dma32_bootmem_ptr;
70static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); 71static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
71 72
@@ -116,14 +117,21 @@ static void __init dma32_free_bootmem(void)
116 dma32_bootmem_ptr = NULL; 117 dma32_bootmem_ptr = NULL;
117 dma32_bootmem_size = 0; 118 dma32_bootmem_size = 0;
118} 119}
120#else
121void __init dma32_reserve_bootmem(void)
122{
123}
124static void __init dma32_free_bootmem(void)
125{
126}
127
119#endif 128#endif
120 129
121void __init pci_iommu_alloc(void) 130void __init pci_iommu_alloc(void)
122{ 131{
123#ifdef CONFIG_X86_64
124 /* free the range so iommu could get some range less than 4G */ 132 /* free the range so iommu could get some range less than 4G */
125 dma32_free_bootmem(); 133 dma32_free_bootmem();
126#endif 134
127 if (pci_swiotlb_detect()) 135 if (pci_swiotlb_detect())
128 goto out; 136 goto out;
129 137
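
The new #else branch above is the common "stub out the feature" idiom: when the option is compiled out, empty definitions of dma32_reserve_bootmem()/dma32_free_bootmem() let pci_iommu_alloc() and setup_arch() call them unconditionally, with no #ifdef at the call site. A generic sketch of the idiom; CONFIG_WIDGET and the widget_*() names are made up for illustration:

#ifdef CONFIG_WIDGET
void widget_reserve(void);			/* real implementations live elsewhere */
void widget_free(void);
#else
static inline void widget_reserve(void) { }	/* compiled-out stubs */
static inline void widget_free(void) { }
#endif

void init_path(void)
{
	widget_reserve();	/* no #ifdef needed at the call site */
	/* ... */
	widget_free();
}
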
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 34de53b46f87..0f7f130caa67 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -29,6 +29,7 @@
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/sysdev.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h>
32#include <asm/atomic.h> 33#include <asm/atomic.h>
33#include <asm/mtrr.h> 34#include <asm/mtrr.h>
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
@@ -564,6 +565,9 @@ static void enable_gart_translations(void)
564 565
565 enable_gart_translation(dev, __pa(agp_gatt_table)); 566 enable_gart_translation(dev, __pa(agp_gatt_table));
566 } 567 }
568
569 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts();
567} 571}
568 572
569/* 573/*
@@ -735,7 +739,7 @@ int __init gart_iommu_init(void)
735 unsigned long scratch; 739 unsigned long scratch;
736 long i; 740 long i;
737 741
738 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 742 if (num_k8_northbridges == 0)
739 return 0; 743 return 0;
740 744
741#ifndef CONFIG_AGP_AMD64 745#ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 22be12b60a8f..3af4af810c07 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -4,6 +4,7 @@
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/gfp.h>
7#include <linux/pci.h> 8#include <linux/pci.h>
8#include <linux/mm.h> 9#include <linux/mm.h>
9 10
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cf1e04b2ad65..28ad9f4d8b94 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -110,8 +110,8 @@ void show_regs_common(void)
110 if (!product) 110 if (!product)
111 product = ""; 111 product = "";
112 112
113 printk("\n"); 113 printk(KERN_CONT "\n");
114 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", 114 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
115 current->pid, current->comm, print_tainted(), 115 current->pid, current->comm, print_tainted(),
116 init_utsname()->release, 116 init_utsname()->release,
117 (int)strcspn(init_utsname()->version, " "), 117 (int)strcspn(init_utsname()->version, " "),
@@ -122,18 +122,6 @@ void flush_thread(void)
122{ 122{
123 struct task_struct *tsk = current; 123 struct task_struct *tsk = current;
124 124
125#ifdef CONFIG_X86_64
126 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
127 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
128 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
129 clear_tsk_thread_flag(tsk, TIF_IA32);
130 } else {
131 set_tsk_thread_flag(tsk, TIF_IA32);
132 current_thread_info()->status |= TS_COMPAT;
133 }
134 }
135#endif
136
137 flush_ptrace_hw_breakpoint(tsk); 125 flush_ptrace_hw_breakpoint(tsk);
138 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 126 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
139 /* 127 /*
@@ -295,6 +283,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
295 regs.es = __USER_DS; 283 regs.es = __USER_DS;
296 regs.fs = __KERNEL_PERCPU; 284 regs.fs = __KERNEL_PERCPU;
297 regs.gs = __KERNEL_STACK_CANARY; 285 regs.gs = __KERNEL_STACK_CANARY;
286#else
287 regs.ss = __KERNEL_DS;
298#endif 288#endif
299 289
300 regs.orig_ax = -1; 290 regs.orig_ax = -1;
@@ -536,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
536} 526}
537 527
538/* 528/*
539 * Check for AMD CPUs, which have potentially C1E support 529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
540 */ 533 */
541static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) 534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
542{ 535{
536 u64 val;
543 if (c->x86_vendor != X86_VENDOR_AMD) 537 if (c->x86_vendor != X86_VENDOR_AMD)
544 return 0; 538 goto no_c1e_idle;
545
546 if (c->x86 < 0x0F)
547 return 0;
548 539
549 /* Family 0x0f models < rev F do not have C1E */ 540 /* Family 0x0f models < rev F do not have C1E */
550 if (c->x86 == 0x0f && c->x86_model < 0x40) 541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
551 return 0; 542 return 1;
552 543
553 return 1; 544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
550 if (val >= 2) {
551 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
552 if (!(val & BIT(1)))
553 goto no_c1e_idle;
554 }
555 return 1;
556 }
557
558no_c1e_idle:
559 return 0;
554} 560}
555 561
556static cpumask_var_t c1e_mask; 562static cpumask_var_t c1e_mask;
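
The new family-0x10 branch above consults the OS Visible Workarounds MSRs: when MSR_AMD64_OSVW_ID_LENGTH covers erratum #400 (length >= 2), bit 1 of MSR_AMD64_OSVW_STATUS says whether the part is affected; otherwise the code conservatively assumes it is. The decision logic on its own, as a standalone sketch with the MSR values passed in instead of read:

#include <stdbool.h>
#include <stdint.h>

static bool amd_c1e_affected(unsigned int family, unsigned int model,
			     uint64_t osvw_len, uint64_t osvw_status)
{
	if (family == 0x0f)
		return model >= 0x40;		/* rev F and later */

	if (family == 0x10) {
		if (osvw_len >= 2)		/* OSVW describes erratum #400 */
			return osvw_status & (1ULL << 1);
		return true;			/* no OSVW info: assume affected */
	}

	return false;
}

int main(void)
{
	/* family 0x10 part whose OSVW status reports "not affected" */
	return amd_c1e_affected(0x10, 0x02, 2, 0) ? 1 : 0;
}
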
@@ -617,7 +623,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
617{ 623{
618#ifdef CONFIG_SMP 624#ifdef CONFIG_SMP
619 if (pm_idle == poll_idle && smp_num_siblings > 1) { 625 if (pm_idle == poll_idle && smp_num_siblings > 1) {
620 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 626 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
621 " performance may degrade.\n"); 627 " performance may degrade.\n");
622 } 628 }
623#endif 629#endif
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index fe6a34e42bde..f6c62667e30c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -139,16 +139,16 @@ void __show_regs(struct pt_regs *regs, int all)
139 139
140 show_regs_common(); 140 show_regs_common();
141 141
142 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
143 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
144 smp_processor_id()); 144 smp_processor_id());
145 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
146 146
147 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
148 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
149 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
150 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
151 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
153 153
154 if (!all) 154 if (!all)
@@ -158,19 +158,19 @@ void __show_regs(struct pt_regs *regs, int all)
158 cr2 = read_cr2(); 158 cr2 = read_cr2();
159 cr3 = read_cr3(); 159 cr3 = read_cr3();
160 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
161 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
162 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
163 163
164 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
165 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
166 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
167 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
168 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
169 d0, d1, d2, d3); 169 d0, d1, d2, d3);
170 170
171 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
172 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
173 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
174 d6, d7); 174 d6, d7);
175} 175}
176 176
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 418f860880a2..dc9690b4c4cc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -161,19 +161,19 @@ void __show_regs(struct pt_regs *regs, int all)
161 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
162 162
163 show_regs_common(); 163 show_regs_common();
164 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
165 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
166 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
167 regs->sp, regs->flags); 167 regs->sp, regs->flags);
168 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
169 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
170 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
171 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
172 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
173 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
174 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
175 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
176 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
177 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
178 178
179 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -194,21 +194,21 @@ void __show_regs(struct pt_regs *regs, int all)
194 cr3 = read_cr3(); 194 cr3 = read_cr3();
195 cr4 = read_cr4(); 195 cr4 = read_cr4();
196 196
197 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
198 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
199 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
200 es, cr0); 200 es, cr0);
201 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
202 cr4); 202 cr4);
203 203
204 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
205 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
206 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
207 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
209 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
210 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
211 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 212}
213 213
214void release_thread(struct task_struct *dead_task) 214void release_thread(struct task_struct *dead_task)
@@ -515,6 +515,18 @@ void set_personality_64bit(void)
515 current->personality &= ~READ_IMPLIES_EXEC; 515 current->personality &= ~READ_IMPLIES_EXEC;
516} 516}
517 517
518void set_personality_ia32(void)
519{
520 /* inherit personality from parent */
521
522 /* Make sure to be in 32bit mode */
523 set_thread_flag(TIF_IA32);
524 current->personality |= force_personality32;
525
526 /* Prepare the first "return" to user space */
527 current_thread_info()->status |= TS_COMPAT;
528}
529
518unsigned long get_wchan(struct task_struct *p) 530unsigned long get_wchan(struct task_struct *p)
519{ 531{
520 unsigned long stack; 532 unsigned long stack;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 017d937639fe..2e9b55027b7e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/slab.h>
15#include <linux/ptrace.h> 16#include <linux/ptrace.h>
16#include <linux/regset.h> 17#include <linux/regset.h>
17#include <linux/tracehook.h> 18#include <linux/tracehook.h>
@@ -48,6 +49,7 @@ enum x86_regset {
48 REGSET_FP, 49 REGSET_FP,
49 REGSET_XFP, 50 REGSET_XFP,
50 REGSET_IOPERM64 = REGSET_XFP, 51 REGSET_IOPERM64 = REGSET_XFP,
52 REGSET_XSTATE,
51 REGSET_TLS, 53 REGSET_TLS,
52 REGSET_IOPERM32, 54 REGSET_IOPERM32,
53}; 55};
@@ -140,30 +142,6 @@ static const int arg_offs_table[] = {
140#endif 142#endif
141}; 143};
142 144
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
167/* 145/*
168 * does not yet catch signals sent when the child dies. 146 * does not yet catch signals sent when the child dies.
169 * in exit.c or in signal.c. 147 * in exit.c or in signal.c.
@@ -604,7 +582,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
604 struct perf_event_attr attr; 582 struct perf_event_attr attr;
605 583
606 /* 584 /*
607 * We shoud have at least an inactive breakpoint at this 585 * We should have at least an inactive breakpoint at this
608 * slot. It means the user is writing dr7 without having 586 * slot. It means the user is writing dr7 without having
609 * written the address register first 587 * written the address register first
610 */ 588 */
@@ -702,7 +680,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
702 } else if (n == 6) { 680 } else if (n == 6) {
703 val = thread->debugreg6; 681 val = thread->debugreg6;
704 } else if (n == 7) { 682 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps); 683 val = thread->ptrace_dr7;
706 } 684 }
707 return val; 685 return val;
708} 686}
@@ -778,8 +756,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
778 return rc; 756 return rc;
779 } 757 }
780 /* All that's left is DR7 */ 758 /* All that's left is DR7 */
781 if (n == 7) 759 if (n == 7) {
782 rc = ptrace_write_dr7(tsk, val); 760 rc = ptrace_write_dr7(tsk, val);
761 if (!rc)
762 thread->ptrace_dr7 = val;
763 }
783 764
784ret_path: 765ret_path:
785 return rc; 766 return rc;
@@ -1584,7 +1565,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1584 1565
1585#ifdef CONFIG_X86_64 1566#ifdef CONFIG_X86_64
1586 1567
1587static const struct user_regset x86_64_regsets[] = { 1568static struct user_regset x86_64_regsets[] __read_mostly = {
1588 [REGSET_GENERAL] = { 1569 [REGSET_GENERAL] = {
1589 .core_note_type = NT_PRSTATUS, 1570 .core_note_type = NT_PRSTATUS,
1590 .n = sizeof(struct user_regs_struct) / sizeof(long), 1571 .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1597,6 +1578,12 @@ static const struct user_regset x86_64_regsets[] = {
1597 .size = sizeof(long), .align = sizeof(long), 1578 .size = sizeof(long), .align = sizeof(long),
1598 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1579 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1599 }, 1580 },
1581 [REGSET_XSTATE] = {
1582 .core_note_type = NT_X86_XSTATE,
1583 .size = sizeof(u64), .align = sizeof(u64),
1584 .active = xstateregs_active, .get = xstateregs_get,
1585 .set = xstateregs_set
1586 },
1600 [REGSET_IOPERM64] = { 1587 [REGSET_IOPERM64] = {
1601 .core_note_type = NT_386_IOPERM, 1588 .core_note_type = NT_386_IOPERM,
1602 .n = IO_BITMAP_LONGS, 1589 .n = IO_BITMAP_LONGS,
@@ -1622,7 +1609,7 @@ static const struct user_regset_view user_x86_64_view = {
1622#endif /* CONFIG_X86_64 */ 1609#endif /* CONFIG_X86_64 */
1623 1610
1624#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1611#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1625static const struct user_regset x86_32_regsets[] = { 1612static struct user_regset x86_32_regsets[] __read_mostly = {
1626 [REGSET_GENERAL] = { 1613 [REGSET_GENERAL] = {
1627 .core_note_type = NT_PRSTATUS, 1614 .core_note_type = NT_PRSTATUS,
1628 .n = sizeof(struct user_regs_struct32) / sizeof(u32), 1615 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1641,6 +1628,12 @@ static const struct user_regset x86_32_regsets[] = {
1641 .size = sizeof(u32), .align = sizeof(u32), 1628 .size = sizeof(u32), .align = sizeof(u32),
1642 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1629 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1643 }, 1630 },
1631 [REGSET_XSTATE] = {
1632 .core_note_type = NT_X86_XSTATE,
1633 .size = sizeof(u64), .align = sizeof(u64),
1634 .active = xstateregs_active, .get = xstateregs_get,
1635 .set = xstateregs_set
1636 },
1644 [REGSET_TLS] = { 1637 [REGSET_TLS] = {
1645 .core_note_type = NT_386_TLS, 1638 .core_note_type = NT_386_TLS,
1646 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, 1639 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1663,6 +1656,23 @@ static const struct user_regset_view user_x86_32_view = {
1663}; 1656};
1664#endif 1657#endif
1665 1658
1659/*
1660 * This represents bytes 464..511 in the memory layout exported through
1661 * the REGSET_XSTATE interface.
1662 */
1663u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
1664
1665void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
1666{
1667#ifdef CONFIG_X86_64
1668 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1669#endif
1670#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1671 x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1672#endif
1673 xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
1674}
1675
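
update_regset_xstate_info() works because a regset's .n field counts units of .size (u64 here), so the runtime xstate area size in bytes simply divides into the number of exported slots. A plain-C illustration with a made-up size:

#include <stdio.h>

int main(void)
{
	unsigned int xstate_size = 832;	/* illustrative value, not queried from hardware */
	unsigned int n = xstate_size / sizeof(unsigned long long);	/* 8-byte units */

	printf("REGSET_XSTATE would export n = %u u64 slots\n", n);
	return 0;
}
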
1666const struct user_regset_view *task_user_regset_view(struct task_struct *task) 1676const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1667{ 1677{
1668#ifdef CONFIG_IA32_EMULATION 1678#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 18093d7498f0..12e9feaa2f7a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494
495/*
496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on
497 * floppy DMA. Disable HPET MSI on such platforms.
498 */
499static void force_disable_hpet_msi(struct pci_dev *unused)
500{
501 hpet_msi_disable = 1;
502}
503
504DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
505 force_disable_hpet_msi);
506
494#endif 507#endif
495 508
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 509#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1545bc0c9845..8e1aac86b50c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
204 }, 204 },
205 }, 205 },
206 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
207 .callback = set_bios_reboot,
208 .ident = "Dell OptiPlex 760",
209 .matches = {
210 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
211 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
212 DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
213 },
214 },
206 { /* Handle problems with rebooting on Dell 2400's */ 215 { /* Handle problems with rebooting on Dell 2400's */
207 .callback = set_bios_reboot, 216 .callback = set_bios_reboot,
208 .ident = "Dell PowerEdge 2400", 217 .ident = "Dell PowerEdge 2400",
@@ -452,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
452 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), 461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
453 }, 462 },
454 }, 463 },
464 { /* Handle problems with rebooting on the iMac9,1. */
465 .callback = set_pci_reboot,
466 .ident = "Apple iMac9,1",
467 .matches = {
468 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
469 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
470 },
471 },
455 { } 472 { }
456}; 473};
457 474
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f7b8b9894b22..c4851eff57b3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -55,7 +55,6 @@
55#include <linux/stddef.h> 55#include <linux/stddef.h>
56#include <linux/unistd.h> 56#include <linux/unistd.h>
57#include <linux/ptrace.h> 57#include <linux/ptrace.h>
58#include <linux/slab.h>
59#include <linux/user.h> 58#include <linux/user.h>
60#include <linux/delay.h> 59#include <linux/delay.h>
61 60
@@ -121,7 +120,9 @@
121unsigned long max_low_pfn_mapped; 120unsigned long max_low_pfn_mapped;
122unsigned long max_pfn_mapped; 121unsigned long max_pfn_mapped;
123 122
123#ifdef CONFIG_DMI
124RESERVE_BRK(dmi_alloc, 65536); 124RESERVE_BRK(dmi_alloc, 65536);
125#endif
125 126
126unsigned int boot_cpu_id __read_mostly; 127unsigned int boot_cpu_id __read_mostly;
127 128
@@ -312,16 +313,17 @@ static void __init reserve_brk(void)
312#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 313#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
313static void __init relocate_initrd(void) 314static void __init relocate_initrd(void)
314{ 315{
315 316 /* Assume only end is not page aligned */
316 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 317 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
317 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 318 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
319 u64 area_size = PAGE_ALIGN(ramdisk_size);
318 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 320 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
319 u64 ramdisk_here; 321 u64 ramdisk_here;
320 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
321 char *p, *q; 323 char *p, *q;
322 324
323 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into lowmem */
324 ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, 326 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
325 PAGE_SIZE); 327 PAGE_SIZE);
326 328
327 if (ramdisk_here == -1ULL) 329 if (ramdisk_here == -1ULL)
@@ -330,7 +332,7 @@ static void __init relocate_initrd(void)
330 332
331 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the lowmem currently occupied by
332 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
333 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, 335 reserve_early(ramdisk_here, ramdisk_here + area_size,
334 "NEW RAMDISK"); 336 "NEW RAMDISK");
335 initrd_start = ramdisk_here + PAGE_OFFSET; 337 initrd_start = ramdisk_here + PAGE_OFFSET;
336 initrd_end = initrd_start + ramdisk_size; 338 initrd_end = initrd_start + ramdisk_size;
@@ -374,9 +376,10 @@ static void __init relocate_initrd(void)
374 376
375static void __init reserve_initrd(void) 377static void __init reserve_initrd(void)
376{ 378{
379 /* Assume only end is not page aligned */
377 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 380 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
378 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 381 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
379 u64 ramdisk_end = ramdisk_image + ramdisk_size; 382 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
380 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 383 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
381 384
382 if (!boot_params.hdr.type_of_loader || 385 if (!boot_params.hdr.type_of_loader ||
@@ -604,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg)
604early_param("elfcorehdr", setup_elfcorehdr); 607early_param("elfcorehdr", setup_elfcorehdr);
605#endif 608#endif
606 609
610static __init void reserve_ibft_region(void)
611{
612 unsigned long addr, size = 0;
613
614 addr = find_ibft_region(&size);
615
616 if (size)
617 reserve_early_overlap_ok(addr, addr + size, "ibft");
618}
619
607#ifdef CONFIG_X86_RESERVE_LOW_64K 620#ifdef CONFIG_X86_RESERVE_LOW_64K
608static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 621static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
609{ 622{
@@ -642,23 +655,48 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), 655 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
643 }, 656 },
644 }, 657 },
645 {
646 /* 658 /*
647 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 659 * AMI BIOS with low memory corruption was found on Intel DG45ID and
648 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 660 * DG45FC boards.
661 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
649 * match only DMI_BOARD_NAME and see if there is more bad products 662 * match only DMI_BOARD_NAME and see if there is more bad products
650 * with this vendor. 663 * with this vendor.
651 */ 664 */
665 {
652 .callback = dmi_low_memory_corruption, 666 .callback = dmi_low_memory_corruption,
653 .ident = "AMI BIOS", 667 .ident = "AMI BIOS",
654 .matches = { 668 .matches = {
655 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), 669 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
656 }, 670 },
657 }, 671 },
672 {
673 .callback = dmi_low_memory_corruption,
674 .ident = "AMI BIOS",
675 .matches = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 },
678 },
658#endif 679#endif
659 {} 680 {}
660}; 681};
661 682
683static void __init trim_bios_range(void)
684{
685 /*
686	 * A special case is the first 4KB of memory: this is a
687	 * BIOS-owned area, not kernel RAM, but it is generally
688	 * not listed as such in the E820 table.
689 */
690 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
691 /*
692	 * Special case: some BIOSes report the PC BIOS
693	 * area (640KB->1MB) as RAM even though it is not.
694	 * Take it out.
695 */
696 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
697 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
698}
699
662/* 700/*
663 * Determine if we were loaded by an EFI loader. If so, then we have also been 701 * Determine if we were loaded by an EFI loader. If so, then we have also been
664 * passed the efi memmap, systab, etc., so we should use these data structures 702 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -822,7 +860,7 @@ void __init setup_arch(char **cmdline_p)
822 insert_resource(&iomem_resource, &data_resource); 860 insert_resource(&iomem_resource, &data_resource);
823 insert_resource(&iomem_resource, &bss_resource); 861 insert_resource(&iomem_resource, &bss_resource);
824 862
825 863 trim_bios_range();
826#ifdef CONFIG_X86_32 864#ifdef CONFIG_X86_32
827 if (ppro_with_ram_bug()) { 865 if (ppro_with_ram_bug()) {
828 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, 866 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -881,6 +919,8 @@ void __init setup_arch(char **cmdline_p)
881 */ 919 */
882 find_smp_config(); 920 find_smp_config();
883 921
922 reserve_ibft_region();
923
884 reserve_trampoline_memory(); 924 reserve_trampoline_memory();
885 925
886#ifdef CONFIG_ACPI_SLEEP 926#ifdef CONFIG_ACPI_SLEEP
@@ -942,17 +982,11 @@ void __init setup_arch(char **cmdline_p)
942#endif 982#endif
943 983
944 initmem_init(0, max_pfn, acpi, k8); 984 initmem_init(0, max_pfn, acpi, k8);
945 985#ifndef CONFIG_NO_BOOTMEM
946#ifdef CONFIG_X86_64 986 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
947 /*
948 * dma32_reserve_bootmem() allocates bootmem which may conflict
949 * with the crashkernel command line, so do that after
950 * reserve_crashkernel()
951 */
952 dma32_reserve_bootmem();
953#endif 987#endif
954 988
955 reserve_ibft_region(); 989 dma32_reserve_bootmem();
956 990
957#ifdef CONFIG_KVM_CLOCK 991#ifdef CONFIG_KVM_CLOCK
958 kvmclock_init(); 992 kvmclock_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e9..ef6370b00e70 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
137 137
138static void __init pcpu_fc_free(void *ptr, size_t size) 138static void __init pcpu_fc_free(void *ptr, size_t size)
139{ 139{
140#ifdef CONFIG_NO_BOOTMEM
141 u64 start = __pa(ptr);
142 u64 end = start + size;
143 free_early_partial(start, end);
144#else
140 free_bootmem(__pa(ptr), size); 145 free_bootmem(__pa(ptr), size);
146#endif
141} 147}
142 148
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 149static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ec1de97600e7..d801210945d6 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
21#include <linux/cache.h> 21#include <linux/cache.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/gfp.h>
24 25
25#include <asm/mtrr.h> 26#include <asm/mtrr.h>
26#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 678d0b8c26f3..763d815e27a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,8 @@
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h> 50#include <linux/tboot.h>
51#include <linux/stackprotector.h>
52#include <linux/gfp.h>
51 53
52#include <asm/acpi.h> 54#include <asm/acpi.h>
53#include <asm/desc.h> 55#include <asm/desc.h>
@@ -67,6 +69,7 @@
67#include <linux/mc146818rtc.h> 69#include <linux/mc146818rtc.h>
68 70
69#include <asm/smpboot_hooks.h> 71#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h>
70 73
71#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
72u8 apicid_2_node[MAX_APICID]; 75u8 apicid_2_node[MAX_APICID];
@@ -240,7 +243,10 @@ static void __cpuinit smp_callin(void)
240 end_local_APIC_setup(); 243 end_local_APIC_setup();
241 map_cpu_to_logical_apicid(); 244 map_cpu_to_logical_apicid();
242 245
243 notify_cpu_starting(cpuid); 246 /*
247 * Need to setup vector mappings before we enable interrupts.
248 */
249 setup_vector_irq(smp_processor_id());
244 /* 250 /*
245 * Get our bogomips. 251 * Get our bogomips.
246 * 252 *
@@ -257,6 +263,8 @@ static void __cpuinit smp_callin(void)
257 */ 263 */
258 smp_store_cpu_info(cpuid); 264 smp_store_cpu_info(cpuid);
259 265
266 notify_cpu_starting(cpuid);
267
260 /* 268 /*
261 * Allow the master to continue. 269 * Allow the master to continue.
262 */ 270 */
@@ -286,9 +294,9 @@ notrace static void __cpuinit start_secondary(void *unused)
286 check_tsc_sync_target(); 294 check_tsc_sync_target();
287 295
288 if (nmi_watchdog == NMI_IO_APIC) { 296 if (nmi_watchdog == NMI_IO_APIC) {
289 disable_8259A_irq(0); 297 legacy_pic->chip->mask(0);
290 enable_NMI_through_LVT0(); 298 enable_NMI_through_LVT0();
291 enable_8259A_irq(0); 299 legacy_pic->chip->unmask(0);
292 } 300 }
293 301
294#ifdef CONFIG_X86_32 302#ifdef CONFIG_X86_32
@@ -315,15 +323,18 @@ notrace static void __cpuinit start_secondary(void *unused)
315 */ 323 */
316 ipi_call_lock(); 324 ipi_call_lock();
317 lock_vector_lock(); 325 lock_vector_lock();
318 __setup_vector_irq(smp_processor_id());
319 set_cpu_online(smp_processor_id(), true); 326 set_cpu_online(smp_processor_id(), true);
320 unlock_vector_lock(); 327 unlock_vector_lock();
321 ipi_call_unlock(); 328 ipi_call_unlock();
322 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 329 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
330 x86_platform.nmi_init();
323 331
324 /* enable local interrupts */ 332 /* enable local interrupts */
325 local_irq_enable(); 333 local_irq_enable();
326 334
335 /* to prevent fake stack check failure in clock setup */
336 boot_init_stack_canary();
337
327 x86_cpuinit.setup_percpu_clockev(); 338 x86_cpuinit.setup_percpu_clockev();
328 339
329 wmb(); 340 wmb();
@@ -1083,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1083 set_cpu_sibling_map(0); 1094 set_cpu_sibling_map(0);
1084 1095
1085 enable_IR_x2apic(); 1096 enable_IR_x2apic();
1086#ifdef CONFIG_X86_64
1087 default_setup_apic_routing(); 1097 default_setup_apic_routing();
1088#endif
1089 1098
1090 if (smp_sanity_check(max_cpus) < 0) { 1099 if (smp_sanity_check(max_cpus) < 0) {
1091 printk(KERN_INFO "SMP disabled\n"); 1100 printk(KERN_INFO "SMP disabled\n");
@@ -1213,11 +1222,12 @@ __init void prefill_possible_map(void)
1213 1222
1214 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1215 1224
1216 if (possible > CONFIG_NR_CPUS) { 1225 /* nr_cpu_ids could be reduced via nr_cpus= */
1226 if (possible > nr_cpu_ids) {
1217 printk(KERN_WARNING 1227 printk(KERN_WARNING
1218 "%d Processors exceeds NR_CPUS limit of %d\n", 1228 "%d Processors exceeds NR_CPUS limit of %d\n",
1219 possible, CONFIG_NR_CPUS); 1229 possible, nr_cpu_ids);
1220 possible = CONFIG_NR_CPUS; 1230 possible = nr_cpu_ids;
1221 } 1231 }
1222 1232
1223 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
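[editor's note] The prefill_possible_map() hunk above clamps the possible-CPU count against the runtime nr_cpu_ids (which the nr_cpus= boot option can lower) instead of the compile-time CONFIG_NR_CPUS. A tiny standalone illustration of that clamp, with made-up numbers:

/*
 * Minimal sketch of the clamping change: the limit is now a runtime value
 * rather than a build-time constant.
 */
#include <stdio.h>

int main(void)
{
	int nr_cpu_ids = 4;     /* e.g. booted with nr_cpus=4           */
	int possible   = 16;    /* CPUs the firmware says could show up */

	if (possible > nr_cpu_ids) {
		printf("%d processors exceeds nr_cpu_ids limit of %d\n",
		       possible, nr_cpu_ids);
		possible = nr_cpu_ids;
	}
	printf("allowing %d possible CPUs\n", possible);
	return 0;
}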
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index dee1ff7cba58..196552bb412c 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -25,191 +25,6 @@
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27/* 27/*
28 * Perform the select(nd, in, out, ex, tv) and mmap() system
29 * calls. Linux/i386 didn't use to be able to handle more than
30 * 4 system call parameters, so these system calls used a memory
31 * block for parameter passing..
32 */
33
34struct mmap_arg_struct {
35 unsigned long addr;
36 unsigned long len;
37 unsigned long prot;
38 unsigned long flags;
39 unsigned long fd;
40 unsigned long offset;
41};
42
43asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
44{
45 struct mmap_arg_struct a;
46 int err = -EFAULT;
47
48 if (copy_from_user(&a, arg, sizeof(a)))
49 goto out;
50
51 err = -EINVAL;
52 if (a.offset & ~PAGE_MASK)
53 goto out;
54
55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
56 a.fd, a.offset >> PAGE_SHIFT);
57out:
58 return err;
59}
60
61
62struct sel_arg_struct {
63 unsigned long n;
64 fd_set __user *inp, *outp, *exp;
65 struct timeval __user *tvp;
66};
67
68asmlinkage int old_select(struct sel_arg_struct __user *arg)
69{
70 struct sel_arg_struct a;
71
72 if (copy_from_user(&a, arg, sizeof(a)))
73 return -EFAULT;
74 /* sys_select() does the appropriate kernel locking */
75 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
76}
77
78/*
79 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
80 *
81 * This is really horribly ugly.
82 */
83asmlinkage int sys_ipc(uint call, int first, int second,
84 int third, void __user *ptr, long fifth)
85{
86 int version, ret;
87
88 version = call >> 16; /* hack for backward compatibility */
89 call &= 0xffff;
90
91 switch (call) {
92 case SEMOP:
93 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
94 case SEMTIMEDOP:
95 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
96 (const struct timespec __user *)fifth);
97
98 case SEMGET:
99 return sys_semget(first, second, third);
100 case SEMCTL: {
101 union semun fourth;
102 if (!ptr)
103 return -EINVAL;
104 if (get_user(fourth.__pad, (void __user * __user *) ptr))
105 return -EFAULT;
106 return sys_semctl(first, second, third, fourth);
107 }
108
109 case MSGSND:
110 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
111 second, third);
112 case MSGRCV:
113 switch (version) {
114 case 0: {
115 struct ipc_kludge tmp;
116 if (!ptr)
117 return -EINVAL;
118
119 if (copy_from_user(&tmp,
120 (struct ipc_kludge __user *) ptr,
121 sizeof(tmp)))
122 return -EFAULT;
123 return sys_msgrcv(first, tmp.msgp, second,
124 tmp.msgtyp, third);
125 }
126 default:
127 return sys_msgrcv(first,
128 (struct msgbuf __user *) ptr,
129 second, fifth, third);
130 }
131 case MSGGET:
132 return sys_msgget((key_t) first, second);
133 case MSGCTL:
134 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
135
136 case SHMAT:
137 switch (version) {
138 default: {
139 ulong raddr;
140 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
141 if (ret)
142 return ret;
143 return put_user(raddr, (ulong __user *) third);
144 }
145 case 1: /* iBCS2 emulator entry point */
146 if (!segment_eq(get_fs(), get_ds()))
147 return -EINVAL;
148 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
149 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
150 }
151 case SHMDT:
152 return sys_shmdt((char __user *)ptr);
153 case SHMGET:
154 return sys_shmget(first, second, third);
155 case SHMCTL:
156 return sys_shmctl(first, second,
157 (struct shmid_ds __user *) ptr);
158 default:
159 return -ENOSYS;
160 }
161}
162
163/*
164 * Old cruft
165 */
166asmlinkage int sys_uname(struct old_utsname __user *name)
167{
168 int err;
169 if (!name)
170 return -EFAULT;
171 down_read(&uts_sem);
172 err = copy_to_user(name, utsname(), sizeof(*name));
173 up_read(&uts_sem);
174 return err? -EFAULT:0;
175}
176
177asmlinkage int sys_olduname(struct oldold_utsname __user *name)
178{
179 int error;
180
181 if (!name)
182 return -EFAULT;
183 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
184 return -EFAULT;
185
186 down_read(&uts_sem);
187
188 error = __copy_to_user(&name->sysname, &utsname()->sysname,
189 __OLD_UTS_LEN);
190 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
191 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
192 __OLD_UTS_LEN);
193 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
194 error |= __copy_to_user(&name->release, &utsname()->release,
195 __OLD_UTS_LEN);
196 error |= __put_user(0, name->release + __OLD_UTS_LEN);
197 error |= __copy_to_user(&name->version, &utsname()->version,
198 __OLD_UTS_LEN);
199 error |= __put_user(0, name->version + __OLD_UTS_LEN);
200 error |= __copy_to_user(&name->machine, &utsname()->machine,
201 __OLD_UTS_LEN);
202 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
203
204 up_read(&uts_sem);
205
206 error = error ? -EFAULT : 0;
207
208 return error;
209}
210
211
212/*
213 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
214 * end up with proper pt_regs. 29 * end up with proper pt_regs.
215 */ 30 */
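[editor's note] The deleted sys_i386_32.c wrappers above implemented the old i386 convention of passing more than four syscall arguments through a user-memory block (struct mmap_arg_struct, struct sel_arg_struct); the syscall-table hunk below suggests generic sys_old_mmap/sys_old_select replacements now carry that duty. A hedged userspace-side sketch of the old single-pointer mmap call, for illustration only; modern code should call mmap() directly:

/*
 * Old-style i386 mmap: all six arguments packed in one struct, a single
 * pointer passed to the syscall. Only meaningful on 32-bit x86.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

struct mmap_arg_struct {
	unsigned long addr, len, prot, flags, fd, offset;
};

int main(void)
{
#if defined(__i386__) && defined(SYS_mmap)
	struct mmap_arg_struct a = {
		.addr = 0, .len = 4096,
		.prot = PROT_READ | PROT_WRITE,
		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
		.fd = (unsigned long)-1, .offset = 0,
	};
	void *p = (void *)syscall(SYS_mmap, &a);   /* old single-pointer call */
	printf("old mmap returned %p\n", p);
#else
	puts("old-style mmap call only exists on 32-bit x86");
#endif
	return 0;
}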
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 8aa2057efd12..ff14a5044ce6 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -209,15 +209,3 @@ bottomup:
209 209
210 return addr; 210 return addr;
211} 211}
212
213
214SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
215{
216 int err;
217 down_read(&uts_sem);
218 err = copy_to_user(name, utsname(), sizeof(*name));
219 up_read(&uts_sem);
220 if (personality(current->personality) == PER_LINUX32)
221 err |= copy_to_user(&name->machine, "i686", 5);
222 return err ? -EFAULT : 0;
223}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 15228b5d3eb7..8b3729341216 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
81 .long sys_settimeofday 81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */ 82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16 83 .long sys_setgroups16
84 .long old_select 84 .long sys_old_select
85 .long sys_symlink 85 .long sys_symlink
86 .long sys_lstat 86 .long sys_lstat
87 .long sys_readlink /* 85 */ 87 .long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long sys_old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long sys_old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
95 .long sys_ftruncate 95 .long sys_ftruncate
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed9..fb5cc5e14cfa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
70 * manually to deassert NMI lines for the watchdog if run 70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system. 71 * on an 82489DX-based system.
72 */ 72 */
73 spin_lock(&i8259A_lock); 73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3); 74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */ 75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL); 76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock); 77 raw_spin_unlock(&i8259A_lock);
78 } 78 }
79 79
80 global_clock_event->event_handler(global_clock_event); 80 global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015efebc..17b03dd3a6b5 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -9,6 +9,7 @@
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/slab.h>
12 13
13#include <asm/mmu_context.h> 14#include <asm/mmu_context.h>
14#include <asm/uv/uv.h> 15#include <asm/uv/uv.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 33399176512a..1168e4454188 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 534
535 get_debugreg(dr6, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Filter out all the reserved bits which are preset to 1 */
538 dr6 &= ~DR6_RESERVED;
539
537 /* Catch kmemcheck conditions first of all! */ 540 /* Catch kmemcheck conditions first of all! */
538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 541 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 542 return;
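[editor's note] The do_debug() hunk above clears the DR6 bits that read back as 1 before any cause bits are tested. A standalone sketch of that masking; the mask and bit values below are assumptions for illustration, not copied from the kernel headers:

/*
 * Reserved DR6 bits read as 1, so they are cleared before testing the
 * cause bits such as single-step.
 */
#include <stdio.h>

#define DR6_RESERVED 0xffff0ff0u   /* assumed reserved-bit mask        */
#define DR_STEP      0x00004000u   /* assumed single-step (BS) bit     */

int main(void)
{
	unsigned int dr6 = 0xffff4ff1;          /* raw value, reserved bits set */

	dr6 &= ~DR6_RESERVED;                   /* keep only meaningful bits */
	printf("dr6 after masking: %#x, single-step: %s\n",
	       dr6, (dr6 & DR_STEP) ? "yes" : "no");
	return 0;
}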
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 597683aa5ba0..9faf91ae1841 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
50 * unstable. We do this because unlike Time Of Day, 50 * unstable. We do this because unlike Time Of Day,
51 * the scheduler clock tolerates small errors and it's 51 * the scheduler clock tolerates small errors and it's
52 * very important for it to be as fast as the platform 52 * very important for it to be as fast as the platform
53 * can achive it. ) 53 * can achieve it. )
54 */ 54 */
55 if (unlikely(tsc_disabled)) { 55 if (unlikely(tsc_disabled)) {
56 /* No locking but a rare wrong value is not a big deal: */ 56 /* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
740} 740}
741#endif 741#endif
742 742
743static void resume_tsc(void) 743static void resume_tsc(struct clocksource *cs)
744{ 744{
745 clocksource_tsc.cycle_last = 0; 745 clocksource_tsc.cycle_last = 0;
746} 746}
@@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
806 unsigned long res_low, res_high; 806 unsigned long res_low, res_high;
807 807
808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
809 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ 809 /* Geode_LX - the OLPC CPU has a very reliable TSC */
810 if (res_low & RTSC_SUSP) 810 if (res_low & RTSC_SUSP)
811 tsc_clocksource_reliable = 1; 811 tsc_clocksource_reliable = 1;
812#endif 812#endif
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8e3240..1d40336b030a 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/slab.h>
13#include <linux/irq.h> 14#include <linux/irq.h>
14 15
15#include <asm/apic.h> 16#include <asm/apic.h>
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a4..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
54 if (!sgi_uv_kobj) 54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); 55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) { 56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); 57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL; 58 return -EINVAL;
59 } 59 }
60 60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); 61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) { 62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); 63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret; 64 return ret;
65 } 65 }
66 66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); 67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) { 68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); 69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret; 70 return ret;
71 } 71 }
72 72
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 3c84aa001c11..56e421bc379b 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -19,6 +19,7 @@
19 * Copyright (c) Dimitri Sivanich 19 * Copyright (c) Dimitri Sivanich
20 */ 20 */
21#include <linux/clockchips.h> 21#include <linux/clockchips.h>
22#include <linux/slab.h>
22 23
23#include <asm/uv/uv_mmrs.h> 24#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 25#include <asm/uv/uv_hub.h>
@@ -282,10 +283,21 @@ static int uv_rtc_unset_timer(int cpu, int force)
282 283
283/* 284/*
284 * Read the RTC. 285 * Read the RTC.
286 *
287 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
288 * cachelines of it's own page. This allows faster simultaneous reads
289 * from a given socket.
285 */ 290 */
286static cycle_t uv_read_rtc(struct clocksource *cs) 291static cycle_t uv_read_rtc(struct clocksource *cs)
287{ 292{
288 return (cycle_t)uv_read_local_mmr(UVH_RTC); 293 unsigned long offset;
294
295 if (uv_get_min_hub_revision_id() == 1)
296 offset = 0;
297 else
298 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
299
300 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
289} 301}
290 302
291/* 303/*
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 34a279a7471d..e680ea52db9b 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
49char visws_board_type = -1; 49char visws_board_type = -1;
50char visws_board_rev = -1; 50char visws_board_rev = -1;
51 51
52int is_visws_box(void)
53{
54 return visws_board_type >= 0;
55}
56
57static void __init visws_time_init(void) 52static void __init visws_time_init(void)
58{ 53{
59 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init; 237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init; 238 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init; 239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
245 242
246 /* 243 /*
247 * Install reboot quirks: 244 * Install reboot quirks:
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
508 */ 505 */
509static unsigned int startup_piix4_master_irq(unsigned int irq) 506static unsigned int startup_piix4_master_irq(unsigned int irq)
510{ 507{
511 init_8259A(0); 508 legacy_pic->init(0);
512 509
513 return startup_cobalt_irq(irq); 510 return startup_cobalt_irq(irq);
514} 511}
@@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = {
532 529
533static struct irq_chip piix4_virtual_irq_type = { 530static struct irq_chip piix4_virtual_irq_type = {
534 .name = "PIIX4-virtual", 531 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq,
538}; 532};
539 533
540 534
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
559 struct irq_desc *desc; 553 struct irq_desc *desc;
560 unsigned long flags; 554 unsigned long flags;
561 555
562 spin_lock_irqsave(&i8259A_lock, flags); 556 raw_spin_lock_irqsave(&i8259A_lock, flags);
563 557
564 /* Find out what's interrupting in the PIIX4 master 8259 */ 558 /* Find out what's interrupting in the PIIX4 master 8259 */
565 outb(0x0c, 0x20); /* OCW3 Poll command */ 559 outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
596 outb(0x60 + realirq, 0x20); 590 outb(0x60 + realirq, 0x20);
597 } 591 }
598 592
599 spin_unlock_irqrestore(&i8259A_lock, flags); 593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
600 594
601 desc = irq_to_desc(realirq); 595 desc = irq_to_desc(realirq);
602 596
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
609 handle_IRQ_event(realirq, desc->action); 603 handle_IRQ_event(realirq, desc->action);
610 604
611 if (!(desc->status & IRQ_DISABLED)) 605 if (!(desc->status & IRQ_DISABLED))
612 enable_8259A_irq(realirq); 606 legacy_pic->chip->unmask(realirq);
613 607
614 return IRQ_HANDLED; 608 return IRQ_HANDLED;
615 609
616out_unlock: 610out_unlock:
617 spin_unlock_irqrestore(&i8259A_lock, flags); 611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
618 return IRQ_NONE; 612 return IRQ_NONE;
619} 613}
620 614
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
628 .name = "cascade", 622 .name = "cascade",
629}; 623};
630 624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631 631
632void init_VISWS_APIC_irqs(void) 632void init_VISWS_APIC_irqs(void)
633{ 633{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
653 desc->chip = &piix4_master_irq_type; 653 desc->chip = &piix4_master_irq_type;
654 } 654 }
655 else if (i < CO_IRQ_APIC0) { 655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
656 desc->chip = &piix4_virtual_irq_type; 657 desc->chip = &piix4_virtual_irq_type;
657 } 658 }
658 else if (IS_CO_APIC(i)) { 659 else if (IS_CO_APIC(i)) {
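[editor's note] The visws_quirks.c and smpboot.c hunks above route what used to be direct init_8259A()/enable_8259A_irq()/disable_8259A_irq() calls through a legacy_pic ops table. A userspace mock of that indirection (the struct layout and names here are illustrative, not the kernel's), showing why it helps: a platform without a PIC can plug in no-op implementations.

/*
 * Mock of the legacy-PIC ops-table pattern the diff switches to.
 */
#include <stdio.h>

struct pic_chip_ops {
	void (*mask)(unsigned int irq);
	void (*unmask)(unsigned int irq);
};

struct legacy_pic_ops {
	void (*init)(int auto_eoi);
	const struct pic_chip_ops *chip;
};

static void i8259_mask(unsigned int irq)   { printf("mask IRQ %u\n", irq); }
static void i8259_unmask(unsigned int irq) { printf("unmask IRQ %u\n", irq); }
static void i8259_init(int auto_eoi)       { printf("init 8259, auto_eoi=%d\n", auto_eoi); }

static const struct pic_chip_ops i8259_chip = { i8259_mask, i8259_unmask };
static const struct legacy_pic_ops default_pic = { i8259_init, &i8259_chip };
static const struct legacy_pic_ops *legacy_pic = &default_pic;

int main(void)
{
	legacy_pic->init(0);          /* was: init_8259A(0)        */
	legacy_pic->chip->mask(0);    /* was: disable_8259A_irq(0) */
	legacy_pic->chip->unmask(0);  /* was: enable_8259A_irq(0)  */
	return 0;
}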
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c30193..ce9fbacb7526 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -28,11 +28,13 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/gfp.h>
31#include <asm/vmi.h> 32#include <asm/vmi.h>
32#include <asm/io.h> 33#include <asm/io.h>
33#include <asm/fixmap.h> 34#include <asm/fixmap.h>
34#include <asm/apicdef.h> 35#include <asm/apicdef.h>
35#include <asm/apic.h> 36#include <asm/apic.h>
37#include <asm/pgalloc.h>
36#include <asm/processor.h> 38#include <asm/processor.h>
37#include <asm/timer.h> 39#include <asm/timer.h>
38#include <asm/vmi_time.h> 40#include <asm/vmi_time.h>
@@ -266,30 +268,6 @@ static void vmi_nop(void)
266{ 268{
267} 269}
268 270
269#ifdef CONFIG_HIGHPTE
270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
271{
272 void *va = kmap_atomic(page, type);
273
274 /*
275 * Internally, the VMI ROM must map virtual addresses to physical
276 * addresses for processing MMU updates. By the time MMU updates
277 * are issued, this information is typically already lost.
278 * Fortunately, the VMI provides a cache of mapping slots for active
279 * page tables.
280 *
281 * We use slot zero for the linear mapping of physical memory, and
282 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
283 *
284 * args: SLOT VA COUNT PFN
285 */
286 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
287 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
288
289 return va;
290}
291#endif
292
293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
294{ 272{
295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +618,12 @@ static inline int __init activate_vmi(void)
640 u64 reloc; 618 u64 reloc;
641 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; 619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
642 620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
643 if (call_vrom_func(vmi_rom, vmi_init) != 0) { 627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
644 printk(KERN_ERR "VMI ROM failed to initialize!"); 628 printk(KERN_ERR "VMI ROM failed to initialize!");
645 return 0; 629 return 0;
@@ -778,10 +762,6 @@ static inline int __init activate_vmi(void)
778 762
779 /* Set linear is needed in all cases */ 763 /* Set linear is needed in all cases */
780 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
781#ifdef CONFIG_HIGHPTE
782 if (vmi_ops.set_linear_mapping)
783 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
784#endif
785 765
786 /* 766 /*
787 * These MUST always be patched. Don't support indirect jumps 767 * These MUST always be patched. Don't support indirect jumps
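[editor's note] The vmi_32.c hunks above drop the HIGHPTE-specific kmap_atomic_pte hook and instead clear the highmem bit from the mask used to allocate user page-table pages, so those pages always come from lowmem. A tiny sketch of the gfp-mask trick; the flag values here are mock stand-ins, not the kernel's __GFP_* constants:

/*
 * Clearing one allocation-flag bit is enough to keep user PTE pages out
 * of highmem, which is what activate_vmi() now does.
 */
#include <stdio.h>

#define GFP_KERNEL_MOCK   0x01u
#define GFP_HIGHMEM_MOCK  0x02u
#define GFP_ZERO_MOCK     0x04u

int main(void)
{
	/* default mask for user PTE pages: zeroed, may live in highmem */
	unsigned int userpte_alloc_gfp =
		GFP_KERNEL_MOCK | GFP_HIGHMEM_MOCK | GFP_ZERO_MOCK;

	userpte_alloc_gfp &= ~GFP_HIGHMEM_MOCK;   /* force lowmem */

	printf("PTE pages may use highmem: %s\n",
	       (userpte_alloc_gfp & GFP_HIGHMEM_MOCK) ? "yes" : "no");
	return 0;
}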
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 74c92bb194df..5e1ff66ecd73 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
79 79
80static inline unsigned int vmi_get_timer_vector(void) 80static inline unsigned int vmi_get_timer_vector(void)
81{ 81{
82#ifdef CONFIG_X86_IO_APIC 82 return IRQ0_VECTOR;
83 return FIRST_DEVICE_VECTOR;
84#else
85 return FIRST_EXTERNAL_VECTOR;
86#endif
87} 83}
88 84
89/** vmi clockchip */ 85/** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
171{ 167{
172 /* Unfortunately, set_next_event interface only passes relative 168 /* Unfortunately, set_next_event interface only passes relative
173 * expiry, but we want absolute expiry. It'd be better if were 169 * expiry, but we want absolute expiry. It'd be better if were
174 * were passed an aboslute expiry, since a bunch of time may 170 * were passed an absolute expiry, since a bunch of time may
175 * have been stolen between the time the delta is computed and 171 * have been stolen between the time the delta is computed and
176 * when we set the alarm below. */ 172 * when we set the alarm below. */
177 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); 173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608cb..2cc249718c46 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -291,8 +291,8 @@ SECTIONS
291 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 291 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
292 __smp_locks = .; 292 __smp_locks = .;
293 *(.smp_locks) 293 *(.smp_locks)
294 __smp_locks_end = .;
295 . = ALIGN(PAGE_SIZE); 294 . = ALIGN(PAGE_SIZE);
295 __smp_locks_end = .;
296 } 296 }
297 297
298#ifdef CONFIG_X86_64 298#ifdef CONFIG_X86_64
@@ -341,7 +341,7 @@ SECTIONS
341 * Per-cpu symbols which need to be offset from __per_cpu_load 341 * Per-cpu symbols which need to be offset from __per_cpu_load
342 * for the boot processor. 342 * for the boot processor.
343 */ 343 */
344#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load 344#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
345INIT_PER_CPU(gdt_page); 345INIT_PER_CPU(gdt_page);
346INIT_PER_CPU(irq_stack_union); 346INIT_PER_CPU(irq_stack_union);
347 347
@@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
352 "kernel image bigger than KERNEL_IMAGE_SIZE"); 352 "kernel image bigger than KERNEL_IMAGE_SIZE");
353 353
354#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
355. = ASSERT((per_cpu__irq_stack_union == 0), 355. = ASSERT((irq_stack_union == 0),
356 "irq_stack_union is not at start of per-cpu area"); 356 "irq_stack_union is not at start of per-cpu area");
357#endif 357#endif
358 358
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9055e5872ff0..1c0c6ab9c60f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -301,7 +301,8 @@ static int __init vsyscall_init(void)
301 register_sysctl_table(kernel_root_table2); 301 register_sysctl_table(kernel_root_table2);
302#endif 302#endif
303 on_each_cpu(cpu_vsyscall_init, NULL, 1); 303 on_each_cpu(cpu_vsyscall_init, NULL, 1);
304 hotcpu_notifier(cpu_vsyscall_notifier, 0); 304 /* notifier priority > KVM */
305 hotcpu_notifier(cpu_vsyscall_notifier, 30);
305 return 0; 306 return 0;
306} 307}
307 308
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ccd179dec36e..61a1e8c7e19f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,9 +4,11 @@
4 * For licencing details see kernel-base/COPYING 4 * For licencing details see kernel-base/COPYING
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h>
7 8
8#include <asm/bios_ebda.h> 9#include <asm/bios_ebda.h>
9#include <asm/paravirt.h> 10#include <asm/paravirt.h>
11#include <asm/pci_x86.h>
10#include <asm/mpspec.h> 12#include <asm/mpspec.h>
11#include <asm/setup.h> 13#include <asm/setup.h>
12#include <asm/apic.h> 14#include <asm/apic.h>
@@ -70,16 +72,25 @@ struct x86_init_ops x86_init __initdata = {
70 .iommu = { 72 .iommu = {
71 .iommu_init = iommu_init_noop, 73 .iommu_init = iommu_init_noop,
72 }, 74 },
75
76 .pci = {
77 .init = x86_default_pci_init,
78 .init_irq = x86_default_pci_init_irq,
79 .fixup_irqs = x86_default_pci_fixup_irqs,
80 },
73}; 81};
74 82
75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 83struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
76 .setup_percpu_clockev = setup_secondary_APIC_clock, 84 .setup_percpu_clockev = setup_secondary_APIC_clock,
77}; 85};
78 86
87static void default_nmi_init(void) { };
88
79struct x86_platform_ops x86_platform = { 89struct x86_platform_ops x86_platform = {
80 .calibrate_tsc = native_calibrate_tsc, 90 .calibrate_tsc = native_calibrate_tsc,
81 .get_wallclock = mach_get_cmos_time, 91 .get_wallclock = mach_get_cmos_time,
82 .set_wallclock = mach_set_rtc_mmss, 92 .set_wallclock = mach_set_rtc_mmss,
83 .iommu_shutdown = iommu_shutdown_noop, 93 .iommu_shutdown = iommu_shutdown_noop,
84 .is_untracked_pat_range = is_ISA_range, 94 .is_untracked_pat_range = is_ISA_range,
95 .nmi_init = default_nmi_init
85}; 96};
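[editor's note] The x86_init.c hunk above extends the boot-time hook tables with PCI init slots and a default no-op nmi_init, and the VISWS hunk earlier in this diff shows a platform overriding individual slots (x86_init.pci.init = pci_visws_init). A userspace mock of that pattern; the struct layout and names are illustrative only:

/*
 * Hook tables with sane defaults; a platform quirk replaces only the
 * slots it cares about early in boot.
 */
#include <stdio.h>

struct pci_init_ops {
	int  (*init)(void);
	void (*init_irq)(void);
};

struct platform_ops {
	void (*nmi_init)(void);
};

static int  default_pci_init(void)  { puts("default PCI init"); return 0; }
static void default_pci_irq(void)   { puts("default PCI IRQ init"); }
static void default_nmi_init(void)  { /* no-op, like the new x86_platform default */ }

static int  visws_pci_init(void)    { puts("VISWS PCI init"); return 0; }
static void noop_init_irq(void)     { }

static struct pci_init_ops pci_ops  = { default_pci_init, default_pci_irq };
static struct platform_ops platform = { default_nmi_init };

int main(void)
{
	/* platform quirk runs early and overrides the defaults it needs */
	pci_ops.init     = visws_pci_init;
	pci_ops.init_irq = noop_init_irq;

	pci_ops.init();
	pci_ops.init_irq();
	platform.nmi_init();
	return 0;
}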
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d9..782c3a362ec6 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 338 xstate_size = ebx;
339 339
340 update_regset_xstate_info(xstate_size, pcntxt_mask);
340 prepare_fx_sw_frame(); 341 prepare_fx_sw_frame();
341 342
342 setup_xstate_init(); 343 setup_xstate_init();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4cd498332466..970bbd479516 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER 31 select USER_RETURN_NOTIFIER
32 select KVM_MMIO
32 ---help--- 33 ---help---
33 Support hosting fully virtualized guest machines using hardware 34 Support hosting fully virtualized guest machines using hardware
34 virtualization extensions. You will need a fairly recent 35 virtualization extensions. You will need a fairly recent
@@ -65,6 +66,7 @@ config KVM_AMD
65 66
66# OK, it's a little counter-intuitive to do this, but it puts it neatly under 67# OK, it's a little counter-intuitive to do this, but it puts it neatly under
67# the virtualization menu. 68# the virtualization menu.
69source drivers/vhost/Kconfig
68source drivers/lguest/Kconfig 70source drivers/lguest/Kconfig
69source drivers/virtio/Kconfig 71source drivers/virtio/Kconfig
70 72
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7e8faea4651e..4dade6ac0827 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "mmu.h" /* for is_long_mode() */ 35#include "x86.h"
36 36
37/* 37/*
38 * Opcode effective-address decode tables. 38 * Opcode effective-address decode tables.
@@ -76,6 +76,8 @@
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */ 78/* Misc flags */
79#define Lock (1<<26) /* lock prefix is allowed for the instruction */
80#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
79#define No64 (1<<28) 81#define No64 (1<<28)
80/* Source 2 operand type */ 82/* Source 2 operand type */
81#define Src2None (0<<29) 83#define Src2None (0<<29)
@@ -88,39 +90,40 @@
88enum { 90enum {
89 Group1_80, Group1_81, Group1_82, Group1_83, 91 Group1_80, Group1_81, Group1_82, Group1_83,
90 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 92 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
93 Group8, Group9,
91}; 94};
92 95
93static u32 opcode_table[256] = { 96static u32 opcode_table[256] = {
94 /* 0x00 - 0x07 */ 97 /* 0x00 - 0x07 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 98 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 99 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 100 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 101 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
99 /* 0x08 - 0x0F */ 102 /* 0x08 - 0x0F */
100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 103 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 104 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 105 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0, 106 ImplicitOps | Stack | No64, 0,
104 /* 0x10 - 0x17 */ 107 /* 0x10 - 0x17 */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 108 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 109 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 110 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 111 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
109 /* 0x18 - 0x1F */ 112 /* 0x18 - 0x1F */
110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 113 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 114 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 115 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 116 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
114 /* 0x20 - 0x27 */ 117 /* 0x20 - 0x27 */
115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 118 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 119 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 120 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
118 /* 0x28 - 0x2F */ 121 /* 0x28 - 0x2F */
119 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 122 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
120 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 123 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
121 0, 0, 0, 0, 124 0, 0, 0, 0,
122 /* 0x30 - 0x37 */ 125 /* 0x30 - 0x37 */
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 126 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
124 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 127 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
125 0, 0, 0, 0, 128 0, 0, 0, 0,
126 /* 0x38 - 0x3F */ 129 /* 0x38 - 0x3F */
@@ -156,7 +159,7 @@ static u32 opcode_table[256] = {
156 Group | Group1_80, Group | Group1_81, 159 Group | Group1_80, Group | Group1_81,
157 Group | Group1_82, Group | Group1_83, 160 Group | Group1_82, Group | Group1_83,
158 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 161 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
159 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 162 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
160 /* 0x88 - 0x8F */ 163 /* 0x88 - 0x8F */
161 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 164 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
162 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 165 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -210,7 +213,7 @@ static u32 opcode_table[256] = {
210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
211 /* 0xF0 - 0xF7 */ 214 /* 0xF0 - 0xF7 */
212 0, 0, 0, 0, 215 0, 0, 0, 0,
213 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
214 /* 0xF8 - 0xFF */ 217 /* 0xF8 - 0xFF */
215 ImplicitOps, 0, ImplicitOps, ImplicitOps, 218 ImplicitOps, 0, ImplicitOps, ImplicitOps,
216 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 219 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -218,16 +221,20 @@ static u32 opcode_table[256] = {
218 221
219static u32 twobyte_table[256] = { 222static u32 twobyte_table[256] = {
220 /* 0x00 - 0x0F */ 223 /* 0x00 - 0x0F */
221 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 224 0, Group | GroupDual | Group7, 0, 0,
222 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 225 0, ImplicitOps, ImplicitOps | Priv, 0,
226 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
227 0, ImplicitOps | ModRM, 0, 0,
223 /* 0x10 - 0x1F */ 228 /* 0x10 - 0x1F */
224 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
225 /* 0x20 - 0x2F */ 230 /* 0x20 - 0x2F */
226 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 231 ModRM | ImplicitOps | Priv, ModRM | Priv,
232 ModRM | ImplicitOps | Priv, ModRM | Priv,
233 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0x30 - 0x3F */ 235 /* 0x30 - 0x3F */
229 ImplicitOps, 0, ImplicitOps, 0, 236 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
230 ImplicitOps, ImplicitOps, 0, 0, 237 ImplicitOps, ImplicitOps | Priv, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0x40 - 0x47 */ 239 /* 0x40 - 0x47 */
233 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 240 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -257,21 +264,23 @@ static u32 twobyte_table[256] = {
257 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 264 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
258 /* 0xA8 - 0xAF */ 265 /* 0xA8 - 0xAF */
259 ImplicitOps | Stack, ImplicitOps | Stack, 266 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp, 267 0, DstMem | SrcReg | ModRM | BitOp | Lock,
261 DstMem | SrcReg | Src2ImmByte | ModRM, 268 DstMem | SrcReg | Src2ImmByte | ModRM,
262 DstMem | SrcReg | Src2CL | ModRM, 269 DstMem | SrcReg | Src2CL | ModRM,
263 ModRM, 0, 270 ModRM, 0,
264 /* 0xB0 - 0xB7 */ 271 /* 0xB0 - 0xB7 */
265 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 272 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
266 DstMem | SrcReg | ModRM | BitOp, 273 0, DstMem | SrcReg | ModRM | BitOp | Lock,
267 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 274 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
268 DstReg | SrcMem16 | ModRM | Mov, 275 DstReg | SrcMem16 | ModRM | Mov,
269 /* 0xB8 - 0xBF */ 276 /* 0xB8 - 0xBF */
270 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, 277 0, 0,
278 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
271 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 279 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
272 DstReg | SrcMem16 | ModRM | Mov, 280 DstReg | SrcMem16 | ModRM | Mov,
273 /* 0xC0 - 0xCF */ 281 /* 0xC0 - 0xCF */
274 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, 282 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
283 0, 0, 0, Group | GroupDual | Group9,
275 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0,
276 /* 0xD0 - 0xDF */ 285 /* 0xD0 - 0xDF */
277 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -283,25 +292,41 @@ static u32 twobyte_table[256] = {
283 292
284static u32 group_table[] = { 293static u32 group_table[] = {
285 [Group1_80*8] = 294 [Group1_80*8] =
286 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 295 ByteOp | DstMem | SrcImm | ModRM | Lock,
287 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 296 ByteOp | DstMem | SrcImm | ModRM | Lock,
288 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 297 ByteOp | DstMem | SrcImm | ModRM | Lock,
289 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 298 ByteOp | DstMem | SrcImm | ModRM | Lock,
299 ByteOp | DstMem | SrcImm | ModRM | Lock,
300 ByteOp | DstMem | SrcImm | ModRM | Lock,
301 ByteOp | DstMem | SrcImm | ModRM | Lock,
302 ByteOp | DstMem | SrcImm | ModRM,
290 [Group1_81*8] = 303 [Group1_81*8] =
291 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 304 DstMem | SrcImm | ModRM | Lock,
292 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 305 DstMem | SrcImm | ModRM | Lock,
293 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 306 DstMem | SrcImm | ModRM | Lock,
294 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 307 DstMem | SrcImm | ModRM | Lock,
308 DstMem | SrcImm | ModRM | Lock,
309 DstMem | SrcImm | ModRM | Lock,
310 DstMem | SrcImm | ModRM | Lock,
311 DstMem | SrcImm | ModRM,
295 [Group1_82*8] = 312 [Group1_82*8] =
296 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 313 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
297 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 314 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
298 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 315 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
299 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 316 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
317 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
318 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
319 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
320 ByteOp | DstMem | SrcImm | ModRM | No64,
300 [Group1_83*8] = 321 [Group1_83*8] =
301 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 322 DstMem | SrcImmByte | ModRM | Lock,
302 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 323 DstMem | SrcImmByte | ModRM | Lock,
303 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 324 DstMem | SrcImmByte | ModRM | Lock,
304 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 325 DstMem | SrcImmByte | ModRM | Lock,
326 DstMem | SrcImmByte | ModRM | Lock,
327 DstMem | SrcImmByte | ModRM | Lock,
328 DstMem | SrcImmByte | ModRM | Lock,
329 DstMem | SrcImmByte | ModRM,
305 [Group1A*8] = 330 [Group1A*8] =
306 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 331 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
307 [Group3_Byte*8] = 332 [Group3_Byte*8] =
@@ -320,24 +345,39 @@ static u32 group_table[] = {
320 SrcMem | ModRM | Stack, 0, 345 SrcMem | ModRM | Stack, 0,
321 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
322 [Group7*8] = 347 [Group7*8] =
323 0, 0, ModRM | SrcMem, ModRM | SrcMem, 348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
324 SrcNone | ModRM | DstMem | Mov, 0, 349 SrcNone | ModRM | DstMem | Mov, 0,
325 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 350 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
351 [Group8*8] =
352 0, 0, 0, 0,
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
326}; 357};
327 358
328static u32 group2_table[] = { 359static u32 group2_table[] = {
329 [Group7*8] = 360 [Group7*8] =
330 SrcNone | ModRM, 0, 0, SrcNone | ModRM, 361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
331 SrcNone | ModRM | DstMem | Mov, 0, 362 SrcNone | ModRM | DstMem | Mov, 0,
332 SrcMem16 | ModRM | Mov, 0, 363 SrcMem16 | ModRM | Mov, 0,
364 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0,
333}; 366};
334 367
335/* EFLAGS bit definitions. */ 368/* EFLAGS bit definitions. */
369#define EFLG_ID (1<<21)
370#define EFLG_VIP (1<<20)
371#define EFLG_VIF (1<<19)
372#define EFLG_AC (1<<18)
336#define EFLG_VM (1<<17) 373#define EFLG_VM (1<<17)
337#define EFLG_RF (1<<16) 374#define EFLG_RF (1<<16)
375#define EFLG_IOPL (3<<12)
376#define EFLG_NT (1<<14)
338#define EFLG_OF (1<<11) 377#define EFLG_OF (1<<11)
339#define EFLG_DF (1<<10) 378#define EFLG_DF (1<<10)
340#define EFLG_IF (1<<9) 379#define EFLG_IF (1<<9)
380#define EFLG_TF (1<<8)
341#define EFLG_SF (1<<7) 381#define EFLG_SF (1<<7)
342#define EFLG_ZF (1<<6) 382#define EFLG_ZF (1<<6)
343#define EFLG_AF (1<<4) 383#define EFLG_AF (1<<4)
@@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
606 646
607 if (linear < fc->start || linear >= fc->end) { 647 if (linear < fc->start || linear >= fc->end) {
608 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 648 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
609 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); 649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
610 if (rc) 650 if (rc)
611 return rc; 651 return rc;
612 fc->start = linear; 652 fc->start = linear;
@@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
661 op_bytes = 3; 701 op_bytes = 3;
662 *address = 0; 702 *address = 0;
663 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
664 ctxt->vcpu); 704 ctxt->vcpu, NULL);
665 if (rc) 705 if (rc)
666 return rc; 706 return rc;
667 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
668 ctxt->vcpu); 708 ctxt->vcpu, NULL);
669 return rc; 709 return rc;
670} 710}
671 711
@@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
889 929
890 switch (mode) { 930 switch (mode) {
891 case X86EMUL_MODE_REAL: 931 case X86EMUL_MODE_REAL:
932 case X86EMUL_MODE_VM86:
892 case X86EMUL_MODE_PROT16: 933 case X86EMUL_MODE_PROT16:
893 def_op_bytes = def_ad_bytes = 2; 934 def_op_bytes = def_ad_bytes = 2;
894 break; 935 break;
@@ -975,7 +1016,7 @@ done_prefixes:
975 } 1016 }
976 1017
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; 1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
979 return -1; 1020 return -1;
980 } 1021 }
981 1022
@@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1196 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1237 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1197 c->regs[VCPU_REGS_RSP]), 1238 c->regs[VCPU_REGS_RSP]),
1198 dest, len, ctxt->vcpu); 1239 dest, len, ctxt->vcpu);
1199 if (rc != 0) 1240 if (rc != X86EMUL_CONTINUE)
1200 return rc; 1241 return rc;
1201 1242
1202 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1243 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
1203 return rc; 1244 return rc;
1204} 1245}
1205 1246
1247static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops,
1249 void *dest, int len)
1250{
1251 int rc;
1252 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
1255
1256 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE)
1258 return rc;
1259
1260 change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
1261 | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
1262
1263 switch(ctxt->mode) {
1264 case X86EMUL_MODE_PROT64:
1265 case X86EMUL_MODE_PROT32:
1266 case X86EMUL_MODE_PROT16:
1267 if (cpl == 0)
1268 change_mask |= EFLG_IOPL;
1269 if (cpl <= iopl)
1270 change_mask |= EFLG_IF;
1271 break;
1272 case X86EMUL_MODE_VM86:
1273 if (iopl < 3) {
1274 kvm_inject_gp(ctxt->vcpu, 0);
1275 return X86EMUL_PROPAGATE_FAULT;
1276 }
1277 change_mask |= EFLG_IF;
1278 break;
1279 default: /* real mode */
1280 change_mask |= (EFLG_IOPL | EFLG_IF);
1281 break;
1282 }
1283
1284 *(unsigned long *)dest =
1285 (ctxt->eflags & ~change_mask) | (val & change_mask);
1286
1287 return rc;
1288}
1289
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1290static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{ 1291{
1208 struct decode_cache *c = &ctxt->decode; 1292 struct decode_cache *c = &ctxt->decode;
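[editor's note] The new emulate_popf() above restricts which EFLAGS bits a guest POPF may actually change, depending on mode, CPL and IOPL. A standalone worked example of that masking, using the standard EFLAGS bit positions and only a subset of the always-writable bits for brevity: in protected mode at CPL=3 with IOPL=0, the popped value cannot touch IF or IOPL.

/*
 * Worked example of the POPF change-mask computation.
 */
#include <stdio.h>

#define EFLG_IF   (1u << 9)
#define EFLG_IOPL (3u << 12)
#define EFLG_ZF   (1u << 6)
#define EFLG_CF   (1u << 0)

int main(void)
{
	unsigned long eflags = EFLG_IF;                 /* IF currently set      */
	unsigned long popped = EFLG_ZF | EFLG_CF;       /* guest tries to clear IF */
	int cpl = 3, iopl = 0;

	/* bits always writable by POPF (subset used here for brevity) */
	unsigned long change_mask = EFLG_ZF | EFLG_CF;

	if (cpl == 0)
		change_mask |= EFLG_IOPL;
	if (cpl <= iopl)
		change_mask |= EFLG_IF;

	eflags = (eflags & ~change_mask) | (popped & change_mask);

	printf("IF still set: %s, ZF now set: %s\n",
	       (eflags & EFLG_IF) ? "yes" : "no",
	       (eflags & EFLG_ZF) ? "yes" : "no");
	return 0;
}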
@@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1225 if (rc != 0) 1309 if (rc != 0)
1226 return rc; 1310 return rc;
1227 1311
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); 1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
1229 return rc; 1313 return rc;
1230} 1314}
1231 1315
@@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1370 int rc; 1454 int rc;
1371 1455
1372 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); 1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1373 if (rc != 0) 1457 if (rc != X86EMUL_CONTINUE)
1374 return rc; 1458 return rc;
1375 1459
1376 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1385 (u32) c->regs[VCPU_REGS_RBX]; 1469 (u32) c->regs[VCPU_REGS_RBX];
1386 1470
1387 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); 1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1388 if (rc != 0) 1472 if (rc != X86EMUL_CONTINUE)
1389 return rc; 1473 return rc;
1390 ctxt->eflags |= EFLG_ZF; 1474 ctxt->eflags |= EFLG_ZF;
1391 } 1475 }
@@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1407 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1408 if (rc) 1492 if (rc)
1409 return rc; 1493 return rc;
1410 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); 1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
1411 return rc; 1495 return rc;
1412} 1496}
1413 1497
@@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1451 &c->dst.val, 1535 &c->dst.val,
1452 c->dst.bytes, 1536 c->dst.bytes,
1453 ctxt->vcpu); 1537 ctxt->vcpu);
1454 if (rc != 0) 1538 if (rc != X86EMUL_CONTINUE)
1455 return rc; 1539 return rc;
1456 break; 1540 break;
1457 case OP_NONE: 1541 case OP_NONE:
@@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1514 u64 msr_data; 1598 u64 msr_data;
1515 1599
1516 /* syscall is not available in real mode */ 1600 /* syscall is not available in real mode */
1517 if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL 1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
1518 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) 1602 return X86EMUL_UNHANDLEABLE;
1519 return -1;
1520 1603
1521 setup_syscalls_segments(ctxt, &cs, &ss); 1604 setup_syscalls_segments(ctxt, &cs, &ss);
1522 1605
@@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1553 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1636 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1554 } 1637 }
1555 1638
1556 return 0; 1639 return X86EMUL_CONTINUE;
1557} 1640}
1558 1641
1559static int 1642static int
@@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1563 struct kvm_segment cs, ss; 1646 struct kvm_segment cs, ss;
1564 u64 msr_data; 1647 u64 msr_data;
1565 1648
1566 /* inject #UD if LOCK prefix is used */ 1649 /* inject #GP if in real mode */
1567 if (c->lock_prefix) 1650 if (ctxt->mode == X86EMUL_MODE_REAL) {
1568 return -1;
1569
1570 /* inject #GP if in real mode or paging is disabled */
1571 if (ctxt->mode == X86EMUL_MODE_REAL ||
1572 !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1573 kvm_inject_gp(ctxt->vcpu, 0); 1651 kvm_inject_gp(ctxt->vcpu, 0);
1574 return -1; 1652 return X86EMUL_UNHANDLEABLE;
1575 } 1653 }
1576 1654
1577 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1655 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1578 * Therefore, we inject an #UD. 1656 * Therefore, we inject an #UD.
1579 */ 1657 */
1580 if (ctxt->mode == X86EMUL_MODE_PROT64) 1658 if (ctxt->mode == X86EMUL_MODE_PROT64)
1581 return -1; 1659 return X86EMUL_UNHANDLEABLE;
1582 1660
1583 setup_syscalls_segments(ctxt, &cs, &ss); 1661 setup_syscalls_segments(ctxt, &cs, &ss);
1584 1662
@@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1587 case X86EMUL_MODE_PROT32: 1665 case X86EMUL_MODE_PROT32:
1588 if ((msr_data & 0xfffc) == 0x0) { 1666 if ((msr_data & 0xfffc) == 0x0) {
1589 kvm_inject_gp(ctxt->vcpu, 0); 1667 kvm_inject_gp(ctxt->vcpu, 0);
1590 return -1; 1668 return X86EMUL_PROPAGATE_FAULT;
1591 } 1669 }
1592 break; 1670 break;
1593 case X86EMUL_MODE_PROT64: 1671 case X86EMUL_MODE_PROT64:
1594 if (msr_data == 0x0) { 1672 if (msr_data == 0x0) {
1595 kvm_inject_gp(ctxt->vcpu, 0); 1673 kvm_inject_gp(ctxt->vcpu, 0);
1596 return -1; 1674 return X86EMUL_PROPAGATE_FAULT;
1597 } 1675 }
1598 break; 1676 break;
1599 } 1677 }
@@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1618 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 1696 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1619 c->regs[VCPU_REGS_RSP] = msr_data; 1697 c->regs[VCPU_REGS_RSP] = msr_data;
1620 1698
1621 return 0; 1699 return X86EMUL_CONTINUE;
1622} 1700}
1623 1701
1624static int 1702static int
@@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1629 u64 msr_data; 1707 u64 msr_data;
1630 int usermode; 1708 int usermode;
1631 1709
1632 /* inject #UD if LOCK prefix is used */ 1710 /* inject #GP if in real mode or Virtual 8086 mode */
1633 if (c->lock_prefix) 1711 if (ctxt->mode == X86EMUL_MODE_REAL ||
1634 return -1; 1712 ctxt->mode == X86EMUL_MODE_VM86) {
1635
1636 /* inject #GP if in real mode or paging is disabled */
1637 if (ctxt->mode == X86EMUL_MODE_REAL
1638 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1639 kvm_inject_gp(ctxt->vcpu, 0);
1640 return -1;
1641 }
1642
1643 /* sysexit must be called from CPL 0 */
1644 if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
1645 kvm_inject_gp(ctxt->vcpu, 0); 1713 kvm_inject_gp(ctxt->vcpu, 0);
1646 return -1; 1714 return X86EMUL_UNHANDLEABLE;
1647 } 1715 }
1648 1716
1649 setup_syscalls_segments(ctxt, &cs, &ss); 1717 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1661 cs.selector = (u16)(msr_data + 16); 1729 cs.selector = (u16)(msr_data + 16);
1662 if ((msr_data & 0xfffc) == 0x0) { 1730 if ((msr_data & 0xfffc) == 0x0) {
1663 kvm_inject_gp(ctxt->vcpu, 0); 1731 kvm_inject_gp(ctxt->vcpu, 0);
1664 return -1; 1732 return X86EMUL_PROPAGATE_FAULT;
1665 } 1733 }
1666 ss.selector = (u16)(msr_data + 24); 1734 ss.selector = (u16)(msr_data + 24);
1667 break; 1735 break;
@@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1669 cs.selector = (u16)(msr_data + 32); 1737 cs.selector = (u16)(msr_data + 32);
1670 if (msr_data == 0x0) { 1738 if (msr_data == 0x0) {
1671 kvm_inject_gp(ctxt->vcpu, 0); 1739 kvm_inject_gp(ctxt->vcpu, 0);
1672 return -1; 1740 return X86EMUL_PROPAGATE_FAULT;
1673 } 1741 }
1674 ss.selector = cs.selector + 8; 1742 ss.selector = cs.selector + 8;
1675 cs.db = 0; 1743 cs.db = 0;
@@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1685 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 1753 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
1686 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 1754 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
1687 1755
1688 return 0; 1756 return X86EMUL_CONTINUE;
1757}
1758
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1760{
1761 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL)
1763 return false;
1764 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
1768}
1769
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1771 struct x86_emulate_ops *ops,
1772 u16 port, u16 len)
1773{
1774 struct kvm_segment tr_seg;
1775 int r;
1776 u16 io_bitmap_ptr;
1777 u8 perm, bit_idx = port & 0x7;
1778 unsigned mask = (1 << len) - 1;
1779
1780 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
1781 if (tr_seg.unusable)
1782 return false;
1783 if (tr_seg.limit < 103)
1784 return false;
1785 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
1786 NULL);
1787 if (r != X86EMUL_CONTINUE)
1788 return false;
1789 if (io_bitmap_ptr + port/8 > tr_seg.limit)
1790 return false;
1791 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
1792 ctxt->vcpu, NULL);
1793 if (r != X86EMUL_CONTINUE)
1794 return false;
1795 if ((perm >> bit_idx) & mask)
1796 return false;
1797 return true;
1798}
1799
1800static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops,
1802 u16 port, u16 len)
1803{
1804 if (emulator_bad_iopl(ctxt))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false;
1807 return true;
1689} 1808}
1690 1809
1691int 1810int
@@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1709 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1710 saved_eip = c->eip; 1829 saved_eip = c->eip;
1711 1830
1831 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done;
1835 }
1836
1837 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done;
1841 }
1842
1712 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) 1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1713 memop = c->modrm_ea; 1844 memop = c->modrm_ea;
1714 1845
@@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1749 &c->src.val, 1880 &c->src.val,
1750 c->src.bytes, 1881 c->src.bytes,
1751 ctxt->vcpu); 1882 ctxt->vcpu);
1752 if (rc != 0) 1883 if (rc != X86EMUL_CONTINUE)
1753 goto done; 1884 goto done;
1754 c->src.orig_val = c->src.val; 1885 c->src.orig_val = c->src.val;
1755 } 1886 }
@@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1768 c->dst.ptr = (void *)c->dst.ptr + 1899 c->dst.ptr = (void *)c->dst.ptr +
1769 (c->src.val & mask) / 8; 1900 (c->src.val & mask) / 8;
1770 } 1901 }
1771 if (!(c->d & Mov) && 1902 if (!(c->d & Mov)) {
1772 /* optimisation - avoid slow emulated read */ 1903 /* optimisation - avoid slow emulated read */
1773 ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1774 &c->dst.val, 1905 &c->dst.val,
1775 c->dst.bytes, ctxt->vcpu)) != 0)) 1906 c->dst.bytes,
1776 goto done; 1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1777 } 1911 }
1778 c->dst.orig_val = c->dst.val; 1912 c->dst.orig_val = c->dst.val;
1779 1913
@@ -1876,7 +2010,12 @@ special_insn:
1876 break; 2010 break;
1877 case 0x6c: /* insb */ 2011 case 0x6c: /* insb */
1878 case 0x6d: /* insw/insd */ 2012 case 0x6d: /* insw/insd */
1879 if (kvm_emulate_pio_string(ctxt->vcpu, 2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done;
2017 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu,
1880 1, 2019 1,
1881 (c->d & ByteOp) ? 1 : c->op_bytes, 2020 (c->d & ByteOp) ? 1 : c->op_bytes,
1882 c->rep_prefix ? 2021 c->rep_prefix ?
@@ -1892,6 +2031,11 @@ special_insn:
1892 return 0; 2031 return 0;
1893 case 0x6e: /* outsb */ 2032 case 0x6e: /* outsb */
1894 case 0x6f: /* outsw/outsd */ 2033 case 0x6f: /* outsw/outsd */
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done;
2038 }
1895 if (kvm_emulate_pio_string(ctxt->vcpu, 2039 if (kvm_emulate_pio_string(ctxt->vcpu,
1896 0, 2040 0,
1897 (c->d & ByteOp) ? 1 : c->op_bytes, 2041 (c->d & ByteOp) ? 1 : c->op_bytes,
@@ -1978,25 +2122,19 @@ special_insn:
1978 break; 2122 break;
1979 case 0x8e: { /* mov seg, r/m16 */ 2123 case 0x8e: { /* mov seg, r/m16 */
1980 uint16_t sel; 2124 uint16_t sel;
1981 int type_bits;
1982 int err;
1983 2125
1984 sel = c->src.val; 2126 sel = c->src.val;
1985 if (c->modrm_reg == VCPU_SREG_SS)
1986 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1987 2127
1988 if (c->modrm_reg <= 5) { 2128 if (c->modrm_reg == VCPU_SREG_CS ||
1989 type_bits = (c->modrm_reg == 1) ? 9 : 1; 2129 c->modrm_reg > VCPU_SREG_GS) {
1990 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 2130 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1991 type_bits, c->modrm_reg); 2131 goto done;
1992 } else {
1993 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1994 c->modrm);
1995 goto cannot_emulate;
1996 } 2132 }
1997 2133
1998 if (err < 0) 2134 if (c->modrm_reg == VCPU_SREG_SS)
1999 goto cannot_emulate; 2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
2136
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
2000 2138
2001 c->dst.type = OP_NONE; /* Disable writeback. */ 2139 c->dst.type = OP_NONE; /* Disable writeback. */
2002 break; 2140 break;
@@ -2025,7 +2163,10 @@ special_insn:
2025 c->dst.type = OP_REG; 2163 c->dst.type = OP_REG;
2026 c->dst.ptr = (unsigned long *) &ctxt->eflags; 2164 c->dst.ptr = (unsigned long *) &ctxt->eflags;
2027 c->dst.bytes = c->op_bytes; 2165 c->dst.bytes = c->op_bytes;
2028 goto pop_instruction; 2166 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2167 if (rc != X86EMUL_CONTINUE)
2168 goto done;
2169 break;
2029 case 0xa0 ... 0xa1: /* mov */ 2170 case 0xa0 ... 0xa1: /* mov */
2030 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2171 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2031 c->dst.val = c->src.val; 2172 c->dst.val = c->src.val;
@@ -2039,11 +2180,12 @@ special_insn:
2039 c->dst.ptr = (unsigned long *)register_address(c, 2180 c->dst.ptr = (unsigned long *)register_address(c,
2040 es_base(ctxt), 2181 es_base(ctxt),
2041 c->regs[VCPU_REGS_RDI]); 2182 c->regs[VCPU_REGS_RDI]);
2042 if ((rc = ops->read_emulated(register_address(c, 2183 rc = ops->read_emulated(register_address(c,
2043 seg_override_base(ctxt, c), 2184 seg_override_base(ctxt, c),
2044 c->regs[VCPU_REGS_RSI]), 2185 c->regs[VCPU_REGS_RSI]),
2045 &c->dst.val, 2186 &c->dst.val,
2046 c->dst.bytes, ctxt->vcpu)) != 0) 2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2047 goto done; 2189 goto done;
2048 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2049 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2058,10 +2200,11 @@ special_insn:
2058 c->src.ptr = (unsigned long *)register_address(c, 2200 c->src.ptr = (unsigned long *)register_address(c,
2059 seg_override_base(ctxt, c), 2201 seg_override_base(ctxt, c),
2060 c->regs[VCPU_REGS_RSI]); 2202 c->regs[VCPU_REGS_RSI]);
2061 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2062 &c->src.val, 2204 &c->src.val,
2063 c->src.bytes, 2205 c->src.bytes,
2064 ctxt->vcpu)) != 0) 2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2065 goto done; 2208 goto done;
2066 2209
2067 c->dst.type = OP_NONE; /* Disable writeback. */ 2210 c->dst.type = OP_NONE; /* Disable writeback. */
@@ -2069,10 +2212,11 @@ special_insn:
2069 c->dst.ptr = (unsigned long *)register_address(c, 2212 c->dst.ptr = (unsigned long *)register_address(c,
2070 es_base(ctxt), 2213 es_base(ctxt),
2071 c->regs[VCPU_REGS_RDI]); 2214 c->regs[VCPU_REGS_RDI]);
2072 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2073 &c->dst.val, 2216 &c->dst.val,
2074 c->dst.bytes, 2217 c->dst.bytes,
2075 ctxt->vcpu)) != 0) 2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2076 goto done; 2220 goto done;
2077 2221
2078 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -2102,12 +2246,13 @@ special_insn:
2102 c->dst.type = OP_REG; 2246 c->dst.type = OP_REG;
2103 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2104 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2105 if ((rc = ops->read_emulated(register_address(c, 2249 rc = ops->read_emulated(register_address(c,
2106 seg_override_base(ctxt, c), 2250 seg_override_base(ctxt, c),
2107 c->regs[VCPU_REGS_RSI]), 2251 c->regs[VCPU_REGS_RSI]),
2108 &c->dst.val, 2252 &c->dst.val,
2109 c->dst.bytes, 2253 c->dst.bytes,
2110 ctxt->vcpu)) != 0) 2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2111 goto done; 2256 goto done;
2112 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2113 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2163,11 +2308,9 @@ special_insn:
2163 case 0xe9: /* jmp rel */ 2308 case 0xe9: /* jmp rel */
2164 goto jmp; 2309 goto jmp;
2165 case 0xea: /* jmp far */ 2310 case 0xea: /* jmp far */
2166 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, 2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
2167 VCPU_SREG_CS) < 0) { 2312 VCPU_SREG_CS))
2168 DPRINTF("jmp far: Failed to load CS descriptor\n"); 2313 goto done;
2169 goto cannot_emulate;
2170 }
2171 2314
2172 c->eip = c->src.val; 2315 c->eip = c->src.val;
2173 break; 2316 break;
@@ -2185,7 +2328,13 @@ special_insn:
2185 case 0xef: /* out (e/r)ax,dx */ 2328 case 0xef: /* out (e/r)ax,dx */
2186 port = c->regs[VCPU_REGS_RDX]; 2329 port = c->regs[VCPU_REGS_RDX];
2187 io_dir_in = 0; 2330 io_dir_in = 0;
2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2331 do_io:
2332 if (!emulator_io_permited(ctxt, ops, port,
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done;
2336 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2189 (c->d & ByteOp) ? 1 : c->op_bytes, 2338 (c->d & ByteOp) ? 1 : c->op_bytes,
2190 port) != 0) { 2339 port) != 0) {
2191 c->eip = saved_eip; 2340 c->eip = saved_eip;
@@ -2210,13 +2359,21 @@ special_insn:
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2359 c->dst.type = OP_NONE; /* Disable writeback. */
2211 break; 2360 break;
2212 case 0xfa: /* cli */ 2361 case 0xfa: /* cli */
2213 ctxt->eflags &= ~X86_EFLAGS_IF; 2362 if (emulator_bad_iopl(ctxt))
2214 c->dst.type = OP_NONE; /* Disable writeback. */ 2363 kvm_inject_gp(ctxt->vcpu, 0);
2364 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF;
2366 c->dst.type = OP_NONE; /* Disable writeback. */
2367 }
2215 break; 2368 break;
2216 case 0xfb: /* sti */ 2369 case 0xfb: /* sti */
2217 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2370 if (emulator_bad_iopl(ctxt))
2218 ctxt->eflags |= X86_EFLAGS_IF; 2371 kvm_inject_gp(ctxt->vcpu, 0);
2219 c->dst.type = OP_NONE; /* Disable writeback. */ 2372 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */
2376 }
2220 break; 2377 break;
2221 case 0xfc: /* cld */ 2378 case 0xfc: /* cld */
2222 ctxt->eflags &= ~EFLG_DF; 2379 ctxt->eflags &= ~EFLG_DF;
@@ -2319,8 +2476,9 @@ twobyte_insn:
2319 } 2476 }
2320 break; 2477 break;
2321 case 0x05: /* syscall */ 2478 case 0x05: /* syscall */
2322 if (emulate_syscall(ctxt) == -1) 2479 rc = emulate_syscall(ctxt);
2323 goto cannot_emulate; 2480 if (rc != X86EMUL_CONTINUE)
2481 goto done;
2324 else 2482 else
2325 goto writeback; 2483 goto writeback;
2326 break; 2484 break;
@@ -2391,14 +2549,16 @@ twobyte_insn:
2391 c->dst.type = OP_NONE; 2549 c->dst.type = OP_NONE;
2392 break; 2550 break;
2393 case 0x34: /* sysenter */ 2551 case 0x34: /* sysenter */
2394 if (emulate_sysenter(ctxt) == -1) 2552 rc = emulate_sysenter(ctxt);
2395 goto cannot_emulate; 2553 if (rc != X86EMUL_CONTINUE)
2554 goto done;
2396 else 2555 else
2397 goto writeback; 2556 goto writeback;
2398 break; 2557 break;
2399 case 0x35: /* sysexit */ 2558 case 0x35: /* sysexit */
2400 if (emulate_sysexit(ctxt) == -1) 2559 rc = emulate_sysexit(ctxt);
2401 goto cannot_emulate; 2560 if (rc != X86EMUL_CONTINUE)
2561 goto done;
2402 else 2562 else
2403 goto writeback; 2563 goto writeback;
2404 break; 2564 break;
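
The emulate.c hunks above switch the syscall/sysenter/sysexit helpers from bare -1 returns to the X86EMUL_CONTINUE / X86EMUL_UNHANDLEABLE / X86EMUL_PROPAGATE_FAULT codes, add LOCK-prefix (#UD) and privileged-instruction (#GP) checks at the top of x86_emulate_insn() via the Lock and Priv decode flags, and gate emulated port I/O on emulator_bad_iopl() plus the TSS I/O permission bitmap walk in emulator_io_port_access_allowed(). The standalone sketch below models only that bitmap walk; struct fake_tss, the fixed 256-byte TSS and the io_allowed() name are stand-ins for illustration, not kernel or KVM APIs.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Simplified stand-in for the guest TSS: offset 102 holds the 16-bit
 * offset of the I/O permission bitmap, which then covers one bit per port. */
struct fake_tss {
    uint8_t bytes[256];
};

/* Returns true if all 'len' consecutive ports starting at 'port' are allowed,
 * i.e. their bits in the bitmap are clear.  Like the emulator helper above,
 * this only looks at a single bitmap byte. */
static bool io_allowed(const struct fake_tss *tss, uint32_t limit,
                       uint16_t port, int len)
{
    uint16_t iobase;
    uint8_t perm;
    unsigned mask = (1u << len) - 1;

    if (limit < 103)                      /* no room for the bitmap offset */
        return false;
    memcpy(&iobase, &tss->bytes[102], 2); /* bitmap offset inside the TSS */
    if (iobase + port / 8 > limit)        /* byte for this port is off-limits */
        return false;
    perm = tss->bytes[iobase + port / 8];
    return ((perm >> (port & 7)) & mask) == 0;
}

int main(void)
{
    struct fake_tss tss = { .bytes = { 0 } };
    uint16_t iobase = 104;

    memcpy(&tss.bytes[102], &iobase, 2);
    tss.bytes[iobase + 0x60 / 8] |= 1 << (0x60 & 7);  /* deny port 0x60 */

    printf("port 0x60: %s\n", io_allowed(&tss, 255, 0x60, 1) ? "ok" : "#GP");
    printf("port 0x61: %s\n", io_allowed(&tss, 255, 0x61, 1) ? "ok" : "#GP");
    return 0;
}

In the emulator itself this walk is only reached when emulator_bad_iopl() reports CPL above IOPL (or VM86 mode), which mirrors what the architecture specifies for IN/OUT/INS/OUTS.
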
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 296aba49472a..0150affad25d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -32,6 +32,7 @@
32#define pr_fmt(fmt) "pit: " fmt 32#define pr_fmt(fmt) "pit: " fmt
33 33
34#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
35#include <linux/slab.h>
35 36
36#include "irq.h" 37#include "irq.h"
37#include "i8254.h" 38#include "i8254.h"
@@ -242,11 +243,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
242{ 243{
243 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
244 irq_ack_notifier); 245 irq_ack_notifier);
245 spin_lock(&ps->inject_lock); 246 raw_spin_lock(&ps->inject_lock);
246 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 247 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
247 atomic_inc(&ps->pit_timer.pending); 248 atomic_inc(&ps->pit_timer.pending);
248 ps->irq_ack = 1; 249 ps->irq_ack = 1;
249 spin_unlock(&ps->inject_lock); 250 raw_spin_unlock(&ps->inject_lock);
250} 251}
251 252
252void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -467,6 +468,9 @@ static int pit_ioport_read(struct kvm_io_device *this,
467 return -EOPNOTSUPP; 468 return -EOPNOTSUPP;
468 469
469 addr &= KVM_PIT_CHANNEL_MASK; 470 addr &= KVM_PIT_CHANNEL_MASK;
471 if (addr == 3)
472 return 0;
473
470 s = &pit_state->channels[addr]; 474 s = &pit_state->channels[addr];
471 475
472 mutex_lock(&pit_state->lock); 476 mutex_lock(&pit_state->lock);
@@ -602,7 +606,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
602 .write = speaker_ioport_write, 606 .write = speaker_ioport_write,
603}; 607};
604 608
605/* Caller must have writers lock on slots_lock */ 609/* Caller must hold slots_lock */
606struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) 610struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
607{ 611{
608 struct kvm_pit *pit; 612 struct kvm_pit *pit;
@@ -621,7 +625,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
621 625
622 mutex_init(&pit->pit_state.lock); 626 mutex_init(&pit->pit_state.lock);
623 mutex_lock(&pit->pit_state.lock); 627 mutex_lock(&pit->pit_state.lock);
624 spin_lock_init(&pit->pit_state.inject_lock); 628 raw_spin_lock_init(&pit->pit_state.inject_lock);
625 629
626 kvm->arch.vpit = pit; 630 kvm->arch.vpit = pit;
627 pit->kvm = kvm; 631 pit->kvm = kvm;
@@ -642,13 +646,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
642 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 646 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
643 647
644 kvm_iodevice_init(&pit->dev, &pit_dev_ops); 648 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
645 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); 649 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
646 if (ret < 0) 650 if (ret < 0)
647 goto fail; 651 goto fail;
648 652
649 if (flags & KVM_PIT_SPEAKER_DUMMY) { 653 if (flags & KVM_PIT_SPEAKER_DUMMY) {
650 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); 654 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
651 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, 655 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
652 &pit->speaker_dev); 656 &pit->speaker_dev);
653 if (ret < 0) 657 if (ret < 0)
654 goto fail_unregister; 658 goto fail_unregister;
@@ -657,11 +661,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
657 return pit; 661 return pit;
658 662
659fail_unregister: 663fail_unregister:
660 __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); 664 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
661 665
662fail: 666fail:
663 if (pit->irq_source_id >= 0) 667 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
664 kvm_free_irq_source_id(kvm, pit->irq_source_id); 668 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
669 kvm_free_irq_source_id(kvm, pit->irq_source_id);
665 670
666 kfree(pit); 671 kfree(pit);
667 return NULL; 672 return NULL;
@@ -720,12 +725,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
720 /* Try to inject pending interrupts when 725 /* Try to inject pending interrupts when
721 * last one has been acked. 726 * last one has been acked.
722 */ 727 */
723 spin_lock(&ps->inject_lock); 728 raw_spin_lock(&ps->inject_lock);
724 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { 729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
725 ps->irq_ack = 0; 730 ps->irq_ack = 0;
726 inject = 1; 731 inject = 1;
727 } 732 }
728 spin_unlock(&ps->inject_lock); 733 raw_spin_unlock(&ps->inject_lock);
729 if (inject) 734 if (inject)
730 __inject_pit_timer_intr(kvm); 735 __inject_pit_timer_intr(kvm);
731 } 736 }
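
In the i8254.c hunks above, pit_state.inject_lock becomes a raw_spinlock_t, pit_ioport_read() now returns 0 for addr == 3 (the control word register at port 0x43 is write-only, so the read would otherwise index past the three channel slots), and the kvm_create_pit() failure path unregisters both notifiers and frees the irq source id before kfree(pit). That last change follows the usual reverse-order unwind idiom; below is a generic, non-KVM sketch of the shape, where the acquire()/release() helpers and the fail_step knob exist only for the demo.

#include <stdio.h>
#include <stdlib.h>

/* Toy resources standing in for irq source ids, notifiers, bus slots, ... */
static int acquire(const char *what)  { printf("acquire %s\n", what); return 0; }
static void release(const char *what) { printf("release %s\n", what); }

/* Generic "unwind in reverse order" error handling: each label releases
 * everything acquired before the step that failed, and nothing else. */
static void *create_thing(int fail_step)
{
    void *thing = malloc(1);

    if (!thing)
        return NULL;
    if (fail_step == 1 || acquire("irq source id") < 0)
        goto fail_free;
    if (fail_step == 2 || acquire("notifiers") < 0)
        goto fail_irq;
    if (fail_step == 3 || acquire("io bus slot") < 0)
        goto fail_notifiers;
    return thing;

fail_notifiers:
    release("notifiers");
fail_irq:
    release("irq source id");
fail_free:
    free(thing);
    return NULL;
}

int main(void)
{
    if (!create_thing(3))
        printf("step 3 failed, everything unwound\n");
    return 0;
}
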
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc09..900d6b0ba7c2 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 spinlock_t inject_lock; 30 raw_spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index d057c0cbd245..a790fa128a9f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,6 +26,7 @@
26 * Port from Qemu. 26 * Port from Qemu.
27 */ 27 */
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h>
29#include <linux/bitops.h> 30#include <linux/bitops.h>
30#include "irq.h" 31#include "irq.h"
31 32
@@ -44,18 +45,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
44 * Other interrupt may be delivered to PIC while lock is dropped but 45 * Other interrupt may be delivered to PIC while lock is dropped but
45 * it should be safe since PIC state is already updated at this stage. 46 * it should be safe since PIC state is already updated at this stage.
46 */ 47 */
47 spin_unlock(&s->pics_state->lock); 48 raw_spin_unlock(&s->pics_state->lock);
48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock); 50 raw_spin_lock(&s->pics_state->lock);
50} 51}
51 52
52void kvm_pic_clear_isr_ack(struct kvm *kvm) 53void kvm_pic_clear_isr_ack(struct kvm *kvm)
53{ 54{
54 struct kvm_pic *s = pic_irqchip(kvm); 55 struct kvm_pic *s = pic_irqchip(kvm);
55 spin_lock(&s->lock); 56
57 raw_spin_lock(&s->lock);
56 s->pics[0].isr_ack = 0xff; 58 s->pics[0].isr_ack = 0xff;
57 s->pics[1].isr_ack = 0xff; 59 s->pics[1].isr_ack = 0xff;
58 spin_unlock(&s->lock); 60 raw_spin_unlock(&s->lock);
59} 61}
60 62
61/* 63/*
@@ -156,9 +158,9 @@ static void pic_update_irq(struct kvm_pic *s)
156 158
157void kvm_pic_update_irq(struct kvm_pic *s) 159void kvm_pic_update_irq(struct kvm_pic *s)
158{ 160{
159 spin_lock(&s->lock); 161 raw_spin_lock(&s->lock);
160 pic_update_irq(s); 162 pic_update_irq(s);
161 spin_unlock(&s->lock); 163 raw_spin_unlock(&s->lock);
162} 164}
163 165
164int kvm_pic_set_irq(void *opaque, int irq, int level) 166int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -166,14 +168,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
166 struct kvm_pic *s = opaque; 168 struct kvm_pic *s = opaque;
167 int ret = -1; 169 int ret = -1;
168 170
169 spin_lock(&s->lock); 171 raw_spin_lock(&s->lock);
170 if (irq >= 0 && irq < PIC_NUM_PINS) { 172 if (irq >= 0 && irq < PIC_NUM_PINS) {
171 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
172 pic_update_irq(s); 174 pic_update_irq(s);
173 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
174 s->pics[irq >> 3].imr, ret == 0); 176 s->pics[irq >> 3].imr, ret == 0);
175 } 177 }
176 spin_unlock(&s->lock); 178 raw_spin_unlock(&s->lock);
177 179
178 return ret; 180 return ret;
179} 181}
@@ -203,7 +205,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
203 int irq, irq2, intno; 205 int irq, irq2, intno;
204 struct kvm_pic *s = pic_irqchip(kvm); 206 struct kvm_pic *s = pic_irqchip(kvm);
205 207
206 spin_lock(&s->lock); 208 raw_spin_lock(&s->lock);
207 irq = pic_get_irq(&s->pics[0]); 209 irq = pic_get_irq(&s->pics[0]);
208 if (irq >= 0) { 210 if (irq >= 0) {
209 pic_intack(&s->pics[0], irq); 211 pic_intack(&s->pics[0], irq);
@@ -228,7 +230,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
228 intno = s->pics[0].irq_base + irq; 230 intno = s->pics[0].irq_base + irq;
229 } 231 }
230 pic_update_irq(s); 232 pic_update_irq(s);
231 spin_unlock(&s->lock); 233 raw_spin_unlock(&s->lock);
232 234
233 return intno; 235 return intno;
234} 236}
@@ -442,7 +444,7 @@ static int picdev_write(struct kvm_io_device *this,
442 printk(KERN_ERR "PIC: non byte write\n"); 444 printk(KERN_ERR "PIC: non byte write\n");
443 return 0; 445 return 0;
444 } 446 }
445 spin_lock(&s->lock); 447 raw_spin_lock(&s->lock);
446 switch (addr) { 448 switch (addr) {
447 case 0x20: 449 case 0x20:
448 case 0x21: 450 case 0x21:
@@ -455,7 +457,7 @@ static int picdev_write(struct kvm_io_device *this,
455 elcr_ioport_write(&s->pics[addr & 1], addr, data); 457 elcr_ioport_write(&s->pics[addr & 1], addr, data);
456 break; 458 break;
457 } 459 }
458 spin_unlock(&s->lock); 460 raw_spin_unlock(&s->lock);
459 return 0; 461 return 0;
460} 462}
461 463
@@ -472,7 +474,7 @@ static int picdev_read(struct kvm_io_device *this,
472 printk(KERN_ERR "PIC: non byte read\n"); 474 printk(KERN_ERR "PIC: non byte read\n");
473 return 0; 475 return 0;
474 } 476 }
475 spin_lock(&s->lock); 477 raw_spin_lock(&s->lock);
476 switch (addr) { 478 switch (addr) {
477 case 0x20: 479 case 0x20:
478 case 0x21: 480 case 0x21:
@@ -486,7 +488,7 @@ static int picdev_read(struct kvm_io_device *this,
486 break; 488 break;
487 } 489 }
488 *(unsigned char *)val = data; 490 *(unsigned char *)val = data;
489 spin_unlock(&s->lock); 491 raw_spin_unlock(&s->lock);
490 return 0; 492 return 0;
491} 493}
492 494
@@ -520,7 +522,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
520 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 522 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
521 if (!s) 523 if (!s)
522 return NULL; 524 return NULL;
523 spin_lock_init(&s->lock); 525 raw_spin_lock_init(&s->lock);
524 s->kvm = kvm; 526 s->kvm = kvm;
525 s->pics[0].elcr_mask = 0xf8; 527 s->pics[0].elcr_mask = 0xf8;
526 s->pics[1].elcr_mask = 0xde; 528 s->pics[1].elcr_mask = 0xde;
@@ -533,7 +535,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
533 * Initialize PIO device 535 * Initialize PIO device
534 */ 536 */
535 kvm_iodevice_init(&s->dev, &picdev_ops); 537 kvm_iodevice_init(&s->dev, &picdev_ops);
536 ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); 538 mutex_lock(&kvm->slots_lock);
539 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
540 mutex_unlock(&kvm->slots_lock);
537 if (ret < 0) { 541 if (ret < 0) {
538 kfree(s); 542 kfree(s);
539 return NULL; 543 return NULL;
@@ -541,3 +545,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
541 545
542 return s; 546 return s;
543} 547}
548
549void kvm_destroy_pic(struct kvm *kvm)
550{
551 struct kvm_pic *vpic = kvm->arch.vpic;
552
553 if (vpic) {
554 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
555 kvm->arch.vpic = NULL;
556 kfree(vpic);
557 }
558}
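
The i8259.c changes apply the same raw_spinlock_t conversion to the PIC lock (these locks sit on the interrupt-injection path, so they presumably must remain true spinning locks even on kernels where spinlock_t can sleep), register the PIO device under kvm->slots_lock, and add kvm_destroy_pic(), which unregisters the device from KVM_PIO_BUS before freeing it. The toy program below sketches only that register/unregister symmetry; struct io_bus, struct vm and the function names are invented for the demo and are not KVM interfaces.

#include <stdio.h>
#include <stdlib.h>

/* Invented stand-ins for the demo; none of these are KVM types. */
struct io_bus { void *devs[8]; int count; };
struct pic    { int dummy; };
struct vm     { struct io_bus pio_bus; struct pic *vpic; };

static int bus_register(struct io_bus *bus, void *dev)
{
    if (bus->count >= 8)
        return -1;                      /* bus full */
    bus->devs[bus->count++] = dev;
    return 0;
}

static void bus_unregister(struct io_bus *bus, void *dev)
{
    for (int i = 0; i < bus->count; i++)
        if (bus->devs[i] == dev) {
            bus->devs[i] = bus->devs[--bus->count];
            return;
        }
}

static struct pic *create_pic(struct vm *vm)
{
    struct pic *s = calloc(1, sizeof(*s));

    if (!s)
        return NULL;
    if (bus_register(&vm->pio_bus, s) < 0) {
        free(s);                        /* registration failed: no leak */
        return NULL;
    }
    vm->vpic = s;
    return s;
}

/* Same shape as the new kvm_destroy_pic(): unregister from the bus,
 * clear the back-pointer, then free. */
static void destroy_pic(struct vm *vm)
{
    struct pic *s = vm->vpic;

    if (s) {
        bus_unregister(&vm->pio_bus, s);
        vm->vpic = NULL;
        free(s);
    }
}

int main(void)
{
    struct vm vm = { .pio_bus = { .count = 0 }, .vpic = NULL };

    create_pic(&vm);
    printf("devices on bus: %d\n", vm.pio_bus.count);
    destroy_pic(&vm);
    printf("devices on bus: %d\n", vm.pio_bus.count);
    return 0;
}
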
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index be399e207d57..34b15915754d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
62}; 62};
63 63
64struct kvm_pic { 64struct kvm_pic {
65 spinlock_t lock; 65 raw_spinlock_t lock;
66 unsigned pending_acks; 66 unsigned pending_acks;
67 struct kvm *kvm; 67 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -75,6 +75,7 @@ struct kvm_pic {
75}; 75};
76 76
77struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
78void kvm_destroy_pic(struct kvm *kvm);
78int kvm_pic_read_irq(struct kvm *kvm); 79int kvm_pic_read_irq(struct kvm *kvm);
79void kvm_pic_update_irq(struct kvm_pic *s); 80void kvm_pic_update_irq(struct kvm_pic *s);
80void kvm_pic_clear_isr_ack(struct kvm *kvm); 81void kvm_pic_clear_isr_ack(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a4403..cff851cf5322 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
1#ifndef ASM_KVM_CACHE_REGS_H 1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H 2#define ASM_KVM_CACHE_REGS_H
3 3
4#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
5#define KVM_POSSIBLE_CR4_GUEST_BITS \
6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
7 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
8
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 9static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg) 10 enum kvm_reg reg)
6{ 11{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38 return vcpu->arch.pdptrs[index]; 43 return vcpu->arch.pdptrs[index];
39} 44}
40 45
46static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
47{
48 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
49 if (tmask & vcpu->arch.cr0_guest_owned_bits)
50 kvm_x86_ops->decache_cr0_guest_bits(vcpu);
51 return vcpu->arch.cr0 & mask;
52}
53
54static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
55{
56 return kvm_read_cr0_bits(vcpu, ~0UL);
57}
58
59static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
60{
61 ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
62 if (tmask & vcpu->arch.cr4_guest_owned_bits)
63 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
64 return vcpu->arch.cr4 & mask;
65}
66
67static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
68{
69 return kvm_read_cr4_bits(vcpu, ~0UL);
70}
71
41#endif 72#endif
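
The new kvm_read_cr0()/kvm_read_cr4() helpers (and their _bits variants) read control registers through the cached vcpu->arch values and only call back into kvm_x86_ops->decache_cr*_guest_bits() when the caller asks about bits listed in KVM_POSSIBLE_CR*_GUEST_BITS that the guest currently owns, i.e. bits the guest may have changed without a VM exit. A small user-space model of that lazy-refresh pattern is sketched below; the struct, its field names and the decaches counter are made up for illustration.

#include <stdio.h>

#define CR0_TS (1ul << 3)
#define CR0_WP (1ul << 16)

/* Toy model: 'hw' is what the guest may have changed behind our back for
 * the bits it owns, 'cached' is the value last seen by the hypervisor. */
struct cr_cache {
    unsigned long cached;
    unsigned long hw;
    unsigned long guest_owned;   /* bits the guest may toggle without an exit */
    int decaches;                /* counts the expensive refreshes */
};

/* Refresh only the guest-owned bits from "hardware". */
static void decache(struct cr_cache *c)
{
    c->cached = (c->cached & ~c->guest_owned) | (c->hw & c->guest_owned);
    c->decaches++;
}

static unsigned long read_bits(struct cr_cache *c, unsigned long mask)
{
    if (mask & c->guest_owned)   /* pay for a refresh only when needed */
        decache(c);
    return c->cached & mask;
}

int main(void)
{
    struct cr_cache cr0 = {
        .cached = CR0_WP,
        .hw = CR0_WP | CR0_TS,   /* guest has set TS since we last looked */
        .guest_owned = CR0_TS,
    };

    /* WP is not guest-owned: answered from the cache, no refresh. */
    printf("WP=%d decaches=%d\n", read_bits(&cr0, CR0_WP) != 0, cr0.decaches);
    /* TS is guest-owned: forces a refresh before answering. */
    printf("TS=%d decaches=%d\n", read_bits(&cr0, CR0_TS) != 0, cr0.decaches);
    return 0;
}

This is why mmu.c's is_write_protection() below now goes through kvm_read_cr0_bits(vcpu, X86_CR0_WP): with KVM_POSSIBLE_CR0_GUEST_BITS limited to X86_CR0_TS, such reads stay cheap for bits the guest can never own.
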
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cd60c0bd1b32..1eb7a4ae0c9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -26,6 +26,7 @@
26#include <linux/io.h> 26#include <linux/io.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/slab.h>
29#include <asm/processor.h> 30#include <asm/processor.h>
30#include <asm/msr.h> 31#include <asm/msr.h>
31#include <asm/page.h> 32#include <asm/page.h>
@@ -373,6 +374,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
373 if (unlikely(!apic_enabled(apic))) 374 if (unlikely(!apic_enabled(apic)))
374 break; 375 break;
375 376
377 if (trig_mode) {
378 apic_debug("level trig mode for vector %d", vector);
379 apic_set_vector(vector, apic->regs + APIC_TMR);
380 } else
381 apic_clear_vector(vector, apic->regs + APIC_TMR);
382
376 result = !apic_test_and_set_irr(vector, apic); 383 result = !apic_test_and_set_irr(vector, apic);
377 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 384 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
378 trig_mode, vector, !result); 385 trig_mode, vector, !result);
@@ -383,11 +390,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
383 break; 390 break;
384 } 391 }
385 392
386 if (trig_mode) {
387 apic_debug("level trig mode for vector %d", vector);
388 apic_set_vector(vector, apic->regs + APIC_TMR);
389 } else
390 apic_clear_vector(vector, apic->regs + APIC_TMR);
391 kvm_vcpu_kick(vcpu); 393 kvm_vcpu_kick(vcpu);
392 break; 394 break;
393 395
@@ -1150,6 +1152,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1150 hrtimer_cancel(&apic->lapic_timer.timer); 1152 hrtimer_cancel(&apic->lapic_timer.timer);
1151 update_divide_count(apic); 1153 update_divide_count(apic);
1152 start_apic_timer(apic); 1154 start_apic_timer(apic);
1155 apic->irr_pending = true;
1153} 1156}
1154 1157
1155void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1158void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1244,3 +1247,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1244 1247
1245 return 0; 1248 return 0;
1246} 1249}
1250
1251int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1252{
1253 struct kvm_lapic *apic = vcpu->arch.apic;
1254
1255 if (!irqchip_in_kernel(vcpu->kvm))
1256 return 1;
1257
1258 /* if this is ICR write vector before command */
1259 if (reg == APIC_ICR)
1260 apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
1261 return apic_reg_write(apic, reg, (u32)data);
1262}
1263
1264int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1265{
1266 struct kvm_lapic *apic = vcpu->arch.apic;
1267 u32 low, high = 0;
1268
1269 if (!irqchip_in_kernel(vcpu->kvm))
1270 return 1;
1271
1272 if (apic_reg_read(apic, reg, 4, &low))
1273 return 1;
1274 if (reg == APIC_ICR)
1275 apic_reg_read(apic, APIC_ICR2, 4, &high);
1276
1277 *data = (((u64)high) << 32) | low;
1278
1279 return 0;
1280}
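
Besides moving the APIC_TMR update ahead of setting the IRR bit and kicking the vcpu, the lapic.c hunks add kvm_hv_vapic_msr_write()/read() for the Hyper-V APIC MSRs. The ordering comment is the interesting part: an ICR access is 64 bits wide at the MSR level, so the high half (the destination, APIC_ICR2) has to be written before the low half, because the write to APIC_ICR is what actually fires the IPI. A tiny stand-alone illustration of that split, with a fake register file, follows.

#include <stdint.h>
#include <stdio.h>

#define APIC_ICR  0x300
#define APIC_ICR2 0x310

/* Fake register file standing in for the local APIC page. */
static uint32_t regs[0x400 / 4];

static void apic_reg_write(uint32_t reg, uint32_t val)
{
    regs[reg / 4] = val;
}

/* Same ordering as the new kvm_hv_vapic_msr_write(): land the destination
 * (high half, ICR2) before the command (low half, ICR), since writing the
 * low half is what triggers delivery of the IPI. */
static void icr_msr_write(uint64_t data)
{
    apic_reg_write(APIC_ICR2, (uint32_t)(data >> 32));
    apic_reg_write(APIC_ICR,  (uint32_t)data);
}

int main(void)
{
    /* Destination APIC id 1 in the high half, vector 0xfe in the low half;
     * the exact encoding here is illustrative only. */
    icr_msr_write(((uint64_t)0x01000000 << 32) | 0x000000fe);
    printf("ICR2=%08x ICR=%08x\n", regs[APIC_ICR2 / 4], regs[APIC_ICR / 4]);
    return 0;
}
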
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4aa..f5fe32c5edad 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
48 48
49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); 49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
51
52int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
53int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
54
55static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
56{
57 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
58}
51#endif 59#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c3e5b2314cb..19a8906bcaa2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "x86.h"
21#include "kvm_cache_regs.h" 22#include "kvm_cache_regs.h"
22 23
23#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
@@ -29,6 +30,8 @@
29#include <linux/swap.h> 30#include <linux/swap.h>
30#include <linux/hugetlb.h> 31#include <linux/hugetlb.h>
31#include <linux/compiler.h> 32#include <linux/compiler.h>
33#include <linux/srcu.h>
34#include <linux/slab.h>
32 35
33#include <asm/page.h> 36#include <asm/page.h>
34#include <asm/cmpxchg.h> 37#include <asm/cmpxchg.h>
@@ -136,16 +139,6 @@ module_param(oos_shadow, bool, 0644);
136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 139#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
137 | PT64_NX_MASK) 140 | PT64_NX_MASK)
138 141
139#define PFERR_PRESENT_MASK (1U << 0)
140#define PFERR_WRITE_MASK (1U << 1)
141#define PFERR_USER_MASK (1U << 2)
142#define PFERR_RSVD_MASK (1U << 3)
143#define PFERR_FETCH_MASK (1U << 4)
144
145#define PT_PDPE_LEVEL 3
146#define PT_DIRECTORY_LEVEL 2
147#define PT_PAGE_TABLE_LEVEL 1
148
149#define RMAP_EXT 4 142#define RMAP_EXT 4
150 143
151#define ACC_EXEC_MASK 1 144#define ACC_EXEC_MASK 1
@@ -153,6 +146,9 @@ module_param(oos_shadow, bool, 0644);
153#define ACC_USER_MASK PT_USER_MASK 146#define ACC_USER_MASK PT_USER_MASK
154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 147#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
155 148
149#include <trace/events/kvm.h>
150
151#undef TRACE_INCLUDE_FILE
156#define CREATE_TRACE_POINTS 152#define CREATE_TRACE_POINTS
157#include "mmutrace.h" 153#include "mmutrace.h"
158 154
@@ -229,7 +225,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
229 225
230static int is_write_protection(struct kvm_vcpu *vcpu) 226static int is_write_protection(struct kvm_vcpu *vcpu)
231{ 227{
232 return vcpu->arch.cr0 & X86_CR0_WP; 228 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
233} 229}
234 230
235static int is_cpuid_PSE36(void) 231static int is_cpuid_PSE36(void)
@@ -239,7 +235,7 @@ static int is_cpuid_PSE36(void)
239 235
240static int is_nx(struct kvm_vcpu *vcpu) 236static int is_nx(struct kvm_vcpu *vcpu)
241{ 237{
242 return vcpu->arch.shadow_efer & EFER_NX; 238 return vcpu->arch.efer & EFER_NX;
243} 239}
244 240
245static int is_shadow_present_pte(u64 pte) 241static int is_shadow_present_pte(u64 pte)
@@ -253,7 +249,7 @@ static int is_large_pte(u64 pte)
253 return pte & PT_PAGE_SIZE_MASK; 249 return pte & PT_PAGE_SIZE_MASK;
254} 250}
255 251
256static int is_writeble_pte(unsigned long pte) 252static int is_writable_pte(unsigned long pte)
257{ 253{
258 return pte & PT_WRITABLE_MASK; 254 return pte & PT_WRITABLE_MASK;
259} 255}
@@ -470,24 +466,10 @@ static int has_wrprotected_page(struct kvm *kvm,
470 466
471static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 467static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
472{ 468{
473 unsigned long page_size = PAGE_SIZE; 469 unsigned long page_size;
474 struct vm_area_struct *vma;
475 unsigned long addr;
476 int i, ret = 0; 470 int i, ret = 0;
477 471
478 addr = gfn_to_hva(kvm, gfn); 472 page_size = kvm_host_page_size(kvm, gfn);
479 if (kvm_is_error_hva(addr))
480 return page_size;
481
482 down_read(&current->mm->mmap_sem);
483 vma = find_vma(current->mm, addr);
484 if (!vma)
485 goto out;
486
487 page_size = vma_kernel_pagesize(vma);
488
489out:
490 up_read(&current->mm->mmap_sem);
491 473
492 for (i = PT_PAGE_TABLE_LEVEL; 474 for (i = PT_PAGE_TABLE_LEVEL;
493 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { 475 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -503,8 +485,7 @@ out:
503static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 485static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
504{ 486{
505 struct kvm_memory_slot *slot; 487 struct kvm_memory_slot *slot;
506 int host_level; 488 int host_level, level, max_level;
507 int level = PT_PAGE_TABLE_LEVEL;
508 489
509 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 490 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
510 if (slot && slot->dirty_bitmap) 491 if (slot && slot->dirty_bitmap)
@@ -515,11 +496,12 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
515 if (host_level == PT_PAGE_TABLE_LEVEL) 496 if (host_level == PT_PAGE_TABLE_LEVEL)
516 return host_level; 497 return host_level;
517 498
518 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { 499 max_level = kvm_x86_ops->get_lpage_level() < host_level ?
500 kvm_x86_ops->get_lpage_level() : host_level;
519 501
502 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
520 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 503 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
521 break; 504 break;
522 }
523 505
524 return level - 1; 506 return level - 1;
525} 507}
@@ -635,7 +617,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
635 pfn = spte_to_pfn(*spte); 617 pfn = spte_to_pfn(*spte);
636 if (*spte & shadow_accessed_mask) 618 if (*spte & shadow_accessed_mask)
637 kvm_set_pfn_accessed(pfn); 619 kvm_set_pfn_accessed(pfn);
638 if (is_writeble_pte(*spte)) 620 if (is_writable_pte(*spte))
639 kvm_set_pfn_dirty(pfn); 621 kvm_set_pfn_dirty(pfn);
640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 622 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
641 if (!*rmapp) { 623 if (!*rmapp) {
@@ -664,6 +646,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
664 prev_desc = desc; 646 prev_desc = desc;
665 desc = desc->more; 647 desc = desc->more;
666 } 648 }
649 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
667 BUG(); 650 BUG();
668 } 651 }
669} 652}
@@ -710,7 +693,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
710 BUG_ON(!spte); 693 BUG_ON(!spte);
711 BUG_ON(!(*spte & PT_PRESENT_MASK)); 694 BUG_ON(!(*spte & PT_PRESENT_MASK));
712 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 695 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
713 if (is_writeble_pte(*spte)) { 696 if (is_writable_pte(*spte)) {
714 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 697 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
715 write_protected = 1; 698 write_protected = 1;
716 } 699 }
@@ -734,7 +717,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
734 BUG_ON(!(*spte & PT_PRESENT_MASK)); 717 BUG_ON(!(*spte & PT_PRESENT_MASK));
735 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 718 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
736 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 719 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
737 if (is_writeble_pte(*spte)) { 720 if (is_writable_pte(*spte)) {
738 rmap_remove(kvm, spte); 721 rmap_remove(kvm, spte);
739 --kvm->stat.lpages; 722 --kvm->stat.lpages;
740 __set_spte(spte, shadow_trap_nonpresent_pte); 723 __set_spte(spte, shadow_trap_nonpresent_pte);
@@ -789,7 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
789 772
790 new_spte &= ~PT_WRITABLE_MASK; 773 new_spte &= ~PT_WRITABLE_MASK;
791 new_spte &= ~SPTE_HOST_WRITEABLE; 774 new_spte &= ~SPTE_HOST_WRITEABLE;
792 if (is_writeble_pte(*spte)) 775 if (is_writable_pte(*spte))
793 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 776 kvm_set_pfn_dirty(spte_to_pfn(*spte));
794 __set_spte(spte, new_spte); 777 __set_spte(spte, new_spte);
795 spte = rmap_next(kvm, rmapp, spte); 778 spte = rmap_next(kvm, rmapp, spte);
@@ -807,35 +790,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
807 unsigned long data)) 790 unsigned long data))
808{ 791{
809 int i, j; 792 int i, j;
793 int ret;
810 int retval = 0; 794 int retval = 0;
795 struct kvm_memslots *slots;
811 796
812 /* 797 slots = rcu_dereference(kvm->memslots);
813 * If mmap_sem isn't taken, we can look the memslots with only 798
814 * the mmu_lock by skipping over the slots with userspace_addr == 0. 799 for (i = 0; i < slots->nmemslots; i++) {
815 */ 800 struct kvm_memory_slot *memslot = &slots->memslots[i];
816 for (i = 0; i < kvm->nmemslots; i++) {
817 struct kvm_memory_slot *memslot = &kvm->memslots[i];
818 unsigned long start = memslot->userspace_addr; 801 unsigned long start = memslot->userspace_addr;
819 unsigned long end; 802 unsigned long end;
820 803
821 /* mmu_lock protects userspace_addr */
822 if (!start)
823 continue;
824
825 end = start + (memslot->npages << PAGE_SHIFT); 804 end = start + (memslot->npages << PAGE_SHIFT);
826 if (hva >= start && hva < end) { 805 if (hva >= start && hva < end) {
827 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 806 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
828 807
829 retval |= handler(kvm, &memslot->rmap[gfn_offset], 808 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
830 data);
831 809
832 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 810 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
833 int idx = gfn_offset; 811 int idx = gfn_offset;
834 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 812 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
835 retval |= handler(kvm, 813 ret |= handler(kvm,
836 &memslot->lpage_info[j][idx].rmap_pde, 814 &memslot->lpage_info[j][idx].rmap_pde,
837 data); 815 data);
838 } 816 }
817 trace_kvm_age_page(hva, memslot, ret);
818 retval |= ret;
839 } 819 }
840 } 820 }
841 821
@@ -858,9 +838,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
858 u64 *spte; 838 u64 *spte;
859 int young = 0; 839 int young = 0;
860 840
861 /* always return old for EPT */ 841 /*
842 * Emulate the accessed bit for EPT, by checking if this page has
843 * an EPT mapping, and clearing it if it does. On the next access,
844 * a new EPT mapping will be established.
845 * This has some overhead, but not as much as the cost of swapping
846 * out actively used pages or breaking up actively used hugepages.
847 */
862 if (!shadow_accessed_mask) 848 if (!shadow_accessed_mask)
863 return 0; 849 return kvm_unmap_rmapp(kvm, rmapp, data);
864 850
865 spte = rmap_next(kvm, rmapp, NULL); 851 spte = rmap_next(kvm, rmapp, NULL);
866 while (spte) { 852 while (spte) {
@@ -1504,8 +1490,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1504 for_each_sp(pages, sp, parents, i) { 1490 for_each_sp(pages, sp, parents, i) {
1505 kvm_mmu_zap_page(kvm, sp); 1491 kvm_mmu_zap_page(kvm, sp);
1506 mmu_pages_clear_parents(&parents); 1492 mmu_pages_clear_parents(&parents);
1493 zapped++;
1507 } 1494 }
1508 zapped += pages.nr;
1509 kvm_mmu_pages_init(parent, &parents, &pages); 1495 kvm_mmu_pages_init(parent, &parents, &pages);
1510 } 1496 }
1511 1497
@@ -1556,14 +1542,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1556 */ 1542 */
1557 1543
1558 if (used_pages > kvm_nr_mmu_pages) { 1544 if (used_pages > kvm_nr_mmu_pages) {
1559 while (used_pages > kvm_nr_mmu_pages) { 1545 while (used_pages > kvm_nr_mmu_pages &&
1546 !list_empty(&kvm->arch.active_mmu_pages)) {
1560 struct kvm_mmu_page *page; 1547 struct kvm_mmu_page *page;
1561 1548
1562 page = container_of(kvm->arch.active_mmu_pages.prev, 1549 page = container_of(kvm->arch.active_mmu_pages.prev,
1563 struct kvm_mmu_page, link); 1550 struct kvm_mmu_page, link);
1564 kvm_mmu_zap_page(kvm, page); 1551 used_pages -= kvm_mmu_zap_page(kvm, page);
1565 used_pages--; 1552 used_pages--;
1566 } 1553 }
1554 kvm_nr_mmu_pages = used_pages;
1567 kvm->arch.n_free_mmu_pages = 0; 1555 kvm->arch.n_free_mmu_pages = 0;
1568 } 1556 }
1569 else 1557 else
@@ -1610,14 +1598,15 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1610 && !sp->role.invalid) { 1598 && !sp->role.invalid) {
1611 pgprintk("%s: zap %lx %x\n", 1599 pgprintk("%s: zap %lx %x\n",
1612 __func__, gfn, sp->role.word); 1600 __func__, gfn, sp->role.word);
1613 kvm_mmu_zap_page(kvm, sp); 1601 if (kvm_mmu_zap_page(kvm, sp))
1602 nn = bucket->first;
1614 } 1603 }
1615 } 1604 }
1616} 1605}
1617 1606
1618static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1607static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1619{ 1608{
1620 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1609 int slot = memslot_id(kvm, gfn);
1621 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1610 struct kvm_mmu_page *sp = page_header(__pa(pte));
1622 1611
1623 __set_bit(slot, sp->slot_bitmap); 1612 __set_bit(slot, sp->slot_bitmap);
@@ -1641,7 +1630,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1641{ 1630{
1642 struct page *page; 1631 struct page *page;
1643 1632
1644 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 1633 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1645 1634
1646 if (gpa == UNMAPPED_GVA) 1635 if (gpa == UNMAPPED_GVA)
1647 return NULL; 1636 return NULL;
@@ -1854,7 +1843,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1854 * is responsibility of mmu_get_page / kvm_sync_page. 1843 * is responsibility of mmu_get_page / kvm_sync_page.
1855 * Same reasoning can be applied to dirty page accounting. 1844 * Same reasoning can be applied to dirty page accounting.
1856 */ 1845 */
1857 if (!can_unsync && is_writeble_pte(*sptep)) 1846 if (!can_unsync && is_writable_pte(*sptep))
1858 goto set_pte; 1847 goto set_pte;
1859 1848
1860 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1849 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1862,7 +1851,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1862 __func__, gfn); 1851 __func__, gfn);
1863 ret = 1; 1852 ret = 1;
1864 pte_access &= ~ACC_WRITE_MASK; 1853 pte_access &= ~ACC_WRITE_MASK;
1865 if (is_writeble_pte(spte)) 1854 if (is_writable_pte(spte))
1866 spte &= ~PT_WRITABLE_MASK; 1855 spte &= ~PT_WRITABLE_MASK;
1867 } 1856 }
1868 } 1857 }
@@ -1883,7 +1872,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1883 bool reset_host_protection) 1872 bool reset_host_protection)
1884{ 1873{
1885 int was_rmapped = 0; 1874 int was_rmapped = 0;
1886 int was_writeble = is_writeble_pte(*sptep); 1875 int was_writable = is_writable_pte(*sptep);
1887 int rmap_count; 1876 int rmap_count;
1888 1877
1889 pgprintk("%s: spte %llx access %x write_fault %d" 1878 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1934,7 +1923,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1934 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1923 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1935 rmap_recycle(vcpu, sptep, gfn); 1924 rmap_recycle(vcpu, sptep, gfn);
1936 } else { 1925 } else {
1937 if (was_writeble) 1926 if (was_writable)
1938 kvm_release_pfn_dirty(pfn); 1927 kvm_release_pfn_dirty(pfn);
1939 else 1928 else
1940 kvm_release_pfn_clean(pfn); 1929 kvm_release_pfn_clean(pfn);
@@ -2164,8 +2153,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2164 spin_unlock(&vcpu->kvm->mmu_lock); 2153 spin_unlock(&vcpu->kvm->mmu_lock);
2165} 2154}
2166 2155
2167static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2156static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2157 u32 access, u32 *error)
2168{ 2158{
2159 if (error)
2160 *error = 0;
2169 return vaddr; 2161 return vaddr;
2170} 2162}
2171 2163
@@ -2749,7 +2741,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2749 if (tdp_enabled) 2741 if (tdp_enabled)
2750 return 0; 2742 return 0;
2751 2743
2752 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2744 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2753 2745
2754 spin_lock(&vcpu->kvm->mmu_lock); 2746 spin_lock(&vcpu->kvm->mmu_lock);
2755 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2747 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2849,16 +2841,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2849 */ 2841 */
2850 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 2842 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2851 if (!page) 2843 if (!page)
2852 goto error_1; 2844 return -ENOMEM;
2845
2853 vcpu->arch.mmu.pae_root = page_address(page); 2846 vcpu->arch.mmu.pae_root = page_address(page);
2854 for (i = 0; i < 4; ++i) 2847 for (i = 0; i < 4; ++i)
2855 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2848 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2856 2849
2857 return 0; 2850 return 0;
2858
2859error_1:
2860 free_mmu_pages(vcpu);
2861 return -ENOMEM;
2862} 2851}
2863 2852
2864int kvm_mmu_create(struct kvm_vcpu *vcpu) 2853int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2938,10 +2927,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2938 spin_lock(&kvm_lock); 2927 spin_lock(&kvm_lock);
2939 2928
2940 list_for_each_entry(kvm, &vm_list, vm_list) { 2929 list_for_each_entry(kvm, &vm_list, vm_list) {
2941 int npages; 2930 int npages, idx;
2942 2931
2943 if (!down_read_trylock(&kvm->slots_lock)) 2932 idx = srcu_read_lock(&kvm->srcu);
2944 continue;
2945 spin_lock(&kvm->mmu_lock); 2933 spin_lock(&kvm->mmu_lock);
2946 npages = kvm->arch.n_alloc_mmu_pages - 2934 npages = kvm->arch.n_alloc_mmu_pages -
2947 kvm->arch.n_free_mmu_pages; 2935 kvm->arch.n_free_mmu_pages;
@@ -2954,7 +2942,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2954 nr_to_scan--; 2942 nr_to_scan--;
2955 2943
2956 spin_unlock(&kvm->mmu_lock); 2944 spin_unlock(&kvm->mmu_lock);
2957 up_read(&kvm->slots_lock); 2945 srcu_read_unlock(&kvm->srcu, idx);
2958 } 2946 }
2959 if (kvm_freed) 2947 if (kvm_freed)
2960 list_move_tail(&kvm_freed->vm_list, &vm_list); 2948 list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -3021,9 +3009,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3021 int i; 3009 int i;
3022 unsigned int nr_mmu_pages; 3010 unsigned int nr_mmu_pages;
3023 unsigned int nr_pages = 0; 3011 unsigned int nr_pages = 0;
3012 struct kvm_memslots *slots;
3024 3013
3025 for (i = 0; i < kvm->nmemslots; i++) 3014 slots = rcu_dereference(kvm->memslots);
3026 nr_pages += kvm->memslots[i].npages; 3015 for (i = 0; i < slots->nmemslots; i++)
3016 nr_pages += slots->memslots[i].npages;
3027 3017
3028 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 3018 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3029 nr_mmu_pages = max(nr_mmu_pages, 3019 nr_mmu_pages = max(nr_mmu_pages,
@@ -3248,7 +3238,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3248 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) 3238 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3249 audit_mappings_page(vcpu, ent, va, level - 1); 3239 audit_mappings_page(vcpu, ent, va, level - 1);
3250 else { 3240 else {
3251 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3241 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3252 gfn_t gfn = gpa >> PAGE_SHIFT; 3242 gfn_t gfn = gpa >> PAGE_SHIFT;
3253 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3243 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3254 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3244 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3293,10 +3283,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3293static int count_rmaps(struct kvm_vcpu *vcpu) 3283static int count_rmaps(struct kvm_vcpu *vcpu)
3294{ 3284{
3295 int nmaps = 0; 3285 int nmaps = 0;
3296 int i, j, k; 3286 int i, j, k, idx;
3297 3287
3288 idx = srcu_read_lock(&kvm->srcu);
3289 slots = rcu_dereference(kvm->memslots);
3298 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3290 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3299 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; 3291 struct kvm_memory_slot *m = &slots->memslots[i];
3300 struct kvm_rmap_desc *d; 3292 struct kvm_rmap_desc *d;
3301 3293
3302 for (j = 0; j < m->npages; ++j) { 3294 for (j = 0; j < m->npages; ++j) {
@@ -3319,6 +3311,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3319 } 3311 }
3320 } 3312 }
3321 } 3313 }
3314 srcu_read_unlock(&kvm->srcu, idx);
3322 return nmaps; 3315 return nmaps;
3323} 3316}
3324 3317
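
Most of the mmu.c churn above is mechanical (is_writeble_pte renamed to is_writable_pte, memslots reached through rcu_dereference(kvm->memslots), srcu instead of slots_lock in the shrinker), but the kvm_age_rmapp() change deserves a note: when there is no hardware accessed bit (shadow_accessed_mask == 0, as with EPT on that hardware), page aging now falls back to unmapping the page so the next guest access faults and rebuilds the mapping, instead of always reporting the page as old. The toy model below captures just that idea; struct spte, hw_accessed_bit and the "young" convention are simplifications of mine, not KVM's data structures.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of a single shadow/EPT mapping, for illustration only. */
struct spte {
    bool present;
    bool accessed;            /* only meaningful when hw_accessed_bit is set */
};

static bool hw_accessed_bit;  /* false models EPT without an A bit */

/* "Age" one mapping: report whether it looked recently used, and arrange to
 * learn about the next use.  Without a hardware accessed bit the only way to
 * do that is to drop the mapping and let the next access fault it back in -
 * some overhead, but cheaper than evicting a page that is actually hot
 * (this mirrors the comment added in kvm_age_rmapp()). */
static bool age_spte(struct spte *s)
{
    if (!hw_accessed_bit) {
        bool was_mapped = s->present;

        s->present = false;           /* next access re-faults and remaps */
        return was_mapped;            /* toy convention: mapped == young */
    }
    if (s->accessed) {
        s->accessed = false;          /* clear it, hardware sets it again */
        return true;
    }
    return false;
}

int main(void)
{
    struct spte s = { .present = true, .accessed = true };

    hw_accessed_bit = false;
    printf("young=%d, still mapped=%d\n", age_spte(&s), s.present);
    return 0;
}
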
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b49..be66759321a5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
2#define __KVM_X86_MMU_H 2#define __KVM_X86_MMU_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6#define PT64_PT_BITS 9 7#define PT64_PT_BITS 9
7#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) 8#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
37#define PT32_ROOT_LEVEL 2 38#define PT32_ROOT_LEVEL 2
38#define PT32E_ROOT_LEVEL 3 39#define PT32E_ROOT_LEVEL 3
39 40
41#define PT_PDPE_LEVEL 3
42#define PT_DIRECTORY_LEVEL 2
43#define PT_PAGE_TABLE_LEVEL 1
44
45#define PFERR_PRESENT_MASK (1U << 0)
46#define PFERR_WRITE_MASK (1U << 1)
47#define PFERR_USER_MASK (1U << 2)
48#define PFERR_RSVD_MASK (1U << 3)
49#define PFERR_FETCH_MASK (1U << 4)
50
40int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
41 52
42static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
53 return kvm_mmu_load(vcpu); 64 return kvm_mmu_load(vcpu);
54} 65}
55 66
56static inline int is_long_mode(struct kvm_vcpu *vcpu)
57{
58#ifdef CONFIG_X86_64
59 return vcpu->arch.shadow_efer & EFER_LMA;
60#else
61 return 0;
62#endif
63}
64
65static inline int is_pae(struct kvm_vcpu *vcpu)
66{
67 return vcpu->arch.cr4 & X86_CR4_PAE;
68}
69
70static inline int is_pse(struct kvm_vcpu *vcpu)
71{
72 return vcpu->arch.cr4 & X86_CR4_PSE;
73}
74
75static inline int is_paging(struct kvm_vcpu *vcpu)
76{
77 return vcpu->arch.cr0 & X86_CR0_PG;
78}
79
80static inline int is_present_gpte(unsigned long pte) 67static inline int is_present_gpte(unsigned long pte)
81{ 68{
82 return pte & PT_PRESENT_MASK; 69 return pte & PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a6017132fba8..81eab9a50e6a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -150,7 +150,9 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 150 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 151 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 152
153 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); 153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
154 goto not_present;
155
154 trace_kvm_mmu_paging_element(pte, walker->level); 156 trace_kvm_mmu_paging_element(pte, walker->level);
155 157
156 if (!is_present_gpte(pte)) 158 if (!is_present_gpte(pte))
@@ -160,7 +162,7 @@ walk:
160 if (rsvd_fault) 162 if (rsvd_fault)
161 goto access_error; 163 goto access_error;
162 164
163 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writable_pte(pte))
164 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
165 goto access_error; 167 goto access_error;
166 168
@@ -455,8 +457,6 @@ out_unlock:
455static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
456{ 458{
457 struct kvm_shadow_walk_iterator iterator; 459 struct kvm_shadow_walk_iterator iterator;
458 pt_element_t gpte;
459 gpa_t pte_gpa = -1;
460 int level; 460 int level;
461 u64 *sptep; 461 u64 *sptep;
462 int need_flush = 0; 462 int need_flush = 0;
@@ -470,10 +470,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
470 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
473 struct kvm_mmu_page *sp = page_header(__pa(sptep));
474
475 pte_gpa = (sp->gfn << PAGE_SHIFT);
476 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
477 473
478 if (is_shadow_present_pte(*sptep)) { 474 if (is_shadow_present_pte(*sptep)) {
479 rmap_remove(vcpu->kvm, sptep); 475 rmap_remove(vcpu->kvm, sptep);
@@ -492,32 +488,25 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
492 if (need_flush) 488 if (need_flush)
493 kvm_flush_remote_tlbs(vcpu->kvm); 489 kvm_flush_remote_tlbs(vcpu->kvm);
494 spin_unlock(&vcpu->kvm->mmu_lock); 490 spin_unlock(&vcpu->kvm->mmu_lock);
495
496 if (pte_gpa == -1)
497 return;
498 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
499 sizeof(pt_element_t)))
500 return;
501 if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
502 if (mmu_topup_memory_caches(vcpu))
503 return;
504 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
505 sizeof(pt_element_t), 0);
506 }
507} 491}
508 492
509static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
494 u32 *error)
510{ 495{
511 struct guest_walker walker; 496 struct guest_walker walker;
512 gpa_t gpa = UNMAPPED_GVA; 497 gpa_t gpa = UNMAPPED_GVA;
513 int r; 498 int r;
514 499
515 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); 500 r = FNAME(walk_addr)(&walker, vcpu, vaddr,
501 !!(access & PFERR_WRITE_MASK),
502 !!(access & PFERR_USER_MASK),
503 !!(access & PFERR_FETCH_MASK));
516 504
517 if (r) { 505 if (r) {
518 gpa = gfn_to_gpa(walker.gfn); 506 gpa = gfn_to_gpa(walker.gfn);
519 gpa |= vaddr & ~PAGE_MASK; 507 gpa |= vaddr & ~PAGE_MASK;
520 } 508 } else if (error)
509 *error = walker.error_code;
521 510
522 return gpa; 511 return gpa;
523} 512}
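
FNAME(gva_to_gpa) above (and its nonpaging counterpart in mmu.c) now takes a PFERR_*-style access mask plus an optional error pointer instead of hard-coded zero write/user/fetch arguments, and walk failures propagate walker.error_code back to the caller. The fragment below only shows how such an access mask is built and decoded with the PFERR_* bits this series moves into arch/x86/kvm/mmu.h; build_access() is a hypothetical helper, not something the patch adds.

#include <stdint.h>
#include <stdio.h>

/* Bit values as moved into arch/x86/kvm/mmu.h by this series. */
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)
#define PFERR_USER_MASK    (1U << 2)
#define PFERR_RSVD_MASK    (1U << 3)
#define PFERR_FETCH_MASK   (1U << 4)

/* Hypothetical helper: pack the separate write/user/fetch flags a caller
 * used to pass into the single 'access' argument gva_to_gpa() now takes. */
static uint32_t build_access(int write, int user, int fetch)
{
    uint32_t access = 0;

    if (write)
        access |= PFERR_WRITE_MASK;
    if (user)
        access |= PFERR_USER_MASK;
    if (fetch)
        access |= PFERR_FETCH_MASK;
    return access;
}

int main(void)
{
    uint32_t access = build_access(1, 1, 0);   /* a user-mode write */

    /* The walker unpacks the mask exactly like the new walk_addr() call:
     * !!(access & PFERR_WRITE_MASK) and so on. */
    printf("access=%#x write=%d user=%d fetch=%d\n", (unsigned)access,
           !!(access & PFERR_WRITE_MASK),
           !!(access & PFERR_USER_MASK),
           !!(access & PFERR_FETCH_MASK));
    return 0;
}
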
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d9b33843c80..2ba58206812a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -26,6 +26,7 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h>
29 30
30#include <asm/desc.h> 31#include <asm/desc.h>
31 32
@@ -231,7 +232,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
231 efer &= ~EFER_LME; 232 efer &= ~EFER_LME;
232 233
233 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 234 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
234 vcpu->arch.shadow_efer = efer; 235 vcpu->arch.efer = efer;
235} 236}
236 237
237static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -540,6 +541,8 @@ static void init_vmcb(struct vcpu_svm *svm)
540 struct vmcb_control_area *control = &svm->vmcb->control; 541 struct vmcb_control_area *control = &svm->vmcb->control;
541 struct vmcb_save_area *save = &svm->vmcb->save; 542 struct vmcb_save_area *save = &svm->vmcb->save;
542 543
544 svm->vcpu.fpu_active = 1;
545
543 control->intercept_cr_read = INTERCEPT_CR0_MASK | 546 control->intercept_cr_read = INTERCEPT_CR0_MASK |
544 INTERCEPT_CR3_MASK | 547 INTERCEPT_CR3_MASK |
545 INTERCEPT_CR4_MASK; 548 INTERCEPT_CR4_MASK;
@@ -552,13 +555,19 @@ static void init_vmcb(struct vcpu_svm *svm)
552 control->intercept_dr_read = INTERCEPT_DR0_MASK | 555 control->intercept_dr_read = INTERCEPT_DR0_MASK |
553 INTERCEPT_DR1_MASK | 556 INTERCEPT_DR1_MASK |
554 INTERCEPT_DR2_MASK | 557 INTERCEPT_DR2_MASK |
555 INTERCEPT_DR3_MASK; 558 INTERCEPT_DR3_MASK |
559 INTERCEPT_DR4_MASK |
560 INTERCEPT_DR5_MASK |
561 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK;
556 563
557 control->intercept_dr_write = INTERCEPT_DR0_MASK | 564 control->intercept_dr_write = INTERCEPT_DR0_MASK |
558 INTERCEPT_DR1_MASK | 565 INTERCEPT_DR1_MASK |
559 INTERCEPT_DR2_MASK | 566 INTERCEPT_DR2_MASK |
560 INTERCEPT_DR3_MASK | 567 INTERCEPT_DR3_MASK |
568 INTERCEPT_DR4_MASK |
561 INTERCEPT_DR5_MASK | 569 INTERCEPT_DR5_MASK |
570 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 571 INTERCEPT_DR7_MASK;
563 572
564 control->intercept_exceptions = (1 << PF_VECTOR) | 573 control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -569,6 +578,7 @@ static void init_vmcb(struct vcpu_svm *svm)
569 control->intercept = (1ULL << INTERCEPT_INTR) | 578 control->intercept = (1ULL << INTERCEPT_INTR) |
570 (1ULL << INTERCEPT_NMI) | 579 (1ULL << INTERCEPT_NMI) |
571 (1ULL << INTERCEPT_SMI) | 580 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) |
572 (1ULL << INTERCEPT_CPUID) | 582 (1ULL << INTERCEPT_CPUID) |
573 (1ULL << INTERCEPT_INVD) | 583 (1ULL << INTERCEPT_INVD) |
574 (1ULL << INTERCEPT_HLT) | 584 (1ULL << INTERCEPT_HLT) |
@@ -641,10 +651,8 @@ static void init_vmcb(struct vcpu_svm *svm)
641 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 651 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
642 (1ULL << INTERCEPT_INVLPG)); 652 (1ULL << INTERCEPT_INVLPG));
643 control->intercept_exceptions &= ~(1 << PF_VECTOR); 653 control->intercept_exceptions &= ~(1 << PF_VECTOR);
644 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 654 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
645 INTERCEPT_CR3_MASK); 655 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
646 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
647 INTERCEPT_CR3_MASK);
648 save->g_pat = 0x0007040600070406ULL; 656 save->g_pat = 0x0007040600070406ULL;
649 save->cr3 = 0; 657 save->cr3 = 0;
650 save->cr4 = 0; 658 save->cr4 = 0;
@@ -698,29 +706,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
698 if (err) 706 if (err)
699 goto free_svm; 707 goto free_svm;
700 708
709 err = -ENOMEM;
701 page = alloc_page(GFP_KERNEL); 710 page = alloc_page(GFP_KERNEL);
702 if (!page) { 711 if (!page)
703 err = -ENOMEM;
704 goto uninit; 712 goto uninit;
705 }
706 713
707 err = -ENOMEM;
708 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 714 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
709 if (!msrpm_pages) 715 if (!msrpm_pages)
710 goto uninit; 716 goto free_page1;
711 717
712 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 718 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
713 if (!nested_msrpm_pages) 719 if (!nested_msrpm_pages)
714 goto uninit; 720 goto free_page2;
715
716 svm->msrpm = page_address(msrpm_pages);
717 svm_vcpu_init_msrpm(svm->msrpm);
718 721
719 hsave_page = alloc_page(GFP_KERNEL); 722 hsave_page = alloc_page(GFP_KERNEL);
720 if (!hsave_page) 723 if (!hsave_page)
721 goto uninit; 724 goto free_page3;
725
722 svm->nested.hsave = page_address(hsave_page); 726 svm->nested.hsave = page_address(hsave_page);
723 727
728 svm->msrpm = page_address(msrpm_pages);
729 svm_vcpu_init_msrpm(svm->msrpm);
730
724 svm->nested.msrpm = page_address(nested_msrpm_pages); 731 svm->nested.msrpm = page_address(nested_msrpm_pages);
725 732
726 svm->vmcb = page_address(page); 733 svm->vmcb = page_address(page);
@@ -730,13 +737,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
730 init_vmcb(svm); 737 init_vmcb(svm);
731 738
732 fx_init(&svm->vcpu); 739 fx_init(&svm->vcpu);
733 svm->vcpu.fpu_active = 1;
734 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 740 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
735 if (kvm_vcpu_is_bsp(&svm->vcpu)) 741 if (kvm_vcpu_is_bsp(&svm->vcpu))
736 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 742 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
737 743
738 return &svm->vcpu; 744 return &svm->vcpu;
739 745
746free_page3:
747 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
748free_page2:
749 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
750free_page1:
751 __free_page(page);
740uninit: 752uninit:
741 kvm_vcpu_uninit(&svm->vcpu); 753 kvm_vcpu_uninit(&svm->vcpu);
742free_svm: 754free_svm:
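
[Illustrative sketch] The hunk above replaces a single "goto uninit" (which leaked earlier allocations) with per-allocation unwind labels. A minimal user-space model of that pattern, with invented names and malloc standing in for page allocation:

/* unwind_demo.c - models the goto-based error unwinding adopted by
 * svm_create_vcpu() above: each failure label frees only what was already
 * allocated, in reverse order.  Names and sizes are illustrative only.
 */
#include <stdlib.h>
#include <stdio.h>

static int create_resources(void)
{
	void *page, *msrpm, *nested_msrpm;

	page = malloc(4096);
	if (!page)
		goto fail;

	msrpm = malloc(8192);
	if (!msrpm)
		goto free_page1;

	nested_msrpm = malloc(8192);
	if (!nested_msrpm)
		goto free_page2;

	printf("all allocations succeeded\n");
	free(nested_msrpm);
	free(msrpm);
	free(page);
	return 0;

free_page2:
	free(msrpm);
free_page1:
	free(page);
fail:
	return -1;   /* the real code reports -ENOMEM here */
}

int main(void)
{
	return create_resources() ? 1 : 0;
}
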
@@ -765,14 +777,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
765 if (unlikely(cpu != vcpu->cpu)) { 777 if (unlikely(cpu != vcpu->cpu)) {
766 u64 delta; 778 u64 delta;
767 779
768 /* 780 if (check_tsc_unstable()) {
769 * Make sure that the guest sees a monotonically 781 /*
770 * increasing TSC. 782 * Make sure that the guest sees a monotonically
771 */ 783 * increasing TSC.
772 delta = vcpu->arch.host_tsc - native_read_tsc(); 784 */
773 svm->vmcb->control.tsc_offset += delta; 785 delta = vcpu->arch.host_tsc - native_read_tsc();
774 if (is_nested(svm)) 786 svm->vmcb->control.tsc_offset += delta;
775 svm->nested.hsave->control.tsc_offset += delta; 787 if (is_nested(svm))
788 svm->nested.hsave->control.tsc_offset += delta;
789 }
776 vcpu->cpu = cpu; 790 vcpu->cpu = cpu;
777 kvm_migrate_timers(vcpu); 791 kvm_migrate_timers(vcpu);
778 svm->asid_generation = 0; 792 svm->asid_generation = 0;
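
[Illustrative sketch] The vcpu_load change above only compensates the VMCB tsc_offset when the host TSC is unstable across CPUs. The arithmetic is simple; this user-space model (with made-up values) shows why shifting the offset by (old host TSC - new host TSC) keeps guest_tsc = host_tsc + offset monotonic after a CPU migration.

/* tsc_offset_demo.c - arithmetic behind the conditional tsc_offset
 * adjustment in svm_vcpu_load() above.  All values are invented.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t offset   = 1000;     /* guest-visible TSC offset            */
	uint64_t old_host = 500000;   /* host TSC saved when the vcpu left   */
	uint64_t new_host = 490000;   /* TSC of the CPU we migrated onto     */

	uint64_t guest_before = old_host + offset;

	/* same adjustment as: svm->vmcb->control.tsc_offset += delta */
	offset += old_host - new_host;

	uint64_t guest_after = new_host + offset;

	printf("guest TSC before migration: %llu\n",
	       (unsigned long long)guest_before);
	printf("guest TSC after migration:  %llu (monotonic)\n",
	       (unsigned long long)guest_after);
	return 0;
}
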
@@ -954,42 +968,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
954 svm->vmcb->save.gdtr.base = dt->base ; 968 svm->vmcb->save.gdtr.base = dt->base ;
955} 969}
956 970
971static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
972{
973}
974
957static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 975static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
958{ 976{
959} 977}
960 978
979static void update_cr0_intercept(struct vcpu_svm *svm)
980{
981 ulong gcr0 = svm->vcpu.arch.cr0;
982 u64 *hcr0 = &svm->vmcb->save.cr0;
983
984 if (!svm->vcpu.fpu_active)
985 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
986 else
987 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
988 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
989
990
991 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
992 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
993 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
994 } else {
995 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
996 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
997 }
998}
999
961static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1000static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
962{ 1001{
963 struct vcpu_svm *svm = to_svm(vcpu); 1002 struct vcpu_svm *svm = to_svm(vcpu);
964 1003
965#ifdef CONFIG_X86_64 1004#ifdef CONFIG_X86_64
966 if (vcpu->arch.shadow_efer & EFER_LME) { 1005 if (vcpu->arch.efer & EFER_LME) {
967 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1006 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
968 vcpu->arch.shadow_efer |= EFER_LMA; 1007 vcpu->arch.efer |= EFER_LMA;
969 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1008 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
970 } 1009 }
971 1010
972 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1011 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
973 vcpu->arch.shadow_efer &= ~EFER_LMA; 1012 vcpu->arch.efer &= ~EFER_LMA;
974 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1013 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
975 } 1014 }
976 } 1015 }
977#endif 1016#endif
978 if (npt_enabled) 1017 vcpu->arch.cr0 = cr0;
979 goto set;
980 1018
981 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 1019 if (!npt_enabled)
982 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1020 cr0 |= X86_CR0_PG | X86_CR0_WP;
983 vcpu->fpu_active = 1;
984 }
985 1021
986 vcpu->arch.cr0 = cr0; 1022 if (!vcpu->fpu_active)
987 cr0 |= X86_CR0_PG | X86_CR0_WP;
988 if (!vcpu->fpu_active) {
989 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
990 cr0 |= X86_CR0_TS; 1023 cr0 |= X86_CR0_TS;
991 }
992set:
993 /* 1024 /*
994 * re-enable caching here because the QEMU bios 1025 * re-enable caching here because the QEMU bios
995 * does not do it - this results in some delay at 1026 * does not do it - this results in some delay at
@@ -997,6 +1028,7 @@ set:
997 */ 1028 */
998 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1029 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
999 svm->vmcb->save.cr0 = cr0; 1030 svm->vmcb->save.cr0 = cr0;
1031 update_cr0_intercept(svm);
1000} 1032}
1001 1033
1002static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1034static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
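
[Illustrative sketch] update_cr0_intercept(), added above, drops the CR0 read/write intercepts whenever the guest-visible CR0 already matches the CR0 the hardware runs with and the FPU is active. A simplified user-space model of that predicate follows; SELECTIVE_MASK is an assumed stand-in for SVM_CR0_SELECTIVE_MASK, and the hcr0 handling is reduced to the essentials.

/* cr0_intercept_demo.c - models the decision made by update_cr0_intercept():
 * CR0 accesses only need intercepting while the guest-visible CR0 differs
 * from the effective hardware CR0, or while the FPU is lazily deactivated.
 */
#include <stdio.h>
#include <stdbool.h>

#define CR0_MP (1ul << 1)
#define CR0_TS (1ul << 3)
#define SELECTIVE_MASK (CR0_TS | CR0_MP)   /* assumed stand-in */

static bool need_cr0_intercept(unsigned long gcr0, bool fpu_active)
{
	unsigned long hcr0 = gcr0;

	if (!fpu_active)
		hcr0 |= SELECTIVE_MASK;   /* hardware keeps TS/MP forced on */

	return !(gcr0 == hcr0 && fpu_active);
}

int main(void)
{
	printf("fpu active, clean CR0  -> intercept: %d\n",
	       need_cr0_intercept(0x80000011ul, true));
	printf("fpu lazily deactivated -> intercept: %d\n",
	       need_cr0_intercept(0x80000011ul, false));
	return 0;
}
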
@@ -1102,76 +1134,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1102 svm->vmcb->control.asid = sd->next_asid++; 1134 svm->vmcb->control.asid = sd->next_asid++;
1103} 1135}
1104 1136
1105static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 1137static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
1106{ 1138{
1107 struct vcpu_svm *svm = to_svm(vcpu); 1139 struct vcpu_svm *svm = to_svm(vcpu);
1108 unsigned long val;
1109 1140
1110 switch (dr) { 1141 switch (dr) {
1111 case 0 ... 3: 1142 case 0 ... 3:
1112 val = vcpu->arch.db[dr]; 1143 *dest = vcpu->arch.db[dr];
1113 break; 1144 break;
1145 case 4:
1146 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1147 return EMULATE_FAIL; /* will re-inject UD */
1148 /* fall through */
1114 case 6: 1149 case 6:
1115 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1150 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1116 val = vcpu->arch.dr6; 1151 *dest = vcpu->arch.dr6;
1117 else 1152 else
1118 val = svm->vmcb->save.dr6; 1153 *dest = svm->vmcb->save.dr6;
1119 break; 1154 break;
1155 case 5:
1156 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1157 return EMULATE_FAIL; /* will re-inject UD */
1158 /* fall through */
1120 case 7: 1159 case 7:
1121 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1160 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1122 val = vcpu->arch.dr7; 1161 *dest = vcpu->arch.dr7;
1123 else 1162 else
1124 val = svm->vmcb->save.dr7; 1163 *dest = svm->vmcb->save.dr7;
1125 break; 1164 break;
1126 default:
1127 val = 0;
1128 } 1165 }
1129 1166
1130 return val; 1167 return EMULATE_DONE;
1131} 1168}
1132 1169
1133static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 1170static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1134 int *exception)
1135{ 1171{
1136 struct vcpu_svm *svm = to_svm(vcpu); 1172 struct vcpu_svm *svm = to_svm(vcpu);
1137 1173
1138 *exception = 0;
1139
1140 switch (dr) { 1174 switch (dr) {
1141 case 0 ... 3: 1175 case 0 ... 3:
1142 vcpu->arch.db[dr] = value; 1176 vcpu->arch.db[dr] = value;
1143 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 1177 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1144 vcpu->arch.eff_db[dr] = value; 1178 vcpu->arch.eff_db[dr] = value;
1145 return; 1179 break;
1146 case 4 ... 5: 1180 case 4:
1147 if (vcpu->arch.cr4 & X86_CR4_DE) 1181 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1148 *exception = UD_VECTOR; 1182 return EMULATE_FAIL; /* will re-inject UD */
1149 return; 1183 /* fall through */
1150 case 6: 1184 case 6:
1151 if (value & 0xffffffff00000000ULL) {
1152 *exception = GP_VECTOR;
1153 return;
1154 }
1155 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; 1185 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1156 return; 1186 break;
1187 case 5:
1188 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1189 return EMULATE_FAIL; /* will re-inject UD */
1190 /* fall through */
1157 case 7: 1191 case 7:
1158 if (value & 0xffffffff00000000ULL) {
1159 *exception = GP_VECTOR;
1160 return;
1161 }
1162 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; 1192 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1163 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 1193 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1164 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1194 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1165 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); 1195 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1166 } 1196 }
1167 return; 1197 break;
1168 default:
1169 /* FIXME: Possible case? */
1170 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1171 __func__, dr);
1172 *exception = UD_VECTOR;
1173 return;
1174 } 1198 }
1199
1200 return EMULATE_DONE;
1175} 1201}
1176 1202
1177static int pf_interception(struct vcpu_svm *svm) 1203static int pf_interception(struct vcpu_svm *svm)
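
[Illustrative sketch] The new svm_get_dr()/svm_set_dr() above (and the VMX handle_dr() path later in this diff) implement the architectural DR4/DR5 aliasing rule: with CR4.DE clear they alias DR6/DR7, with CR4.DE set they raise #UD, which the EMULATE_FAIL return lets the caller re-inject. A small stand-alone model:

/* dr_alias_demo.c - the debug-register aliasing rule used above.
 * Return values model EMULATE_DONE/EMULATE_FAIL.
 */
#include <stdio.h>
#include <stdbool.h>

enum { EMULATE_DONE = 0, EMULATE_FAIL = 1 };

static int resolve_dr(int dr, bool cr4_de, int *effective)
{
	switch (dr) {
	case 4:
	case 5:
		if (cr4_de)
			return EMULATE_FAIL;    /* caller re-injects #UD */
		dr += 2;                        /* alias: DR4->DR6, DR5->DR7 */
		/* fall through */
	default:
		*effective = dr;
		return EMULATE_DONE;
	}
}

int main(void)
{
	int eff;

	if (resolve_dr(4, false, &eff) == EMULATE_DONE)
		printf("CR4.DE=0: DR4 aliases DR%d\n", eff);
	if (resolve_dr(5, true, &eff) == EMULATE_FAIL)
		printf("CR4.DE=1: DR5 access -> #UD\n");
	return 0;
}
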
@@ -1239,13 +1265,17 @@ static int ud_interception(struct vcpu_svm *svm)
1239 return 1; 1265 return 1;
1240} 1266}
1241 1267
1242static int nm_interception(struct vcpu_svm *svm) 1268static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1243{ 1269{
1270 struct vcpu_svm *svm = to_svm(vcpu);
1244 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1271 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1245 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
1246 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
1247 svm->vcpu.fpu_active = 1; 1272 svm->vcpu.fpu_active = 1;
1273 update_cr0_intercept(svm);
1274}
1248 1275
1276static int nm_interception(struct vcpu_svm *svm)
1277{
1278 svm_fpu_activate(&svm->vcpu);
1249 return 1; 1279 return 1;
1250} 1280}
1251 1281
@@ -1337,7 +1367,7 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1337 1367
1338static int nested_svm_check_permissions(struct vcpu_svm *svm) 1368static int nested_svm_check_permissions(struct vcpu_svm *svm)
1339{ 1369{
1340 if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) 1370 if (!(svm->vcpu.arch.efer & EFER_SVME)
1341 || !is_paging(&svm->vcpu)) { 1371 || !is_paging(&svm->vcpu)) {
1342 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1372 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1343 return 1; 1373 return 1;
@@ -1740,8 +1770,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1740 hsave->save.ds = vmcb->save.ds; 1770 hsave->save.ds = vmcb->save.ds;
1741 hsave->save.gdtr = vmcb->save.gdtr; 1771 hsave->save.gdtr = vmcb->save.gdtr;
1742 hsave->save.idtr = vmcb->save.idtr; 1772 hsave->save.idtr = vmcb->save.idtr;
1743 hsave->save.efer = svm->vcpu.arch.shadow_efer; 1773 hsave->save.efer = svm->vcpu.arch.efer;
1744 hsave->save.cr0 = svm->vcpu.arch.cr0; 1774 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
1745 hsave->save.cr4 = svm->vcpu.arch.cr4; 1775 hsave->save.cr4 = svm->vcpu.arch.cr4;
1746 hsave->save.rflags = vmcb->save.rflags; 1776 hsave->save.rflags = vmcb->save.rflags;
1747 hsave->save.rip = svm->next_rip; 1777 hsave->save.rip = svm->next_rip;
@@ -2153,9 +2183,10 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2153 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2183 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2154 u64 data; 2184 u64 data;
2155 2185
2156 if (svm_get_msr(&svm->vcpu, ecx, &data)) 2186 if (svm_get_msr(&svm->vcpu, ecx, &data)) {
2187 trace_kvm_msr_read_ex(ecx);
2157 kvm_inject_gp(&svm->vcpu, 0); 2188 kvm_inject_gp(&svm->vcpu, 0);
2158 else { 2189 } else {
2159 trace_kvm_msr_read(ecx, data); 2190 trace_kvm_msr_read(ecx, data);
2160 2191
2161 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 2192 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2247,13 +2278,15 @@ static int wrmsr_interception(struct vcpu_svm *svm)
2247 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2278 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2248 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2279 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2249 2280
2250 trace_kvm_msr_write(ecx, data);
2251 2281
2252 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2282 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2253 if (svm_set_msr(&svm->vcpu, ecx, data)) 2283 if (svm_set_msr(&svm->vcpu, ecx, data)) {
2284 trace_kvm_msr_write_ex(ecx, data);
2254 kvm_inject_gp(&svm->vcpu, 0); 2285 kvm_inject_gp(&svm->vcpu, 0);
2255 else 2286 } else {
2287 trace_kvm_msr_write(ecx, data);
2256 skip_emulated_instruction(&svm->vcpu); 2288 skip_emulated_instruction(&svm->vcpu);
2289 }
2257 return 1; 2290 return 1;
2258} 2291}
2259 2292
@@ -2297,7 +2330,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2297 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2330 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2298 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2331 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2299 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2332 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2300 /* for now: */ 2333 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2301 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2334 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2302 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2335 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2303 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2336 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2306,11 +2339,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2306 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2339 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2307 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2340 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2308 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2341 [SVM_EXIT_READ_DR3] = emulate_on_interception,
2342 [SVM_EXIT_READ_DR4] = emulate_on_interception,
2343 [SVM_EXIT_READ_DR5] = emulate_on_interception,
2344 [SVM_EXIT_READ_DR6] = emulate_on_interception,
2345 [SVM_EXIT_READ_DR7] = emulate_on_interception,
2309 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 2346 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
2310 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 2347 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
2311 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 2348 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
2312 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 2349 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
2350 [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
2313 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 2351 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
2352 [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
2314 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 2353 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
2315 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2354 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2316 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2355 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2383,20 +2422,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2383 2422
2384 svm_complete_interrupts(svm); 2423 svm_complete_interrupts(svm);
2385 2424
2386 if (npt_enabled) { 2425 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2387 int mmu_reload = 0;
2388 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
2389 svm_set_cr0(vcpu, svm->vmcb->save.cr0);
2390 mmu_reload = 1;
2391 }
2392 vcpu->arch.cr0 = svm->vmcb->save.cr0; 2426 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2427 if (npt_enabled)
2393 vcpu->arch.cr3 = svm->vmcb->save.cr3; 2428 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2394 if (mmu_reload) {
2395 kvm_mmu_reset_context(vcpu);
2396 kvm_mmu_load(vcpu);
2397 }
2398 }
2399
2400 2429
2401 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2430 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2402 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2431 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2798,12 +2827,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
2798 2827
2799 svm->vmcb->save.cr3 = root; 2828 svm->vmcb->save.cr3 = root;
2800 force_new_asid(vcpu); 2829 force_new_asid(vcpu);
2801
2802 if (vcpu->fpu_active) {
2803 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
2804 svm->vmcb->save.cr0 |= X86_CR0_TS;
2805 vcpu->fpu_active = 0;
2806 }
2807} 2830}
2808 2831
2809static int is_disabled(void) 2832static int is_disabled(void)
@@ -2852,6 +2875,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2852 return 0; 2875 return 0;
2853} 2876}
2854 2877
2878static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2879{
2880}
2881
2855static const struct trace_print_flags svm_exit_reasons_str[] = { 2882static const struct trace_print_flags svm_exit_reasons_str[] = {
2856 { SVM_EXIT_READ_CR0, "read_cr0" }, 2883 { SVM_EXIT_READ_CR0, "read_cr0" },
2857 { SVM_EXIT_READ_CR3, "read_cr3" }, 2884 { SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2905,9 +2932,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
2905 { -1, NULL } 2932 { -1, NULL }
2906}; 2933};
2907 2934
2908static bool svm_gb_page_enable(void) 2935static int svm_get_lpage_level(void)
2909{ 2936{
2910 return true; 2937 return PT_PDPE_LEVEL;
2938}
2939
2940static bool svm_rdtscp_supported(void)
2941{
2942 return false;
2943}
2944
2945static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2946{
2947 struct vcpu_svm *svm = to_svm(vcpu);
2948
2949 update_cr0_intercept(svm);
2950 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
2911} 2951}
2912 2952
2913static struct kvm_x86_ops svm_x86_ops = { 2953static struct kvm_x86_ops svm_x86_ops = {
@@ -2936,6 +2976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2936 .set_segment = svm_set_segment, 2976 .set_segment = svm_set_segment,
2937 .get_cpl = svm_get_cpl, 2977 .get_cpl = svm_get_cpl,
2938 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 2978 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
2979 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
2939 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 2980 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
2940 .set_cr0 = svm_set_cr0, 2981 .set_cr0 = svm_set_cr0,
2941 .set_cr3 = svm_set_cr3, 2982 .set_cr3 = svm_set_cr3,
@@ -2950,6 +2991,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2950 .cache_reg = svm_cache_reg, 2991 .cache_reg = svm_cache_reg,
2951 .get_rflags = svm_get_rflags, 2992 .get_rflags = svm_get_rflags,
2952 .set_rflags = svm_set_rflags, 2993 .set_rflags = svm_set_rflags,
2994 .fpu_activate = svm_fpu_activate,
2995 .fpu_deactivate = svm_fpu_deactivate,
2953 2996
2954 .tlb_flush = svm_flush_tlb, 2997 .tlb_flush = svm_flush_tlb,
2955 2998
@@ -2975,7 +3018,11 @@ static struct kvm_x86_ops svm_x86_ops = {
2975 .get_mt_mask = svm_get_mt_mask, 3018 .get_mt_mask = svm_get_mt_mask,
2976 3019
2977 .exit_reasons_str = svm_exit_reasons_str, 3020 .exit_reasons_str = svm_exit_reasons_str,
2978 .gb_page_enable = svm_gb_page_enable, 3021 .get_lpage_level = svm_get_lpage_level,
3022
3023 .cpuid_update = svm_cpuid_update,
3024
3025 .rdtscp_supported = svm_rdtscp_supported,
2979}; 3026};
2980 3027
2981static int __init svm_init(void) 3028static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 816e0449db0b..6ad30a29f044 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
56); 56);
57 57
58/* 58/*
59 * Tracepoint for hypercall.
60 */
61TRACE_EVENT(kvm_hv_hypercall,
62 TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
63 __u64 ingpa, __u64 outgpa),
64 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
65
66 TP_STRUCT__entry(
67 __field( __u16, code )
68 __field( bool, fast )
69 __field( __u16, rep_cnt )
70 __field( __u16, rep_idx )
71 __field( __u64, ingpa )
72 __field( __u64, outgpa )
73 ),
74
75 TP_fast_assign(
76 __entry->code = code;
77 __entry->fast = fast;
78 __entry->rep_cnt = rep_cnt;
79 __entry->rep_idx = rep_idx;
80 __entry->ingpa = ingpa;
81 __entry->outgpa = outgpa;
82 ),
83
84 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
85 __entry->code, __entry->fast ? "fast" : "slow",
86 __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
87 __entry->outgpa)
88);
89
90/*
59 * Tracepoint for PIO. 91 * Tracepoint for PIO.
60 */ 92 */
61TRACE_EVENT(kvm_pio, 93TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
214 * Tracepoint for guest MSR access. 246 * Tracepoint for guest MSR access.
215 */ 247 */
216TRACE_EVENT(kvm_msr, 248TRACE_EVENT(kvm_msr,
217 TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), 249 TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
218 TP_ARGS(rw, ecx, data), 250 TP_ARGS(write, ecx, data, exception),
219 251
220 TP_STRUCT__entry( 252 TP_STRUCT__entry(
221 __field( unsigned int, rw ) 253 __field( unsigned, write )
222 __field( unsigned int, ecx ) 254 __field( u32, ecx )
223 __field( unsigned long, data ) 255 __field( u64, data )
256 __field( u8, exception )
224 ), 257 ),
225 258
226 TP_fast_assign( 259 TP_fast_assign(
227 __entry->rw = rw; 260 __entry->write = write;
228 __entry->ecx = ecx; 261 __entry->ecx = ecx;
229 __entry->data = data; 262 __entry->data = data;
263 __entry->exception = exception;
230 ), 264 ),
231 265
232 TP_printk("msr_%s %x = 0x%lx", 266 TP_printk("msr_%s %x = 0x%llx%s",
233 __entry->rw ? "write" : "read", 267 __entry->write ? "write" : "read",
234 __entry->ecx, __entry->data) 268 __entry->ecx, __entry->data,
269 __entry->exception ? " (#GP)" : "")
235); 270);
236 271
237#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) 272#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
238#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) 273#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
274#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
275#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
239 276
240/* 277/*
241 * Tracepoint for guest CR access. 278 * Tracepoint for guest CR access.
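
[Illustrative sketch] The extended kvm_msr tracepoint above adds an exception flag so that faulting RDMSR/WRMSR attempts show up in the trace instead of disappearing. This user-space rendering reproduces the TP_printk format and the four wrapper macros; printf stands in for the tracing machinery.

/* msr_trace_demo.c - user-space rendering of the extended kvm_msr
 * tracepoint format, including the "(#GP)" suffix emitted by the new
 * trace_kvm_msr_{read,write}_ex() variants.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static void trace_kvm_msr(unsigned write, uint32_t ecx, uint64_t data,
			  bool exception)
{
	printf("msr_%s %x = 0x%llx%s\n",
	       write ? "write" : "read", ecx,
	       (unsigned long long)data,
	       exception ? " (#GP)" : "");
}

#define trace_kvm_msr_read(ecx, data)     trace_kvm_msr(0, ecx, data, false)
#define trace_kvm_msr_write(ecx, data)    trace_kvm_msr(1, ecx, data, false)
#define trace_kvm_msr_read_ex(ecx)        trace_kvm_msr(0, ecx, 0, true)
#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)

int main(void)
{
	trace_kvm_msr_read(0xc0000080, 0x500);   /* successful RDMSR      */
	trace_kvm_msr_write_ex(0xdeadbeef, 0);   /* WRMSR that raised #GP */
	return 0;
}
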
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4918d6fc924..bc933cfb4e66 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h>
29#include "kvm_cache_regs.h" 30#include "kvm_cache_regs.h"
30#include "x86.h" 31#include "x86.h"
31 32
@@ -61,6 +62,23 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 62static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 63module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 64
65#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
66 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
67#define KVM_GUEST_CR0_MASK \
68 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
69#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
70 (X86_CR0_WP | X86_CR0_NE)
71#define KVM_VM_CR0_ALWAYS_ON \
72 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
73#define KVM_CR4_GUEST_OWNED_BITS \
74 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
75 | X86_CR4_OSXMMEXCPT)
76
77#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
78#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
79
80#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
81
64/* 82/*
65 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 83 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive 84 * ple_gap: upper bound on the amount of time between two successive
@@ -115,7 +133,7 @@ struct vcpu_vmx {
115 } host_state; 133 } host_state;
116 struct { 134 struct {
117 int vm86_active; 135 int vm86_active;
118 u8 save_iopl; 136 ulong save_rflags;
119 struct kvm_save_segment { 137 struct kvm_save_segment {
120 u16 selector; 138 u16 selector;
121 unsigned long base; 139 unsigned long base;
@@ -136,6 +154,8 @@ struct vcpu_vmx {
136 ktime_t entry_time; 154 ktime_t entry_time;
137 s64 vnmi_blocked_time; 155 s64 vnmi_blocked_time;
138 u32 exit_reason; 156 u32 exit_reason;
157
158 bool rdtscp_enabled;
139}; 159};
140 160
141static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 161static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -210,7 +230,7 @@ static const u32 vmx_msr_index[] = {
210#ifdef CONFIG_X86_64 230#ifdef CONFIG_X86_64
211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 231 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
212#endif 232#endif
213 MSR_EFER, MSR_K6_STAR, 233 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
214}; 234};
215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
216 236
@@ -301,6 +321,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
301 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 321 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
302} 322}
303 323
324static inline bool cpu_has_vmx_ept_1g_page(void)
325{
326 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
327}
328
304static inline int cpu_has_vmx_invept_individual_addr(void) 329static inline int cpu_has_vmx_invept_individual_addr(void)
305{ 330{
306 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 331 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -336,9 +361,7 @@ static inline int cpu_has_vmx_ple(void)
336 361
337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 362static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
338{ 363{
339 return flexpriority_enabled && 364 return flexpriority_enabled && irqchip_in_kernel(kvm);
340 (cpu_has_vmx_virtualize_apic_accesses()) &&
341 (irqchip_in_kernel(kvm));
342} 365}
343 366
344static inline int cpu_has_vmx_vpid(void) 367static inline int cpu_has_vmx_vpid(void)
@@ -347,6 +370,12 @@ static inline int cpu_has_vmx_vpid(void)
347 SECONDARY_EXEC_ENABLE_VPID; 370 SECONDARY_EXEC_ENABLE_VPID;
348} 371}
349 372
373static inline int cpu_has_vmx_rdtscp(void)
374{
375 return vmcs_config.cpu_based_2nd_exec_ctrl &
376 SECONDARY_EXEC_RDTSCP;
377}
378
350static inline int cpu_has_virtual_nmis(void) 379static inline int cpu_has_virtual_nmis(void)
351{ 380{
352 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -551,22 +580,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
551{ 580{
552 u32 eb; 581 u32 eb;
553 582
554 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); 583 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
555 if (!vcpu->fpu_active) 584 (1u << NM_VECTOR) | (1u << DB_VECTOR);
556 eb |= 1u << NM_VECTOR; 585 if ((vcpu->guest_debug &
557 /* 586 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
558 * Unconditionally intercept #DB so we can maintain dr6 without 587 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
559 * reading it every exit. 588 eb |= 1u << BP_VECTOR;
560 */
561 eb |= 1u << DB_VECTOR;
562 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
563 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
564 eb |= 1u << BP_VECTOR;
565 }
566 if (to_vmx(vcpu)->rmode.vm86_active) 589 if (to_vmx(vcpu)->rmode.vm86_active)
567 eb = ~0; 590 eb = ~0;
568 if (enable_ept) 591 if (enable_ept)
569 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 592 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
593 if (vcpu->fpu_active)
594 eb &= ~(1u << NM_VECTOR);
570 vmcs_write32(EXCEPTION_BITMAP, eb); 595 vmcs_write32(EXCEPTION_BITMAP, eb);
571} 596}
572 597
@@ -589,7 +614,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
589 u64 guest_efer; 614 u64 guest_efer;
590 u64 ignore_bits; 615 u64 ignore_bits;
591 616
592 guest_efer = vmx->vcpu.arch.shadow_efer; 617 guest_efer = vmx->vcpu.arch.efer;
593 618
594 /* 619 /*
 595 * NX is emulated; LMA and LME handled by hardware; SCE meaningless 620
@@ -767,38 +792,51 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
767 792
768static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 793static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
769{ 794{
795 ulong cr0;
796
770 if (vcpu->fpu_active) 797 if (vcpu->fpu_active)
771 return; 798 return;
772 vcpu->fpu_active = 1; 799 vcpu->fpu_active = 1;
773 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 800 cr0 = vmcs_readl(GUEST_CR0);
774 if (vcpu->arch.cr0 & X86_CR0_TS) 801 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
775 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 802 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
803 vmcs_writel(GUEST_CR0, cr0);
776 update_exception_bitmap(vcpu); 804 update_exception_bitmap(vcpu);
805 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
806 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
777} 807}
778 808
809static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
810
779static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 811static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
780{ 812{
781 if (!vcpu->fpu_active) 813 vmx_decache_cr0_guest_bits(vcpu);
782 return; 814 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
783 vcpu->fpu_active = 0;
784 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
785 update_exception_bitmap(vcpu); 815 update_exception_bitmap(vcpu);
816 vcpu->arch.cr0_guest_owned_bits = 0;
817 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
818 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
786} 819}
787 820
788static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 821static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
789{ 822{
790 unsigned long rflags; 823 unsigned long rflags, save_rflags;
791 824
792 rflags = vmcs_readl(GUEST_RFLAGS); 825 rflags = vmcs_readl(GUEST_RFLAGS);
793 if (to_vmx(vcpu)->rmode.vm86_active) 826 if (to_vmx(vcpu)->rmode.vm86_active) {
794 rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 827 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
828 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
829 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
830 }
795 return rflags; 831 return rflags;
796} 832}
797 833
798static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 834static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
799{ 835{
800 if (to_vmx(vcpu)->rmode.vm86_active) 836 if (to_vmx(vcpu)->rmode.vm86_active) {
837 to_vmx(vcpu)->rmode.save_rflags = rflags;
801 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 838 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
839 }
802 vmcs_writel(GUEST_RFLAGS, rflags); 840 vmcs_writel(GUEST_RFLAGS, rflags);
803} 841}
804 842
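
[Illustrative sketch] The rmode.save_rflags field introduced above replaces save_iopl: while real mode is emulated via vm86, the hardware RFLAGS must carry IOPL=3 and VM=1, so the guest-owned copies of those bits are stashed on write and merged back on read. A minimal model of that round trip:

/* rmode_rflags_demo.c - models the save_rflags bookkeeping used by
 * vmx_get_rflags()/vmx_set_rflags() above.  Globals stand in for the
 * per-vcpu state; the EFLAGS bit positions are architectural.
 */
#include <stdio.h>

#define X86_EFLAGS_IOPL (3ul << 12)
#define X86_EFLAGS_VM   (1ul << 17)
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

static unsigned long hw_rflags;     /* what the CPU actually runs with */
static unsigned long save_rflags;   /* guest-visible IOPL/VM bits      */

static void set_rflags_rmode(unsigned long rflags)
{
	save_rflags = rflags;
	hw_rflags = rflags | X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}

static unsigned long get_rflags_rmode(void)
{
	unsigned long rflags = hw_rflags & RMODE_GUEST_OWNED_EFLAGS_BITS;

	return rflags | (save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS);
}

int main(void)
{
	set_rflags_rmode(0x2);                    /* guest: IOPL=0, VM=0 */
	printf("hardware RFLAGS: %#lx\n", hw_rflags);
	printf("guest RFLAGS:    %#lx\n", get_rflags_rmode());
	return 0;
}
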
@@ -878,6 +916,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
878 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 916 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
879} 917}
880 918
919static bool vmx_rdtscp_supported(void)
920{
921 return cpu_has_vmx_rdtscp();
922}
923
881/* 924/*
882 * Swap MSR entry in host/guest MSR entry array. 925 * Swap MSR entry in host/guest MSR entry array.
883 */ 926 */
@@ -913,12 +956,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
913 index = __find_msr_index(vmx, MSR_CSTAR); 956 index = __find_msr_index(vmx, MSR_CSTAR);
914 if (index >= 0) 957 if (index >= 0)
915 move_msr_up(vmx, index, save_nmsrs++); 958 move_msr_up(vmx, index, save_nmsrs++);
959 index = __find_msr_index(vmx, MSR_TSC_AUX);
960 if (index >= 0 && vmx->rdtscp_enabled)
961 move_msr_up(vmx, index, save_nmsrs++);
916 /* 962 /*
917 * MSR_K6_STAR is only needed on long mode guests, and only 963 * MSR_K6_STAR is only needed on long mode guests, and only
918 * if efer.sce is enabled. 964 * if efer.sce is enabled.
919 */ 965 */
920 index = __find_msr_index(vmx, MSR_K6_STAR); 966 index = __find_msr_index(vmx, MSR_K6_STAR);
921 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) 967 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
922 move_msr_up(vmx, index, save_nmsrs++); 968 move_msr_up(vmx, index, save_nmsrs++);
923 } 969 }
924#endif 970#endif
@@ -1002,6 +1048,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1002 case MSR_IA32_SYSENTER_ESP: 1048 case MSR_IA32_SYSENTER_ESP:
1003 data = vmcs_readl(GUEST_SYSENTER_ESP); 1049 data = vmcs_readl(GUEST_SYSENTER_ESP);
1004 break; 1050 break;
1051 case MSR_TSC_AUX:
1052 if (!to_vmx(vcpu)->rdtscp_enabled)
1053 return 1;
1054 /* Otherwise falls through */
1005 default: 1055 default:
1006 vmx_load_host_state(to_vmx(vcpu)); 1056 vmx_load_host_state(to_vmx(vcpu));
1007 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1057 msr = find_msr_entry(to_vmx(vcpu), msr_index);
@@ -1065,7 +1115,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1065 vcpu->arch.pat = data; 1115 vcpu->arch.pat = data;
1066 break; 1116 break;
1067 } 1117 }
1068 /* Otherwise falls through to kvm_set_msr_common */ 1118 ret = kvm_set_msr_common(vcpu, msr_index, data);
1119 break;
1120 case MSR_TSC_AUX:
1121 if (!vmx->rdtscp_enabled)
1122 return 1;
1123 /* Check reserved bit, higher 32 bits should be zero */
1124 if ((data >> 32) != 0)
1125 return 1;
1126 /* Otherwise falls through */
1069 default: 1127 default:
1070 msr = find_msr_entry(vmx, msr_index); 1128 msr = find_msr_entry(vmx, msr_index);
1071 if (msr) { 1129 if (msr) {
@@ -1224,6 +1282,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1224 CPU_BASED_USE_IO_BITMAPS | 1282 CPU_BASED_USE_IO_BITMAPS |
1225 CPU_BASED_MOV_DR_EXITING | 1283 CPU_BASED_MOV_DR_EXITING |
1226 CPU_BASED_USE_TSC_OFFSETING | 1284 CPU_BASED_USE_TSC_OFFSETING |
1285 CPU_BASED_MWAIT_EXITING |
1286 CPU_BASED_MONITOR_EXITING |
1227 CPU_BASED_INVLPG_EXITING; 1287 CPU_BASED_INVLPG_EXITING;
1228 opt = CPU_BASED_TPR_SHADOW | 1288 opt = CPU_BASED_TPR_SHADOW |
1229 CPU_BASED_USE_MSR_BITMAPS | 1289 CPU_BASED_USE_MSR_BITMAPS |
@@ -1243,7 +1303,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1243 SECONDARY_EXEC_ENABLE_VPID | 1303 SECONDARY_EXEC_ENABLE_VPID |
1244 SECONDARY_EXEC_ENABLE_EPT | 1304 SECONDARY_EXEC_ENABLE_EPT |
1245 SECONDARY_EXEC_UNRESTRICTED_GUEST | 1305 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 1306 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
1307 SECONDARY_EXEC_RDTSCP;
1247 if (adjust_vmx_controls(min2, opt2, 1308 if (adjust_vmx_controls(min2, opt2,
1248 MSR_IA32_VMX_PROCBASED_CTLS2, 1309 MSR_IA32_VMX_PROCBASED_CTLS2,
1249 &_cpu_based_2nd_exec_control) < 0) 1310 &_cpu_based_2nd_exec_control) < 0)
@@ -1429,8 +1490,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1429 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1490 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1430 1491
1431 flags = vmcs_readl(GUEST_RFLAGS); 1492 flags = vmcs_readl(GUEST_RFLAGS);
1432 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1493 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1433 flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); 1494 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1434 vmcs_writel(GUEST_RFLAGS, flags); 1495 vmcs_writel(GUEST_RFLAGS, flags);
1435 1496
1436 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1497 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1457,8 +1518,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1457static gva_t rmode_tss_base(struct kvm *kvm) 1518static gva_t rmode_tss_base(struct kvm *kvm)
1458{ 1519{
1459 if (!kvm->arch.tss_addr) { 1520 if (!kvm->arch.tss_addr) {
1460 gfn_t base_gfn = kvm->memslots[0].base_gfn + 1521 struct kvm_memslots *slots;
1461 kvm->memslots[0].npages - 3; 1522 gfn_t base_gfn;
1523
1524 slots = rcu_dereference(kvm->memslots);
1525 base_gfn = kvm->memslots->memslots[0].base_gfn +
1526 kvm->memslots->memslots[0].npages - 3;
1462 return base_gfn << PAGE_SHIFT; 1527 return base_gfn << PAGE_SHIFT;
1463 } 1528 }
1464 return kvm->arch.tss_addr; 1529 return kvm->arch.tss_addr;
@@ -1499,8 +1564,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1499 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1564 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1500 1565
1501 flags = vmcs_readl(GUEST_RFLAGS); 1566 flags = vmcs_readl(GUEST_RFLAGS);
1502 vmx->rmode.save_iopl 1567 vmx->rmode.save_rflags = flags;
1503 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1504 1568
1505 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1569 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1506 1570
@@ -1544,9 +1608,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1544 * of this msr depends on is_long_mode(). 1608 * of this msr depends on is_long_mode().
1545 */ 1609 */
1546 vmx_load_host_state(to_vmx(vcpu)); 1610 vmx_load_host_state(to_vmx(vcpu));
1547 vcpu->arch.shadow_efer = efer; 1611 vcpu->arch.efer = efer;
1548 if (!msr)
1549 return;
1550 if (efer & EFER_LMA) { 1612 if (efer & EFER_LMA) {
1551 vmcs_write32(VM_ENTRY_CONTROLS, 1613 vmcs_write32(VM_ENTRY_CONTROLS,
1552 vmcs_read32(VM_ENTRY_CONTROLS) | 1614 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1576,13 +1638,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1576 (guest_tr_ar & ~AR_TYPE_MASK) 1638 (guest_tr_ar & ~AR_TYPE_MASK)
1577 | AR_TYPE_BUSY_64_TSS); 1639 | AR_TYPE_BUSY_64_TSS);
1578 } 1640 }
1579 vcpu->arch.shadow_efer |= EFER_LMA; 1641 vcpu->arch.efer |= EFER_LMA;
1580 vmx_set_efer(vcpu, vcpu->arch.shadow_efer); 1642 vmx_set_efer(vcpu, vcpu->arch.efer);
1581} 1643}
1582 1644
1583static void exit_lmode(struct kvm_vcpu *vcpu) 1645static void exit_lmode(struct kvm_vcpu *vcpu)
1584{ 1646{
1585 vcpu->arch.shadow_efer &= ~EFER_LMA; 1647 vcpu->arch.efer &= ~EFER_LMA;
1586 1648
1587 vmcs_write32(VM_ENTRY_CONTROLS, 1649 vmcs_write32(VM_ENTRY_CONTROLS,
1588 vmcs_read32(VM_ENTRY_CONTROLS) 1650 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1598,10 +1660,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1598 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1660 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1599} 1661}
1600 1662
1663static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1664{
1665 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
1666
1667 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
1668 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1669}
1670
1601static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1671static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1602{ 1672{
1603 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; 1673 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
1604 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1674
1675 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
1676 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
1605} 1677}
1606 1678
1607static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 1679static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
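
[Illustrative sketch] With CR0_GUEST_HOST_MASK/CR4_GUEST_HOST_MASK now leaving some bits guest-owned, the cached vcpu->arch.cr0/cr4 can go stale, which is what the decache helpers above correct by re-reading the guest-owned bits from the live VMCS field. A simplified model with globals standing in for the cached and hardware registers:

/* decache_cr_demo.c - models vmx_decache_cr0_guest_bits(): for bits the
 * guest owns, refresh the cached value from the live hardware register
 * before consuming it.  Values are invented; CR0.TS is bit 3.
 */
#include <stdio.h>

static unsigned long cached_cr0;     /* stand-in for vcpu->arch.cr0   */
static unsigned long hardware_cr0;   /* stand-in for VMCS GUEST_CR0   */

static void decache_cr0(unsigned long guest_owned_bits)
{
	cached_cr0 &= ~guest_owned_bits;
	cached_cr0 |= hardware_cr0 & guest_owned_bits;
}

int main(void)
{
	unsigned long cr0_ts = 1ul << 3;        /* guest-owned in this model */

	cached_cr0 = 0x80000011ul;              /* stale cache: TS clear     */
	hardware_cr0 = 0x80000011ul | cr0_ts;   /* guest set TS, no VM exit  */

	decache_cr0(cr0_ts);
	printf("refreshed CR0 cache: %#lx (TS now visible)\n", cached_cr0);
	return 0;
}
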
@@ -1646,7 +1718,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1646 (CPU_BASED_CR3_LOAD_EXITING | 1718 (CPU_BASED_CR3_LOAD_EXITING |
1647 CPU_BASED_CR3_STORE_EXITING)); 1719 CPU_BASED_CR3_STORE_EXITING));
1648 vcpu->arch.cr0 = cr0; 1720 vcpu->arch.cr0 = cr0;
1649 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1721 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1650 } else if (!is_paging(vcpu)) { 1722 } else if (!is_paging(vcpu)) {
1651 /* From nonpaging to paging */ 1723 /* From nonpaging to paging */
1652 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1724 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1654,23 +1726,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1654 ~(CPU_BASED_CR3_LOAD_EXITING | 1726 ~(CPU_BASED_CR3_LOAD_EXITING |
1655 CPU_BASED_CR3_STORE_EXITING)); 1727 CPU_BASED_CR3_STORE_EXITING));
1656 vcpu->arch.cr0 = cr0; 1728 vcpu->arch.cr0 = cr0;
1657 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1729 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1658 } 1730 }
1659 1731
1660 if (!(cr0 & X86_CR0_WP)) 1732 if (!(cr0 & X86_CR0_WP))
1661 *hw_cr0 &= ~X86_CR0_WP; 1733 *hw_cr0 &= ~X86_CR0_WP;
1662} 1734}
1663 1735
1664static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1665 struct kvm_vcpu *vcpu)
1666{
1667 if (!is_paging(vcpu)) {
1668 *hw_cr4 &= ~X86_CR4_PAE;
1669 *hw_cr4 |= X86_CR4_PSE;
1670 } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1671 *hw_cr4 &= ~X86_CR4_PAE;
1672}
1673
1674static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1736static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1675{ 1737{
1676 struct vcpu_vmx *vmx = to_vmx(vcpu); 1738 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1682,8 +1744,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1682 else 1744 else
1683 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; 1745 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1684 1746
1685 vmx_fpu_deactivate(vcpu);
1686
1687 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 1747 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1688 enter_pmode(vcpu); 1748 enter_pmode(vcpu);
1689 1749
@@ -1691,7 +1751,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1691 enter_rmode(vcpu); 1751 enter_rmode(vcpu);
1692 1752
1693#ifdef CONFIG_X86_64 1753#ifdef CONFIG_X86_64
1694 if (vcpu->arch.shadow_efer & EFER_LME) { 1754 if (vcpu->arch.efer & EFER_LME) {
1695 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1755 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1696 enter_lmode(vcpu); 1756 enter_lmode(vcpu);
1697 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1757 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1702,12 +1762,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1702 if (enable_ept) 1762 if (enable_ept)
1703 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1763 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1704 1764
1765 if (!vcpu->fpu_active)
1766 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
1767
1705 vmcs_writel(CR0_READ_SHADOW, cr0); 1768 vmcs_writel(CR0_READ_SHADOW, cr0);
1706 vmcs_writel(GUEST_CR0, hw_cr0); 1769 vmcs_writel(GUEST_CR0, hw_cr0);
1707 vcpu->arch.cr0 = cr0; 1770 vcpu->arch.cr0 = cr0;
1708
1709 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1710 vmx_fpu_activate(vcpu);
1711} 1771}
1712 1772
1713static u64 construct_eptp(unsigned long root_hpa) 1773static u64 construct_eptp(unsigned long root_hpa)
@@ -1738,8 +1798,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1738 1798
1739 vmx_flush_tlb(vcpu); 1799 vmx_flush_tlb(vcpu);
1740 vmcs_writel(GUEST_CR3, guest_cr3); 1800 vmcs_writel(GUEST_CR3, guest_cr3);
1741 if (vcpu->arch.cr0 & X86_CR0_PE)
1742 vmx_fpu_deactivate(vcpu);
1743} 1801}
1744 1802
1745static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1803static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1748,8 +1806,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1748 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1806 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1749 1807
1750 vcpu->arch.cr4 = cr4; 1808 vcpu->arch.cr4 = cr4;
1751 if (enable_ept) 1809 if (enable_ept) {
1752 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1810 if (!is_paging(vcpu)) {
1811 hw_cr4 &= ~X86_CR4_PAE;
1812 hw_cr4 |= X86_CR4_PSE;
1813 } else if (!(cr4 & X86_CR4_PAE)) {
1814 hw_cr4 &= ~X86_CR4_PAE;
1815 }
1816 }
1753 1817
1754 vmcs_writel(CR4_READ_SHADOW, cr4); 1818 vmcs_writel(CR4_READ_SHADOW, cr4);
1755 vmcs_writel(GUEST_CR4, hw_cr4); 1819 vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1787,7 +1851,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1787 1851
1788static int vmx_get_cpl(struct kvm_vcpu *vcpu) 1852static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1789{ 1853{
1790 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ 1854 if (!is_protmode(vcpu))
1791 return 0; 1855 return 0;
1792 1856
1793 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 1857 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2042,7 +2106,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2042static bool guest_state_valid(struct kvm_vcpu *vcpu) 2106static bool guest_state_valid(struct kvm_vcpu *vcpu)
2043{ 2107{
2044 /* real mode guest state checks */ 2108 /* real mode guest state checks */
2045 if (!(vcpu->arch.cr0 & X86_CR0_PE)) { 2109 if (!is_protmode(vcpu)) {
2046 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 2110 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2047 return false; 2111 return false;
2048 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 2112 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2175,7 +2239,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2175 struct kvm_userspace_memory_region kvm_userspace_mem; 2239 struct kvm_userspace_memory_region kvm_userspace_mem;
2176 int r = 0; 2240 int r = 0;
2177 2241
2178 down_write(&kvm->slots_lock); 2242 mutex_lock(&kvm->slots_lock);
2179 if (kvm->arch.apic_access_page) 2243 if (kvm->arch.apic_access_page)
2180 goto out; 2244 goto out;
2181 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 2245 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2188,7 +2252,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2188 2252
2189 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2253 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2190out: 2254out:
2191 up_write(&kvm->slots_lock); 2255 mutex_unlock(&kvm->slots_lock);
2192 return r; 2256 return r;
2193} 2257}
2194 2258
@@ -2197,7 +2261,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2197 struct kvm_userspace_memory_region kvm_userspace_mem; 2261 struct kvm_userspace_memory_region kvm_userspace_mem;
2198 int r = 0; 2262 int r = 0;
2199 2263
2200 down_write(&kvm->slots_lock); 2264 mutex_lock(&kvm->slots_lock);
2201 if (kvm->arch.ept_identity_pagetable) 2265 if (kvm->arch.ept_identity_pagetable)
2202 goto out; 2266 goto out;
2203 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 2267 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2212,7 +2276,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2212 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2276 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2213 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); 2277 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2214out: 2278out:
2215 up_write(&kvm->slots_lock); 2279 mutex_unlock(&kvm->slots_lock);
2216 return r; 2280 return r;
2217} 2281}
2218 2282
@@ -2384,14 +2448,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2384 for (i = 0; i < NR_VMX_MSR; ++i) { 2448 for (i = 0; i < NR_VMX_MSR; ++i) {
2385 u32 index = vmx_msr_index[i]; 2449 u32 index = vmx_msr_index[i];
2386 u32 data_low, data_high; 2450 u32 data_low, data_high;
2387 u64 data;
2388 int j = vmx->nmsrs; 2451 int j = vmx->nmsrs;
2389 2452
2390 if (rdmsr_safe(index, &data_low, &data_high) < 0) 2453 if (rdmsr_safe(index, &data_low, &data_high) < 0)
2391 continue; 2454 continue;
2392 if (wrmsr_safe(index, data_low, data_high) < 0) 2455 if (wrmsr_safe(index, data_low, data_high) < 0)
2393 continue; 2456 continue;
2394 data = data_low | ((u64)data_high << 32);
2395 vmx->guest_msrs[j].index = i; 2457 vmx->guest_msrs[j].index = i;
2396 vmx->guest_msrs[j].data = 0; 2458 vmx->guest_msrs[j].data = 0;
2397 vmx->guest_msrs[j].mask = -1ull; 2459 vmx->guest_msrs[j].mask = -1ull;
@@ -2404,7 +2466,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2404 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 2466 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2405 2467
2406 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 2468 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2407 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 2469 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
2470 if (enable_ept)
2471 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2472 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2408 2473
2409 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2474 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2410 rdtscll(tsc_this); 2475 rdtscll(tsc_this);
@@ -2429,10 +2494,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2429{ 2494{
2430 struct vcpu_vmx *vmx = to_vmx(vcpu); 2495 struct vcpu_vmx *vmx = to_vmx(vcpu);
2431 u64 msr; 2496 u64 msr;
2432 int ret; 2497 int ret, idx;
2433 2498
2434 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2499 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2435 down_read(&vcpu->kvm->slots_lock); 2500 idx = srcu_read_lock(&vcpu->kvm->srcu);
2436 if (!init_rmode(vmx->vcpu.kvm)) { 2501 if (!init_rmode(vmx->vcpu.kvm)) {
2437 ret = -ENOMEM; 2502 ret = -ENOMEM;
2438 goto out; 2503 goto out;
@@ -2526,7 +2591,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2591 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2527 2592
2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 2593 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2594 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
2530 vmx_set_cr4(&vmx->vcpu, 0); 2595 vmx_set_cr4(&vmx->vcpu, 0);
2531 vmx_set_efer(&vmx->vcpu, 0); 2596 vmx_set_efer(&vmx->vcpu, 0);
2532 vmx_fpu_activate(&vmx->vcpu); 2597 vmx_fpu_activate(&vmx->vcpu);
@@ -2540,7 +2605,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2540 vmx->emulation_required = 0; 2605 vmx->emulation_required = 0;
2541 2606
2542out: 2607out:
2543 up_read(&vcpu->kvm->slots_lock); 2608 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2544 return ret; 2609 return ret;
2545} 2610}
2546 2611
@@ -2717,6 +2782,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2717 kvm_queue_exception(vcpu, vec); 2782 kvm_queue_exception(vcpu, vec);
2718 return 1; 2783 return 1;
2719 case BP_VECTOR: 2784 case BP_VECTOR:
2785 /*
2786 * Update instruction length as we may reinject the exception
2787 * from user space while in guest debugging mode.
2788 */
2789 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
2790 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2720 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 2791 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2721 return 0; 2792 return 0;
2722 /* fall through */ 2793 /* fall through */
@@ -2839,6 +2910,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)
2839 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 2910 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2840 /* fall through */ 2911 /* fall through */
2841 case BP_VECTOR: 2912 case BP_VECTOR:
2913 /*
2914 * Update instruction length as we may reinject #BP from
2915 * user space while in guest debugging mode. Reading it for
2916 * #DB as well causes no harm, it is not used in that case.
2917 */
2918 vmx->vcpu.arch.event_exit_inst_len =
2919 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2842 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2920 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2843 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 2921 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2844 kvm_run->debug.arch.exception = ex_no; 2922 kvm_run->debug.arch.exception = ex_no;
@@ -2940,11 +3018,10 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2940 }; 3018 };
2941 break; 3019 break;
2942 case 2: /* clts */ 3020 case 2: /* clts */
2943 vmx_fpu_deactivate(vcpu); 3021 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
2944 vcpu->arch.cr0 &= ~X86_CR0_TS; 3022 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
2945 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2946 vmx_fpu_activate(vcpu);
2947 skip_emulated_instruction(vcpu); 3023 skip_emulated_instruction(vcpu);
3024 vmx_fpu_activate(vcpu);
2948 return 1; 3025 return 1;
2949 case 1: /*mov from cr*/ 3026 case 1: /*mov from cr*/
2950 switch (cr) { 3027 switch (cr) {
@@ -2962,7 +3039,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2962 } 3039 }
2963 break; 3040 break;
2964 case 3: /* lmsw */ 3041 case 3: /* lmsw */
2965 kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); 3042 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
3043 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
3044 kvm_lmsw(vcpu, val);
2966 3045
2967 skip_emulated_instruction(vcpu); 3046 skip_emulated_instruction(vcpu);
2968 return 1; 3047 return 1;
@@ -2975,12 +3054,22 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2975 return 0; 3054 return 0;
2976} 3055}
2977 3056
3057static int check_dr_alias(struct kvm_vcpu *vcpu)
3058{
3059 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3060 kvm_queue_exception(vcpu, UD_VECTOR);
3061 return -1;
3062 }
3063 return 0;
3064}
3065
2978static int handle_dr(struct kvm_vcpu *vcpu) 3066static int handle_dr(struct kvm_vcpu *vcpu)
2979{ 3067{
2980 unsigned long exit_qualification; 3068 unsigned long exit_qualification;
2981 unsigned long val; 3069 unsigned long val;
2982 int dr, reg; 3070 int dr, reg;
2983 3071
3072 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
2984 if (!kvm_require_cpl(vcpu, 0)) 3073 if (!kvm_require_cpl(vcpu, 0))
2985 return 1; 3074 return 1;
2986 dr = vmcs_readl(GUEST_DR7); 3075 dr = vmcs_readl(GUEST_DR7);
@@ -3016,14 +3105,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3016 case 0 ... 3: 3105 case 0 ... 3:
3017 val = vcpu->arch.db[dr]; 3106 val = vcpu->arch.db[dr];
3018 break; 3107 break;
3108 case 4:
3109 if (check_dr_alias(vcpu) < 0)
3110 return 1;
3111 /* fall through */
3019 case 6: 3112 case 6:
3020 val = vcpu->arch.dr6; 3113 val = vcpu->arch.dr6;
3021 break; 3114 break;
3022 case 7: 3115 case 5:
3116 if (check_dr_alias(vcpu) < 0)
3117 return 1;
3118 /* fall through */
3119 default: /* 7 */
3023 val = vcpu->arch.dr7; 3120 val = vcpu->arch.dr7;
3024 break; 3121 break;
3025 default:
3026 val = 0;
3027 } 3122 }
3028 kvm_register_write(vcpu, reg, val); 3123 kvm_register_write(vcpu, reg, val);
3029 } else { 3124 } else {
@@ -3034,21 +3129,25 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3034 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 3129 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3035 vcpu->arch.eff_db[dr] = val; 3130 vcpu->arch.eff_db[dr] = val;
3036 break; 3131 break;
3037 case 4 ... 5: 3132 case 4:
3038 if (vcpu->arch.cr4 & X86_CR4_DE) 3133 if (check_dr_alias(vcpu) < 0)
3039 kvm_queue_exception(vcpu, UD_VECTOR); 3134 return 1;
3040 break; 3135 /* fall through */
3041 case 6: 3136 case 6:
3042 if (val & 0xffffffff00000000ULL) { 3137 if (val & 0xffffffff00000000ULL) {
3043 kvm_queue_exception(vcpu, GP_VECTOR); 3138 kvm_inject_gp(vcpu, 0);
3044 break; 3139 return 1;
3045 } 3140 }
3046 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 3141 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3047 break; 3142 break;
3048 case 7: 3143 case 5:
3144 if (check_dr_alias(vcpu) < 0)
3145 return 1;
3146 /* fall through */
3147 default: /* 7 */
3049 if (val & 0xffffffff00000000ULL) { 3148 if (val & 0xffffffff00000000ULL) {
3050 kvm_queue_exception(vcpu, GP_VECTOR); 3149 kvm_inject_gp(vcpu, 0);
3051 break; 3150 return 1;
3052 } 3151 }
3053 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 3152 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3054 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 3153 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
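Aside: the DR4/DR5 handling added above follows the architectural aliasing rule, which a short standalone sketch makes explicit. Everything below (resolve_dr and friends) is illustrative and hypothetical, not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/* With CR4.DE clear, DR4/DR5 act as aliases of DR6/DR7; with CR4.DE set,
 * accessing them raises #UD -- the same decision check_dr_alias() makes. */
static bool resolve_dr(unsigned dr, bool cr4_de, unsigned *effective)
{
	if (dr == 4 || dr == 5) {
		if (cr4_de)
			return false;	/* #UD */
		dr += 2;		/* DR4 -> DR6, DR5 -> DR7 */
	}
	*effective = dr;
	return true;
}

int main(void)
{
	unsigned eff;

	if (resolve_dr(4, false, &eff))
		printf("CR4.DE=0: DR4 resolves to DR%u\n", eff);	/* DR6 */
	if (!resolve_dr(5, true, &eff))
		printf("CR4.DE=1: DR5 access raises #UD\n");
	return 0;
}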
@@ -3075,6 +3174,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
3075 u64 data; 3174 u64 data;
3076 3175
3077 if (vmx_get_msr(vcpu, ecx, &data)) { 3176 if (vmx_get_msr(vcpu, ecx, &data)) {
3177 trace_kvm_msr_read_ex(ecx);
3078 kvm_inject_gp(vcpu, 0); 3178 kvm_inject_gp(vcpu, 0);
3079 return 1; 3179 return 1;
3080 } 3180 }
@@ -3094,13 +3194,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3194 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3095 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3195 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3096 3196
3097 trace_kvm_msr_write(ecx, data);
3098
3099 if (vmx_set_msr(vcpu, ecx, data) != 0) { 3197 if (vmx_set_msr(vcpu, ecx, data) != 0) {
3198 trace_kvm_msr_write_ex(ecx, data);
3100 kvm_inject_gp(vcpu, 0); 3199 kvm_inject_gp(vcpu, 0);
3101 return 1; 3200 return 1;
3102 } 3201 }
3103 3202
3203 trace_kvm_msr_write(ecx, data);
3104 skip_emulated_instruction(vcpu); 3204 skip_emulated_instruction(vcpu);
3105 return 1; 3205 return 1;
3106} 3206}
@@ -3385,7 +3485,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3385 } 3485 }
3386 3486
3387 if (err != EMULATE_DONE) { 3487 if (err != EMULATE_DONE) {
3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3488 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3489 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0; 3490 vcpu->run->internal.ndata = 0;
@@ -3416,6 +3515,12 @@ static int handle_pause(struct kvm_vcpu *vcpu)
3416 return 1; 3515 return 1;
3417} 3516}
3418 3517
3518static int handle_invalid_op(struct kvm_vcpu *vcpu)
3519{
3520 kvm_queue_exception(vcpu, UD_VECTOR);
3521 return 1;
3522}
3523
3419/* 3524/*
3420 * The exit handlers return 1 if the exit was handled fully and guest execution 3525 * The exit handlers return 1 if the exit was handled fully and guest execution
3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3526 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -3453,6 +3558,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3558 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3559 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 3560 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3561 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
3562 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
3456}; 3563};
3457 3564
3458static const int kvm_vmx_max_exit_handlers = 3565static const int kvm_vmx_max_exit_handlers =
@@ -3686,9 +3793,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3686 */ 3793 */
3687 vmcs_writel(HOST_CR0, read_cr0()); 3794 vmcs_writel(HOST_CR0, read_cr0());
3688 3795
3689 if (vcpu->arch.switch_db_regs)
3690 set_debugreg(vcpu->arch.dr6, 6);
3691
3692 asm( 3796 asm(
3693 /* Store host registers */ 3797 /* Store host registers */
3694 "push %%"R"dx; push %%"R"bp;" 3798 "push %%"R"dx; push %%"R"bp;"
@@ -3789,9 +3893,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3789 | (1 << VCPU_EXREG_PDPTR)); 3893 | (1 << VCPU_EXREG_PDPTR));
3790 vcpu->arch.regs_dirty = 0; 3894 vcpu->arch.regs_dirty = 0;
3791 3895
3792 if (vcpu->arch.switch_db_regs)
3793 get_debugreg(vcpu->arch.dr6, 6);
3794
3795 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3896 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3796 if (vmx->rmode.irq.pending) 3897 if (vmx->rmode.irq.pending)
3797 fixup_rmode_irq(vmx); 3898 fixup_rmode_irq(vmx);
@@ -3920,7 +4021,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3920 * b. VT-d with snooping control feature: snooping control feature of 4021 * b. VT-d with snooping control feature: snooping control feature of
3921 * VT-d engine can guarantee the cache correctness. Just set it 4022 * VT-d engine can guarantee the cache correctness. Just set it
3922 * to WB to keep consistent with host. So the same as item 3. 4023 * to WB to keep consistent with host. So the same as item 3.
3923 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep 4024 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
3924 * consistent with host MTRR 4025 * consistent with host MTRR
3925 */ 4026 */
3926 if (is_mmio) 4027 if (is_mmio)
@@ -3931,37 +4032,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3931 VMX_EPT_MT_EPTE_SHIFT; 4032 VMX_EPT_MT_EPTE_SHIFT;
3932 else 4033 else
3933 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) 4034 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3934 | VMX_EPT_IGMT_BIT; 4035 | VMX_EPT_IPAT_BIT;
3935 4036
3936 return ret; 4037 return ret;
3937} 4038}
3938 4039
4040#define _ER(x) { EXIT_REASON_##x, #x }
4041
3939static const struct trace_print_flags vmx_exit_reasons_str[] = { 4042static const struct trace_print_flags vmx_exit_reasons_str[] = {
3940 { EXIT_REASON_EXCEPTION_NMI, "exception" }, 4043 _ER(EXCEPTION_NMI),
3941 { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, 4044 _ER(EXTERNAL_INTERRUPT),
3942 { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, 4045 _ER(TRIPLE_FAULT),
3943 { EXIT_REASON_NMI_WINDOW, "nmi_window" }, 4046 _ER(PENDING_INTERRUPT),
3944 { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, 4047 _ER(NMI_WINDOW),
3945 { EXIT_REASON_CR_ACCESS, "cr_access" }, 4048 _ER(TASK_SWITCH),
3946 { EXIT_REASON_DR_ACCESS, "dr_access" }, 4049 _ER(CPUID),
3947 { EXIT_REASON_CPUID, "cpuid" }, 4050 _ER(HLT),
3948 { EXIT_REASON_MSR_READ, "rdmsr" }, 4051 _ER(INVLPG),
3949 { EXIT_REASON_MSR_WRITE, "wrmsr" }, 4052 _ER(RDPMC),
3950 { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, 4053 _ER(RDTSC),
3951 { EXIT_REASON_HLT, "halt" }, 4054 _ER(VMCALL),
3952 { EXIT_REASON_INVLPG, "invlpg" }, 4055 _ER(VMCLEAR),
3953 { EXIT_REASON_VMCALL, "hypercall" }, 4056 _ER(VMLAUNCH),
3954 { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, 4057 _ER(VMPTRLD),
3955 { EXIT_REASON_APIC_ACCESS, "apic_access" }, 4058 _ER(VMPTRST),
3956 { EXIT_REASON_WBINVD, "wbinvd" }, 4059 _ER(VMREAD),
3957 { EXIT_REASON_TASK_SWITCH, "task_switch" }, 4060 _ER(VMRESUME),
3958 { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, 4061 _ER(VMWRITE),
4062 _ER(VMOFF),
4063 _ER(VMON),
4064 _ER(CR_ACCESS),
4065 _ER(DR_ACCESS),
4066 _ER(IO_INSTRUCTION),
4067 _ER(MSR_READ),
4068 _ER(MSR_WRITE),
4069 _ER(MWAIT_INSTRUCTION),
4070 _ER(MONITOR_INSTRUCTION),
4071 _ER(PAUSE_INSTRUCTION),
4072 _ER(MCE_DURING_VMENTRY),
4073 _ER(TPR_BELOW_THRESHOLD),
4074 _ER(APIC_ACCESS),
4075 _ER(EPT_VIOLATION),
4076 _ER(EPT_MISCONFIG),
4077 _ER(WBINVD),
3959 { -1, NULL } 4078 { -1, NULL }
3960}; 4079};
3961 4080
3962static bool vmx_gb_page_enable(void) 4081#undef _ER
4082
4083static int vmx_get_lpage_level(void)
3963{ 4084{
3964 return false; 4085 if (enable_ept && !cpu_has_vmx_ept_1g_page())
4086 return PT_DIRECTORY_LEVEL;
4087 else
4088 /* For shadow and EPT supported 1GB page */
4089 return PT_PDPE_LEVEL;
4090}
4091
4092static inline u32 bit(int bitno)
4093{
4094 return 1 << (bitno & 31);
4095}
4096
4097static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4098{
4099 struct kvm_cpuid_entry2 *best;
4100 struct vcpu_vmx *vmx = to_vmx(vcpu);
4101 u32 exec_control;
4102
4103 vmx->rdtscp_enabled = false;
4104 if (vmx_rdtscp_supported()) {
4105 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
4106 if (exec_control & SECONDARY_EXEC_RDTSCP) {
4107 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
4108 if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
4109 vmx->rdtscp_enabled = true;
4110 else {
4111 exec_control &= ~SECONDARY_EXEC_RDTSCP;
4112 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4113 exec_control);
4114 }
4115 }
4116 }
3965} 4117}
3966 4118
3967static struct kvm_x86_ops vmx_x86_ops = { 4119static struct kvm_x86_ops vmx_x86_ops = {
@@ -3990,6 +4142,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3990 .set_segment = vmx_set_segment, 4142 .set_segment = vmx_set_segment,
3991 .get_cpl = vmx_get_cpl, 4143 .get_cpl = vmx_get_cpl,
3992 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4144 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4145 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
3993 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4146 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3994 .set_cr0 = vmx_set_cr0, 4147 .set_cr0 = vmx_set_cr0,
3995 .set_cr3 = vmx_set_cr3, 4148 .set_cr3 = vmx_set_cr3,
@@ -4002,6 +4155,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4002 .cache_reg = vmx_cache_reg, 4155 .cache_reg = vmx_cache_reg,
4003 .get_rflags = vmx_get_rflags, 4156 .get_rflags = vmx_get_rflags,
4004 .set_rflags = vmx_set_rflags, 4157 .set_rflags = vmx_set_rflags,
4158 .fpu_activate = vmx_fpu_activate,
4159 .fpu_deactivate = vmx_fpu_deactivate,
4005 4160
4006 .tlb_flush = vmx_flush_tlb, 4161 .tlb_flush = vmx_flush_tlb,
4007 4162
@@ -4027,7 +4182,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
4027 .get_mt_mask = vmx_get_mt_mask, 4182 .get_mt_mask = vmx_get_mt_mask,
4028 4183
4029 .exit_reasons_str = vmx_exit_reasons_str, 4184 .exit_reasons_str = vmx_exit_reasons_str,
4030 .gb_page_enable = vmx_gb_page_enable, 4185 .get_lpage_level = vmx_get_lpage_level,
4186
4187 .cpuid_update = vmx_cpuid_update,
4188
4189 .rdtscp_supported = vmx_rdtscp_supported,
4031}; 4190};
4032 4191
4033static int __init vmx_init(void) 4192static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9d068966fb2a..3c4ca98ad27f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -38,6 +38,8 @@
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h> 40#include <linux/user-return-notifier.h>
41#include <linux/srcu.h>
42#include <linux/slab.h>
41#include <trace/events/kvm.h> 43#include <trace/events/kvm.h>
42#undef TRACE_INCLUDE_FILE 44#undef TRACE_INCLUDE_FILE
43#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
@@ -93,16 +95,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
93 95
94struct kvm_shared_msrs_global { 96struct kvm_shared_msrs_global {
95 int nr; 97 int nr;
96 struct kvm_shared_msr { 98 u32 msrs[KVM_NR_SHARED_MSRS];
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100}; 99};
101 100
102struct kvm_shared_msrs { 101struct kvm_shared_msrs {
103 struct user_return_notifier urn; 102 struct user_return_notifier urn;
104 bool registered; 103 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS]; 104 struct kvm_shared_msr_values {
105 u64 host;
106 u64 curr;
107 } values[KVM_NR_SHARED_MSRS];
106}; 108};
107 109
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 110static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
@@ -147,53 +149,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
147static void kvm_on_user_return(struct user_return_notifier *urn) 149static void kvm_on_user_return(struct user_return_notifier *urn)
148{ 150{
149 unsigned slot; 151 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals 152 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn); 153 = container_of(urn, struct kvm_shared_msrs, urn);
154 struct kvm_shared_msr_values *values;
153 155
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) { 156 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot]; 157 values = &locals->values[slot];
156 if (global->value != locals->current_value[slot]) { 158 if (values->host != values->curr) {
157 wrmsrl(global->msr, global->value); 159 wrmsrl(shared_msrs_global.msrs[slot], values->host);
158 locals->current_value[slot] = global->value; 160 values->curr = values->host;
159 } 161 }
160 } 162 }
161 locals->registered = false; 163 locals->registered = false;
162 user_return_notifier_unregister(urn); 164 user_return_notifier_unregister(urn);
163} 165}
164 166
165void kvm_define_shared_msr(unsigned slot, u32 msr) 167static void shared_msr_update(unsigned slot, u32 msr)
166{ 168{
167 int cpu; 169 struct kvm_shared_msrs *smsr;
168 u64 value; 170 u64 value;
169 171
172 smsr = &__get_cpu_var(shared_msrs);
173 /* only read, and nobody should modify it at this time,
174 * so don't need lock */
175 if (slot >= shared_msrs_global.nr) {
176 printk(KERN_ERR "kvm: invalid MSR slot!");
177 return;
178 }
179 rdmsrl_safe(msr, &value);
180 smsr->values[slot].host = value;
181 smsr->values[slot].curr = value;
182}
183
184void kvm_define_shared_msr(unsigned slot, u32 msr)
185{
170 if (slot >= shared_msrs_global.nr) 186 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1; 187 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr; 188 shared_msrs_global.msrs[slot] = msr;
173 rdmsrl_safe(msr, &value); 189 /* we need ensured the shared_msr_global have been updated */
174 shared_msrs_global.msrs[slot].value = value; 190 smp_wmb();
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177} 191}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr); 192EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179 193
180static void kvm_shared_msr_cpu_online(void) 194static void kvm_shared_msr_cpu_online(void)
181{ 195{
182 unsigned i; 196 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184 197
185 for (i = 0; i < shared_msrs_global.nr; ++i) 198 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value; 199 shared_msr_update(i, shared_msrs_global.msrs[i]);
187} 200}
188 201
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 202void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{ 203{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 204 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192 205
193 if (((value ^ smsr->current_value[slot]) & mask) == 0) 206 if (((value ^ smsr->values[slot].curr) & mask) == 0)
194 return; 207 return;
195 smsr->current_value[slot] = value; 208 smsr->values[slot].curr = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value); 209 wrmsrl(shared_msrs_global.msrs[slot], value);
197 if (!smsr->registered) { 210 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return; 211 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn); 212 user_return_notifier_register(&smsr->urn);
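Aside: the kvm_shared_msr_values split above boils down to remembering, per CPU, the host value and the value currently programmed, and only writing the MSR when they diverge. A compilable miniature with stand-in names (wrmsr_stub and the sketch struct are hypothetical, not part of the patch):

#include <stdint.h>

struct shared_msr_values_sketch {
	uint64_t host;	/* value the host expects in the MSR */
	uint64_t curr;	/* value currently programmed */
};

static void wrmsr_stub(uint32_t msr, uint64_t val) { (void)msr; (void)val; }

/* Guest entry path: skip the WRMSR when the masked bits did not change,
 * mirroring kvm_set_shared_msr(). */
static void set_guest_value(struct shared_msr_values_sketch *v, uint32_t msr,
			    uint64_t value, uint64_t mask)
{
	if (((value ^ v->curr) & mask) == 0)
		return;
	v->curr = value;
	wrmsr_stub(msr, value);
}

/* Return-to-userspace path: lazily restore the host value, mirroring
 * kvm_on_user_return() comparing values->host against values->curr. */
static void restore_host_value(struct shared_msr_values_sketch *v, uint32_t msr)
{
	if (v->host != v->curr) {
		wrmsr_stub(msr, v->host);
		v->curr = v->host;
	}
}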
@@ -257,12 +270,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
257} 270}
258EXPORT_SYMBOL_GPL(kvm_set_apic_base); 271EXPORT_SYMBOL_GPL(kvm_set_apic_base);
259 272
273#define EXCPT_BENIGN 0
274#define EXCPT_CONTRIBUTORY 1
275#define EXCPT_PF 2
276
277static int exception_class(int vector)
278{
279 switch (vector) {
280 case PF_VECTOR:
281 return EXCPT_PF;
282 case DE_VECTOR:
283 case TS_VECTOR:
284 case NP_VECTOR:
285 case SS_VECTOR:
286 case GP_VECTOR:
287 return EXCPT_CONTRIBUTORY;
288 default:
289 break;
290 }
291 return EXCPT_BENIGN;
292}
293
294static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
295 unsigned nr, bool has_error, u32 error_code)
296{
297 u32 prev_nr;
298 int class1, class2;
299
300 if (!vcpu->arch.exception.pending) {
301 queue:
302 vcpu->arch.exception.pending = true;
303 vcpu->arch.exception.has_error_code = has_error;
304 vcpu->arch.exception.nr = nr;
305 vcpu->arch.exception.error_code = error_code;
306 return;
307 }
308
309 /* to check exception */
310 prev_nr = vcpu->arch.exception.nr;
311 if (prev_nr == DF_VECTOR) {
312 /* triple fault -> shutdown */
313 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
314 return;
315 }
316 class1 = exception_class(prev_nr);
317 class2 = exception_class(nr);
318 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
319 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
320 /* generate double fault per SDM Table 5-5 */
321 vcpu->arch.exception.pending = true;
322 vcpu->arch.exception.has_error_code = true;
323 vcpu->arch.exception.nr = DF_VECTOR;
324 vcpu->arch.exception.error_code = 0;
325 } else
326 /* replace previous exception with a new one in a hope
327 that instruction re-execution will regenerate lost
328 exception */
329 goto queue;
330}
331
260void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 332void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
261{ 333{
262 WARN_ON(vcpu->arch.exception.pending); 334 kvm_multiple_exception(vcpu, nr, false, 0);
263 vcpu->arch.exception.pending = true;
264 vcpu->arch.exception.has_error_code = false;
265 vcpu->arch.exception.nr = nr;
266} 335}
267EXPORT_SYMBOL_GPL(kvm_queue_exception); 336EXPORT_SYMBOL_GPL(kvm_queue_exception);
268 337
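Aside: the merge rule that kvm_multiple_exception() introduces above fits in a few lines; the enum names below are illustrative, not part of the patch.

/* Per SDM Table 5-5: a pending #DF plus anything escalates to a triple
 * fault; two contributory exceptions, or a page fault followed by any
 * non-benign exception, combine into #DF; otherwise the new exception
 * simply replaces the one that was pending. */
enum excpt_class_sketch { BENIGN, CONTRIBUTORY, PGFAULT };
enum merge_result_sketch { QUEUE_NEW, DOUBLE_FAULT, TRIPLE_FAULT };

static enum merge_result_sketch
merge_exceptions(int prev_is_df, enum excpt_class_sketch prev,
		 enum excpt_class_sketch next)
{
	if (prev_is_df)
		return TRIPLE_FAULT;
	if ((prev == CONTRIBUTORY && next == CONTRIBUTORY) ||
	    (prev == PGFAULT && next != BENIGN))
		return DOUBLE_FAULT;
	return QUEUE_NEW;
}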
@@ -270,25 +339,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
270 u32 error_code) 339 u32 error_code)
271{ 340{
272 ++vcpu->stat.pf_guest; 341 ++vcpu->stat.pf_guest;
273
274 if (vcpu->arch.exception.pending) {
275 switch(vcpu->arch.exception.nr) {
276 case DF_VECTOR:
277 /* triple fault -> shutdown */
278 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
279 return;
280 case PF_VECTOR:
281 vcpu->arch.exception.nr = DF_VECTOR;
282 vcpu->arch.exception.error_code = 0;
283 return;
284 default:
285 /* replace previous exception with a new one in a hope
286 that instruction re-execution will regenerate lost
287 exception */
288 vcpu->arch.exception.pending = false;
289 break;
290 }
291 }
292 vcpu->arch.cr2 = addr; 342 vcpu->arch.cr2 = addr;
293 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 343 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
294} 344}
@@ -301,11 +351,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
301 351
302void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 352void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
303{ 353{
304 WARN_ON(vcpu->arch.exception.pending); 354 kvm_multiple_exception(vcpu, nr, true, error_code);
305 vcpu->arch.exception.pending = true;
306 vcpu->arch.exception.has_error_code = true;
307 vcpu->arch.exception.nr = nr;
308 vcpu->arch.exception.error_code = error_code;
309} 355}
310EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 356EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
311 357
@@ -383,41 +429,38 @@ out:
383 429
384void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 430void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
385{ 431{
386 if (cr0 & CR0_RESERVED_BITS) { 432 cr0 |= X86_CR0_ET;
387 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 433
388 cr0, vcpu->arch.cr0); 434#ifdef CONFIG_X86_64
435 if (cr0 & 0xffffffff00000000UL) {
389 kvm_inject_gp(vcpu, 0); 436 kvm_inject_gp(vcpu, 0);
390 return; 437 return;
391 } 438 }
439#endif
440
441 cr0 &= ~CR0_RESERVED_BITS;
392 442
393 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
394 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
395 kvm_inject_gp(vcpu, 0); 444 kvm_inject_gp(vcpu, 0);
396 return; 445 return;
397 } 446 }
398 447
399 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 448 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
400 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
401 "and a clear PE flag\n");
402 kvm_inject_gp(vcpu, 0); 449 kvm_inject_gp(vcpu, 0);
403 return; 450 return;
404 } 451 }
405 452
406 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
407#ifdef CONFIG_X86_64 454#ifdef CONFIG_X86_64
408 if ((vcpu->arch.shadow_efer & EFER_LME)) { 455 if ((vcpu->arch.efer & EFER_LME)) {
409 int cs_db, cs_l; 456 int cs_db, cs_l;
410 457
411 if (!is_pae(vcpu)) { 458 if (!is_pae(vcpu)) {
412 printk(KERN_DEBUG "set_cr0: #GP, start paging "
413 "in long mode while PAE is disabled\n");
414 kvm_inject_gp(vcpu, 0); 459 kvm_inject_gp(vcpu, 0);
415 return; 460 return;
416 } 461 }
417 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 462 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
418 if (cs_l) { 463 if (cs_l) {
419 printk(KERN_DEBUG "set_cr0: #GP, start paging "
420 "in long mode while CS.L == 1\n");
421 kvm_inject_gp(vcpu, 0); 464 kvm_inject_gp(vcpu, 0);
422 return; 465 return;
423 466
@@ -425,8 +468,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
425 } else 468 } else
426#endif 469#endif
427 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 470 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
428 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
429 "reserved bits\n");
430 kvm_inject_gp(vcpu, 0); 471 kvm_inject_gp(vcpu, 0);
431 return; 472 return;
432 } 473 }
@@ -443,38 +484,33 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
443 484
444void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 485void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
445{ 486{
446 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 487 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
447} 488}
448EXPORT_SYMBOL_GPL(kvm_lmsw); 489EXPORT_SYMBOL_GPL(kvm_lmsw);
449 490
450void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 491void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
451{ 492{
452 unsigned long old_cr4 = vcpu->arch.cr4; 493 unsigned long old_cr4 = kvm_read_cr4(vcpu);
453 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 494 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
454 495
455 if (cr4 & CR4_RESERVED_BITS) { 496 if (cr4 & CR4_RESERVED_BITS) {
456 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
457 kvm_inject_gp(vcpu, 0); 497 kvm_inject_gp(vcpu, 0);
458 return; 498 return;
459 } 499 }
460 500
461 if (is_long_mode(vcpu)) { 501 if (is_long_mode(vcpu)) {
462 if (!(cr4 & X86_CR4_PAE)) { 502 if (!(cr4 & X86_CR4_PAE)) {
463 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
464 "in long mode\n");
465 kvm_inject_gp(vcpu, 0); 503 kvm_inject_gp(vcpu, 0);
466 return; 504 return;
467 } 505 }
468 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 506 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
469 && ((cr4 ^ old_cr4) & pdptr_bits) 507 && ((cr4 ^ old_cr4) & pdptr_bits)
470 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 508 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
471 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
472 kvm_inject_gp(vcpu, 0); 509 kvm_inject_gp(vcpu, 0);
473 return; 510 return;
474 } 511 }
475 512
476 if (cr4 & X86_CR4_VMXE) { 513 if (cr4 & X86_CR4_VMXE) {
477 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
478 kvm_inject_gp(vcpu, 0); 514 kvm_inject_gp(vcpu, 0);
479 return; 515 return;
480 } 516 }
@@ -495,21 +531,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
495 531
496 if (is_long_mode(vcpu)) { 532 if (is_long_mode(vcpu)) {
497 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 533 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
498 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
499 kvm_inject_gp(vcpu, 0); 534 kvm_inject_gp(vcpu, 0);
500 return; 535 return;
501 } 536 }
502 } else { 537 } else {
503 if (is_pae(vcpu)) { 538 if (is_pae(vcpu)) {
504 if (cr3 & CR3_PAE_RESERVED_BITS) { 539 if (cr3 & CR3_PAE_RESERVED_BITS) {
505 printk(KERN_DEBUG
506 "set_cr3: #GP, reserved bits\n");
507 kvm_inject_gp(vcpu, 0); 540 kvm_inject_gp(vcpu, 0);
508 return; 541 return;
509 } 542 }
510 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 543 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
511 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
512 "reserved bits\n");
513 kvm_inject_gp(vcpu, 0); 544 kvm_inject_gp(vcpu, 0);
514 return; 545 return;
515 } 546 }
@@ -541,7 +572,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
541void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 572void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
542{ 573{
543 if (cr8 & CR8_RESERVED_BITS) { 574 if (cr8 & CR8_RESERVED_BITS) {
544 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
545 kvm_inject_gp(vcpu, 0); 575 kvm_inject_gp(vcpu, 0);
546 return; 576 return;
547 } 577 }
@@ -575,9 +605,11 @@ static inline u32 bit(int bitno)
575 * kvm-specific. Those are put in the beginning of the list. 605 * kvm-specific. Those are put in the beginning of the list.
576 */ 606 */
577 607
578#define KVM_SAVE_MSRS_BEGIN 2 608#define KVM_SAVE_MSRS_BEGIN 5
579static u32 msrs_to_save[] = { 609static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 610 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
611 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
612 HV_X64_MSR_APIC_ASSIST_PAGE,
581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 613 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
582 MSR_K6_STAR, 614 MSR_K6_STAR,
583#ifdef CONFIG_X86_64 615#ifdef CONFIG_X86_64
@@ -595,15 +627,12 @@ static u32 emulated_msrs[] = {
595static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 627static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
596{ 628{
597 if (efer & efer_reserved_bits) { 629 if (efer & efer_reserved_bits) {
598 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
599 efer);
600 kvm_inject_gp(vcpu, 0); 630 kvm_inject_gp(vcpu, 0);
601 return; 631 return;
602 } 632 }
603 633
604 if (is_paging(vcpu) 634 if (is_paging(vcpu)
605 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 635 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
606 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
607 kvm_inject_gp(vcpu, 0); 636 kvm_inject_gp(vcpu, 0);
608 return; 637 return;
609 } 638 }
@@ -613,7 +642,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
613 642
614 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 643 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
615 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 644 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
616 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
617 kvm_inject_gp(vcpu, 0); 645 kvm_inject_gp(vcpu, 0);
618 return; 646 return;
619 } 647 }
@@ -624,7 +652,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
624 652
625 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 653 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
626 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 654 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
627 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
628 kvm_inject_gp(vcpu, 0); 655 kvm_inject_gp(vcpu, 0);
629 return; 656 return;
630 } 657 }
@@ -633,9 +660,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
633 kvm_x86_ops->set_efer(vcpu, efer); 660 kvm_x86_ops->set_efer(vcpu, efer);
634 661
635 efer &= ~EFER_LMA; 662 efer &= ~EFER_LMA;
636 efer |= vcpu->arch.shadow_efer & EFER_LMA; 663 efer |= vcpu->arch.efer & EFER_LMA;
637 664
638 vcpu->arch.shadow_efer = efer; 665 vcpu->arch.efer = efer;
639 666
640 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 667 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
641 kvm_mmu_reset_context(vcpu); 668 kvm_mmu_reset_context(vcpu);
@@ -670,7 +697,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
670{ 697{
671 static int version; 698 static int version;
672 struct pvclock_wall_clock wc; 699 struct pvclock_wall_clock wc;
673 struct timespec now, sys, boot; 700 struct timespec boot;
674 701
675 if (!wall_clock) 702 if (!wall_clock)
676 return; 703 return;
@@ -685,9 +712,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
685 * wall clock specified here. guest system time equals host 712 * wall clock specified here. guest system time equals host
686 * system time for us, thus we must fill in host boot time here. 713 * system time for us, thus we must fill in host boot time here.
687 */ 714 */
688 now = current_kernel_time(); 715 getboottime(&boot);
689 ktime_get_ts(&sys);
690 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
691 716
692 wc.sec = boot.tv_sec; 717 wc.sec = boot.tv_sec;
693 wc.nsec = boot.tv_nsec; 718 wc.nsec = boot.tv_nsec;
@@ -762,6 +787,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
762 local_irq_save(flags); 787 local_irq_save(flags);
763 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 788 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
764 ktime_get_ts(&ts); 789 ktime_get_ts(&ts);
790 monotonic_to_bootbased(&ts);
765 local_irq_restore(flags); 791 local_irq_restore(flags);
766 792
767 /* With all the info we got, fill in the values */ 793 /* With all the info we got, fill in the values */
@@ -914,9 +940,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
914 if (msr >= MSR_IA32_MC0_CTL && 940 if (msr >= MSR_IA32_MC0_CTL &&
915 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 941 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
916 u32 offset = msr - MSR_IA32_MC0_CTL; 942 u32 offset = msr - MSR_IA32_MC0_CTL;
917 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 943 /* only 0 or all 1s can be written to IA32_MCi_CTL
944 * some Linux kernels though clear bit 10 in bank 4 to
945 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
946 * this to avoid an uncatched #GP in the guest
947 */
918 if ((offset & 0x3) == 0 && 948 if ((offset & 0x3) == 0 &&
919 data != 0 && data != ~(u64)0) 949 data != 0 && (data | (1 << 10)) != ~(u64)0)
920 return -1; 950 return -1;
921 vcpu->arch.mce_banks[offset] = data; 951 vcpu->arch.mce_banks[offset] = data;
922 break; 952 break;
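Aside: the relaxed IA32_MCi_CTL check above reduces to a one-line predicate; the helper name is illustrative, not part of the patch.

#include <stdint.h>
#include <stdbool.h>

/* Architecturally only 0 or all-ones may be written to IA32_MCi_CTL, but
 * Linux guests clear bit 10 of bank 4's CTL to work around the AMD K8
 * GART TLB erratum, so that pattern is tolerated instead of returning -1
 * (which would end up as #GP in the guest). */
static bool mci_ctl_write_ok(uint64_t data)
{
	return data == 0 || (data | (1ull << 10)) == ~(uint64_t)0;
}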
@@ -958,6 +988,100 @@ out:
958 return r; 988 return r;
959} 989}
960 990
991static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
992{
993 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
994}
995
996static bool kvm_hv_msr_partition_wide(u32 msr)
997{
998 bool r = false;
999 switch (msr) {
1000 case HV_X64_MSR_GUEST_OS_ID:
1001 case HV_X64_MSR_HYPERCALL:
1002 r = true;
1003 break;
1004 }
1005
1006 return r;
1007}
1008
1009static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1010{
1011 struct kvm *kvm = vcpu->kvm;
1012
1013 switch (msr) {
1014 case HV_X64_MSR_GUEST_OS_ID:
1015 kvm->arch.hv_guest_os_id = data;
1016 /* setting guest os id to zero disables hypercall page */
1017 if (!kvm->arch.hv_guest_os_id)
1018 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1019 break;
1020 case HV_X64_MSR_HYPERCALL: {
1021 u64 gfn;
1022 unsigned long addr;
1023 u8 instructions[4];
1024
1025 /* if guest os id is not set hypercall should remain disabled */
1026 if (!kvm->arch.hv_guest_os_id)
1027 break;
1028 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1029 kvm->arch.hv_hypercall = data;
1030 break;
1031 }
1032 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1033 addr = gfn_to_hva(kvm, gfn);
1034 if (kvm_is_error_hva(addr))
1035 return 1;
1036 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1037 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1038 if (copy_to_user((void __user *)addr, instructions, 4))
1039 return 1;
1040 kvm->arch.hv_hypercall = data;
1041 break;
1042 }
1043 default:
1044 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1045 "data 0x%llx\n", msr, data);
1046 return 1;
1047 }
1048 return 0;
1049}
1050
1051static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1052{
1053 switch (msr) {
1054 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1055 unsigned long addr;
1056
1057 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1058 vcpu->arch.hv_vapic = data;
1059 break;
1060 }
1061 addr = gfn_to_hva(vcpu->kvm, data >>
1062 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1063 if (kvm_is_error_hva(addr))
1064 return 1;
1065 if (clear_user((void __user *)addr, PAGE_SIZE))
1066 return 1;
1067 vcpu->arch.hv_vapic = data;
1068 break;
1069 }
1070 case HV_X64_MSR_EOI:
1071 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1072 case HV_X64_MSR_ICR:
1073 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1074 case HV_X64_MSR_TPR:
1075 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1076 default:
1077 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1078 "data 0x%llx\n", msr, data);
1079 return 1;
1080 }
1081
1082 return 0;
1083}
1084
961int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1085int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
962{ 1086{
963 switch (msr) { 1087 switch (msr) {
@@ -1072,6 +1196,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1072 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1196 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1073 "0x%x data 0x%llx\n", msr, data); 1197 "0x%x data 0x%llx\n", msr, data);
1074 break; 1198 break;
1199 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1200 if (kvm_hv_msr_partition_wide(msr)) {
1201 int r;
1202 mutex_lock(&vcpu->kvm->lock);
1203 r = set_msr_hyperv_pw(vcpu, msr, data);
1204 mutex_unlock(&vcpu->kvm->lock);
1205 return r;
1206 } else
1207 return set_msr_hyperv(vcpu, msr, data);
1208 break;
1075 default: 1209 default:
1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1210 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1077 return xen_hvm_config(vcpu, data); 1211 return xen_hvm_config(vcpu, data);
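Aside: from the guest's side, the two partition-wide Hyper-V MSRs handled above are programmed roughly as below. The MSR indices are taken from the Hyper-V specification, wrmsr_stub() is a stand-in, and the macro names are invented for the sketch; treat this as illustrative only, not as part of the patch.

#include <stdint.h>

#define HV_SKETCH_MSR_GUEST_OS_ID	0x40000000
#define HV_SKETCH_MSR_HYPERCALL		0x40000001
#define HV_SKETCH_HYPERCALL_ENABLE	0x0000000000000001ull

static void wrmsr_stub(uint32_t msr, uint64_t val) { (void)msr; (void)val; }

/* KVM keeps the hypercall page disabled until a non-zero guest OS id is
 * written; once enabled, it copies a VMCALL/VMMCALL + RET stub into the
 * guest page whose GPA sits in bits 63:12 of the MSR value. */
static void enable_hypercall_page(uint64_t os_id, uint64_t page_gpa)
{
	wrmsr_stub(HV_SKETCH_MSR_GUEST_OS_ID, os_id);	/* must be non-zero */
	wrmsr_stub(HV_SKETCH_MSR_HYPERCALL,
		   (page_gpa & ~0xfffull) | HV_SKETCH_HYPERCALL_ENABLE);
}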
@@ -1171,6 +1305,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1171 return 0; 1305 return 0;
1172} 1306}
1173 1307
1308static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1309{
1310 u64 data = 0;
1311 struct kvm *kvm = vcpu->kvm;
1312
1313 switch (msr) {
1314 case HV_X64_MSR_GUEST_OS_ID:
1315 data = kvm->arch.hv_guest_os_id;
1316 break;
1317 case HV_X64_MSR_HYPERCALL:
1318 data = kvm->arch.hv_hypercall;
1319 break;
1320 default:
1321 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1322 return 1;
1323 }
1324
1325 *pdata = data;
1326 return 0;
1327}
1328
1329static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1330{
1331 u64 data = 0;
1332
1333 switch (msr) {
1334 case HV_X64_MSR_VP_INDEX: {
1335 int r;
1336 struct kvm_vcpu *v;
1337 kvm_for_each_vcpu(r, v, vcpu->kvm)
1338 if (v == vcpu)
1339 data = r;
1340 break;
1341 }
1342 case HV_X64_MSR_EOI:
1343 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1344 case HV_X64_MSR_ICR:
1345 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1346 case HV_X64_MSR_TPR:
1347 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1348 default:
1349 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1350 return 1;
1351 }
1352 *pdata = data;
1353 return 0;
1354}
1355
1174int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1356int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1175{ 1357{
1176 u64 data; 1358 u64 data;
@@ -1222,7 +1404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1222 data |= (((uint64_t)4ULL) << 40); 1404 data |= (((uint64_t)4ULL) << 40);
1223 break; 1405 break;
1224 case MSR_EFER: 1406 case MSR_EFER:
1225 data = vcpu->arch.shadow_efer; 1407 data = vcpu->arch.efer;
1226 break; 1408 break;
1227 case MSR_KVM_WALL_CLOCK: 1409 case MSR_KVM_WALL_CLOCK:
1228 data = vcpu->kvm->arch.wall_clock; 1410 data = vcpu->kvm->arch.wall_clock;
@@ -1237,6 +1419,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1237 case MSR_IA32_MCG_STATUS: 1419 case MSR_IA32_MCG_STATUS:
1238 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1420 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1239 return get_msr_mce(vcpu, msr, pdata); 1421 return get_msr_mce(vcpu, msr, pdata);
1422 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1423 if (kvm_hv_msr_partition_wide(msr)) {
1424 int r;
1425 mutex_lock(&vcpu->kvm->lock);
1426 r = get_msr_hyperv_pw(vcpu, msr, pdata);
1427 mutex_unlock(&vcpu->kvm->lock);
1428 return r;
1429 } else
1430 return get_msr_hyperv(vcpu, msr, pdata);
1431 break;
1240 default: 1432 default:
1241 if (!ignore_msrs) { 1433 if (!ignore_msrs) {
1242 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1434 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1262,15 +1454,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1262 int (*do_msr)(struct kvm_vcpu *vcpu, 1454 int (*do_msr)(struct kvm_vcpu *vcpu,
1263 unsigned index, u64 *data)) 1455 unsigned index, u64 *data))
1264{ 1456{
1265 int i; 1457 int i, idx;
1266 1458
1267 vcpu_load(vcpu); 1459 vcpu_load(vcpu);
1268 1460
1269 down_read(&vcpu->kvm->slots_lock); 1461 idx = srcu_read_lock(&vcpu->kvm->srcu);
1270 for (i = 0; i < msrs->nmsrs; ++i) 1462 for (i = 0; i < msrs->nmsrs; ++i)
1271 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1463 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1272 break; 1464 break;
1273 up_read(&vcpu->kvm->slots_lock); 1465 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1274 1466
1275 vcpu_put(vcpu); 1467 vcpu_put(vcpu);
1276 1468
@@ -1352,6 +1544,11 @@ int kvm_dev_ioctl_check_extension(long ext)
1352 case KVM_CAP_XEN_HVM: 1544 case KVM_CAP_XEN_HVM:
1353 case KVM_CAP_ADJUST_CLOCK: 1545 case KVM_CAP_ADJUST_CLOCK:
1354 case KVM_CAP_VCPU_EVENTS: 1546 case KVM_CAP_VCPU_EVENTS:
1547 case KVM_CAP_HYPERV:
1548 case KVM_CAP_HYPERV_VAPIC:
1549 case KVM_CAP_HYPERV_SPIN:
1550 case KVM_CAP_PCI_SEGMENT:
1551 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1355 r = 1; 1552 r = 1;
1356 break; 1553 break;
1357 case KVM_CAP_COALESCED_MMIO: 1554 case KVM_CAP_COALESCED_MMIO:
@@ -1465,8 +1662,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1465 1662
1466void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1663void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1467{ 1664{
1468 kvm_x86_ops->vcpu_put(vcpu);
1469 kvm_put_guest_fpu(vcpu); 1665 kvm_put_guest_fpu(vcpu);
1666 kvm_x86_ops->vcpu_put(vcpu);
1470} 1667}
1471 1668
1472static int is_efer_nx(void) 1669static int is_efer_nx(void)
@@ -1531,6 +1728,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1531 cpuid_fix_nx_cap(vcpu); 1728 cpuid_fix_nx_cap(vcpu);
1532 r = 0; 1729 r = 0;
1533 kvm_apic_set_version(vcpu); 1730 kvm_apic_set_version(vcpu);
1731 kvm_x86_ops->cpuid_update(vcpu);
1534 1732
1535out_free: 1733out_free:
1536 vfree(cpuid_entries); 1734 vfree(cpuid_entries);
@@ -1553,6 +1751,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1553 goto out; 1751 goto out;
1554 vcpu->arch.cpuid_nent = cpuid->nent; 1752 vcpu->arch.cpuid_nent = cpuid->nent;
1555 kvm_apic_set_version(vcpu); 1753 kvm_apic_set_version(vcpu);
1754 kvm_x86_ops->cpuid_update(vcpu);
1556 return 0; 1755 return 0;
1557 1756
1558out: 1757out:
@@ -1595,12 +1794,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1595 u32 index, int *nent, int maxnent) 1794 u32 index, int *nent, int maxnent)
1596{ 1795{
1597 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1796 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1598 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1599#ifdef CONFIG_X86_64 1797#ifdef CONFIG_X86_64
1798 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
1799 ? F(GBPAGES) : 0;
1600 unsigned f_lm = F(LM); 1800 unsigned f_lm = F(LM);
1601#else 1801#else
1802 unsigned f_gbpages = 0;
1602 unsigned f_lm = 0; 1803 unsigned f_lm = 0;
1603#endif 1804#endif
1805 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
1604 1806
1605 /* cpuid 1.edx */ 1807 /* cpuid 1.edx */
1606 const u32 kvm_supported_word0_x86_features = 1808 const u32 kvm_supported_word0_x86_features =
@@ -1620,7 +1822,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1620 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1822 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1621 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1823 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1622 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1824 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1623 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1825 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
1624 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1826 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1625 /* cpuid 1.ecx */ 1827 /* cpuid 1.ecx */
1626 const u32 kvm_supported_word4_x86_features = 1828 const u32 kvm_supported_word4_x86_features =
@@ -1867,7 +2069,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1867 return 0; 2069 return 0;
1868 if (mce->status & MCI_STATUS_UC) { 2070 if (mce->status & MCI_STATUS_UC) {
1869 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2071 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1870 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 2072 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
1871 printk(KERN_DEBUG "kvm: set_mce: " 2073 printk(KERN_DEBUG "kvm: set_mce: "
1872 "injects mce exception while " 2074 "injects mce exception while "
1873 "previous one is in progress!\n"); 2075 "previous one is in progress!\n");
@@ -1913,7 +2115,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1913 2115
1914 events->sipi_vector = vcpu->arch.sipi_vector; 2116 events->sipi_vector = vcpu->arch.sipi_vector;
1915 2117
1916 events->flags = 0; 2118 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2119 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
1917 2120
1918 vcpu_put(vcpu); 2121 vcpu_put(vcpu);
1919} 2122}
@@ -1921,7 +2124,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1921static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2124static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1922 struct kvm_vcpu_events *events) 2125 struct kvm_vcpu_events *events)
1923{ 2126{
1924 if (events->flags) 2127 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2128 | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
1925 return -EINVAL; 2129 return -EINVAL;
1926 2130
1927 vcpu_load(vcpu); 2131 vcpu_load(vcpu);
@@ -1938,10 +2142,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1938 kvm_pic_clear_isr_ack(vcpu->kvm); 2142 kvm_pic_clear_isr_ack(vcpu->kvm);
1939 2143
1940 vcpu->arch.nmi_injected = events->nmi.injected; 2144 vcpu->arch.nmi_injected = events->nmi.injected;
1941 vcpu->arch.nmi_pending = events->nmi.pending; 2145 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2146 vcpu->arch.nmi_pending = events->nmi.pending;
1942 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 2147 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
1943 2148
1944 vcpu->arch.sipi_vector = events->sipi_vector; 2149 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2150 vcpu->arch.sipi_vector = events->sipi_vector;
1945 2151
1946 vcpu_put(vcpu); 2152 vcpu_put(vcpu);
1947 2153
@@ -2157,14 +2363,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2157 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2363 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
2158 return -EINVAL; 2364 return -EINVAL;
2159 2365
2160 down_write(&kvm->slots_lock); 2366 mutex_lock(&kvm->slots_lock);
2161 spin_lock(&kvm->mmu_lock); 2367 spin_lock(&kvm->mmu_lock);
2162 2368
2163 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2369 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
2164 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2370 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
2165 2371
2166 spin_unlock(&kvm->mmu_lock); 2372 spin_unlock(&kvm->mmu_lock);
2167 up_write(&kvm->slots_lock); 2373 mutex_unlock(&kvm->slots_lock);
2168 return 0; 2374 return 0;
2169} 2375}
2170 2376
@@ -2173,13 +2379,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2173 return kvm->arch.n_alloc_mmu_pages; 2379 return kvm->arch.n_alloc_mmu_pages;
2174} 2380}
2175 2381
2382gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2383{
2384 int i;
2385 struct kvm_mem_alias *alias;
2386 struct kvm_mem_aliases *aliases;
2387
2388 aliases = rcu_dereference(kvm->arch.aliases);
2389
2390 for (i = 0; i < aliases->naliases; ++i) {
2391 alias = &aliases->aliases[i];
2392 if (alias->flags & KVM_ALIAS_INVALID)
2393 continue;
2394 if (gfn >= alias->base_gfn
2395 && gfn < alias->base_gfn + alias->npages)
2396 return alias->target_gfn + gfn - alias->base_gfn;
2397 }
2398 return gfn;
2399}
2400
2176gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2401gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2177{ 2402{
2178 int i; 2403 int i;
2179 struct kvm_mem_alias *alias; 2404 struct kvm_mem_alias *alias;
2405 struct kvm_mem_aliases *aliases;
2180 2406
2181 for (i = 0; i < kvm->arch.naliases; ++i) { 2407 aliases = rcu_dereference(kvm->arch.aliases);
2182 alias = &kvm->arch.aliases[i]; 2408
2409 for (i = 0; i < aliases->naliases; ++i) {
2410 alias = &aliases->aliases[i];
2183 if (gfn >= alias->base_gfn 2411 if (gfn >= alias->base_gfn
2184 && gfn < alias->base_gfn + alias->npages) 2412 && gfn < alias->base_gfn + alias->npages)
2185 return alias->target_gfn + gfn - alias->base_gfn; 2413 return alias->target_gfn + gfn - alias->base_gfn;
@@ -2197,6 +2425,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2197{ 2425{
2198 int r, n; 2426 int r, n;
2199 struct kvm_mem_alias *p; 2427 struct kvm_mem_alias *p;
2428 struct kvm_mem_aliases *aliases, *old_aliases;
2200 2429
2201 r = -EINVAL; 2430 r = -EINVAL;
2202 /* General sanity checks */ 2431 /* General sanity checks */
@@ -2213,26 +2442,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2213 < alias->target_phys_addr) 2442 < alias->target_phys_addr)
2214 goto out; 2443 goto out;
2215 2444
2216 down_write(&kvm->slots_lock); 2445 r = -ENOMEM;
2217 spin_lock(&kvm->mmu_lock); 2446 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2447 if (!aliases)
2448 goto out;
2449
2450 mutex_lock(&kvm->slots_lock);
2218 2451
2219 p = &kvm->arch.aliases[alias->slot]; 2452 /* invalidate any gfn reference in case of deletion/shrinking */
2453 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2454 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2455 old_aliases = kvm->arch.aliases;
2456 rcu_assign_pointer(kvm->arch.aliases, aliases);
2457 synchronize_srcu_expedited(&kvm->srcu);
2458 kvm_mmu_zap_all(kvm);
2459 kfree(old_aliases);
2460
2461 r = -ENOMEM;
2462 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2463 if (!aliases)
2464 goto out_unlock;
2465
2466 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2467
2468 p = &aliases->aliases[alias->slot];
2220 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2469 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2221 p->npages = alias->memory_size >> PAGE_SHIFT; 2470 p->npages = alias->memory_size >> PAGE_SHIFT;
2222 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2471 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2472 p->flags &= ~(KVM_ALIAS_INVALID);
2223 2473
2224 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2474 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2225 if (kvm->arch.aliases[n - 1].npages) 2475 if (aliases->aliases[n - 1].npages)
2226 break; 2476 break;
2227 kvm->arch.naliases = n; 2477 aliases->naliases = n;
2228 2478
2229 spin_unlock(&kvm->mmu_lock); 2479 old_aliases = kvm->arch.aliases;
2230 kvm_mmu_zap_all(kvm); 2480 rcu_assign_pointer(kvm->arch.aliases, aliases);
2231 2481 synchronize_srcu_expedited(&kvm->srcu);
2232 up_write(&kvm->slots_lock); 2482 kfree(old_aliases);
2233 2483 r = 0;
2234 return 0;
2235 2484
2485out_unlock:
2486 mutex_unlock(&kvm->slots_lock);
2236out: 2487out:
2237 return r; 2488 return r;
2238} 2489}
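Aside: the copy/publish/synchronize/free sequence used above (and for the memslots and dirty-log paths elsewhere in this change) is the classic RCU replacement pattern. A compilable miniature, with a C11 atomic pointer standing in for rcu_assign_pointer()/rcu_dereference() and a no-op standing in for synchronize_srcu_expedited(); all names are hypothetical.

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct alias_table_sketch { int naliases; /* payload trimmed */ };

/* Readers dereference this inside their read-side section; assume it was
 * initialised to a valid table at setup time. */
static _Atomic(struct alias_table_sketch *) live_table;

static void wait_for_readers(void) { /* SRCU grace period elided */ }

/* Writers never edit the live table in place: build a copy, publish it,
 * wait for pre-existing readers to drain, then free the old table. */
static int update_table(void (*edit)(struct alias_table_sketch *))
{
	struct alias_table_sketch *old = atomic_load(&live_table);
	struct alias_table_sketch *next = malloc(sizeof(*next));

	if (!next)
		return -1;
	memcpy(next, old, sizeof(*next));
	edit(next);				/* apply the change to the copy */
	atomic_store(&live_table, next);	/* publish, like rcu_assign_pointer() */
	wait_for_readers();			/* like synchronize_srcu_expedited() */
	free(old);
	return 0;
}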
@@ -2270,18 +2521,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2270 r = 0; 2521 r = 0;
2271 switch (chip->chip_id) { 2522 switch (chip->chip_id) {
2272 case KVM_IRQCHIP_PIC_MASTER: 2523 case KVM_IRQCHIP_PIC_MASTER:
2273 spin_lock(&pic_irqchip(kvm)->lock); 2524 raw_spin_lock(&pic_irqchip(kvm)->lock);
2274 memcpy(&pic_irqchip(kvm)->pics[0], 2525 memcpy(&pic_irqchip(kvm)->pics[0],
2275 &chip->chip.pic, 2526 &chip->chip.pic,
2276 sizeof(struct kvm_pic_state)); 2527 sizeof(struct kvm_pic_state));
2277 spin_unlock(&pic_irqchip(kvm)->lock); 2528 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2278 break; 2529 break;
2279 case KVM_IRQCHIP_PIC_SLAVE: 2530 case KVM_IRQCHIP_PIC_SLAVE:
2280 spin_lock(&pic_irqchip(kvm)->lock); 2531 raw_spin_lock(&pic_irqchip(kvm)->lock);
2281 memcpy(&pic_irqchip(kvm)->pics[1], 2532 memcpy(&pic_irqchip(kvm)->pics[1],
2282 &chip->chip.pic, 2533 &chip->chip.pic,
2283 sizeof(struct kvm_pic_state)); 2534 sizeof(struct kvm_pic_state));
2284 spin_unlock(&pic_irqchip(kvm)->lock); 2535 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2285 break; 2536 break;
2286 case KVM_IRQCHIP_IOAPIC: 2537 case KVM_IRQCHIP_IOAPIC:
2287 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 2538 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2361,29 +2612,63 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2361int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2612int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2362 struct kvm_dirty_log *log) 2613 struct kvm_dirty_log *log)
2363{ 2614{
2364 int r; 2615 int r, i;
2365 int n;
2366 struct kvm_memory_slot *memslot; 2616 struct kvm_memory_slot *memslot;
2367 int is_dirty = 0; 2617 unsigned long n;
2618 unsigned long is_dirty = 0;
2619 unsigned long *dirty_bitmap = NULL;
2368 2620
2369 down_write(&kvm->slots_lock); 2621 mutex_lock(&kvm->slots_lock);
2370 2622
2371 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2623 r = -EINVAL;
2372 if (r) 2624 if (log->slot >= KVM_MEMORY_SLOTS)
2625 goto out;
2626
2627 memslot = &kvm->memslots->memslots[log->slot];
2628 r = -ENOENT;
2629 if (!memslot->dirty_bitmap)
2630 goto out;
2631
2632 n = kvm_dirty_bitmap_bytes(memslot);
2633
2634 r = -ENOMEM;
2635 dirty_bitmap = vmalloc(n);
2636 if (!dirty_bitmap)
2373 goto out; 2637 goto out;
2638 memset(dirty_bitmap, 0, n);
2639
2640 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2641 is_dirty = memslot->dirty_bitmap[i];
2374 2642
2375 /* If nothing is dirty, don't bother messing with page tables. */ 2643 /* If nothing is dirty, don't bother messing with page tables. */
2376 if (is_dirty) { 2644 if (is_dirty) {
2645 struct kvm_memslots *slots, *old_slots;
2646
2377 spin_lock(&kvm->mmu_lock); 2647 spin_lock(&kvm->mmu_lock);
2378 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2648 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2379 spin_unlock(&kvm->mmu_lock); 2649 spin_unlock(&kvm->mmu_lock);
2380 memslot = &kvm->memslots[log->slot]; 2650
2381 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2651 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2382 memset(memslot->dirty_bitmap, 0, n); 2652 if (!slots)
2653 goto out_free;
2654
2655 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2656 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2657
2658 old_slots = kvm->memslots;
2659 rcu_assign_pointer(kvm->memslots, slots);
2660 synchronize_srcu_expedited(&kvm->srcu);
2661 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2662 kfree(old_slots);
2383 } 2663 }
2664
2384 r = 0; 2665 r = 0;
2666 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2667 r = -EFAULT;
2668out_free:
2669 vfree(dirty_bitmap);
2385out: 2670out:
2386 up_write(&kvm->slots_lock); 2671 mutex_unlock(&kvm->slots_lock);
2387 return r; 2672 return r;
2388} 2673}
2389 2674
@@ -2466,6 +2751,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2466 if (vpic) { 2751 if (vpic) {
2467 r = kvm_ioapic_init(kvm); 2752 r = kvm_ioapic_init(kvm);
2468 if (r) { 2753 if (r) {
2754 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
2755 &vpic->dev);
2469 kfree(vpic); 2756 kfree(vpic);
2470 goto create_irqchip_unlock; 2757 goto create_irqchip_unlock;
2471 } 2758 }
@@ -2477,10 +2764,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2477 r = kvm_setup_default_irq_routing(kvm); 2764 r = kvm_setup_default_irq_routing(kvm);
2478 if (r) { 2765 if (r) {
2479 mutex_lock(&kvm->irq_lock); 2766 mutex_lock(&kvm->irq_lock);
2480 kfree(kvm->arch.vpic); 2767 kvm_ioapic_destroy(kvm);
2481 kfree(kvm->arch.vioapic); 2768 kvm_destroy_pic(kvm);
2482 kvm->arch.vpic = NULL;
2483 kvm->arch.vioapic = NULL;
2484 mutex_unlock(&kvm->irq_lock); 2769 mutex_unlock(&kvm->irq_lock);
2485 } 2770 }
2486 create_irqchip_unlock: 2771 create_irqchip_unlock:
@@ -2496,7 +2781,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2496 sizeof(struct kvm_pit_config))) 2781 sizeof(struct kvm_pit_config)))
2497 goto out; 2782 goto out;
2498 create_pit: 2783 create_pit:
2499 down_write(&kvm->slots_lock); 2784 mutex_lock(&kvm->slots_lock);
2500 r = -EEXIST; 2785 r = -EEXIST;
2501 if (kvm->arch.vpit) 2786 if (kvm->arch.vpit)
2502 goto create_pit_unlock; 2787 goto create_pit_unlock;
@@ -2505,7 +2790,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2505 if (kvm->arch.vpit) 2790 if (kvm->arch.vpit)
2506 r = 0; 2791 r = 0;
2507 create_pit_unlock: 2792 create_pit_unlock:
2508 up_write(&kvm->slots_lock); 2793 mutex_unlock(&kvm->slots_lock);
2509 break; 2794 break;
2510 case KVM_IRQ_LINE_STATUS: 2795 case KVM_IRQ_LINE_STATUS:
2511 case KVM_IRQ_LINE: { 2796 case KVM_IRQ_LINE: {
@@ -2722,7 +3007,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2722 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3007 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2723 return 0; 3008 return 0;
2724 3009
2725 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 3010 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2726} 3011}
2727 3012
2728static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3013static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2731,17 +3016,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2731 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3016 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2732 return 0; 3017 return 0;
2733 3018
2734 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 3019 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2735} 3020}
2736 3021
2737static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3022gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
2738 struct kvm_vcpu *vcpu) 3023{
3024 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3025 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3026}
3027
3028 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3029{
3030 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3031 access |= PFERR_FETCH_MASK;
3032 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3033}
3034
3035gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3036{
3037 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3038 access |= PFERR_WRITE_MASK;
3039 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3040}
3041
3042/* used to access any guest's mapped memory without checking CPL */
3043gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3044{
3045 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
3046}
3047
3048static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3049 struct kvm_vcpu *vcpu, u32 access,
3050 u32 *error)
2739{ 3051{
2740 void *data = val; 3052 void *data = val;
2741 int r = X86EMUL_CONTINUE; 3053 int r = X86EMUL_CONTINUE;
2742 3054
2743 while (bytes) { 3055 while (bytes) {
2744 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3056 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
2745 unsigned offset = addr & (PAGE_SIZE-1); 3057 unsigned offset = addr & (PAGE_SIZE-1);
2746 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3058 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2747 int ret; 3059 int ret;
@@ -2764,14 +3076,37 @@ out:
2764 return r; 3076 return r;
2765} 3077}
2766 3078
3079/* used for instruction fetching */
3080static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3081 struct kvm_vcpu *vcpu, u32 *error)
3082{
3083 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3084 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3085 access | PFERR_FETCH_MASK, error);
3086}
3087
3088static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3089 struct kvm_vcpu *vcpu, u32 *error)
3090{
3091 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3092 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3093 error);
3094}
3095
3096static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3097 struct kvm_vcpu *vcpu, u32 *error)
3098{
3099 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3100}
3101
2767static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3102static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2768 struct kvm_vcpu *vcpu) 3103 struct kvm_vcpu *vcpu, u32 *error)
2769{ 3104{
2770 void *data = val; 3105 void *data = val;
2771 int r = X86EMUL_CONTINUE; 3106 int r = X86EMUL_CONTINUE;
2772 3107
2773 while (bytes) { 3108 while (bytes) {
2774 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3109 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
2775 unsigned offset = addr & (PAGE_SIZE-1); 3110 unsigned offset = addr & (PAGE_SIZE-1);
2776 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3111 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2777 int ret; 3112 int ret;
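The two hunks above thread a PFERR_* access mask and an error pointer through every guest-virtual read and write: the read, fetch, write and system variants differ only in which permission bits they OR into the mask before calling mmu.gva_to_gpa(). A compact restatement of that composition, assuming the constants and signature shown above (want_write/is_fetch are illustrative parameters):

    /* Sketch: build the access mask the way the new helpers do. */
    static gpa_t translate_gva(struct kvm_vcpu *vcpu, gva_t gva,
                               bool want_write, bool is_fetch, u32 *error)
    {
            u32 access = 0;

            if (kvm_x86_ops->get_cpl(vcpu) == 3)    /* user-mode access */
                    access |= PFERR_USER_MASK;
            if (want_write)
                    access |= PFERR_WRITE_MASK;
            if (is_fetch)
                    access |= PFERR_FETCH_MASK;

            /* the *_system flavor passes access == 0, skipping the CPL check */
            return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
    }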
@@ -2801,6 +3136,7 @@ static int emulator_read_emulated(unsigned long addr,
2801 struct kvm_vcpu *vcpu) 3136 struct kvm_vcpu *vcpu)
2802{ 3137{
2803 gpa_t gpa; 3138 gpa_t gpa;
3139 u32 error_code;
2804 3140
2805 if (vcpu->mmio_read_completed) { 3141 if (vcpu->mmio_read_completed) {
2806 memcpy(val, vcpu->mmio_data, bytes); 3142 memcpy(val, vcpu->mmio_data, bytes);
@@ -2810,17 +3146,20 @@ static int emulator_read_emulated(unsigned long addr,
2810 return X86EMUL_CONTINUE; 3146 return X86EMUL_CONTINUE;
2811 } 3147 }
2812 3148
2813 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3149 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
3150
3151 if (gpa == UNMAPPED_GVA) {
3152 kvm_inject_page_fault(vcpu, addr, error_code);
3153 return X86EMUL_PROPAGATE_FAULT;
3154 }
2814 3155
2815 /* For APIC access vmexit */ 3156 /* For APIC access vmexit */
2816 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3157 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2817 goto mmio; 3158 goto mmio;
2818 3159
2819 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 3160 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
2820 == X86EMUL_CONTINUE) 3161 == X86EMUL_CONTINUE)
2821 return X86EMUL_CONTINUE; 3162 return X86EMUL_CONTINUE;
2822 if (gpa == UNMAPPED_GVA)
2823 return X86EMUL_PROPAGATE_FAULT;
2824 3163
2825mmio: 3164mmio:
2826 /* 3165 /*
@@ -2859,11 +3198,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2859 struct kvm_vcpu *vcpu) 3198 struct kvm_vcpu *vcpu)
2860{ 3199{
2861 gpa_t gpa; 3200 gpa_t gpa;
3201 u32 error_code;
2862 3202
2863 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3203 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
2864 3204
2865 if (gpa == UNMAPPED_GVA) { 3205 if (gpa == UNMAPPED_GVA) {
2866 kvm_inject_page_fault(vcpu, addr, 2); 3206 kvm_inject_page_fault(vcpu, addr, error_code);
2867 return X86EMUL_PROPAGATE_FAULT; 3207 return X86EMUL_PROPAGATE_FAULT;
2868 } 3208 }
2869 3209
@@ -2927,7 +3267,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2927 char *kaddr; 3267 char *kaddr;
2928 u64 val; 3268 u64 val;
2929 3269
2930 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3270 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
2931 3271
2932 if (gpa == UNMAPPED_GVA || 3272 if (gpa == UNMAPPED_GVA ||
2933 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3273 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2964,35 +3304,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2964 3304
2965int emulate_clts(struct kvm_vcpu *vcpu) 3305int emulate_clts(struct kvm_vcpu *vcpu)
2966{ 3306{
2967 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 3307 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3308 kvm_x86_ops->fpu_activate(vcpu);
2968 return X86EMUL_CONTINUE; 3309 return X86EMUL_CONTINUE;
2969} 3310}
2970 3311
2971int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3312int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2972{ 3313{
2973 struct kvm_vcpu *vcpu = ctxt->vcpu; 3314 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
2974
2975 switch (dr) {
2976 case 0 ... 3:
2977 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2978 return X86EMUL_CONTINUE;
2979 default:
2980 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2981 return X86EMUL_UNHANDLEABLE;
2982 }
2983} 3315}
2984 3316
2985int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3317int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2986{ 3318{
2987 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3319 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2988 int exception;
2989 3320
2990 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 3321 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
2991 if (exception) {
2992 /* FIXME: better handling */
2993 return X86EMUL_UNHANDLEABLE;
2994 }
2995 return X86EMUL_CONTINUE;
2996} 3322}
2997 3323
2998void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3324void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3006,7 +3332,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3006 3332
3007 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3333 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
3008 3334
3009 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 3335 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3010 3336
3011 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 3337 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3012 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 3338 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -3014,7 +3340,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3014EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3340EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3015 3341
3016static struct x86_emulate_ops emulate_ops = { 3342static struct x86_emulate_ops emulate_ops = {
3017 .read_std = kvm_read_guest_virt, 3343 .read_std = kvm_read_guest_virt_system,
3344 .fetch = kvm_fetch_guest_virt,
3018 .read_emulated = emulator_read_emulated, 3345 .read_emulated = emulator_read_emulated,
3019 .write_emulated = emulator_write_emulated, 3346 .write_emulated = emulator_write_emulated,
3020 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3347 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -3057,8 +3384,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3057 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3384 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3058 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3385 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
3059 vcpu->arch.emulate_ctxt.mode = 3386 vcpu->arch.emulate_ctxt.mode =
3387 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3060 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3388 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3061 ? X86EMUL_MODE_REAL : cs_l 3389 ? X86EMUL_MODE_VM86 : cs_l
3062 ? X86EMUL_MODE_PROT64 : cs_db 3390 ? X86EMUL_MODE_PROT64 : cs_db
3063 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3391 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3064 3392
@@ -3150,12 +3478,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
3150 gva_t q = vcpu->arch.pio.guest_gva; 3478 gva_t q = vcpu->arch.pio.guest_gva;
3151 unsigned bytes; 3479 unsigned bytes;
3152 int ret; 3480 int ret;
3481 u32 error_code;
3153 3482
3154 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 3483 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3155 if (vcpu->arch.pio.in) 3484 if (vcpu->arch.pio.in)
3156 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 3485 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3157 else 3486 else
3158 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 3487 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3488
3489 if (ret == X86EMUL_PROPAGATE_FAULT)
3490 kvm_inject_page_fault(vcpu, q, error_code);
3491
3159 return ret; 3492 return ret;
3160} 3493}
3161 3494
@@ -3176,7 +3509,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
3176 if (io->in) { 3509 if (io->in) {
3177 r = pio_copy_data(vcpu); 3510 r = pio_copy_data(vcpu);
3178 if (r) 3511 if (r)
3179 return r; 3512 goto out;
3180 } 3513 }
3181 3514
3182 delta = 1; 3515 delta = 1;
@@ -3203,7 +3536,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
3203 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3536 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3204 } 3537 }
3205 } 3538 }
3206 3539out:
3207 io->count -= io->cur_count; 3540 io->count -= io->cur_count;
3208 io->cur_count = 0; 3541 io->cur_count = 0;
3209 3542
@@ -3216,11 +3549,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3216 int r; 3549 int r;
3217 3550
3218 if (vcpu->arch.pio.in) 3551 if (vcpu->arch.pio.in)
3219 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3552 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3220 vcpu->arch.pio.size, pd); 3553 vcpu->arch.pio.size, pd);
3221 else 3554 else
3222 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3555 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3223 vcpu->arch.pio.size, pd); 3556 vcpu->arch.pio.port, vcpu->arch.pio.size,
3557 pd);
3224 return r; 3558 return r;
3225} 3559}
3226 3560
@@ -3231,7 +3565,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
3231 int i, r = 0; 3565 int i, r = 0;
3232 3566
3233 for (i = 0; i < io->cur_count; i++) { 3567 for (i = 0; i < io->cur_count; i++) {
3234 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3568 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3235 io->port, io->size, pd)) { 3569 io->port, io->size, pd)) {
3236 r = -EOPNOTSUPP; 3570 r = -EOPNOTSUPP;
3237 break; 3571 break;
@@ -3245,6 +3579,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3245{ 3579{
3246 unsigned long val; 3580 unsigned long val;
3247 3581
3582 trace_kvm_pio(!in, port, size, 1);
3583
3248 vcpu->run->exit_reason = KVM_EXIT_IO; 3584 vcpu->run->exit_reason = KVM_EXIT_IO;
3249 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3585 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3250 vcpu->run->io.size = vcpu->arch.pio.size = size; 3586 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3256,11 +3592,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3256 vcpu->arch.pio.down = 0; 3592 vcpu->arch.pio.down = 0;
3257 vcpu->arch.pio.rep = 0; 3593 vcpu->arch.pio.rep = 0;
3258 3594
3259 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3595 if (!vcpu->arch.pio.in) {
3260 size, 1); 3596 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3261 3597 memcpy(vcpu->arch.pio_data, &val, 4);
3262 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3598 }
3263 memcpy(vcpu->arch.pio_data, &val, 4);
3264 3599
3265 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3600 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3266 complete_pio(vcpu); 3601 complete_pio(vcpu);
@@ -3277,6 +3612,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3277 unsigned now, in_page; 3612 unsigned now, in_page;
3278 int ret = 0; 3613 int ret = 0;
3279 3614
3615 trace_kvm_pio(!in, port, size, count);
3616
3280 vcpu->run->exit_reason = KVM_EXIT_IO; 3617 vcpu->run->exit_reason = KVM_EXIT_IO;
3281 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3618 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3282 vcpu->run->io.size = vcpu->arch.pio.size = size; 3619 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3288,9 +3625,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3288 vcpu->arch.pio.down = down; 3625 vcpu->arch.pio.down = down;
3289 vcpu->arch.pio.rep = rep; 3626 vcpu->arch.pio.rep = rep;
3290 3627
3291 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3292 size, count);
3293
3294 if (!count) { 3628 if (!count) {
3295 kvm_x86_ops->skip_emulated_instruction(vcpu); 3629 kvm_x86_ops->skip_emulated_instruction(vcpu);
3296 return 1; 3630 return 1;
@@ -3322,10 +3656,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3322 if (!vcpu->arch.pio.in) { 3656 if (!vcpu->arch.pio.in) {
3323 /* string PIO write */ 3657 /* string PIO write */
3324 ret = pio_copy_data(vcpu); 3658 ret = pio_copy_data(vcpu);
3325 if (ret == X86EMUL_PROPAGATE_FAULT) { 3659 if (ret == X86EMUL_PROPAGATE_FAULT)
3326 kvm_inject_gp(vcpu, 0);
3327 return 1; 3660 return 1;
3328 }
3329 if (ret == 0 && !pio_string_write(vcpu)) { 3661 if (ret == 0 && !pio_string_write(vcpu)) {
3330 complete_pio(vcpu); 3662 complete_pio(vcpu);
3331 if (vcpu->arch.pio.count == 0) 3663 if (vcpu->arch.pio.count == 0)
@@ -3484,11 +3816,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3484 return a0 | ((gpa_t)a1 << 32); 3816 return a0 | ((gpa_t)a1 << 32);
3485} 3817}
3486 3818
3819int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
3820{
3821 u64 param, ingpa, outgpa, ret;
3822 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
3823 bool fast, longmode;
3824 int cs_db, cs_l;
3825
3826 /*
3827 * A hypercall generates #UD from non-zero CPL and from real mode,
3828 * per the Hyper-V spec
3829 */
3830 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
3831 kvm_queue_exception(vcpu, UD_VECTOR);
3832 return 0;
3833 }
3834
3835 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3836 longmode = is_long_mode(vcpu) && cs_l == 1;
3837
3838 if (!longmode) {
3839 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
3840 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
3841 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
3842 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
3843 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
3844 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
3845 }
3846#ifdef CONFIG_X86_64
3847 else {
3848 param = kvm_register_read(vcpu, VCPU_REGS_RCX);
3849 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
3850 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
3851 }
3852#endif
3853
3854 code = param & 0xffff;
3855 fast = (param >> 16) & 0x1;
3856 rep_cnt = (param >> 32) & 0xfff;
3857 rep_idx = (param >> 48) & 0xfff;
3858
3859 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
3860
3861 switch (code) {
3862 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
3863 kvm_vcpu_on_spin(vcpu);
3864 break;
3865 default:
3866 res = HV_STATUS_INVALID_HYPERCALL_CODE;
3867 break;
3868 }
3869
3870 ret = res | (((u64)rep_done & 0xfff) << 32);
3871 if (longmode) {
3872 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3873 } else {
3874 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
3875 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
3876 }
3877
3878 return 1;
3879}
3880
3487int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3881int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3488{ 3882{
3489 unsigned long nr, a0, a1, a2, a3, ret; 3883 unsigned long nr, a0, a1, a2, a3, ret;
3490 int r = 1; 3884 int r = 1;
3491 3885
3886 if (kvm_hv_hypercall_enabled(vcpu->kvm))
3887 return kvm_hv_hypercall(vcpu);
3888
3492 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3889 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3493 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3890 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3494 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3891 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
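kvm_hv_hypercall() above reads the Hyper-V hypercall input from RCX/RDX/R8 in long mode and from register pairs (EDX:EAX, EBX:ECX, EDI:ESI) otherwise, then unpacks the 64-bit input value into a call code, a fast-call flag and rep counts. A sketch of just that field decoding, assuming the bit layout used by the code above (struct and function names are illustrative):

    struct hv_hcall_input {
            u16  code;      /* bits  0..15: hypercall code            */
            bool fast;      /* bit  16:     register-based ("fast")   */
            u16  rep_cnt;   /* bits 32..43: repetition count          */
            u16  rep_idx;   /* bits 48..59: starting repetition index */
    };

    static struct hv_hcall_input hv_decode_input(u64 param)
    {
            struct hv_hcall_input in = {
                    .code    = param & 0xffff,
                    .fast    = (param >> 16) & 0x1,
                    .rep_cnt = (param >> 32) & 0xfff,
                    .rep_idx = (param >> 48) & 0xfff,
            };
            return in;
    }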
@@ -3531,10 +3928,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3531int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3928int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3532{ 3929{
3533 char instruction[3]; 3930 char instruction[3];
3534 int ret = 0;
3535 unsigned long rip = kvm_rip_read(vcpu); 3931 unsigned long rip = kvm_rip_read(vcpu);
3536 3932
3537
3538 /* 3933 /*
3539 * Blow out the MMU to ensure that no other VCPU has an active mapping 3934 * Blow out the MMU to ensure that no other VCPU has an active mapping
3540 * to ensure that the updated hypercall appears atomically across all 3935 * to ensure that the updated hypercall appears atomically across all
@@ -3543,11 +3938,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3543 kvm_mmu_zap_all(vcpu->kvm); 3938 kvm_mmu_zap_all(vcpu->kvm);
3544 3939
3545 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3940 kvm_x86_ops->patch_hypercall(vcpu, instruction);
3546 if (emulator_write_emulated(rip, instruction, 3, vcpu)
3547 != X86EMUL_CONTINUE)
3548 ret = -EFAULT;
3549 3941
3550 return ret; 3942 return emulator_write_emulated(rip, instruction, 3, vcpu);
3551} 3943}
3552 3944
3553static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3945static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3580,10 +3972,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3580{ 3972{
3581 unsigned long value; 3973 unsigned long value;
3582 3974
3583 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3584 switch (cr) { 3975 switch (cr) {
3585 case 0: 3976 case 0:
3586 value = vcpu->arch.cr0; 3977 value = kvm_read_cr0(vcpu);
3587 break; 3978 break;
3588 case 2: 3979 case 2:
3589 value = vcpu->arch.cr2; 3980 value = vcpu->arch.cr2;
@@ -3592,7 +3983,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3592 value = vcpu->arch.cr3; 3983 value = vcpu->arch.cr3;
3593 break; 3984 break;
3594 case 4: 3985 case 4:
3595 value = vcpu->arch.cr4; 3986 value = kvm_read_cr4(vcpu);
3596 break; 3987 break;
3597 case 8: 3988 case 8:
3598 value = kvm_get_cr8(vcpu); 3989 value = kvm_get_cr8(vcpu);
@@ -3610,7 +4001,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3610{ 4001{
3611 switch (cr) { 4002 switch (cr) {
3612 case 0: 4003 case 0:
3613 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 4004 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3614 *rflags = kvm_get_rflags(vcpu); 4005 *rflags = kvm_get_rflags(vcpu);
3615 break; 4006 break;
3616 case 2: 4007 case 2:
@@ -3620,7 +4011,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3620 kvm_set_cr3(vcpu, val); 4011 kvm_set_cr3(vcpu, val);
3621 break; 4012 break;
3622 case 4: 4013 case 4:
3623 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 4014 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3624 break; 4015 break;
3625 case 8: 4016 case 8:
3626 kvm_set_cr8(vcpu, val & 0xfUL); 4017 kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3687,6 +4078,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3687 } 4078 }
3688 return best; 4079 return best;
3689} 4080}
4081EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
3690 4082
3691int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 4083int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3692{ 4084{
@@ -3770,14 +4162,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
3770static void vapic_exit(struct kvm_vcpu *vcpu) 4162static void vapic_exit(struct kvm_vcpu *vcpu)
3771{ 4163{
3772 struct kvm_lapic *apic = vcpu->arch.apic; 4164 struct kvm_lapic *apic = vcpu->arch.apic;
4165 int idx;
3773 4166
3774 if (!apic || !apic->vapic_addr) 4167 if (!apic || !apic->vapic_addr)
3775 return; 4168 return;
3776 4169
3777 down_read(&vcpu->kvm->slots_lock); 4170 idx = srcu_read_lock(&vcpu->kvm->srcu);
3778 kvm_release_page_dirty(apic->vapic_page); 4171 kvm_release_page_dirty(apic->vapic_page);
3779 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 4172 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3780 up_read(&vcpu->kvm->slots_lock); 4173 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3781} 4174}
3782 4175
3783static void update_cr8_intercept(struct kvm_vcpu *vcpu) 4176static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -3873,12 +4266,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3873 r = 0; 4266 r = 0;
3874 goto out; 4267 goto out;
3875 } 4268 }
4269 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
4270 vcpu->fpu_active = 0;
4271 kvm_x86_ops->fpu_deactivate(vcpu);
4272 }
3876 } 4273 }
3877 4274
3878 preempt_disable(); 4275 preempt_disable();
3879 4276
3880 kvm_x86_ops->prepare_guest_switch(vcpu); 4277 kvm_x86_ops->prepare_guest_switch(vcpu);
3881 kvm_load_guest_fpu(vcpu); 4278 if (vcpu->fpu_active)
4279 kvm_load_guest_fpu(vcpu);
3882 4280
3883 local_irq_disable(); 4281 local_irq_disable();
3884 4282
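The hunk above is the consumer half of lazy FPU deactivation: kvm_put_guest_fpu() (later in this diff) no longer drops the guest FPU synchronously but sets KVM_REQ_DEACTIVATE_FPU in vcpu->requests, and vcpu_enter_guest() acts on the request before the next entry. The pairing, reduced to its two sides (sketch only, using the request-bit helpers seen above):

    /* Producer: after saving guest FPU state (kvm_put_guest_fpu). */
    set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);

    /* Consumer: on the next guest entry (vcpu_enter_guest). */
    if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
            vcpu->fpu_active = 0;
            kvm_x86_ops->fpu_deactivate(vcpu);  /* let the host trap the next FPU use */
    }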
@@ -3906,7 +4304,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3906 kvm_lapic_sync_to_vapic(vcpu); 4304 kvm_lapic_sync_to_vapic(vcpu);
3907 } 4305 }
3908 4306
3909 up_read(&vcpu->kvm->slots_lock); 4307 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3910 4308
3911 kvm_guest_enter(); 4309 kvm_guest_enter();
3912 4310
@@ -3948,7 +4346,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3948 4346
3949 preempt_enable(); 4347 preempt_enable();
3950 4348
3951 down_read(&vcpu->kvm->slots_lock); 4349 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3952 4350
3953 /* 4351 /*
3954 * Profile KVM exit RIPs: 4352 * Profile KVM exit RIPs:
@@ -3970,6 +4368,7 @@ out:
3970static int __vcpu_run(struct kvm_vcpu *vcpu) 4368static int __vcpu_run(struct kvm_vcpu *vcpu)
3971{ 4369{
3972 int r; 4370 int r;
4371 struct kvm *kvm = vcpu->kvm;
3973 4372
3974 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 4373 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3975 pr_debug("vcpu %d received sipi with vector # %x\n", 4374 pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3981,7 +4380,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
3981 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4380 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3982 } 4381 }
3983 4382
3984 down_read(&vcpu->kvm->slots_lock); 4383 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3985 vapic_enter(vcpu); 4384 vapic_enter(vcpu);
3986 4385
3987 r = 1; 4386 r = 1;
@@ -3989,9 +4388,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
3989 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 4388 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3990 r = vcpu_enter_guest(vcpu); 4389 r = vcpu_enter_guest(vcpu);
3991 else { 4390 else {
3992 up_read(&vcpu->kvm->slots_lock); 4391 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3993 kvm_vcpu_block(vcpu); 4392 kvm_vcpu_block(vcpu);
3994 down_read(&vcpu->kvm->slots_lock); 4393 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3995 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4394 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3996 { 4395 {
3997 switch(vcpu->arch.mp_state) { 4396 switch(vcpu->arch.mp_state) {
@@ -4026,13 +4425,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4026 ++vcpu->stat.signal_exits; 4425 ++vcpu->stat.signal_exits;
4027 } 4426 }
4028 if (need_resched()) { 4427 if (need_resched()) {
4029 up_read(&vcpu->kvm->slots_lock); 4428 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4030 kvm_resched(vcpu); 4429 kvm_resched(vcpu);
4031 down_read(&vcpu->kvm->slots_lock); 4430 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4032 } 4431 }
4033 } 4432 }
4034 4433
4035 up_read(&vcpu->kvm->slots_lock); 4434 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4036 post_kvm_run_save(vcpu); 4435 post_kvm_run_save(vcpu);
4037 4436
4038 vapic_exit(vcpu); 4437 vapic_exit(vcpu);
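Across __vcpu_run() and vcpu_enter_guest() the old down_read/up_read on kvm->slots_lock becomes an SRCU read-side critical section: the cookie from srcu_read_lock() is kept in vcpu->srcu_idx and dropped around anything that may sleep (guest entry, kvm_vcpu_block, kvm_resched), then re-taken. A minimal sketch of that idiom, assuming the fields used above (the function name is illustrative):

    /* Memslot readers run under SRCU; blocking work happens outside it. */
    static void run_step(struct kvm_vcpu *vcpu, struct kvm *kvm)
    {
            vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

            /* ... code that dereferences kvm->memslots ... */

            srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
            kvm_vcpu_block(vcpu);               /* may sleep: not under SRCU */
            vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

            /* ... more memslot work ... */

            srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
    }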
@@ -4062,7 +4461,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4062 kvm_set_cr8(vcpu, kvm_run->cr8); 4461 kvm_set_cr8(vcpu, kvm_run->cr8);
4063 4462
4064 if (vcpu->arch.pio.cur_count) { 4463 if (vcpu->arch.pio.cur_count) {
4464 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4065 r = complete_pio(vcpu); 4465 r = complete_pio(vcpu);
4466 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4066 if (r) 4467 if (r)
4067 goto out; 4468 goto out;
4068 } 4469 }
@@ -4071,10 +4472,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4071 vcpu->mmio_read_completed = 1; 4472 vcpu->mmio_read_completed = 1;
4072 vcpu->mmio_needed = 0; 4473 vcpu->mmio_needed = 0;
4073 4474
4074 down_read(&vcpu->kvm->slots_lock); 4475 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4075 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4476 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
4076 EMULTYPE_NO_DECODE); 4477 EMULTYPE_NO_DECODE);
4077 up_read(&vcpu->kvm->slots_lock); 4478 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4078 if (r == EMULATE_DO_MMIO) { 4479 if (r == EMULATE_DO_MMIO) {
4079 /* 4480 /*
4080 * Read-modify-write. Back to userspace. 4481 * Read-modify-write. Back to userspace.
@@ -4201,13 +4602,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4201 sregs->gdt.limit = dt.limit; 4602 sregs->gdt.limit = dt.limit;
4202 sregs->gdt.base = dt.base; 4603 sregs->gdt.base = dt.base;
4203 4604
4204 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4605 sregs->cr0 = kvm_read_cr0(vcpu);
4205 sregs->cr0 = vcpu->arch.cr0;
4206 sregs->cr2 = vcpu->arch.cr2; 4606 sregs->cr2 = vcpu->arch.cr2;
4207 sregs->cr3 = vcpu->arch.cr3; 4607 sregs->cr3 = vcpu->arch.cr3;
4208 sregs->cr4 = vcpu->arch.cr4; 4608 sregs->cr4 = kvm_read_cr4(vcpu);
4209 sregs->cr8 = kvm_get_cr8(vcpu); 4609 sregs->cr8 = kvm_get_cr8(vcpu);
4210 sregs->efer = vcpu->arch.shadow_efer; 4610 sregs->efer = vcpu->arch.efer;
4211 sregs->apic_base = kvm_get_apic_base(vcpu); 4611 sregs->apic_base = kvm_get_apic_base(vcpu);
4212 4612
4213 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4613 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4295,14 +4695,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4295{ 4695{
4296 struct descriptor_table dtable; 4696 struct descriptor_table dtable;
4297 u16 index = selector >> 3; 4697 u16 index = selector >> 3;
4698 int ret;
4699 u32 err;
4700 gva_t addr;
4298 4701
4299 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4702 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4300 4703
4301 if (dtable.limit < index * 8 + 7) { 4704 if (dtable.limit < index * 8 + 7) {
4302 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4705 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4303 return 1; 4706 return X86EMUL_PROPAGATE_FAULT;
4304 } 4707 }
4305 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4708 addr = dtable.base + index * 8;
4709 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4710 vcpu, &err);
4711 if (ret == X86EMUL_PROPAGATE_FAULT)
4712 kvm_inject_page_fault(vcpu, addr, err);
4713
4714 return ret;
4306} 4715}
4307 4716
4308/* allowed just for 8 bytes segments */ 4717/* allowed just for 8 bytes segments */
@@ -4316,15 +4725,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4316 4725
4317 if (dtable.limit < index * 8 + 7) 4726 if (dtable.limit < index * 8 + 7)
4318 return 1; 4727 return 1;
4319 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4728 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4729}
4730
4731static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4732 struct desc_struct *seg_desc)
4733{
4734 u32 base_addr = get_desc_base(seg_desc);
4735
4736 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4320} 4737}
4321 4738
4322static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4739static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4323 struct desc_struct *seg_desc) 4740 struct desc_struct *seg_desc)
4324{ 4741{
4325 u32 base_addr = get_desc_base(seg_desc); 4742 u32 base_addr = get_desc_base(seg_desc);
4326 4743
4327 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4744 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4328} 4745}
4329 4746
4330static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4747static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4335,18 +4752,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4335 return kvm_seg.selector; 4752 return kvm_seg.selector;
4336} 4753}
4337 4754
4338static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4339 u16 selector,
4340 struct kvm_segment *kvm_seg)
4341{
4342 struct desc_struct seg_desc;
4343
4344 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4345 return 1;
4346 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4347 return 0;
4348}
4349
4350static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4755static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4351{ 4756{
4352 struct kvm_segment segvar = { 4757 struct kvm_segment segvar = {
@@ -4364,7 +4769,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
4364 .unusable = 0, 4769 .unusable = 0,
4365 }; 4770 };
4366 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4771 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4367 return 0; 4772 return X86EMUL_CONTINUE;
4368} 4773}
4369 4774
4370static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4775static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
@@ -4374,24 +4779,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4374 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); 4779 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4375} 4780}
4376 4781
4377int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4782int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4378 int type_bits, int seg)
4379{ 4783{
4380 struct kvm_segment kvm_seg; 4784 struct kvm_segment kvm_seg;
4785 struct desc_struct seg_desc;
4786 u8 dpl, rpl, cpl;
4787 unsigned err_vec = GP_VECTOR;
4788 u32 err_code = 0;
4789 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4790 int ret;
4381 4791
4382 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4792 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
4383 return kvm_load_realmode_segment(vcpu, selector, seg); 4793 return kvm_load_realmode_segment(vcpu, selector, seg);
4384 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4385 return 1;
4386 kvm_seg.type |= type_bits;
4387 4794
4388 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4795 /* NULL selector is not valid for TR, CS and SS */
4389 seg != VCPU_SREG_LDTR) 4796 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
4390 if (!kvm_seg.s) 4797 && null_selector)
4391 kvm_seg.unusable = 1; 4798 goto exception;
4799
4800 /* TR should be in GDT only */
4801 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
4802 goto exception;
4803
4804 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4805 if (ret)
4806 return ret;
4807
4808 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4809
4810 if (null_selector) { /* for NULL selector skip all following checks */
4811 kvm_seg.unusable = 1;
4812 goto load;
4813 }
4814
4815 err_code = selector & 0xfffc;
4816 err_vec = GP_VECTOR;
4817
4818 /* can't load a system descriptor into a segment selector */
4819 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4820 goto exception;
4392 4821
4822 if (!kvm_seg.present) {
4823 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4824 goto exception;
4825 }
4826
4827 rpl = selector & 3;
4828 dpl = kvm_seg.dpl;
4829 cpl = kvm_x86_ops->get_cpl(vcpu);
4830
4831 switch (seg) {
4832 case VCPU_SREG_SS:
4833 /*
4834 * segment is not a writable data segment, or the segment
4835 * selector's RPL != CPL, or DPL != CPL
4836 */
4837 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4838 goto exception;
4839 break;
4840 case VCPU_SREG_CS:
4841 if (!(kvm_seg.type & 8))
4842 goto exception;
4843
4844 if (kvm_seg.type & 4) {
4845 /* conforming */
4846 if (dpl > cpl)
4847 goto exception;
4848 } else {
4849 /* nonconforming */
4850 if (rpl > cpl || dpl != cpl)
4851 goto exception;
4852 }
4853 /* CS(RPL) <- CPL */
4854 selector = (selector & 0xfffc) | cpl;
4855 break;
4856 case VCPU_SREG_TR:
4857 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4858 goto exception;
4859 break;
4860 case VCPU_SREG_LDTR:
4861 if (kvm_seg.s || kvm_seg.type != 2)
4862 goto exception;
4863 break;
4864 default: /* DS, ES, FS, or GS */
4865 /*
4866 * segment is not a data or readable code segment or
4867 * ((segment is a data or nonconforming code segment)
4868 * and (both RPL and CPL > DPL))
4869 */
4870 if ((kvm_seg.type & 0xa) == 0x8 ||
4871 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4872 goto exception;
4873 break;
4874 }
4875
4876 if (!kvm_seg.unusable && kvm_seg.s) {
4877 /* mark segment as accessed */
4878 kvm_seg.type |= 1;
4879 seg_desc.type |= 1;
4880 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4881 }
4882load:
4393 kvm_set_segment(vcpu, &kvm_seg, seg); 4883 kvm_set_segment(vcpu, &kvm_seg, seg);
4394 return 0; 4884 return X86EMUL_CONTINUE;
4885exception:
4886 kvm_queue_exception_e(vcpu, err_vec, err_code);
4887 return X86EMUL_PROPAGATE_FAULT;
4395} 4888}
4396 4889
4397static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4890static void save_state_to_tss32(struct kvm_vcpu *vcpu,
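The rewritten kvm_load_segment_descriptor() above drops the old type_bits parameter and performs the protected-mode checks itself: null selectors are rejected for CS, SS and TR, TR must come from the GDT, and each register class gets its own DPL/RPL/CPL rule. The two bitmask-heavy cases, spelled out (sketch only, reusing the kvm_segment fields seen above; helper names are illustrative):

    /* SS: writable data segment, and RPL == DPL == CPL. */
    static bool ss_load_ok(struct kvm_segment *s, u8 rpl, u8 cpl)
    {
            return (s->type & 0xa) == 0x2 && rpl == cpl && s->dpl == cpl;
    }

    /* DS/ES/FS/GS: data or readable code segment; for data and
     * non-conforming code, RPL and CPL may not both exceed DPL. */
    static bool data_load_ok(struct kvm_segment *s, u8 rpl, u8 cpl)
    {
            if ((s->type & 0xa) == 0x8)             /* execute-only code */
                    return false;
            if ((s->type & 0xc) != 0xc &&           /* not conforming code */
                rpl > s->dpl && cpl > s->dpl)
                    return false;
            return true;
    }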
@@ -4417,6 +4910,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4417 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4910 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4418} 4911}
4419 4912
4913static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4914{
4915 struct kvm_segment kvm_seg;
4916 kvm_get_segment(vcpu, &kvm_seg, seg);
4917 kvm_seg.selector = sel;
4918 kvm_set_segment(vcpu, &kvm_seg, seg);
4919}
4920
4420static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4921static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4421 struct tss_segment_32 *tss) 4922 struct tss_segment_32 *tss)
4422{ 4923{
@@ -4434,25 +4935,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4434 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4935 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4435 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4936 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4436 4937
4437 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4938 /*
4939 * SDM says that segment selectors are loaded before segment
4940 * descriptors
4941 */
4942 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4943 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4944 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4945 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
4946 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
4947 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
4948 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
4949
4950 /*
4951 * Now load the segment descriptors. If a fault happens at this stage,
4952 * it is handled in the context of the new task
4953 */
4954 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
4438 return 1; 4955 return 1;
4439 4956
4440 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4957 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4441 return 1; 4958 return 1;
4442 4959
4443 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4960 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4444 return 1; 4961 return 1;
4445 4962
4446 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4963 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4447 return 1; 4964 return 1;
4448 4965
4449 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4966 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4450 return 1; 4967 return 1;
4451 4968
4452 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4969 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
4453 return 1; 4970 return 1;
4454 4971
4455 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4972 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
4456 return 1; 4973 return 1;
4457 return 0; 4974 return 0;
4458} 4975}
@@ -4492,19 +5009,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4492 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 5009 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4493 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 5010 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4494 5011
4495 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 5012 /*
5013 * SDM says that segment selectors are loaded before segment
5014 * descriptors
5015 */
5016 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5017 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5018 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5019 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5020 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5021
5022 /*
5023 * Now load the segment descriptors. If a fault happens at this stage,
5024 * it is handled in the context of the new task
5025 */
5026 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
4496 return 1; 5027 return 1;
4497 5028
4498 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 5029 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4499 return 1; 5030 return 1;
4500 5031
4501 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 5032 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4502 return 1; 5033 return 1;
4503 5034
4504 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 5035 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4505 return 1; 5036 return 1;
4506 5037
4507 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 5038 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4508 return 1; 5039 return 1;
4509 return 0; 5040 return 0;
4510} 5041}
@@ -4526,7 +5057,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4526 sizeof tss_segment_16)) 5057 sizeof tss_segment_16))
4527 goto out; 5058 goto out;
4528 5059
4529 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5060 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4530 &tss_segment_16, sizeof tss_segment_16)) 5061 &tss_segment_16, sizeof tss_segment_16))
4531 goto out; 5062 goto out;
4532 5063
@@ -4534,7 +5065,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4534 tss_segment_16.prev_task_link = old_tss_sel; 5065 tss_segment_16.prev_task_link = old_tss_sel;
4535 5066
4536 if (kvm_write_guest(vcpu->kvm, 5067 if (kvm_write_guest(vcpu->kvm,
4537 get_tss_base_addr(vcpu, nseg_desc), 5068 get_tss_base_addr_write(vcpu, nseg_desc),
4538 &tss_segment_16.prev_task_link, 5069 &tss_segment_16.prev_task_link,
4539 sizeof tss_segment_16.prev_task_link)) 5070 sizeof tss_segment_16.prev_task_link))
4540 goto out; 5071 goto out;
@@ -4565,7 +5096,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4565 sizeof tss_segment_32)) 5096 sizeof tss_segment_32))
4566 goto out; 5097 goto out;
4567 5098
4568 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5099 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4569 &tss_segment_32, sizeof tss_segment_32)) 5100 &tss_segment_32, sizeof tss_segment_32))
4570 goto out; 5101 goto out;
4571 5102
@@ -4573,7 +5104,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4573 tss_segment_32.prev_task_link = old_tss_sel; 5104 tss_segment_32.prev_task_link = old_tss_sel;
4574 5105
4575 if (kvm_write_guest(vcpu->kvm, 5106 if (kvm_write_guest(vcpu->kvm,
4576 get_tss_base_addr(vcpu, nseg_desc), 5107 get_tss_base_addr_write(vcpu, nseg_desc),
4577 &tss_segment_32.prev_task_link, 5108 &tss_segment_32.prev_task_link,
4578 sizeof tss_segment_32.prev_task_link)) 5109 sizeof tss_segment_32.prev_task_link))
4579 goto out; 5110 goto out;
@@ -4595,8 +5126,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4595 int ret = 0; 5126 int ret = 0;
4596 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 5127 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4597 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 5128 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5129 u32 desc_limit;
4598 5130
4599 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 5131 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
4600 5132
4601 /* FIXME: Handle errors. Failure to read either TSS or their 5133 /* FIXME: Handle errors. Failure to read either TSS or their
4602 * descriptors should generate a pagefault. 5134 * descriptors should generate a pagefault.
@@ -4617,7 +5149,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4617 } 5149 }
4618 } 5150 }
4619 5151
4620 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { 5152 desc_limit = get_desc_limit(&nseg_desc);
5153 if (!nseg_desc.p ||
5154 ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
5155 desc_limit < 0x2b)) {
4621 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 5156 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4622 return 1; 5157 return 1;
4623 } 5158 }
@@ -4655,7 +5190,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4655 &nseg_desc); 5190 &nseg_desc);
4656 } 5191 }
4657 5192
4658 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 5193 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
4659 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 5194 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4660 tr_seg.type = 11; 5195 tr_seg.type = 11;
4661 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 5196 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4686,17 +5221,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4686 5221
4687 kvm_set_cr8(vcpu, sregs->cr8); 5222 kvm_set_cr8(vcpu, sregs->cr8);
4688 5223
4689 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 5224 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
4690 kvm_x86_ops->set_efer(vcpu, sregs->efer); 5225 kvm_x86_ops->set_efer(vcpu, sregs->efer);
4691 kvm_set_apic_base(vcpu, sregs->apic_base); 5226 kvm_set_apic_base(vcpu, sregs->apic_base);
4692 5227
4693 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 5228 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
4694
4695 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4696 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 5229 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4697 vcpu->arch.cr0 = sregs->cr0; 5230 vcpu->arch.cr0 = sregs->cr0;
4698 5231
4699 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 5232 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
4700 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5233 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4701 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5234 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4702 load_pdptrs(vcpu, vcpu->arch.cr3); 5235 load_pdptrs(vcpu, vcpu->arch.cr3);
@@ -4731,7 +5264,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4731 /* Older userspace won't unhalt the vcpu on reset. */ 5264 /* Older userspace won't unhalt the vcpu on reset. */
4732 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 5265 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4733 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 5266 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4734 !(vcpu->arch.cr0 & X86_CR0_PE)) 5267 !is_protmode(vcpu))
4735 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5268 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4736 5269
4737 vcpu_put(vcpu); 5270 vcpu_put(vcpu);
@@ -4829,11 +5362,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4829{ 5362{
4830 unsigned long vaddr = tr->linear_address; 5363 unsigned long vaddr = tr->linear_address;
4831 gpa_t gpa; 5364 gpa_t gpa;
5365 int idx;
4832 5366
4833 vcpu_load(vcpu); 5367 vcpu_load(vcpu);
4834 down_read(&vcpu->kvm->slots_lock); 5368 idx = srcu_read_lock(&vcpu->kvm->srcu);
4835 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 5369 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
4836 up_read(&vcpu->kvm->slots_lock); 5370 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4837 tr->physical_address = gpa; 5371 tr->physical_address = gpa;
4838 tr->valid = gpa != UNMAPPED_GVA; 5372 tr->valid = gpa != UNMAPPED_GVA;
4839 tr->writeable = 1; 5373 tr->writeable = 1;
@@ -4914,14 +5448,14 @@ EXPORT_SYMBOL_GPL(fx_init);
4914 5448
4915void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5449void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4916{ 5450{
4917 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 5451 if (vcpu->guest_fpu_loaded)
4918 return; 5452 return;
4919 5453
4920 vcpu->guest_fpu_loaded = 1; 5454 vcpu->guest_fpu_loaded = 1;
4921 kvm_fx_save(&vcpu->arch.host_fx_image); 5455 kvm_fx_save(&vcpu->arch.host_fx_image);
4922 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5456 kvm_fx_restore(&vcpu->arch.guest_fx_image);
5457 trace_kvm_fpu(1);
4923} 5458}
4924EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4925 5459
4926void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5460void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4927{ 5461{
@@ -4932,8 +5466,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4932 kvm_fx_save(&vcpu->arch.guest_fx_image); 5466 kvm_fx_save(&vcpu->arch.guest_fx_image);
4933 kvm_fx_restore(&vcpu->arch.host_fx_image); 5467 kvm_fx_restore(&vcpu->arch.host_fx_image);
4934 ++vcpu->stat.fpu_reload; 5468 ++vcpu->stat.fpu_reload;
5469 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
5470 trace_kvm_fpu(0);
4935} 5471}
4936EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4937 5472
4938void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5473void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4939{ 5474{
@@ -5068,12 +5603,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5068 GFP_KERNEL); 5603 GFP_KERNEL);
5069 if (!vcpu->arch.mce_banks) { 5604 if (!vcpu->arch.mce_banks) {
5070 r = -ENOMEM; 5605 r = -ENOMEM;
5071 goto fail_mmu_destroy; 5606 goto fail_free_lapic;
5072 } 5607 }
5073 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5608 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
5074 5609
5075 return 0; 5610 return 0;
5076 5611fail_free_lapic:
5612 kvm_free_lapic(vcpu);
5077fail_mmu_destroy: 5613fail_mmu_destroy:
5078 kvm_mmu_destroy(vcpu); 5614 kvm_mmu_destroy(vcpu);
5079fail_free_pio_data: 5615fail_free_pio_data:
@@ -5084,10 +5620,13 @@ fail:
5084 5620
5085void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5621void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5086{ 5622{
5623 int idx;
5624
5625 kfree(vcpu->arch.mce_banks);
5087 kvm_free_lapic(vcpu); 5626 kvm_free_lapic(vcpu);
5088 down_read(&vcpu->kvm->slots_lock); 5627 idx = srcu_read_lock(&vcpu->kvm->srcu);
5089 kvm_mmu_destroy(vcpu); 5628 kvm_mmu_destroy(vcpu);
5090 up_read(&vcpu->kvm->slots_lock); 5629 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5091 free_page((unsigned long)vcpu->arch.pio_data); 5630 free_page((unsigned long)vcpu->arch.pio_data);
5092} 5631}
5093 5632
@@ -5098,6 +5637,12 @@ struct kvm *kvm_arch_create_vm(void)
5098 if (!kvm) 5637 if (!kvm)
5099 return ERR_PTR(-ENOMEM); 5638 return ERR_PTR(-ENOMEM);
5100 5639
5640 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5641 if (!kvm->arch.aliases) {
5642 kfree(kvm);
5643 return ERR_PTR(-ENOMEM);
5644 }
5645
5101 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5646 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5102 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5647 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5103 5648
@@ -5154,16 +5699,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5154 put_page(kvm->arch.apic_access_page); 5699 put_page(kvm->arch.apic_access_page);
5155 if (kvm->arch.ept_identity_pagetable) 5700 if (kvm->arch.ept_identity_pagetable)
5156 put_page(kvm->arch.ept_identity_pagetable); 5701 put_page(kvm->arch.ept_identity_pagetable);
5702 cleanup_srcu_struct(&kvm->srcu);
5703 kfree(kvm->arch.aliases);
5157 kfree(kvm); 5704 kfree(kvm);
5158} 5705}
5159 5706
5160int kvm_arch_set_memory_region(struct kvm *kvm, 5707int kvm_arch_prepare_memory_region(struct kvm *kvm,
5161 struct kvm_userspace_memory_region *mem, 5708 struct kvm_memory_slot *memslot,
5162 struct kvm_memory_slot old, 5709 struct kvm_memory_slot old,
5710 struct kvm_userspace_memory_region *mem,
5163 int user_alloc) 5711 int user_alloc)
5164{ 5712{
5165 int npages = mem->memory_size >> PAGE_SHIFT; 5713 int npages = memslot->npages;
5166 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
5167 5714
5168 /* To keep backward compatibility with older userspace, 5715
5169 * x86 needs to handle the !user_alloc case. 5716
@@ -5183,26 +5730,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
5183 if (IS_ERR((void *)userspace_addr)) 5730 if (IS_ERR((void *)userspace_addr))
5184 return PTR_ERR((void *)userspace_addr); 5731 return PTR_ERR((void *)userspace_addr);
5185 5732
5186 /* set userspace_addr atomically for kvm_hva_to_rmapp */
5187 spin_lock(&kvm->mmu_lock);
5188 memslot->userspace_addr = userspace_addr; 5733 memslot->userspace_addr = userspace_addr;
5189 spin_unlock(&kvm->mmu_lock);
5190 } else {
5191 if (!old.user_alloc && old.rmap) {
5192 int ret;
5193
5194 down_write(&current->mm->mmap_sem);
5195 ret = do_munmap(current->mm, old.userspace_addr,
5196 old.npages * PAGE_SIZE);
5197 up_write(&current->mm->mmap_sem);
5198 if (ret < 0)
5199 printk(KERN_WARNING
5200 "kvm_vm_ioctl_set_memory_region: "
5201 "failed to munmap memory\n");
5202 }
5203 } 5734 }
5204 } 5735 }
5205 5736
5737
5738 return 0;
5739}
5740
5741void kvm_arch_commit_memory_region(struct kvm *kvm,
5742 struct kvm_userspace_memory_region *mem,
5743 struct kvm_memory_slot old,
5744 int user_alloc)
5745{
5746
5747 int npages = mem->memory_size >> PAGE_SHIFT;
5748
5749 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
5750 int ret;
5751
5752 down_write(&current->mm->mmap_sem);
5753 ret = do_munmap(current->mm, old.userspace_addr,
5754 old.npages * PAGE_SIZE);
5755 up_write(&current->mm->mmap_sem);
5756 if (ret < 0)
5757 printk(KERN_WARNING
5758 "kvm_vm_ioctl_set_memory_region: "
5759 "failed to munmap memory\n");
5760 }
5761
5206 spin_lock(&kvm->mmu_lock); 5762 spin_lock(&kvm->mmu_lock);
5207 if (!kvm->arch.n_requested_mmu_pages) { 5763 if (!kvm->arch.n_requested_mmu_pages) {
5208 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5764 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -5211,8 +5767,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
5211 5767
5212 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5768 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
5213 spin_unlock(&kvm->mmu_lock); 5769 spin_unlock(&kvm->mmu_lock);
5214
5215 return 0;
5216} 5770}
5217 5771
5218void kvm_arch_flush_shadow(struct kvm *kvm) 5772void kvm_arch_flush_shadow(struct kvm *kvm)
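The final x86.c hunks split the old kvm_arch_set_memory_region() into a two-phase interface: kvm_arch_prepare_memory_region() does the failable work (mapping the userspace alias for non-user-allocated slots) before generic code publishes the new slot array, and kvm_arch_commit_memory_region() does the irreversible cleanup and MMU page accounting afterwards. The call order from the generic side is roughly the following (sketch; variable names are illustrative and error handling is trimmed):

    r = kvm_arch_prepare_memory_region(kvm, &new_slot, old_slot, mem, user_alloc);
    if (r)
            return r;                  /* nothing has been published yet */

    /* generic code swaps in the new memslot array here (RCU publish) */

    kvm_arch_commit_memory_region(kvm, mem, old_slot, user_alloc);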
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2a..2d101639bd8d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
2#define ARCH_X86_KVM_X86_H 2#define ARCH_X86_KVM_X86_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 7static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{ 8{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
35struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 36struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
36 u32 function, u32 index); 37 u32 function, u32 index);
37 38
39static inline bool is_protmode(struct kvm_vcpu *vcpu)
40{
41 return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
42}
43
44static inline int is_long_mode(struct kvm_vcpu *vcpu)
45{
46#ifdef CONFIG_X86_64
47 return vcpu->arch.efer & EFER_LMA;
48#else
49 return 0;
50#endif
51}
52
53static inline int is_pae(struct kvm_vcpu *vcpu)
54{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
56}
57
58static inline int is_pse(struct kvm_vcpu *vcpu)
59{
60 return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
61}
62
63static inline int is_paging(struct kvm_vcpu *vcpu)
64{
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66}
67
38#endif 68#endif
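The helpers added to x86.h above replace open-coded control-register tests throughout the x86.c diff: checks such as vcpu->arch.cr0 & X86_CR0_PE become is_protmode(vcpu), and raw reads of arch.cr0/arch.cr4 go through kvm_read_cr0()/kvm_read_cr4() so that bits the hardware may own are decached first. A before/after example taken from the segment-loading path above:

    /* Before: open-coded test that bypasses the register cache. */
    if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
            return kvm_load_realmode_segment(vcpu, selector, seg);

    /* After: cached read via the kvm_cache_regs.h helpers. */
    if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
            return kvm_load_realmode_segment(vcpu, selector, seg);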
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7e59dc1d3fc2..2bdf628066bd 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -115,7 +115,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
115 local_irq_save(flags); 115 local_irq_save(flags);
116 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
117 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
118 kvm_hypercall4(call, arg1, arg2, arg3, arg4); 118 hcall(call, arg1, arg2, arg3, arg4);
119 } else { 119 } else {
120 lguest_data.hcalls[next_call].arg0 = call; 120 lguest_data.hcalls[next_call].arg0 = call;
121 lguest_data.hcalls[next_call].arg1 = arg1; 121 lguest_data.hcalls[next_call].arg1 = arg1;
@@ -145,46 +145,45 @@ static void async_hcall(unsigned long call, unsigned long arg1,
145 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
146 * future processing: 146 * future processing:
147 */ 147 */
148static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call, unsigned long arg1)
149 unsigned long arg1)
150{ 149{
151 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 150 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
152 kvm_hypercall1(call, arg1); 151 hcall(call, arg1, 0, 0, 0);
153 else 152 else
154 async_hcall(call, arg1, 0, 0, 0); 153 async_hcall(call, arg1, 0, 0, 0);
155} 154}
156 155
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ 156/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
158static void lazy_hcall2(unsigned long call, 157static void lazy_hcall2(unsigned long call,
159 unsigned long arg1, 158 unsigned long arg1,
160 unsigned long arg2) 159 unsigned long arg2)
161{ 160{
162 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 161 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
163 kvm_hypercall2(call, arg1, arg2); 162 hcall(call, arg1, arg2, 0, 0);
164 else 163 else
165 async_hcall(call, arg1, arg2, 0, 0); 164 async_hcall(call, arg1, arg2, 0, 0);
166} 165}
167 166
168static void lazy_hcall3(unsigned long call, 167static void lazy_hcall3(unsigned long call,
169 unsigned long arg1, 168 unsigned long arg1,
170 unsigned long arg2, 169 unsigned long arg2,
171 unsigned long arg3) 170 unsigned long arg3)
172{ 171{
173 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 172 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
174 kvm_hypercall3(call, arg1, arg2, arg3); 173 hcall(call, arg1, arg2, arg3, 0);
175 else 174 else
176 async_hcall(call, arg1, arg2, arg3, 0); 175 async_hcall(call, arg1, arg2, arg3, 0);
177} 176}
178 177
179#ifdef CONFIG_X86_PAE 178#ifdef CONFIG_X86_PAE
180static void lazy_hcall4(unsigned long call, 179static void lazy_hcall4(unsigned long call,
181 unsigned long arg1, 180 unsigned long arg1,
182 unsigned long arg2, 181 unsigned long arg2,
183 unsigned long arg3, 182 unsigned long arg3,
184 unsigned long arg4) 183 unsigned long arg4)
185{ 184{
186 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 185 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
187 kvm_hypercall4(call, arg1, arg2, arg3, arg4); 186 hcall(call, arg1, arg2, arg3, arg4);
188 else 187 else
189 async_hcall(call, arg1, arg2, arg3, arg4); 188 async_hcall(call, arg1, arg2, arg3, arg4);
190} 189}
@@ -196,13 +195,13 @@ static void lazy_hcall4(unsigned long call,
196:*/ 195:*/
197static void lguest_leave_lazy_mmu_mode(void) 196static void lguest_leave_lazy_mmu_mode(void)
198{ 197{
199 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 198 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
200 paravirt_leave_lazy_mmu(); 199 paravirt_leave_lazy_mmu();
201} 200}
202 201
203static void lguest_end_context_switch(struct task_struct *next) 202static void lguest_end_context_switch(struct task_struct *next)
204{ 203{
205 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 204 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
206 paravirt_end_context_switch(next); 205 paravirt_end_context_switch(next);
207} 206}
208 207
@@ -286,7 +285,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
286 /* Keep the local copy up to date. */ 285 /* Keep the local copy up to date. */
287 native_write_idt_entry(dt, entrynum, g); 286 native_write_idt_entry(dt, entrynum, g);
288 /* Tell Host about this new entry. */ 287 /* Tell Host about this new entry. */
289 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 288 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
290} 289}
291 290
292/* 291/*
@@ -300,7 +299,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
300 struct desc_struct *idt = (void *)desc->address; 299 struct desc_struct *idt = (void *)desc->address;
301 300
302 for (i = 0; i < (desc->size+1)/8; i++) 301 for (i = 0; i < (desc->size+1)/8; i++)
303 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); 302 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
304} 303}
305 304
306/* 305/*
@@ -321,7 +320,7 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
321 struct desc_struct *gdt = (void *)desc->address; 320 struct desc_struct *gdt = (void *)desc->address;
322 321
323 for (i = 0; i < (desc->size+1)/8; i++) 322 for (i = 0; i < (desc->size+1)/8; i++)
324 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 323 hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
325} 324}
326 325
327/* 326/*
@@ -334,8 +333,8 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
334{ 333{
335 native_write_gdt_entry(dt, entrynum, desc, type); 334 native_write_gdt_entry(dt, entrynum, desc, type);
336 /* Tell Host about this new entry. */ 335 /* Tell Host about this new entry. */
337 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum, 336 hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
338 dt[entrynum].a, dt[entrynum].b); 337 dt[entrynum].a, dt[entrynum].b, 0);
339} 338}
340 339
341/* 340/*
@@ -931,7 +930,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
931 } 930 }
932 931
933 /* Please wake us this far in the future. */ 932 /* Please wake us this far in the future. */
934 kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta); 933 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
935 return 0; 934 return 0;
936} 935}
937 936
@@ -942,7 +941,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
942 case CLOCK_EVT_MODE_UNUSED: 941 case CLOCK_EVT_MODE_UNUSED:
943 case CLOCK_EVT_MODE_SHUTDOWN: 942 case CLOCK_EVT_MODE_SHUTDOWN:
944 /* A 0 argument shuts the clock down. */ 943 /* A 0 argument shuts the clock down. */
945 kvm_hypercall0(LHCALL_SET_CLOCKEVENT); 944 hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
946 break; 945 break;
947 case CLOCK_EVT_MODE_ONESHOT: 946 case CLOCK_EVT_MODE_ONESHOT:
948 /* This is what we expect. */ 947 /* This is what we expect. */
@@ -1100,7 +1099,7 @@ static void set_lguest_basic_apic_ops(void)
1100/* STOP! Until an interrupt comes in. */ 1099/* STOP! Until an interrupt comes in. */
1101static void lguest_safe_halt(void) 1100static void lguest_safe_halt(void)
1102{ 1101{
1103 kvm_hypercall0(LHCALL_HALT); 1102 hcall(LHCALL_HALT, 0, 0, 0, 0);
1104} 1103}
1105 1104
1106/* 1105/*
@@ -1112,8 +1111,8 @@ static void lguest_safe_halt(void)
1112 */ 1111 */
1113static void lguest_power_off(void) 1112static void lguest_power_off(void)
1114{ 1113{
1115 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1114 hcall(LHCALL_SHUTDOWN, __pa("Power down"),
1116 LGUEST_SHUTDOWN_POWEROFF); 1115 LGUEST_SHUTDOWN_POWEROFF, 0, 0);
1117} 1116}
1118 1117
1119/* 1118/*
@@ -1123,7 +1122,7 @@ static void lguest_power_off(void)
1123 */ 1122 */
1124static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 1123static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
1125{ 1124{
1126 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF); 1125 hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
1127 /* The hcall won't return, but to keep gcc happy, we're "done". */ 1126 /* The hcall won't return, but to keep gcc happy, we're "done". */
1128 return NOTIFY_DONE; 1127 return NOTIFY_DONE;
1129} 1128}
@@ -1162,7 +1161,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1162 len = sizeof(scratch) - 1; 1161 len = sizeof(scratch) - 1;
1163 scratch[len] = '\0'; 1162 scratch[len] = '\0';
1164 memcpy(scratch, buf, len); 1163 memcpy(scratch, buf, len);
1165 kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch)); 1164 hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0);
1166 1165
1167 /* This routine returns the number of bytes actually written. */ 1166 /* This routine returns the number of bytes actually written. */
1168 return len; 1167 return len;
@@ -1174,7 +1173,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1174 */ 1173 */
1175static void lguest_restart(char *reason) 1174static void lguest_restart(char *reason)
1176{ 1175{
1177 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1176 hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
1178} 1177}
1179 1178
1180/*G:050 1179/*G:050
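
The conversion above replaces the per-arity kvm_hypercall0()..kvm_hypercall4() wrappers with a single five-argument hcall() primitive; unused arguments are simply passed as zero. A minimal userspace sketch of that pattern (hcall() here is a hypothetical stub that only logs its arguments, and the call numbers are made up -- it is not the real lguest trap):

    #include <stdio.h>

    /* Hypothetical stand-in for the single 5-argument hypercall primitive. */
    static unsigned long hcall(unsigned long call, unsigned long a1,
                               unsigned long a2, unsigned long a3,
                               unsigned long a4)
    {
        printf("hcall(%lu, %lu, %lu, %lu, %lu)\n", call, a1, a2, a3, a4);
        return 0;
    }

    #define DEMO_CALL_NOARG  1   /* made-up call numbers for the demo */
    #define DEMO_CALL_ONEARG 2

    int main(void)
    {
        /* What used to be kvm_hypercall0()/kvm_hypercall1(): the same single
         * primitive, with the unused trailing arguments padded with zeroes. */
        hcall(DEMO_CALL_NOARG, 0, 0, 0, 0);
        hcall(DEMO_CALL_ONEARG, 12345, 0, 0, 0);
        return 0;
    }
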
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 27eac0faee48..4f420c2f2d55 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -32,7 +32,7 @@ ENTRY(lguest_entry)
32 */ 32 */
33 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
34 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 int $LGUEST_TRAP_ENTRY
36 36
37 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
38 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 05d686bbbe9f..3ac4a8ade627 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,7 +14,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c
14 14
15clean-files := inat-tables.c 15clean-files := inat-tables.c
16 16
17obj-$(CONFIG_SMP) += msr-smp.o 17obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
18 18
19lib-y := delay.o 19lib-y := delay.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
@@ -35,9 +35,10 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
35endif 35endif
36 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 36 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
37else 37else
38 obj-y += io_64.o iomap_copy_64.o 38 obj-y += iomap_copy_64.o
39 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 39 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
40 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 40 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
41 lib-y += memmove_64.o memset_64.o 41 lib-y += memmove_64.o memset_64.o
42 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 42 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
43 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
43endif 44endif
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 000000000000..a3c668875038
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
1#include <linux/smp.h>
2#include <linux/module.h>
3
4static void __wbinvd(void *dummy)
5{
6 wbinvd();
7}
8
9void wbinvd_on_cpu(int cpu)
10{
11 smp_call_function_single(cpu, __wbinvd, NULL, 1);
12}
13EXPORT_SYMBOL(wbinvd_on_cpu);
14
15int wbinvd_on_all_cpus(void)
16{
17 return on_each_cpu(__wbinvd, NULL, 1);
18}
19EXPORT_SYMBOL(wbinvd_on_all_cpus);
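
The new cache-smp.c exports wbinvd_on_cpu()/wbinvd_on_all_cpus(), which just run the wbinvd instruction on the target CPUs via the generic smp_call_function_single()/on_each_cpu() helpers. A rough userspace model of the "run a callback once per CPU and wait" idea, using POSIX threads in place of the kernel's cross-CPU calls (NCPUS and flush_one() are made up for illustration; real threads are not pinned to CPUs):

    #include <pthread.h>
    #include <stdio.h>

    #define NCPUS 4   /* stand-in for the number of online CPUs */

    /* Stand-in for the per-CPU callback (__wbinvd in the patch). */
    static void *flush_one(void *arg)
    {
        printf("flushing caches on cpu %ld\n", (long)arg);
        return NULL;
    }

    /* Model of wbinvd_on_all_cpus(): run the callback everywhere, then wait. */
    static int flush_on_all_cpus(void)
    {
        pthread_t tid[NCPUS];
        long cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
            pthread_create(&tid[cpu], NULL, flush_one, (void *)cpu);
        for (cpu = 0; cpu < NCPUS; cpu++)
            pthread_join(tid[cpu], NULL);
        return 0;
    }

    int main(void)
    {
        return flush_on_all_cpus();
    }
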
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
deleted file mode 100644
index 3f1eb59b5f08..000000000000
--- a/arch/x86/lib/io_64.c
+++ /dev/null
@@ -1,25 +0,0 @@
1#include <linux/string.h>
2#include <linux/module.h>
3#include <asm/io.h>
4
5void __memcpy_toio(unsigned long dst, const void *src, unsigned len)
6{
7 __inline_memcpy((void *)dst, src, len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst, unsigned long src, unsigned len)
12{
13 __inline_memcpy(dst, (const void *)src, len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
19 /*
20 * TODO: memset can mangle the IO patterns quite a bit.
21 * perhaps it would be better to use a dumb one:
22 */
23 memset((void *)a, b, c);
24}
25EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 000000000000..15acecf0d7aa
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
1/*
2 * x86-64 rwsem wrappers
3 *
4 * This interfaces the inline asm code to the slow-path
5 * C routines. We need to save the call-clobbered regs
6 * that the asm does not mark as clobbered, and move the
7 * argument from %rax to %rdi.
8 *
9 * NOTE! We don't need to save %rax, because the functions
10 * will always return the semaphore pointer in %rax (which
11 * is also the input argument to these helpers)
12 *
13 * The following can clobber %rdx because the asm clobbers it:
14 * call_rwsem_down_write_failed
15 * call_rwsem_wake
16 * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
17 */
18
19#include <linux/linkage.h>
20#include <asm/rwlock.h>
21#include <asm/alternative-asm.h>
22#include <asm/frame.h>
23#include <asm/dwarf2.h>
24
25#define save_common_regs \
26 pushq %rdi; \
27 pushq %rsi; \
28 pushq %rcx; \
29 pushq %r8; \
30 pushq %r9; \
31 pushq %r10; \
32 pushq %r11
33
34#define restore_common_regs \
35 popq %r11; \
36 popq %r10; \
37 popq %r9; \
38 popq %r8; \
39 popq %rcx; \
40 popq %rsi; \
41 popq %rdi
42
43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed)
45 save_common_regs
46 pushq %rdx
47 movq %rax,%rdi
48 call rwsem_down_read_failed
49 popq %rdx
50 restore_common_regs
51 ret
52 ENDPROC(call_rwsem_down_read_failed)
53
54ENTRY(call_rwsem_down_write_failed)
55 save_common_regs
56 movq %rax,%rdi
57 call rwsem_down_write_failed
58 restore_common_regs
59 ret
60 ENDPROC(call_rwsem_down_write_failed)
61
62ENTRY(call_rwsem_wake)
63 decw %dx /* do nothing if still outstanding active readers */
64 jnz 1f
65 save_common_regs
66 movq %rax,%rdi
67 call rwsem_wake
68 restore_common_regs
691: ret
70 ENDPROC(call_rwsem_wake)
71
72/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake)
74 save_common_regs
75 pushq %rdx
76 movq %rax,%rdi
77 call rwsem_downgrade_wake
78 popq %rdx
79 restore_common_regs
80 ret
81 ENDPROC(call_rwsem_downgrade_wake)
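
These wrappers exist because the rwsem fast path is inline assembly that calls the C slow path with the semaphore in %rax rather than the usual %rdi argument register, so the call-clobbered registers have to be saved by hand around the call. The fast-path/slow-path split itself can be sketched in portable C -- purely as an illustration of the idea, with a made-up sign convention rather than the kernel's real bias values:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Hypothetical reader/writer count: 0 = unlocked, >0 = readers, <0 = writer. */
    static atomic_long count = 0;

    static void rwsem_down_read_failed(void)
    {
        /* Slow path: in the kernel this blocks; here we just report it. */
        puts("contended: taking the slow path");
    }

    static void down_read(void)
    {
        /* Fast path: a single atomic increment, like the inline asm does.
         * Only when the result is negative (a writer holds the lock) do we
         * call out to the slow path -- the .S wrappers above sit on that call
         * boundary and preserve the registers the asm did not save. */
        if (atomic_fetch_add(&count, 1) + 1 < 0)
            rwsem_down_read_failed();
    }

    int main(void)
    {
        down_read();
        printf("reader count now %ld\n", (long)atomic_load(&count));
        return 0;
    }
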
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bca13cb..738e6593799d 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -18,7 +18,7 @@ static inline pte_t gup_get_pte(pte_t *ptep)
18#else 18#else
19 /* 19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking 20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atoimcally, 21 * any locks. For this we would like to load the pointers atomically,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What 22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not 23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not 24 * present to present, or present to not present or both -- it will not
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f46c340727b8..069ce7c37c01 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -9,7 +9,6 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/hugetlb.h> 10#include <linux/hugetlb.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
13#include <linux/err.h> 12#include <linux/err.h>
14#include <linux/sysctl.h> 13#include <linux/sysctl.h>
15#include <asm/mman.h> 14#include <asm/mman.h>
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d406c5239019..b278535b14aa 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
1#include <linux/gfp.h>
1#include <linux/initrd.h> 2#include <linux/initrd.h>
2#include <linux/ioport.h> 3#include <linux/ioport.h>
3#include <linux/swap.h> 4#include <linux/swap.h>
@@ -266,16 +267,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
266 if (!after_bootmem) 267 if (!after_bootmem)
267 find_early_table_space(end, use_pse, use_gbpages); 268 find_early_table_space(end, use_pse, use_gbpages);
268 269
269#ifdef CONFIG_X86_32
270 for (i = 0; i < nr_range; i++)
271 kernel_physical_mapping_init(mr[i].start, mr[i].end,
272 mr[i].page_size_mask);
273 ret = end;
274#else /* CONFIG_X86_64 */
275 for (i = 0; i < nr_range; i++) 270 for (i = 0; i < nr_range; i++)
276 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 271 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
277 mr[i].page_size_mask); 272 mr[i].page_size_mask);
278#endif
279 273
280#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
281 early_ioremap_page_table_range_init(); 275 early_ioremap_page_table_range_init();
@@ -338,11 +332,23 @@ int devmem_is_allowed(unsigned long pagenr)
338 332
339void free_init_pages(char *what, unsigned long begin, unsigned long end) 333void free_init_pages(char *what, unsigned long begin, unsigned long end)
340{ 334{
341 unsigned long addr = begin; 335 unsigned long addr;
336 unsigned long begin_aligned, end_aligned;
337
338 /* Make sure boundaries are page aligned */
339 begin_aligned = PAGE_ALIGN(begin);
340 end_aligned = end & PAGE_MASK;
342 341
343 if (addr >= end) 342 if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
343 begin = begin_aligned;
344 end = end_aligned;
345 }
346
347 if (begin >= end)
344 return; 348 return;
345 349
350 addr = begin;
351
346 /* 352 /*
347 * If debugging page accesses then do not free this memory but 353 * If debugging page accesses then do not free this memory but
348 * mark them not present - any buggy init-section access will 354 * mark them not present - any buggy init-section access will
@@ -350,7 +356,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
350 */ 356 */
351#ifdef CONFIG_DEBUG_PAGEALLOC 357#ifdef CONFIG_DEBUG_PAGEALLOC
352 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", 358 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
353 begin, PAGE_ALIGN(end)); 359 begin, end);
354 set_memory_np(begin, (end - begin) >> PAGE_SHIFT); 360 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
355#else 361#else
356 /* 362 /*
@@ -365,8 +371,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
365 for (; addr < end; addr += PAGE_SIZE) { 371 for (; addr < end; addr += PAGE_SIZE) {
366 ClearPageReserved(virt_to_page(addr)); 372 ClearPageReserved(virt_to_page(addr));
367 init_page_count(virt_to_page(addr)); 373 init_page_count(virt_to_page(addr));
368 memset((void *)(addr & ~(PAGE_SIZE-1)), 374 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
369 POISON_FREE_INITMEM, PAGE_SIZE);
370 free_page(addr); 375 free_page(addr);
371 totalram_pages++; 376 totalram_pages++;
372 } 377 }
@@ -383,6 +388,15 @@ void free_initmem(void)
383#ifdef CONFIG_BLK_DEV_INITRD 388#ifdef CONFIG_BLK_DEV_INITRD
384void free_initrd_mem(unsigned long start, unsigned long end) 389void free_initrd_mem(unsigned long start, unsigned long end)
385{ 390{
386 free_init_pages("initrd memory", start, end); 391 /*
 392 * end may not be page aligned, and we cannot align it here:
 393 * the decompressor could be confused by an aligned initrd_end.
 394 * The partial end page has already been reserved in
395 * - i386_start_kernel()
396 * - x86_64_start_kernel()
397 * - relocate_initrd()
 398 * so PAGE_ALIGN() is safe here and frees that partial page as well.
399 */
400 free_init_pages("initrd memory", start, PAGE_ALIGN(end));
387} 401}
388#endif 402#endif
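
Both hunks above come down to simple page-alignment arithmetic: free_init_pages() now rounds begin up and end down so it never touches a partial page, while free_initrd_mem() rounds end up because the partial tail page was reserved earlier and is safe to free. A small worked example (4 KB pages, made-up addresses):

    #include <stdio.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

    int main(void)
    {
        unsigned long begin = 0x100123;  /* hypothetical unaligned start */
        unsigned long end   = 0x105fff;  /* hypothetical unaligned end   */

        /* free_init_pages(): shrink the range to whole pages only. */
        printf("begin_aligned = %#lx\n", PAGE_ALIGN(begin));  /* 0x101000 */
        printf("end_aligned   = %#lx\n", end & PAGE_MASK);    /* 0x105000 */

        /* free_initrd_mem(): grow end so the reserved partial page is freed too. */
        printf("initrd end    = %#lx\n", PAGE_ALIGN(end));    /* 0x106000 */
        return 0;
    }
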
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c973f8e2a6cf..bca79091b9d6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,11 +25,11 @@
25#include <linux/pfn.h> 25#include <linux/pfn.h>
26#include <linux/poison.h> 26#include <linux/poison.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/slab.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
30#include <linux/memory_hotplug.h> 29#include <linux/memory_hotplug.h>
31#include <linux/initrd.h> 30#include <linux/initrd.h>
32#include <linux/cpumask.h> 31#include <linux/cpumask.h>
32#include <linux/gfp.h>
33 33
34#include <asm/asm.h> 34#include <asm/asm.h>
35#include <asm/bios_ebda.h> 35#include <asm/bios_ebda.h>
@@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start,
241 unsigned long page_size_mask) 241 unsigned long page_size_mask)
242{ 242{
243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M); 243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
244 unsigned long last_map_addr = end;
244 unsigned long start_pfn, end_pfn; 245 unsigned long start_pfn, end_pfn;
245 pgd_t *pgd_base = swapper_pg_dir; 246 pgd_t *pgd_base = swapper_pg_dir;
246 int pgd_idx, pmd_idx, pte_ofs; 247 int pgd_idx, pmd_idx, pte_ofs;
@@ -341,9 +342,10 @@ repeat:
341 prot = PAGE_KERNEL_EXEC; 342 prot = PAGE_KERNEL_EXEC;
342 343
343 pages_4k++; 344 pages_4k++;
344 if (mapping_iter == 1) 345 if (mapping_iter == 1) {
345 set_pte(pte, pfn_pte(pfn, init_prot)); 346 set_pte(pte, pfn_pte(pfn, init_prot));
346 else 347 last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
348 } else
347 set_pte(pte, pfn_pte(pfn, prot)); 349 set_pte(pte, pfn_pte(pfn, prot));
348 } 350 }
349 } 351 }
@@ -368,7 +370,7 @@ repeat:
368 mapping_iter = 2; 370 mapping_iter = 2;
369 goto repeat; 371 goto repeat;
370 } 372 }
371 return 0; 373 return last_map_addr;
372} 374}
373 375
374pte_t *kmap_pte; 376pte_t *kmap_pte;
@@ -748,6 +750,7 @@ static void __init zone_sizes_init(void)
748 free_area_init_nodes(max_zone_pfns); 750 free_area_init_nodes(max_zone_pfns);
749} 751}
750 752
753#ifndef CONFIG_NO_BOOTMEM
751static unsigned long __init setup_node_bootmem(int nodeid, 754static unsigned long __init setup_node_bootmem(int nodeid,
752 unsigned long start_pfn, 755 unsigned long start_pfn,
753 unsigned long end_pfn, 756 unsigned long end_pfn,
@@ -764,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid,
764 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", 767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
765 nodeid, bootmap, bootmap + bootmap_size); 768 nodeid, bootmap, bootmap + bootmap_size);
766 free_bootmem_with_active_regions(nodeid, end_pfn); 769 free_bootmem_with_active_regions(nodeid, end_pfn);
767 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
768 770
769 return bootmap + bootmap_size; 771 return bootmap + bootmap_size;
770} 772}
773#endif
771 774
772void __init setup_bootmem_allocator(void) 775void __init setup_bootmem_allocator(void)
773{ 776{
777#ifndef CONFIG_NO_BOOTMEM
774 int nodeid; 778 int nodeid;
775 unsigned long bootmap_size, bootmap; 779 unsigned long bootmap_size, bootmap;
776 /* 780 /*
@@ -782,11 +786,13 @@ void __init setup_bootmem_allocator(void)
782 if (bootmap == -1L) 786 if (bootmap == -1L)
783 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
784 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
785 790
786 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
787 max_pfn_mapped<<PAGE_SHIFT); 792 max_pfn_mapped<<PAGE_SHIFT);
788 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
789 794
795#ifndef CONFIG_NO_BOOTMEM
790 for_each_online_node(nodeid) { 796 for_each_online_node(nodeid) {
791 unsigned long start_pfn, end_pfn; 797 unsigned long start_pfn, end_pfn;
792 798
@@ -804,6 +810,7 @@ void __init setup_bootmem_allocator(void)
804 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, 810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
805 bootmap); 811 bootmap);
806 } 812 }
813#endif
807 814
808 after_bootmem = 1; 815 after_bootmem = 1;
809} 816}
@@ -892,8 +899,7 @@ void __init mem_init(void)
892 reservedpages << (PAGE_SHIFT-10), 899 reservedpages << (PAGE_SHIFT-10),
893 datasize >> 10, 900 datasize >> 10,
894 initsize >> 10, 901 initsize >> 10,
895 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 902 totalhigh_pages << (PAGE_SHIFT-10));
896 );
897 903
898 printk(KERN_INFO "virtual kernel memory layout:\n" 904 printk(KERN_INFO "virtual kernel memory layout:\n"
899 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 905 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5198b9bb34ef..ee41bba315d1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -29,6 +29,7 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32#include <linux/gfp.h>
32 33
33#include <asm/processor.h> 34#include <asm/processor.h>
34#include <asm/bios_ebda.h> 35#include <asm/bios_ebda.h>
@@ -49,6 +50,7 @@
49#include <asm/numa.h> 50#include <asm/numa.h>
50#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
51#include <asm/init.h> 52#include <asm/init.h>
53#include <linux/bootmem.h>
52 54
53static unsigned long dma_reserve __initdata; 55static unsigned long dma_reserve __initdata;
54 56
@@ -571,6 +573,7 @@ kernel_physical_mapping_init(unsigned long start,
571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 573void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
572 int acpi, int k8) 574 int acpi, int k8)
573{ 575{
576#ifndef CONFIG_NO_BOOTMEM
574 unsigned long bootmap_size, bootmap; 577 unsigned long bootmap_size, bootmap;
575 578
576 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 579 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -578,13 +581,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
578 PAGE_SIZE); 581 PAGE_SIZE);
579 if (bootmap == -1L) 582 if (bootmap == -1L)
580 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 583 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
584 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
581 /* don't touch min_low_pfn */ 585 /* don't touch min_low_pfn */
582 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, 586 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
583 0, end_pfn); 587 0, end_pfn);
584 e820_register_active_regions(0, start_pfn, end_pfn); 588 e820_register_active_regions(0, start_pfn, end_pfn);
585 free_bootmem_with_active_regions(0, end_pfn); 589 free_bootmem_with_active_regions(0, end_pfn);
586 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 590#else
587 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 591 e820_register_active_regions(0, start_pfn, end_pfn);
592#endif
588} 593}
589#endif 594#endif
590 595
@@ -616,6 +621,21 @@ void __init paging_init(void)
616 */ 621 */
617#ifdef CONFIG_MEMORY_HOTPLUG 622#ifdef CONFIG_MEMORY_HOTPLUG
618/* 623/*
624 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
625 * updating.
626 */
627static void update_end_of_memory_vars(u64 start, u64 size)
628{
629 unsigned long end_pfn = PFN_UP(start + size);
630
631 if (end_pfn > max_pfn) {
632 max_pfn = end_pfn;
633 max_low_pfn = end_pfn;
634 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
635 }
636}
637
638/*
619 * Memory is added always to NORMAL zone. This means you will never get 639 * Memory is added always to NORMAL zone. This means you will never get
620 * additional DMA/DMA32 memory. 640 * additional DMA/DMA32 memory.
621 */ 641 */
@@ -634,6 +654,9 @@ int arch_add_memory(int nid, u64 start, u64 size)
634 ret = __add_pages(nid, zone, start_pfn, nr_pages); 654 ret = __add_pages(nid, zone, start_pfn, nr_pages);
635 WARN_ON_ONCE(ret); 655 WARN_ON_ONCE(ret);
636 656
657 /* update max_pfn, max_low_pfn and high_memory */
658 update_end_of_memory_vars(start, size);
659
637 return ret; 660 return ret;
638} 661}
639EXPORT_SYMBOL_GPL(arch_add_memory); 662EXPORT_SYMBOL_GPL(arch_add_memory);
@@ -955,7 +978,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
955 if (pmd_none(*pmd)) { 978 if (pmd_none(*pmd)) {
956 pte_t entry; 979 pte_t entry;
957 980
958 p = vmemmap_alloc_block(PMD_SIZE, node); 981 p = vmemmap_alloc_block_buf(PMD_SIZE, node);
959 if (!p) 982 if (!p)
960 return -ENOMEM; 983 return -ENOMEM;
961 984
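
The update_end_of_memory_vars() helper added above only does a little pfn arithmetic: the hot-added range's end is rounded up to a page frame number and, if it lies beyond the current end of memory, max_pfn/max_low_pfn/high_memory are advanced. A userspace sketch of that calculation (the starting value of 4 GB is made up):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
    #define PFN_UP(x)  (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

    static unsigned long long max_pfn = 0x100000;  /* hypothetical: 4 GB of RAM */

    /* Model of update_end_of_memory_vars(): grow the recorded end of memory
     * if the hot-added range ends beyond it. */
    static void update_end_of_memory(unsigned long long start, unsigned long long size)
    {
        unsigned long long end_pfn = PFN_UP(start + size);

        if (end_pfn > max_pfn)
            max_pfn = end_pfn;
    }

    int main(void)
    {
        update_end_of_memory(0x100000000ULL, 0x40000000ULL);  /* add 1 GB at 4 GB */
        printf("max_pfn = %#llx\n", max_pfn);                 /* 0x140000 */
        return 0;
    }
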
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 03c75ffd5c2a..5eb1ba74a3a9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,43 +24,6 @@
24 24
25#include "physaddr.h" 25#include "physaddr.h"
26 26
27int page_is_ram(unsigned long pagenr)
28{
29 resource_size_t addr, end;
30 int i;
31
32 /*
33 * A special case is the first 4Kb of memory;
34 * This is a BIOS owned area, not kernel ram, but generally
35 * not listed as such in the E820 table.
36 */
37 if (pagenr == 0)
38 return 0;
39
40 /*
41 * Second special case: Some BIOSen report the PC BIOS
42 * area (640->1Mb) as ram even though it is not.
43 */
44 if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
45 pagenr < (BIOS_END >> PAGE_SHIFT))
46 return 0;
47
48 for (i = 0; i < e820.nr_map; i++) {
49 /*
50 * Not usable memory:
51 */
52 if (e820.map[i].type != E820_RAM)
53 continue;
54 addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
55 end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
56
57
58 if ((pagenr >= addr) && (pagenr < end))
59 return 1;
60 }
61 return 0;
62}
63
64/* 27/*
65 * Fix up the linear direct mapping of the kernel to avoid cache attribute 28 * Fix up the linear direct mapping of the kernel to avoid cache attribute
66 * conflicts. 29 * conflicts.
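
For reference, the page_is_ram() removed above (presumably in favour of a generic implementation elsewhere) answered one question: does a page frame fall inside some usable range of the firmware memory map? A self-contained sketch of the range-walk part (the special cases for page 0 and the BIOS area are omitted), with a made-up two-entry memory map standing in for the real e820 table:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)

    struct range { unsigned long long start, end; };  /* byte range [start, end) */

    /* Sketch of the removed range walk: a pfn is RAM if it lies fully inside
     * some usable region of the memory map. */
    static int pfn_is_ram(unsigned long pfn, const struct range *map, int n)
    {
        int i;

        for (i = 0; i < n; i++) {
            unsigned long long first = (map[i].start + PAGE_SIZE - 1) >> PAGE_SHIFT;
            unsigned long long last  = map[i].end >> PAGE_SHIFT;

            if (pfn >= first && pfn < last)
                return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct range ram[] = { { 0x1000, 0x9f000 }, { 0x100000, 0x40000000 } };

        printf("pfn 0x100: %d\n", pfn_is_ram(0x100, ram, 2));  /* 1: low RAM  */
        printf("pfn 0xa0:  %d\n", pfn_is_ram(0xa0,  ram, 2));  /* 0: VGA hole */
        return 0;
    }
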
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 4901d0dafda6..af3b6c8a436f 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -106,26 +106,25 @@ void kmemcheck_error_recall(void)
106 106
107 switch (e->type) { 107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS: 108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " 109 printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ? 110 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)", 111 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address); 112 (void *) e->address);
114 113
115 printk(KERN_INFO); 114 printk(KERN_WARNING);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i) 115 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]); 116 printk(KERN_CONT "%02x", e->memory_copy[i]);
118 printk("\n"); 117 printk(KERN_CONT "\n");
119 118
120 printk(KERN_INFO); 119 printk(KERN_WARNING);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) { 120 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) 121 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]); 122 printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
124 else 123 else
125 printk(" ?"); 124 printk(KERN_CONT " ?");
126 } 125 }
127 printk("\n"); 126 printk(KERN_CONT "\n");
128 printk(KERN_INFO "%*c\n", 2 + 2 127 printk(KERN_WARNING "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); 128 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break; 129 break;
131 case KMEMCHECK_ERROR_BUG: 130 case KMEMCHECK_ERROR_BUG:
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 8cc183344140..b3b531a4f8e5 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
337 if (!shadow) 337 if (!shadow)
338 return true; 338 return true;
339 339
340 status = kmemcheck_shadow_test(shadow, size); 340 status = kmemcheck_shadow_test_all(shadow, size);
341 341
342 return status == KMEMCHECK_SHADOW_INITIALIZED; 342 return status == KMEMCHECK_SHADOW_INITIALIZED;
343} 343}
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index 3f66b82076a3..aec124214d97 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
125 125
126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) 126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
127{ 127{
128#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
128 uint8_t *x; 129 uint8_t *x;
129 unsigned int i; 130 unsigned int i;
130 131
131 x = shadow; 132 x = shadow;
132 133
133#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
134 /* 134 /*
135 * Make sure _some_ bytes are initialized. Gcc frequently generates 135 * Make sure _some_ bytes are initialized. Gcc frequently generates
136 * code to access neighboring bytes. 136 * code to access neighboring bytes.
@@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) 139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
140 return x[i]; 140 return x[i];
141 } 141 }
142
143 return x[0];
142#else 144#else
145 return kmemcheck_shadow_test_all(shadow, size);
146#endif
147}
148
149enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
150{
151 uint8_t *x;
152 unsigned int i;
153
154 x = shadow;
155
143 /* All bytes must be initialized. */ 156 /* All bytes must be initialized. */
144 for (i = 0; i < size; ++i) { 157 for (i = 0; i < size; ++i) {
145 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) 158 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
146 return x[i]; 159 return x[i];
147 } 160 }
148#endif
149 161
150 return x[0]; 162 return x[0];
151} 163}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index af46d9ab9d86..ff0b2f70fbcb 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -11,6 +11,8 @@ enum kmemcheck_shadow {
11void *kmemcheck_shadow_lookup(unsigned long address); 11void *kmemcheck_shadow_lookup(unsigned long address);
12 12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); 13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
15 unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size); 16void kmemcheck_shadow_set(void *shadow, unsigned int size);
15 17
16#endif 18#endif
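
The split above separates two policies: kmemcheck_shadow_test() keeps the "at least one byte initialized" behaviour under CONFIG_KMEMCHECK_PARTIAL_OK, while the new kmemcheck_shadow_test_all() insists that every byte is initialized, which is what kmemcheck_is_obj_initialized() now uses. A userspace sketch of the two checks over a fake shadow buffer (the SHADOW_INITIALIZED value is made up for the demo):

    #include <stdint.h>
    #include <stdio.h>

    #define SHADOW_INITIALIZED 0x01  /* hypothetical shadow byte value */

    /* Partial check: accept the access if any byte is initialized. */
    static int shadow_test_some(const uint8_t *shadow, unsigned int size)
    {
        unsigned int i;

        for (i = 0; i < size; i++)
            if (shadow[i] == SHADOW_INITIALIZED)
                return 1;
        return 0;
    }

    /* Strict check: every byte must be initialized (what _test_all enforces). */
    static int shadow_test_all(const uint8_t *shadow, unsigned int size)
    {
        unsigned int i;

        for (i = 0; i < size; i++)
            if (shadow[i] != SHADOW_INITIALIZED)
                return 0;
        return 1;
    }

    int main(void)
    {
        uint8_t shadow[4] = { SHADOW_INITIALIZED, 0, 0, SHADOW_INITIALIZED };

        printf("some=%d all=%d\n",
               shadow_test_some(shadow, 4), shadow_test_all(shadow, 4));
        return 0;
    }
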
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index c0f6198565eb..5d0e67fff1a6 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -21,6 +21,7 @@
21#include <linux/kdebug.h> 21#include <linux/kdebug.h>
22#include <linux/mutex.h> 22#include <linux/mutex.h>
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/slab.h>
24#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
25#include <asm/tlbflush.h> 26#include <asm/tlbflush.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
@@ -538,14 +539,15 @@ static int
538kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) 539kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
539{ 540{
540 struct die_args *arg = args; 541 struct die_args *arg = args;
542 unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
541 543
542 if (val == DIE_DEBUG && (arg->err & DR_STEP)) 544 if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
543 if (post_kmmio_handler(arg->err, arg->regs) == 1) { 545 if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
544 /* 546 /*
545 * Reset the BS bit in dr6 (pointed by args->err) to 547 * Reset the BS bit in dr6 (pointed by args->err) to
546 * denote completion of processing 548 * denote completion of processing
547 */ 549 */
548 (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; 550 *dr6_p &= ~DR_STEP;
549 return NOTIFY_STOP; 551 return NOTIFY_STOP;
550 } 552 }
551 553
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c8191defc38a..1dab5194fd9d 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,7 +71,7 @@ static int mmap_is_legacy(void)
71 if (current->personality & ADDR_COMPAT_LAYOUT) 71 if (current->personality & ADDR_COMPAT_LAYOUT)
72 return 1; 72 return 1;
73 73
74 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) 74 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
75 return 1; 75 return 1;
76 76
77 return sysctl_legacy_va_layout; 77 return sysctl_legacy_va_layout;
@@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void)
96 96
97static unsigned long mmap_base(void) 97static unsigned long mmap_base(void)
98{ 98{
99 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 99 unsigned long gap = rlimit(RLIMIT_STACK);
100 100
101 if (gap < MIN_GAP) 101 if (gap < MIN_GAP)
102 gap = MIN_GAP; 102 gap = MIN_GAP;
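
The mmap layout hunks swap the open-coded current->signal->rlim[RLIMIT_STACK].rlim_cur access for the rlimit() accessor. The same decision can be reproduced from userspace with getrlimit(), which is a convenient way to see which layout a process would get:

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rlimit rl;

        /* Userspace counterpart of the kernel's rlimit(RLIMIT_STACK) helper. */
        if (getrlimit(RLIMIT_STACK, &rl) != 0) {
            perror("getrlimit");
            return 1;
        }

        if (rl.rlim_cur == RLIM_INFINITY)
            puts("stack limit: unlimited (legacy mmap layout would be chosen)");
        else
            printf("stack limit: %llu bytes\n", (unsigned long long)rl.rlim_cur);
        return 0;
    }
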
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 34a3291ca103..3adff7dcc148 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -26,6 +26,7 @@
26 26
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/debugfs.h> 28#include <linux/debugfs.h>
29#include <linux/slab.h>
29#include <linux/uaccess.h> 30#include <linux/uaccess.h>
30#include <linux/io.h> 31#include <linux/io.h>
31#include <linux/version.h> 32#include <linux/version.h>
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index b20760ca7244..809baaaf48b1 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
418 418
419 for_each_online_node(nid) { 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; 423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
422 } 425 }
423 426
424 setup_bootmem_allocator(); 427 setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70d11bb..8948f47fde05 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
163 unsigned long end, unsigned long size, 163 unsigned long end, unsigned long size,
164 unsigned long align) 164 unsigned long align)
165{ 165{
166 unsigned long mem = find_e820_area(start, end, size, align); 166 unsigned long mem;
167 void *ptr;
168 167
168 /*
 169 * Allocate as high as possible so low DMA/DMA32 memory stays free;
 170 * NODE_DATA and related allocations will end up here.
171 */
172 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
173 start = MAX_DMA_PFN<<PAGE_SHIFT;
174 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
175 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
176 start = MAX_DMA32_PFN<<PAGE_SHIFT;
177 mem = find_e820_area(start, end, size, align);
178 if (mem != -1L)
179 return __va(mem);
180
181 /* extend the search scope */
182 end = max_pfn_mapped << PAGE_SHIFT;
183 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
184 start = MAX_DMA32_PFN<<PAGE_SHIFT;
185 else
186 start = MAX_DMA_PFN<<PAGE_SHIFT;
187 mem = find_e820_area(start, end, size, align);
169 if (mem != -1L) 188 if (mem != -1L)
170 return __va(mem); 189 return __va(mem);
171 190
172 ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 191 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
173 if (ptr == NULL) {
174 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
175 size, nodeid); 192 size, nodeid);
176 return NULL; 193
177 } 194 return NULL;
178 return ptr;
179} 195}
180 196
181/* Initialize bootmem allocator for a node */ 197/* Initialize bootmem allocator for a node */
182void __init 198void __init
183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 199setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 200{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 201 unsigned long start_pfn, last_pfn, nodedata_phys;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 202 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
187 unsigned long bootmap_start, nodedata_phys;
188 void *bootmap;
189 int nid; 203 int nid;
204#ifndef CONFIG_NO_BOOTMEM
205 unsigned long bootmap_start, bootmap_pages, bootmap_size;
206 void *bootmap;
207#endif
190 208
191 if (!end) 209 if (!end)
192 return; 210 return;
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
200 218
201 start = roundup(start, ZONE_ALIGN); 219 start = roundup(start, ZONE_ALIGN);
202 220
203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 221 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
204 start, end); 222 start, end);
205 223
206 start_pfn = start >> PAGE_SHIFT; 224 start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
211 if (node_data[nodeid] == NULL) 229 if (node_data[nodeid] == NULL)
212 return; 230 return;
213 nodedata_phys = __pa(node_data[nodeid]); 231 nodedata_phys = __pa(node_data[nodeid]);
232 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
214 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, 233 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
215 nodedata_phys + pgdat_size - 1); 234 nodedata_phys + pgdat_size - 1);
235 nid = phys_to_nid(nodedata_phys);
236 if (nid != nodeid)
237 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
216 238
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 239 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; 240 NODE_DATA(nodeid)->node_id = nodeid;
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 241 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 242 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 243
244#ifndef CONFIG_NO_BOOTMEM
245 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
246
222 /* 247 /*
223 * Find a place for the bootmem map 248 * Find a place for the bootmem map
224 * nodedata_phys could be on other nodes by alloc_bootmem, 249 * nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
227 * of alloc_bootmem, that could clash with reserved range 252 * of alloc_bootmem, that could clash with reserved range
228 */ 253 */
229 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 254 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 255 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
231 if (nid == nodeid)
232 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else
234 bootmap_start = roundup(start, PAGE_SIZE);
235 /* 256 /*
236 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 257 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 258 * to use that to align to PAGE_SIZE
@@ -239,18 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 260 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 261 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 262 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) { 263 free_early(nodedata_phys, nodedata_phys + pgdat_size);
243 /*
244 * only need to free it if it is from other node
245 * bootmem
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
250 node_data[nodeid] = NULL; 264 node_data[nodeid] = NULL;
251 return; 265 return;
252 } 266 }
253 bootmap_start = __pa(bootmap); 267 bootmap_start = __pa(bootmap);
268 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
269 "BOOTMAP");
254 270
255 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 271 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
256 bootmap_start >> PAGE_SHIFT, 272 bootmap_start >> PAGE_SHIFT,
@@ -259,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
259 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 275 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
260 bootmap_start, bootmap_start + bootmap_size - 1, 276 bootmap_start, bootmap_start + bootmap_size - 1,
261 bootmap_pages); 277 bootmap_pages);
262
263 free_bootmem_with_active_regions(nodeid, end);
264
265 /*
266 * convert early reserve to bootmem reserve earlier
267 * otherwise early_node_mem could use early reserved mem
268 * on previous node
269 */
270 early_res_to_bootmem(start, end);
271
272 /*
273 * in some case early_node_mem could use alloc_bootmem
274 * to get range on other node, don't reserve that again
275 */
276 if (nid != nodeid)
277 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
278 else
279 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
280 pgdat_size, BOOTMEM_DEFAULT);
281 nid = phys_to_nid(bootmap_start); 278 nid = phys_to_nid(bootmap_start);
282 if (nid != nodeid) 279 if (nid != nodeid)
283 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); 280 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
284 else 281
285 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 282 free_bootmem_with_active_regions(nodeid, end);
286 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 283#endif
287 284
288 node_set_online(nodeid); 285 node_set_online(nodeid);
289} 286}
@@ -427,7 +424,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
427 * Calculate the number of big nodes that can be allocated as a result 424 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder. 425 * of consolidating the remainder.
429 */ 426 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / 427 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
431 FAKE_NODE_MIN_SIZE; 428 FAKE_NODE_MIN_SIZE;
432 429
433 size &= FAKE_NODE_MIN_HASH_MASK; 430 size &= FAKE_NODE_MIN_HASH_MASK;
@@ -502,77 +499,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
502} 499}
503 500
504/* 501/*
505 * Splits num_nodes nodes up equally starting at node_start. The return value 502 * Returns the end address of a node so that there is at least `size' amount of
506 * is the number of nodes split up and addr is adjusted to be at the end of the 503 * non-reserved memory or `max_addr' is reached.
507 * last node allocated.
508 */ 504 */
509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, 505static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
510 int num_nodes)
511{ 506{
512 unsigned int big; 507 u64 end = start + size;
513 u64 size;
514 int i;
515 508
516 if (num_nodes <= 0) 509 while (end - start - e820_hole_size(start, end) < size) {
517 return -1; 510 end += FAKE_NODE_MIN_SIZE;
518 if (num_nodes > MAX_NUMNODES) 511 if (end > max_addr) {
519 num_nodes = MAX_NUMNODES;
520 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
521 num_nodes;
522 /*
523 * Calculate the number of big nodes that can be allocated as a result
524 * of consolidating the leftovers.
525 */
526 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
527 FAKE_NODE_MIN_SIZE;
528
529 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
530 size &= FAKE_NODE_MIN_HASH_MASK;
531 if (!size) {
532 printk(KERN_ERR "Not enough memory for each node. "
533 "NUMA emulation disabled.\n");
534 return -1;
535 }
536
537 for (i = node_start; i < num_nodes + node_start; i++) {
538 u64 end = *addr + size;
539
540 if (i < big)
541 end += FAKE_NODE_MIN_SIZE;
542 /*
543 * The final node can have the remaining system RAM. Other
544 * nodes receive roughly the same amount of available pages.
545 */
546 if (i == num_nodes + node_start - 1)
547 end = max_addr; 512 end = max_addr;
548 else
549 while (end - *addr - e820_hole_size(*addr, end) <
550 size) {
551 end += FAKE_NODE_MIN_SIZE;
552 if (end > max_addr) {
553 end = max_addr;
554 break;
555 }
556 }
557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
558 break; 513 break;
514 }
559 } 515 }
560 return i - node_start + 1; 516 return end;
561} 517}
562 518
563/* 519/*
564 * Splits the remaining system RAM into chunks of size. The remaining memory is 520 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
565 * always assigned to a final node and can be asymmetric. Returns the number of 521 * `addr' to `max_addr'. The return value is the number of nodes allocated.
566 * nodes split.
567 */ 522 */
568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, 523static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
569 u64 size)
570{ 524{
571 int i = node_start; 525 nodemask_t physnode_mask = NODE_MASK_NONE;
572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 526 u64 min_size;
573 while (!setup_node_range(i++, addr, size, max_addr)) 527 int ret = 0;
574 ; 528 int i;
575 return i - node_start; 529
530 if (!size)
531 return -1;
532 /*
533 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
534 * increased accordingly if the requested size is too small. This
535 * creates a uniform distribution of node sizes across the entire
536 * machine (but not necessarily over physical nodes).
537 */
538 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
539 MAX_NUMNODES;
540 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
541 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
542 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
543 FAKE_NODE_MIN_HASH_MASK;
544 if (size < min_size) {
545 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
546 size >> 20, min_size >> 20);
547 size = min_size;
548 }
549 size &= FAKE_NODE_MIN_HASH_MASK;
550
551 for (i = 0; i < MAX_NUMNODES; i++)
552 if (physnodes[i].start != physnodes[i].end)
553 node_set(i, physnode_mask);
554 /*
555 * Fill physical nodes with fake nodes of size until there is no memory
556 * left on any of them.
557 */
558 while (nodes_weight(physnode_mask)) {
559 for_each_node_mask(i, physnode_mask) {
560 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
561 u64 end;
562
563 end = find_end_of_node(physnodes[i].start,
564 physnodes[i].end, size);
565 /*
566 * If there won't be at least FAKE_NODE_MIN_SIZE of
567 * non-reserved memory in ZONE_DMA32 for the next node,
568 * this one must extend to the boundary.
569 */
570 if (end < dma32_end && dma32_end - end -
571 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
572 end = dma32_end;
573
574 /*
575 * If there won't be enough non-reserved memory for the
576 * next node, this one must extend to the end of the
577 * physical node.
578 */
579 if (physnodes[i].end - end -
580 e820_hole_size(end, physnodes[i].end) < size)
581 end = physnodes[i].end;
582
583 /*
584 * Setup the fake node that will be allocated as bootmem
585 * later. If setup_node_range() returns non-zero, there
586 * is no more memory available on this physical node.
587 */
588 if (setup_node_range(ret++, &physnodes[i].start,
589 end - physnodes[i].start,
590 physnodes[i].end) < 0)
591 node_clear(i, physnode_mask);
592 }
593 }
594 return ret;
576} 595}
577 596
578/* 597/*
@@ -582,87 +601,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
582static int __init numa_emulation(unsigned long start_pfn, 601static int __init numa_emulation(unsigned long start_pfn,
583 unsigned long last_pfn, int acpi, int k8) 602 unsigned long last_pfn, int acpi, int k8)
584{ 603{
585 u64 size, addr = start_pfn << PAGE_SHIFT; 604 u64 addr = start_pfn << PAGE_SHIFT;
586 u64 max_addr = last_pfn << PAGE_SHIFT; 605 u64 max_addr = last_pfn << PAGE_SHIFT;
587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes; 606 int num_phys_nodes;
607 int num_nodes;
608 int i;
589 609
590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 610 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
591 /* 611 /*
592 * If the numa=fake command-line is just a single number N, split the 612 * If the numa=fake command-line contains a 'M' or 'G', it represents
593 * system RAM into N fake nodes. 613 * the fixed node size. Otherwise, if it is just a single number N,
614 * split the system RAM into N fake nodes.
594 */ 615 */
595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 616 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
596 long n = simple_strtol(cmdline, NULL, 0); 617 u64 size;
597
598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
600 if (num_nodes < 0)
601 return num_nodes;
602 goto out;
603 }
604 618
605 /* Parse the command line. */ 619 size = memparse(cmdline, &cmdline);
606 for (coeff_flag = 0; ; cmdline++) { 620 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
607 if (*cmdline && isdigit(*cmdline)) { 621 } else {
608 num = num * 10 + *cmdline - '0'; 622 unsigned long n;
609 continue; 623
610 } 624 n = simple_strtoul(cmdline, NULL, 0);
611 if (*cmdline == '*') { 625 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
612 if (num > 0)
613 coeff = num;
614 coeff_flag = 1;
615 }
616 if (!*cmdline || *cmdline == ',') {
617 if (!coeff_flag)
618 coeff = 1;
619 /*
620 * Round down to the nearest FAKE_NODE_MIN_SIZE.
621 * Command-line coefficients are in megabytes.
622 */
623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
624 if (size)
625 for (i = 0; i < coeff; i++, num_nodes++)
626 if (setup_node_range(num_nodes, &addr,
627 size, max_addr) < 0)
628 goto done;
629 if (!*cmdline)
630 break;
631 coeff_flag = 0;
632 coeff = -1;
633 }
634 num = 0;
635 }
636done:
637 if (!num_nodes)
638 return -1;
639 /* Fill remainder of system RAM, if appropriate. */
640 if (addr < max_addr) {
641 if (coeff_flag && coeff < 0) {
642 /* Split remaining nodes into num-sized chunks */
643 num_nodes += split_nodes_by_size(&addr, max_addr,
644 num_nodes, num);
645 goto out;
646 }
647 switch (*(cmdline - 1)) {
648 case '*':
649 /* Split remaining nodes into coeff chunks */
650 if (coeff <= 0)
651 break;
652 num_nodes += split_nodes_equally(&addr, max_addr,
653 num_nodes, coeff);
654 break;
655 case ',':
656 /* Do not allocate remaining system RAM */
657 break;
658 default:
659 /* Give one final node */
660 setup_node_range(num_nodes, &addr, max_addr - addr,
661 max_addr);
662 num_nodes++;
663 }
664 } 626 }
665out: 627
628 if (num_nodes < 0)
629 return num_nodes;
666 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 630 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
667 if (memnode_shift < 0) { 631 if (memnode_shift < 0) {
668 memnode_shift = 0; 632 memnode_shift = 0;
@@ -742,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
742 for_each_online_node(i) 706 for_each_online_node(i)
743 pages += free_all_bootmem_node(NODE_DATA(i)); 707 pages += free_all_bootmem_node(NODE_DATA(i));
744 708
709#ifdef CONFIG_NO_BOOTMEM
710 pages += free_all_memory_core_early(MAX_NUMNODES);
711#endif
712
745 return pages; 713 return pages;
746} 714}
747 715
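
The rewritten numa_emulation() above keys off the form of the command line: a value with an 'M' or 'G' suffix is parsed with memparse() as a fixed per-node size for split_nodes_size_interleave(), while a bare number is a node count for split_nodes_interleave(). A userspace sketch of that dispatch (parse_size() is a simplified stand-in for memparse()):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Simplified stand-in for memparse(): a trailing M or G scales the value. */
    static unsigned long long parse_size(const char *s)
    {
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        if (*end == 'M')
            v <<= 20;
        else if (*end == 'G')
            v <<= 30;
        return v;
    }

    int main(void)
    {
        const char *cmdline[] = { "8", "128M", "1G" };
        int i;

        for (i = 0; i < 3; i++) {
            if (strchr(cmdline[i], 'M') || strchr(cmdline[i], 'G'))
                printf("numa=fake=%s -> node size %llu bytes\n",
                       cmdline[i], parse_size(cmdline[i]));
            else
                printf("numa=fake=%s -> %lu nodes\n",
                       cmdline[i], strtoul(cmdline[i], NULL, 0));
        }
        return 0;
    }
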
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1d4eb93d333c..28195c350b97 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -6,13 +6,13 @@
6#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/interrupt.h> 10#include <linux/interrupt.h>
12#include <linux/seq_file.h> 11#include <linux/seq_file.h>
13#include <linux/debugfs.h> 12#include <linux/debugfs.h>
14#include <linux/pfn.h> 13#include <linux/pfn.h>
15#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h>
16 16
17#include <asm/e820.h> 17#include <asm/e820.h>
18#include <asm/processor.h> 18#include <asm/processor.h>
@@ -291,8 +291,29 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
291 */ 291 */
292 if (kernel_set_to_readonly && 292 if (kernel_set_to_readonly &&
293 within(address, (unsigned long)_text, 293 within(address, (unsigned long)_text,
294 (unsigned long)__end_rodata_hpage_align)) 294 (unsigned long)__end_rodata_hpage_align)) {
295 pgprot_val(forbidden) |= _PAGE_RW; 295 unsigned int level;
296
297 /*
298 * Don't enforce the !RW mapping for the kernel text mapping,
299 * if the current mapping is already using small page mapping.
300 * No need to work hard to preserve large page mappings in this
301 * case.
302 *
303 * This also fixes the Linux Xen paravirt guest boot failure
304 * (because of unexpected read-only mappings for kernel identity
305 * mappings). In this paravirt guest case, the kernel text
306 * mapping and the kernel identity mapping share the same
307 * page-table pages. Thus we can't really use different
308 * protections for the kernel text and identity mappings. Also,
309 * these shared mappings are made of small page mappings.
 310 * Thus, not enforcing the !RW mapping for small-page kernel
 311 * text mappings also helps the Linux Xen paravirt guest to
 312 * boot.
313 */
314 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
315 pgprot_val(forbidden) |= _PAGE_RW;
316 }
296#endif 317#endif
297 318
298 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 319 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ae9648eb1c7f..edc8b95afc1a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,7 +12,7 @@
12#include <linux/debugfs.h> 12#include <linux/debugfs.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/gfp.h> 15#include <linux/slab.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e35999..5c4ee422590e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,4 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/gfp.h>
2#include <asm/pgalloc.h> 3#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 4#include <asm/pgtable.h>
4#include <asm/tlb.h> 5#include <asm/tlb.h>
@@ -6,6 +7,14 @@
6 7
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 8#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8 9
10#ifdef CONFIG_HIGHPTE
11#define PGALLOC_USER_GFP __GFP_HIGHMEM
12#else
13#define PGALLOC_USER_GFP 0
14#endif
15
16gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
17
9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 18pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
10{ 19{
11 return (pte_t *)__get_free_page(PGALLOC_GFP); 20 return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -15,16 +24,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
15{ 24{
16 struct page *pte; 25 struct page *pte;
17 26
18#ifdef CONFIG_HIGHPTE 27 pte = alloc_pages(__userpte_alloc_gfp, 0);
19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
20#else
21 pte = alloc_pages(PGALLOC_GFP, 0);
22#endif
23 if (pte) 28 if (pte)
24 pgtable_page_ctor(pte); 29 pgtable_page_ctor(pte);
25 return pte; 30 return pte;
26} 31}
27 32
33static int __init setup_userpte(char *arg)
34{
35 if (!arg)
36 return -EINVAL;
37
38 /*
39 * "userpte=nohigh" disables allocation of user pagetables in
40 * high memory.
41 */
42 if (strcmp(arg, "nohigh") == 0)
43 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
44 else
45 return -EINVAL;
46 return 0;
47}
48early_param("userpte", setup_userpte);
49
28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 50void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
29{ 51{
30 pgtable_page_dtor(pte); 52 pgtable_page_dtor(pte);
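
The new __userpte_alloc_gfp mask combines with the "userpte" early parameter as follows; a minimal sketch (not part of the patch) of the effective allocation, assuming CONFIG_HIGHPTE=y so that PGALLOC_USER_GFP is __GFP_HIGHMEM:

/* Default: user page-table pages may come from highmem. */
gfp_t gfp = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
	    | __GFP_HIGHMEM;

/* Booting with "userpte=nohigh" makes setup_userpte() clear the
 * highmem bit, so pte_alloc_one() allocates from lowmem instead. */
gfp &= ~__GFP_HIGHMEM;

struct page *pte = alloc_pages(gfp, 0);	/* order-0 page-table page */
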
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 46c8834aedc0..1a8faf09afed 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -6,7 +6,6 @@
6#include <linux/swap.h> 6#include <linux/swap.h>
7#include <linux/smp.h> 7#include <linux/smp.h>
8#include <linux/highmem.h> 8#include <linux/highmem.h>
9#include <linux/slab.h>
10#include <linux/pagemap.h> 9#include <linux/pagemap.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/module.h> 11#include <linux/module.h>
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a27124185fc1..28c68762648f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -229,9 +229,11 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); 229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
230 } 230 }
231 231
232 if (changed) 232 if (changed) {
233 node_set(node, cpu_nodes_parsed);
233 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 234 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
234 nd->start, nd->end); 235 nd->start, nd->end);
236 }
235} 237}
236 238
237/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 239/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 65b58e4b0b8b..426f3a1a64d3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -41,7 +41,7 @@ union smp_flush_state {
41 struct { 41 struct {
42 struct mm_struct *flush_mm; 42 struct mm_struct *flush_mm;
43 unsigned long flush_va; 43 unsigned long flush_va;
44 spinlock_t tlbstate_lock; 44 raw_spinlock_t tlbstate_lock;
45 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
46 }; 46 };
47 char pad[INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
@@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is 181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
182 * probably not worth checking this for a cache-hot lock. 182 * probably not worth checking this for a cache-hot lock.
183 */ 183 */
184 spin_lock(&f->tlbstate_lock); 184 raw_spin_lock(&f->tlbstate_lock);
185 185
186 f->flush_mm = mm; 186 f->flush_mm = mm;
187 f->flush_va = va; 187 f->flush_va = va;
@@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 199
200 f->flush_mm = NULL; 200 f->flush_mm = NULL;
201 f->flush_va = 0; 201 f->flush_va = 0;
202 spin_unlock(&f->tlbstate_lock); 202 raw_spin_unlock(&f->tlbstate_lock);
203} 203}
204 204
205void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void)
223 int i; 223 int i;
224 224
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 225 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 spin_lock_init(&flush_state[i].tlbstate_lock); 226 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 227
228 return 0; 228 return 0;
229} 229}
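
For context on the tlb.c conversion above: raw_spinlock_t is the lock flavour that always stays a busy-wait spinlock, even in configurations where plain spinlock_t may become a sleeping lock (e.g. preempt-rt), which is what this IPI-driven flush path needs since it runs with interrupts disabled. The API mirrors the plain one, as the hunks show; a minimal sketch:

raw_spinlock_t lock;

raw_spin_lock_init(&lock);

raw_spin_lock(&lock);
/* short, non-sleeping critical section, e.g. publishing flush_mm/flush_va */
raw_spin_unlock(&lock);
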
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cb88b1a0bd5f..2c505ee71014 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -159,7 +159,7 @@ static int nmi_setup_mux(void)
159 159
160 for_each_possible_cpu(i) { 160 for_each_possible_cpu(i) {
161 per_cpu(cpu_msrs, i).multiplex = 161 per_cpu(cpu_msrs, i).multiplex =
162 kmalloc(multiplex_size, GFP_KERNEL); 162 kzalloc(multiplex_size, GFP_KERNEL);
163 if (!per_cpu(cpu_msrs, i).multiplex) 163 if (!per_cpu(cpu_msrs, i).multiplex)
164 return 0; 164 return 0;
165 } 165 }
@@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
179 if (counter_config[i].enabled) { 179 if (counter_config[i].enabled) {
180 multiplex[i].saved = -(u64)counter_config[i].count; 180 multiplex[i].saved = -(u64)counter_config[i].count;
181 } else { 181 } else {
182 multiplex[i].addr = 0;
183 multiplex[i].saved = 0; 182 multiplex[i].saved = 0;
184 } 183 }
185 } 184 }
@@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
189 188
190static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) 189static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
191{ 190{
191 struct op_msr *counters = msrs->counters;
192 struct op_msr *multiplex = msrs->multiplex; 192 struct op_msr *multiplex = msrs->multiplex;
193 int i; 193 int i;
194 194
195 for (i = 0; i < model->num_counters; ++i) { 195 for (i = 0; i < model->num_counters; ++i) {
196 int virt = op_x86_phys_to_virt(i); 196 int virt = op_x86_phys_to_virt(i);
197 if (multiplex[virt].addr) 197 if (counters[i].addr)
198 rdmsrl(multiplex[virt].addr, multiplex[virt].saved); 198 rdmsrl(counters[i].addr, multiplex[virt].saved);
199 } 199 }
200} 200}
201 201
202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) 202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
203{ 203{
204 struct op_msr *counters = msrs->counters;
204 struct op_msr *multiplex = msrs->multiplex; 205 struct op_msr *multiplex = msrs->multiplex;
205 int i; 206 int i;
206 207
207 for (i = 0; i < model->num_counters; ++i) { 208 for (i = 0; i < model->num_counters; ++i) {
208 int virt = op_x86_phys_to_virt(i); 209 int virt = op_x86_phys_to_virt(i);
209 if (multiplex[virt].addr) 210 if (counters[i].addr)
210 wrmsrl(multiplex[virt].addr, multiplex[virt].saved); 211 wrmsrl(counters[i].addr, multiplex[virt].saved);
211 } 212 }
212} 213}
213 214
@@ -222,7 +223,7 @@ static void nmi_cpu_switch(void *dummy)
222 223
223 /* move to next set */ 224 /* move to next set */
224 si += model->num_counters; 225 si += model->num_counters;
225 if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) 226 if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
226 per_cpu(switch_index, cpu) = 0; 227 per_cpu(switch_index, cpu) = 0;
227 else 228 else
228 per_cpu(switch_index, cpu) = si; 229 per_cpu(switch_index, cpu) = si;
@@ -303,11 +304,11 @@ static int allocate_msrs(void)
303 304
304 int i; 305 int i;
305 for_each_possible_cpu(i) { 306 for_each_possible_cpu(i) {
306 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, 307 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
307 GFP_KERNEL); 308 GFP_KERNEL);
308 if (!per_cpu(cpu_msrs, i).counters) 309 if (!per_cpu(cpu_msrs, i).counters)
309 return 0; 310 return 0;
310 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, 311 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
311 GFP_KERNEL); 312 GFP_KERNEL);
312 if (!per_cpu(cpu_msrs, i).controls) 313 if (!per_cpu(cpu_msrs, i).controls)
313 return 0; 314 return 0;
@@ -598,6 +599,7 @@ static int __init ppro_init(char **cpu_type)
598 case 15: case 23: 599 case 15: case 23:
599 *cpu_type = "i386/core_2"; 600 *cpu_type = "i386/core_2";
600 break; 601 break;
602 case 0x2e:
601 case 26: 603 case 26:
602 spec = &op_arch_perfmon_spec; 604 spec = &op_arch_perfmon_spec;
603 *cpu_type = "i386/core_i7"; 605 *cpu_type = "i386/core_i7";
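
The kmalloc()-to-kzalloc() switches above are what let the explicit "multiplex[i].addr = 0" initialisation be dropped: a zeroed buffer already marks every MSR slot as unavailable. A sketch of the equivalence, with a hypothetical helper name:

#include <linux/slab.h>

/*
 * kzalloc(sz, flags) behaves like kmalloc(sz, flags) followed by
 * memset(p, 0, sz), so op_msr.addr starts out as 0 for any counter
 * that never gets reserved.
 */
static void *alloc_zeroed_msr_array(size_t size)
{
	return kzalloc(size, GFP_KERNEL);
}
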
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 39686c29f03a..090cbbec7dbd 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -22,6 +22,9 @@
22#include <asm/ptrace.h> 22#include <asm/ptrace.h>
23#include <asm/msr.h> 23#include <asm/msr.h>
24#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/apic.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
25 28
26#include "op_x86_model.h" 29#include "op_x86_model.h"
27#include "op_counter.h" 30#include "op_counter.h"
@@ -43,23 +46,10 @@
43 46
44static unsigned long reset_value[NUM_VIRT_COUNTERS]; 47static unsigned long reset_value[NUM_VIRT_COUNTERS];
45 48
46#ifdef CONFIG_OPROFILE_IBS
47
48/* IbsFetchCtl bits/masks */
49#define IBS_FETCH_RAND_EN (1ULL<<57)
50#define IBS_FETCH_VAL (1ULL<<49)
51#define IBS_FETCH_ENABLE (1ULL<<48)
52#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
53
54/*IbsOpCtl bits */
55#define IBS_OP_CNT_CTL (1ULL<<19)
56#define IBS_OP_VAL (1ULL<<18)
57#define IBS_OP_ENABLE (1ULL<<17)
58
59#define IBS_FETCH_SIZE 6 49#define IBS_FETCH_SIZE 6
60#define IBS_OP_SIZE 12 50#define IBS_OP_SIZE 12
61 51
62static int has_ibs; /* AMD Family10h and later */ 52static u32 ibs_caps;
63 53
64struct op_ibs_config { 54struct op_ibs_config {
65 unsigned long op_enabled; 55 unsigned long op_enabled;
@@ -71,24 +61,52 @@ struct op_ibs_config {
71}; 61};
72 62
73static struct op_ibs_config ibs_config; 63static struct op_ibs_config ibs_config;
64static u64 ibs_op_ctl;
74 65
75#endif 66/*
67 * IBS cpuid feature detection
68 */
76 69
77#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 70#define IBS_CPUID_FEATURES 0x8000001b
71
72/*
73 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
74 * bit 0 is used to indicate the existence of IBS.
75 */
76#define IBS_CAPS_AVAIL (1LL<<0)
77#define IBS_CAPS_RDWROPCNT (1LL<<3)
78#define IBS_CAPS_OPCNT (1LL<<4)
78 79
79static void op_mux_fill_in_addresses(struct op_msrs * const msrs) 80/*
81 * IBS randomization macros
82 */
83#define IBS_RANDOM_BITS 12
84#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
85#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
86
87static u32 get_ibs_caps(void)
80{ 88{
81 int i; 89 u32 ibs_caps;
90 unsigned int max_level;
82 91
83 for (i = 0; i < NUM_VIRT_COUNTERS; i++) { 92 if (!boot_cpu_has(X86_FEATURE_IBS))
84 int hw_counter = op_x86_virt_to_phys(i); 93 return 0;
85 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 94
86 msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; 95 /* check IBS cpuid feature flags */
87 else 96 max_level = cpuid_eax(0x80000000);
88 msrs->multiplex[i].addr = 0; 97 if (max_level < IBS_CPUID_FEATURES)
89 } 98 return IBS_CAPS_AVAIL;
99
100 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
101 if (!(ibs_caps & IBS_CAPS_AVAIL))
102 /* cpuid flags not valid */
103 return IBS_CAPS_AVAIL;
104
105 return ibs_caps;
90} 106}
91 107
108#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
109
92static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, 110static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
93 struct op_msrs const * const msrs) 111 struct op_msrs const * const msrs)
94{ 112{
@@ -98,7 +116,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
98 /* enable active counters */ 116 /* enable active counters */
99 for (i = 0; i < NUM_COUNTERS; ++i) { 117 for (i = 0; i < NUM_COUNTERS; ++i) {
100 int virt = op_x86_phys_to_virt(i); 118 int virt = op_x86_phys_to_virt(i);
101 if (!counter_config[virt].enabled) 119 if (!reset_value[virt])
102 continue; 120 continue;
103 rdmsrl(msrs->controls[i].addr, val); 121 rdmsrl(msrs->controls[i].addr, val);
104 val &= model->reserved; 122 val &= model->reserved;
@@ -107,10 +125,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
107 } 125 }
108} 126}
109 127
110#else
111
112static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
113
114#endif 128#endif
115 129
116/* functions for op_amd_spec */ 130/* functions for op_amd_spec */
@@ -122,18 +136,12 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
122 for (i = 0; i < NUM_COUNTERS; i++) { 136 for (i = 0; i < NUM_COUNTERS; i++) {
123 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 137 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
124 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; 138 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
125 else
126 msrs->counters[i].addr = 0;
127 } 139 }
128 140
129 for (i = 0; i < NUM_CONTROLS; i++) { 141 for (i = 0; i < NUM_CONTROLS; i++) {
130 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) 142 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
131 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; 143 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
132 else
133 msrs->controls[i].addr = 0;
134 } 144 }
135
136 op_mux_fill_in_addresses(msrs);
137} 145}
138 146
139static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, 147static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -144,7 +152,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
144 152
145 /* setup reset_value */ 153 /* setup reset_value */
146 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { 154 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
147 if (counter_config[i].enabled) 155 if (counter_config[i].enabled
156 && msrs->counters[op_x86_virt_to_phys(i)].addr)
148 reset_value[i] = counter_config[i].count; 157 reset_value[i] = counter_config[i].count;
149 else 158 else
150 reset_value[i] = 0; 159 reset_value[i] = 0;
@@ -152,9 +161,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
152 161
153 /* clear all counters */ 162 /* clear all counters */
154 for (i = 0; i < NUM_CONTROLS; ++i) { 163 for (i = 0; i < NUM_CONTROLS; ++i) {
155 if (unlikely(!msrs->controls[i].addr)) 164 if (unlikely(!msrs->controls[i].addr)) {
165 if (counter_config[i].enabled && !smp_processor_id())
166 /*
167 * counter is reserved, this is on all
168 * cpus, so report only for cpu #0
169 */
170 op_x86_warn_reserved(i);
156 continue; 171 continue;
172 }
157 rdmsrl(msrs->controls[i].addr, val); 173 rdmsrl(msrs->controls[i].addr, val);
174 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
175 op_x86_warn_in_use(i);
158 val &= model->reserved; 176 val &= model->reserved;
159 wrmsrl(msrs->controls[i].addr, val); 177 wrmsrl(msrs->controls[i].addr, val);
160 } 178 }
@@ -169,9 +187,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
169 /* enable active counters */ 187 /* enable active counters */
170 for (i = 0; i < NUM_COUNTERS; ++i) { 188 for (i = 0; i < NUM_COUNTERS; ++i) {
171 int virt = op_x86_phys_to_virt(i); 189 int virt = op_x86_phys_to_virt(i);
172 if (!counter_config[virt].enabled) 190 if (!reset_value[virt])
173 continue;
174 if (!msrs->counters[i].addr)
175 continue; 191 continue;
176 192
177 /* setup counter registers */ 193 /* setup counter registers */
@@ -185,7 +201,60 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
185 } 201 }
186} 202}
187 203
188#ifdef CONFIG_OPROFILE_IBS 204/*
205 * 16-bit Linear Feedback Shift Register (LFSR)
206 *
 207	 *
 208	 * Feedback polynomial = X^16 + X^14 + X^13 + X^11 + 1
209 */
210static unsigned int lfsr_random(void)
211{
212 static unsigned int lfsr_value = 0xF00D;
213 unsigned int bit;
214
215 /* Compute next bit to shift in */
216 bit = ((lfsr_value >> 0) ^
217 (lfsr_value >> 2) ^
218 (lfsr_value >> 3) ^
219 (lfsr_value >> 5)) & 0x0001;
220
221 /* Advance to next register value */
222 lfsr_value = (lfsr_value >> 1) | (bit << 15);
223
224 return lfsr_value;
225}
226
227/*
228 * IBS software randomization
229 *
230 * The IBS periodic op counter is randomized in software. The lower 12
231 * bits of the 20 bit counter are randomized. IbsOpCurCnt is
232 * initialized with a 12 bit random value.
233 */
234static inline u64 op_amd_randomize_ibs_op(u64 val)
235{
236 unsigned int random = lfsr_random();
237
238 if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
239 /*
240 * Work around if the hw can not write to IbsOpCurCnt
241 *
242 * Randomize the lower 8 bits of the 16 bit
243 * IbsOpMaxCnt [15:0] value in the range of -128 to
244 * +127 by adding/subtracting an offset to the
245 * maximum count (IbsOpMaxCnt).
246 *
247 * To avoid over or underflows and protect upper bits
248 * starting at bit 16, the initial value for
249 * IbsOpMaxCnt must fit in the range from 0x0081 to
250 * 0xff80.
251 */
252 val += (s8)(random >> 4);
253 else
254 val |= (u64)(random & IBS_RANDOM_MASK) << 32;
255
256 return val;
257}
189 258
190static inline void 259static inline void
191op_amd_handle_ibs(struct pt_regs * const regs, 260op_amd_handle_ibs(struct pt_regs * const regs,
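
The polynomial in the comment above (X^16 + X^14 + X^13 + X^11 + 1) corresponds to the taps at bits 0, 2, 3 and 5 of the right-shifted register, and the consumer takes (s8)(random >> 4), i.e. bits 11..4 interpreted as a signed byte, which lands in [-128, 127]. A stand-alone user-space sketch of the same generator (illustration only, not kernel code):

#include <stdio.h>

static unsigned int lfsr_value = 0xF00D;

static unsigned int lfsr_random(void)
{
	unsigned int bit = ((lfsr_value >> 0) ^ (lfsr_value >> 2) ^
			    (lfsr_value >> 3) ^ (lfsr_value >> 5)) & 0x0001;

	lfsr_value = (lfsr_value >> 1) | (bit << 15);
	return lfsr_value;
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int r = lfsr_random();
		printf("lfsr=0x%04x  signed offset=%d\n",
		       r, (int)(signed char)(r >> 4));
	}
	return 0;
}
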
@@ -194,7 +263,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
194 u64 val, ctl; 263 u64 val, ctl;
195 struct op_entry entry; 264 struct op_entry entry;
196 265
197 if (!has_ibs) 266 if (!ibs_caps)
198 return; 267 return;
199 268
200 if (ibs_config.fetch_enabled) { 269 if (ibs_config.fetch_enabled) {
@@ -210,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
210 oprofile_write_commit(&entry); 279 oprofile_write_commit(&entry);
211 280
212 /* reenable the IRQ */ 281 /* reenable the IRQ */
213 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); 282 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
214 ctl |= IBS_FETCH_ENABLE; 283 ctl |= IBS_FETCH_ENABLE;
215 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); 284 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
216 } 285 }
@@ -236,8 +305,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
236 oprofile_write_commit(&entry); 305 oprofile_write_commit(&entry);
237 306
238 /* reenable the IRQ */ 307 /* reenable the IRQ */
239 ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; 308 ctl = op_amd_randomize_ibs_op(ibs_op_ctl);
240 ctl |= IBS_OP_ENABLE;
241 wrmsrl(MSR_AMD64_IBSOPCTL, ctl); 309 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
242 } 310 }
243 } 311 }
@@ -246,41 +314,57 @@ op_amd_handle_ibs(struct pt_regs * const regs,
246static inline void op_amd_start_ibs(void) 314static inline void op_amd_start_ibs(void)
247{ 315{
248 u64 val; 316 u64 val;
249 if (has_ibs && ibs_config.fetch_enabled) { 317
250 val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; 318 if (!ibs_caps)
319 return;
320
321 if (ibs_config.fetch_enabled) {
322 val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT;
251 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; 323 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
252 val |= IBS_FETCH_ENABLE; 324 val |= IBS_FETCH_ENABLE;
253 wrmsrl(MSR_AMD64_IBSFETCHCTL, val); 325 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
254 } 326 }
255 327
256 if (has_ibs && ibs_config.op_enabled) { 328 if (ibs_config.op_enabled) {
257 val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; 329 ibs_op_ctl = ibs_config.max_cnt_op >> 4;
258 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; 330 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
259 val |= IBS_OP_ENABLE; 331 /*
332 * IbsOpCurCnt not supported. See
333 * op_amd_randomize_ibs_op() for details.
334 */
335 ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL);
336 } else {
337 /*
338 * The start value is randomized with a
 339	 * positive offset; we need to compensate for it
 340	 * with half of the randomized range. Also
341 * avoid underflows.
342 */
343 ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET,
344 IBS_OP_MAX_CNT);
345 }
346 if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops)
347 ibs_op_ctl |= IBS_OP_CNT_CTL;
348 ibs_op_ctl |= IBS_OP_ENABLE;
349 val = op_amd_randomize_ibs_op(ibs_op_ctl);
260 wrmsrl(MSR_AMD64_IBSOPCTL, val); 350 wrmsrl(MSR_AMD64_IBSOPCTL, val);
261 } 351 }
262} 352}
263 353
264static void op_amd_stop_ibs(void) 354static void op_amd_stop_ibs(void)
265{ 355{
266 if (has_ibs && ibs_config.fetch_enabled) 356 if (!ibs_caps)
357 return;
358
359 if (ibs_config.fetch_enabled)
267 /* clear max count and enable */ 360 /* clear max count and enable */
268 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); 361 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
269 362
270 if (has_ibs && ibs_config.op_enabled) 363 if (ibs_config.op_enabled)
271 /* clear max count and enable */ 364 /* clear max count and enable */
272 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 365 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
273} 366}
274 367
275#else
276
277static inline void op_amd_handle_ibs(struct pt_regs * const regs,
278 struct op_msrs const * const msrs) { }
279static inline void op_amd_start_ibs(void) { }
280static inline void op_amd_stop_ibs(void) { }
281
282#endif
283
284static int op_amd_check_ctrs(struct pt_regs * const regs, 368static int op_amd_check_ctrs(struct pt_regs * const regs,
285 struct op_msrs const * const msrs) 369 struct op_msrs const * const msrs)
286{ 370{
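
As a worked example of the op_amd_start_ibs() arithmetic above, take the default max_cnt_op of 250000 that setup_ibs_files() installs later in this patch (the exact value of the IBS_OP_MAX_CNT cap is defined elsewhere and not repeated here):

  ibs_op_ctl = 250000 >> 4                         = 15625 = 0x3d09
  no IBS_CAPS_RDWROPCNT:  clamp(0x3d09, 0x0081, 0xff80) = 0x3d09 (in range)
  with IBS_CAPS_RDWROPCNT: min(0x3d09 + IBS_RANDOM_MAXCNT_OFFSET(0x80),
                               IBS_OP_MAX_CNT)     = 0x3d89 unless capped
  IBS_OP_CNT_CTL (if OPCNT-capable and requested) and IBS_OP_ENABLE are
  then OR-ed in, and op_amd_randomize_ibs_op() perturbs the value before
  the wrmsrl() to MSR_AMD64_IBSOPCTL.
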
@@ -314,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
314 if (!reset_value[op_x86_phys_to_virt(i)]) 398 if (!reset_value[op_x86_phys_to_virt(i)])
315 continue; 399 continue;
316 rdmsrl(msrs->controls[i].addr, val); 400 rdmsrl(msrs->controls[i].addr, val);
317 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 401 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
318 wrmsrl(msrs->controls[i].addr, val); 402 wrmsrl(msrs->controls[i].addr, val);
319 } 403 }
320 404
@@ -334,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
334 if (!reset_value[op_x86_phys_to_virt(i)]) 418 if (!reset_value[op_x86_phys_to_virt(i)])
335 continue; 419 continue;
336 rdmsrl(msrs->controls[i].addr, val); 420 rdmsrl(msrs->controls[i].addr, val);
337 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 421 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
338 wrmsrl(msrs->controls[i].addr, val); 422 wrmsrl(msrs->controls[i].addr, val);
339 } 423 }
340 424
@@ -355,8 +439,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
355 } 439 }
356} 440}
357 441
358#ifdef CONFIG_OPROFILE_IBS
359
360static u8 ibs_eilvt_off; 442static u8 ibs_eilvt_off;
361 443
362static inline void apic_init_ibs_nmi_per_cpu(void *arg) 444static inline void apic_init_ibs_nmi_per_cpu(void *arg)
@@ -405,45 +487,36 @@ static int init_ibs_nmi(void)
405 return 1; 487 return 1;
406 } 488 }
407 489
408#ifdef CONFIG_NUMA
409 /* Sanity check */
410 /* Works only for 64bit with proper numa implementation. */
411 if (nodes != num_possible_nodes()) {
412 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
413 "found: %d, expected %d",
414 nodes, num_possible_nodes());
415 return 1;
416 }
417#endif
418 return 0; 490 return 0;
419} 491}
420 492
421/* uninitialize the APIC for the IBS interrupts if needed */ 493/* uninitialize the APIC for the IBS interrupts if needed */
422static void clear_ibs_nmi(void) 494static void clear_ibs_nmi(void)
423{ 495{
424 if (has_ibs) 496 if (ibs_caps)
425 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); 497 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
426} 498}
427 499
428/* initialize the APIC for the IBS interrupts if available */ 500/* initialize the APIC for the IBS interrupts if available */
429static void ibs_init(void) 501static void ibs_init(void)
430{ 502{
431 has_ibs = boot_cpu_has(X86_FEATURE_IBS); 503 ibs_caps = get_ibs_caps();
432 504
433 if (!has_ibs) 505 if (!ibs_caps)
434 return; 506 return;
435 507
436 if (init_ibs_nmi()) { 508 if (init_ibs_nmi()) {
437 has_ibs = 0; 509 ibs_caps = 0;
438 return; 510 return;
439 } 511 }
440 512
441 printk(KERN_INFO "oprofile: AMD IBS detected\n"); 513 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
514 (unsigned)ibs_caps);
442} 515}
443 516
444static void ibs_exit(void) 517static void ibs_exit(void)
445{ 518{
446 if (!has_ibs) 519 if (!ibs_caps)
447 return; 520 return;
448 521
449 clear_ibs_nmi(); 522 clear_ibs_nmi();
@@ -463,7 +536,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
463 if (ret) 536 if (ret)
464 return ret; 537 return ret;
465 538
466 if (!has_ibs) 539 if (!ibs_caps)
467 return ret; 540 return ret;
468 541
469 /* model specific files */ 542 /* model specific files */
@@ -473,7 +546,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
473 ibs_config.fetch_enabled = 0; 546 ibs_config.fetch_enabled = 0;
474 ibs_config.max_cnt_op = 250000; 547 ibs_config.max_cnt_op = 250000;
475 ibs_config.op_enabled = 0; 548 ibs_config.op_enabled = 0;
476 ibs_config.dispatched_ops = 1; 549 ibs_config.dispatched_ops = 0;
477 550
478 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 551 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
479 oprofilefs_create_ulong(sb, dir, "enable", 552 oprofilefs_create_ulong(sb, dir, "enable",
@@ -488,8 +561,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
488 &ibs_config.op_enabled); 561 &ibs_config.op_enabled);
489 oprofilefs_create_ulong(sb, dir, "max_count", 562 oprofilefs_create_ulong(sb, dir, "max_count",
490 &ibs_config.max_cnt_op); 563 &ibs_config.max_cnt_op);
491 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 564 if (ibs_caps & IBS_CAPS_OPCNT)
492 &ibs_config.dispatched_ops); 565 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
566 &ibs_config.dispatched_ops);
493 567
494 return 0; 568 return 0;
495} 569}
@@ -507,19 +581,6 @@ static void op_amd_exit(void)
507 ibs_exit(); 581 ibs_exit();
508} 582}
509 583
510#else
511
512/* no IBS support */
513
514static int op_amd_init(struct oprofile_operations *ops)
515{
516 return 0;
517}
518
519static void op_amd_exit(void) {}
520
521#endif /* CONFIG_OPROFILE_IBS */
522
523struct op_x86_model_spec op_amd_spec = { 584struct op_x86_model_spec op_amd_spec = {
524 .num_counters = NUM_COUNTERS, 585 .num_counters = NUM_COUNTERS,
525 .num_controls = NUM_CONTROLS, 586 .num_controls = NUM_CONTROLS,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index ac6b354becdf..e6a160a4684a 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
394 setup_num_counters(); 394 setup_num_counters();
395 stag = get_stagger(); 395 stag = get_stagger();
396 396
397 /* initialize some registers */
398 for (i = 0; i < num_counters; ++i)
399 msrs->counters[i].addr = 0;
400 for (i = 0; i < num_controls; ++i)
401 msrs->controls[i].addr = 0;
402
403 /* the counter & cccr registers we pay attention to */ 397 /* the counter & cccr registers we pay attention to */
404 for (i = 0; i < num_counters; ++i) { 398 for (i = 0; i < num_counters; ++i) {
405 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 399 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 8eb05878554c..2bf90fafa7b5 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
37 for (i = 0; i < num_counters; i++) { 37 for (i = 0; i < num_counters; i++) {
38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
40 else
41 msrs->counters[i].addr = 0;
42 } 40 }
43 41
44 for (i = 0; i < num_counters; i++) { 42 for (i = 0; i < num_counters; i++) {
45 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 43 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
46 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 44 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
47 else
48 msrs->controls[i].addr = 0;
49 } 45 }
50} 46}
51 47
@@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
57 int i; 53 int i;
58 54
59 if (!reset_value) { 55 if (!reset_value) {
60 reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, 56 reset_value = kzalloc(sizeof(reset_value[0]) * num_counters,
61 GFP_ATOMIC); 57 GFP_ATOMIC);
62 if (!reset_value) 58 if (!reset_value)
63 return; 59 return;
@@ -82,9 +78,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
82 78
83 /* clear all counters */ 79 /* clear all counters */
84 for (i = 0; i < num_counters; ++i) { 80 for (i = 0; i < num_counters; ++i) {
85 if (unlikely(!msrs->controls[i].addr)) 81 if (unlikely(!msrs->controls[i].addr)) {
82 if (counter_config[i].enabled && !smp_processor_id())
83 /*
84 * counter is reserved, this is on all
85 * cpus, so report only for cpu #0
86 */
87 op_x86_warn_reserved(i);
86 continue; 88 continue;
89 }
87 rdmsrl(msrs->controls[i].addr, val); 90 rdmsrl(msrs->controls[i].addr, val);
91 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
92 op_x86_warn_in_use(i);
88 val &= model->reserved; 93 val &= model->reserved;
89 wrmsrl(msrs->controls[i].addr, val); 94 wrmsrl(msrs->controls[i].addr, val);
90 } 95 }
@@ -161,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs)
161 for (i = 0; i < num_counters; ++i) { 166 for (i = 0; i < num_counters; ++i) {
162 if (reset_value[i]) { 167 if (reset_value[i]) {
163 rdmsrl(msrs->controls[i].addr, val); 168 rdmsrl(msrs->controls[i].addr, val);
164 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 169 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
165 wrmsrl(msrs->controls[i].addr, val); 170 wrmsrl(msrs->controls[i].addr, val);
166 } 171 }
167 } 172 }
@@ -179,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs)
179 if (!reset_value[i]) 184 if (!reset_value[i])
180 continue; 185 continue;
181 rdmsrl(msrs->controls[i].addr, val); 186 rdmsrl(msrs->controls[i].addr, val);
182 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 187 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
183 wrmsrl(msrs->controls[i].addr, val); 188 wrmsrl(msrs->controls[i].addr, val);
184 } 189 }
185} 190}
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 7b8e75d16081..ff82a755edd4 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -57,6 +57,26 @@ struct op_x86_model_spec {
57 57
58struct op_counter_config; 58struct op_counter_config;
59 59
60static inline void op_x86_warn_in_use(int counter)
61{
62 /*
63 * The warning indicates an already running counter. If
64 * oprofile doesn't collect data, then try using a different
65 * performance counter on your platform to monitor the desired
66 * event. Delete counter #%d from the desired event by editing
67 * the /usr/share/oprofile/%s/<cpu>/events file. If the event
68 * cannot be monitored by any other counter, contact your
69 * hardware or BIOS vendor.
70 */
71 pr_warning("oprofile: counter #%d on cpu #%d may already be used\n",
72 counter, smp_processor_id());
73}
74
75static inline void op_x86_warn_reserved(int counter)
76{
77 pr_warning("oprofile: counter #%d is already reserved\n", counter);
78}
79
60extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, 80extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
61 struct op_counter_config *counter_config); 81 struct op_counter_config *counter_config);
62extern int op_x86_phys_to_virt(int phys); 82extern int op_x86_phys_to_virt(int phys);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 564b008a51c7..b110d97fb925 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -13,9 +13,10 @@ obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-$(CONFIG_X86_MRST) += mrst.o
17
16obj-y += common.o early.o 18obj-y += common.o early.o
17obj-y += amd_bus.o 19obj-y += amd_bus.o bus_numa.o
18obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o
19 20
20ifeq ($(CONFIG_PCI_DEBUG),y) 21ifeq ($(CONFIG_PCI_DEBUG),y)
21EXTRA_CFLAGS += -DDEBUG 22EXTRA_CFLAGS += -DDEBUG
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 959e548a7039..44f83ce02470 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/dmi.h> 5#include <linux/dmi.h>
6#include <linux/slab.h>
6#include <asm/numa.h> 7#include <asm/numa.h>
7#include <asm/pci_x86.h> 8#include <asm/pci_x86.h>
8 9
@@ -15,19 +16,94 @@ struct pci_root_info {
15 int busnum; 16 int busnum;
16}; 17};
17 18
19static bool pci_use_crs = true;
20
21static int __init set_use_crs(const struct dmi_system_id *id)
22{
23 pci_use_crs = true;
24 return 0;
25}
26
27static const struct dmi_system_id pci_use_crs_table[] __initconst = {
28 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
29 {
30 .callback = set_use_crs,
31 .ident = "IBM System x3800",
32 .matches = {
33 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
35 },
36 },
37 {}
38};
39
40void __init pci_acpi_crs_quirks(void)
41{
42 int year;
43
44 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008)
45 pci_use_crs = false;
46
47 dmi_check_system(pci_use_crs_table);
48
49 /*
50 * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that
51 * takes precedence over anything we figured out above.
52 */
53 if (pci_probe & PCI_ROOT_NO_CRS)
54 pci_use_crs = false;
55 else if (pci_probe & PCI_USE__CRS)
56 pci_use_crs = true;
57
58 printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
59 "if necessary, use \"pci=%s\" and report a bug\n",
60 pci_use_crs ? "Using" : "Ignoring",
61 pci_use_crs ? "nocrs" : "use_crs");
62}
63
18static acpi_status 64static acpi_status
19resource_to_addr(struct acpi_resource *resource, 65resource_to_addr(struct acpi_resource *resource,
20 struct acpi_resource_address64 *addr) 66 struct acpi_resource_address64 *addr)
21{ 67{
22 acpi_status status; 68 acpi_status status;
23 69 struct acpi_resource_memory24 *memory24;
24 status = acpi_resource_to_address64(resource, addr); 70 struct acpi_resource_memory32 *memory32;
25 if (ACPI_SUCCESS(status) && 71 struct acpi_resource_fixed_memory32 *fixed_memory32;
26 (addr->resource_type == ACPI_MEMORY_RANGE || 72
27 addr->resource_type == ACPI_IO_RANGE) && 73 memset(addr, 0, sizeof(*addr));
28 addr->address_length > 0 && 74 switch (resource->type) {
29 addr->producer_consumer == ACPI_PRODUCER) { 75 case ACPI_RESOURCE_TYPE_MEMORY24:
76 memory24 = &resource->data.memory24;
77 addr->resource_type = ACPI_MEMORY_RANGE;
78 addr->minimum = memory24->minimum;
79 addr->address_length = memory24->address_length;
80 addr->maximum = addr->minimum + addr->address_length - 1;
30 return AE_OK; 81 return AE_OK;
82 case ACPI_RESOURCE_TYPE_MEMORY32:
83 memory32 = &resource->data.memory32;
84 addr->resource_type = ACPI_MEMORY_RANGE;
85 addr->minimum = memory32->minimum;
86 addr->address_length = memory32->address_length;
87 addr->maximum = addr->minimum + addr->address_length - 1;
88 return AE_OK;
89 case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
90 fixed_memory32 = &resource->data.fixed_memory32;
91 addr->resource_type = ACPI_MEMORY_RANGE;
92 addr->minimum = fixed_memory32->address;
93 addr->address_length = fixed_memory32->address_length;
94 addr->maximum = addr->minimum + addr->address_length - 1;
95 return AE_OK;
96 case ACPI_RESOURCE_TYPE_ADDRESS16:
97 case ACPI_RESOURCE_TYPE_ADDRESS32:
98 case ACPI_RESOURCE_TYPE_ADDRESS64:
99 status = acpi_resource_to_address64(resource, addr);
100 if (ACPI_SUCCESS(status) &&
101 (addr->resource_type == ACPI_MEMORY_RANGE ||
102 addr->resource_type == ACPI_IO_RANGE) &&
103 addr->address_length > 0) {
104 return AE_OK;
105 }
106 break;
31 } 107 }
32 return AE_ERROR; 108 return AE_ERROR;
33} 109}
@@ -45,20 +121,6 @@ count_resource(struct acpi_resource *acpi_res, void *data)
45 return AE_OK; 121 return AE_OK;
46} 122}
47 123
48static int
49bus_has_transparent_bridge(struct pci_bus *bus)
50{
51 struct pci_dev *dev;
52
53 list_for_each_entry(dev, &bus->devices, bus_list) {
54 u16 class = dev->class >> 8;
55
56 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
57 return true;
58 }
59 return false;
60}
61
62static void 124static void
63align_resource(struct acpi_device *bridge, struct resource *res) 125align_resource(struct acpi_device *bridge, struct resource *res)
64{ 126{
@@ -91,12 +153,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
91 struct acpi_resource_address64 addr; 153 struct acpi_resource_address64 addr;
92 acpi_status status; 154 acpi_status status;
93 unsigned long flags; 155 unsigned long flags;
94 struct resource *root; 156 struct resource *root, *conflict;
95 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; 157 u64 start, end, max_len;
96 u64 start, end;
97
98 if (bus_has_transparent_bridge(info->bus))
99 max_root_bus_resources -= 3;
100 158
101 status = resource_to_addr(acpi_res, &addr); 159 status = resource_to_addr(acpi_res, &addr);
102 if (!ACPI_SUCCESS(status)) 160 if (!ACPI_SUCCESS(status))
@@ -113,17 +171,19 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
113 } else 171 } else
114 return AE_OK; 172 return AE_OK;
115 173
174 max_len = addr.maximum - addr.minimum + 1;
175 if (addr.address_length > max_len) {
176 dev_printk(KERN_DEBUG, &info->bridge->dev,
177 "host bridge window length %#llx doesn't fit in "
178 "%#llx-%#llx, trimming\n",
179 (unsigned long long) addr.address_length,
180 (unsigned long long) addr.minimum,
181 (unsigned long long) addr.maximum);
182 addr.address_length = max_len;
183 }
184
116 start = addr.minimum + addr.translation_offset; 185 start = addr.minimum + addr.translation_offset;
117 end = start + addr.address_length - 1; 186 end = start + addr.address_length - 1;
118 if (info->res_num >= max_root_bus_resources) {
119 if (pci_probe & PCI_USE__CRS)
120 printk(KERN_WARNING "PCI: Failed to allocate "
121 "0x%lx-0x%lx from %s for %s due to _CRS "
122 "returning more than %d resource descriptors\n",
123 (unsigned long) start, (unsigned long) end,
124 root->name, info->name, max_root_bus_resources);
125 return AE_OK;
126 }
127 187
128 res = &info->res[info->res_num]; 188 res = &info->res[info->res_num];
129 res->name = info->name; 189 res->name = info->name;
@@ -133,17 +193,20 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
133 res->child = NULL; 193 res->child = NULL;
134 align_resource(info->bridge, res); 194 align_resource(info->bridge, res);
135 195
136 if (!(pci_probe & PCI_USE__CRS)) { 196 if (!pci_use_crs) {
137 dev_printk(KERN_DEBUG, &info->bridge->dev, 197 dev_printk(KERN_DEBUG, &info->bridge->dev,
138 "host bridge window %pR (ignored)\n", res); 198 "host bridge window %pR (ignored)\n", res);
139 return AE_OK; 199 return AE_OK;
140 } 200 }
141 201
142 if (insert_resource(root, res)) { 202 conflict = insert_resource_conflict(root, res);
203 if (conflict) {
143 dev_err(&info->bridge->dev, 204 dev_err(&info->bridge->dev,
144 "can't allocate host bridge window %pR\n", res); 205 "address space collision: host bridge window %pR "
206 "conflicts with %s %pR\n",
207 res, conflict->name, conflict);
145 } else { 208 } else {
146 info->bus->resource[info->res_num] = res; 209 pci_bus_add_resource(info->bus, res, 0);
147 info->res_num++; 210 info->res_num++;
148 if (addr.translation_offset) 211 if (addr.translation_offset)
149 dev_info(&info->bridge->dev, "host bridge window %pR " 212 dev_info(&info->bridge->dev, "host bridge window %pR "
@@ -164,10 +227,8 @@ get_current_resources(struct acpi_device *device, int busnum,
164 struct pci_root_info info; 227 struct pci_root_info info;
165 size_t size; 228 size_t size;
166 229
167 if (!(pci_probe & PCI_USE__CRS)) 230 if (pci_use_crs)
168 dev_info(&device->dev, 231 pci_bus_remove_resources(bus);
169 "ignoring host bridge windows from ACPI; "
170 "boot with \"pci=use_crs\" to use them\n");
171 232
172 info.bridge = device; 233 info.bridge = device;
173 info.bus = bus; 234 info.bus = bus;
@@ -282,17 +343,14 @@ int __init pci_acpi_init(void)
282{ 343{
283 struct pci_dev *dev = NULL; 344 struct pci_dev *dev = NULL;
284 345
285 if (pcibios_scanned)
286 return 0;
287
288 if (acpi_noirq) 346 if (acpi_noirq)
289 return 0; 347 return -ENODEV;
290 348
291 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); 349 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
292 acpi_irq_penalty_init(); 350 acpi_irq_penalty_init();
293 pcibios_scanned++;
294 pcibios_enable_irq = acpi_pci_irq_enable; 351 pcibios_enable_irq = acpi_pci_irq_enable;
295 pcibios_disable_irq = acpi_pci_irq_disable; 352 pcibios_disable_irq = acpi_pci_irq_disable;
353 x86_init.pci.init_irq = x86_init_noop;
296 354
297 if (pci_routeirq) { 355 if (pci_routeirq) {
298 /* 356 /*
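
A short worked example of the two conversions added to resource_to_addr() and setup_resource() above (all values made up for illustration):

  ACPI_RESOURCE_TYPE_FIXED_MEMORY32, address 0xfed00000, length 0x400:
      minimum = 0xfed00000, address_length = 0x400,
      maximum = 0xfed00000 + 0x400 - 1 = 0xfed003ff

  address descriptor with minimum 0xa0000, maximum 0xbffff and an
  inconsistent address_length of 0x30000:
      max_len = 0xbffff - 0xa0000 + 1 = 0x20000 < 0x30000,
      so the window is trimmed to 0x20000 and the "trimming" message is
      printed before the resource is inserted.
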
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 95ecbd495955..fc1e8fe07e5c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,11 +2,11 @@
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/range.h>
6
5#include <asm/pci_x86.h> 7#include <asm/pci_x86.h>
6 8
7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 9#include <asm/pci-direct.h>
9#endif
10 10
11#include "bus_numa.h" 11#include "bus_numa.h"
12 12
@@ -15,60 +15,6 @@
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
16 */ 16 */
17 17
18#ifdef CONFIG_X86_64
19
20#define RANGE_NUM 16
21
22struct res_range {
23 size_t start;
24 size_t end;
25};
26
27static void __init update_range(struct res_range *range, size_t start,
28 size_t end)
29{
30 int i;
31 int j;
32
33 for (j = 0; j < RANGE_NUM; j++) {
34 if (!range[j].end)
35 continue;
36
37 if (start <= range[j].start && end >= range[j].end) {
38 range[j].start = 0;
39 range[j].end = 0;
40 continue;
41 }
42
43 if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
44 range[j].start = end + 1;
45 continue;
46 }
47
48
49 if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
50 range[j].end = start - 1;
51 continue;
52 }
53
54 if (start > range[j].start && end < range[j].end) {
55 /* find the new spare */
56 for (i = 0; i < RANGE_NUM; i++) {
57 if (range[i].end == 0)
58 break;
59 }
60 if (i < RANGE_NUM) {
61 range[i].end = range[j].end;
62 range[i].start = end + 1;
63 } else {
64 printk(KERN_ERR "run of slot in ranges\n");
65 }
66 range[j].end = start - 1;
67 continue;
68 }
69 }
70}
71
72struct pci_hostbridge_probe { 18struct pci_hostbridge_probe {
73 u32 bus; 19 u32 bus;
74 u32 slot; 20 u32 slot;
@@ -111,6 +57,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void)
111 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; 57 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
112} 58}
113 59
60#define RANGE_NUM 16
61
114/** 62/**
115 * early_fill_mp_bus_to_node() 63 * early_fill_mp_bus_to_node()
116 * called before pcibios_scan_root and pci_scan_bus 64 * called before pcibios_scan_root and pci_scan_bus
@@ -130,16 +78,17 @@ static int __init early_fill_mp_bus_info(void)
130 struct pci_root_info *info; 78 struct pci_root_info *info;
131 u32 reg; 79 u32 reg;
132 struct resource *res; 80 struct resource *res;
133 size_t start; 81 u64 start;
134 size_t end; 82 u64 end;
135 struct res_range range[RANGE_NUM]; 83 struct range range[RANGE_NUM];
136 u64 val; 84 u64 val;
137 u32 address; 85 u32 address;
86 bool found;
138 87
139 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
140 return -1; 89 return -1;
141 90
142 found_all_numa_early = 0; 91 found = false;
143 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 92 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
144 u32 id; 93 u32 id;
145 u16 device; 94 u16 device;
@@ -153,12 +102,12 @@ static int __init early_fill_mp_bus_info(void)
153 device = (id>>16) & 0xffff; 102 device = (id>>16) & 0xffff;
154 if (pci_probes[i].vendor == vendor && 103 if (pci_probes[i].vendor == vendor &&
155 pci_probes[i].device == device) { 104 pci_probes[i].device == device) {
156 found_all_numa_early = 1; 105 found = true;
157 break; 106 break;
158 } 107 }
159 } 108 }
160 109
161 if (!found_all_numa_early) 110 if (!found)
162 return 0; 111 return 0;
163 112
164 pci_root_num = 0; 113 pci_root_num = 0;
@@ -196,7 +145,7 @@ static int __init early_fill_mp_bus_info(void)
196 def_link = (reg >> 8) & 0x03; 145 def_link = (reg >> 8) & 0x03;
197 146
198 memset(range, 0, sizeof(range)); 147 memset(range, 0, sizeof(range));
199 range[0].end = 0xffff; 148 add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
200 /* io port resource */ 149 /* io port resource */
201 for (i = 0; i < 4; i++) { 150 for (i = 0; i < 4; i++) {
202 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); 151 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
@@ -220,13 +169,13 @@ static int __init early_fill_mp_bus_info(void)
220 169
221 info = &pci_root_info[j]; 170 info = &pci_root_info[j];
222 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", 171 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
223 node, link, (u64)start, (u64)end); 172 node, link, start, end);
224 173
225 /* kernel only handle 16 bit only */ 174 /* kernel only handle 16 bit only */
226 if (end > 0xffff) 175 if (end > 0xffff)
227 end = 0xffff; 176 end = 0xffff;
228 update_res(info, start, end, IORESOURCE_IO, 1); 177 update_res(info, start, end, IORESOURCE_IO, 1);
229 update_range(range, start, end); 178 subtract_range(range, RANGE_NUM, start, end + 1);
230 } 179 }
231 /* add left over io port range to def node/link, [0, 0xffff] */ 180 /* add left over io port range to def node/link, [0, 0xffff] */
232 /* find the position */ 181 /* find the position */
@@ -241,29 +190,32 @@ static int __init early_fill_mp_bus_info(void)
241 if (!range[i].end) 190 if (!range[i].end)
242 continue; 191 continue;
243 192
244 update_res(info, range[i].start, range[i].end, 193 update_res(info, range[i].start, range[i].end - 1,
245 IORESOURCE_IO, 1); 194 IORESOURCE_IO, 1);
246 } 195 }
247 } 196 }
248 197
249 memset(range, 0, sizeof(range)); 198 memset(range, 0, sizeof(range));
250 /* 0xfd00000000-0xffffffffff for HT */ 199 /* 0xfd00000000-0xffffffffff for HT */
251 range[0].end = (0xfdULL<<32) - 1; 200 end = cap_resource((0xfdULL<<32) - 1);
201 end++;
202 add_range(range, RANGE_NUM, 0, 0, end);
252 203
253 /* need to take out [0, TOM) for RAM*/ 204 /* need to take out [0, TOM) for RAM*/
254 address = MSR_K8_TOP_MEM1; 205 address = MSR_K8_TOP_MEM1;
255 rdmsrl(address, val); 206 rdmsrl(address, val);
256 end = (val & 0xffffff800000ULL); 207 end = (val & 0xffffff800000ULL);
257 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); 208 printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
258 if (end < (1ULL<<32)) 209 if (end < (1ULL<<32))
259 update_range(range, 0, end - 1); 210 subtract_range(range, RANGE_NUM, 0, end);
260 211
261 /* get mmconfig */ 212 /* get mmconfig */
262 get_pci_mmcfg_amd_fam10h_range(); 213 get_pci_mmcfg_amd_fam10h_range();
263 /* need to take out mmconf range */ 214 /* need to take out mmconf range */
264 if (fam10h_mmconf_end) { 215 if (fam10h_mmconf_end) {
265 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); 216 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
266 update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); 217 subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
218 fam10h_mmconf_end + 1);
267 } 219 }
268 220
269 /* mmio resource */ 221 /* mmio resource */
@@ -293,7 +245,7 @@ static int __init early_fill_mp_bus_info(void)
293 info = &pci_root_info[j]; 245 info = &pci_root_info[j];
294 246
295 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", 247 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
296 node, link, (u64)start, (u64)end); 248 node, link, start, end);
297 /* 249 /*
298 * some sick allocation would have range overlap with fam10h 250 * some sick allocation would have range overlap with fam10h
299 * mmconf range, so need to update start and end. 251 * mmconf range, so need to update start and end.
@@ -318,14 +270,15 @@ static int __init early_fill_mp_bus_info(void)
318 /* we got a hole */ 270 /* we got a hole */
319 endx = fam10h_mmconf_start - 1; 271 endx = fam10h_mmconf_start - 1;
320 update_res(info, start, endx, IORESOURCE_MEM, 0); 272 update_res(info, start, endx, IORESOURCE_MEM, 0);
321 update_range(range, start, endx); 273 subtract_range(range, RANGE_NUM, start,
322 printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); 274 endx + 1);
275 printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
323 start = fam10h_mmconf_end + 1; 276 start = fam10h_mmconf_end + 1;
324 changed = 1; 277 changed = 1;
325 } 278 }
326 if (changed) { 279 if (changed) {
327 if (start <= end) { 280 if (start <= end) {
328 printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); 281 printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end);
329 } else { 282 } else {
330 printk(KERN_CONT "%s\n", endx?"":" ==> none"); 283 printk(KERN_CONT "%s\n", endx?"":" ==> none");
331 continue; 284 continue;
@@ -333,8 +286,9 @@ static int __init early_fill_mp_bus_info(void)
333 } 286 }
334 } 287 }
335 288
336 update_res(info, start, end, IORESOURCE_MEM, 1); 289 update_res(info, cap_resource(start), cap_resource(end),
337 update_range(range, start, end); 290 IORESOURCE_MEM, 1);
291 subtract_range(range, RANGE_NUM, start, end + 1);
338 printk(KERN_CONT "\n"); 292 printk(KERN_CONT "\n");
339 } 293 }
340 294
@@ -348,8 +302,8 @@ static int __init early_fill_mp_bus_info(void)
348 address = MSR_K8_TOP_MEM2; 302 address = MSR_K8_TOP_MEM2;
349 rdmsrl(address, val); 303 rdmsrl(address, val);
350 end = (val & 0xffffff800000ULL); 304 end = (val & 0xffffff800000ULL);
351 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); 305 printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
352 update_range(range, 1ULL<<32, end - 1); 306 subtract_range(range, RANGE_NUM, 1ULL<<32, end);
353 } 307 }
354 308
355 /* 309 /*
@@ -368,7 +322,8 @@ static int __init early_fill_mp_bus_info(void)
368 if (!range[i].end) 322 if (!range[i].end)
369 continue; 323 continue;
370 324
371 update_res(info, range[i].start, range[i].end, 325 update_res(info, cap_resource(range[i].start),
326 cap_resource(range[i].end - 1),
372 IORESOURCE_MEM, 1); 327 IORESOURCE_MEM, 1);
373 } 328 }
374 } 329 }
@@ -384,24 +339,14 @@ static int __init early_fill_mp_bus_info(void)
384 info->bus_min, info->bus_max, info->node, info->link); 339 info->bus_min, info->bus_max, info->node, info->link);
385 for (j = 0; j < res_num; j++) { 340 for (j = 0; j < res_num; j++) {
386 res = &info->res[j]; 341 res = &info->res[j];
387 printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", 342 printk(KERN_DEBUG "bus: %02x index %x %pR\n",
388 busnum, j, 343 busnum, j, res);
389 (res->flags & IORESOURCE_IO)?"io port":"mmio",
390 res->start, res->end);
391 } 344 }
392 } 345 }
393 346
394 return 0; 347 return 0;
395} 348}
396 349
397#else /* !CONFIG_X86_64 */
398
399static int __init early_fill_mp_bus_info(void) { return 0; }
400
401#endif /* !CONFIG_X86_64 */
402
403/* common 32/64 bit code */
404
405#define ENABLE_CF8_EXT_CFG (1ULL << 46) 350#define ENABLE_CF8_EXT_CFG (1ULL << 46)
406 351
407static void enable_pci_io_ecs(void *unused) 352static void enable_pci_io_ecs(void *unused)
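
The conversion above replaces the file-local update_range() (inclusive end) with the generic helpers from <linux/range.h>, which take an exclusive end; that is why every subtract_range() call gains a "+ 1". A minimal sketch mirroring the call pattern used above (RANGE_NUM is 16, as defined earlier in this file):

struct range range[RANGE_NUM];

memset(range, 0, sizeof(range));
add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);        /* io space 0x0000-0xffff */
subtract_range(range, RANGE_NUM, 0x3000, 0x3fff + 1); /* a node/link claims it  */
/* leftovers now cover 0x0000-0x2fff and 0x4000-0xffff (ends stored exclusive) */
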
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 145df00e0387..64a122883896 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -1,11 +1,11 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/range.h>
3 4
4#include "bus_numa.h" 5#include "bus_numa.h"
5 6
6int pci_root_num; 7int pci_root_num;
7struct pci_root_info pci_root_info[PCI_ROOT_NR]; 8struct pci_root_info pci_root_info[PCI_ROOT_NR];
8int found_all_numa_early;
9 9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b) 10void x86_pci_root_bus_res_quirks(struct pci_bus *b)
11{ 11{
@@ -21,10 +21,6 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
21 if (!pci_root_num) 21 if (!pci_root_num)
22 return; 22 return;
23 23
24 /* for amd, if only one root bus, don't need to do anything */
25 if (pci_root_num < 2 && found_all_numa_early)
26 return;
27
28 for (i = 0; i < pci_root_num; i++) { 24 for (i = 0; i < pci_root_num; i++) {
29 if (pci_root_info[i].bus_min == b->number) 25 if (pci_root_info[i].bus_min == b->number)
30 break; 26 break;
@@ -36,13 +32,14 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
36 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", 32 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
37 b->number); 33 b->number);
38 34
35 pci_bus_remove_resources(b);
39 info = &pci_root_info[i]; 36 info = &pci_root_info[i];
40 for (j = 0; j < info->res_num; j++) { 37 for (j = 0; j < info->res_num; j++) {
41 struct resource *res; 38 struct resource *res;
42 struct resource *root; 39 struct resource *root;
43 40
44 res = &info->res[j]; 41 res = &info->res[j];
45 b->resource[j] = res; 42 pci_bus_add_resource(b, res, 0);
46 if (res->flags & IORESOURCE_IO) 43 if (res->flags & IORESOURCE_IO)
47 root = &ioport_resource; 44 root = &ioport_resource;
48 else 45 else
@@ -51,8 +48,8 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
51 } 48 }
52} 49}
53 50
54void __init update_res(struct pci_root_info *info, size_t start, 51void __devinit update_res(struct pci_root_info *info, resource_size_t start,
55 size_t end, unsigned long flags, int merge) 52 resource_size_t end, unsigned long flags, int merge)
56{ 53{
57 int i; 54 int i;
58 struct resource *res; 55 struct resource *res;
@@ -60,25 +57,28 @@ void __init update_res(struct pci_root_info *info, size_t start,
60 if (start > end) 57 if (start > end)
61 return; 58 return;
62 59
60 if (start == MAX_RESOURCE)
61 return;
62
63 if (!merge) 63 if (!merge)
64 goto addit; 64 goto addit;
65 65
66 /* try to merge it with old one */ 66 /* try to merge it with old one */
67 for (i = 0; i < info->res_num; i++) { 67 for (i = 0; i < info->res_num; i++) {
68 size_t final_start, final_end; 68 resource_size_t final_start, final_end;
69 size_t common_start, common_end; 69 resource_size_t common_start, common_end;
70 70
71 res = &info->res[i]; 71 res = &info->res[i];
72 if (res->flags != flags) 72 if (res->flags != flags)
73 continue; 73 continue;
74 74
75 common_start = max((size_t)res->start, start); 75 common_start = max(res->start, start);
76 common_end = min((size_t)res->end, end); 76 common_end = min(res->end, end);
77 if (common_start > common_end + 1) 77 if (common_start > common_end + 1)
78 continue; 78 continue;
79 79
80 final_start = min((size_t)res->start, start); 80 final_start = min(res->start, start);
81 final_end = max((size_t)res->end, end); 81 final_end = max(res->end, end);
82 82
83 res->start = final_start; 83 res->start = final_start;
84 res->end = final_end; 84 res->end = final_end;
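
The merge path in update_res() above treats touching windows as mergeable because the overlap test tolerates common_start == common_end + 1. A worked example with two adjacent I/O ranges (made-up values):

  existing res [0x1000, 0x1fff], incoming [0x2000, 0x2fff]:
      common_start = max(0x1000, 0x2000) = 0x2000
      common_end   = min(0x1fff, 0x2fff) = 0x1fff
      common_start > common_end + 1  ->  0x2000 > 0x2000 is false, so merge
      merged range = [min(0x1000, 0x2000), max(0x1fff, 0x2fff)]
                   = [0x1000, 0x2fff]
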
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index adbc23fe82ac..804a4b40c31a 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -1,9 +1,8 @@
1#ifdef CONFIG_X86_64 1#ifndef __BUS_NUMA_H
2 2#define __BUS_NUMA_H
3/* 3/*
4 * sub bus (transparent) will use entres from 3 to store extra from 4 * sub bus (transparent) will use entres from 3 to store extra from
5 * root, so need to make sure we have enough slot there, Should we 5 * root, so need to make sure we have enough slot there.
6 * increase PCI_BUS_NUM_RESOURCES?
7 */ 6 */
8#define RES_NUM 16 7#define RES_NUM 16
9struct pci_root_info { 8struct pci_root_info {
@@ -20,8 +19,7 @@ struct pci_root_info {
20#define PCI_ROOT_NR 4 19#define PCI_ROOT_NR 4
21extern int pci_root_num; 20extern int pci_root_num;
22extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; 21extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
23extern int found_all_numa_early;
24 22
25extern void update_res(struct pci_root_info *info, size_t start, 23extern void update_res(struct pci_root_info *info, resource_size_t start,
26 size_t end, unsigned long flags, int merge); 24 resource_size_t end, unsigned long flags, int merge);
27#endif 25#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index d2552c68e94d..cf2e93869c48 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -9,6 +9,7 @@
9#include <linux/ioport.h> 9#include <linux/ioport.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/dmi.h> 11#include <linux/dmi.h>
12#include <linux/slab.h>
12 13
13#include <asm/acpi.h> 14#include <asm/acpi.h>
14#include <asm/segment.h> 15#include <asm/segment.h>
@@ -72,12 +73,6 @@ struct pci_ops pci_root_ops = {
72}; 73};
73 74
74/* 75/*
75 * legacy, numa, and acpi all want to call pcibios_scan_root
76 * from their initcalls. This flag prevents that.
77 */
78int pcibios_scanned;
79
80/*
81 * This interrupt-safe spinlock protects all accesses to PCI 76 * This interrupt-safe spinlock protects all accesses to PCI
82 * configuration space. 77 * configuration space.
83 */ 78 */
@@ -520,6 +515,9 @@ char * __devinit pcibios_setup(char *str)
520 } else if (!strcmp(str, "use_crs")) { 515 } else if (!strcmp(str, "use_crs")) {
521 pci_probe |= PCI_USE__CRS; 516 pci_probe |= PCI_USE__CRS;
522 return NULL; 517 return NULL;
518 } else if (!strcmp(str, "nocrs")) {
519 pci_probe |= PCI_ROOT_NO_CRS;
520 return NULL;
523 } else if (!strcmp(str, "earlydump")) { 521 } else if (!strcmp(str, "earlydump")) {
524 pci_early_dump_regs = 1; 522 pci_early_dump_regs = 1;
525 return NULL; 523 return NULL;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5dc9e8c63fcd..46fd43f79103 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -60,22 +60,20 @@ skip_isa_ioresource_align(struct pci_dev *dev) {
60 * but we want to try to avoid allocating at 0x2900-0x2bff 60 * but we want to try to avoid allocating at 0x2900-0x2bff
61 * which might have be mirrored at 0x0100-0x03ff.. 61 * which might have be mirrored at 0x0100-0x03ff..
62 */ 62 */
63void 63resource_size_t
64pcibios_align_resource(void *data, struct resource *res, 64pcibios_align_resource(void *data, const struct resource *res,
65 resource_size_t size, resource_size_t align) 65 resource_size_t size, resource_size_t align)
66{ 66{
67 struct pci_dev *dev = data; 67 struct pci_dev *dev = data;
68 resource_size_t start = res->start;
68 69
69 if (res->flags & IORESOURCE_IO) { 70 if (res->flags & IORESOURCE_IO) {
70 resource_size_t start = res->start;
71
72 if (skip_isa_ioresource_align(dev)) 71 if (skip_isa_ioresource_align(dev))
73 return; 72 return start;
74 if (start & 0x300) { 73 if (start & 0x300)
75 start = (start + 0x3ff) & ~0x3ff; 74 start = (start + 0x3ff) & ~0x3ff;
76 res->start = start;
77 }
78 } 75 }
76 return start;
79} 77}
80EXPORT_SYMBOL(pcibios_align_resource); 78EXPORT_SYMBOL(pcibios_align_resource);
81 79
@@ -129,9 +127,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
129 continue; 127 continue;
130 if (!r->start || 128 if (!r->start ||
131 pci_claim_resource(dev, idx) < 0) { 129 pci_claim_resource(dev, idx) < 0) {
132 dev_info(&dev->dev,
133 "can't reserve window %pR\n",
134 r);
135 /* 130 /*
136 * Something is wrong with the region. 131 * Something is wrong with the region.
137 * Invalidate the resource to prevent 132 * Invalidate the resource to prevent
@@ -183,8 +178,6 @@ static void __init pcibios_allocate_resources(int pass)
183 "BAR %d: reserving %pr (d=%d, p=%d)\n", 178 "BAR %d: reserving %pr (d=%d, p=%d)\n",
184 idx, r, disabled, pass); 179 idx, r, disabled, pass);
185 if (pci_claim_resource(dev, idx) < 0) { 180 if (pci_claim_resource(dev, idx) < 0) {
186 dev_info(&dev->dev,
187 "can't reserve %pR\n", r);
188 /* We'll assign a new address later */ 181 /* We'll assign a new address later */
189 r->end -= r->start; 182 r->end -= r->start;
190 r->start = 0; 183 r->start = 0;
@@ -257,10 +250,6 @@ void __init pcibios_resource_survey(void)
257 */ 250 */
258fs_initcall(pcibios_assign_resources); 251fs_initcall(pcibios_assign_resources);
259 252
260void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
261{
262}
263
264/* 253/*
265 * If we set up a device for bus mastering, we need to check the latency 254 * If we set up a device for bus mastering, we need to check the latency
266 * timer as certain crappy BIOSes forget to set it properly. 255 * timer as certain crappy BIOSes forget to set it properly.
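
In the i386.c hunk above, pcibios_align_resource() now takes a const resource and returns the aligned start instead of writing it back into the resource. The alignment rule itself is unchanged: an I/O start that falls in the 0x100-0x3ff part of its 1 KiB block is rounded up to the next 1 KiB boundary, away from the possible ISA mirror. A small userspace illustration with a hypothetical helper name:

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    /* Round an I/O window start up past the 0x100-0x3ff ISA-mirror region. */
    static uint64_t align_io_start(uint64_t start)
    {
            if (start & 0x300)      /* lands in the mirrored part of its 1K block */
                    start = (start + 0x3ff) & ~(uint64_t)0x3ff;
            return start;
    }

    int main(void)
    {
            /* 0x2900 -> 0x2c00, matching the range named in the comment above */
            printf("%#" PRIx64 " -> %#" PRIx64 "\n",
                   (uint64_t)0x2900, align_io_start(0x2900));
            return 0;
    }
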
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 25a1f8efed4a..adb62aaa7ecd 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,7 @@
1#include <linux/pci.h> 1#include <linux/pci.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include <asm/pci_x86.h> 3#include <asm/pci_x86.h>
4#include <asm/x86_init.h>
4 5
5/* arch_initcall has too random ordering, so call the initializers 6/* arch_initcall has too random ordering, so call the initializers
6 in the right sequence from here. */ 7 in the right sequence from here. */
@@ -15,10 +16,9 @@ static __init int pci_arch_init(void)
15 if (!(pci_probe & PCI_PROBE_NOEARLY)) 16 if (!(pci_probe & PCI_PROBE_NOEARLY))
16 pci_mmcfg_early_init(); 17 pci_mmcfg_early_init();
17 18
18#ifdef CONFIG_PCI_OLPC 19 if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
19 if (!pci_olpc_init()) 20 return 0;
20 return 0; /* skip additional checks if it's an XO */ 21
21#endif
22#ifdef CONFIG_PCI_BIOS 22#ifdef CONFIG_PCI_BIOS
23 pci_pcbios_init(); 23 pci_pcbios_init();
24#endif 24#endif
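
The hard-coded OLPC check in pci_arch_init() is replaced by the x86_init.pci.arch_init() hook: if the hook is set and returns 0, the platform has claimed PCI configuration and the BIOS/direct/mmconfig probes below it are skipped. A rough sketch of that calling convention follows; the struct layout and the example hook are made up for illustration only.

    /* Sketch only: simplified hook table, not the real x86_init layout. */
    struct pci_init_ops {
            int (*arch_init)(void);         /* return 0: platform handled PCI setup */
    };

    static int example_platform_arch_init(void)
    {
            /* hypothetical platform-specific config-space setup would go here */
            return 0;                       /* claim PCI init, as pci_olpc_init() used to */
    }

    static struct pci_init_ops pci_hooks = {
            .arch_init = example_platform_arch_init,
    };

    static int pci_arch_init_sketch(void)
    {
            if (pci_hooks.arch_init && !pci_hooks.arch_init())
                    return 0;               /* platform hook handled everything */

            /* ...otherwise fall through to BIOS / direct / mmconfig probing... */
            return 0;
    }
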
diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c
deleted file mode 100644
index b7a55dc55d13..000000000000
--- a/arch/x86/pci/intel_bus.c
+++ /dev/null
@@ -1,90 +0,0 @@
1/*
2 * to read io range from IOH pci conf, need to do it after mmconfig is there
3 */
4
5#include <linux/delay.h>
6#include <linux/dmi.h>
7#include <linux/pci.h>
8#include <linux/init.h>
9#include <asm/pci_x86.h>
10
11#include "bus_numa.h"
12
13static inline void print_ioh_resources(struct pci_root_info *info)
14{
15 int res_num;
16 int busnum;
17 int i;
18
19 printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n",
20 info->bus_min, info->bus_max);
21 res_num = info->res_num;
22 busnum = info->bus_min;
23 for (i = 0; i < res_num; i++) {
24 struct resource *res;
25
26 res = &info->res[i];
27 printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n",
28 busnum, i,
29 (res->flags & IORESOURCE_IO) ? "io port" :
30 "mmio",
31 res->start, res->end);
32 }
33}
34
35#define IOH_LIO 0x108
36#define IOH_LMMIOL 0x10c
37#define IOH_LMMIOH 0x110
38#define IOH_LMMIOH_BASEU 0x114
39#define IOH_LMMIOH_LIMITU 0x118
40#define IOH_LCFGBUS 0x11c
41
42static void __devinit pci_root_bus_res(struct pci_dev *dev)
43{
44 u16 word;
45 u32 dword;
46 struct pci_root_info *info;
47 u16 io_base, io_end;
48 u32 mmiol_base, mmiol_end;
49 u64 mmioh_base, mmioh_end;
50 int bus_base, bus_end;
51
52 if (pci_root_num >= PCI_ROOT_NR) {
53 printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n");
54 return;
55 }
56
57 info = &pci_root_info[pci_root_num];
58 pci_root_num++;
59
60 pci_read_config_word(dev, IOH_LCFGBUS, &word);
61 bus_base = (word & 0xff);
62 bus_end = (word & 0xff00) >> 8;
63 sprintf(info->name, "PCI Bus #%02x", bus_base);
64 info->bus_min = bus_base;
65 info->bus_max = bus_end;
66
67 pci_read_config_word(dev, IOH_LIO, &word);
68 io_base = (word & 0xf0) << (12 - 4);
69 io_end = (word & 0xf000) | 0xfff;
70 update_res(info, io_base, io_end, IORESOURCE_IO, 0);
71
72 pci_read_config_dword(dev, IOH_LMMIOL, &dword);
73 mmiol_base = (dword & 0xff00) << (24 - 8);
74 mmiol_end = (dword & 0xff000000) | 0xffffff;
75 update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0);
76
77 pci_read_config_dword(dev, IOH_LMMIOH, &dword);
78 mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10);
79 mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff);
80 pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword);
81 mmioh_base |= ((u64)(dword & 0x7ffff)) << 32;
82 pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword);
83 mmioh_end |= ((u64)(dword & 0x7ffff)) << 32;
84 update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0);
85
86 print_ioh_resources(info);
87}
88
89/* intel IOH */
90DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 0696d506c4ad..5d362b5ba06f 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -8,7 +8,6 @@
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/pci.h> 9#include <linux/pci.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/slab.h>
12#include <linux/interrupt.h> 11#include <linux/interrupt.h>
13#include <linux/dmi.h> 12#include <linux/dmi.h>
14#include <linux/io.h> 13#include <linux/io.h>
@@ -53,7 +52,7 @@ struct irq_router_handler {
53 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); 52 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
54}; 53};
55 54
56int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; 55int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
57void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; 56void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
58 57
59/* 58/*
@@ -590,6 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
590 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
591 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
592 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_CPT_LPC1:
593 case PCI_DEVICE_ID_INTEL_CPT_LPC2:
593 r->name = "PIIX/ICH"; 594 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 595 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 596 r->set = pirq_piix_set;
@@ -1016,7 +1017,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
1016 return 1; 1017 return 1;
1017} 1018}
1018 1019
1019static void __init pcibios_fixup_irqs(void) 1020void __init pcibios_fixup_irqs(void)
1020{ 1021{
1021 struct pci_dev *dev = NULL; 1022 struct pci_dev *dev = NULL;
1022 u8 pin; 1023 u8 pin;
@@ -1110,12 +1111,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1110 { } 1111 { }
1111}; 1112};
1112 1113
1113int __init pcibios_irq_init(void) 1114void __init pcibios_irq_init(void)
1114{ 1115{
1115 DBG(KERN_DEBUG "PCI: IRQ init\n"); 1116 DBG(KERN_DEBUG "PCI: IRQ init\n");
1116 1117
1117 if (pcibios_enable_irq || raw_pci_ops == NULL) 1118 if (raw_pci_ops == NULL)
1118 return 0; 1119 return;
1119 1120
1120 dmi_check_system(pciirq_dmi_table); 1121 dmi_check_system(pciirq_dmi_table);
1121 1122
@@ -1142,9 +1143,7 @@ int __init pcibios_irq_init(void)
1142 pirq_table = NULL; 1143 pirq_table = NULL;
1143 } 1144 }
1144 1145
1145 pcibios_enable_irq = pirq_enable_irq; 1146 x86_init.pci.fixup_irqs();
1146
1147 pcibios_fixup_irqs();
1148 1147
1149 if (io_apic_assign_pci_irqs && pci_routeirq) { 1148 if (io_apic_assign_pci_irqs && pci_routeirq) {
1150 struct pci_dev *dev = NULL; 1149 struct pci_dev *dev = NULL;
@@ -1157,8 +1156,6 @@ int __init pcibios_irq_init(void)
1157 for_each_pci_dev(dev) 1156 for_each_pci_dev(dev)
1158 pirq_enable_irq(dev); 1157 pirq_enable_irq(dev);
1159 } 1158 }
1160
1161 return 0;
1162} 1159}
1163 1160
1164static void pirq_penalize_isa_irq(int irq, int active) 1161static void pirq_penalize_isa_irq(int irq, int active)
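
Two things change shape in the irq.c hunks: pcibios_enable_irq now starts out pointing at pirq_enable_irq instead of being installed at the end of pcibios_irq_init(), and pcibios_irq_init() becomes a void function whose fixup pass is reached through x86_init.pci.fixup_irqs(). With the default set statically, a platform can simply overwrite the pointer early (pci_mrst_init() below does exactly that). A compressed sketch of the default-pointer part; the _sketch names are placeholders, not kernel symbols.

    struct pci_dev;                         /* opaque for this sketch */

    /* Default router-based enable, standing in for pirq_enable_irq(). */
    static int pirq_enable_irq_sketch(struct pci_dev *dev)
    {
            (void)dev;                      /* real code programs the PIRQ router / IO-APIC */
            return 0;
    }

    /* Initialised up front, instead of being filled in by pcibios_irq_init(). */
    static int (*pcibios_enable_irq_sketch)(struct pci_dev *dev) = pirq_enable_irq_sketch;

    int pcibios_enable_device_sketch(struct pci_dev *dev)
    {
            return pcibios_enable_irq_sketch ? pcibios_enable_irq_sketch(dev) : 0;
    }
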
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 4061bb0f267d..0db5eaf54560 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void)
35 } 35 }
36} 36}
37 37
38static int __init pci_legacy_init(void) 38int __init pci_legacy_init(void)
39{ 39{
40 if (!raw_pci_ops) { 40 if (!raw_pci_ops) {
41 printk("PCI: System does not support PCI\n"); 41 printk("PCI: System does not support PCI\n");
42 return 0; 42 return 0;
43 } 43 }
44 44
45 if (pcibios_scanned++)
46 return 0;
47
48 printk("PCI: Probing PCI hardware\n"); 45 printk("PCI: Probing PCI hardware\n");
49 pci_root_bus = pcibios_scan_root(0); 46 pci_root_bus = pcibios_scan_root(0);
50 if (pci_root_bus) 47 if (pci_root_bus)
@@ -55,18 +52,15 @@ static int __init pci_legacy_init(void)
55 52
56int __init pci_subsys_init(void) 53int __init pci_subsys_init(void)
57{ 54{
58#ifdef CONFIG_X86_NUMAQ 55 /*
59	pci_numaq_init(); 56 * The init function returns a non-zero value when
60#endif 57 * pci_legacy_init should be invoked.
61#ifdef CONFIG_ACPI 58 */
62 pci_acpi_init(); 59 if (x86_init.pci.init())
63#endif 60 pci_legacy_init();
64#ifdef CONFIG_X86_VISWS 61
65 pci_visws_init();
66#endif
67 pci_legacy_init();
68 pcibios_fixup_peer_bridges(); 62 pcibios_fixup_peer_bridges();
69 pcibios_irq_init(); 63 x86_init.pci.init_irq();
70 pcibios_init(); 64 pcibios_init();
71 65
72 return 0; 66 return 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index b19d1e54201e..39b9ebe8f886 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -16,6 +16,7 @@
16#include <linux/sfi_acpi.h> 16#include <linux/sfi_acpi.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/dmi.h> 18#include <linux/dmi.h>
19#include <linux/slab.h>
19#include <asm/e820.h> 20#include <asm/e820.h>
20#include <asm/pci_x86.h> 21#include <asm/pci_x86.h>
21#include <asm/acpi.h> 22#include <asm/acpi.h>
@@ -303,22 +304,17 @@ static void __init pci_mmcfg_check_end_bus_number(void)
303{ 304{
304 struct pci_mmcfg_region *cfg, *cfgx; 305 struct pci_mmcfg_region *cfg, *cfgx;
305 306
306 /* last one*/ 307 /* Fixup overlaps */
307 cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list);
308 if (cfg)
309 if (cfg->end_bus < cfg->start_bus)
310 cfg->end_bus = 255;
311
312 if (list_is_singular(&pci_mmcfg_list))
313 return;
314
315 /* don't overlap please */
316 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 308 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
317 if (cfg->end_bus < cfg->start_bus) 309 if (cfg->end_bus < cfg->start_bus)
318 cfg->end_bus = 255; 310 cfg->end_bus = 255;
319 311
312 /* Don't access the list head ! */
313 if (cfg->list.next == &pci_mmcfg_list)
314 break;
315
320 cfgx = list_entry(cfg->list.next, typeof(*cfg), list); 316 cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
321 if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus) 317 if (cfg->end_bus >= cfgx->start_bus)
322 cfg->end_bus = cfgx->start_bus - 1; 318 cfg->end_bus = cfgx->start_bus - 1;
323 } 319 }
324} 320}
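
The rewritten pci_mmcfg_check_end_bus_number() folds the old "last entry" special case into a single pass: a region whose end_bus is below its start_bus is widened to 255, and each region is then trimmed so it ends just before the next one starts, stopping before the list head. An array-based sketch of the same clamping, assuming the regions are sorted by start_bus as pci_mmcfg_list is:

    struct mmcfg_region_sketch {
            int start_bus, end_bus;
    };

    static void fixup_end_bus(struct mmcfg_region_sketch *r, int n)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (r[i].end_bus < r[i].start_bus)
                            r[i].end_bus = 255;     /* open-ended segment */

                    if (i + 1 == n)                 /* last entry: nothing to trim against */
                            break;

                    if (r[i].end_bus >= r[i + 1].start_bus)
                            r[i].end_bus = r[i + 1].start_bus - 1;
            }
    }
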
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
new file mode 100644
index 000000000000..8bf2fcb88d04
--- /dev/null
+++ b/arch/x86/pci/mrst.c
@@ -0,0 +1,262 @@
1/*
2 * Moorestown PCI support
3 * Copyright (c) 2008 Intel Corporation
4 * Jesse Barnes <jesse.barnes@intel.com>
5 *
6 * Moorestown has an interesting PCI implementation:
7 * - configuration space is memory mapped (as defined by MCFG)
8 * - Lincroft devices also have a real, type 1 configuration space
9 * - Early Lincroft silicon has a type 1 access bug that will cause
10 * a hang if non-existent devices are accessed
11 * - some devices have the "fixed BAR" capability, which means
12 * they can't be relocated or modified; check for that during
13 * BAR sizing
14 *
15 * So, we use the MCFG space for all reads and writes, but also send
16 * Lincroft writes to type 1 space. But only read/write if the device
17 * actually exists, otherwise return all 1s for reads and bit bucket
18 * the writes.
19 */
20
21#include <linux/sched.h>
22#include <linux/pci.h>
23#include <linux/ioport.h>
24#include <linux/init.h>
25#include <linux/dmi.h>
26
27#include <asm/acpi.h>
28#include <asm/segment.h>
29#include <asm/io.h>
30#include <asm/smp.h>
31#include <asm/pci_x86.h>
32#include <asm/hw_irq.h>
33#include <asm/io_apic.h>
34
35#define PCIE_CAP_OFFSET 0x100
36
37/* Fixed BAR fields */
38#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */
39#define PCI_FIXED_BAR_0_SIZE 0x04
40#define PCI_FIXED_BAR_1_SIZE 0x08
41#define PCI_FIXED_BAR_2_SIZE 0x0c
42#define PCI_FIXED_BAR_3_SIZE 0x10
43#define PCI_FIXED_BAR_4_SIZE 0x14
44#define PCI_FIXED_BAR_5_SIZE 0x1c
45
46/**
47 * fixed_bar_cap - return the offset of the fixed BAR cap if found
48 * @bus: PCI bus
49 * @devfn: device in question
50 *
51 * Look for the fixed BAR cap on @bus and @devfn, returning its offset
52 * if found or 0 otherwise.
53 */
54static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
55{
56 int pos;
57 u32 pcie_cap = 0, cap_data;
58
59 pos = PCIE_CAP_OFFSET;
60
61 if (!raw_pci_ext_ops)
62 return 0;
63
64 while (pos) {
65 if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
66 devfn, pos, 4, &pcie_cap))
67 return 0;
68
69 if (pcie_cap == 0xffffffff)
70 return 0;
71
72 if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
73 raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
74 devfn, pos + 4, 4, &cap_data);
75 if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR)
76 return pos;
77 }
78
79 pos = pcie_cap >> 20;
80 }
81
82 return 0;
83}
84
85static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
86 int reg, int len, u32 val, int offset)
87{
88 u32 size;
89 unsigned int domain, busnum;
90 int bar = (reg - PCI_BASE_ADDRESS_0) >> 2;
91
92 domain = pci_domain_nr(bus);
93 busnum = bus->number;
94
95 if (val == ~0 && len == 4) {
96 unsigned long decode;
97
98 raw_pci_ext_ops->read(domain, busnum, devfn,
99 offset + 8 + (bar * 4), 4, &size);
100
101 /* Turn the size into a decode pattern for the sizing code */
102 if (size) {
103 decode = size - 1;
104 decode |= decode >> 1;
105 decode |= decode >> 2;
106 decode |= decode >> 4;
107 decode |= decode >> 8;
108 decode |= decode >> 16;
109 decode++;
110 decode = ~(decode - 1);
111 } else {
112 decode = ~0;
113 }
114
115 /*
116 * If val is all ones, the core code is trying to size the reg,
117 * so update the mmconfig space with the real size.
118 *
119 * Note: this assumes the fixed size we got is a power of two.
120 */
121 return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4,
122 decode);
123 }
124
125 /* This is some other kind of BAR write, so just do it. */
126 return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val);
127}
128
129/**
130 * type1_access_ok - check whether to use type 1
131 * @bus: bus number
132 * @devfn: device & function in question
133 *
134 * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
135 * all, then we can go ahead with any reads & writes. If it's on a Lincroft,
136 * but doesn't exist, avoid the access altogether to keep the chip from
137 * hanging.
138 */
139static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
140{
141	/* This is a workaround for an A0 LNC bug where the PCI status register
142	 * does not have the new CAP bit set and cannot be written by SW either.
143 *
144 * PCI header type in real LNC indicates a single function device, this
145 * will prevent probing other devices under the same function in PCI
146 * shim. Therefore, use the header type in shim instead.
147 */
148 if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
149 return 0;
150 if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0)))
151 return 1;
152 return 0; /* langwell on others */
153}
154
155static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
156 int size, u32 *value)
157{
158 if (type1_access_ok(bus->number, devfn, where))
159 return pci_direct_conf1.read(pci_domain_nr(bus), bus->number,
160 devfn, where, size, value);
161 return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
162 devfn, where, size, value);
163}
164
165static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
166 int size, u32 value)
167{
168 int offset;
169
170	/* On MRST there is no PCI ROM BAR; drop the write so a subsequent
171	 * read of the ROM BAR returns 0 and is then ignored.
172 */
173 if (where == PCI_ROM_ADDRESS)
174 return 0;
175
176 /*
177 * Devices with fixed BARs need special handling:
178 * - BAR sizing code will save, write ~0, read size, restore
179 * - so writes to fixed BARs need special handling
180 * - other writes to fixed BAR devices should go through mmconfig
181 */
182 offset = fixed_bar_cap(bus, devfn);
183 if (offset &&
184 (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) {
185 return pci_device_update_fixed(bus, devfn, where, size, value,
186 offset);
187 }
188
189 /*
190 * On Moorestown update both real & mmconfig space
191 * Note: early Lincroft silicon can't handle type 1 accesses to
192 * non-existent devices, so just eat the write in that case.
193 */
194 if (type1_access_ok(bus->number, devfn, where))
195 return pci_direct_conf1.write(pci_domain_nr(bus), bus->number,
196 devfn, where, size, value);
197 return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn,
198 where, size, value);
199}
200
201static int mrst_pci_irq_enable(struct pci_dev *dev)
202{
203 u8 pin;
204 struct io_apic_irq_attr irq_attr;
205
206 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
207
208	/* MRST only has an IOAPIC; the PCI irq lines are 1:1 mapped to
209	 * IOAPIC RTE entries, so we just enable the RTE for the device.
210 */
211 irq_attr.ioapic = mp_find_ioapic(dev->irq);
212 irq_attr.ioapic_pin = dev->irq;
213 irq_attr.trigger = 1; /* level */
214 irq_attr.polarity = 1; /* active low */
215 io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
216
217 return 0;
218}
219
220struct pci_ops pci_mrst_ops = {
221 .read = pci_read,
222 .write = pci_write,
223};
224
225/**
226 * pci_mrst_init - installs pci_mrst_ops
227 *
228 * Moorestown has an interesting PCI implementation (see above).
229 * Called when the early platform detection installs it.
230 */
231int __init pci_mrst_init(void)
232{
233 printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n");
234 pci_mmcfg_late_init();
235 pcibios_enable_irq = mrst_pci_irq_enable;
236 pci_root_ops = pci_mrst_ops;
237 /* Continue with standard init */
238 return 1;
239}
240
241/*
242 * Langwell devices reside at fixed offsets, don't try to move them.
243 */
244static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
245{
246 unsigned long offset;
247 u32 size;
248 int i;
249
250 /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
251 offset = fixed_bar_cap(dev->bus, dev->devfn);
252 if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
253 PCI_DEVFN(2, 2) == dev->devfn)
254 return;
255
256 for (i = 0; i < PCI_ROM_RESOURCE; i++) {
257 pci_read_config_dword(dev, offset + 8 + (i * 4), &size);
258 dev->resource[i].end = dev->resource[i].start + size - 1;
259 dev->resource[i].flags |= IORESOURCE_PCI_FIXED;
260 }
261}
262DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup);
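
The trickiest part of the new mrst.c is pci_device_update_fixed(): when the generic BAR-sizing code writes ~0 to a fixed BAR, it answers with a decode mask built from the size read out of the vendor capability, rounding the size up to a power of two and setting every address bit from that size upward. A standalone worked example of the conversion; the values in the comments are what the hunk's bit-twiddling produces.

    #include <stdio.h>
    #include <stdint.h>

    /* Same round-up-to-power-of-two trick as in pci_device_update_fixed(). */
    static uint32_t size_to_decode(uint32_t size)
    {
            uint32_t decode;

            if (!size)
                    return ~0u;             /* no BAR: reads back as all ones */

            decode = size - 1;
            decode |= decode >> 1;
            decode |= decode >> 2;
            decode |= decode >> 4;
            decode |= decode >> 8;
            decode |= decode >> 16;
            decode++;                       /* next power of two >= size */
            return ~(decode - 1);
    }

    int main(void)
    {
            printf("%#x\n", size_to_decode(0x1000));        /* 0xfffff000: 4 KiB BAR */
            printf("%#x\n", size_to_decode(0x0c00));        /* 0xfffff000: rounded up to 4 KiB */
            return 0;
    }
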
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8eb295e116f6..8223738ad806 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -8,9 +8,7 @@
8#include <asm/apic.h> 8#include <asm/apic.h>
9#include <asm/mpspec.h> 9#include <asm/mpspec.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11 11#include <asm/numaq.h>
12#define XQUAD_PORTIO_BASE 0xfe400000
13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
14 12
15#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 13#define BUS2QUAD(global) (mp_bus_id_to_node[global])
16 14
@@ -18,8 +16,6 @@
18 16
19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 17#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
20 18
21#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
22
23#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ 19#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
24 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) 20 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
25 21
@@ -152,14 +148,8 @@ int __init pci_numaq_init(void)
152{ 148{
153 int quad; 149 int quad;
154 150
155 if (!found_numaq)
156 return 0;
157
158 raw_pci_ops = &pci_direct_conf1_mq; 151 raw_pci_ops = &pci_direct_conf1_mq;
159 152
160 if (pcibios_scanned++)
161 return 0;
162
163 pci_root_bus = pcibios_scan_root(0); 153 pci_root_bus = pcibios_scan_root(0);
164 if (pci_root_bus) 154 if (pci_root_bus)
165 pci_bus_add_devices(pci_root_bus); 155 pci_bus_add_devices(pci_root_bus);
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b889d824f7c6..b34815408f58 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = {
304 304
305int __init pci_olpc_init(void) 305int __init pci_olpc_init(void)
306{ 306{
307 if (!machine_is_olpc() || olpc_has_vsa())
308 return -ENODEV;
309
310 printk(KERN_INFO "PCI: Using configuration type OLPC\n"); 307 printk(KERN_INFO "PCI: Using configuration type OLPC\n");
311 raw_pci_ops = &pci_olpc_conf; 308 raw_pci_ops = &pci_olpc_conf;
312 is_lx = is_geode_lx(); 309 is_lx = is_geode_lx();
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 1c975cc9839e..59a225c17b84 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -4,6 +4,7 @@
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/slab.h>
7#include <linux/module.h> 8#include <linux/module.h>
8#include <linux/uaccess.h> 9#include <linux/uaccess.h>
9#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index bcead7a46871..03008f72eb04 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
69 69
70int __init pci_visws_init(void) 70int __init pci_visws_init(void)
71{ 71{
72 if (!is_visws_box())
73 return -1;
74
75 pcibios_enable_irq = &pci_visws_enable_irq; 72 pcibios_enable_irq = &pci_visws_enable_irq;
76 pcibios_disable_irq = &pci_visws_disable_irq; 73 pcibios_disable_irq = &pci_visws_disable_irq;
77 74
@@ -90,5 +87,6 @@ int __init pci_visws_init(void)
90 pci_scan_bus_with_sysdata(pci_bus1); 87 pci_scan_bus_with_sysdata(pci_bus1);
91 pci_fixup_irqs(pci_common_swizzle, visws_map_irq); 88 pci_fixup_irqs(pci_common_swizzle, visws_map_irq);
92 pcibios_resource_survey(); 89 pcibios_resource_survey();
93 return 0; 90 /* Request bus scan */
91 return 1;
94} 92}
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 81197c62d5b3..3769079874d8 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 */ 7 */
8 8
9#include <linux/gfp.h>
9#include <linux/suspend.h> 10#include <linux/suspend.h>
10#include <linux/bootmem.h> 11#include <linux/bootmem.h>
11 12
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 65fdc86e923f..d24f983ba1e5 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -8,6 +8,7 @@
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
11#include <linux/gfp.h>
11#include <linux/smp.h> 12#include <linux/smp.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <asm/proto.h> 14#include <asm/proto.h>
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index b641388d8286..ad47daeafa4e 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -27,10 +27,17 @@ ENTRY(swsusp_arch_suspend)
27 ret 27 ret
28 28
29ENTRY(restore_image) 29ENTRY(restore_image)
30 movl mmu_cr4_features, %ecx
30 movl resume_pg_dir, %eax 31 movl resume_pg_dir, %eax
31 subl $__PAGE_OFFSET, %eax 32 subl $__PAGE_OFFSET, %eax
32 movl %eax, %cr3 33 movl %eax, %cr3
33 34
35 jecxz 1f # cr4 Pentium and higher, skip if zero
36 andl $~(X86_CR4_PGE), %ecx
37 movl %ecx, %cr4; # turn off PGE
38 movl %cr3, %eax; # flush TLB
39 movl %eax, %cr3
401:
34 movl restore_pblist, %edx 41 movl restore_pblist, %edx
35 .p2align 4,,7 42 .p2align 4,,7
36 43
@@ -54,16 +61,8 @@ done:
54 movl $swapper_pg_dir, %eax 61 movl $swapper_pg_dir, %eax
55 subl $__PAGE_OFFSET, %eax 62 subl $__PAGE_OFFSET, %eax
56 movl %eax, %cr3 63 movl %eax, %cr3
57 /* Flush TLB, including "global" things (vmalloc) */
58 movl mmu_cr4_features, %ecx 64 movl mmu_cr4_features, %ecx
59 jecxz 1f # cr4 Pentium and higher, skip if zero 65 jecxz 1f # cr4 Pentium and higher, skip if zero
60 movl %ecx, %edx
61 andl $~(X86_CR4_PGE), %edx
62 movl %edx, %cr4; # turn off PGE
631:
64 movl %cr3, %eax; # flush TLB
65 movl %eax, %cr3
66 jecxz 1f # cr4 Pentium and higher, skip if zero
67 movl %ecx, %cr4; # turn PGE back on 66 movl %ecx, %cr4; # turn PGE back on
681: 671:
69 68
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
index 5bbb5a33f220..fd1ab80be0de 100644
--- a/arch/x86/tools/chkobjdump.awk
+++ b/arch/x86/tools/chkobjdump.awk
@@ -8,14 +8,24 @@ BEGIN {
8 od_sver = 19; 8 od_sver = 19;
9} 9}
10 10
11/^GNU/ { 11/^GNU objdump/ {
12 split($3, ver, "."); 12 verstr = ""
13 for (i = 3; i <= NF; i++)
14 if (match($(i), "^[0-9]")) {
15 verstr = $(i);
16 break;
17 }
18 if (verstr == "") {
19 printf("Warning: Failed to find objdump version number.\n");
20 exit 0;
21 }
22 split(verstr, ver, ".");
13 if (ver[1] > od_ver || 23 if (ver[1] > od_ver ||
14 (ver[1] == od_ver && ver[2] >= od_sver)) { 24 (ver[1] == od_ver && ver[2] >= od_sver)) {
15 exit 1; 25 exit 1;
16 } else { 26 } else {
17 printf("Warning: objdump version %s is older than %d.%d\n", 27 printf("Warning: objdump version %s is older than %d.%d\n",
18 $4, od_ver, od_sver); 28 verstr, od_ver, od_sver);
19 print("Warning: Skipping posttest."); 29 print("Warning: Skipping posttest.");
20 # Logic is inverted, because we just skip test without error. 30 # Logic is inverted, because we just skip test without error.
21 exit 0; 31 exit 0;
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
index bee8d6ac2691..13403fc95a96 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/test_get_len.c
@@ -43,7 +43,7 @@ static int x86_64;
43static void usage(void) 43static void usage(void)
44{ 44{
45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" 45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
46 " %s [-y|-n] [-v] \n", prog); 46 " %s [-y|-n] [-v]\n", prog);
47 fprintf(stderr, "\t-y 64bit mode\n"); 47 fprintf(stderr, "\t-y 64bit mode\n");
48 fprintf(stderr, "\t-n 32bit mode\n"); 48 fprintf(stderr, "\t-n 32bit mode\n");
49 fprintf(stderr, "\t-v verbose mode\n"); 49 fprintf(stderr, "\t-v verbose mode\n");
@@ -69,7 +69,7 @@ static void dump_field(FILE *fp, const char *name, const char *indent,
69 69
70static void dump_insn(FILE *fp, struct insn *insn) 70static void dump_insn(FILE *fp, struct insn *insn)
71{ 71{
72 fprintf(fp, "Instruction = { \n"); 72 fprintf(fp, "Instruction = {\n");
73 dump_field(fp, "prefixes", "\t", &insn->prefixes); 73 dump_field(fp, "prefixes", "\t", &insn->prefixes);
74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); 74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); 75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 21e1aeb9f3ea..ac74869b8140 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -6,6 +6,7 @@
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/err.h> 7#include <linux/err.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include <linux/init.h> 10#include <linux/init.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/elf.h> 12#include <linux/elf.h>
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index e133ce25e290..1304bcec8ee5 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -1,5 +1,6 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/debugfs.h> 2#include <linux/debugfs.h>
3#include <linux/slab.h>
3#include <linux/module.h> 4#include <linux/module.h>
4 5
5#include "debugfs.h" 6#include "debugfs.h"
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2b26dd5930c6..65d8d79b46a8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -28,6 +28,7 @@
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/console.h> 29#include <linux/console.h>
30#include <linux/pci.h> 30#include <linux/pci.h>
31#include <linux/gfp.h>
31 32
32#include <xen/xen.h> 33#include <xen/xen.h>
33#include <xen/interface/xen.h> 34#include <xen/interface/xen.h>
@@ -50,6 +51,7 @@
50#include <asm/traps.h> 51#include <asm/traps.h>
51#include <asm/setup.h> 52#include <asm/setup.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
54#include <asm/pgalloc.h>
53#include <asm/pgtable.h> 55#include <asm/pgtable.h>
54#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
55#include <asm/reboot.h> 57#include <asm/reboot.h>
@@ -1094,6 +1096,12 @@ asmlinkage void __init xen_start_kernel(void)
1094 1096
1095 __supported_pte_mask |= _PAGE_IOMAP; 1097 __supported_pte_mask |= _PAGE_IOMAP;
1096 1098
1099 /*
1100 * Prevent page tables from being allocated in highmem, even
1101 * if CONFIG_HIGHPTE is enabled.
1102 */
1103 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
1104
1097 /* Work out if we support NX */ 1105 /* Work out if we support NX */
1098 x86_configure_nx(); 1106 x86_configure_nx();
1099 1107
@@ -1151,9 +1159,13 @@ asmlinkage void __init xen_start_kernel(void)
1151 1159
1152 /* keep using Xen gdt for now; no urgent need to change it */ 1160 /* keep using Xen gdt for now; no urgent need to change it */
1153 1161
1162#ifdef CONFIG_X86_32
1154 pv_info.kernel_rpl = 1; 1163 pv_info.kernel_rpl = 1;
1155 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1164 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1156 pv_info.kernel_rpl = 0; 1165 pv_info.kernel_rpl = 0;
1166#else
1167 pv_info.kernel_rpl = 0;
1168#endif
1157 1169
1158 /* set the limit of our address space */ 1170 /* set the limit of our address space */
1159 xen_reserve_top(); 1171 xen_reserve_top();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index bf4cd6bfe959..914f04695ce5 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/bug.h> 44#include <linux/bug.h>
45#include <linux/module.h> 45#include <linux/module.h>
46#include <linux/gfp.h>
46 47
47#include <asm/pgtable.h> 48#include <asm/pgtable.h>
48#include <asm/tlbflush.h> 49#include <asm/tlbflush.h>
@@ -1427,23 +1428,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1427#endif 1428#endif
1428} 1429}
1429 1430
1430#ifdef CONFIG_HIGHPTE
1431static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1432{
1433 pgprot_t prot = PAGE_KERNEL;
1434
1435 if (PagePinned(page))
1436 prot = PAGE_KERNEL_RO;
1437
1438 if (0 && PageHighMem(page))
1439 printk("mapping highpte %lx type %d prot %s\n",
1440 page_to_pfn(page), type,
1441 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1442
1443 return kmap_atomic_prot(page, type, prot);
1444}
1445#endif
1446
1447#ifdef CONFIG_X86_32 1431#ifdef CONFIG_X86_32
1448static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1432static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1449{ 1433{
@@ -1902,10 +1886,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1902 .alloc_pmd_clone = paravirt_nop, 1886 .alloc_pmd_clone = paravirt_nop,
1903 .release_pmd = xen_release_pmd_init, 1887 .release_pmd = xen_release_pmd_init,
1904 1888
1905#ifdef CONFIG_HIGHPTE
1906 .kmap_atomic_pte = xen_kmap_atomic_pte,
1907#endif
1908
1909#ifdef CONFIG_X86_64 1889#ifdef CONFIG_X86_64
1910 .set_pte = xen_set_pte, 1890 .set_pte = xen_set_pte,
1911#else 1891#else
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 563d20504988..a29693fd3138 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -14,6 +14,7 @@
14 */ 14 */
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h>
17#include <linux/smp.h> 18#include <linux/smp.h>
18 19
19#include <asm/paravirt.h> 20#include <asm/paravirt.h>
@@ -361,7 +362,7 @@ static void xen_cpu_die(unsigned int cpu)
361 alternatives_smp_switch(0); 362 alternatives_smp_switch(0);
362} 363}
363 364
364static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */ 365static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
365{ 366{
366 play_dead_common(); 367 play_dead_common();
367 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 24ded31b5aec..e0500646585d 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/log2.h> 8#include <linux/log2.h>
9#include <linux/gfp.h>
9 10
10#include <asm/paravirt.h> 11#include <asm/paravirt.h>
11 12
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0d3f07cd1b5f..32764b8880b5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -13,6 +13,7 @@
13#include <linux/clockchips.h> 13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/math64.h> 15#include <linux/math64.h>
16#include <linux/gfp.h>
16 17
17#include <asm/pvclock.h> 18#include <asm/pvclock.h>
18#include <asm/xen/hypervisor.h> 19#include <asm/xen/hypervisor.h>
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 88e15deb8b82..22a2093b5862 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -90,9 +90,9 @@ ENTRY(xen_iret)
90 GET_THREAD_INFO(%eax) 90 GET_THREAD_INFO(%eax)
91 movl TI_cpu(%eax), %eax 91 movl TI_cpu(%eax), %eax
92 movl __per_cpu_offset(,%eax,4), %eax 92 movl __per_cpu_offset(,%eax,4), %eax
93 mov per_cpu__xen_vcpu(%eax), %eax 93 mov xen_vcpu(%eax), %eax
94#else 94#else
95 movl per_cpu__xen_vcpu, %eax 95 movl xen_vcpu, %eax
96#endif 96#endif
97 97
98 /* check IF state we're restoring */ 98 /* check IF state we're restoring */