Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig | 51
-rw-r--r-- arch/x86/Kconfig.cpu | 2
-rw-r--r-- arch/x86/Kconfig.debug | 23
-rw-r--r-- arch/x86/Makefile | 4
-rw-r--r-- arch/x86/Makefile_32.cpu | 9
-rw-r--r-- arch/x86/boot/compressed/Makefile | 6
-rw-r--r-- arch/x86/boot/compressed/misc.c | 4
-rw-r--r-- arch/x86/boot/compressed/relocs.c | 87
-rw-r--r-- arch/x86/boot/header.S | 2
-rw-r--r-- arch/x86/boot/setup.ld | 3
-rw-r--r-- arch/x86/boot/version.c | 4
-rw-r--r-- arch/x86/boot/video.c | 6
-rw-r--r-- arch/x86/crypto/Makefile | 3
-rw-r--r-- arch/x86/crypto/aesni-intel_asm.S | 517
-rw-r--r-- arch/x86/crypto/aesni-intel_glue.c | 10
-rw-r--r-- arch/x86/crypto/ghash-clmulni-intel_asm.S | 157
-rw-r--r-- arch/x86/crypto/ghash-clmulni-intel_glue.c | 333
-rw-r--r-- arch/x86/ia32/ia32_aout.c | 11
-rw-r--r-- arch/x86/ia32/ia32entry.S | 10
-rw-r--r-- arch/x86/ia32/sys_ia32.c | 99
-rw-r--r-- arch/x86/include/asm/Kbuild | 1
-rw-r--r-- arch/x86/include/asm/a.out-core.h | 10
-rw-r--r-- arch/x86/include/asm/acpi.h | 26
-rw-r--r-- arch/x86/include/asm/alternative-asm.h | 10
-rw-r--r-- arch/x86/include/asm/alternative.h | 1
-rw-r--r-- arch/x86/include/asm/amd_iommu.h | 15
-rw-r--r-- arch/x86/include/asm/amd_iommu_proto.h | 41
-rw-r--r-- arch/x86/include/asm/amd_iommu_types.h | 54
-rw-r--r-- arch/x86/include/asm/apic.h | 21
-rw-r--r-- arch/x86/include/asm/apicdef.h | 6
-rw-r--r-- arch/x86/include/asm/apicnum.h | 12
-rw-r--r-- arch/x86/include/asm/asm-offsets.h | 1
-rw-r--r-- arch/x86/include/asm/bug.h | 4
-rw-r--r-- arch/x86/include/asm/cacheflush.h | 1
-rw-r--r-- arch/x86/include/asm/calgary.h | 2
-rw-r--r-- arch/x86/include/asm/cmpxchg_32.h | 218
-rw-r--r-- arch/x86/include/asm/cmpxchg_64.h | 234
-rw-r--r-- arch/x86/include/asm/cpu_debug.h | 127
-rw-r--r-- arch/x86/include/asm/cpufeature.h | 2
-rw-r--r-- arch/x86/include/asm/debugreg.h | 33
-rw-r--r-- arch/x86/include/asm/desc.h | 2
-rw-r--r-- arch/x86/include/asm/desc_defs.h | 4
-rw-r--r-- arch/x86/include/asm/device.h | 2
-rw-r--r-- arch/x86/include/asm/dma-mapping.h | 17
-rw-r--r-- arch/x86/include/asm/elf.h | 31
-rw-r--r-- arch/x86/include/asm/entry_arch.h | 2
-rw-r--r-- arch/x86/include/asm/gart.h | 9
-rw-r--r-- arch/x86/include/asm/geode.h | 219
-rw-r--r-- arch/x86/include/asm/hardirq.h | 8
-rw-r--r-- arch/x86/include/asm/hpet.h | 8
-rw-r--r-- arch/x86/include/asm/hw_breakpoint.h | 73
-rw-r--r-- arch/x86/include/asm/hw_irq.h | 35
-rw-r--r-- arch/x86/include/asm/i387.h | 7
-rw-r--r-- arch/x86/include/asm/inat.h | 220
-rw-r--r-- arch/x86/include/asm/inat_types.h | 29
-rw-r--r-- arch/x86/include/asm/insn.h | 184
-rw-r--r-- arch/x86/include/asm/inst.h | 150
-rw-r--r-- arch/x86/include/asm/iommu.h | 2
-rw-r--r-- arch/x86/include/asm/irq.h | 3
-rw-r--r-- arch/x86/include/asm/irq_vectors.h | 4
-rw-r--r-- arch/x86/include/asm/kvm.h | 34
-rw-r--r-- arch/x86/include/asm/kvm_emulate.h | 2
-rw-r--r-- arch/x86/include/asm/kvm_host.h | 34
-rw-r--r-- arch/x86/include/asm/mce.h | 15
-rw-r--r-- arch/x86/include/asm/mmzone_32.h | 2
-rw-r--r-- arch/x86/include/asm/mpspec.h | 16
-rw-r--r-- arch/x86/include/asm/msr-index.h | 2
-rw-r--r-- arch/x86/include/asm/msr.h | 27
-rw-r--r-- arch/x86/include/asm/olpc.h | 2
-rw-r--r-- arch/x86/include/asm/paravirt.h | 42
-rw-r--r-- arch/x86/include/asm/paravirt_types.h | 24
-rw-r--r-- arch/x86/include/asm/pci_x86.h | 20
-rw-r--r-- arch/x86/include/asm/percpu.h | 104
-rw-r--r-- arch/x86/include/asm/perf_event.h | 14
-rw-r--r-- arch/x86/include/asm/processor.h | 18
-rw-r--r-- arch/x86/include/asm/ptrace.h | 64
-rw-r--r-- arch/x86/include/asm/sigcontext.h | 4
-rw-r--r-- arch/x86/include/asm/spinlock.h | 62
-rw-r--r-- arch/x86/include/asm/spinlock_types.h | 10
-rw-r--r-- arch/x86/include/asm/stacktrace.h | 24
-rw-r--r-- arch/x86/include/asm/string_32.h | 9
-rw-r--r-- arch/x86/include/asm/svm.h | 3
-rw-r--r-- arch/x86/include/asm/swiotlb.h | 11
-rw-r--r-- arch/x86/include/asm/sys_ia32.h | 9
-rw-r--r-- arch/x86/include/asm/syscalls.h | 34
-rw-r--r-- arch/x86/include/asm/system.h | 32
-rw-r--r-- arch/x86/include/asm/thread_info.h | 9
-rw-r--r-- arch/x86/include/asm/topology.h | 10
-rw-r--r-- arch/x86/include/asm/trampoline.h | 1
-rw-r--r-- arch/x86/include/asm/uaccess.h | 1
-rw-r--r-- arch/x86/include/asm/uaccess_32.h | 26
-rw-r--r-- arch/x86/include/asm/uaccess_64.h | 35
-rw-r--r-- arch/x86/include/asm/unistd_32.h | 3
-rw-r--r-- arch/x86/include/asm/unistd_64.h | 2
-rw-r--r-- arch/x86/include/asm/uv/bios.h | 11
-rw-r--r-- arch/x86/include/asm/uv/uv_bau.h | 2
-rw-r--r-- arch/x86/include/asm/uv/uv_hub.h | 163
-rw-r--r-- arch/x86/include/asm/uv/uv_irq.h | 14
-rw-r--r-- arch/x86/include/asm/vmx.h | 4
-rw-r--r-- arch/x86/include/asm/x86_init.h | 10
-rw-r--r-- arch/x86/include/asm/xen/hypervisor.h | 27
-rw-r--r-- arch/x86/kernel/Makefile | 3
-rw-r--r-- arch/x86/kernel/acpi/Makefile | 2
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 25
-rw-r--r-- arch/x86/kernel/acpi/cstate.c | 2
-rw-r--r-- arch/x86/kernel/acpi/processor.c | 100
-rw-r--r-- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 3
-rw-r--r-- arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r-- arch/x86/kernel/amd_iommu.c | 1312
-rw-r--r-- arch/x86/kernel/amd_iommu_init.c | 143
-rw-r--r-- arch/x86/kernel/aperture_64.c | 14
-rw-r--r-- arch/x86/kernel/apic/Makefile | 2
-rw-r--r-- arch/x86/kernel/apic/apic.c | 61
-rw-r--r-- arch/x86/kernel/apic/apic_flat_64.c | 10
-rw-r--r-- arch/x86/kernel/apic/apic_noop.c | 200
-rw-r--r-- arch/x86/kernel/apic/bigsmp_32.c | 18
-rw-r--r-- arch/x86/kernel/apic/es7000_32.c | 28
-rw-r--r-- arch/x86/kernel/apic/io_apic.c | 454
-rw-r--r-- arch/x86/kernel/apic/nmi.c | 19
-rw-r--r-- arch/x86/kernel/apic/numaq_32.c | 13
-rw-r--r-- arch/x86/kernel/apic/probe_32.c | 2
-rw-r--r-- arch/x86/kernel/apic/probe_64.c | 13
-rw-r--r-- arch/x86/kernel/apic/summit_32.c | 10
-rw-r--r-- arch/x86/kernel/apic/x2apic_cluster.c | 5
-rw-r--r-- arch/x86/kernel/apic/x2apic_phys.c | 5
-rw-r--r-- arch/x86/kernel/apic/x2apic_uv_x.c | 59
-rw-r--r-- arch/x86/kernel/apm_32.c | 14
-rw-r--r-- arch/x86/kernel/bios_uv.c | 8
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 3
-rw-r--r-- arch/x86/kernel/cpu/addon_cpuid_features.c | 15
-rw-r--r-- arch/x86/kernel/cpu/amd.c | 57
-rw-r--r-- arch/x86/kernel/cpu/centaur.c | 2
-rw-r--r-- arch/x86/kernel/cpu/common.c | 50
-rw-r--r-- arch/x86/kernel/cpu/cpu.h | 2
-rw-r--r-- arch/x86/kernel/cpu/cpu_debug.c | 688
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 72
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/longhaul.c | 4
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 2
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 19
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 37
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 21
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 6
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 24
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 2
-rw-r--r-- arch/x86/kernel/cpu/cyrix.c | 2
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 3
-rw-r--r-- arch/x86/kernel/cpu/intel_cacheinfo.c | 88
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-inject.c | 22
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c | 115
-rw-r--r-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 45
-rw-r--r-- arch/x86/kernel/cpu/mtrr/cleanup.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mtrr/if.c | 11
-rw-r--r-- arch/x86/kernel/cpu/perf_event.c | 248
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r-- arch/x86/kernel/cpu/transmeta.c | 2
-rw-r--r-- arch/x86/kernel/cpuid.c | 24
-rw-r--r-- arch/x86/kernel/crash.c | 5
-rw-r--r-- arch/x86/kernel/crash_dump_32.c | 19
-rw-r--r-- arch/x86/kernel/ds.c | 4
-rw-r--r-- arch/x86/kernel/dumpstack.c | 50
-rw-r--r-- arch/x86/kernel/dumpstack.h | 6
-rw-r--r-- arch/x86/kernel/dumpstack_32.c | 9
-rw-r--r-- arch/x86/kernel/dumpstack_64.c | 83
-rw-r--r-- arch/x86/kernel/e820.c | 13
-rw-r--r-- arch/x86/kernel/efi.c | 2
-rw-r--r-- arch/x86/kernel/entry_32.S | 100
-rw-r--r-- arch/x86/kernel/entry_64.S | 84
-rw-r--r-- arch/x86/kernel/ftrace.c | 84
-rw-r--r-- arch/x86/kernel/geode_32.c | 196
-rw-r--r-- arch/x86/kernel/head32.c | 2
-rw-r--r-- arch/x86/kernel/head64.c | 2
-rw-r--r-- arch/x86/kernel/head_64.S | 4
-rw-r--r-- arch/x86/kernel/hpet.c | 85
-rw-r--r-- arch/x86/kernel/hw_breakpoint.c | 554
-rw-r--r-- arch/x86/kernel/ioport.c | 28
-rw-r--r-- arch/x86/kernel/irq.c | 132
-rw-r--r-- arch/x86/kernel/irq_32.c | 45
-rw-r--r-- arch/x86/kernel/irq_64.c | 58
-rw-r--r-- arch/x86/kernel/irqinit.c | 4
-rw-r--r-- arch/x86/kernel/kgdb.c | 241
-rw-r--r-- arch/x86/kernel/kprobes.c | 261
-rw-r--r-- arch/x86/kernel/machine_kexec_32.c | 2
-rw-r--r-- arch/x86/kernel/machine_kexec_64.c | 2
-rw-r--r-- arch/x86/kernel/mfgpt_32.c | 410
-rw-r--r-- arch/x86/kernel/microcode_amd.c | 61
-rw-r--r-- arch/x86/kernel/microcode_core.c | 28
-rw-r--r-- arch/x86/kernel/microcode_intel.c | 47
-rw-r--r-- arch/x86/kernel/mpparse.c | 3
-rw-r--r-- arch/x86/kernel/msr.c | 25
-rw-r--r-- arch/x86/kernel/olpc.c | 4
-rw-r--r-- arch/x86/kernel/paravirt-spinlocks.c | 4
-rw-r--r-- arch/x86/kernel/pci-calgary_64.c | 100
-rw-r--r-- arch/x86/kernel/pci-dma.c | 47
-rw-r--r-- arch/x86/kernel/pci-gart_64.c | 163
-rw-r--r-- arch/x86/kernel/pci-nommu.c | 11
-rw-r--r-- arch/x86/kernel/pci-swiotlb.c | 21
-rw-r--r-- arch/x86/kernel/process.c | 128
-rw-r--r-- arch/x86/kernel/process_32.c | 111
-rw-r--r-- arch/x86/kernel/process_64.c | 127
-rw-r--r-- arch/x86/kernel/ptrace.c | 480
-rw-r--r-- arch/x86/kernel/quirks.c | 22
-rw-r--r-- arch/x86/kernel/reboot.c | 29
-rw-r--r-- arch/x86/kernel/reboot_fixups_32.c | 3
-rw-r--r-- arch/x86/kernel/setup.c | 39
-rw-r--r-- arch/x86/kernel/setup_percpu.c | 13
-rw-r--r-- arch/x86/kernel/signal.c | 24
-rw-r--r-- arch/x86/kernel/smp.c | 1
-rw-r--r-- arch/x86/kernel/smpboot.c | 58
-rw-r--r-- arch/x86/kernel/stacktrace.c | 18
-rw-r--r-- arch/x86/kernel/sys_i386_32.c | 27
-rw-r--r-- arch/x86/kernel/sys_x86_64.c | 17
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r-- arch/x86/kernel/time.c | 3
-rw-r--r-- arch/x86/kernel/tlb_uv.c | 9
-rw-r--r-- arch/x86/kernel/trampoline.c | 30
-rw-r--r-- arch/x86/kernel/trampoline_64.S | 4
-rw-r--r-- arch/x86/kernel/traps.c | 73
-rw-r--r-- arch/x86/kernel/tsc.c | 1
-rw-r--r-- arch/x86/kernel/tsc_sync.c | 23
-rw-r--r-- arch/x86/kernel/uv_irq.c | 238
-rw-r--r-- arch/x86/kernel/uv_time.c | 93
-rw-r--r-- arch/x86/kernel/visws_quirks.c | 8
-rw-r--r-- arch/x86/kernel/vm86_32.c | 11
-rw-r--r-- arch/x86/kernel/vmi_32.c | 2
-rw-r--r-- arch/x86/kernel/vmiclock_32.c | 2
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 7
-rw-r--r-- arch/x86/kernel/vsyscall_64.c | 7
-rw-r--r-- arch/x86/kernel/x8664_ksyms_64.c | 11
-rw-r--r-- arch/x86/kernel/x86_init.c | 8
-rw-r--r-- arch/x86/kvm/Kconfig | 1
-rw-r--r-- arch/x86/kvm/Makefile | 3
-rw-r--r-- arch/x86/kvm/emulate.c | 159
-rw-r--r-- arch/x86/kvm/i8254.c | 19
-rw-r--r-- arch/x86/kvm/i8259.c | 44
-rw-r--r-- arch/x86/kvm/irq.h | 7
-rw-r--r-- arch/x86/kvm/lapic.c | 22
-rw-r--r-- arch/x86/kvm/mmu.c | 25
-rw-r--r-- arch/x86/kvm/paging_tmpl.h | 23
-rw-r--r-- arch/x86/kvm/svm.c | 395
-rw-r--r-- arch/x86/kvm/trace.h | 165
-rw-r--r-- arch/x86/kvm/vmx.c | 448
-rw-r--r-- arch/x86/kvm/x86.c | 589
-rw-r--r-- arch/x86/lib/.gitignore | 1
-rw-r--r-- arch/x86/lib/Makefile | 17
-rw-r--r-- arch/x86/lib/copy_user_64.S | 14
-rw-r--r-- arch/x86/lib/inat.c | 90
-rw-r--r-- arch/x86/lib/insn.c | 516
-rw-r--r-- arch/x86/lib/msr-smp.c | 204
-rw-r--r-- arch/x86/lib/msr.c | 227
-rw-r--r-- arch/x86/lib/usercopy_32.c | 10
-rw-r--r-- arch/x86/lib/x86-opcode-map.txt | 893
-rw-r--r-- arch/x86/mm/extable.c | 31
-rw-r--r-- arch/x86/mm/fault.c | 13
-rw-r--r-- arch/x86/mm/init_32.c | 3
-rw-r--r-- arch/x86/mm/init_64.c | 19
-rw-r--r-- arch/x86/mm/ioremap.c | 50
-rw-r--r-- arch/x86/mm/kmemcheck/error.c | 19
-rw-r--r-- arch/x86/mm/kmmio.c | 57
-rw-r--r-- arch/x86/mm/mmio-mod.c | 71
-rw-r--r-- arch/x86/mm/pat.c | 10
-rw-r--r-- arch/x86/mm/srat_32.c | 2
-rw-r--r-- arch/x86/mm/srat_64.c | 12
-rw-r--r-- arch/x86/mm/testmmiotrace.c | 29
-rw-r--r-- arch/x86/oprofile/backtrace.c | 9
-rw-r--r-- arch/x86/oprofile/nmi_int.c | 3
-rw-r--r-- arch/x86/pci/Makefile | 5
-rw-r--r-- arch/x86/pci/acpi.c | 74
-rw-r--r-- arch/x86/pci/amd_bus.c | 120
-rw-r--r-- arch/x86/pci/bus_numa.c | 101
-rw-r--r-- arch/x86/pci/bus_numa.h | 27
-rw-r--r-- arch/x86/pci/common.c | 20
-rw-r--r-- arch/x86/pci/early.c | 7
-rw-r--r-- arch/x86/pci/i386.c | 42
-rw-r--r-- arch/x86/pci/mmconfig-shared.c | 356
-rw-r--r-- arch/x86/pci/mmconfig_32.c | 16
-rw-r--r-- arch/x86/pci/mmconfig_64.c | 88
-rw-r--r-- arch/x86/power/cpu.c | 26
-rw-r--r-- arch/x86/tools/Makefile | 31
-rw-r--r-- arch/x86/tools/chkobjdump.awk | 33
-rw-r--r-- arch/x86/tools/distill.awk | 47
-rw-r--r-- arch/x86/tools/gen-insn-attr-x86.awk | 378
-rw-r--r-- arch/x86/tools/test_get_len.c | 173
-rw-r--r-- arch/x86/vdso/vdso32-setup.c | 1
-rw-r--r-- arch/x86/xen/enlighten.c | 51
-rw-r--r-- arch/x86/xen/mmu.c | 2
-rw-r--r-- arch/x86/xen/smp.c | 44
-rw-r--r-- arch/x86/xen/spinlock.c | 16
-rw-r--r-- arch/x86/xen/suspend.c | 17
-rw-r--r-- arch/x86/xen/time.c | 31
-rw-r--r-- arch/x86/xen/xen-asm_64.S | 4
-rw-r--r-- arch/x86/xen/xen-ops.h | 2
291 files changed, 11501 insertions(+), 7240 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c876bace8fdc..eb4092568f9e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -49,7 +49,12 @@ config X86
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
+	select HAVE_KERNEL_LZO
+	select HAVE_HW_BREAKPOINT
+	select PERF_EVENTS
+	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_USER_RETURN_NOTIFIER
 
 config OUTPUT_FORMAT
 	string
@@ -491,7 +496,7 @@ if PARAVIRT_GUEST
 source "arch/x86/xen/Kconfig"
 
 config VMI
-	bool "VMI Guest support"
+	bool "VMI Guest support (DEPRECATED)"
 	select PARAVIRT
 	depends on X86_32
 	---help---
@@ -500,6 +505,15 @@ config VMI
 	  at the moment), by linking the kernel to a GPL-ed ROM module
 	  provided by the hypervisor.
 
+	  As of September 2009, VMware has started a phased retirement
+	  of this feature from VMware's products. Please see
+	  feature-removal-schedule.txt for details. If you are
+	  planning to enable this option, please note that you cannot
+	  live migrate a VMI enabled VM to a future VMware product,
+	  which doesn't support VMI. So if you expect your kernel to
+	  seamlessly migrate to newer VMware products, keep this
+	  disabled.
+
 config KVM_CLOCK
 	bool "KVM paravirtualized clock"
 	select PARAVIRT
@@ -975,12 +989,6 @@ config X86_CPUID
 	  with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
 	  /dev/cpu/31/cpuid.
 
-config X86_CPU_DEBUG
-	tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
-	---help---
-	  If you select this option, this will provide various x86 CPUs
-	  information through debugfs.
-
 choice
 	prompt "High Memory Support"
 	default HIGHMEM4G if !X86_NUMAQ
@@ -1233,6 +1241,11 @@ config ARCH_MEMORY_PROBE
 	def_bool X86_64
 	depends on MEMORY_HOTPLUG
 
+config ILLEGAL_POINTER_VALUE
+	hex
+	default 0 if X86_32
+	default 0xdead000000000000 if X86_64
+
 source "mm/Kconfig"
 
 config HIGHPTE
@@ -1321,7 +1334,9 @@ config MATH_EMULATION
 	  kernel, it won't hurt.
 
 config MTRR
-	bool "MTRR (Memory Type Range Register) support"
+	bool
+	default y
+	prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
 	---help---
 	  On Intel P6 family processors (Pentium Pro, Pentium II and later)
 	  the Memory Type Range Registers (MTRRs) may be used to control
@@ -1387,7 +1402,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
 
 config X86_PAT
 	bool
-	prompt "x86 PAT support"
+	default y
+	prompt "x86 PAT support" if EMBEDDED
 	depends on MTRR
 	---help---
 	  Use PAT attributes to setup page level cache control.
@@ -1434,12 +1450,8 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
-config CC_STACKPROTECTOR_ALL
-	bool
-
 config CC_STACKPROTECTOR
 	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
-	select CC_STACKPROTECTOR_ALL
 	---help---
 	  This option turns on the -fstack-protector GCC feature. This
 	  feature puts, at the beginning of functions, a canary value on
@@ -1597,7 +1609,7 @@ config COMPAT_VDSO
 	depends on X86_32 || IA32_EMULATION
 	---help---
 	  Map the 32-bit VDSO to the predictable old-style address too.
-	---help---
+
 	  Say N here if you are running a sufficiently recent glibc
 	  version (2.3.3 or later), to remove the high-mapped
 	  VDSO mapping and to exclusively use the randomized VDSO.
@@ -2002,18 +2014,9 @@ config SCx200HR_TIMER
 	  processor goes idle (as is done by the scheduler). The
 	  other workaround is idle=poll boot option.
 
-config GEODE_MFGPT_TIMER
-	def_bool y
-	prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
-	depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
-	---help---
-	  This driver provides a clock event source based on the MFGPT
-	  timer(s) in the CS5535 and CS5536 companion chip for the geode.
-	  MFGPTs have a better resolution and max interval than the
-	  generic PIT, and are suitable for use as high-res timers.
-
 config OLPC
 	bool "One Laptop Per Child support"
+	select GPIOLIB
 	default n
 	---help---
 	  Add support for detecting the unique features of the OLPC
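
A note on the ILLEGAL_POINTER_VALUE symbol added above: a minimal, userspace-style sketch of how such a poison constant is typically consumed, patterned on include/linux/poison.h (the helper name here is illustrative and not part of this patch). On x86-64 the constant lands in the non-canonical address hole, so any dereference of a poisoned pointer faults immediately instead of silently reading stale memory.

#define POISON_POINTER_DELTA 0xdead000000000000UL /* CONFIG_ILLEGAL_POINTER_VALUE */
#define LIST_POISON1 ((void *)(0x00100100UL + POISON_POINTER_DELTA))
#define LIST_POISON2 ((void *)(0x00200200UL + POISON_POINTER_DELTA))

struct list_head { struct list_head *next, *prev; };

static void list_del_poisoned(struct list_head *entry)
{
	/* unlink, then poison, so a use-after-delete faults loudly */
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}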
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 621f2bd0ef56..f20ddf84a893 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -402,7 +402,7 @@ config X86_CMPXCHG64
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM)
+	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..bc01e3ebfeb2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
 config HAVE_MMIOTRACE_SUPPORT
 	def_bool y
 
+config X86_DECODER_SELFTEST
+	bool "x86 instruction decoder selftest"
+	depends on DEBUG_KERNEL && KPROBES
+	---help---
+	  Perform x86 instruction decoder selftests at build time.
+	  This option is useful for checking the sanity of x86 instruction
+	  decoder code.
+	  If unsure, say "N".
+
 #
 # IO delay types:
 #
@@ -287,4 +296,18 @@ config OPTIMIZE_INLINING
 
 	  If unsure, say N.
 
+config DEBUG_STRICT_USER_COPY_CHECKS
+	bool "Strict copy size checks"
+	depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING
+	---help---
+	  Enabling this option turns a certain set of sanity checks for user
+	  copy operations into compile time failures.
+
+	  The copy_from_user() etc checks are there to help test if there
+	  are sufficient security checks on the length argument of
+	  the copy operation, by having gcc prove that the argument is
+	  within bounds.
+
+	  If unsure, or if you run an older (pre 4.4) gcc, say N.
+
 endmenu
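
A minimal sketch of the mechanism behind the new DEBUG_STRICT_USER_COPY_CHECKS option (names are illustrative, not the kernel's exact implementation): when gcc 4.4+ can prove at compile time that the requested length exceeds the destination object, the call is redirected to a declaration carrying the error attribute, so the broken copy fails the build instead of compiling.

extern void __bad_copy_user(void)
	__attribute__((error("copy size exceeds destination buffer")));

unsigned long _copy_from_user(void *to, const void *from, unsigned long n);

static inline unsigned long
copy_from_user_checked(void *to, const void *from, unsigned long n)
{
	unsigned long sz = __builtin_object_size(to, 0);

	/* provably-oversized constant length: fail at compile time */
	if (sz != (unsigned long)-1 && __builtin_constant_p(n) && n > sz)
		__bad_copy_user();
	return _copy_from_user(to, from, n);
}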
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a012ee8ef803..78b32be55e9e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -76,7 +76,6 @@ ifdef CONFIG_CC_STACKPROTECTOR
         cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
         ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y)
                 stackp-y := -fstack-protector
-                stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
                 KBUILD_CFLAGS += $(stackp-y)
         else
                 $(warning stack protector enabled but no compiler support)
@@ -156,6 +155,9 @@ all: bzImage
 KBUILD_IMAGE := $(boot)/bzImage
 
 bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+	$(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
 	$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
 	$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
 	$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 30e9a264f69d..1255d953c65d 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -41,11 +41,18 @@ cflags-$(CONFIG_X86_ELAN) += -march=i486
 
 # Geode GX1 support
 cflags-$(CONFIG_MGEODEGX1)	+= -march=pentium-mmx
-
+cflags-$(CONFIG_MGEODE_LX)	+= $(call cc-option,-march=geode,-march=pentium-mmx)
 # add at the end to overwrite eventual tuning options from earlier
 # cpu entries
 cflags-$(CONFIG_X86_GENERIC)	+= $(call tune,generic,$(call tune,i686))
 
+# Work around the pentium-mmx code generator madness of gcc4.4.x which
+# does stack alignment by generating horrible code _before_ the mcount
+# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
+# tracer assumptions. For i686, generic, core2 this is set by the
+# compiler anyway
+cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
+
 # Bug fix for binutils: this option is required in order to keep
 # binutils from generating NOPL instructions against our will.
 ifneq ($(CONFIG_X86_P6_NOP),y)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f8ed0658404c..fbb47daf2459 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,11 +4,12 @@
 # create a compressed vmlinux image from the original vmlinux
 #
 
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o
 
 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+cflags-$(CONFIG_X86_32) := -march=i386
 cflags-$(CONFIG_X86_64) := -mcmodel=small
 KBUILD_CFLAGS += $(cflags-y)
 KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
@@ -48,10 +49,13 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,bzip2)
 $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzma)
+$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
+	$(call if_changed,lzo)
 
 suffix-$(CONFIG_KERNEL_GZIP) := gz
 suffix-$(CONFIG_KERNEL_BZIP2) := bz2
 suffix-$(CONFIG_KERNEL_LZMA) := lzma
+suffix-$(CONFIG_KERNEL_LZO)  := lzo
 
 quiet_cmd_mkpiggy = MKPIGGY $@
       cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 842b2a36174a..3b22fe8ab91b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -162,6 +162,10 @@ static int lines, cols;
 #include "../../../../lib/decompress_unlzma.c"
 #endif
 
+#ifdef CONFIG_KERNEL_LZO
+#include "../../../../lib/decompress_unlzo.c"
+#endif
+
 static void scroll(void)
 {
 	int i;
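
For orientation, each lib/decompress_un*.c file pulled in above supplies one entry point following the kernel's common decompressor calling convention, roughly as below (a sketch from memory of include/linux/decompress/generic.h; exact parameter names may differ):

/* Either the whole compressed image sits in inbuf/len, or fill() refills
 * the buffer; output is written to outbuf or drained through flush();
 * error() reports failure, and the return value is 0 on success. */
typedef int (*decompress_fn)(unsigned char *inbuf, int len,
			     int (*fill)(void *buf, unsigned int size),
			     int (*flush)(void *buf, unsigned int size),
			     unsigned char *outbuf, int *posp,
			     void (*error)(char *x));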
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index bbeb0c3fbd90..89bbf4e4d05d 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -9,6 +9,9 @@
 #include <byteswap.h>
 #define USE_BSD
 #include <endian.h>
+#include <regex.h>
+
+static void die(char *fmt, ...);
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 static Elf32_Ehdr ehdr;
@@ -30,25 +33,47 @@ static struct section *secs;
  * the address for which it has been compiled. Don't warn user about
  * absolute relocations present w.r.t these symbols.
  */
-static const char* safe_abs_relocs[] = {
-	"xen_irq_disable_direct_reloc",
-	"xen_save_fl_direct_reloc",
-};
+static const char abs_sym_regex[] =
+	"^(xen_irq_disable_direct_reloc$|"
+	"xen_save_fl_direct_reloc$|"
+	"VDSO|"
+	"__crc_)";
+static regex_t abs_sym_regex_c;
+static int is_abs_reloc(const char *sym_name)
+{
+	return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0);
+}
 
-static int is_safe_abs_reloc(const char* sym_name)
+/*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+static const char rel_sym_regex[] =
+	"^_end$";
+static regex_t rel_sym_regex_c;
+static int is_rel_reloc(const char *sym_name)
 {
-	int i;
+	return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0);
+}
 
-	for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) {
-		if (!strcmp(sym_name, safe_abs_relocs[i]))
-			/* Match found */
-			return 1;
-	}
-	if (strncmp(sym_name, "VDSO", 4) == 0)
-		return 1;
-	if (strncmp(sym_name, "__crc_", 6) == 0)
-		return 1;
-	return 0;
+static void regex_init(void)
+{
+	char errbuf[128];
+	int err;
+
+	err = regcomp(&abs_sym_regex_c, abs_sym_regex,
+		      REG_EXTENDED|REG_NOSUB);
+	if (err) {
+		regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf);
+		die("%s", errbuf);
+	}
+
+	err = regcomp(&rel_sym_regex_c, rel_sym_regex,
+		      REG_EXTENDED|REG_NOSUB);
+	if (err) {
+		regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf);
+		die("%s", errbuf);
+	}
 }
 
 static void die(char *fmt, ...)
@@ -131,7 +156,7 @@ static const char *rel_type(unsigned type)
 #undef REL_TYPE
 	};
 	const char *name = "unknown type rel type name";
-	if (type < ARRAY_SIZE(type_name)) {
+	if (type < ARRAY_SIZE(type_name) && type_name[type]) {
 		name = type_name[type];
 	}
 	return name;
@@ -448,7 +473,7 @@ static void print_absolute_relocs(void)
 		 * Before warning check if this absolute symbol
 		 * relocation is harmless.
 		 */
-		if (is_safe_abs_reloc(name))
+		if (is_abs_reloc(name) || is_rel_reloc(name))
 			continue;
 
 		if (!printed) {
@@ -501,21 +526,26 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
 			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
 			r_type = ELF32_R_TYPE(rel->r_info);
 			/* Don't visit relocations to absolute symbols */
-			if (sym->st_shndx == SHN_ABS) {
+			if (sym->st_shndx == SHN_ABS &&
+			    !is_rel_reloc(sym_name(sym_strtab, sym))) {
 				continue;
 			}
-			if (r_type == R_386_NONE || r_type == R_386_PC32) {
+			switch (r_type) {
+			case R_386_NONE:
+			case R_386_PC32:
 				/*
 				 * NONE can be ignored and and PC relative
 				 * relocations don't need to be adjusted.
 				 */
-			}
-			else if (r_type == R_386_32) {
+				break;
+			case R_386_32:
 				/* Visit relocations that need to be adjusted */
 				visit(rel, sym);
-			}
-			else {
-				die("Unsupported relocation type: %d\n", r_type);
+				break;
+			default:
+				die("Unsupported relocation type: %s (%d)\n",
+				    rel_type(r_type), r_type);
+				break;
 			}
 		}
 	}
@@ -571,16 +601,15 @@ static void emit_relocs(int as_text)
 	}
 	else {
 		unsigned char buf[4];
-		buf[0] = buf[1] = buf[2] = buf[3] = 0;
 		/* Print a stop */
-		printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
+		fwrite("\0\0\0\0", 4, 1, stdout);
 		/* Now print each relocation */
 		for (i = 0; i < reloc_count; i++) {
			buf[0] = (relocs[i] >> 0) & 0xff;
			buf[1] = (relocs[i] >> 8) & 0xff;
			buf[2] = (relocs[i] >> 16) & 0xff;
			buf[3] = (relocs[i] >> 24) & 0xff;
-			printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
+			fwrite(buf, 4, 1, stdout);
 		}
 	}
 }
@@ -598,6 +627,8 @@ int main(int argc, char **argv)
 	FILE *fp;
 	int i;
 
+	regex_init();
+
 	show_absolute_syms = 0;
 	show_absolute_relocs = 0;
 	as_text = 0;
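
A standalone sketch of the POSIX-regex classification that relocs.c now performs (the main() harness and sample names here are illustrative; the real tool compiles both patterns in regex_init() and calls is_abs_reloc()/is_rel_reloc() while walking relocations):

#include <regex.h>
#include <stdio.h>

int main(void)
{
	static const char *names[] = { "VDSO32_PRELINK", "__crc_printk", "_end", "foo" };
	regex_t re;
	unsigned int i;

	/* same pattern as abs_sym_regex above */
	if (regcomp(&re, "^(xen_irq_disable_direct_reloc$|"
			 "xen_save_fl_direct_reloc$|VDSO|__crc_)",
		    REG_EXTENDED|REG_NOSUB))
		return 1;
	for (i = 0; i < sizeof(names)/sizeof(names[0]); i++)
		printf("%-16s %s\n", names[i],
		       regexec(&re, names[i], 0, NULL, 0) ?
		       "not known-absolute" : "known-absolute, no warning");
	regfree(&re);
	return 0;
}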
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b31cc54b4641..93e689f4bd86 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -16,7 +16,7 @@
  */
 
 #include <asm/segment.h>
-#include <linux/utsrelease.h>
+#include <generated/utsrelease.h>
 #include <asm/boot.h>
 #include <asm/e820.h>
 #include <asm/page_types.h>
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 0f6ec455a2b1..03c0683636b6 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -53,6 +53,9 @@ SECTIONS
 
 	/DISCARD/ : { *(.note*) }
 
+	/*
+	 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+	 */
 	. = ASSERT(_end <= 0x8000, "Setup too big!");
 	. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
 	/* Necessary for the very-old-loader check to work... */
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
index 2723d9b5ce43..2b15aa488ffb 100644
--- a/arch/x86/boot/version.c
+++ b/arch/x86/boot/version.c
@@ -13,8 +13,8 @@
  */
 
 #include "boot.h"
-#include <linux/utsrelease.h>
-#include <linux/compile.h>
+#include <generated/utsrelease.h>
+#include <generated/compile.h>
 
 const char kernel_version[] =
 	UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..f767164cd5df 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
 
 	boot_params.screen_info.orig_x = oreg.dl;
 	boot_params.screen_info.orig_y = oreg.dh;
+
+	if (oreg.ch & 0x20)
+		boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
+
+	if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
+		boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
 }
 
 static void store_video_mode(void)
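
What the two new checks above decode (a sketch; register layout per the INT 10h, AH=03h convention the surrounding code relies on): CH and CL hold the cursor start and end scan lines, bit 5 of CH is the legacy "cursor disabled" flag, and a start scan line greater than the end scan line also yields an invisible cursor, so both cases set VIDEO_FLAGS_NOCURSOR.

#include <stdbool.h>
#include <stdint.h>

static bool cursor_hidden(uint8_t ch, uint8_t cl)
{
	if (ch & 0x20)			/* explicit "cursor off" bit */
		return true;
	if ((ch & 0x1f) > (cl & 0x1f))	/* start scan line past end */
		return true;
	return false;
}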
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index cfb0010fa940..1a58ad89fdf7 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb0566e83319..20bb0e1ac681 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -16,6 +16,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/inst.h>
 
 .text
 
@@ -122,103 +123,72 @@ ENTRY(aesni_set_key)
 	movups 0x10(%rsi), %xmm2	# other user key
 	movaps %xmm2, (%rcx)
 	add $0x10, %rcx
-	# aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
 	call _key_expansion_256a
-	# aeskeygenassist $0x1, %xmm0, %xmm1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+	AESKEYGENASSIST 0x1 %xmm0 %xmm1
 	call _key_expansion_256b
-	# aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
 	call _key_expansion_256a
-	# aeskeygenassist $0x2, %xmm0, %xmm1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+	AESKEYGENASSIST 0x2 %xmm0 %xmm1
 	call _key_expansion_256b
-	# aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
 	call _key_expansion_256a
-	# aeskeygenassist $0x4, %xmm0, %xmm1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+	AESKEYGENASSIST 0x4 %xmm0 %xmm1
 	call _key_expansion_256b
-	# aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
 	call _key_expansion_256a
-	# aeskeygenassist $0x8, %xmm0, %xmm1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+	AESKEYGENASSIST 0x8 %xmm0 %xmm1
 	call _key_expansion_256b
-	# aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
 	call _key_expansion_256a
-	# aeskeygenassist $0x10, %xmm0, %xmm1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+	AESKEYGENASSIST 0x10 %xmm0 %xmm1
 	call _key_expansion_256b
-	# aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
 	call _key_expansion_256a
-	# aeskeygenassist $0x20, %xmm0, %xmm1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+	AESKEYGENASSIST 0x20 %xmm0 %xmm1
 	call _key_expansion_256b
-	# aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
 	call _key_expansion_256a
 	jmp .Ldec_key
 .Lenc_key192:
 	movq 0x10(%rsi), %xmm2		# other user key
-	# aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
 	call _key_expansion_192a
-	# aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
 	call _key_expansion_192b
-	# aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
 	call _key_expansion_192a
-	# aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
 	call _key_expansion_192b
-	# aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
 	call _key_expansion_192a
-	# aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
 	call _key_expansion_192b
-	# aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
 	call _key_expansion_192a
-	# aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
+	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
 	call _key_expansion_192b
 	jmp .Ldec_key
 .Lenc_key128:
-	# aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
 	call _key_expansion_128
-	# aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
 	call _key_expansion_128
-	# aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
 	call _key_expansion_128
-	# aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
 	call _key_expansion_128
-	# aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
 	call _key_expansion_128
-	# aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
 	call _key_expansion_128
-	# aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
+	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
 	call _key_expansion_128
-	# aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
+	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
 	call _key_expansion_128
-	# aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
+	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
 	call _key_expansion_128
-	# aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
-	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
+	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
 	call _key_expansion_128
 .Ldec_key:
 	sub $0x10, %rcx
231.align 4 201.align 4
232.Ldec_key_loop: 202.Ldec_key_loop:
233 movaps (%rdi), %xmm0 203 movaps (%rdi), %xmm0
234 # aesimc %xmm0, %xmm1 204 AESIMC %xmm0 %xmm1
235 .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
236 movaps %xmm1, (%rsi) 205 movaps %xmm1, (%rsi)
237 add $0x10, %rdi 206 add $0x10, %rdi
238 sub $0x10, %rsi 207 sub $0x10, %rsi
@@ -274,51 +243,37 @@ _aesni_enc1:
 	je .Lenc192
 	add $0x20, TKEYP
 	movaps -0x60(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps -0x50(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 .align 4
 .Lenc192:
 	movaps -0x40(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps -0x30(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 .align 4
 .Lenc128:
 	movaps -0x20(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps -0x10(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps (TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps 0x10(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps 0x20(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps 0x30(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps 0x40(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps 0x50(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps 0x60(TKEYP), KEY
-	# aesenc KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	AESENC KEY STATE
 	movaps 0x70(TKEYP), KEY
-	# aesenclast KEY, STATE	# last round
-	.byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
+	AESENCLAST KEY STATE
 	ret
 
 /*
@@ -353,135 +308,79 @@ _aesni_enc4:
 	je .L4enc192
 	add $0x20, TKEYP
 	movaps -0x60(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps -0x50(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 #.align 4
 .L4enc192:
 	movaps -0x40(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps -0x30(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 #.align 4
 .L4enc128:
 	movaps -0x20(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps -0x10(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps (TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps 0x10(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps 0x20(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps 0x30(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps 0x40(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps 0x50(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps 0x60(TKEYP), KEY
-	# aesenc KEY, STATE1
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
-	# aesenc KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
-	# aesenc KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
-	# aesenc KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
 	movaps 0x70(TKEYP), KEY
-	# aesenclast KEY, STATE1	# last round
-	.byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
-	# aesenclast KEY, STATE2
-	.byte 0x66, 0x0f, 0x38, 0xdd, 0xe2
-	# aesenclast KEY, STATE3
-	.byte 0x66, 0x0f, 0x38, 0xdd, 0xea
-	# aesenclast KEY, STATE4
-	.byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
+	AESENCLAST KEY STATE1		# last round
+	AESENCLAST KEY STATE2
+	AESENCLAST KEY STATE3
+	AESENCLAST KEY STATE4
 	ret
 
 /*
@@ -518,51 +417,37 @@ _aesni_dec1:
 	je .Ldec192
 	add $0x20, TKEYP
 	movaps -0x60(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps -0x50(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 .align 4
 .Ldec192:
 	movaps -0x40(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps -0x30(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 .align 4
 .Ldec128:
 	movaps -0x20(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps -0x10(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps (TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps 0x10(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps 0x20(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps 0x30(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps 0x40(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps 0x50(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps 0x60(TKEYP), KEY
-	# aesdec KEY, STATE
-	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	AESDEC KEY STATE
 	movaps 0x70(TKEYP), KEY
-	# aesdeclast KEY, STATE	# last round
-	.byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
+	AESDECLAST KEY STATE
 	ret
 
 /*
@@ -597,135 +482,79 @@ _aesni_dec4:
597 je .L4dec192 482 je .L4dec192
598 add $0x20, TKEYP 483 add $0x20, TKEYP
599 movaps -0x60(TKEYP), KEY 484 movaps -0x60(TKEYP), KEY
600 # aesdec KEY, STATE1 485 AESDEC KEY STATE1
601 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 486 AESDEC KEY STATE2
602 # aesdec KEY, STATE2 487 AESDEC KEY STATE3
603 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 488 AESDEC KEY STATE4
604 # aesdec KEY, STATE3
605 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
606 # aesdec KEY, STATE4
607 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
608 movaps -0x50(TKEYP), KEY 489 movaps -0x50(TKEYP), KEY
609 # aesdec KEY, STATE1 490 AESDEC KEY STATE1
610 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 491 AESDEC KEY STATE2
611 # aesdec KEY, STATE2 492 AESDEC KEY STATE3
612 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 493 AESDEC KEY STATE4
613 # aesdec KEY, STATE3
614 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
615 # aesdec KEY, STATE4
616 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
617.align 4 494.align 4
618.L4dec192: 495.L4dec192:
619 movaps -0x40(TKEYP), KEY 496 movaps -0x40(TKEYP), KEY
620 # aesdec KEY, STATE1 497 AESDEC KEY STATE1
621 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 498 AESDEC KEY STATE2
622 # aesdec KEY, STATE2 499 AESDEC KEY STATE3
623 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 500 AESDEC KEY STATE4
624 # aesdec KEY, STATE3
625 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
626 # aesdec KEY, STATE4
627 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
628 movaps -0x30(TKEYP), KEY 501 movaps -0x30(TKEYP), KEY
629 # aesdec KEY, STATE1 502 AESDEC KEY STATE1
630 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 503 AESDEC KEY STATE2
631 # aesdec KEY, STATE2 504 AESDEC KEY STATE3
632 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 505 AESDEC KEY STATE4
633 # aesdec KEY, STATE3
634 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
635 # aesdec KEY, STATE4
636 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
637.align 4 506.align 4
638.L4dec128: 507.L4dec128:
639 movaps -0x20(TKEYP), KEY 508 movaps -0x20(TKEYP), KEY
640 # aesdec KEY, STATE1 509 AESDEC KEY STATE1
641 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 510 AESDEC KEY STATE2
642 # aesdec KEY, STATE2 511 AESDEC KEY STATE3
643 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 512 AESDEC KEY STATE4
644 # aesdec KEY, STATE3
645 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
646 # aesdec KEY, STATE4
647 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
648 movaps -0x10(TKEYP), KEY 513 movaps -0x10(TKEYP), KEY
649 # aesdec KEY, STATE1 514 AESDEC KEY STATE1
650 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 515 AESDEC KEY STATE2
651 # aesdec KEY, STATE2 516 AESDEC KEY STATE3
652 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 517 AESDEC KEY STATE4
653 # aesdec KEY, STATE3
654 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
655 # aesdec KEY, STATE4
656 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
657 movaps (TKEYP), KEY 518 movaps (TKEYP), KEY
658 # aesdec KEY, STATE1 519 AESDEC KEY STATE1
659 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 520 AESDEC KEY STATE2
660 # aesdec KEY, STATE2 521 AESDEC KEY STATE3
661 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 522 AESDEC KEY STATE4
662 # aesdec KEY, STATE3
663 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
664 # aesdec KEY, STATE4
665 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
666 movaps 0x10(TKEYP), KEY 523 movaps 0x10(TKEYP), KEY
667 # aesdec KEY, STATE1 524 AESDEC KEY STATE1
668 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 525 AESDEC KEY STATE2
669 # aesdec KEY, STATE2 526 AESDEC KEY STATE3
670 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 527 AESDEC KEY STATE4
671 # aesdec KEY, STATE3
672 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
673 # aesdec KEY, STATE4
674 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
675 movaps 0x20(TKEYP), KEY 528 movaps 0x20(TKEYP), KEY
676 # aesdec KEY, STATE1 529 AESDEC KEY STATE1
677 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 530 AESDEC KEY STATE2
678 # aesdec KEY, STATE2 531 AESDEC KEY STATE3
679 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 532 AESDEC KEY STATE4
680 # aesdec KEY, STATE3
681 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
682 # aesdec KEY, STATE4
683 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
684 movaps 0x30(TKEYP), KEY 533 movaps 0x30(TKEYP), KEY
685 # aesdec KEY, STATE1 534 AESDEC KEY STATE1
686 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 535 AESDEC KEY STATE2
687 # aesdec KEY, STATE2 536 AESDEC KEY STATE3
688 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 537 AESDEC KEY STATE4
689 # aesdec KEY, STATE3
690 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
691 # aesdec KEY, STATE4
692 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
693 movaps 0x40(TKEYP), KEY 538 movaps 0x40(TKEYP), KEY
694 # aesdec KEY, STATE1 539 AESDEC KEY STATE1
695 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 540 AESDEC KEY STATE2
696 # aesdec KEY, STATE2 541 AESDEC KEY STATE3
697 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 542 AESDEC KEY STATE4
698 # aesdec KEY, STATE3
699 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
700 # aesdec KEY, STATE4
701 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
702 movaps 0x50(TKEYP), KEY 543 movaps 0x50(TKEYP), KEY
703 # aesdec KEY, STATE1 544 AESDEC KEY STATE1
704 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 545 AESDEC KEY STATE2
705 # aesdec KEY, STATE2 546 AESDEC KEY STATE3
706 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 547 AESDEC KEY STATE4
707 # aesdec KEY, STATE3
708 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
709 # aesdec KEY, STATE4
710 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
711 movaps 0x60(TKEYP), KEY 548 movaps 0x60(TKEYP), KEY
712 # aesdec KEY, STATE1 549 AESDEC KEY STATE1
713 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 550 AESDEC KEY STATE2
714 # aesdec KEY, STATE2 551 AESDEC KEY STATE3
715 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 552 AESDEC KEY STATE4
716 # aesdec KEY, STATE3
717 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
718 # aesdec KEY, STATE4
719 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
720 movaps 0x70(TKEYP), KEY 553 movaps 0x70(TKEYP), KEY
721 # aesdeclast KEY, STATE1 # last round 554 AESDECLAST KEY STATE1 # last round
722 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 555 AESDECLAST KEY STATE2
723 # aesdeclast KEY, STATE2 556 AESDECLAST KEY STATE3
724 .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 557 AESDECLAST KEY STATE4
725 # aesdeclast KEY, STATE3
726 .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
727 # aesdeclast KEY, STATE4
728 .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
729 ret 558 ret
730 559
731/* 560/*
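
For reference, the AESDEC/AESDECLAST macros substituted on the right come from the new asm/inst.h (added elsewhere in this series, per the diffstat) and assemble to exactly the opcode bytes the hand-written .byte lines encoded. A hedged decode of one instance; the register assignment is inferred from the old comments, not stated in this hunk:

    /* Sketch: the bytes behind "# aesdec KEY, STATE1", assuming
     * STATE1 = %xmm0 and KEY = %xmm2 as the ModRM values suggest.
     */
    static const unsigned char aesdec_key_state1[] = {
            0x66, 0x0f, 0x38, 0xde, /* AESDEC opcode (66 0F 38 DE /r) */
            0xc2,                   /* ModRM: mod=11, reg=000 (%xmm0, dst),
                                       rm=010 (%xmm2, src) */
    };

AESDECLAST differs only in the last opcode byte (0xdf instead of 0xde), which matches the final-round .byte sequences above.
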
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 585edebe12cf..49c552c060e9 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -82,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
82 return -EINVAL; 82 return -EINVAL;
83 } 83 }
84 84
85 if (irq_fpu_usable()) 85 if (!irq_fpu_usable())
86 err = crypto_aes_expand_key(ctx, in_key, key_len); 86 err = crypto_aes_expand_key(ctx, in_key, key_len);
87 else { 87 else {
88 kernel_fpu_begin(); 88 kernel_fpu_begin();
@@ -103,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
103{ 103{
104 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); 104 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
105 105
106 if (irq_fpu_usable()) 106 if (!irq_fpu_usable())
107 crypto_aes_encrypt_x86(ctx, dst, src); 107 crypto_aes_encrypt_x86(ctx, dst, src);
108 else { 108 else {
109 kernel_fpu_begin(); 109 kernel_fpu_begin();
@@ -116,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
116{ 116{
117 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); 117 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
118 118
119 if (irq_fpu_usable()) 119 if (!irq_fpu_usable())
120 crypto_aes_decrypt_x86(ctx, dst, src); 120 crypto_aes_decrypt_x86(ctx, dst, src);
121 else { 121 else {
122 kernel_fpu_begin(); 122 kernel_fpu_begin();
@@ -342,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req)
342 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); 342 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
343 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 343 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
344 344
345 if (irq_fpu_usable()) { 345 if (!irq_fpu_usable()) {
346 struct ablkcipher_request *cryptd_req = 346 struct ablkcipher_request *cryptd_req =
347 ablkcipher_request_ctx(req); 347 ablkcipher_request_ctx(req);
348 memcpy(cryptd_req, req, sizeof(*req)); 348 memcpy(cryptd_req, req, sizeof(*req));
@@ -363,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req)
363 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); 363 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
364 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 364 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
365 365
366 if (irq_fpu_usable()) { 366 if (!irq_fpu_usable()) {
367 struct ablkcipher_request *cryptd_req = 367 struct ablkcipher_request *cryptd_req =
368 ablkcipher_request_ctx(req); 368 ablkcipher_request_ctx(req);
369 memcpy(cryptd_req, req, sizeof(*req)); 369 memcpy(cryptd_req, req, sizeof(*req));
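
The glue-code hunks above are a polarity fix: the C fallback paths (crypto_aes_expand_key(), crypto_aes_encrypt_x86(), the cryptd deferral) must run when the FPU is *not* usable in the current context, and the direct AES-NI path must sit between kernel_fpu_begin() and kernel_fpu_end(). A hedged sketch of the corrected shape; aesni_enc() is an assumed callee here, since the begin/end body is elided from the hunk:

    static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
    {
            struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));

            if (!irq_fpu_usable()) {
                    /* e.g. interrupt context that preempted FPU code */
                    crypto_aes_encrypt_x86(ctx, dst, src);
            } else {
                    kernel_fpu_begin();
                    aesni_enc(ctx, dst, src);       /* assumed name */
                    kernel_fpu_end();
            }
    }
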
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000000..1eb7f90cb7b9
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains accelerated part of ghash
4 * implementation. More information about PCLMULQDQ can be found at:
5 *
6 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/inst.h>
21
22.data
23
24.align 16
25.Lbswap_mask:
26 .octa 0x000102030405060708090a0b0c0d0e0f
27.Lpoly:
28 .octa 0xc2000000000000000000000000000001
29.Ltwo_one:
30 .octa 0x00000001000000000000000000000001
31
32#define DATA %xmm0
33#define SHASH %xmm1
34#define T1 %xmm2
35#define T2 %xmm3
36#define T3 %xmm4
37#define BSWAP %xmm5
38#define IN1 %xmm6
39
40.text
41
42/*
43 * __clmul_gf128mul_ble: internal ABI
44 * input:
45 * DATA: operand1
46 * SHASH: operand2, hash_key << 1 mod poly
47 * output:
48 * DATA: operand1 * operand2 mod poly
49 * changed:
50 * T1
51 * T2
52 * T3
53 */
54__clmul_gf128mul_ble:
55 movaps DATA, T1
56 pshufd $0b01001110, DATA, T2
57 pshufd $0b01001110, SHASH, T3
58 pxor DATA, T2
59 pxor SHASH, T3
60
61 PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
62 PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
63 PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
64 pxor DATA, T2
65 pxor T1, T2 # T2 = a0 * b1 + a1 * b0
66
67 movaps T2, T3
68 pslldq $8, T3
69 psrldq $8, T2
70 pxor T3, DATA
71 pxor T2, T1 # <T1:DATA> is result of
72 # carry-less multiplication
73
74 # first phase of the reduction
75 movaps DATA, T3
76 psllq $1, T3
77 pxor DATA, T3
78 psllq $5, T3
79 pxor DATA, T3
80 psllq $57, T3
81 movaps T3, T2
82 pslldq $8, T2
83 psrldq $8, T3
84 pxor T2, DATA
85 pxor T3, T1
86
87 # second phase of the reduction
88 movaps DATA, T2
89 psrlq $5, T2
90 pxor DATA, T2
91 psrlq $1, T2
92 pxor DATA, T2
93 psrlq $1, T2
94 pxor T2, T1
95 pxor T1, DATA
96 ret
97
98/* void clmul_ghash_mul(char *dst, const be128 *shash) */
99ENTRY(clmul_ghash_mul)
100 movups (%rdi), DATA
101 movups (%rsi), SHASH
102 movaps .Lbswap_mask, BSWAP
103 PSHUFB_XMM BSWAP DATA
104 call __clmul_gf128mul_ble
105 PSHUFB_XMM BSWAP DATA
106 movups DATA, (%rdi)
107 ret
108
109/*
110 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
111 * const be128 *shash);
112 */
113ENTRY(clmul_ghash_update)
114 cmp $16, %rdx
115 jb .Lupdate_just_ret # check length
116 movaps .Lbswap_mask, BSWAP
117 movups (%rdi), DATA
118 movups (%rcx), SHASH
119 PSHUFB_XMM BSWAP DATA
120.align 4
121.Lupdate_loop:
122 movups (%rsi), IN1
123 PSHUFB_XMM BSWAP IN1
124 pxor IN1, DATA
125 call __clmul_gf128mul_ble
126 sub $16, %rdx
127 add $16, %rsi
128 cmp $16, %rdx
129 jge .Lupdate_loop
130 PSHUFB_XMM BSWAP DATA
131 movups DATA, (%rdi)
132.Lupdate_just_ret:
133 ret
134
135/*
136 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
137 *
138 * Calculate hash_key << 1 mod poly
139 */
140ENTRY(clmul_ghash_setkey)
141 movaps .Lbswap_mask, BSWAP
142 movups (%rsi), %xmm0
143 PSHUFB_XMM BSWAP %xmm0
144 movaps %xmm0, %xmm1
145 psllq $1, %xmm0
146 psrlq $63, %xmm1
147 movaps %xmm1, %xmm2
148 pslldq $8, %xmm1
149 psrldq $8, %xmm2
150 por %xmm1, %xmm0
151 # reduction
152 pshufd $0b00100100, %xmm2, %xmm1
153 pcmpeqd .Ltwo_one, %xmm1
154 pand .Lpoly, %xmm1
155 pxor %xmm1, %xmm0
156 movups %xmm0, (%rdi)
157 ret
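
Per 16-byte block, the update loop above computes the standard GHASH chaining step, DATA = (DATA xor X_i) * H in GF(2^128): the pxor folds the block in, and __clmul_gf128mul_ble does the carry-less multiply plus reduction. A hedged C restatement of just the loop structure; gf128mul_ble() is a placeholder for the asm routine, be128_xor() is from <crypto/b128ops.h>, and the PSHUFB byte-swapping is omitted:

    /* Sketch only, not the driver's code. */
    static void ghash_blocks(be128 *digest, const u8 *src,
                             unsigned int srclen, const be128 *shash)
    {
            while (srclen >= 16) {
                    be128 in;

                    memcpy(&in, src, 16);
                    be128_xor(digest, digest, &in); /* Y ^= X_i   */
                    gf128mul_ble(digest, shash);    /* Y  = Y * H */
                    src += 16;
                    srclen -= 16;
            }
    }
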
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000000..cbcc8d8ea93a
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains glue code.
4 *
5 * Copyright (c) 2009 Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License version 2 as published
10 * by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/crypto.h>
17#include <crypto/algapi.h>
18#include <crypto/cryptd.h>
19#include <crypto/gf128mul.h>
20#include <crypto/internal/hash.h>
21#include <asm/i387.h>
22
23#define GHASH_BLOCK_SIZE 16
24#define GHASH_DIGEST_SIZE 16
25
26void clmul_ghash_mul(char *dst, const be128 *shash);
27
28void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
29 const be128 *shash);
30
31void clmul_ghash_setkey(be128 *shash, const u8 *key);
32
33struct ghash_async_ctx {
34 struct cryptd_ahash *cryptd_tfm;
35};
36
37struct ghash_ctx {
38 be128 shash;
39};
40
41struct ghash_desc_ctx {
42 u8 buffer[GHASH_BLOCK_SIZE];
43 u32 bytes;
44};
45
46static int ghash_init(struct shash_desc *desc)
47{
48 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
49
50 memset(dctx, 0, sizeof(*dctx));
51
52 return 0;
53}
54
55static int ghash_setkey(struct crypto_shash *tfm,
56 const u8 *key, unsigned int keylen)
57{
58 struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
59
60 if (keylen != GHASH_BLOCK_SIZE) {
61 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
62 return -EINVAL;
63 }
64
65 clmul_ghash_setkey(&ctx->shash, key);
66
67 return 0;
68}
69
70static int ghash_update(struct shash_desc *desc,
71 const u8 *src, unsigned int srclen)
72{
73 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
74 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
75 u8 *dst = dctx->buffer;
76
77 kernel_fpu_begin();
78 if (dctx->bytes) {
79 int n = min(srclen, dctx->bytes);
80 u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
81
82 dctx->bytes -= n;
83 srclen -= n;
84
85 while (n--)
86 *pos++ ^= *src++;
87
88 if (!dctx->bytes)
89 clmul_ghash_mul(dst, &ctx->shash);
90 }
91
92 clmul_ghash_update(dst, src, srclen, &ctx->shash);
93 kernel_fpu_end();
94
95 if (srclen & 0xf) {
96 src += srclen - (srclen & 0xf);
97 srclen &= 0xf;
98 dctx->bytes = GHASH_BLOCK_SIZE - srclen;
99 while (srclen--)
100 *dst++ ^= *src++;
101 }
102
103 return 0;
104}
105
106static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
107{
108 u8 *dst = dctx->buffer;
109
110 if (dctx->bytes) {
111 u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
112
113 while (dctx->bytes--)
114 *tmp++ ^= 0;
115
116 kernel_fpu_begin();
117 clmul_ghash_mul(dst, &ctx->shash);
118 kernel_fpu_end();
119 }
120
121 dctx->bytes = 0;
122}
123
124static int ghash_final(struct shash_desc *desc, u8 *dst)
125{
126 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
127 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
128 u8 *buf = dctx->buffer;
129
130 ghash_flush(ctx, dctx);
131 memcpy(dst, buf, GHASH_BLOCK_SIZE);
132
133 return 0;
134}
135
136static struct shash_alg ghash_alg = {
137 .digestsize = GHASH_DIGEST_SIZE,
138 .init = ghash_init,
139 .update = ghash_update,
140 .final = ghash_final,
141 .setkey = ghash_setkey,
142 .descsize = sizeof(struct ghash_desc_ctx),
143 .base = {
144 .cra_name = "__ghash",
145 .cra_driver_name = "__ghash-pclmulqdqni",
146 .cra_priority = 0,
147 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
148 .cra_blocksize = GHASH_BLOCK_SIZE,
149 .cra_ctxsize = sizeof(struct ghash_ctx),
150 .cra_module = THIS_MODULE,
151 .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
152 },
153};
154
155static int ghash_async_init(struct ahash_request *req)
156{
157 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
158 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
159 struct ahash_request *cryptd_req = ahash_request_ctx(req);
160 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
161
162 if (!irq_fpu_usable()) {
163 memcpy(cryptd_req, req, sizeof(*req));
164 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
165 return crypto_ahash_init(cryptd_req);
166 } else {
167 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
168 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
169
170 desc->tfm = child;
171 desc->flags = req->base.flags;
172 return crypto_shash_init(desc);
173 }
174}
175
176static int ghash_async_update(struct ahash_request *req)
177{
178 struct ahash_request *cryptd_req = ahash_request_ctx(req);
179
180 if (!irq_fpu_usable()) {
181 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
182 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
183 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
184
185 memcpy(cryptd_req, req, sizeof(*req));
186 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
187 return crypto_ahash_update(cryptd_req);
188 } else {
189 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
190 return shash_ahash_update(req, desc);
191 }
192}
193
194static int ghash_async_final(struct ahash_request *req)
195{
196 struct ahash_request *cryptd_req = ahash_request_ctx(req);
197
198 if (!irq_fpu_usable()) {
199 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
200 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
201 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
202
203 memcpy(cryptd_req, req, sizeof(*req));
204 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
205 return crypto_ahash_final(cryptd_req);
206 } else {
207 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
208 return crypto_shash_final(desc, req->result);
209 }
210}
211
212static int ghash_async_digest(struct ahash_request *req)
213{
214 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
215 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
216 struct ahash_request *cryptd_req = ahash_request_ctx(req);
217 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
218
219 if (!irq_fpu_usable()) {
220 memcpy(cryptd_req, req, sizeof(*req));
221 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
222 return crypto_ahash_digest(cryptd_req);
223 } else {
224 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
225 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
226
227 desc->tfm = child;
228 desc->flags = req->base.flags;
229 return shash_ahash_digest(req, desc);
230 }
231}
232
233static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
234 unsigned int keylen)
235{
236 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
237 struct crypto_ahash *child = &ctx->cryptd_tfm->base;
238 int err;
239
240 crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
241 crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
242 & CRYPTO_TFM_REQ_MASK);
243 err = crypto_ahash_setkey(child, key, keylen);
244 crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
245 & CRYPTO_TFM_RES_MASK);
246
247	return err;
248}
249
250static int ghash_async_init_tfm(struct crypto_tfm *tfm)
251{
252 struct cryptd_ahash *cryptd_tfm;
253 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
254
255 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
256 if (IS_ERR(cryptd_tfm))
257 return PTR_ERR(cryptd_tfm);
258 ctx->cryptd_tfm = cryptd_tfm;
259 crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
260 sizeof(struct ahash_request) +
261 crypto_ahash_reqsize(&cryptd_tfm->base));
262
263 return 0;
264}
265
266static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
267{
268 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
269
270 cryptd_free_ahash(ctx->cryptd_tfm);
271}
272
273static struct ahash_alg ghash_async_alg = {
274 .init = ghash_async_init,
275 .update = ghash_async_update,
276 .final = ghash_async_final,
277 .setkey = ghash_async_setkey,
278 .digest = ghash_async_digest,
279 .halg = {
280 .digestsize = GHASH_DIGEST_SIZE,
281 .base = {
282 .cra_name = "ghash",
283 .cra_driver_name = "ghash-clmulni",
284 .cra_priority = 400,
285 .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
286 .cra_blocksize = GHASH_BLOCK_SIZE,
287 .cra_type = &crypto_ahash_type,
288 .cra_module = THIS_MODULE,
289 .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
290 .cra_init = ghash_async_init_tfm,
291 .cra_exit = ghash_async_exit_tfm,
292 },
293 },
294};
295
296static int __init ghash_pclmulqdqni_mod_init(void)
297{
298 int err;
299
300 if (!cpu_has_pclmulqdq) {
301 printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
302 " detected.\n");
303 return -ENODEV;
304 }
305
306 err = crypto_register_shash(&ghash_alg);
307 if (err)
308 goto err_out;
309 err = crypto_register_ahash(&ghash_async_alg);
310 if (err)
311 goto err_shash;
312
313 return 0;
314
315err_shash:
316 crypto_unregister_shash(&ghash_alg);
317err_out:
318 return err;
319}
320
321static void __exit ghash_pclmulqdqni_mod_exit(void)
322{
323 crypto_unregister_ahash(&ghash_async_alg);
324 crypto_unregister_shash(&ghash_alg);
325}
326
327module_init(ghash_pclmulqdqni_mod_init);
328module_exit(ghash_pclmulqdqni_mod_exit);
329
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
332		"accelerated by PCLMULQDQ-NI");
333MODULE_ALIAS("ghash");
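
The two-level registration above is the usual cryptd arrangement: the inner "__ghash" shash (priority 0, name prefixed so it is never selected directly) does the PCLMULQDQ work, while the exported "ghash-clmulni" ahash wins ordinary "ghash" lookups through its priority of 400 and defers to a cryptd worker whenever irq_fpu_usable() fails. A hedged sketch of a caller landing on this driver; ghash_probe() is an illustrative name:

    static int ghash_probe(void)
    {
            /* "ghash" resolves to the highest-priority provider, which
             * on PCLMULQDQ-capable CPUs is now ghash-clmulni (400).
             */
            struct crypto_ahash *tfm = crypto_alloc_ahash("ghash", 0, 0);

            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);
            pr_info("ghash driver: %s\n",
                    crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)));
            crypto_free_ahash(tfm);
            return 0;
    }
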
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2a4d073d2cf1..f9f472462753 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -308,14 +308,15 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
308 if (retval) 308 if (retval)
309 return retval; 309 return retval;
310 310
311 regs->cs = __USER32_CS;
312 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
313 regs->r13 = regs->r14 = regs->r15 = 0;
314
315 /* OK, This is the point of no return */ 311 /* OK, This is the point of no return */
316 set_personality(PER_LINUX); 312 set_personality(PER_LINUX);
317 set_thread_flag(TIF_IA32); 313 set_thread_flag(TIF_IA32);
318 clear_thread_flag(TIF_ABI_PENDING); 314
315 setup_new_exec(bprm);
316
317 regs->cs = __USER32_CS;
318 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
319 regs->r13 = regs->r14 = regs->r15 = 0;
319 320
320 current->mm->end_code = ex.a_text + 321 current->mm->end_code = ex.a_text +
321 (current->mm->start_code = N_TXTADDR(ex)); 322 (current->mm->start_code = N_TXTADDR(ex));
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 1733f9f65e82..53147ad85b96 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -204,7 +204,7 @@ sysexit_from_sys_call:
204 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ 204 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
205 .endm 205 .endm
206 206
207 .macro auditsys_exit exit,ebpsave=RBP 207 .macro auditsys_exit exit
208 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 208 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
209 jnz ia32_ret_from_sys_call 209 jnz ia32_ret_from_sys_call
210 TRACE_IRQS_ON 210 TRACE_IRQS_ON
@@ -217,7 +217,6 @@ sysexit_from_sys_call:
217 call audit_syscall_exit 217 call audit_syscall_exit
218 GET_THREAD_INFO(%r10) 218 GET_THREAD_INFO(%r10)
219 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ 219 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
220 movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
221 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 220 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
222 cli 221 cli
223 TRACE_IRQS_OFF 222 TRACE_IRQS_OFF
@@ -351,7 +350,7 @@ cstar_auditsys:
351 jmp cstar_dispatch 350 jmp cstar_dispatch
352 351
353sysretl_audit: 352sysretl_audit:
354 auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */ 353 auditsys_exit sysretl_from_sys_call
355#endif 354#endif
356 355
357cstar_tracesys: 356cstar_tracesys:
@@ -654,7 +653,7 @@ ia32_sys_call_table:
654 .quad compat_sys_writev 653 .quad compat_sys_writev
655 .quad sys_getsid 654 .quad sys_getsid
656 .quad sys_fdatasync 655 .quad sys_fdatasync
657 .quad sys32_sysctl /* sysctl */ 656 .quad compat_sys_sysctl /* sysctl */
658 .quad sys_mlock /* 150 */ 657 .quad sys_mlock /* 150 */
659 .quad sys_munlock 658 .quad sys_munlock
660 .quad sys_mlockall 659 .quad sys_mlockall
@@ -697,7 +696,7 @@ ia32_sys_call_table:
697 .quad quiet_ni_syscall /* streams2 */ 696 .quad quiet_ni_syscall /* streams2 */
698 .quad stub32_vfork /* 190 */ 697 .quad stub32_vfork /* 190 */
699 .quad compat_sys_getrlimit 698 .quad compat_sys_getrlimit
700 .quad sys32_mmap2 699 .quad sys_mmap_pgoff
701 .quad sys32_truncate64 700 .quad sys32_truncate64
702 .quad sys32_ftruncate64 701 .quad sys32_ftruncate64
703 .quad sys32_stat64 /* 195 */ 702 .quad sys32_stat64 /* 195 */
@@ -842,4 +841,5 @@ ia32_sys_call_table:
842 .quad compat_sys_pwritev 841 .quad compat_sys_pwritev
843 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
844 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg
845ia32_syscall_end: 845ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 9f5527198825..422572c77923 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -155,9 +155,6 @@ struct mmap_arg_struct {
155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) 155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
156{ 156{
157 struct mmap_arg_struct a; 157 struct mmap_arg_struct a;
158 struct file *file = NULL;
159 unsigned long retval;
160 struct mm_struct *mm ;
161 158
162 if (copy_from_user(&a, arg, sizeof(a))) 159 if (copy_from_user(&a, arg, sizeof(a)))
163 return -EFAULT; 160 return -EFAULT;
@@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
165 if (a.offset & ~PAGE_MASK) 162 if (a.offset & ~PAGE_MASK)
166 return -EINVAL; 163 return -EINVAL;
167 164
168 if (!(a.flags & MAP_ANONYMOUS)) { 165 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
169 file = fget(a.fd);
170 if (!file)
171 return -EBADF;
172 }
173
174 mm = current->mm;
175 down_write(&mm->mmap_sem);
176 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
177 a.offset>>PAGE_SHIFT); 166 a.offset>>PAGE_SHIFT);
178 if (file)
179 fput(file);
180
181 up_write(&mm->mmap_sem);
182
183 return retval;
184} 167}
185 168
186asmlinkage long sys32_mprotect(unsigned long start, size_t len, 169asmlinkage long sys32_mprotect(unsigned long start, size_t len,
@@ -434,62 +417,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
434 return ret; 417 return ret;
435} 418}
436 419
437#ifdef CONFIG_SYSCTL_SYSCALL
438struct sysctl_ia32 {
439 unsigned int name;
440 int nlen;
441 unsigned int oldval;
442 unsigned int oldlenp;
443 unsigned int newval;
444 unsigned int newlen;
445 unsigned int __unused[4];
446};
447
448
449asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
450{
451 struct sysctl_ia32 a32;
452 mm_segment_t old_fs = get_fs();
453 void __user *oldvalp, *newvalp;
454 size_t oldlen;
455 int __user *namep;
456 long ret;
457
458 if (copy_from_user(&a32, args32, sizeof(a32)))
459 return -EFAULT;
460
461 /*
462 * We need to pre-validate these because we have to disable
463 * address checking before calling do_sysctl() because of
464 * OLDLEN but we can't run the risk of the user specifying bad
465 * addresses here. Well, since we're dealing with 32 bit
466 * addresses, we KNOW that access_ok() will always succeed, so
467 * this is an expensive NOP, but so what...
468 */
469 namep = compat_ptr(a32.name);
470 oldvalp = compat_ptr(a32.oldval);
471 newvalp = compat_ptr(a32.newval);
472
473 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
474 || !access_ok(VERIFY_WRITE, namep, 0)
475 || !access_ok(VERIFY_WRITE, oldvalp, 0)
476 || !access_ok(VERIFY_WRITE, newvalp, 0))
477 return -EFAULT;
478
479 set_fs(KERNEL_DS);
480 lock_kernel();
481 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
482 newvalp, (size_t) a32.newlen);
483 unlock_kernel();
484 set_fs(old_fs);
485
486 if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
487 return -EFAULT;
488
489 return ret;
490}
491#endif
492
493/* warning: next two assume little endian */ 420/* warning: next two assume little endian */
494asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, 421asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
495 u32 poslo, u32 poshi) 422 u32 poslo, u32 poshi)
@@ -539,30 +466,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
539 return ret; 466 return ret;
540} 467}
541 468
542asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
543 unsigned long prot, unsigned long flags,
544 unsigned long fd, unsigned long pgoff)
545{
546 struct mm_struct *mm = current->mm;
547 unsigned long error;
548 struct file *file = NULL;
549
550 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
551 if (!(flags & MAP_ANONYMOUS)) {
552 file = fget(fd);
553 if (!file)
554 return -EBADF;
555 }
556
557 down_write(&mm->mmap_sem);
558 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
559 up_write(&mm->mmap_sem);
560
561 if (file)
562 fput(file);
563 return error;
564}
565
566asmlinkage long sys32_olduname(struct oldold_utsname __user *name) 469asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
567{ 470{
568 char *arch = "x86_64"; 471 char *arch = "x86_64";
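
Both removed wrappers repeated the same fget()/do_mmap_pgoff()/fput() boilerplate under mmap_sem; this series hoists that into a common sys_mmap_pgoff(). A hedged reconstruction of the consolidated helper, pieced together from the code deleted here (the real definition lives in mm/ and may differ in detail):

    SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                    unsigned long, prot, unsigned long, flags,
                    unsigned long, fd, unsigned long, pgoff)
    {
            struct file *file = NULL;
            unsigned long retval;

            flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
            if (!(flags & MAP_ANONYMOUS)) {
                    file = fget(fd);
                    if (!file)
                            return -EBADF;
            }

            down_write(&current->mm->mmap_sem);
            retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
            up_write(&current->mm->mmap_sem);

            if (file)
                    fput(file);
            return retval;
    }
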
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..9f828f87ca35 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
10header-y += sigcontext32.h 10header-y += sigcontext32.h
11header-y += ucontext.h 11header-y += ucontext.h
12header-y += processor-flags.h 12header-y += processor-flags.h
13header-y += hw_breakpoint.h
13 14
14unifdef-y += e820.h 15unifdef-y += e820.h
15unifdef-y += ist.h 16unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e397aa84..7a15588e45d4 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
17 17
18#include <linux/user.h> 18#include <linux/user.h>
19#include <linux/elfcore.h> 19#include <linux/elfcore.h>
20#include <asm/debugreg.h>
20 21
21/* 22/*
22 * fill in the user structure for an a.out core dump 23 * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
32 >> PAGE_SHIFT; 33 >> PAGE_SHIFT;
33 dump->u_dsize -= dump->u_tsize; 34 dump->u_dsize -= dump->u_tsize;
34 dump->u_ssize = 0; 35 dump->u_ssize = 0;
35 dump->u_debugreg[0] = current->thread.debugreg0; 36 aout_dump_debugregs(dump);
36 dump->u_debugreg[1] = current->thread.debugreg1;
37 dump->u_debugreg[2] = current->thread.debugreg2;
38 dump->u_debugreg[3] = current->thread.debugreg3;
39 dump->u_debugreg[4] = 0;
40 dump->u_debugreg[5] = 0;
41 dump->u_debugreg[6] = current->thread.debugreg6;
42 dump->u_debugreg[7] = current->thread.debugreg7;
43 37
44 if (dump->start_stack < TASK_SIZE) 38 if (dump->start_stack < TASK_SIZE)
45 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) 39 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 60d2b2db0bc5..56f462cf22d2 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -142,6 +142,32 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
142 return max_cstate; 142 return max_cstate;
143} 143}
144 144
145static inline bool arch_has_acpi_pdc(void)
146{
147 struct cpuinfo_x86 *c = &cpu_data(0);
148 return (c->x86_vendor == X86_VENDOR_INTEL ||
149 c->x86_vendor == X86_VENDOR_CENTAUR);
150}
151
152static inline void arch_acpi_set_pdc_bits(u32 *buf)
153{
154 struct cpuinfo_x86 *c = &cpu_data(0);
155
156 buf[2] |= ACPI_PDC_C_CAPABILITY_SMP;
157
158 if (cpu_has(c, X86_FEATURE_EST))
159 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
160
161 if (cpu_has(c, X86_FEATURE_ACPI))
162 buf[2] |= ACPI_PDC_T_FFH;
163
164 /*
165 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
166 */
167 if (!cpu_has(c, X86_FEATURE_MWAIT))
168 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
169}
170
145#else /* !CONFIG_ACPI */ 171#else /* !CONFIG_ACPI */
146 172
147#define acpi_lapic 0 173#define acpi_lapic 0
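
arch_acpi_set_pdc_bits() ORs the x86-specific capability bits into word 2 of the _PDC buffer before the generic ACPI code hands it to the firmware. A hedged sketch of the caller's side; the three-word layout (revision, word count, capabilities) is assumed from the _PDC convention, not shown in this hunk:

    u32 capbuf[3];

    capbuf[0] = ACPI_PDC_REVISION_ID;       /* buffer revision */
    capbuf[1] = 1;                          /* one capability word */
    capbuf[2] = 0;                          /* arch hook fills this in */

    arch_acpi_set_pdc_bits(capbuf);         /* adds SMP/EST/FFH bits */
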
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e2077d343c33..b97f786a48d5 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -1,17 +1,13 @@
1#ifdef __ASSEMBLY__ 1#ifdef __ASSEMBLY__
2 2
3#ifdef CONFIG_X86_32 3#include <asm/asm.h>
4# define X86_ALIGN .long
5#else
6# define X86_ALIGN .quad
7#endif
8 4
9#ifdef CONFIG_SMP 5#ifdef CONFIG_SMP
10 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
111: lock 71: lock
12 .section .smp_locks,"a" 8 .section .smp_locks,"a"
13 .align 4 9 _ASM_ALIGN
14 X86_ALIGN 1b 10 _ASM_PTR 1b
15 .previous 11 .previous
16 .endm 12 .endm
17#else 13#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c240efc74e00..69b74a7b877f 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -84,6 +84,7 @@ static inline void alternatives_smp_switch(int smp) {}
84 " .byte " __stringify(feature) "\n" /* feature bit */ \ 84 " .byte " __stringify(feature) "\n" /* feature bit */ \
85 " .byte 662b-661b\n" /* sourcelen */ \ 85 " .byte 662b-661b\n" /* sourcelen */ \
86 " .byte 664f-663f\n" /* replacementlen */ \ 86 " .byte 664f-663f\n" /* replacementlen */ \
87 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
87 ".previous\n" \ 88 ".previous\n" \
88 ".section .altinstr_replacement, \"ax\"\n" \ 89 ".section .altinstr_replacement, \"ax\"\n" \
89 "663:\n\t" newinstr "\n664:\n" /* replacement */ \ 90 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index ac95995b7bad..5af2982133b5 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -23,18 +23,13 @@
23#include <linux/irqreturn.h> 23#include <linux/irqreturn.h>
24 24
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void); 26
27extern int amd_iommu_init_dma_ops(void);
28extern int amd_iommu_init_passthrough(void);
29extern void amd_iommu_detect(void); 27extern void amd_iommu_detect(void);
30extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 28
31extern void amd_iommu_flush_all_domains(void);
32extern void amd_iommu_flush_all_devices(void);
33extern void amd_iommu_shutdown(void);
34#else 29#else
35static inline int amd_iommu_init(void) { return -ENODEV; } 30
36static inline void amd_iommu_detect(void) { } 31static inline void amd_iommu_detect(void) { }
37static inline void amd_iommu_shutdown(void) { } 32
38#endif 33#endif
39 34
40#endif /* _ASM_X86_AMD_IOMMU_H */ 35#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
new file mode 100644
index 000000000000..d2544f1d705d
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published
7 * by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H
21
22struct amd_iommu;
23
24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
27extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31extern int amd_iommu_init_devices(void);
32extern void amd_iommu_uninit_devices(void);
33extern void amd_iommu_init_notifier(void);
34extern void amd_iommu_init_api(void);
35#ifndef CONFIG_AMD_IOMMU_STATS
36
37static inline void amd_iommu_stats_init(void) { }
38
39#endif /* !CONFIG_AMD_IOMMU_STATS */
40
41#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 2a2cc7a78a81..ba19ad4c47d0 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,6 +25,11 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26 26
27/* 27/*
28 * Maximum number of IOMMUs supported
29 */
30#define MAX_IOMMUS 32
31
32/*
28 * some size calculation constants 33 * some size calculation constants
29 */ 34 */
30#define DEV_TABLE_ENTRY_SIZE 32 35#define DEV_TABLE_ENTRY_SIZE 32
@@ -206,6 +211,9 @@ extern bool amd_iommu_dump;
206 printk(KERN_INFO "AMD-Vi: " format, ## arg); \ 211 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
207 } while(0); 212 } while(0);
208 213
214/* global flag if IOMMUs cache non-present entries */
215extern bool amd_iommu_np_cache;
216
209/* 217/*
210 * Make iterating over all IOMMUs easier 218 * Make iterating over all IOMMUs easier
211 */ 219 */
@@ -226,6 +234,8 @@ extern bool amd_iommu_dump;
226 * independent of their use. 234 * independent of their use.
227 */ 235 */
228struct protection_domain { 236struct protection_domain {
237 struct list_head list; /* for list of all protection domains */
238 struct list_head dev_list; /* List of all devices in this domain */
229 spinlock_t lock; /* mostly used to lock the page table*/ 239 spinlock_t lock; /* mostly used to lock the page table*/
230 u16 id; /* the domain id written to the device table */ 240 u16 id; /* the domain id written to the device table */
231 int mode; /* paging mode (0-6 levels) */ 241 int mode; /* paging mode (0-6 levels) */
@@ -233,7 +243,20 @@ struct protection_domain {
233 unsigned long flags; /* flags to find out type of domain */ 243 unsigned long flags; /* flags to find out type of domain */
234 bool updated; /* complete domain flush required */ 244 bool updated; /* complete domain flush required */
235 unsigned dev_cnt; /* devices assigned to this domain */ 245 unsigned dev_cnt; /* devices assigned to this domain */
246 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
236 void *priv; /* private data */ 247 void *priv; /* private data */
248
249};
250
251/*
252 * This struct contains device specific data for the IOMMU
253 */
254struct iommu_dev_data {
255 struct list_head list; /* For domain->dev_list */
256	struct device *dev; /* Device this data belongs to */
257 struct device *alias; /* The Alias Device */
258 struct protection_domain *domain; /* Domain the device is bound to */
259	atomic_t bind; /* Domain attach reference count */
237}; 260};
238 261
239/* 262/*
@@ -291,6 +314,9 @@ struct dma_ops_domain {
291struct amd_iommu { 314struct amd_iommu {
292 struct list_head list; 315 struct list_head list;
293 316
317 /* Index within the IOMMU array */
318 int index;
319
294 /* locks the accesses to the hardware */ 320 /* locks the accesses to the hardware */
295 spinlock_t lock; 321 spinlock_t lock;
296 322
@@ -357,6 +383,21 @@ struct amd_iommu {
357extern struct list_head amd_iommu_list; 383extern struct list_head amd_iommu_list;
358 384
359/* 385/*
386 * Array with pointers to each IOMMU struct
387 * The indices are referenced in the protection domains
388 */
389extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
390
391/* Number of IOMMUs present in the system */
392extern int amd_iommus_present;
393
394/*
395 * Declarations for the global list of all protection domains
396 */
397extern spinlock_t amd_iommu_pd_lock;
398extern struct list_head amd_iommu_pd_list;
399
400/*
360 * Structure defining one entry in the device table 401 * Structure defining one entry in the device table
361 */ 402 */
362struct dev_table_entry { 403struct dev_table_entry {
@@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order;
416/* largest PCI device id we expect translation requests for */ 457/* largest PCI device id we expect translation requests for */
417extern u16 amd_iommu_last_bdf; 458extern u16 amd_iommu_last_bdf;
418 459
419/* data structures for protection domain handling */
420extern struct protection_domain **amd_iommu_pd_table;
421
422/* allocation bitmap for domain ids */ 460/* allocation bitmap for domain ids */
423extern unsigned long *amd_iommu_pd_alloc_bitmap; 461extern unsigned long *amd_iommu_pd_alloc_bitmap;
424 462
425/* will be 1 if device isolation is enabled */
426extern bool amd_iommu_isolate;
427
428/* 463/*
429 * If true, the addresses will be flushed on unmap time, not when 464 * If true, the addresses will be flushed on unmap time, not when
430 * they are reused 465 * they are reused
@@ -462,11 +497,6 @@ struct __iommu_counter {
462#define ADD_STATS_COUNTER(name, x) 497#define ADD_STATS_COUNTER(name, x)
463#define SUB_STATS_COUNTER(name, x) 498#define SUB_STATS_COUNTER(name, x)
464 499
465static inline void amd_iommu_stats_init(void) { }
466
467#endif /* CONFIG_AMD_IOMMU_STATS */ 500#endif /* CONFIG_AMD_IOMMU_STATS */
468 501
469/* some function prototypes */
470extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
471
472#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ 502#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
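
Taken together, the new fields give each protection domain a device list plus a per-IOMMU reference count, so flushes can be limited to the IOMMUs that actually serve the domain. A hedged sketch of the attach-side bookkeeping these fields imply; do_attach() and get_iommu_for() are assumed names, not from this hunk:

    static void do_attach(struct iommu_dev_data *dev_data,
                          struct protection_domain *domain)
    {
            struct amd_iommu *iommu = get_iommu_for(dev_data->dev);

            dev_data->domain = domain;
            list_add(&dev_data->list, &domain->dev_list);
            atomic_inc(&dev_data->bind);

            domain->dev_iommu[iommu->index] += 1;   /* per-IOMMU count */
            domain->dev_cnt += 1;
    }
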
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 474d80d3e6cc..b4ac2cdcb64f 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -297,20 +297,20 @@ struct apic {
297 int disable_esr; 297 int disable_esr;
298 298
299 int dest_logical; 299 int dest_logical;
300 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); 300 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
301 unsigned long (*check_apicid_present)(int apicid); 301 unsigned long (*check_apicid_present)(int apicid);
302 302
303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); 303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
304 void (*init_apic_ldr)(void); 304 void (*init_apic_ldr)(void);
305 305
306 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); 306 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
307 307
308 void (*setup_apic_routing)(void); 308 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 309 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid); 310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu); 311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 312 int (*cpu_present_to_apicid)(int mps_cpu);
313 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); 313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 314 void (*setup_portio_remap)(void);
315 int (*check_phys_apicid_present)(int phys_apicid); 315 int (*check_phys_apicid_present)(int phys_apicid);
316 void (*enable_apic_mode)(void); 316 void (*enable_apic_mode)(void);
@@ -488,6 +488,8 @@ static inline unsigned int read_apic_id(void)
488 488
489extern void default_setup_apic_routing(void); 489extern void default_setup_apic_routing(void);
490 490
491extern struct apic apic_noop;
492
491#ifdef CONFIG_X86_32 493#ifdef CONFIG_X86_32
492 494
493extern struct apic apic_default; 495extern struct apic apic_default;
@@ -532,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
532 return (unsigned int)(mask1 & mask2 & mask3); 534 return (unsigned int)(mask1 & mask2 & mask3);
533} 535}
534 536
535static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) 537static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
536{ 538{
537 return physid_isset(apicid, bitmap); 539 return physid_isset(apicid, *map);
538} 540}
539 541
540static inline unsigned long default_check_apicid_present(int bit) 542static inline unsigned long default_check_apicid_present(int bit)
@@ -542,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit)
542 return physid_isset(bit, phys_cpu_present_map); 544 return physid_isset(bit, phys_cpu_present_map);
543} 545}
544 546
545static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map) 547static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
546{ 548{
547 return phys_map; 549 *retmap = *phys_map;
548} 550}
549 551
550/* Mapping from cpu number to logical apicid */ 552/* Mapping from cpu number to logical apicid */
@@ -583,11 +585,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
583extern int default_check_phys_apicid_present(int phys_apicid); 585extern int default_check_phys_apicid_present(int phys_apicid);
584#endif 586#endif
585 587
586static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
587{
588 return physid_mask_of_physid(phys_apicid);
589}
590
591#endif /* CONFIG_X86_LOCAL_APIC */ 588#endif /* CONFIG_X86_LOCAL_APIC */
592 589
593#ifdef CONFIG_X86_32 590#ifdef CONFIG_X86_32
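
Every signature change in this hunk trades a physid_mask_t passed or returned by value for a pointer. With MAX_LOCAL_APIC at 32768 on 64-bit (see the deleted apicnum.h below), a physid_mask_t can be a 4 KB bitmap, so the old style copied up to 4 KB across the stack per call. Side by side, as a sketch:

    /* before: bitmap copied by value, in and out */
    physid_mask_t map = apic->ioapic_phys_id_map(phys_cpu_present_map);

    /* after: caller supplies storage, callee fills it in place */
    physid_mask_t map2;
    apic->ioapic_phys_id_map(&phys_cpu_present_map, &map2);
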
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3b62da926de9..7fe3b3060f08 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -11,6 +11,12 @@
11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
12#define APIC_DEFAULT_PHYS_BASE 0xfee00000 12#define APIC_DEFAULT_PHYS_BASE 0xfee00000
13 13
14/*
15 * This is the IO-APIC register space as specified
16 * by Intel docs:
17 */
18#define IO_APIC_SLOT_SIZE 1024
19
14#define APIC_ID 0x20 20#define APIC_ID 0x20
15 21
16#define APIC_LVR 0x30 22#define APIC_LVR 0x30
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
deleted file mode 100644
index 82f613c607ce..000000000000
--- a/arch/x86/include/asm/apicnum.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _ASM_X86_APICNUM_H
2#define _ASM_X86_APICNUM_H
3
4/* define MAX_IO_APICS */
5#ifdef CONFIG_X86_32
6# define MAX_IO_APICS 64
7#else
8# define MAX_IO_APICS 128
9# define MAX_LOCAL_APIC 32768
10#endif
11
12#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h
new file mode 100644
index 000000000000..d370ee36a182
--- /dev/null
+++ b/arch/x86/include/asm/asm-offsets.h
@@ -0,0 +1 @@
#include <generated/asm-offsets.h>
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index d9cf1cd156d2..f654d1bb17fb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -22,14 +22,14 @@ do { \
22 ".popsection" \ 22 ".popsection" \
23 : : "i" (__FILE__), "i" (__LINE__), \ 23 : : "i" (__FILE__), "i" (__LINE__), \
24 "i" (sizeof(struct bug_entry))); \ 24 "i" (sizeof(struct bug_entry))); \
25 for (;;) ; \ 25 unreachable(); \
26} while (0) 26} while (0)
27 27
28#else 28#else
29#define BUG() \ 29#define BUG() \
30do { \ 30do { \
31 asm volatile("ud2"); \ 31 asm volatile("ud2"); \
32 for (;;) ; \ 32 unreachable(); \
33} while (0) 33} while (0)
34#endif 34#endif
35 35
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index eebb2cd2b9bf..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { } 12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma, 13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { } 14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15static inline void flush_dcache_page(struct page *page) { } 16static inline void flush_dcache_page(struct page *page) { }
16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } 18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb62aa7..0918654305af 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@ struct cal_chipset_ops {
62extern int use_calgary; 62extern int use_calgary;
63 63
64#ifdef CONFIG_CALGARY_IOMMU 64#ifdef CONFIG_CALGARY_IOMMU
65extern int calgary_iommu_init(void);
66extern void detect_calgary(void); 65extern void detect_calgary(void);
67#else 66#else
68static inline int calgary_iommu_init(void) { return 1; }
69static inline void detect_calgary(void) { return; } 67static inline void detect_calgary(void) { return; }
70#endif 68#endif
71 69
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ee1931be6593..ffb9bb6b6c37 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -8,14 +8,50 @@
8 * you need to test for the feature in boot_cpu_data. 8 * you need to test for the feature in boot_cpu_data.
9 */ 9 */
10 10
11#define xchg(ptr, v) \ 11extern void __xchg_wrong_size(void);
12 ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) 12
13/*
14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
15 * Note 2: xchg has side effect, so that attribute volatile is necessary,
16 * but generally the primitive is invalid, *ptr is output argument. --ANK
17 */
13 18
14struct __xchg_dummy { 19struct __xchg_dummy {
15 unsigned long a[100]; 20 unsigned long a[100];
16}; 21};
17#define __xg(x) ((struct __xchg_dummy *)(x)) 22#define __xg(x) ((struct __xchg_dummy *)(x))
18 23
24#define __xchg(x, ptr, size) \
25({ \
26 __typeof(*(ptr)) __x = (x); \
27 switch (size) { \
28 case 1: \
29 asm volatile("xchgb %b0,%1" \
30 : "=q" (__x) \
31 : "m" (*__xg(ptr)), "0" (__x) \
32 : "memory"); \
33 break; \
34 case 2: \
35 asm volatile("xchgw %w0,%1" \
36 : "=r" (__x) \
37 : "m" (*__xg(ptr)), "0" (__x) \
38 : "memory"); \
39 break; \
40 case 4: \
41 asm volatile("xchgl %0,%1" \
42 : "=r" (__x) \
43 : "m" (*__xg(ptr)), "0" (__x) \
44 : "memory"); \
45 break; \
46 default: \
47 __xchg_wrong_size(); \
48 } \
49 __x; \
50})
51
52#define xchg(ptr, v) \
53 __xchg((v), (ptr), sizeof(*ptr))
54
19/* 55/*
20 * The semantics of XCHGCMP8B are a bit strange, this is why 56 * The semantics of XCHGCMP8B are a bit strange, this is why
21 * there is a loop and the loading of %%eax and %%edx has to 57 * there is a loop and the loading of %%eax and %%edx has to
@@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr,
71 (unsigned int)((value) >> 32)) \ 107 (unsigned int)((value) >> 32)) \
72 : __set_64bit(ptr, ll_low((value)), ll_high((value)))) 108 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
73 109
74/* 110extern void __cmpxchg_wrong_size(void);
75 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
76 * Note 2: xchg has side effect, so that attribute volatile is necessary,
77 * but generally the primitive is invalid, *ptr is output argument. --ANK
78 */
79static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
80 int size)
81{
82 switch (size) {
83 case 1:
84 asm volatile("xchgb %b0,%1"
85 : "=q" (x)
86 : "m" (*__xg(ptr)), "0" (x)
87 : "memory");
88 break;
89 case 2:
90 asm volatile("xchgw %w0,%1"
91 : "=r" (x)
92 : "m" (*__xg(ptr)), "0" (x)
93 : "memory");
94 break;
95 case 4:
96 asm volatile("xchgl %0,%1"
97 : "=r" (x)
98 : "m" (*__xg(ptr)), "0" (x)
99 : "memory");
100 break;
101 }
102 return x;
103}
104 111
105/* 112/*
106 * Atomic compare and exchange. Compare OLD with MEM, if identical, 113 * Atomic compare and exchange. Compare OLD with MEM, if identical,
107 * store NEW in MEM. Return the initial value in MEM. Success is 114 * store NEW in MEM. Return the initial value in MEM. Success is
108 * indicated by comparing RETURN with OLD. 115 * indicated by comparing RETURN with OLD.
109 */ 116 */
117#define __raw_cmpxchg(ptr, old, new, size, lock) \
118({ \
119 __typeof__(*(ptr)) __ret; \
120 __typeof__(*(ptr)) __old = (old); \
121 __typeof__(*(ptr)) __new = (new); \
122 switch (size) { \
123 case 1: \
124 asm volatile(lock "cmpxchgb %b1,%2" \
125 : "=a"(__ret) \
126 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
127 : "memory"); \
128 break; \
129 case 2: \
130 asm volatile(lock "cmpxchgw %w1,%2" \
131 : "=a"(__ret) \
132 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
133 : "memory"); \
134 break; \
135 case 4: \
136 asm volatile(lock "cmpxchgl %1,%2" \
137 : "=a"(__ret) \
138 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
139 : "memory"); \
140 break; \
141 default: \
142 __cmpxchg_wrong_size(); \
143 } \
144 __ret; \
145})
146
147#define __cmpxchg(ptr, old, new, size) \
148 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
149
150#define __sync_cmpxchg(ptr, old, new, size) \
151 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
152
153#define __cmpxchg_local(ptr, old, new, size) \
154 __raw_cmpxchg((ptr), (old), (new), (size), "")
110 155
111#ifdef CONFIG_X86_CMPXCHG 156#ifdef CONFIG_X86_CMPXCHG
112#define __HAVE_ARCH_CMPXCHG 1 157#define __HAVE_ARCH_CMPXCHG 1
113#define cmpxchg(ptr, o, n) \ 158
114 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ 159#define cmpxchg(ptr, old, new) \
115 (unsigned long)(n), \ 160 __cmpxchg((ptr), (old), (new), sizeof(*ptr))
116 sizeof(*(ptr)))) 161
117#define sync_cmpxchg(ptr, o, n) \ 162#define sync_cmpxchg(ptr, old, new) \
118 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ 163 __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
119 (unsigned long)(n), \ 164
120 sizeof(*(ptr)))) 165#define cmpxchg_local(ptr, old, new) \
121#define cmpxchg_local(ptr, o, n) \ 166 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
122 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
123 (unsigned long)(n), \
124 sizeof(*(ptr))))
125#endif 167#endif
126 168
127#ifdef CONFIG_X86_CMPXCHG64 169#ifdef CONFIG_X86_CMPXCHG64
@@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
133 (unsigned long long)(n))) 175 (unsigned long long)(n)))
134#endif 176#endif
135 177
136static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
137 unsigned long new, int size)
138{
139 unsigned long prev;
140 switch (size) {
141 case 1:
142 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
143 : "=a"(prev)
144 : "q"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 2:
148 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 4:
154 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163/*
164 * Always use locked operations when touching memory shared with a
165 * hypervisor, since the system may be SMP even if the guest kernel
166 * isn't.
167 */
168static inline unsigned long __sync_cmpxchg(volatile void *ptr,
169 unsigned long old,
170 unsigned long new, int size)
171{
172 unsigned long prev;
173 switch (size) {
174 case 1:
175 asm volatile("lock; cmpxchgb %b1,%2"
176 : "=a"(prev)
177 : "q"(new), "m"(*__xg(ptr)), "0"(old)
178 : "memory");
179 return prev;
180 case 2:
181 asm volatile("lock; cmpxchgw %w1,%2"
182 : "=a"(prev)
183 : "r"(new), "m"(*__xg(ptr)), "0"(old)
184 : "memory");
185 return prev;
186 case 4:
187 asm volatile("lock; cmpxchgl %1,%2"
188 : "=a"(prev)
189 : "r"(new), "m"(*__xg(ptr)), "0"(old)
190 : "memory");
191 return prev;
192 }
193 return old;
194}
195
196static inline unsigned long __cmpxchg_local(volatile void *ptr,
197 unsigned long old,
198 unsigned long new, int size)
199{
200 unsigned long prev;
201 switch (size) {
202 case 1:
203 asm volatile("cmpxchgb %b1,%2"
204 : "=a"(prev)
205 : "q"(new), "m"(*__xg(ptr)), "0"(old)
206 : "memory");
207 return prev;
208 case 2:
209 asm volatile("cmpxchgw %w1,%2"
210 : "=a"(prev)
211 : "r"(new), "m"(*__xg(ptr)), "0"(old)
212 : "memory");
213 return prev;
214 case 4:
215 asm volatile("cmpxchgl %1,%2"
216 : "=a"(prev)
217 : "r"(new), "m"(*__xg(ptr)), "0"(old)
218 : "memory");
219 return prev;
220 }
221 return old;
222}
223
224static inline unsigned long long __cmpxchg64(volatile void *ptr, 178static inline unsigned long long __cmpxchg64(volatile void *ptr,
225 unsigned long long old, 179 unsigned long long old,
226 unsigned long long new) 180 unsigned long long new)
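
__xchg_wrong_size() and __cmpxchg_wrong_size() are declared extern but deliberately defined nowhere: for a supported operand size the switch arm is selected at compile time and the dead call is eliminated, while an unsupported size leaves the call in place and the build fails at link time. The same pattern in isolation, as a hedged sketch with illustrative names:

    extern void __frob_wrong_size(void);    /* intentionally undefined */

    #define frob(ptr)                                           \
    do {                                                        \
            switch (sizeof(*(ptr))) {                           \
            case 4: /* ... 32-bit implementation ... */         \
                    break;                                      \
            case 8: /* ... 64-bit implementation ... */         \
                    break;                                      \
            default:                                            \
                    __frob_wrong_size(); /* link error if hit */\
            }                                                   \
    } while (0)
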
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e0de8c..485ae415faec 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,9 +3,6 @@
 
 #include <asm/alternative.h> /* Provides LOCK_PREFIX */
 
-#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
-						 (ptr), sizeof(*(ptr))))
-
 #define __xg(x) ((volatile long *)(x))
 
 static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
@@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
 
 #define _set_64bit set_64bit
 
+extern void __xchg_wrong_size(void);
+extern void __cmpxchg_wrong_size(void);
+
 /*
  * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
  * Note 2: xchg has side effect, so that attribute volatile is necessary,
  * but generally the primitive is invalid, *ptr is output argument. --ANK
  */
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
-				   int size)
-{
-	switch (size) {
-	case 1:
-		asm volatile("xchgb %b0,%1"
-			     : "=q" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
-			     : "memory");
-		break;
-	case 2:
-		asm volatile("xchgw %w0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
-			     : "memory");
-		break;
-	case 4:
-		asm volatile("xchgl %k0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
-			     : "memory");
-		break;
-	case 8:
-		asm volatile("xchgq %0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
-			     : "memory");
-		break;
-	}
-	return x;
-}
+#define __xchg(x, ptr, size)						\
+({									\
+	__typeof(*(ptr)) __x = (x);					\
+	switch (size) {							\
+	case 1:								\
+		asm volatile("xchgb %b0,%1"				\
+			     : "=q" (__x)				\
+			     : "m" (*__xg(ptr)), "0" (__x)		\
+			     : "memory");				\
+		break;							\
+	case 2:								\
+		asm volatile("xchgw %w0,%1"				\
+			     : "=r" (__x)				\
+			     : "m" (*__xg(ptr)), "0" (__x)		\
+			     : "memory");				\
+		break;							\
+	case 4:								\
+		asm volatile("xchgl %k0,%1"				\
+			     : "=r" (__x)				\
+			     : "m" (*__xg(ptr)), "0" (__x)		\
+			     : "memory");				\
+		break;							\
+	case 8:								\
+		asm volatile("xchgq %0,%1"				\
+			     : "=r" (__x)				\
+			     : "m" (*__xg(ptr)), "0" (__x)		\
+			     : "memory");				\
+		break;							\
+	default:							\
+		__xchg_wrong_size();					\
+	}								\
+	__x;								\
+})
+
+#define xchg(ptr, v)							\
+	__xchg((v), (ptr), sizeof(*ptr))
+
+#define __HAVE_ARCH_CMPXCHG 1
 
 /*
  * Atomic compare and exchange. Compare OLD with MEM, if identical,
  * store NEW in MEM. Return the initial value in MEM. Success is
  * indicated by comparing RETURN with OLD.
  */
+#define __raw_cmpxchg(ptr, old, new, size, lock)			\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	switch (size) {							\
+	case 1:								\
+		asm volatile(lock "cmpxchgb %b1,%2"			\
+			     : "=a"(__ret)				\
+			     : "q"(__new), "m"(*__xg(ptr)), "0"(__old)	\
+			     : "memory");				\
+		break;							\
+	case 2:								\
+		asm volatile(lock "cmpxchgw %w1,%2"			\
+			     : "=a"(__ret)				\
+			     : "r"(__new), "m"(*__xg(ptr)), "0"(__old)	\
+			     : "memory");				\
+		break;							\
+	case 4:								\
+		asm volatile(lock "cmpxchgl %k1,%2"			\
+			     : "=a"(__ret)				\
+			     : "r"(__new), "m"(*__xg(ptr)), "0"(__old)	\
+			     : "memory");				\
+		break;							\
+	case 8:								\
+		asm volatile(lock "cmpxchgq %1,%2"			\
+			     : "=a"(__ret)				\
+			     : "r"(__new), "m"(*__xg(ptr)), "0"(__old)	\
+			     : "memory");				\
+		break;							\
+	default:							\
+		__cmpxchg_wrong_size();					\
+	}								\
+	__ret;								\
+})
 
-#define __HAVE_ARCH_CMPXCHG 1
+#define __cmpxchg(ptr, old, new, size)					\
+	__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
 
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
-				      unsigned long new, int size)
-{
-	unsigned long prev;
-	switch (size) {
-	case 1:
-		asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 2:
-		asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 4:
-		asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 8:
-		asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	}
-	return old;
-}
+#define __sync_cmpxchg(ptr, old, new, size)				\
+	__raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
 
-/*
- * Always use locked operations when touching memory shared with a
- * hypervisor, since the system may be SMP even if the guest kernel
- * isn't.
- */
-static inline unsigned long __sync_cmpxchg(volatile void *ptr,
-					   unsigned long old,
-					   unsigned long new, int size)
-{
-	unsigned long prev;
-	switch (size) {
-	case 1:
-		asm volatile("lock; cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 2:
-		asm volatile("lock; cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 4:
-		asm volatile("lock; cmpxchgl %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	}
-	return old;
-}
+#define __cmpxchg_local(ptr, old, new, size)				\
+	__raw_cmpxchg((ptr), (old), (new), (size), "")
 
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
-					    unsigned long old,
-					    unsigned long new, int size)
-{
-	unsigned long prev;
-	switch (size) {
-	case 1:
-		asm volatile("cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 2:
-		asm volatile("cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 4:
-		asm volatile("cmpxchgl %k1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	case 8:
-		asm volatile("cmpxchgq %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
-			     : "memory");
-		return prev;
-	}
-	return old;
-}
+#define cmpxchg(ptr, old, new)						\
+	__cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define sync_cmpxchg(ptr, old, new)					\
+	__sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define cmpxchg_local(ptr, old, new)					\
+	__cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
 
-#define cmpxchg(ptr, o, n)						\
-	((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o),	\
-				       (unsigned long)(n), sizeof(*(ptr))))
 #define cmpxchg64(ptr, o, n)						\
 ({									\
	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
	cmpxchg((ptr), (o), (n));					\
 })
-#define cmpxchg_local(ptr, o, n)					\
-	((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o),	\
-					     (unsigned long)(n),	\
-					     sizeof(*(ptr))))
-#define sync_cmpxchg(ptr, o, n)						\
-	((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o),	\
-					    (unsigned long)(n),		\
-					    sizeof(*(ptr))))
+
 #define cmpxchg64_local(ptr, o, n)					\
 ({									\
	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
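
The hunk above folds the four per-size inline functions into a single __raw_cmpxchg() macro and routes any unsupported operand size to the deliberately undeclared __cmpxchg_wrong_size(), so a bad sizeof(*ptr) now fails at link time instead of silently falling through. A minimal sketch of the usual caller pattern (illustrative only, not part of the patch; atomic_add_ulong is a made-up name):

	static inline void atomic_add_ulong(volatile unsigned long *p,
					    unsigned long n)
	{
		unsigned long old, new;

		do {
			old = *p;		/* snapshot the current value */
			new = old + n;		/* compute the desired update */
		} while (cmpxchg(p, old, new) != old);	/* retry if we raced */
	}
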
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
deleted file mode 100644
index d96c1ee3a95c..000000000000
--- a/arch/x86/include/asm/cpu_debug.h
+++ /dev/null
@@ -1,127 +0,0 @@
1#ifndef _ASM_X86_CPU_DEBUG_H
2#define _ASM_X86_CPU_DEBUG_H
3
4/*
5 * CPU x86 architecture debug
6 *
7 * Copyright(C) 2009 Jaswinder Singh Rajput
8 */
9
10/* Register flags */
11enum cpu_debug_bit {
12/* Model Specific Registers (MSRs) */
13 CPU_MC_BIT, /* Machine Check */
14 CPU_MONITOR_BIT, /* Monitor */
15 CPU_TIME_BIT, /* Time */
16 CPU_PMC_BIT, /* Performance Monitor */
17 CPU_PLATFORM_BIT, /* Platform */
18 CPU_APIC_BIT, /* APIC */
19 CPU_POWERON_BIT, /* Power-on */
20 CPU_CONTROL_BIT, /* Control */
21 CPU_FEATURES_BIT, /* Features control */
22 CPU_LBRANCH_BIT, /* Last Branch */
23 CPU_BIOS_BIT, /* BIOS */
24 CPU_FREQ_BIT, /* Frequency */
25 CPU_MTTR_BIT, /* MTRR */
26 CPU_PERF_BIT, /* Performance */
27 CPU_CACHE_BIT, /* Cache */
28 CPU_SYSENTER_BIT, /* Sysenter */
29 CPU_THERM_BIT, /* Thermal */
30 CPU_MISC_BIT, /* Miscellaneous */
31 CPU_DEBUG_BIT, /* Debug */
32 CPU_PAT_BIT, /* PAT */
33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */
36 CPU_VER_BIT, /* Version ID */
37 CPU_CONF_BIT, /* Configuration */
38 CPU_SMM_BIT, /* System mgmt mode */
39 CPU_SVM_BIT, /*Secure Virtual Machine*/
40 CPU_OSVM_BIT, /* OS-Visible Workaround*/
41/* Standard Registers */
42 CPU_TSS_BIT, /* Task Stack Segment */
43 CPU_CR_BIT, /* Control Registers */
44 CPU_DT_BIT, /* Descriptor Table */
45/* End of Registers flags */
46 CPU_REG_ALL_BIT, /* Select all Registers */
47};
48
49#define CPU_REG_ALL (~0) /* Select all Registers */
50
51#define CPU_MC (1 << CPU_MC_BIT)
52#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
53#define CPU_TIME (1 << CPU_TIME_BIT)
54#define CPU_PMC (1 << CPU_PMC_BIT)
55#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
56#define CPU_APIC (1 << CPU_APIC_BIT)
57#define CPU_POWERON (1 << CPU_POWERON_BIT)
58#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
59#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
60#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
61#define CPU_BIOS (1 << CPU_BIOS_BIT)
62#define CPU_FREQ (1 << CPU_FREQ_BIT)
63#define CPU_MTRR (1 << CPU_MTTR_BIT)
64#define CPU_PERF (1 << CPU_PERF_BIT)
65#define CPU_CACHE (1 << CPU_CACHE_BIT)
66#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
67#define CPU_THERM (1 << CPU_THERM_BIT)
68#define CPU_MISC (1 << CPU_MISC_BIT)
69#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
70#define CPU_PAT (1 << CPU_PAT_BIT)
71#define CPU_VMX (1 << CPU_VMX_BIT)
72#define CPU_CALL (1 << CPU_CALL_BIT)
73#define CPU_BASE (1 << CPU_BASE_BIT)
74#define CPU_VER (1 << CPU_VER_BIT)
75#define CPU_CONF (1 << CPU_CONF_BIT)
76#define CPU_SMM (1 << CPU_SMM_BIT)
77#define CPU_SVM (1 << CPU_SVM_BIT)
78#define CPU_OSVM (1 << CPU_OSVM_BIT)
79#define CPU_TSS (1 << CPU_TSS_BIT)
80#define CPU_CR (1 << CPU_CR_BIT)
81#define CPU_DT (1 << CPU_DT_BIT)
82
83/* Register file flags */
84enum cpu_file_bit {
85 CPU_INDEX_BIT, /* index */
86 CPU_VALUE_BIT, /* value */
87};
88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91#define MAX_CPU_FILES 512
92
93struct cpu_private {
94 unsigned cpu;
95 unsigned type;
96 unsigned reg;
97 unsigned file;
98};
99
100struct cpu_debug_base {
101 char *name; /* Register name */
102 unsigned flag; /* Register flag */
103 unsigned write; /* Register write flag */
104};
105
106/*
107 * Currently it looks similar to cpu_debug_base but once we add more files
108 * cpu_file_base will go in different direction
109 */
110struct cpu_file_base {
111 char *name; /* Register file name */
112 unsigned flag; /* Register file flag */
113 unsigned write; /* Register write flag */
114};
115
116struct cpu_cpuX_base {
117 struct dentry *dentry; /* Register dentry */
118 int init; /* Register index file */
119};
120
121struct cpu_debug_range {
122 unsigned min; /* Register range min */
123 unsigned max; /* Register range max */
124 unsigned flag; /* Supported flags */
125};
126
127#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9cfc88b97742..637e1ec963c3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -153,6 +153,7 @@
 #define X86_FEATURE_SSE5	(6*32+11) /* SSE-5 */
 #define X86_FEATURE_SKINIT	(6*32+12) /* SKINIT/STGI instructions */
 #define X86_FEATURE_WDT		(6*32+13) /* Watchdog timer */
+#define X86_FEATURE_NODEID_MSR	(6*32+19) /* NodeId MSR */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -248,6 +249,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_x2apic		boot_cpu_has(X86_FEATURE_X2APIC)
 #define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
+#define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
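
With cpu_has_pclmulqdq defined alongside the other cpu_has_* wrappers, callers can gate on the feature in the usual way. A hedged sketch of the common probe pattern (illustrative; the -ENODEV fallback is an assumption, not taken from this patch):

	if (!cpu_has_pclmulqdq)
		return -ENODEV;	/* no PCLMULQDQ; fall back to a generic path */
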
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e2..8240f76b531e 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -18,6 +18,7 @@
 #define DR_TRAP1	(0x2)		/* db1 */
 #define DR_TRAP2	(0x4)		/* db2 */
 #define DR_TRAP3	(0x8)		/* db3 */
+#define DR_TRAP_BITS	(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
 
 #define DR_STEP		(0x4000)	/* single-step */
 #define DR_SWITCH	(0x8000)	/* task switch */
@@ -49,6 +50,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set local bits for all 4 regs */
@@ -67,4 +70,34 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+DECLARE_PER_CPU(unsigned long, cpu_dr7);
+
+static inline void hw_breakpoint_disable(void)
+{
+	/* Zero the control register for HW Breakpoint */
+	set_debugreg(0UL, 7);
+
+	/* Zero-out the individual HW breakpoint address registers */
+	set_debugreg(0UL, 0);
+	set_debugreg(0UL, 1);
+	set_debugreg(0UL, 2);
+	set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+	return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
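
DR7 gives each of the four breakpoint registers DR_ENABLE_SIZE (two) enable bits, which is why DR_LOCAL_ENABLE_MASK is 0x55; the reg-0 constants added above are meant to be shifted into place per register. A minimal sketch of that composition (illustrative only; dr7_enable_bits is a made-up helper):

	static inline unsigned long dr7_enable_bits(int drnum, int global)
	{
		unsigned long bit = global ? DR_GLOBAL_ENABLE : DR_LOCAL_ENABLE;

		/* reg 0 -> bits 0/1, reg 1 -> bits 2/3, ... reg 3 -> bits 6/7 */
		return bit << (drnum * DR_ENABLE_SIZE);
	}
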
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e8de2f6f5ca5..617bd56b3070 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -288,7 +288,7 @@ static inline void load_LDT(mm_context_t *pc)
 
 static inline unsigned long get_desc_base(const struct desc_struct *desc)
 {
-	return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+	return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
 }
 
 static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 9d6684849fd9..278441f39856 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -12,9 +12,9 @@
 #include <linux/types.h>
 
 /*
- * FIXME: Acessing the desc_struct through its fields is more elegant,
+ * FIXME: Accessing the desc_struct through its fields is more elegant,
  * and should be the one valid thing to do. However, a lot of open code
- * still touches the a and b acessors, and doing this allow us to do it
+ * still touches the a and b accessors, and doing this allow us to do it
  * incrementally. We keep the signature as a struct, rather than an union,
  * so we can get rid of it transparently in the future -- glommer
  */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index cee34e9ca45b..029f230ab637 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,7 +8,7 @@ struct dev_archdata {
 #ifdef CONFIG_X86_64
 struct dma_map_ops *dma_ops;
 #endif
-#ifdef CONFIG_DMAR
+#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU)
 	void *iommu; /* hook for IOMMU specific extension */
 #endif
 };
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 0ee770d23d0e..ac91eed21061 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -14,7 +14,14 @@
 #include <asm/swiotlb.h>
 #include <asm-generic/dma-coherent.h>
 
-extern dma_addr_t bad_dma_address;
+#ifdef CONFIG_ISA
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
+#else
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
+#endif
+
+#define DMA_ERROR_CODE	0
+
 extern int iommu_merge;
 extern struct device x86_dma_fallback_dev;
 extern int panic_on_overflow;
@@ -42,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
	if (ops->mapping_error)
		return ops->mapping_error(dev, dma_addr);
 
-	return (dma_addr == bad_dma_address);
+	return (dma_addr == DMA_ERROR_CODE);
 }
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -60,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
	if (!dev->dma_mask)
		return 0;
 
-	return addr + size <= *dev->dma_mask;
+	return addr + size - 1 <= *dev->dma_mask;
 }
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
@@ -124,10 +131,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
		return memory;
 
-	if (!dev) {
+	if (!dev)
		dev = &x86_dma_fallback_dev;
-		gfp |= GFP_DMA;
-	}
 
	if (!is_device_dma_capable(dev))
		return NULL;
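
The dma_capable() change fixes an off-by-one at the top of the DMA window: what must fit under the mask is the address of the buffer's last byte, not the address one past it. Worked through with a 32-bit mask (illustrative arithmetic, not from the patch):

	/* *dev->dma_mask == 0xffffffff, addr == 0xfffff000, size == 0x1000 */
	/* old test: addr + size == 0x100000000 > mask, so a buffer ending  */
	/* exactly at 4 GiB was rejected (and the sum itself can overflow). */
	/* new test: addr + size - 1 == 0xffffffff <= mask, so it passes.   */
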
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 456a304b8172..1994d3f58443 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -157,19 +157,6 @@ do { \
 
 #define compat_elf_check_arch(x)	elf_check_arch_ia32(x)
 
-static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
-{
-	loadsegment(fs, 0);
-	loadsegment(ds, __USER32_DS);
-	loadsegment(es, __USER32_DS);
-	load_gs_index(0);
-	regs->ip = ip;
-	regs->sp = sp;
-	regs->flags = X86_EFLAGS_IF;
-	regs->cs = __USER32_CS;
-	regs->ss = __USER32_DS;
-}
-
 static inline void elf_common_init(struct thread_struct *t,
				    struct pt_regs *regs, const u16 ds)
 {
@@ -191,20 +178,11 @@ do { \
 #define COMPAT_ELF_PLAT_INIT(regs, load_addr)		\
	elf_common_init(&current->thread, regs, __USER_DS)
 
-#define compat_start_thread(regs, ip, sp)		\
-do {							\
-	start_ia32_thread(regs, ip, sp);		\
-	set_fs(USER_DS);				\
-} while (0)
+void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread start_thread_ia32
 
-#define COMPAT_SET_PERSONALITY(ex)			\
-do {							\
-	if (test_thread_flag(TIF_IA32))			\
-		clear_thread_flag(TIF_ABI_PENDING);	\
-	else						\
-		set_thread_flag(TIF_ABI_PENDING);	\
-	current->personality |= force_personality32;	\
-} while (0)
+void set_personality_ia32(void);
+#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32()
 
 #define COMPAT_ELF_PLATFORM	("i686")
 
@@ -255,7 +233,6 @@ extern int force_personality32;
 #endif /* !CONFIG_X86_32 */
 
 #define CORE_DUMP_USE_REGSET
-#define USE_ELF_CORE_DUMP
 #define ELF_EXEC_PAGESIZE	4096
 
 /* This is the location that an ET_DYN program is loaded if exec'ed. Typical
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index f5693c81a1db..8e8ec663a98f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
				 smp_invalidate_interrupt)
 #endif
 
-BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 
 /*
  * every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa409d8..4ac5b0f33fc1 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
 extern int gart_iommu_aperture_disabled;
 
 extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
+extern int gart_iommu_init(void);
 extern void __init gart_parse_options(char *);
 extern void gart_iommu_hole_init(void);
 
@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
 static inline void early_gart_iommu_check(void)
 {
 }
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
 static inline void gart_parse_options(char *options)
 {
 }
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
index ad3c2ed75481..7cd73552a4e8 100644
--- a/arch/x86/include/asm/geode.h
+++ b/arch/x86/include/asm/geode.h
@@ -12,160 +12,7 @@
 
 #include <asm/processor.h>
 #include <linux/io.h>
-
-/* Generic southbridge functions */
-
-#define GEODE_DEV_PMS 0
-#define GEODE_DEV_ACPI 1
-#define GEODE_DEV_GPIO 2
-#define GEODE_DEV_MFGPT 3
-
-extern int geode_get_dev_base(unsigned int dev);
-
-/* Useful macros */
-#define geode_pms_base()	geode_get_dev_base(GEODE_DEV_PMS)
-#define geode_acpi_base()	geode_get_dev_base(GEODE_DEV_ACPI)
-#define geode_gpio_base()	geode_get_dev_base(GEODE_DEV_GPIO)
-#define geode_mfgpt_base()	geode_get_dev_base(GEODE_DEV_MFGPT)
-
-/* MSRS */
-
-#define MSR_GLIU_P2D_RO0	0x10000029
-
-#define MSR_LX_GLD_MSR_CONFIG	0x48002001
-#define MSR_LX_MSR_PADSEL	0x48002011	/* NOT 0x48000011; the data
-						 * sheet has the wrong value */
-#define MSR_GLCP_SYS_RSTPLL	0x4C000014
-#define MSR_GLCP_DOTPLL		0x4C000015
-
-#define MSR_LBAR_SMB		0x5140000B
-#define MSR_LBAR_GPIO		0x5140000C
-#define MSR_LBAR_MFGPT		0x5140000D
-#define MSR_LBAR_ACPI		0x5140000E
-#define MSR_LBAR_PMS		0x5140000F
-
-#define MSR_DIVIL_SOFT_RESET	0x51400017
-
-#define MSR_PIC_YSEL_LOW	0x51400020
-#define MSR_PIC_YSEL_HIGH	0x51400021
-#define MSR_PIC_ZSEL_LOW	0x51400022
-#define MSR_PIC_ZSEL_HIGH	0x51400023
-#define MSR_PIC_IRQM_LPC	0x51400025
-
-#define MSR_MFGPT_IRQ		0x51400028
-#define MSR_MFGPT_NR		0x51400029
-#define MSR_MFGPT_SETUP		0x5140002B
-
-#define MSR_LX_SPARE_MSR	0x80000011	/* DC-specific */
-
-#define MSR_GX_GLD_MSR_CONFIG	0xC0002001
-#define MSR_GX_MSR_PADSEL	0xC0002011
-
-/* Resource Sizes */
-
-#define LBAR_GPIO_SIZE		0xFF
-#define LBAR_MFGPT_SIZE		0x40
-#define LBAR_ACPI_SIZE		0x40
-#define LBAR_PMS_SIZE		0x80
-
-/* ACPI registers (PMS block) */
-
-/*
- * PM1_EN is only valid when VSA is enabled for 16 bit reads.
- * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
- * with a 32 bit read at offset 0x0
- */
-
-#define PM1_STS			0x00
-#define PM1_EN			0x02
-#define PM1_CNT			0x08
-#define PM2_CNT			0x0C
-#define PM_TMR			0x10
-#define PM_GPE0_STS		0x18
-#define PM_GPE0_EN		0x1C
-
-/* PMC registers (PMS block) */
-
-#define PM_SSD			0x00
-#define PM_SCXA			0x04
-#define PM_SCYA			0x08
-#define PM_OUT_SLPCTL		0x0C
-#define PM_SCLK			0x10
-#define PM_SED			0x1
-#define PM_SCXD			0x18
-#define PM_SCYD			0x1C
-#define PM_IN_SLPCTL		0x20
-#define PM_WKD			0x30
-#define PM_WKXD			0x34
-#define PM_RD			0x38
-#define PM_WKXA			0x3C
-#define PM_FSD			0x40
-#define PM_TSD			0x44
-#define PM_PSD			0x48
-#define PM_NWKD			0x4C
-#define PM_AWKD			0x50
-#define PM_SSC			0x54
-
-/* VSA2 magic values */
-
-#define VSA_VRC_INDEX		0xAC1C
-#define VSA_VRC_DATA		0xAC1E
-#define VSA_VR_UNLOCK		0xFC53	/* unlock virtual register */
-#define VSA_VR_SIGNATURE	0x0003
-#define VSA_VR_MEM_SIZE		0x0200
-#define AMD_VSA_SIG		0x4132	/* signature is ascii 'VSA2' */
-#define GSW_VSA_SIG		0x534d	/* General Software signature */
-/* GPIO */
-
-#define GPIO_OUTPUT_VAL		0x00
-#define GPIO_OUTPUT_ENABLE	0x04
-#define GPIO_OUTPUT_OPEN_DRAIN	0x08
-#define GPIO_OUTPUT_INVERT	0x0C
-#define GPIO_OUTPUT_AUX1	0x10
-#define GPIO_OUTPUT_AUX2	0x14
-#define GPIO_PULL_UP		0x18
-#define GPIO_PULL_DOWN		0x1C
-#define GPIO_INPUT_ENABLE	0x20
-#define GPIO_INPUT_INVERT	0x24
-#define GPIO_INPUT_FILTER	0x28
-#define GPIO_INPUT_EVENT_COUNT	0x2C
-#define GPIO_READ_BACK		0x30
-#define GPIO_INPUT_AUX1		0x34
-#define GPIO_EVENTS_ENABLE	0x38
-#define GPIO_LOCK_ENABLE	0x3C
-#define GPIO_POSITIVE_EDGE_EN	0x40
-#define GPIO_NEGATIVE_EDGE_EN	0x44
-#define GPIO_POSITIVE_EDGE_STS	0x48
-#define GPIO_NEGATIVE_EDGE_STS	0x4C
-
-#define GPIO_MAP_X		0xE0
-#define GPIO_MAP_Y		0xE4
-#define GPIO_MAP_Z		0xE8
-#define GPIO_MAP_W		0xEC
-
-static inline u32 geode_gpio(unsigned int nr)
-{
-	BUG_ON(nr > 28);
-	return 1 << nr;
-}
-
-extern void geode_gpio_set(u32, unsigned int);
-extern void geode_gpio_clear(u32, unsigned int);
-extern int geode_gpio_isset(u32, unsigned int);
-extern void geode_gpio_setup_event(unsigned int, int, int);
-extern void geode_gpio_set_irq(unsigned int, unsigned int);
-
-static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
-{
-	geode_gpio_setup_event(gpio, pair, 0);
-}
-
-static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
-{
-	geode_gpio_setup_event(gpio, pair, 1);
-}
-
-/* Specific geode tests */
+#include <linux/cs5535.h>
 
 static inline int is_geode_gx(void)
 {
@@ -186,68 +33,4 @@ static inline int is_geode(void)
	return (is_geode_gx() || is_geode_lx());
 }
 
-#ifdef CONFIG_MGEODE_LX
-extern int geode_has_vsa2(void);
-#else
-static inline int geode_has_vsa2(void)
-{
-	return 0;
-}
-#endif
-
-/* MFGPTs */
-
-#define MFGPT_MAX_TIMERS	8
-#define MFGPT_TIMER_ANY		(-1)
-
-#define MFGPT_DOMAIN_WORKING	1
-#define MFGPT_DOMAIN_STANDBY	2
-#define MFGPT_DOMAIN_ANY	(MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
-
-#define MFGPT_CMP1		0
-#define MFGPT_CMP2		1
-
-#define MFGPT_EVENT_IRQ		0
-#define MFGPT_EVENT_NMI		1
-#define MFGPT_EVENT_RESET	3
-
-#define MFGPT_REG_CMP1		0
-#define MFGPT_REG_CMP2		2
-#define MFGPT_REG_COUNTER	4
-#define MFGPT_REG_SETUP		6
-
-#define MFGPT_SETUP_CNTEN	(1 << 15)
-#define MFGPT_SETUP_CMP2	(1 << 14)
-#define MFGPT_SETUP_CMP1	(1 << 13)
-#define MFGPT_SETUP_SETUP	(1 << 12)
-#define MFGPT_SETUP_STOPEN	(1 << 11)
-#define MFGPT_SETUP_EXTEN	(1 << 10)
-#define MFGPT_SETUP_REVEN	(1 << 5)
-#define MFGPT_SETUP_CLKSEL	(1 << 4)
-
-static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
-{
-	u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
-	outw(value, base + reg + (timer * 8));
-}
-
-static inline u16 geode_mfgpt_read(int timer, u16 reg)
-{
-	u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
-	return inw(base + reg + (timer * 8));
-}
-
-extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
-extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
-extern int geode_mfgpt_alloc_timer(int timer, int domain);
-
-#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
-#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
-
-#ifdef CONFIG_GEODE_MFGPT_TIMER
-extern int __init mfgpt_timer_setup(void);
-#else
-static inline int mfgpt_timer_setup(void) { return 0; }
-#endif
-
 #endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f01043..0f8576427cfe 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,7 +12,7 @@ typedef struct {
	unsigned int apic_timer_irqs;	/* arch dependent */
	unsigned int irq_spurious_count;
 #endif
-	unsigned int generic_irqs;	/* arch dependent */
+	unsigned int x86_platform_ipis;	/* arch dependent */
	unsigned int apic_perf_irqs;
	unsigned int apic_pending_irqs;
 #ifdef CONFIG_SMP
@@ -20,11 +20,11 @@ typedef struct {
	unsigned int irq_call_count;
	unsigned int irq_tlb_count;
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
	unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
	unsigned int irq_threshold_count;
-# endif
 #endif
 } ____cacheline_aligned irq_cpustat_t;
 
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1c22cb05ad6a..1d5c08a1bdfd 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -65,11 +65,13 @@
 /* hpet memory map physical address */
 extern unsigned long hpet_address;
 extern unsigned long force_hpet_address;
+extern u8 hpet_blockid;
 extern int hpet_force_user;
+extern u8 hpet_msi_disable;
 extern int is_hpet_enabled(void);
 extern int hpet_enable(void);
 extern void hpet_disable(void);
-extern unsigned long hpet_readl(unsigned long a);
+extern unsigned int hpet_readl(unsigned int a);
 extern void force_hpet_resume(void);
 
 extern void hpet_msi_unmask(unsigned int irq);
@@ -78,9 +80,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
 extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
 
 #ifdef CONFIG_PCI_MSI
-extern int arch_setup_hpet_msi(unsigned int irq);
+extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
 #else
-static inline int arch_setup_hpet_msi(unsigned int irq)
+static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 {
	return -EINVAL;
 }
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..0675a7c4c20e
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
1#ifndef _I386_HW_BREAKPOINT_H
2#define _I386_HW_BREAKPOINT_H
3
4#ifdef __KERNEL__
5#define __ARCH_HW_BREAKPOINT_H
6
7/*
8 * The name should probably be something dealt in
9 * a higher level. While dealing with the user
10 * (display/resolving)
11 */
12struct arch_hw_breakpoint {
13 char *name; /* Contains name of the symbol to set bkpt */
14 unsigned long address;
15 u8 len;
16 u8 type;
17};
18
19#include <linux/kdebug.h>
20#include <linux/percpu.h>
21#include <linux/list.h>
22
23/* Available HW breakpoint length encodings */
24#define X86_BREAKPOINT_LEN_1 0x40
25#define X86_BREAKPOINT_LEN_2 0x44
26#define X86_BREAKPOINT_LEN_4 0x4c
27#define X86_BREAKPOINT_LEN_EXECUTE 0x40
28
29#ifdef CONFIG_X86_64
30#define X86_BREAKPOINT_LEN_8 0x48
31#endif
32
33/* Available HW breakpoint type encodings */
34
35/* trigger on instruction execute */
36#define X86_BREAKPOINT_EXECUTE 0x80
37/* trigger on memory write */
38#define X86_BREAKPOINT_WRITE 0x81
39/* trigger on memory read or write */
40#define X86_BREAKPOINT_RW 0x83
41
42/* Total number of available HW breakpoint registers */
43#define HBP_NUM 4
44
45struct perf_event;
46struct pmu;
47
48extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
49extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
50 struct task_struct *tsk);
51extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
52 unsigned long val, void *data);
53
54
55int arch_install_hw_breakpoint(struct perf_event *bp);
56void arch_uninstall_hw_breakpoint(struct perf_event *bp);
57void hw_breakpoint_pmu_read(struct perf_event *bp);
58void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
59
60extern void
61arch_fill_perf_breakpoint(struct perf_event *bp);
62
63unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
64int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
65
66extern int arch_bp_generic_fields(int x86_len, int x86_type,
67 int *gen_len, int *gen_type);
68
69extern struct pmu perf_ops_bp;
70
71#endif /* __KERNEL__ */
72#endif /* _I386_HW_BREAKPOINT_H */
73
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ba180d93b08c..eeac829a0f44 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,7 +27,7 @@
 
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
-extern void generic_interrupt(void);
+extern void x86_platform_ipi(void);
 extern void error_interrupt(void);
 extern void perf_pending_interrupt(void);
 
@@ -79,14 +79,33 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
					int ioapic, int ioapic_pin,
					int trigger, int polarity)
 {
	irq_attr->ioapic	= ioapic;
	irq_attr->ioapic_pin	= ioapic_pin;
	irq_attr->trigger	= trigger;
	irq_attr->polarity	= polarity;
 }
 
-extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
-				      struct io_apic_irq_attr *irq_attr);
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * Most irqs are mapped 1:1 with pins.
+ */
+struct irq_cfg {
+	struct irq_pin_list	*irq_2_pin;
+	cpumask_var_t		domain;
+	cpumask_var_t		old_domain;
+	u8			vector;
+	u8			move_in_progress : 1;
+};
+
+extern struct irq_cfg *irq_cfg(unsigned int);
+extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
+extern void send_cleanup_vector(struct irq_cfg *);
+
+struct irq_desc;
+extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
+				      unsigned int *dest_id);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
 extern void setup_ioapic_dest(void);
 
 extern void enable_IO_APIC(void);
@@ -101,7 +120,7 @@ extern void eisa_set_level_irq(unsigned int irq);
 /* SMP */
 extern void smp_apic_timer_interrupt(struct pt_regs *);
 extern void smp_spurious_interrupt(struct pt_regs *);
-extern void smp_generic_interrupt(struct pt_regs *);
+extern void smp_x86_platform_ipi(struct pt_regs *);
 extern void smp_error_interrupt(struct pt_regs *);
 #ifdef CONFIG_X86_IO_APIC
 extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb758f2..ebfb8a9e11f7 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
 #ifndef _ASM_X86_I387_H
 #define _ASM_X86_I387_H
 
+#ifndef __ASSEMBLY__
+
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/regset.h>
@@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
	}
 }
 
+#endif /* __ASSEMBLY__ */
+
+#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+
 #endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 000000000000..205b063e3e32
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,220 @@
1#ifndef _ASM_X86_INAT_H
2#define _ASM_X86_INAT_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23#include <asm/inat_types.h>
24
25/*
26 * Internal bits. Don't use bitmasks directly, because these bits are
27 * unstable. You should use checking functions.
28 */
29
30#define INAT_OPCODE_TABLE_SIZE 256
31#define INAT_GROUP_TABLE_SIZE 8
32
33/* Legacy last prefixes */
34#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
35#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
36#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
37/* Other Legacy prefixes */
38#define INAT_PFX_LOCK 4 /* 0xF0 */
39#define INAT_PFX_CS 5 /* 0x2E */
40#define INAT_PFX_DS 6 /* 0x3E */
41#define INAT_PFX_ES 7 /* 0x26 */
42#define INAT_PFX_FS 8 /* 0x64 */
43#define INAT_PFX_GS 9 /* 0x65 */
44#define INAT_PFX_SS 10 /* 0x36 */
45#define INAT_PFX_ADDRSZ 11 /* 0x67 */
46/* x86-64 REX prefix */
47#define INAT_PFX_REX 12 /* 0x4X */
48/* AVX VEX prefixes */
49#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
50#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
51
52#define INAT_LSTPFX_MAX 3
53#define INAT_LGCPFX_MAX 11
54
55/* Immediate size */
56#define INAT_IMM_BYTE 1
57#define INAT_IMM_WORD 2
58#define INAT_IMM_DWORD 3
59#define INAT_IMM_QWORD 4
60#define INAT_IMM_PTR 5
61#define INAT_IMM_VWORD32 6
62#define INAT_IMM_VWORD 7
63
64/* Legacy prefix */
65#define INAT_PFX_OFFS 0
66#define INAT_PFX_BITS 4
67#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
68#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
69/* Escape opcodes */
70#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
71#define INAT_ESC_BITS 2
72#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
73#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
74/* Group opcodes (1-16) */
75#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
76#define INAT_GRP_BITS 5
77#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
78#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
79/* Immediates */
80#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
81#define INAT_IMM_BITS 3
82#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
83/* Flags */
84#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
85#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
86#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
87#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
88#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
89#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
90#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
91#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
92/* Attribute making macros for attribute tables */
93#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
94#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
95#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
96#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
97
98/* Attribute search APIs */
99extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
100extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
101 insn_byte_t last_pfx,
102 insn_attr_t esc_attr);
103extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
104 insn_byte_t last_pfx,
105 insn_attr_t esc_attr);
106extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
107 insn_byte_t vex_m,
108 insn_byte_t vex_pp);
109
110/* Attribute checking functions */
111static inline int inat_is_legacy_prefix(insn_attr_t attr)
112{
113 attr &= INAT_PFX_MASK;
114 return attr && attr <= INAT_LGCPFX_MAX;
115}
116
117static inline int inat_is_address_size_prefix(insn_attr_t attr)
118{
119 return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
120}
121
122static inline int inat_is_operand_size_prefix(insn_attr_t attr)
123{
124 return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
125}
126
127static inline int inat_is_rex_prefix(insn_attr_t attr)
128{
129 return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
130}
131
132static inline int inat_last_prefix_id(insn_attr_t attr)
133{
134 if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
135 return 0;
136 else
137 return attr & INAT_PFX_MASK;
138}
139
140static inline int inat_is_vex_prefix(insn_attr_t attr)
141{
142 attr &= INAT_PFX_MASK;
143 return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
144}
145
146static inline int inat_is_vex3_prefix(insn_attr_t attr)
147{
148 return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
149}
150
151static inline int inat_is_escape(insn_attr_t attr)
152{
153 return attr & INAT_ESC_MASK;
154}
155
156static inline int inat_escape_id(insn_attr_t attr)
157{
158 return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
159}
160
161static inline int inat_is_group(insn_attr_t attr)
162{
163 return attr & INAT_GRP_MASK;
164}
165
166static inline int inat_group_id(insn_attr_t attr)
167{
168 return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
169}
170
171static inline int inat_group_common_attribute(insn_attr_t attr)
172{
173 return attr & ~INAT_GRP_MASK;
174}
175
176static inline int inat_has_immediate(insn_attr_t attr)
177{
178 return attr & INAT_IMM_MASK;
179}
180
181static inline int inat_immediate_size(insn_attr_t attr)
182{
183 return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
184}
185
186static inline int inat_has_modrm(insn_attr_t attr)
187{
188 return attr & INAT_MODRM;
189}
190
191static inline int inat_is_force64(insn_attr_t attr)
192{
193 return attr & INAT_FORCE64;
194}
195
196static inline int inat_has_second_immediate(insn_attr_t attr)
197{
198 return attr & INAT_SCNDIMM;
199}
200
201static inline int inat_has_moffset(insn_attr_t attr)
202{
203 return attr & INAT_MOFFSET;
204}
205
206static inline int inat_has_variant(insn_attr_t attr)
207{
208 return attr & INAT_VARIANT;
209}
210
211static inline int inat_accept_vex(insn_attr_t attr)
212{
213 return attr & INAT_VEXOK;
214}
215
216static inline int inat_must_vex(insn_attr_t attr)
217{
218 return attr & INAT_VEXONLY;
219}
220#endif
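
The attribute word packs the prefix, escape, group, immediate, and flag fields at the offsets defined above, and the INAT_MAKE_* macros are the intended way to build one. A small round-trip, derivable from the header alone (illustrative only):

	insn_attr_t attr = INAT_MAKE_GROUP(5) | INAT_MAKE_IMM(INAT_IMM_BYTE);

	/* inat_is_group(attr)       -> non-zero (group bits set)          */
	/* inat_group_id(attr)       -> 5                                  */
	/* inat_has_modrm(attr)      -> non-zero (INAT_MAKE_GROUP adds it) */
	/* inat_immediate_size(attr) -> INAT_IMM_BYTE                      */
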
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 000000000000..cb3c20ce39cf
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_INAT_TYPES_H
2#define _ASM_X86_INAT_TYPES_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23
24/* Instruction attributes */
25typedef unsigned int insn_attr_t;
26typedef unsigned char insn_byte_t;
27typedef signed int insn_value_t;
28
29#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 000000000000..96c2e0ad04ca
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,184 @@
1#ifndef _ASM_X86_INSN_H
2#define _ASM_X86_INSN_H
3/*
4 * x86 instruction analysis
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2009
21 */
22
23/* insn_attr_t is defined in inat.h */
24#include <asm/inat.h>
25
26struct insn_field {
27 union {
28 insn_value_t value;
29 insn_byte_t bytes[4];
30 };
31 /* !0 if we've run insn_get_xxx() for this field */
32 unsigned char got;
33 unsigned char nbytes;
34};
35
36struct insn {
37 struct insn_field prefixes; /*
38 * Prefixes
39 * prefixes.bytes[3]: last prefix
40 */
41 struct insn_field rex_prefix; /* REX prefix */
42 struct insn_field vex_prefix; /* VEX prefix */
43 struct insn_field opcode; /*
44 * opcode.bytes[0]: opcode1
45 * opcode.bytes[1]: opcode2
46 * opcode.bytes[2]: opcode3
47 */
48 struct insn_field modrm;
49 struct insn_field sib;
50 struct insn_field displacement;
51 union {
52 struct insn_field immediate;
53 struct insn_field moffset1; /* for 64bit MOV */
54 struct insn_field immediate1; /* for 64bit imm or off16/32 */
55 };
56 union {
57 struct insn_field moffset2; /* for 64bit MOV */
58 struct insn_field immediate2; /* for 64bit imm or seg16 */
59 };
60
61 insn_attr_t attr;
62 unsigned char opnd_bytes;
63 unsigned char addr_bytes;
64 unsigned char length;
65 unsigned char x86_64;
66
67 const insn_byte_t *kaddr; /* kernel address of insn to analyze */
68 const insn_byte_t *next_byte;
69};
70
71#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
72#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
73#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
74
75#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
76#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
77#define X86_SIB_BASE(sib) ((sib) & 0x07)
78
79#define X86_REX_W(rex) ((rex) & 8)
80#define X86_REX_R(rex) ((rex) & 4)
81#define X86_REX_X(rex) ((rex) & 2)
82#define X86_REX_B(rex) ((rex) & 1)
83
84/* VEX bit flags */
85#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
86#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
87#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
88#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
89#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
90/* VEX bit fields */
91#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
92#define X86_VEX2_M 1 /* VEX2.M always 1 */
93#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
94#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
95#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
96
97/* The last prefix is needed for two-byte and three-byte opcodes */
98static inline insn_byte_t insn_last_prefix(struct insn *insn)
99{
100 return insn->prefixes.bytes[3];
101}
102
103extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
104extern void insn_get_prefixes(struct insn *insn);
105extern void insn_get_opcode(struct insn *insn);
106extern void insn_get_modrm(struct insn *insn);
107extern void insn_get_sib(struct insn *insn);
108extern void insn_get_displacement(struct insn *insn);
109extern void insn_get_immediate(struct insn *insn);
110extern void insn_get_length(struct insn *insn);
111
112/* Attribute will be determined after getting ModRM (for opcode groups) */
113static inline void insn_get_attribute(struct insn *insn)
114{
115 insn_get_modrm(insn);
116}
117
118/* Instruction uses RIP-relative addressing */
119extern int insn_rip_relative(struct insn *insn);
120
121/* Init insn for kernel text */
122static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
123{
124#ifdef CONFIG_X86_64
125 insn_init(insn, kaddr, 1);
126#else /* CONFIG_X86_32 */
127 insn_init(insn, kaddr, 0);
128#endif
129}
130
131static inline int insn_is_avx(struct insn *insn)
132{
133 if (!insn->prefixes.got)
134 insn_get_prefixes(insn);
135 return (insn->vex_prefix.value != 0);
136}
137
138static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
139{
140 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
141 return X86_VEX2_M;
142 else
143 return X86_VEX3_M(insn->vex_prefix.bytes[1]);
144}
145
146static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
147{
148 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
149 return X86_VEX_P(insn->vex_prefix.bytes[1]);
150 else
151 return X86_VEX_P(insn->vex_prefix.bytes[2]);
152}
153
154/* Offset of each field from kaddr */
155static inline int insn_offset_rex_prefix(struct insn *insn)
156{
157 return insn->prefixes.nbytes;
158}
159static inline int insn_offset_vex_prefix(struct insn *insn)
160{
161 return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
162}
163static inline int insn_offset_opcode(struct insn *insn)
164{
165 return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
166}
167static inline int insn_offset_modrm(struct insn *insn)
168{
169 return insn_offset_opcode(insn) + insn->opcode.nbytes;
170}
171static inline int insn_offset_sib(struct insn *insn)
172{
173 return insn_offset_modrm(insn) + insn->modrm.nbytes;
174}
175static inline int insn_offset_displacement(struct insn *insn)
176{
177 return insn_offset_sib(insn) + insn->sib.nbytes;
178}
179static inline int insn_offset_immediate(struct insn *insn)
180{
181 return insn_offset_displacement(insn) + insn->displacement.nbytes;
182}
183
184#endif /* _ASM_X86_INSN_H */
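
Each insn_get_xxx() stage pulls in the stages before it, so calling insn_get_length() alone is enough to populate every field. A minimal sketch of decoding one instruction from kernel text (illustrative; kaddr is assumed to point at a valid instruction):

	struct insn insn;

	kernel_insn_init(&insn, kaddr);	/* picks 64-bit vs 32-bit mode per config */
	insn_get_length(&insn);		/* runs prefixes/opcode/modrm/... first */
	/* insn.length, insn.opcode.bytes[0], insn.modrm.value are now valid */
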
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000000..14cf526091f9
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,150 @@
1/*
2 * Generate .byte code for some instructions not supported by old
3 * binutils.
4 */
5#ifndef X86_ASM_INST_H
6#define X86_ASM_INST_H
7
8#ifdef __ASSEMBLY__
9
10 .macro XMM_NUM opd xmm
11 .ifc \xmm,%xmm0
12 \opd = 0
13 .endif
14 .ifc \xmm,%xmm1
15 \opd = 1
16 .endif
17 .ifc \xmm,%xmm2
18 \opd = 2
19 .endif
20 .ifc \xmm,%xmm3
21 \opd = 3
22 .endif
23 .ifc \xmm,%xmm4
24 \opd = 4
25 .endif
26 .ifc \xmm,%xmm5
27 \opd = 5
28 .endif
29 .ifc \xmm,%xmm6
30 \opd = 6
31 .endif
32 .ifc \xmm,%xmm7
33 \opd = 7
34 .endif
35 .ifc \xmm,%xmm8
36 \opd = 8
37 .endif
38 .ifc \xmm,%xmm9
39 \opd = 9
40 .endif
41 .ifc \xmm,%xmm10
42 \opd = 10
43 .endif
44 .ifc \xmm,%xmm11
45 \opd = 11
46 .endif
47 .ifc \xmm,%xmm12
48 \opd = 12
49 .endif
50 .ifc \xmm,%xmm13
51 \opd = 13
52 .endif
53 .ifc \xmm,%xmm14
54 \opd = 14
55 .endif
56 .ifc \xmm,%xmm15
57 \opd = 15
58 .endif
59 .endm
60
61 .macro PFX_OPD_SIZE
62 .byte 0x66
63 .endm
64
65 .macro PFX_REX opd1 opd2
66 .if (\opd1 | \opd2) & 8
67 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
68 .endif
69 .endm
70
71 .macro MODRM mod opd1 opd2
72 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
73 .endm
74
75 .macro PSHUFB_XMM xmm1 xmm2
76 XMM_NUM pshufb_opd1 \xmm1
77 XMM_NUM pshufb_opd2 \xmm2
78 PFX_OPD_SIZE
79 PFX_REX pshufb_opd1 pshufb_opd2
80 .byte 0x0f, 0x38, 0x00
81 MODRM 0xc0 pshufb_opd1 pshufb_opd2
82 .endm
83
84 .macro PCLMULQDQ imm8 xmm1 xmm2
85 XMM_NUM clmul_opd1 \xmm1
86 XMM_NUM clmul_opd2 \xmm2
87 PFX_OPD_SIZE
88 PFX_REX clmul_opd1 clmul_opd2
89 .byte 0x0f, 0x3a, 0x44
90 MODRM 0xc0 clmul_opd1 clmul_opd2
91 .byte \imm8
92 .endm
93
94 .macro AESKEYGENASSIST rcon xmm1 xmm2
95 XMM_NUM aeskeygen_opd1 \xmm1
96 XMM_NUM aeskeygen_opd2 \xmm2
97 PFX_OPD_SIZE
98 PFX_REX aeskeygen_opd1 aeskeygen_opd2
99 .byte 0x0f, 0x3a, 0xdf
100 MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
101 .byte \rcon
102 .endm
103
104 .macro AESIMC xmm1 xmm2
105 XMM_NUM aesimc_opd1 \xmm1
106 XMM_NUM aesimc_opd2 \xmm2
107 PFX_OPD_SIZE
108 PFX_REX aesimc_opd1 aesimc_opd2
109 .byte 0x0f, 0x38, 0xdb
110 MODRM 0xc0 aesimc_opd1 aesimc_opd2
111 .endm
112
113 .macro AESENC xmm1 xmm2
114 XMM_NUM aesenc_opd1 \xmm1
115 XMM_NUM aesenc_opd2 \xmm2
116 PFX_OPD_SIZE
117 PFX_REX aesenc_opd1 aesenc_opd2
118 .byte 0x0f, 0x38, 0xdc
119 MODRM 0xc0 aesenc_opd1 aesenc_opd2
120 .endm
121
122 .macro AESENCLAST xmm1 xmm2
123 XMM_NUM aesenclast_opd1 \xmm1
124 XMM_NUM aesenclast_opd2 \xmm2
125 PFX_OPD_SIZE
126 PFX_REX aesenclast_opd1 aesenclast_opd2
127 .byte 0x0f, 0x38, 0xdd
128 MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
129 .endm
130
131 .macro AESDEC xmm1 xmm2
132 XMM_NUM aesdec_opd1 \xmm1
133 XMM_NUM aesdec_opd2 \xmm2
134 PFX_OPD_SIZE
135 PFX_REX aesdec_opd1 aesdec_opd2
136 .byte 0x0f, 0x38, 0xde
137 MODRM 0xc0 aesdec_opd1 aesdec_opd2
138 .endm
139
140 .macro AESDECLAST xmm1 xmm2
141 XMM_NUM aesdeclast_opd1 \xmm1
142 XMM_NUM aesdeclast_opd2 \xmm2
143 PFX_OPD_SIZE
144 PFX_REX aesdeclast_opd1 aesdeclast_opd2
145 .byte 0x0f, 0x38, 0xdf
146 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
147 .endm
148#endif
149
150#endif
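
These macros assemble the raw bytes that older binutils cannot emit from mnemonics. Expanding PSHUFB_XMM %xmm5 %xmm0 by hand shows the encoding (a worked example; it reproduces the PSHUFB_XMM5_XMM0 bytes hard-coded in i387.h above): XMM_NUM yields opd1 = 5 and opd2 = 0; PFX_REX emits nothing, since neither operand is register 8 or higher; MODRM computes 0xc0 | (5 & 7) | ((0 & 7) << 3) = 0xc5; so the full sequence is 0x66, 0x0f, 0x38, 0x00, 0xc5.
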
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21bbee6c..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_IOMMU_H
 #define _ASM_X86_IOMMU_H
 
-extern void pci_iommu_shutdown(void);
-extern void no_iommu_init(void);
 extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ddda6cbed6f4..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -34,9 +34,10 @@ static inline int irq_canonicalize(int irq)
 #ifdef CONFIG_HOTPLUG_CPU
 #include <linux/cpumask.h>
 extern void fixup_irqs(void);
+extern void irq_force_complete_move(int);
 #endif
 
-extern void (*generic_interrupt_extension)(void);
+extern void (*x86_platform_ipi_callback)(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
 
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5b21f0ec3df2..4611f085cd43 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,14 +106,14 @@
 /*
  * Generic system vector for platform specific use
  */
-#define GENERIC_INTERRUPT_VECTOR	0xed
+#define X86_PLATFORM_IPI_VECTOR		0xed
 
 /*
  * Performance monitoring pending work vector:
  */
 #define LOCAL_PENDING_VECTOR		0xec
 
-#define UV_BAU_MESSAGE			0xec
+#define UV_BAU_MESSAGE			0xea
 
 /*
  * Self IPI vector for machine checks
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..f46b79f6c16c 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
 #define __KVM_HAVE_MSIX
 #define __KVM_HAVE_MCE
 #define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_PIC_MASTER   0
 #define KVM_IRQCHIP_PIC_SLAVE    1
 #define KVM_IRQCHIP_IOAPIC       2
+#define KVM_NR_IRQCHIPS          3
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
@@ -250,4 +253,35 @@ struct kvm_reinject_control {
 	__u8 pit_reinject;
 	__u8 reserved[31];
 };
+
+/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
+#define KVM_VCPUEVENT_VALID_NMI_PENDING	0x00000001
+#define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 has_error_code;
+		__u8 pad;
+		__u32 error_code;
+	} exception;
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 soft;
+		__u8 pad;
+	} interrupt;
+	struct {
+		__u8 injected;
+		__u8 pending;
+		__u8 masked;
+		__u8 pad;
+	} nmi;
+	__u32 sipi_vector;
+	__u32 flags;
+	__u32 reserved[10];
+};
+
 #endif /* _ASM_X86_KVM_H */
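
The new kvm_vcpu_events structure lets userspace save and restore pending exception, interrupt, and NMI state through the KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS vcpu ioctls. A hedged userspace sketch, assuming vcpu_fd came from KVM_CREATE_VCPU (the helper name is hypothetical):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /* sketch: mark an NMI pending without disturbing the other event state */
    static int inject_pending_nmi(int vcpu_fd)
    {
            struct kvm_vcpu_events ev;

            if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev) < 0)
                    return -1;
            ev.nmi.pending = 1;
            /* only fields whose VALID flag is set are written back */
            ev.flags = KVM_VCPUEVENT_VALID_NMI_PENDING;
            return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev);
    }
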
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7c18e1230f54 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -129,7 +129,7 @@ struct decode_cache {
 	u8 seg_override;
 	unsigned int d;
 	unsigned long regs[NR_VCPU_REGS];
-	unsigned long eip;
+	unsigned long eip, eip_orig;
 	/* modrm */
 	u8 modrm;
 	u8 modrm_mod;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d83892226f73..4f865e8b8540 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -354,7 +354,6 @@ struct kvm_vcpu_arch {
 	unsigned int time_offset;
 	struct page *time_page;
 
-	bool singlestep; /* guest is single stepped by KVM */
 	bool nmi_pending;
 	bool nmi_injected;
 
@@ -371,6 +370,10 @@ struct kvm_vcpu_arch {
 	u64 mcg_status;
 	u64 mcg_ctl;
 	u64 *mce_banks;
+
+	/* used for guest single stepping over the given code position */
+	u16 singlestep_cs;
+	unsigned long singlestep_rip;
 };
 
 struct kvm_mem_alias {
@@ -397,7 +400,6 @@ struct kvm_arch{
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
-	struct hlist_head irq_ack_notifier_list;
 	int vapics_in_nmi_mode;
 
 	unsigned int tss_addr;
@@ -410,8 +412,10 @@ struct kvm_arch{
 	gpa_t ept_identity_map_addr;
 
 	unsigned long irq_sources_bitmap;
-	unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
 	u64 vm_init_tsc;
+	s64 kvmclock_offset;
+
+	struct kvm_xen_hvm_config xen_hvm_config;
 };
 
 struct kvm_vm_stat {
@@ -461,7 +465,7 @@ struct descriptor_table {
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
-	void (*hardware_enable)(void *dummy);      /* __init */
+	int (*hardware_enable)(void *dummy);
 	void (*hardware_disable)(void *dummy);
 	void (*check_processor_compatibility)(void *rtn);
 	int (*hardware_setup)(void);               /* __init */
@@ -477,8 +481,8 @@ struct kvm_x86_ops {
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
-	int (*set_guest_debug)(struct kvm_vcpu *vcpu,
-			       struct kvm_guest_debug *dbg);
+	void (*set_guest_debug)(struct kvm_vcpu *vcpu,
+				struct kvm_guest_debug *dbg);
 	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -506,8 +510,8 @@ struct kvm_x86_ops {
 
 	void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
-	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
-	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+	void (*run)(struct kvm_vcpu *vcpu);
+	int (*handle_exit)(struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
 	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,6 +523,8 @@ struct kvm_x86_ops {
 				bool has_error_code, u32 error_code);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -568,7 +574,7 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
 #define EMULTYPE_SKIP		    (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
+int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2, u16 error_code, int emulation_type);
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +591,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
 		    int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 		    int size, unsigned long count, int down,
 		    gva_t address, int rep, unsigned port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -616,6 +622,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -802,4 +811,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 
+void kvm_define_shared_msr(unsigned index, u32 msr);
+void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f1363b72364f..6c3fdd631ed3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -108,8 +108,11 @@ struct mce_log {
 #define K8_MCE_THRESHOLD_BANK_5    (MCE_THRESHOLD_BASE + 5 * 9)
 #define K8_MCE_THRESHOLD_DRAM_ECC  (MCE_THRESHOLD_BANK_4 + 0)
 
+
 #ifdef __KERNEL__
 
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <asm/atomic.h>
@@ -118,9 +121,11 @@ extern int mce_disabled;
 extern int mce_p5_enabled;
 
 #ifdef CONFIG_X86_MCE
-void mcheck_init(struct cpuinfo_x86 *c);
+int mcheck_init(void);
+void mcheck_cpu_init(struct cpuinfo_x86 *c);
 #else
-static inline void mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mcheck_init(void) { return 0; }
+static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
@@ -214,5 +219,11 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
 
 void mce_log_therm_throt_event(__u64 status);
 
+#ifdef CONFIG_X86_THERMAL_VECTOR
+extern void mcheck_intel_therm_init(void);
+#else
+static inline void mcheck_intel_therm_init(void) { }
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
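
x86_mce_decoder_chain is an atomic notifier chain, so a decoder (an EDAC driver, say) attaches with the usual notifier API. A sketch with an illustrative handler body (all names hypothetical):

    #include <linux/notifier.h>
    #include <asm/mce.h>

    static int sketch_decode_mce(struct notifier_block *nb,
                                 unsigned long val, void *data)
    {
            struct mce *m = data;

            pr_emerg("MCE: bank %d, status 0x%016llx\n", m->bank, m->status);
            return NOTIFY_STOP;     /* claim the event, stop further decoding */
    }

    static struct notifier_block sketch_mce_nb = {
            .notifier_call = sketch_decode_mce,
    };
    /* registration:
     * atomic_notifier_chain_register(&x86_mce_decoder_chain, &sketch_mce_nb);
     */
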
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ede6998bd92c..91df7c51806c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
 /*
  * generic node memory support, the following assumptions apply:
  *
- * 1) memory comes in 64Mb contigious chunks which are either present or not
+ * 1) memory comes in 64Mb contiguous chunks which are either present or not
  * 2) we will not have more than 64Gb in total
  *
  * for now assume that 64Gb is max amount of RAM for whole system
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 644cf1a50bfd..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -158,14 +158,16 @@ typedef struct physid_mask physid_mask_t;
 #define physids_shift_left(d, s, n)				\
 	bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
 
-#define physids_coerce(map)			((map).mask[0])
+static inline unsigned long physids_coerce(physid_mask_t *map)
+{
+	return map->mask[0];
+}
 
-#define physids_promote(physids)					\
-	({								\
-		physid_mask_t __physid_mask = PHYSID_MASK_NONE;		\
-		__physid_mask.mask[0] = physids;			\
-		__physid_mask;						\
-	})
+static inline void physids_promote(unsigned long physids, physid_mask_t *map)
+{
+	physids_clear(*map);
+	map->mask[0] = physids;
+}
 
 /* Note: will create very large stack frames if physid_mask_t is big */
 #define physid_mask_of_physid(physid)					\
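
Turning physids_coerce()/physids_promote() from macros into inline functions gains type checking and drops the statement-expression temporary; callers now pass the destination mask explicitly. A sketch of the changed call pattern (the 0xFF value is illustrative):

    static void promote_example(void)
    {
            physid_mask_t map;

            /* old style: map = physids_promote(0xFFUL); */
            physids_promote(0xFFUL, &map);  /* new: fills the caller's mask */
    }
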
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4ffe09b2ad75..1cd58cdbc03f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -12,6 +12,7 @@
 #define MSR_FS_BASE		0xc0000100 /* 64bit FS base */
 #define MSR_GS_BASE		0xc0000101 /* 64bit GS base */
 #define MSR_KERNEL_GS_BASE	0xc0000102 /* SwapGS GS shadow */
+#define MSR_TSC_AUX		0xc0000103 /* Auxiliary TSC */
 
 /* EFER bits: */
 #define _EFER_SCE		0  /* SYSCALL/SYSRET */
@@ -123,6 +124,7 @@
 #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
 #define FAM10H_MMIO_CONF_BASE_MASK	0xfffffff
 #define FAM10H_MMIO_CONF_BASE_SHIFT	20
+#define MSR_FAM10H_NODE_ID		0xc001100c
 
 /* K8 MSRs */
 #define MSR_K8_TOP_MEM1			0xc001001a
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 7e2b6ba962ff..c5bc4c2d33f5 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -27,6 +27,18 @@ struct msr {
 	};
 };
 
+struct msr_info {
+	u32 msr_no;
+	struct msr reg;
+	struct msr *msrs;
+	int err;
+};
+
+struct msr_regs_info {
+	u32 *regs;
+	int err;
+};
+
 static inline unsigned long long native_read_tscp(unsigned int *aux)
 {
 	unsigned long low, high;
@@ -240,15 +252,18 @@ do { \
 #define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val),		\
 					     (u32)((val) >> 32))
 
-#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2))
+#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
+
+#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
 
-#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+struct msr *msrs_alloc(void);
+void msrs_free(struct msr *msrs);
 
 #ifdef CONFIG_SMP
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
-void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
-void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
 int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
 int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
@@ -264,12 +279,12 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 	wrmsr(msr_no, l, h);
 	return 0;
 }
-static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
 				struct msr *msrs)
 {
 	rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
 }
-static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
 				struct msr *msrs)
 {
 	wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
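
msrs_alloc()/msrs_free() pair with the cross-CPU rdmsr_on_cpus()/wrmsr_on_cpus() helpers. A sketch reading one MSR on every online CPU, under the assumption that the buffer these helpers fill is per-CPU backed (so per_cpu_ptr() indexes it; function name hypothetical):

    #include <asm/msr.h>

    static void read_tsc_everywhere(void)
    {
            struct msr *vals = msrs_alloc();        /* one struct msr per CPU */

            if (!vals)
                    return;
            rdmsr_on_cpus(cpu_online_mask, MSR_IA32_TSC, vals);
            pr_info("cpu0 tsc: %#llx\n", per_cpu_ptr(vals, 0)->q);
            msrs_free(vals);
    }
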
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 834a30295fab..3a57385d9fa7 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -120,7 +120,7 @@ extern int olpc_ec_mask_unset(uint8_t bits);
 
 /* GPIO assignments */
 
-#define OLPC_GPIO_MIC_AC	geode_gpio(1)
+#define OLPC_GPIO_MIC_AC	1
 #define OLPC_GPIO_DCON_IRQ	geode_gpio(7)
 #define OLPC_GPIO_THRM_ALRM	geode_gpio(10)
 #define OLPC_GPIO_SMB_CLK	geode_gpio(14)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8aebcc41041d..dd59a85a918f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -731,34 +731,34 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
 
-static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
+static inline int arch_spin_is_locked(struct arch_spinlock *lock)
 {
 	return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
 }
 
-static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
+static inline int arch_spin_is_contended(struct arch_spinlock *lock)
 {
 	return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
 }
-#define __raw_spin_is_contended	__raw_spin_is_contended
+#define arch_spin_is_contended	arch_spin_is_contended
 
-static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
+static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
 {
 	PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
 }
 
-static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
+static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
 						  unsigned long flags)
 {
 	PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
 }
 
-static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
+static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
 {
 	return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
 }
 
-static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
 {
 	PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
 }
@@ -840,42 +840,22 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
 
 static inline unsigned long __raw_local_save_flags(void)
 {
-	unsigned long f;
-
-	asm volatile(paravirt_alt(PARAVIRT_CALL)
-		     : "=a"(f)
-		     : paravirt_type(pv_irq_ops.save_fl),
-		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc");
-	return f;
+	return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
 }
 
 static inline void raw_local_irq_restore(unsigned long f)
 {
-	asm volatile(paravirt_alt(PARAVIRT_CALL)
-		     : "=a"(f)
-		     : PV_FLAGS_ARG(f),
-		       paravirt_type(pv_irq_ops.restore_fl),
-		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc");
+	PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
 }
 
 static inline void raw_local_irq_disable(void)
 {
-	asm volatile(paravirt_alt(PARAVIRT_CALL)
-		     :
-		     : paravirt_type(pv_irq_ops.irq_disable),
-		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc");
+	PVOP_VCALLEE0(pv_irq_ops.irq_disable);
 }
 
 static inline void raw_local_irq_enable(void)
 {
-	asm volatile(paravirt_alt(PARAVIRT_CALL)
-		     :
-		     : paravirt_type(pv_irq_ops.irq_enable),
-		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc");
+	PVOP_VCALLEE0(pv_irq_ops.irq_enable);
 }
 
 static inline unsigned long __raw_local_irq_save(void)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index dd0f5b32489d..b1e70d51e40c 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -318,14 +318,14 @@ struct pv_mmu_ops {
 			   phys_addr_t phys, pgprot_t flags);
 };
 
-struct raw_spinlock;
+struct arch_spinlock;
 struct pv_lock_ops {
-	int (*spin_is_locked)(struct raw_spinlock *lock);
-	int (*spin_is_contended)(struct raw_spinlock *lock);
-	void (*spin_lock)(struct raw_spinlock *lock);
-	void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
-	int (*spin_trylock)(struct raw_spinlock *lock);
-	void (*spin_unlock)(struct raw_spinlock *lock);
+	int (*spin_is_locked)(struct arch_spinlock *lock);
+	int (*spin_is_contended)(struct arch_spinlock *lock);
+	void (*spin_lock)(struct arch_spinlock *lock);
+	void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
+	int (*spin_trylock)(struct arch_spinlock *lock);
+	void (*spin_unlock)(struct arch_spinlock *lock);
 };
 
 /* This contains all the paravirt structures: we get a convenient
@@ -494,10 +494,11 @@ int paravirt_disable_iospace(void);
 #define EXTRA_CLOBBERS
 #define VEXTRA_CLOBBERS
 #else  /* CONFIG_X86_64 */
+/* [re]ax isn't an arg, but the return val */
 #define PVOP_VCALL_ARGS					\
 	unsigned long __edi = __edi, __esi = __esi,	\
-		__edx = __edx, __ecx = __ecx
-#define PVOP_CALL_ARGS		PVOP_VCALL_ARGS, __eax
+		__edx = __edx, __ecx = __ecx, __eax = __eax
+#define PVOP_CALL_ARGS		PVOP_VCALL_ARGS
 
 #define PVOP_CALL_ARG1(x)		"D" ((unsigned long)(x))
 #define PVOP_CALL_ARG2(x)		"S" ((unsigned long)(x))
@@ -509,6 +510,7 @@ int paravirt_disable_iospace(void);
509 "=c" (__ecx) 510 "=c" (__ecx)
510#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) 511#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
511 512
513/* void functions are still allowed [re]ax for scratch */
512#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) 514#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
513#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS 515#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
514 516
@@ -583,8 +585,8 @@ int paravirt_disable_iospace(void);
 		      VEXTRA_CLOBBERS,				\
 		      pre, post, ##__VA_ARGS__)
 
-#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...)		\
-	____PVOP_CALL(rettype, op.func, CLBR_RET_REG,		\
+#define __PVOP_VCALLEESAVE(op, pre, post, ...)			\
+	____PVOP_VCALL(op.func, CLBR_RET_REG,			\
 		      PVOP_VCALLEE_CLOBBERS, ,			\
 		      pre, post, ##__VA_ARGS__)
 
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988eee3a..b4bf9a942ed0 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -118,11 +118,27 @@ extern int __init pcibios_init(void);
 
 /* pci-mmconfig.c */
 
+/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
+#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
+
+struct pci_mmcfg_region {
+	struct list_head list;
+	struct resource res;
+	u64 address;
+	char __iomem *virt;
+	u16 segment;
+	u8 start_bus;
+	u8 end_bus;
+	char name[PCI_MMCFG_RESOURCE_NAME_LEN];
+};
+
 extern int __init pci_mmcfg_arch_init(void);
 extern void __init pci_mmcfg_arch_free(void);
+extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
+
+extern struct list_head pci_mmcfg_list;
 
-extern struct acpi_mcfg_allocation *pci_mmcfg_config;
-extern int pci_mmcfg_config_num;
+#define PCI_MMCFG_BUS_OFFSET(bus)	((bus) << 20)
 
 /*
  * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
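
PCI_MMCFG_BUS_OFFSET encodes the ECAM layout: each bus owns 1 MB of the region (bits 27:20) and each device/function gets 4 KB below that. A sketch of how a config-space address inside one pci_mmcfg_region would be formed (helper name hypothetical):

    static void __iomem *mmcfg_addr_sketch(struct pci_mmcfg_region *cfg,
                                           unsigned int bus, unsigned int devfn,
                                           int reg)
    {
            /* bus: bits 27:20, devfn: bits 19:12, reg: bits 11:0 */
            return cfg->virt + PCI_MMCFG_BUS_OFFSET(bus) + (devfn << 12) + reg;
    }
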
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index b65a36defeb7..0c44196b78ac 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -74,31 +74,31 @@ extern void __bad_percpu_size(void);
 
 #define percpu_to_op(op, var, val)			\
 do {							\
-	typedef typeof(var) T__;			\
+	typedef typeof(var) pto_T__;			\
 	if (0) {					\
-		T__ tmp__;				\
-		tmp__ = (val);				\
+		pto_T__ pto_tmp__;			\
+		pto_tmp__ = (val);			\
 	}						\
 	switch (sizeof(var)) {				\
 	case 1:						\
 		asm(op "b %1,"__percpu_arg(0)		\
 		    : "+m" (var)			\
-		    : "qi" ((T__)(val)));		\
+		    : "qi" ((pto_T__)(val)));		\
 		break;					\
 	case 2:						\
 		asm(op "w %1,"__percpu_arg(0)		\
 		    : "+m" (var)			\
-		    : "ri" ((T__)(val)));		\
+		    : "ri" ((pto_T__)(val)));		\
 		break;					\
 	case 4:						\
 		asm(op "l %1,"__percpu_arg(0)		\
 		    : "+m" (var)			\
-		    : "ri" ((T__)(val)));		\
+		    : "ri" ((pto_T__)(val)));		\
 		break;					\
 	case 8:						\
 		asm(op "q %1,"__percpu_arg(0)		\
 		    : "+m" (var)			\
-		    : "re" ((T__)(val)));		\
+		    : "re" ((pto_T__)(val)));		\
 		break;					\
 	default: __bad_percpu_size();			\
 	}						\
@@ -106,31 +106,31 @@ do { \
 
 #define percpu_from_op(op, var, constraint)		\
 ({							\
-	typeof(var) ret__;				\
+	typeof(var) pfo_ret__;				\
 	switch (sizeof(var)) {				\
 	case 1:						\
 		asm(op "b "__percpu_arg(1)",%0"		\
-		    : "=q" (ret__)			\
+		    : "=q" (pfo_ret__)			\
 		    : constraint);			\
 		break;					\
 	case 2:						\
 		asm(op "w "__percpu_arg(1)",%0"		\
-		    : "=r" (ret__)			\
+		    : "=r" (pfo_ret__)			\
 		    : constraint);			\
 		break;					\
 	case 4:						\
 		asm(op "l "__percpu_arg(1)",%0"		\
-		    : "=r" (ret__)			\
+		    : "=r" (pfo_ret__)			\
 		    : constraint);			\
 		break;					\
 	case 8:						\
 		asm(op "q "__percpu_arg(1)",%0"		\
-		    : "=r" (ret__)			\
+		    : "=r" (pfo_ret__)			\
 		    : constraint);			\
 		break;					\
 	default: __bad_percpu_size();			\
 	}						\
-	ret__;						\
+	pfo_ret__;					\
 })
 
 /*
@@ -153,6 +153,84 @@ do { \
 #define percpu_or(var, val)	percpu_to_op("or", per_cpu__##var, val)
 #define percpu_xor(var, val)	percpu_to_op("xor", per_cpu__##var, val)
 
+#define __this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+#define __this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+#define __this_cpu_read_4(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+
+#define __this_cpu_write_1(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define __this_cpu_write_2(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define __this_cpu_write_4(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define __this_cpu_add_1(pcp, val)	percpu_to_op("add", (pcp), val)
+#define __this_cpu_add_2(pcp, val)	percpu_to_op("add", (pcp), val)
+#define __this_cpu_add_4(pcp, val)	percpu_to_op("add", (pcp), val)
+#define __this_cpu_and_1(pcp, val)	percpu_to_op("and", (pcp), val)
+#define __this_cpu_and_2(pcp, val)	percpu_to_op("and", (pcp), val)
+#define __this_cpu_and_4(pcp, val)	percpu_to_op("and", (pcp), val)
+#define __this_cpu_or_1(pcp, val)	percpu_to_op("or", (pcp), val)
+#define __this_cpu_or_2(pcp, val)	percpu_to_op("or", (pcp), val)
+#define __this_cpu_or_4(pcp, val)	percpu_to_op("or", (pcp), val)
+#define __this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define __this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define __this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+
+#define this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_read_4(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_write_1(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define this_cpu_write_2(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define this_cpu_write_4(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define this_cpu_add_1(pcp, val)	percpu_to_op("add", (pcp), val)
+#define this_cpu_add_2(pcp, val)	percpu_to_op("add", (pcp), val)
+#define this_cpu_add_4(pcp, val)	percpu_to_op("add", (pcp), val)
+#define this_cpu_and_1(pcp, val)	percpu_to_op("and", (pcp), val)
+#define this_cpu_and_2(pcp, val)	percpu_to_op("and", (pcp), val)
+#define this_cpu_and_4(pcp, val)	percpu_to_op("and", (pcp), val)
+#define this_cpu_or_1(pcp, val)		percpu_to_op("or", (pcp), val)
+#define this_cpu_or_2(pcp, val)		percpu_to_op("or", (pcp), val)
+#define this_cpu_or_4(pcp, val)		percpu_to_op("or", (pcp), val)
+#define this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+
+#define irqsafe_cpu_add_1(pcp, val)	percpu_to_op("add", (pcp), val)
+#define irqsafe_cpu_add_2(pcp, val)	percpu_to_op("add", (pcp), val)
+#define irqsafe_cpu_add_4(pcp, val)	percpu_to_op("add", (pcp), val)
+#define irqsafe_cpu_and_1(pcp, val)	percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_and_2(pcp, val)	percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_and_4(pcp, val)	percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_or_1(pcp, val)	percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_or_2(pcp, val)	percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_or_4(pcp, val)	percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+
+/*
+ * Per cpu atomic 64 bit operations are only available under 64 bit.
+ * 32 bit must fall back to generic operations.
+ */
+#ifdef CONFIG_X86_64
+#define __this_cpu_read_8(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+#define __this_cpu_write_8(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define __this_cpu_add_8(pcp, val)	percpu_to_op("add", (pcp), val)
+#define __this_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
+#define __this_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
+#define __this_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+
+#define this_cpu_read_8(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_write_8(pcp, val)	percpu_to_op("mov", (pcp), val)
+#define this_cpu_add_8(pcp, val)	percpu_to_op("add", (pcp), val)
+#define this_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
+#define this_cpu_or_8(pcp, val)		percpu_to_op("or", (pcp), val)
+#define this_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+
+#define irqsafe_cpu_add_8(pcp, val)	percpu_to_op("add", (pcp), val)
+#define irqsafe_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+
+#endif
+
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define x86_test_and_clear_bit_percpu(bit, var)			\
 ({									\
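
The sized __this_cpu/this_cpu/irqsafe operations all funnel into percpu_to_op()/percpu_from_op(), so each compiles to a single gs-/fs-prefixed instruction rather than an address calculation plus a separate read-modify-write. A sketch of a counter using them (names illustrative):

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned int, sketch_hits);

    static void sketch_count_hit(void)
    {
            /* on x86 this becomes one "addl $1, %gs:..." style instruction,
             * so an interrupt on this CPU cannot tear the update */
            this_cpu_add(sketch_hits, 1);
    }
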
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ad7ce3fd5065..1380367dabd9 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -19,6 +19,7 @@
 #define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
 
 #define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_ANY			  (1 << 21)
 #define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
 #define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
 #define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
@@ -28,9 +29,20 @@
  */
 #define ARCH_PERFMON_EVENT_MASK				    0xffff
 
+/*
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
+#define ARCH_PERFMON_EVENT_FILTER_MASK			0xff840000
+
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX			 0
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
 		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c3429e8b2424..fc801bab1b3b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@ struct mm_struct;
 #include <linux/math64.h>
 #include <linux/init.h>
 
+#define HBP_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -180,7 +181,7 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
 			     unsigned int *ecx, unsigned int *edx)
 {
 	/* ecx is often an input as well as an output. */
-	asm("cpuid"
+	asm volatile("cpuid"
 	    : "=a" (*eax),
 	      "=b" (*ebx),
 	      "=c" (*ecx),
@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
 
+struct perf_event;
+
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,10 @@ struct thread_struct {
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
+	/* Save middle states of ptrace breakpoints */
+	struct perf_event	*ptrace_bps[HBP_NUM];
+	/* Debug status used for traps, single steps, etc... */
+	unsigned long		debugreg6;
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;
@@ -1000,7 +1000,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define thread_saved_pc(t)	(*(unsigned long *)((t)->thread.sp - 8))
 
 #define task_pt_regs(tsk)	((struct pt_regs *)(tsk)->thread.sp0 - 1)
-#define KSTK_ESP(tsk)		-1 /* sorry. doesn't work for syscall. */
+extern unsigned long KSTK_ESP(struct task_struct *task);
 #endif /* CONFIG_X86_64 */
 
 extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
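
The asm to asm volatile change on cpuid matters because cpuid is also a serializing instruction; without volatile, gcc may reorder or even delete an asm with outputs it believes are unused. A sketch of the pattern that relies on this (function name hypothetical):

    static inline unsigned long long serialized_rdtsc_sketch(void)
    {
            unsigned int eax = 0, ebx, ecx, edx;

            /* cpuid fences earlier instructions before the timestamp read */
            native_cpuid(&eax, &ebx, &ecx, &edx);
            return native_read_tsc();
    }
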
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908349aa..9d369f680321 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
 
 #ifdef __KERNEL__
 #include <asm/segment.h>
+#include <asm/page_types.h>
 #endif
 
 #ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
 	return regs->sp;
 }
 
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs:	pt_regs from which register value is gotten.
+ * @offset:	offset number of the register.
+ *
+ * regs_get_register returns the value of a register. The @offset is the
+ * offset of the register in struct pt_regs address which specified by @regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+					      unsigned int offset)
+{
+	if (unlikely(offset > MAX_REG_OFFSET))
+		return 0;
+	return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs:	pt_regs which contains kernel stack pointer.
+ * @addr:	address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+					   unsigned long addr)
+{
+	return ((addr & ~(THREAD_SIZE - 1)) ==
+		(kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs:	pt_regs which contains kernel stack pointer.
+ * @n:		stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+						      unsigned int n)
+{
+	unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+	addr += n;
+	if (regs_within_kernel_stack(regs, (unsigned long)addr))
+		return *addr;
+	else
+		return 0;
+}
+
+/* Get Nth argument at function call */
+extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
+					   unsigned int n);
+
 /*
  * These are defined as per linux/ptrace.h, which see.
  */
@@ -230,6 +292,8 @@ extern void user_enable_block_step(struct task_struct *);
 #define arch_has_block_step()	(boot_cpu_data.x86 >= 6)
 #endif
 
+#define ARCH_HAS_USER_SINGLE_STEP_INFO
+
 struct user_desc;
 extern int do_get_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info);
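
The new accessors are aimed at kprobes-based tracing, which must fetch registers and stack slots safely instead of dereferencing arbitrary addresses. A sketch of a pre-handler using them (probe target and slot index are illustrative):

    #include <linux/kprobes.h>
    #include <asm/ptrace.h>

    static int sketch_pre_handler(struct kprobe *p, struct pt_regs *regs)
    {
            unsigned long ax = regs_get_register(regs,
                                    regs_query_register_offset("ax"));
            unsigned long slot2 = regs_get_kernel_stack_nth(regs, 2);

            pr_info("ax=%#lx stack[2]=%#lx\n", ax, slot2);
            return 0;
    }
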
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 72e5a4491661..04459d25e66e 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -124,7 +124,7 @@ struct sigcontext {
  * fpstate is really (struct _fpstate *) or (struct _xstate *)
  * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
  * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the defintion of
+ * of extended memory layout. See comments at the definition of
  * (struct _fpx_sw_bytes)
  */
 	void __user *fpstate;		/* zero when no FPU/extended context */
@@ -219,7 +219,7 @@ struct sigcontext {
  * fpstate is really (struct _fpstate *) or (struct _xstate *)
  * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
  * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the defintion of
+ * of extended memory layout. See comments at the definition of
  * (struct _fpx_sw_bytes)
  */
 	void __user *fpstate;		/* zero when no FPU/extended context */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 4e77853321db..3089f70c0c52 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -58,7 +58,7 @@
 #if (NR_CPUS < 256)
 #define TICKET_SHIFT 8
 
-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
 {
 	short inc = 0x0100;
 
@@ -77,7 +77,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
77 : "memory", "cc"); 77 : "memory", "cc");
78} 78}
79 79
80static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) 80static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
81{ 81{
82 int tmp, new; 82 int tmp, new;
83 83
@@ -96,7 +96,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 	return tmp;
 }
 
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
 {
 	asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
 		     : "+m" (lock->slock)
@@ -106,7 +106,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 #else
 #define TICKET_SHIFT 16
 
-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
 {
 	int inc = 0x00010000;
 	int tmp;
@@ -127,7 +127,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
127 : "memory", "cc"); 127 : "memory", "cc");
128} 128}
129 129
130static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) 130static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
131{ 131{
132 int tmp; 132 int tmp;
133 int new; 133 int new;
@@ -149,7 +149,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 	return tmp;
 }
 
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
 {
 	asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
 		     : "+m" (lock->slock)
@@ -158,14 +158,14 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 }
 #endif
 
-static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);
 
 	return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
 }
 
-static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);
 
@@ -174,43 +174,43 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 
 #ifndef CONFIG_PARAVIRT_SPINLOCKS
 
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
 	return __ticket_spin_is_locked(lock);
 }
 
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
 	return __ticket_spin_is_contended(lock);
 }
-#define __raw_spin_is_contended	__raw_spin_is_contended
+#define arch_spin_is_contended	arch_spin_is_contended
 
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	__ticket_spin_lock(lock);
 }
 
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	return __ticket_spin_trylock(lock);
 }
 
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	__ticket_spin_unlock(lock);
 }
 
-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 						  unsigned long flags)
 {
-	__raw_spin_lock(lock);
+	arch_spin_lock(lock);
 }
 
 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
 
-static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	while (__raw_spin_is_locked(lock))
+	while (arch_spin_is_locked(lock))
 		cpu_relax();
 }
 
@@ -232,7 +232,7 @@ static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
  * read_can_lock - would read_trylock() succeed?
  * @lock: the rwlock in question.
  */
-static inline int __raw_read_can_lock(raw_rwlock_t *lock)
+static inline int arch_read_can_lock(arch_rwlock_t *lock)
 {
 	return (int)(lock)->lock > 0;
 }
@@ -241,12 +241,12 @@ static inline int __raw_read_can_lock(raw_rwlock_t *lock)
  * write_can_lock - would write_trylock() succeed?
  * @lock: the rwlock in question.
  */
-static inline int __raw_write_can_lock(raw_rwlock_t *lock)
+static inline int arch_write_can_lock(arch_rwlock_t *lock)
 {
 	return (lock)->lock == RW_LOCK_BIAS;
 }
 
-static inline void __raw_read_lock(raw_rwlock_t *rw)
+static inline void arch_read_lock(arch_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
 		     "jns 1f\n"
@@ -255,7 +255,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw)
 		     ::LOCK_PTR_REG (rw) : "memory");
 }
 
-static inline void __raw_write_lock(raw_rwlock_t *rw)
+static inline void arch_write_lock(arch_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
 		     "jz 1f\n"
@@ -264,7 +264,7 @@ static inline void __raw_write_lock(raw_rwlock_t *rw)
 		     ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
 }
 
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
+static inline int arch_read_trylock(arch_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 
@@ -274,7 +274,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock)
 	return 0;
 }
 
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
+static inline int arch_write_trylock(arch_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 
@@ -284,23 +284,23 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
 	return 0;
 }
 
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
+static inline void arch_read_unlock(arch_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
 }
 
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
+static inline void arch_write_unlock(arch_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX "addl %1, %0"
 		     : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
 }
 
-#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
-#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
+#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
-#define _raw_spin_relax(lock)	cpu_relax()
-#define _raw_read_relax(lock)	cpu_relax()
-#define _raw_write_relax(lock)	cpu_relax()
+#define arch_spin_relax(lock)	cpu_relax()
+#define arch_read_relax(lock)	cpu_relax()
+#define arch_write_relax(lock)	cpu_relax()
 
 /* The {read|write|spin}_lock() on x86 are full memory barriers. */
 static inline void smp_mb__after_lock(void) { }
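
The renamed functions still implement the same ticket lock: slock packs a "next ticket" half and a "now serving" half, lock xadd takes a ticket, and unlock advances the serving half. A toy C model of the <256-CPU variant (a sketch, not the kernel asm):

    typedef struct { unsigned char head, tail; } toy_ticket_t; /* head=serving */

    static void toy_lock(toy_ticket_t *l)
    {
            /* like "lock xaddw $0x0100": atomically take the next ticket */
            unsigned char me = __sync_fetch_and_add(&l->tail, 1);

            while (__atomic_load_n(&l->head, __ATOMIC_ACQUIRE) != me)
                    ;       /* spin until our ticket is served */
    }

    static void toy_unlock(toy_ticket_t *l)
    {
            /* like "incb %0": serve the next waiter */
            __atomic_store_n(&l->head, l->head + 1, __ATOMIC_RELEASE);
    }
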
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 845f81c87091..dcb48b2edc11 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -5,16 +5,16 @@
5# error "please don't include this file directly" 5# error "please don't include this file directly"
6#endif 6#endif
7 7
8typedef struct raw_spinlock { 8typedef struct arch_spinlock {
9 unsigned int slock; 9 unsigned int slock;
10} raw_spinlock_t; 10} arch_spinlock_t;
11 11
12#define __RAW_SPIN_LOCK_UNLOCKED { 0 } 12#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
13 13
14typedef struct { 14typedef struct {
15 unsigned int lock; 15 unsigned int lock;
16} raw_rwlock_t; 16} arch_rwlock_t;
17 17
18#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } 18#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
19 19
20#endif /* _ASM_X86_SPINLOCK_TYPES_H */ 20#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e73815..35e89122a42f 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -5,6 +5,29 @@ extern int kstack_depth_to_print;
 
 int x86_is_stack_id(int id, char *name);
 
+struct thread_info;
+struct stacktrace_ops;
+
+typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo,
+				      unsigned long *stack,
+				      unsigned long bp,
+				      const struct stacktrace_ops *ops,
+				      void *data,
+				      unsigned long *end,
+				      int *graph);
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+		    unsigned long *stack, unsigned long bp,
+		    const struct stacktrace_ops *ops, void *data,
+		    unsigned long *end, int *graph);
+
+extern unsigned long
+print_context_stack_bp(struct thread_info *tinfo,
+		       unsigned long *stack, unsigned long bp,
+		       const struct stacktrace_ops *ops, void *data,
+		       unsigned long *end, int *graph);
+
 /* Generic stack tracer with callbacks */
 
 struct stacktrace_ops {
@@ -14,6 +37,7 @@ struct stacktrace_ops {
 	void (*address)(void *data, unsigned long address, int reliable);
 	/* On negative return stop dumping */
 	int (*stack)(void *data, char *name);
+	walk_stack_t	walk_stack;
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
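
Aside, not part of the patch: a minimal sketch of how a dump_trace() caller is expected to populate the new walk_stack member, pairing it with the print_context_stack() walker declared above. The other stacktrace_ops callbacks are elided here; a real user must supply them as well.

static void demo_address(void *data, unsigned long addr, int reliable)
{
	printk(KERN_DEFAULT " [<%p>] %s\n", (void *)addr, reliable ? "" : "?");
}

static const struct stacktrace_ops demo_ops = {
	.address	= demo_address,
	.walk_stack	= print_context_stack,	/* frame-pointer based walker */
};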
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index ae907e617181..3d3e8353ee5c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
  */
 
 #ifndef CONFIG_KMEMCHECK
+
+#if (__GNUC__ >= 4)
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+#else
 #define memcpy(t, f, n)				\
 	(__builtin_constant_p((n))		\
 	 ? __constant_memcpy((t), (f), (n))	\
 	 : __memcpy((t), (f), (n)))
+#endif
 #else
 /*
  * kmemcheck becomes very happy if we use the REP instructions unconditionally,
@@ -316,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
 	 : __memset_generic((s), (c), (count)))
 
 #define __HAVE_ARCH_MEMSET
+#if (__GNUC__ >= 4)
+#define memset(s, c, count) __builtin_memset(s, c, count)
+#else
 #define memset(s, c, count)			\
 	(__builtin_constant_p(c)		\
 	 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
 				 (count))	\
 	 : __memset((s), (c), (count)))
+#endif
 
 /*
  * find the first occurrence of byte 'c', or 1 past the area if none
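
Aside, not part of the patch: the pre-gcc-4 branches above lean on __builtin_constant_p() to pick a specialised expansion when the length is a compile-time constant. A stand-alone illustration of that dispatch (the demo_* names are made up):

#include <string.h>

static void *copy_small_fixed(void *t, const void *f, size_t n)
{
	return memcpy(t, f, n);	/* n is a literal; the compiler may unroll */
}

static void *copy_generic(void *t, const void *f, size_t n)
{
	return memcpy(t, f, n);	/* ordinary out-of-line path */
}

#define demo_memcpy(t, f, n)			\
	(__builtin_constant_p(n)		\
	 ? copy_small_fixed((t), (f), (n))	\
	 : copy_generic((t), (f), (n)))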
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..1fecb7e61130 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u16 intercept_dr_write;
 	u32 intercept_exceptions;
 	u64 intercept;
-	u8 reserved_1[44];
+	u8 reserved_1[42];
+	u16 pause_filter_count;
 	u64 iopm_base_pa;
 	u64 msrpm_base_pa;
 	u64 tsc_offset;
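
Aside, not part of the patch: shrinking reserved_1 from 44 to 42 bytes while adding a u16 keeps every later VMCB field at its architecturally fixed offset, since 42 + 2 == 44. A cut-down illustration (this is not the full structure; the assumed offset of 44 only holds for this reduced layout):

#include <stddef.h>

struct demo_vmcb_control {
	unsigned char		reserved_1[42];
	unsigned short		pause_filter_count;
	unsigned long long	iopm_base_pa;
} __attribute__((packed));

/* 42 + 2 == 44, so iopm_base_pa stays where the old layout put it */
_Static_assert(offsetof(struct demo_vmcb_control, iopm_base_pa) == 44,
	       "later fields must not move");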
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20174fb..8085277e1b8b 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,15 +3,16 @@
 
 #include <linux/swiotlb.h>
 
-/* SWIOTLB interface */
-
-extern int swiotlb_force;
-
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
-extern void pci_swiotlb_init(void);
+extern int __init pci_swiotlb_detect(void);
+extern void __init pci_swiotlb_init(void);
 #else
 #define swiotlb 0
+static inline int pci_swiotlb_detect(void)
+{
+	return 0;
+}
 static inline void pci_swiotlb_init(void)
 {
 }
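
Aside, not part of the patch: the shape of the new detect/init split, sketched as a generic boot sequence. Detection decides (and reports) whether the bounce-buffer path is needed; initialisation then allocates only if detection said yes. All demo_* names are illustrative.

static int demo_swiotlb;		/* stand-in for the "swiotlb" flag */

static int demo_swiotlb_detect(void)
{
	/* e.g. >4GB of RAM and no hardware IOMMU was found */
	demo_swiotlb = 1;
	return demo_swiotlb;
}

static void demo_swiotlb_init(void)
{
	if (demo_swiotlb) {
		/* allocate the bounce buffer pool here */
	}
}

static void demo_pci_iommu_setup(void)
{
	if (demo_swiotlb_detect())
		demo_swiotlb_init();
}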
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 72a6dcd1299b..d5f69045c100 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -30,7 +30,6 @@ struct mmap_arg_struct;
 asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
 asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
 
-asmlinkage long sys32_pipe(int __user *);
 struct sigaction32;
 struct old_sigaction32;
 asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
@@ -51,20 +50,12 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
 asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
 asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-struct sysctl_ia32;
-asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
-#endif
-
 asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
 asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
 
 asmlinkage long sys32_personality(unsigned long);
 asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
 
-asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
-			    unsigned long, unsigned long, unsigned long);
-
 struct oldold_utsname;
 struct old_utsname;
 asmlinkage long sys32_olduname(struct oldold_utsname __user *);
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 372b76edd63f..8868b9420b0e 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -18,16 +18,24 @@
 /* Common in X86_32 and X86_64 */
 /* kernel/ioport.c */
 asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+long sys_iopl(unsigned int, struct pt_regs *);
 
 /* kernel/process.c */
 int sys_fork(struct pt_regs *);
 int sys_vfork(struct pt_regs *);
+long sys_execve(char __user *, char __user * __user *,
+		char __user * __user *, struct pt_regs *);
+long sys_clone(unsigned long, unsigned long, void __user *,
+	       void __user *, struct pt_regs *);
 
 /* kernel/ldt.c */
 asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
 
 /* kernel/signal.c */
 long sys_rt_sigreturn(struct pt_regs *);
+long sys_sigaltstack(const stack_t __user *, stack_t __user *,
+		     struct pt_regs *);
+
 
 /* kernel/tls.c */
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
@@ -35,18 +43,11 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
 /* X86_32 only */
 #ifdef CONFIG_X86_32
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
-
-/* kernel/process_32.c */
-int sys_clone(struct pt_regs *);
-int sys_execve(struct pt_regs *);
 
 /* kernel/signal.c */
 asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
 asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
 			     struct old_sigaction __user *);
-int sys_sigaltstack(struct pt_regs *);
 unsigned long sys_sigreturn(struct pt_regs *);
 
 /* kernel/sys_i386_32.c */
@@ -55,8 +56,6 @@ struct sel_arg_struct;
 struct oldold_utsname;
 struct old_utsname;
 
-asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
-			  unsigned long, unsigned long, unsigned long);
 asmlinkage int old_mmap(struct mmap_arg_struct __user *);
 asmlinkage int old_select(struct sel_arg_struct __user *);
 asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
@@ -64,28 +63,15 @@ asmlinkage int sys_uname(struct old_utsname __user *);
 asmlinkage int sys_olduname(struct oldold_utsname __user *);
 
 /* kernel/vm86_32.c */
-int sys_vm86old(struct pt_regs *);
-int sys_vm86(struct pt_regs *);
+int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);
+int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
 
 #else /* CONFIG_X86_32 */
 
 /* X86_64 only */
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
 /* kernel/process_64.c */
-asmlinkage long sys_clone(unsigned long, unsigned long,
-			  void __user *, void __user *,
-			  struct pt_regs *);
-asmlinkage long sys_execve(char __user *, char __user * __user *,
-			   char __user * __user *,
-			   struct pt_regs *);
 long sys_arch_prctl(int, unsigned long);
 
-/* kernel/signal.c */
-asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
-				struct pt_regs *);
-
 /* kernel/sys_x86_64.c */
 struct new_utsname;
 
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index f08f97374892..ecb544e65382 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 struct tss_struct;
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss);
+extern void show_regs_common(void);
 
 #ifdef CONFIG_X86_32
 
@@ -128,8 +129,6 @@ do { \
128 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ 129 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
129 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ 130 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
130 "call __switch_to\n\t" \ 131 "call __switch_to\n\t" \
131 ".globl thread_return\n" \
132 "thread_return:\n\t" \
133 "movq "__percpu_arg([current_task])",%%rsi\n\t" \ 132 "movq "__percpu_arg([current_task])",%%rsi\n\t" \
134 __switch_canary \ 133 __switch_canary \
135 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 134 "movq %P[thread_info](%%rsi),%%r8\n\t" \
@@ -157,19 +156,22 @@ extern void native_load_gs_index(unsigned);
  * Load a segment. Fall back on loading the zero
  * segment if something goes wrong..
  */
 #define loadsegment(seg, value)						\
-	asm volatile("\n"						\
-		     "1:\t"						\
-		     "movl %k0,%%" #seg "\n"				\
-		     "2:\n"						\
-		     ".section .fixup,\"ax\"\n"				\
-		     "3:\t"						\
-		     "movl %k1, %%" #seg "\n\t"				\
-		     "jmp 2b\n"						\
-		     ".previous\n"					\
-		     _ASM_EXTABLE(1b,3b)				\
-		     : :"r" (value), "r" (0) : "memory")
-
+do {									\
+	unsigned short __val = (value);					\
+									\
+	asm volatile("						\n"	\
+		     "1:	movl %k0,%%" #seg "		\n"	\
+									\
+		     ".section .fixup,\"ax\"			\n"	\
+		     "2:	xorl %k0,%k0			\n"	\
+		     "		jmp 1b				\n"	\
+		     ".previous					\n"	\
+									\
+		     _ASM_EXTABLE(1b, 2b)				\
+									\
+		     : "+r" (__val) : : "memory");			\
+} while (0)
 
 /*
  * Save a segment register away
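
Aside, not part of the patch: the rewrite wraps loadsegment() in do { ... } while (0) so the macro can own a local (__val) while still behaving as one statement. Without that wrapper a multi-statement macro misbehaves under a braceless if, as this stand-alone demo shows:

static void stmt_a(void) { }
static void stmt_b(void) { }

#define BAD_PAIR()	stmt_a(); stmt_b()
#define GOOD_PAIR()	do { stmt_a(); stmt_b(); } while (0)

static void demo(int cond)
{
	if (cond)
		GOOD_PAIR();	/* one statement: both run iff cond */
	/* with BAD_PAIR() here, stmt_b() would run unconditionally */
}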
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..e0d28901e969 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,10 +83,10 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
-#define TIF_ABI_PENDING		19
 #define TIF_MEMDIE		20
 #define TIF_DEBUG		21	/* uses debug registers */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
@@ -107,10 +107,10 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
-#define _TIF_ABI_PENDING	(1 << TIF_ABI_PENDING)
 #define _TIF_DEBUG		(1 << TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
@@ -142,13 +142,14 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
+	 _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE		0x10000000
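
Aside, not part of the patch: each TIF_* constant is a bit index and the matching _TIF_* constant is its shifted mask, which is why the new flag is added in both tables above. A stand-alone illustration of the pairing (demo names are made up):

#define DEMO_TIF_USER_RETURN_NOTIFY	11
#define DEMO_TIF_MASK			(1UL << DEMO_TIF_USER_RETURN_NOTIFY)

static int demo_wants_return_notify(unsigned long thread_flags)
{
	/* mask form is what composite masks like _TIF_DO_NOTIFY_MASK use */
	return (thread_flags & DEMO_TIF_MASK) != 0;
}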
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 25a92842dd99..c5087d796587 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -35,11 +35,16 @@
 # endif
 #endif
 
-/* Node not present */
-#define NUMA_NO_NODE	(-1)
+/*
+ * to preserve the visibility of NUMA_NO_NODE definition,
+ * moved to there from here.  May be used independent of
+ * CONFIG_NUMA.
+ */
+#include <linux/numa.h>
 
 #ifdef CONFIG_NUMA
 #include <linux/cpumask.h>
+
 #include <asm/mpspec.h>
 
 #ifdef CONFIG_X86_32
@@ -143,6 +148,7 @@ extern unsigned long node_remap_size[];
 				| 1*SD_BALANCE_FORK			\
 				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
+				| 0*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
 				| 0*SD_SHARE_PKG_RESOURCES		\
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 90f06c25221d..cb507bb05d79 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -16,7 +16,6 @@ extern unsigned long initial_code;
 extern unsigned long initial_gs;
 
 #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
-#define TRAMPOLINE_BASE 0x6000
 
 extern unsigned long setup_trampoline(void);
 extern void __init reserve_trampoline_memory(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d2c6c930b491..abd3e0ea762a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -570,7 +570,6 @@ extern struct movsl_mask {
 #ifdef CONFIG_X86_32
 # include "uaccess_32.h"
 #else
-# define ARCH_HAS_SEARCH_EXTABLE
 # include "uaccess_64.h"
 #endif
 
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 632fb44b4cb5..088d09fb1615 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -187,9 +187,33 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
 
 unsigned long __must_check copy_to_user(void __user *to,
 					const void *from, unsigned long n);
-unsigned long __must_check copy_from_user(void *to,
+unsigned long __must_check _copy_from_user(void *to,
 					  const void __user *from,
 					  unsigned long n);
+
+
+extern void copy_from_user_overflow(void)
+#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
+	__compiletime_error("copy_from_user() buffer size is not provably correct")
+#else
+	__compiletime_warning("copy_from_user() buffer size is not provably correct")
+#endif
+;
+
+static inline unsigned long __must_check copy_from_user(void *to,
+					  const void __user *from,
+					  unsigned long n)
+{
+	int sz = __compiletime_object_size(to);
+
+	if (likely(sz == -1 || sz >= n))
+		n = _copy_from_user(to, from, n);
+	else
+		copy_from_user_overflow();
+
+	return n;
+}
+
 long __must_check strncpy_from_user(char *dst, const char __user *src,
 				    long count);
 long __must_check __strncpy_from_user(char *dst,
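
Aside, not part of the patch: __compiletime_object_size() is built on GCC's __builtin_object_size(), which this stand-alone snippet demonstrates. It yields the destination's known size, or (size_t)-1 when the compiler cannot tell, which is exactly the "sz == -1" escape hatch in the wrapper above. (At -O0 the builtin typically reports -1 even for the known case.)

#include <stdio.h>

int main(void)
{
	char buf[16];
	char *p = buf;

	printf("%zu\n", __builtin_object_size(buf, 0));	/* 16 */
	printf("%zu\n", __builtin_object_size(p, 0));	/* 16 if tracked, else -1 */
	return 0;
}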
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc50..535e421498f6 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -19,12 +19,36 @@ __must_check unsigned long
 copy_user_generic(void *to, const void *from, unsigned len);
 
 __must_check unsigned long
-copy_to_user(void __user *to, const void *from, unsigned len);
+_copy_to_user(void __user *to, const void *from, unsigned len);
 __must_check unsigned long
-copy_from_user(void *to, const void __user *from, unsigned len);
+_copy_from_user(void *to, const void __user *from, unsigned len);
 __must_check unsigned long
 copy_in_user(void __user *to, const void __user *from, unsigned len);
 
+static inline unsigned long __must_check copy_from_user(void *to,
+					  const void __user *from,
+					  unsigned long n)
+{
+	int sz = __compiletime_object_size(to);
+
+	might_fault();
+	if (likely(sz == -1 || sz >= n))
+		n = _copy_from_user(to, from, n);
+#ifdef CONFIG_DEBUG_VM
+	else
+		WARN(1, "Buffer overflow detected!\n");
+#endif
+	return n;
+}
+
+static __always_inline __must_check
+int copy_to_user(void __user *dst, const void *src, unsigned size)
+{
+	might_fault();
+
+	return _copy_to_user(dst, src, size);
+}
+
 static __always_inline __must_check
 int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
@@ -176,8 +200,11 @@ __must_check long strlen_user(const char __user *str);
 __must_check unsigned long clear_user(void __user *mem, unsigned long len);
 __must_check unsigned long __clear_user(void __user *mem, unsigned long len);
 
-__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
-					    unsigned size);
+static __must_check __always_inline int
+__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
+{
+	return copy_user_generic(dst, (__force const void *)src, size);
+}
 
 static __must_check __always_inline int
 __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..3baf379fa840 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
+#define __NR_recvmmsg		337
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 337
+#define NR_syscalls 338
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..4843f7ba754a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 #define __NR_perf_event_open			298
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_recvmmsg				299
+__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
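
Aside, not part of the patch: what the new syscall buys userland, draining several UDP datagrams with one kernel entry. This sketch assumes a glibc new enough to expose recvmmsg(); otherwise syscall(__NR_recvmmsg, ...) with the numbers wired up above (337 on i386, 299 on x86-64) is the raw form.

#define _GNU_SOURCE
#include <string.h>
#include <sys/socket.h>

#define VLEN 8

static int demo_drain(int sock)
{
	struct mmsghdr msgs[VLEN];
	struct iovec iov[VLEN];
	static char bufs[VLEN][1500];
	int i;

	memset(msgs, 0, sizeof(msgs));
	for (i = 0; i < VLEN; i++) {
		iov[i].iov_base = bufs[i];
		iov[i].iov_len = sizeof(bufs[i]);
		msgs[i].msg_hdr.msg_iov = &iov[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
	}
	/* up to VLEN datagrams per call; returns how many arrived */
	return recvmmsg(sock, msgs, VLEN, MSG_DONTWAIT, NULL);
}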
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 7ed17ff502b9..2751f3075d8b 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -76,15 +76,6 @@ union partition_info_u {
 	};
 };
 
-union uv_watchlist_u {
-	u64 val;
-	struct {
-		u64	blade	: 16,
-			size	: 32,
-			filler	: 16;
-	};
-};
-
 enum uv_memprotect {
 	UV_MEMPROT_RESTRICT_ACCESS,
 	UV_MEMPROT_ALLOW_AMO,
@@ -100,7 +91,7 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
 
 extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
 extern s64 uv_bios_freq_base(u64, u64 *);
-extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int,
+extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
 				      unsigned long *);
 extern int uv_bios_mq_watchlist_free(int, int);
 extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 80e2984f521c..b414d2b401f6 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -55,7 +55,7 @@
 #define DESC_STATUS_SOURCE_TIMEOUT	3
 
 /*
- * source side threshholds at which message retries print a warning
+ * source side thresholds at which message retries print a warning
  */
 #define SOURCE_TIMEOUT_LIMIT		20
 #define DESTINATION_TIMEOUT_LIMIT	20
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 04eb6c958b9d..40be813fefb1 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -19,6 +19,8 @@
 #include <asm/types.h>
 #include <asm/percpu.h>
 #include <asm/uv/uv_mmrs.h>
+#include <asm/irq_vectors.h>
+#include <asm/io_apic.h>
 
 
 /*
@@ -29,20 +31,20 @@
  * contiguous (although various IO spaces may punch holes in
  * it)..
  *
  * N	- Number of bits in the node portion of a socket physical
  *	  address.
  *
  * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
  *	   routers always have low bit of 1, C/MBricks have low bit
  *	   equal to 0. Most addressing macros that target UV hub chips
  *	   right shift the NASID by 1 to exclude the always-zero bit.
  *	   NASIDs contain up to 15 bits.
  *
  * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
  *	   of nasids.
  *
  * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
  *	   of the nasid for socket usage.
  *
  *
  * NumaLink Global Physical Address Format:
@@ -69,12 +71,12 @@
  *
  *
  * APICID format
  * NOTE!!!!!!  This is the current format of the APICID. However, code
  * should assume that this will change in the future. Use functions
  * in this file for all APICID bit manipulations and conversion.
  *
  *		1111110000000000
  *		5432109876543210
  *		pppppppppplc0cch
  *		sssssssssss
  *
@@ -87,9 +89,9 @@
  * Note: Processor only supports 12 bits in the APICID register. The ACPI
  *       tables hold all 16 bits. Software needs to be aware of this.
  *
  * Unless otherwise specified, all references to APICID refer to
  * the FULL value contained in ACPI tables, not the subset in the
  * processor APICID register.
  */
 
 
@@ -114,7 +116,7 @@
 /*
  * The largest possible NASID of a C or M brick (+ 2)
  */
-#define UV_MAX_NASID_VALUE	(UV_MAX_NUMALINK_NODES * 2)
+#define UV_MAX_NASID_VALUE	(UV_MAX_NUMALINK_BLADES * 2)
 
 struct uv_scir_s {
 	struct timer_list timer;
@@ -149,16 +151,16 @@ struct uv_hub_info_s {
 };
 
 DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define uv_hub_info		(&__get_cpu_var(__uv_hub_info))
 #define uv_cpu_hub_info(cpu)	(&per_cpu(__uv_hub_info, cpu))
 
 /*
  * Local & Global MMR space macros.
  *	Note: macros are intended to be used ONLY by inline functions
  *	in this file - not by other kernel code.
  *		n -  NASID (full 15-bit global nasid)
  *		g -  GNODE (full 15-bit global nasid, right shifted 1)
  *		p -  PNODE (local part of nsids, right shifted 1)
  */
 #define UV_NASID_TO_PNODE(n)		(((n) >> 1) & uv_hub_info->pnode_mask)
 #define UV_PNODE_TO_GNODE(p)		((p) |uv_hub_info->gnode_extra)
@@ -170,6 +172,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define UV_LOCAL_MMR_SIZE		(64UL * 1024 * 1024)
 #define UV_GLOBAL_MMR32_SIZE		(64UL * 1024 * 1024)
 
+#define UV_GLOBAL_GRU_MMR_BASE		0x4000000
+
 #define UV_GLOBAL_MMR32_PNODE_SHIFT	15
 #define UV_GLOBAL_MMR64_PNODE_SHIFT	26
 
@@ -211,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 /*
  * Macros for converting between kernel virtual addresses, socket local physical
  * addresses, and UV global physical addresses.
  *	Note: use the standard __pa() & __va() macros for converting
  *	      between socket virtual and socket physical addresses.
  */
 
 /* socket phys RAM --> UV global physical address */
@@ -230,6 +234,40 @@ static inline unsigned long uv_gpa(void *v)
 	return uv_soc_phys_ram_to_gpa(__pa(v));
 }
 
+/* Top two bits indicate the requested address is in MMR space.  */
+static inline int
+uv_gpa_in_mmr_space(unsigned long gpa)
+{
+	return (gpa >> 62) == 0x3UL;
+}
+
+/* UV global physical address --> socket phys RAM */
+static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
+{
+	unsigned long paddr = gpa & uv_hub_info->gpa_mask;
+	unsigned long remap_base = uv_hub_info->lowmem_remap_base;
+	unsigned long remap_top = uv_hub_info->lowmem_remap_top;
+
+	if (paddr >= remap_base && paddr < remap_base + remap_top)
+		paddr -= remap_base;
+	return paddr;
+}
+
+
+/* gpa -> gnode */
+static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
+{
+	return gpa >> uv_hub_info->m_val;
+}
+
+/* gpa -> pnode */
+static inline int uv_gpa_to_pnode(unsigned long gpa)
+{
+	unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1;
+
+	return uv_gpa_to_gnode(gpa) & n_mask;
+}
+
 /* pnode, offset --> socket virtual */
 static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
 {
@@ -249,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid)
  * Access global MMRs using the low memory MMR32 space. This region supports
  * faster MMR access but not all MMRs are accessible in this space.
  */
-static inline unsigned long *uv_global_mmr32_address(int pnode,
-				unsigned long offset)
+static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset)
 {
 	return __va(UV_GLOBAL_MMR32_BASE |
 		    UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
 }
 
-static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
-				 unsigned long val)
+static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val)
 {
 	writeq(val, uv_global_mmr32_address(pnode, offset));
 }
 
-static inline unsigned long uv_read_global_mmr32(int pnode,
-						 unsigned long offset)
+static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset)
 {
 	return readq(uv_global_mmr32_address(pnode, offset));
 }
@@ -272,26 +307,42 @@ static inline unsigned long uv_read_global_mmr32(int pnode,
  * Access Global MMR space using the MMR space located at the top of physical
  * memory.
  */
-static inline unsigned long *uv_global_mmr64_address(int pnode,
-				unsigned long offset)
+static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
 {
 	return __va(UV_GLOBAL_MMR64_BASE |
 		    UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
 }
 
-static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
-				unsigned long val)
+static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val)
 {
 	writeq(val, uv_global_mmr64_address(pnode, offset));
 }
 
-static inline unsigned long uv_read_global_mmr64(int pnode,
-						 unsigned long offset)
+static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset)
 {
 	return readq(uv_global_mmr64_address(pnode, offset));
 }
 
 /*
+ * Global MMR space addresses when referenced by the GRU. (GRU does
+ * NOT use socket addressing).
+ */
+static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset)
+{
+	return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val);
+}
+
+static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
+{
+	writeb(val, uv_global_mmr64_address(pnode, offset));
+}
+
+static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset)
+{
+	return readb(uv_global_mmr64_address(pnode, offset));
+}
+
+/*
  * Access hub local MMRs. Faster than using global space but only local MMRs
  * are accessible.
  */
@@ -410,23 +461,51 @@ static inline void uv_set_scir_bits(unsigned char value)
 	}
 }
 
+static inline unsigned long uv_scir_offset(int apicid)
+{
+	return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f);
+}
+
 static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
 {
 	if (uv_cpu_hub_info(cpu)->scir.state != value) {
+		uv_write_global_mmr8(uv_cpu_to_pnode(cpu),
+				uv_cpu_hub_info(cpu)->scir.offset, value);
 		uv_cpu_hub_info(cpu)->scir.state = value;
-		uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
 	}
 }
 
+static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
+{
+	return (1UL << UVH_IPI_INT_SEND_SHFT) |
+			((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
+			(mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
+			(vector << UVH_IPI_INT_VECTOR_SHFT);
+}
+
 static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
 {
 	unsigned long val;
+	unsigned long dmode = dest_Fixed;
 
-	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
-			((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
-			(vector << UVH_IPI_INT_VECTOR_SHFT);
+	if (vector == NMI_VECTOR)
+		dmode = dest_NMI;
+
+	val = uv_hub_ipi_value(apicid, vector, dmode);
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 }
 
+/*
+ * Get the minimum revision number of the hub chips within the partition.
+ *     1 - initial rev 1.0 silicon
+ *     2 - rev 2.0 production silicon
+ */
+static inline int uv_get_min_hub_revision_id(void)
+{
+	extern int uv_min_hub_revision_id;
+
+	return uv_min_hub_revision_id;
+}
+
 #endif /* CONFIG_X86_64 */
 #endif /* _ASM_X86_UV_UV_HUB_H */
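
Aside, not part of the patch: a worked example of the gpa -> gnode -> pnode bit arithmetic added above, with made-up field widths (m_val = 26 offset bits, n_val = 4 node bits):

#define DEMO_M_VAL	26
#define DEMO_N_VAL	4

static unsigned long demo_gpa_to_gnode(unsigned long gpa)
{
	return gpa >> DEMO_M_VAL;		/* strip the per-node offset bits */
}

static int demo_gpa_to_pnode(unsigned long gpa)
{
	unsigned long n_mask = (1UL << DEMO_N_VAL) - 1;

	return demo_gpa_to_gnode(gpa) & n_mask;	/* keep the low N node bits */
}

/* e.g. gpa 0x2c000000 -> gnode 0xb -> pnode 0xb, since 11 < (1 << 4) */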
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index 9613c8c0b647..d6b17c760622 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry {
 		dest		: 32;
 };
 
-extern struct irq_chip uv_irq_chip;
-
-extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
-extern void arch_disable_uv_irq(int, unsigned long);
+enum {
+	UV_AFFINITY_ALL,
+	UV_AFFINITY_NODE,
+	UV_AFFINITY_CPU
+};
 
-extern int uv_setup_irq(char *, int, int, unsigned long);
-extern void uv_teardown_irq(unsigned int, int, unsigned long);
+extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
+extern int uv_setup_irq(char *, int, int, unsigned long, int);
+extern void uv_teardown_irq(unsigned int);
 
 #endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..2b4945419a84 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -56,6 +56,7 @@
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
@@ -144,6 +145,8 @@ enum vmcs_field {
 	VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
 	TPR_THRESHOLD                   = 0x0000401c,
 	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
+	PLE_GAP                         = 0x00004020,
+	PLE_WINDOW                      = 0x00004022,
 	VM_INSTRUCTION_ERROR            = 0x00004400,
 	VM_EXIT_REASON                  = 0x00004402,
 	VM_EXIT_INTR_INFO               = 0x00004404,
@@ -248,6 +251,7 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_PAUSE_INSTRUCTION   40
 #define EXIT_REASON_MCE_DURING_VMENTRY	 41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
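
Aside, not part of the patch: how a VMX host would consume these new definitions, enabling pause-loop exiting when the secondary controls allow it and keying off the new exit reason. vmcs_write32() and yield_to_other_vcpu() are illustrative names declared here for the sketch, not real exports of this header, and the gap/window values are placeholders.

extern void vmcs_write32(unsigned long field, unsigned int value);	/* illustrative */
extern void yield_to_other_vcpu(void);					/* illustrative */

static void demo_enable_ple(unsigned int secondary_ctls_allowed)
{
	if (secondary_ctls_allowed & SECONDARY_EXEC_PAUSE_LOOP_EXITING) {
		vmcs_write32(PLE_GAP, 128);	/* max cycles between PAUSEs in one loop */
		vmcs_write32(PLE_WINDOW, 4096);	/* spin length that triggers a VM exit */
	}
}

static void demo_handle_exit(unsigned int exit_reason)
{
	if (exit_reason == EXIT_REASON_PAUSE_INSTRUCTION)
		yield_to_other_vcpu();	/* spinner likely waits on a preempted lock holder */
}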
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 97e5fb4f3bd3..ea0e8ea15e15 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -91,6 +91,14 @@ struct x86_init_timers {
 };
 
 /**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init:			platform specific iommu setup
+ */
+struct x86_init_iommu {
+	int (*iommu_init)(void);
+};
+
+/**
  * struct x86_init_ops - functions for platform specific setup
  *
  */
@@ -101,6 +109,7 @@ struct x86_init_ops {
 	struct x86_init_oem		oem;
 	struct x86_init_paging		paging;
 	struct x86_init_timers		timers;
+	struct x86_init_iommu		iommu;
 };
 
 /**
@@ -122,6 +131,7 @@ struct x86_platform_ops {
 	unsigned long (*calibrate_tsc)(void);
 	unsigned long (*get_wallclock)(void);
 	int (*set_wallclock)(unsigned long nowtime);
+	void (*iommu_shutdown)(void);
 	bool (*is_untracked_pat_range)(u64 start, u64 end);
 };
 
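
Aside, not part of the patch: the intended use of the new hook. Early setup code points x86_init.iommu.iommu_init at its own routine, and the generic code later invokes it without knowing which implementation was chosen. demo_iommu_init() is a placeholder name:

static int demo_iommu_init(void)
{
	/* map hardware, build device tables, ... */
	return 0;
}

static void demo_platform_setup(void)
{
	x86_init.iommu.iommu_init = demo_iommu_init;
}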
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90c0edf..396ff4cc8ed4 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,31 +37,4 @@
 extern struct shared_info *HYPERVISOR_shared_info;
 extern struct start_info *xen_start_info;
 
-enum xen_domain_type {
-	XEN_NATIVE,		/* running on bare hardware */
-	XEN_PV_DOMAIN,		/* running in a PV domain */
-	XEN_HVM_DOMAIN,		/* running in a Xen hvm domain */
-};
-
-#ifdef CONFIG_XEN
-extern enum xen_domain_type xen_domain_type;
-#else
-#define xen_domain_type		XEN_NATIVE
-#endif
-
-#define xen_domain()		(xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain()		(xen_domain() &&			\
-				 xen_domain_type == XEN_PV_DOMAIN)
-#define xen_hvm_domain()	(xen_domain() &&			\
-				 xen_domain_type == XEN_HVM_DOMAIN)
-
-#ifdef CONFIG_XEN_DOM0
-#include <xen/interface/xen.h>
-
-#define xen_initial_domain()	(xen_pv_domain() && \
-				 xen_start_info->flags & SIF_INITDOMAIN)
-#else  /* !CONFIG_XEN_DOM0 */
-#define xen_initial_domain()	(0)
-#endif	/* CONFIG_XEN_DOM0 */
-
 #endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..d87f09bc5a52 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
@@ -89,7 +89,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 
 obj-$(CONFIG_K8_NB)		+= k8.o
-obj-$(CONFIG_MGEODE_LX)		+= geode_32.o mfgpt_32.o
 obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..6f35260bb3ef 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
 obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup_rm.o wakeup_$(BITS).o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
-obj-y				+= cstate.o processor.o
+obj-y				+= cstate.o
 endif
 
 $(obj)/wakeup_rm.o:    $(obj)/realmode/wakeup.bin
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..036d28adf59d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
 	}
 
 	hpet_address = hpet_tbl->address.address;
+	hpet_blockid = hpet_tbl->sequence;
 
 	/*
 	 * Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1123,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	if (!acpi_sci_override_gsi)
 		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
 
-	/* Fill in identity legacy mapings where no override */
+	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
 
 	count =
@@ -1528,16 +1529,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
  *	if acpi_blacklisted() acpi_disabled = 1;
  *	acpi_irq_model=...
  *	...
- *
- * return value: (currently ignored)
- *	0: success
- *	!0: failure
  */
 
-int __init acpi_boot_table_init(void)
+void __init acpi_boot_table_init(void)
 {
-	int error;
-
 	dmi_check_system(acpi_dmi_table);
 
 	/*
@@ -1545,15 +1540,14 @@ int __init acpi_boot_table_init(void)
 	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
 	if (acpi_disabled && !acpi_ht)
-		return 1;
+		return;
 
 	/*
 	 * Initialize the ACPI boot-time table parser.
 	 */
-	error = acpi_table_init();
-	if (error) {
+	if (acpi_table_init()) {
 		disable_acpi();
-		return error;
+		return;
 	}
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1561,18 +1555,15 @@ int __init acpi_boot_table_init(void)
 	/*
 	 * blacklist may disable ACPI entirely
 	 */
-	error = acpi_blacklisted();
-	if (error) {
+	if (acpi_blacklisted()) {
 		if (acpi_force) {
 			printk(KERN_WARNING PREFIX "acpi=force override\n");
 		} else {
 			printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
 			disable_acpi();
-			return error;
+			return;
 		}
 	}
-
-	return 0;
 }
 
 int __init early_acpi_boot_init(void)
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 59cdfa4686b2..2e837f5080fe 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
 	 * P4, Core and beyond CPUs
 	 */
 	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14)))
+	    (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
 			flags->bm_control = 0;
 }
 EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d296f4a195c9..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2005 Intel Corporation
- * 	Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- * 	- Added _PDC for platforms with Intel CPUs
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-
-#include <acpi/processor.h>
-#include <asm/acpi.h>
-
-static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
-{
-	struct acpi_object_list *obj_list;
-	union acpi_object *obj;
-	u32 *buf;
-
-	/* allocate and initialize pdc. It will be used later. */
-	obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
-	if (!obj_list) {
-		printk(KERN_ERR "Memory allocation error\n");
-		return;
-	}
-
-	obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
-	if (!obj) {
-		printk(KERN_ERR "Memory allocation error\n");
-		kfree(obj_list);
-		return;
-	}
-
-	buf = kmalloc(12, GFP_KERNEL);
-	if (!buf) {
-		printk(KERN_ERR "Memory allocation error\n");
-		kfree(obj);
-		kfree(obj_list);
-		return;
-	}
-
-	buf[0] = ACPI_PDC_REVISION_ID;
-	buf[1] = 1;
-	buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
-
-	/*
-	 * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
-	 * that OSPM is capable of native ACPI throttling software
-	 * coordination using BIOS supplied _TSD info.
-	 */
-	buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
-	if (cpu_has(c, X86_FEATURE_EST))
-		buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
-
-	if (cpu_has(c, X86_FEATURE_ACPI))
-		buf[2] |= ACPI_PDC_T_FFH;
-
-	/*
-	 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
-	 */
-	if (!cpu_has(c, X86_FEATURE_MWAIT))
-		buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
-
-	obj->type = ACPI_TYPE_BUFFER;
-	obj->buffer.length = 12;
-	obj->buffer.pointer = (u8 *) buf;
-	obj_list->count = 1;
-	obj_list->pointer = obj;
-	pr->pdc = obj_list;
-
-	return;
-}
-
-
-/* Initialize _PDC data based on the CPU vendor */
-void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
-{
-	struct cpuinfo_x86 *c = &cpu_data(pr->id);
-
-	pr->pdc = NULL;
-	if (c->x86_vendor == X86_VENDOR_INTEL)
-		init_intel_pdc(pr, c);
-
-	return;
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
-
-void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
-{
-	if (pr->pdc) {
-		kfree(pr->pdc->pointer->buffer.pointer);
-		kfree(pr->pdc->pointer);
-		kfree(pr->pdc);
-		pr->pdc = NULL;
-	}
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cda..060fff8f5c5b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -57,5 +57,8 @@ SECTIONS
 		*(.note*)
 	}
 
+	/*
+	 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+	 */
 	. = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
 }
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 82e508677b91..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
 #endif
 	if (strncmp(str, "old_ordering", 12) == 0)
 		acpi_old_suspend_ordering();
+	if (strncmp(str, "sci_force_enable", 16) == 0)
+		acpi_set_sci_en_on_resume();
 	str = strchr(str, ',');
 	if (str != NULL)
 		str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f6a28d..adb0ba025702 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
@@ -19,7 +19,7 @@
 
 #include <linux/pci.h>
 #include <linux/gfp.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
 #include <linux/debugfs.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
@@ -28,6 +28,7 @@
 #include <asm/proto.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
+#include <asm/amd_iommu_proto.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 
@@ -56,20 +57,152 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
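[Annotation] get_device_id() above leans on calc_devid() to pack the PCI bus number and devfn into the 16-bit requestor ID that indexes every IOMMU table. A sketch of the presumed bit layout (calc_devid() itself lives in the driver headers; the helper below is illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Presumed requestor-ID layout: bus[15:8] | device[7:3] | function[2:0] */
static uint16_t devid_sketch(uint8_t bus, uint8_t devfn)
{
        return ((uint16_t)bus << 8) | devfn;
}

int main(void)
{
        uint16_t devid = devid_sketch(0x02, (0x1f << 3) | 0x7); /* 02:1f.7 */

        printf("devid=0x%04x\n", devid); /* prints devid=0x02ff */
        return 0;
}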
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
89 u16 alias = amd_iommu_alias_table[devid];
90
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* No device or no PCI device */
121 if (!dev || dev->bus != &pci_bus_type)
122 return false;
123
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
169
170void __init amd_iommu_uninit_devices(void)
171{
172 struct pci_dev *pdev = NULL;
173
174 for_each_pci_dev(pdev) {
175
176 if (!check_device(&pdev->dev))
177 continue;
178
179 iommu_uninit_device(&pdev->dev);
180 }
181}
182
183int __init amd_iommu_init_devices(void)
184{
185 struct pci_dev *pdev = NULL;
186 int ret = 0;
187
188 for_each_pci_dev(pdev) {
189
190 if (!check_device(&pdev->dev))
191 continue;
192
193 ret = iommu_init_device(&pdev->dev);
194 if (ret)
195 goto out_free;
196 }
197
198 return 0;
199
200out_free:
201
202 amd_iommu_uninit_devices();
203
204 return ret;
205}
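[Annotation] amd_iommu_init_devices() follows the usual all-or-nothing init pattern: on the first failure it jumps to out_free and unwinds everything. It can simply re-walk all devices because freeing a never-initialized per-device pointer is a no-op (kfree(NULL) is safe). The skeleton of that pattern, as a standalone sketch with invented names:

#include <stdio.h>

#define NR_ITEMS 4

static int  init_item(int i)   { return (i == 2) ? -1 : 0; } /* #2 fails */
static void uninit_item(int i) { printf("uninit %d\n", i); }

static void uninit_all(void)
{
        int i;

        for (i = 0; i < NR_ITEMS; ++i)
                uninit_item(i); /* must be safe on uninitialized items */
}

static int init_all(void)
{
        int i, ret = 0;

        for (i = 0; i < NR_ITEMS; ++i) {
                ret = init_item(i);
                if (ret)
                        goto out_free;
        }
        return 0;

out_free:
        uninit_all();
        return ret;
}

int main(void)
{
        return init_all() ? 1 : 0;
}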
73#ifdef CONFIG_AMD_IOMMU_STATS 206#ifdef CONFIG_AMD_IOMMU_STATS
74 207
75/* 208/*
@@ -90,7 +223,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 223DECLARE_STATS_COUNTER(total_map_requests);
91 224
92static struct dentry *stats_dir; 225static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 226static struct dentry *de_fflush;
95 227
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 228static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +240,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 240 if (stats_dir == NULL)
109 return; 241 return;
110 242
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 243 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 244 (u32 *)&amd_iommu_unmap_flush);
116 245
@@ -130,12 +259,6 @@ static void amd_iommu_stats_init(void)
130 259
131#endif 260#endif
132 261
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 262/****************************************************************************
140 * 263 *
141 * Interrupt handling functions 264 * Interrupt handling functions
@@ -199,6 +322,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 322 break;
200 case EVENT_TYPE_ILL_CMD: 323 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 326 reset_iommu_command_buffer(iommu);
203 dump_command(address); 327 dump_command(address);
204 break; 328 break;
@@ -321,11 +445,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 445 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 446 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 447
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 448 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 449 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 450}
330 451
331/* 452/*
@@ -372,26 +493,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 493out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 494 spin_unlock_irqrestore(&iommu->lock, flags);
374 495
496 if (iommu->reset_in_progress)
497 reset_iommu_command_buffer(iommu);
498
375 return 0; 499 return 0;
376} 500}
377 501
502static void iommu_flush_complete(struct protection_domain *domain)
503{
504 int i;
505
506 for (i = 0; i < amd_iommus_present; ++i) {
507 if (!domain->dev_iommu[i])
508 continue;
509
510 /*
511 * Devices of this domain are behind this IOMMU
512 * We need to wait for completion of all commands.
513 */
514 iommu_completion_wait(amd_iommus[i]);
515 }
516}
517
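[Annotation] iommu_flush_complete() is where the new domain->dev_iommu[] counters pay off: a completion wait is only queued to IOMMUs that actually have devices of this domain behind them, instead of to every IOMMU in the system. The filtering, modelled standalone (array size and names invented):

#include <stdio.h>

#define NR_IOMMUS 4

struct domain_sketch {
        int dev_iommu[NR_IOMMUS]; /* device count per IOMMU */
};

static void flush_complete(const struct domain_sketch *d)
{
        int i;

        for (i = 0; i < NR_IOMMUS; ++i) {
                if (!d->dev_iommu[i])
                        continue; /* no device of this domain here */
                printf("completion wait on IOMMU %d\n", i);
        }
}

int main(void)
{
        struct domain_sketch d = { .dev_iommu = { 0, 2, 0, 1 } };

        flush_complete(&d); /* only IOMMUs 1 and 3 are waited on */
        return 0;
}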
378/* 518/*
379 * Command send function for invalidating a device table entry 519 * Command send function for invalidating a device table entry
380 */ 520 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 521static int iommu_flush_device(struct device *dev)
382{ 522{
523 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 524 struct iommu_cmd cmd;
384 int ret; 525 u16 devid;
385 526
386 BUG_ON(iommu == NULL); 527 devid = get_device_id(dev);
528 iommu = amd_iommu_rlookup_table[devid];
387 529
530 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 531 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 532 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 533 cmd.data[0] = devid;
391 534
392 ret = iommu_queue_command(iommu, &cmd); 535 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 536}
396 537
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 538static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +571,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 571 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 572 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 573 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 574static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 575 u64 address, size_t size, int pde)
435{ 576{
436 int s = 0; 577 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 578 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 579
439 address &= PAGE_MASK; 580 address &= PAGE_MASK;
440 581
@@ -447,142 +588,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 588 s = 1;
448 } 589 }
449 590
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 591
452 return 0; 592 for (i = 0; i < amd_iommus_present; ++i) {
593 if (!domain->dev_iommu[i])
594 continue;
595
596 /*
597 * Devices of this domain are behind this IOMMU
598 * We need a TLB flush
599 */
600 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
601 domain->id, pde, s);
602 }
603
604 return;
453} 605}
454 606
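[Annotation] __iommu_flush_pages() keeps the earlier heuristic: a flush that fits in one page invalidates a single PTE, while anything larger sets the size bit and the "all pages" address so the domain's whole TLB is dropped. An isolated sketch of that decision (the magic constant is a stand-in for CMD_INV_IOMMU_ALL_PAGES_ADDRESS):

#include <stdio.h>

#define PAGE_SHIFT        12
#define PAGE_SIZE         (1ULL << PAGE_SHIFT)
#define PAGE_MASK         (~(PAGE_SIZE - 1))
#define ALL_PAGES_ADDRESS 0x7fffffffffffffffULL /* stand-in constant */

static void build_inv_pages(unsigned long long address, unsigned long size)
{
        unsigned long long pages;
        int s = 0;

        /* number of pages the [address, address+size) range touches */
        pages = ((address & ~PAGE_MASK) + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        address &= PAGE_MASK;

        if (pages > 1) {
                /* spans several pages: fall back to a whole-TLB flush */
                address = ALL_PAGES_ADDRESS;
                s = 1;
        }
        printf("INV_IOMMU_PAGES addr=0x%llx s=%d\n", address, s);
}

int main(void)
{
        build_inv_pages(0x1234, 0x100);  /* one page    -> s=0 */
        build_inv_pages(0x1000, 0x3000); /* three pages -> s=1 */
        return 0;
}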
455/* Flush the whole IO/TLB for a given protection domain */ 607static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 608 u64 address, size_t size)
457{ 609{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 610 __iommu_flush_pages(domain, address, size, 0);
459 611}
460 INC_STATS_COUNTER(domain_flush_single);
461 612
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 613/* Flush the whole IO/TLB for a given protection domain */
614static void iommu_flush_tlb(struct protection_domain *domain)
615{
616 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 617}
464 618
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 619/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 620static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 621{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 622 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 623}
474 624
625
475/* 626/*
476 * This function flushes one domain on one IOMMU 627 * This function flushes the DTEs for all devices in the domain
477 */ 628 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 629static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 630{
480 struct iommu_cmd cmd; 631 struct iommu_dev_data *dev_data;
481 unsigned long flags; 632 unsigned long flags;
482 633
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 634 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 635
486 spin_lock_irqsave(&iommu->lock, flags); 636 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 637 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 638
489 __iommu_wait_for_completion(iommu); 639 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 640}
492 641
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 642static void iommu_flush_all_domain_devices(void)
494{ 643{
495 int i; 644 struct protection_domain *domain;
645 unsigned long flags;
496 646
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 647 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 648
499 continue; 649 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 650 iommu_flush_domain_devices(domain);
651 iommu_flush_complete(domain);
501 } 652 }
502 653
654 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
655}
656
657void amd_iommu_flush_all_devices(void)
658{
659 iommu_flush_all_domain_devices();
503} 660}
504 661
505/* 662/*
506 * This function is used to flush the IO/TLB for a given protection domain 663 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 664 * this is no issue because it is only called during resume.
508 */ 665 */
509static void iommu_flush_domain(u16 domid) 666void amd_iommu_flush_all_domains(void)
510{ 667{
511 struct amd_iommu *iommu; 668 struct protection_domain *domain;
669 unsigned long flags;
512 670
513 INC_STATS_COUNTER(domain_flush_all); 671 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 672
515 for_each_iommu(iommu) 673 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 674 spin_lock(&domain->lock);
675 iommu_flush_tlb_pde(domain);
676 iommu_flush_complete(domain);
677 spin_unlock(&domain->lock);
678 }
679
680 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 681}
518 682
519void amd_iommu_flush_all_domains(void) 683static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 684{
521 struct amd_iommu *iommu; 685 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 686
523 for_each_iommu(iommu) 687 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 688 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
689
690 amd_iommu_reset_cmd_buffer(iommu);
691 amd_iommu_flush_all_devices();
692 amd_iommu_flush_all_domains();
693
694 iommu->reset_in_progress = false;
525} 695}
526 696
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 697/****************************************************************************
698 *
699 * The functions below are used to create the page table mappings for
700 * unity mapped regions.
701 *
702 ****************************************************************************/
703
704/*
705 * This function is used to add another level to an IO page table. Adding
706 * another level increases the size of the address space by 9 bits to a size up
707 * to 64 bits.
708 */
709static bool increase_address_space(struct protection_domain *domain,
710 gfp_t gfp)
528{ 711{
529 int i; 712 u64 *pte;
530 713
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 714 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 715 /* address space already 64 bit large */
533 continue; 716 return false;
534 717
535 iommu_queue_inv_dev_entry(iommu, i); 718 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 719 if (!pte)
537 } 720 return false;
721
722 *pte = PM_LEVEL_PDE(domain->mode,
723 virt_to_phys(domain->pt_root));
724 domain->pt_root = pte;
725 domain->mode += 1;
726 domain->updated = true;
727
728 return true;
538} 729}
539 730
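[Annotation] increase_address_space() can stop at PAGE_MODE_6_LEVEL because each page-table level translates 9 address bits on top of the 12-bit page offset, so a mode-m table covers 12 + 9*m bits and mode 6 already exceeds 64. The arithmetic:

#include <stdio.h>

/* Address bits covered by an m-level table: 12 offset bits + 9 per level */
static int covered_bits(int mode)
{
        return 12 + 9 * mode;
}

int main(void)
{
        int mode;

        for (mode = 1; mode <= 6; ++mode)
                printf("mode %d covers %2d bits\n", mode, covered_bits(mode));
        /* mode 6 -> 66 bits: the full 64-bit space, no further growth */
        return 0;
}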
540static void flush_devices_by_domain(struct protection_domain *domain) 731static u64 *alloc_pte(struct protection_domain *domain,
732 unsigned long address,
733 int end_lvl,
734 u64 **pte_page,
735 gfp_t gfp)
541{ 736{
542 struct amd_iommu *iommu; 737 u64 *pte, *page;
543 int i; 738 int level;
544 739
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 740 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 741 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 742
550 iommu = amd_iommu_rlookup_table[i]; 743 level = domain->mode - 1;
551 if (!iommu) 744 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue; 745
746 while (level > end_lvl) {
747 if (!IOMMU_PTE_PRESENT(*pte)) {
748 page = (u64 *)get_zeroed_page(gfp);
749 if (!page)
750 return NULL;
751 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
752 }
553 753
554 iommu_queue_inv_dev_entry(iommu, i); 754 level -= 1;
555 iommu_completion_wait(iommu); 755
756 pte = IOMMU_PTE_PAGE(*pte);
757
758 if (pte_page && level == end_lvl)
759 *pte_page = pte;
760
761 pte = &pte[PM_LEVEL_INDEX(level, address)];
556 } 762 }
763
764 return pte;
557} 765}
558 766
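[Annotation] alloc_pte() walks the table top-down, allocating missing levels as it goes, and indexes each level with PM_LEVEL_INDEX(). Assuming the 9-bits-per-level layout above, the index for level l is bits [12+9l .. 12+9l+8] of the address; a hedged model of that indexing:

#include <stdio.h>

/* Presumed PM_LEVEL_INDEX(): the 9-bit table index for a given level */
static unsigned long pm_level_index(int level, unsigned long address)
{
        return (address >> (12 + 9 * level)) & 0x1ffUL;
}

int main(void)
{
        unsigned long addr = 0x40201000UL;
        int level;

        /* walk a mode-3 table from the top level down to the PTE level */
        for (level = 2; level >= 0; --level)
                printf("level %d -> index %lu\n",
                       level, pm_level_index(level, addr));
        return 0;
}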
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 767/*
768 * This function checks if there is a PTE for a given dma address. If
769 * there is one, it returns the pointer to it.
770 */
771static u64 *fetch_pte(struct protection_domain *domain,
772 unsigned long address, int map_size)
560{ 773{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 774 int level;
775 u64 *pte;
562 776
563 if (iommu->reset_in_progress) 777 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 778 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 779
566 iommu->reset_in_progress = true; 780 while (level > map_size) {
781 if (!IOMMU_PTE_PRESENT(*pte))
782 return NULL;
567 783
568 amd_iommu_reset_cmd_buffer(iommu); 784 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 785
572 iommu->reset_in_progress = false; 786 pte = IOMMU_PTE_PAGE(*pte);
573} 787 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 788
575void amd_iommu_flush_all_devices(void) 789 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 790 pte = NULL;
577 flush_devices_by_domain(NULL); 791 break;
578} 792 }
793 }
579 794
580/**************************************************************************** 795 return pte;
581 * 796}
582 * The functions below are used to create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 797
587/* 798/*
588 * Generic mapping functions. It maps a physical address into a DMA 799 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +865,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 865}
655 866
656/* 867/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 868 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 869 * dma_ops domain.
681 */ 870 */
@@ -704,6 +893,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 893}
705 894
706/* 895/*
896 * Init the unity mappings for a specific IOMMU in the system
897 *
898 * Basically iterates over all unity mapping entries and applies them to
899 * the default domain DMA of that IOMMU if necessary.
900 */
901static int iommu_init_unity_mappings(struct amd_iommu *iommu)
902{
903 struct unity_map_entry *entry;
904 int ret;
905
906 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
907 if (!iommu_for_unity_map(iommu, entry))
908 continue;
909 ret = dma_ops_unity_map(iommu->default_dom, entry);
910 if (ret)
911 return ret;
912 }
913
914 return 0;
915}
916
917/*
707 * Inits the unity mappings required for a specific device 918 * Inits the unity mappings required for a specific device
708 */ 919 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 920static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +951,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 951 */
741 952
742/* 953/*
743 * This function checks if there is a PTE for a given dma address. If 954 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 955 * ranges).
745 */ 956 */
746static u64 *fetch_pte(struct protection_domain *domain, 957static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 958 unsigned long start_page,
959 unsigned int pages)
748{ 960{
749 int level; 961 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 962
761 pte = IOMMU_PTE_PAGE(*pte); 963 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 964 pages = last_page - start_page;
763 965
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 966 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 967 int index = i / APERTURE_RANGE_PAGES;
766 break; 968 int page = i % APERTURE_RANGE_PAGES;
767 } 969 __set_bit(page, dom->aperture[index]->bitmap);
768 } 970 }
769
770 return pte;
771} 971}
772 972
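[Annotation] The relocated dma_ops_reserve_addresses() shows the aperture bitmap layout: the aperture is split into ranges of APERTURE_RANGE_PAGES pages each, so an absolute page number decomposes into a range index and a bit offset with one division and one modulo. In isolation (the range size is a stand-in value):

#include <stdio.h>

#define RANGE_PAGES 32768 /* stand-in for APERTURE_RANGE_PAGES (128M/4K) */

int main(void)
{
        unsigned int page_nr = 70000; /* absolute page inside the aperture */
        int index = page_nr / RANGE_PAGES; /* which aperture[] range      */
        int bit   = page_nr % RANGE_PAGES; /* which bit inside its bitmap */

        printf("aperture[%d], bit %d\n", index, bit); /* aperture[2], 4464 */
        return 0;
}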
773/* 973/*
@@ -775,12 +975,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 975 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 976 * failure.
777 */ 977 */
778static int alloc_new_range(struct amd_iommu *iommu, 978static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 979 bool populate, gfp_t gfp)
781{ 980{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 981 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
783 int i; 982 struct amd_iommu *iommu;
983 unsigned long i;
784 984
785#ifdef CONFIG_IOMMU_STRESS 985#ifdef CONFIG_IOMMU_STRESS
786 populate = false; 986 populate = false;
@@ -819,14 +1019,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1019 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 1020
821 /* Initialize the exclusion range if necessary */ 1021 /* Initialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 1022 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 1023 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 1024 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 1025 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 1026 unsigned long startpage;
827 iommu->exclusion_length, 1027 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 1028 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 1029 PAGE_SIZE);
1030 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1031 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1032 }
830 } 1033 }
831 1034
832 /* 1035 /*
@@ -928,7 +1131,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1131 }
929 1132
930 if (unlikely(address == -1)) 1133 if (unlikely(address == -1))
931 address = bad_dma_address; 1134 address = DMA_ERROR_CODE;
932 1135
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1136 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1137
@@ -959,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
959 1162
960 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; 1163 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
961 1164
962 iommu_area_free(range->bitmap, address, pages); 1165 bitmap_clear(range->bitmap, address, pages);
963 1166
964} 1167}
965 1168
@@ -973,6 +1176,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1176 *
974 ****************************************************************************/ 1177 ****************************************************************************/
975 1178
1179/*
1180 * This function adds a protection domain to the global protection domain list
1181 */
1182static void add_domain_to_list(struct protection_domain *domain)
1183{
1184 unsigned long flags;
1185
1186 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1187 list_add(&domain->list, &amd_iommu_pd_list);
1188 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1189}
1190
1191/*
1192 * This function removes a protection domain from the global
1193 * protection domain list
1194 */
1195static void del_domain_from_list(struct protection_domain *domain)
1196{
1197 unsigned long flags;
1198
1199 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1200 list_del(&domain->list);
1201 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1202}
1203
976static u16 domain_id_alloc(void) 1204static u16 domain_id_alloc(void)
977{ 1205{
978 unsigned long flags; 1206 unsigned long flags;
@@ -1000,26 +1228,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1228 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1229}
1002 1230
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges).
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1231static void free_pagetable(struct protection_domain *domain)
1024{ 1232{
1025 int i, j; 1233 int i, j;
@@ -1061,6 +1269,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1269 if (!dom)
1062 return; 1270 return;
1063 1271
1272 del_domain_from_list(&dom->domain);
1273
1064 free_pagetable(&dom->domain); 1274 free_pagetable(&dom->domain);
1065 1275
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1276 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1288,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also initializes the page table and the address allocator data 1288 * It also initializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1289 * structures required for the dma_ops interface
1080 */ 1290 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1291static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1292{
1083 struct dma_ops_domain *dma_dom; 1293 struct dma_ops_domain *dma_dom;
1084 1294
@@ -1091,6 +1301,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1301 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1302 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1303 goto free_dma_dom;
1304 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1305 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1306 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1307 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1312,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1312 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1313 dma_dom->target_dev = 0xffff;
1103 1314
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1315 add_domain_to_list(&dma_dom->domain);
1316
1317 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1318 goto free_dma_dom;
1106 1319
1107 /* 1320 /*
@@ -1129,22 +1342,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1342 return domain->flags & PD_DMA_OPS_MASK;
1130} 1343}
1131 1344
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1345static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1346{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1347 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1353,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1353 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1354 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1355 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1356}
1357
1358static void clear_dte_entry(u16 devid)
1359{
1360 /* remove entry from the device table seen by the hardware */
1361 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1362 amd_iommu_dev_table[devid].data[1] = 0;
1363 amd_iommu_dev_table[devid].data[2] = 0;
1364
1365 amd_iommu_apply_erratum_63(devid);
1366}
1367
1368static void do_attach(struct device *dev, struct protection_domain *domain)
1369{
1370 struct iommu_dev_data *dev_data;
1371 struct amd_iommu *iommu;
1372 u16 devid;
1373
1374 devid = get_device_id(dev);
1375 iommu = amd_iommu_rlookup_table[devid];
1376 dev_data = get_dev_data(dev);
1377
1378 /* Update data structures */
1379 dev_data->domain = domain;
1380 list_add(&dev_data->list, &domain->dev_list);
1381 set_dte_entry(devid, domain);
1382
1383 /* Do reference counting */
1384 domain->dev_iommu[iommu->index] += 1;
1385 domain->dev_cnt += 1;
1159 1386
1160 amd_iommu_pd_table[devid] = domain; 1387 /* Flush the DTE entry */
1388 iommu_flush_device(dev);
1389}
1390
1391static void do_detach(struct device *dev)
1392{
1393 struct iommu_dev_data *dev_data;
1394 struct amd_iommu *iommu;
1395 u16 devid;
1396
1397 devid = get_device_id(dev);
1398 iommu = amd_iommu_rlookup_table[devid];
1399 dev_data = get_dev_data(dev);
1400
1401 /* decrease reference counters */
1402 dev_data->domain->dev_iommu[iommu->index] -= 1;
1403 dev_data->domain->dev_cnt -= 1;
1404
1405 /* Update data structures */
1406 dev_data->domain = NULL;
1407 list_del(&dev_data->list);
1408 clear_dte_entry(devid);
1409
1410 /* Flush the DTE entry */
1411 iommu_flush_device(dev);
1161} 1412}
1162 1413
1163/* 1414/*
1164 * If a device is not yet associated with a domain, this function 1415 * If a device is not yet associated with a domain, this function
1165 * assigns it to the given domain and makes it visible to the hardware 1416 * assigns it to the given domain and makes it visible to the hardware
1166 */ 1417 */
1167static void __attach_device(struct amd_iommu *iommu, 1418static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1419 struct protection_domain *domain)
1169 u16 devid)
1170{ 1420{
1421 struct iommu_dev_data *dev_data, *alias_data;
1422
1423 dev_data = get_dev_data(dev);
1424 alias_data = get_dev_data(dev_data->alias);
1425
1426 if (!alias_data)
1427 return -EINVAL;
1428
1171 /* lock domain */ 1429 /* lock domain */
1172 spin_lock(&domain->lock); 1430 spin_lock(&domain->lock);
1173 1431
1174 /* update DTE entry */ 1432 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1433 if (alias_data->domain != NULL &&
1434 alias_data->domain != domain)
1435 return -EBUSY;
1176 1436
1177 domain->dev_cnt += 1; 1437 if (dev_data->domain != NULL &&
1438 dev_data->domain != domain)
1439 return -EBUSY;
1440
1441 /* Do real assignment */
1442 if (dev_data->alias != dev) {
1443 alias_data = get_dev_data(dev_data->alias);
1444 if (alias_data->domain == NULL)
1445 do_attach(dev_data->alias, domain);
1446
1447 atomic_inc(&alias_data->bind);
1448 }
1449
1450 if (dev_data->domain == NULL)
1451 do_attach(dev, domain);
1452
1453 atomic_inc(&dev_data->bind);
1178 1454
1179 /* ready */ 1455 /* ready */
1180 spin_unlock(&domain->lock); 1456 spin_unlock(&domain->lock);
1457
1458 return 0;
1181} 1459}
1182 1460
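[Annotation] __attach_device() binds both a device and its PCI alias to the domain, and counts users with the new atomic bind field; __detach_device() later uses atomic_dec_and_test() so the DTE is only torn down when the last user lets go. A simplified, single-threaded model of that pairing (names invented; the real code uses atomic_t):

#include <stdio.h>

struct entry_sketch {
        int bind; /* users of this device-table entry */
};

static void attach(struct entry_sketch *e)
{
        if (e->bind++ == 0)
                printf("do_attach: program DTE\n"); /* first user */
}

static void detach(struct entry_sketch *e)
{
        if (--e->bind == 0)
                printf("do_detach: clear DTE\n"); /* last user gone */
}

int main(void)
{
        struct entry_sketch shared = { 0 };

        attach(&shared); /* the device itself */
        attach(&shared); /* its alias shares the entry */
        detach(&shared); /* one user left: entry stays valid */
        detach(&shared); /* now the entry is really cleared */
        return 0;
}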
1183/* 1461/*
1184 * If a device is not yet associated with a domain, this function 1462 * If a device is not yet associated with a domain, this function
1185 * assigns it to the given domain and makes it visible to the hardware 1463 * assigns it to the given domain and makes it visible to the hardware
1186 */ 1464 */
1187static void attach_device(struct amd_iommu *iommu, 1465static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1466 struct protection_domain *domain)
1189 u16 devid)
1190{ 1467{
1191 unsigned long flags; 1468 unsigned long flags;
1469 int ret;
1192 1470
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1471 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1472 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1473 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1474
1197 /* 1475 /*
@@ -1199,96 +1477,130 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1477 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1478 * here to evict all dirty stuff.
1201 */ 1479 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1480 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1481
1482 return ret;
1204} 1483}
1205 1484
1206/* 1485/*
1207 * Removes a device from a protection domain (unlocked) 1486 * Removes a device from a protection domain (unlocked)
1208 */ 1487 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1488static void __detach_device(struct device *dev)
1210{ 1489{
1490 struct iommu_dev_data *dev_data = get_dev_data(dev);
1491 struct iommu_dev_data *alias_data;
1492 struct protection_domain *domain;
1493 unsigned long flags;
1211 1494
1212 /* lock domain */ 1495 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214 1496
1215 /* remove domain from the lookup table */ 1497 domain = dev_data->domain;
1216 amd_iommu_pd_table[devid] = NULL;
1217 1498
1218 /* remove entry from the device table seen by the hardware */ 1499 spin_lock_irqsave(&domain->lock, flags);
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1220 amd_iommu_dev_table[devid].data[1] = 0;
1221 amd_iommu_dev_table[devid].data[2] = 0;
1222 1500
1223 /* decrease reference counter */ 1501 if (dev_data->alias != dev) {
1224 domain->dev_cnt -= 1; 1502 alias_data = get_dev_data(dev_data->alias);
1503 if (atomic_dec_and_test(&alias_data->bind))
1504 do_detach(dev_data->alias);
1505 }
1225 1506
1226 /* ready */ 1507 if (atomic_dec_and_test(&dev_data->bind))
1227 spin_unlock(&domain->lock); 1508 do_detach(dev);
1509
1510 spin_unlock_irqrestore(&domain->lock, flags);
1228 1511
1229 /* 1512 /*
1230 * If we run in passthrough mode the device must be assigned to the 1513 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain 1514 * passthrough domain if it is detached from any other domain.
1515 * Make sure we can deassign from the pt_domain itself.
1232 */ 1516 */
1233 if (iommu_pass_through) { 1517 if (iommu_pass_through &&
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1518 (dev_data->domain == NULL && domain != pt_domain))
1235 __attach_device(iommu, pt_domain, devid); 1519 __attach_device(dev, pt_domain);
1236 }
1237} 1520}
1238 1521
1239/* 1522/*
1240 * Removes a device from a protection domain (with devtable_lock held) 1523 * Removes a device from a protection domain (with devtable_lock held)
1241 */ 1524 */
1242static void detach_device(struct protection_domain *domain, u16 devid) 1525static void detach_device(struct device *dev)
1243{ 1526{
1244 unsigned long flags; 1527 unsigned long flags;
1245 1528
1246 /* lock device table */ 1529 /* lock device table */
1247 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1530 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1248 __detach_device(domain, devid); 1531 __detach_device(dev);
1249 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1532 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1250} 1533}
1251 1534
1535/*
1536 * Find out the protection domain structure for a given PCI device. This
1537 * will give us the pointer to the page table root for example.
1538 */
1539static struct protection_domain *domain_for_device(struct device *dev)
1540{
1541 struct protection_domain *dom;
1542 struct iommu_dev_data *dev_data, *alias_data;
1543 unsigned long flags;
1544 u16 devid, alias;
1545
1546 devid = get_device_id(dev);
1547 alias = amd_iommu_alias_table[devid];
1548 dev_data = get_dev_data(dev);
1549 alias_data = get_dev_data(dev_data->alias);
1550 if (!alias_data)
1551 return NULL;
1552
1553 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1554 dom = dev_data->domain;
1555 if (dom == NULL &&
1556 alias_data->domain != NULL) {
1557 __attach_device(dev, alias_data->domain);
1558 dom = alias_data->domain;
1559 }
1560
1561 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1562
1563 return dom;
1564}
1565
1252static int device_change_notifier(struct notifier_block *nb, 1566static int device_change_notifier(struct notifier_block *nb,
1253 unsigned long action, void *data) 1567 unsigned long action, void *data)
1254{ 1568{
1255 struct device *dev = data; 1569 struct device *dev = data;
1256 struct pci_dev *pdev = to_pci_dev(dev); 1570 u16 devid;
1257 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1258 struct protection_domain *domain; 1571 struct protection_domain *domain;
1259 struct dma_ops_domain *dma_domain; 1572 struct dma_ops_domain *dma_domain;
1260 struct amd_iommu *iommu; 1573 struct amd_iommu *iommu;
1261 unsigned long flags; 1574 unsigned long flags;
1262 1575
1263 if (devid > amd_iommu_last_bdf) 1576 if (!check_device(dev))
1264 goto out; 1577 return 0;
1265
1266 devid = amd_iommu_alias_table[devid];
1267
1268 iommu = amd_iommu_rlookup_table[devid];
1269 if (iommu == NULL)
1270 goto out;
1271
1272 domain = domain_for_device(devid);
1273 1578
1274 if (domain && !dma_ops_domain(domain)) 1579 devid = get_device_id(dev);
1275 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1580 iommu = amd_iommu_rlookup_table[devid];
1276 "to a non-dma-ops domain\n", dev_name(dev));
1277 1581
1278 switch (action) { 1582 switch (action) {
1279 case BUS_NOTIFY_UNBOUND_DRIVER: 1583 case BUS_NOTIFY_UNBOUND_DRIVER:
1584
1585 domain = domain_for_device(dev);
1586
1280 if (!domain) 1587 if (!domain)
1281 goto out; 1588 goto out;
1282 if (iommu_pass_through) 1589 if (iommu_pass_through)
1283 break; 1590 break;
1284 detach_device(domain, devid); 1591 detach_device(dev);
1285 break; 1592 break;
1286 case BUS_NOTIFY_ADD_DEVICE: 1593 case BUS_NOTIFY_ADD_DEVICE:
1594
1595 iommu_init_device(dev);
1596
1597 domain = domain_for_device(dev);
1598
1287 /* allocate a protection domain if a device is added */ 1599 /* allocate a protection domain if a device is added */
1288 dma_domain = find_protection_domain(devid); 1600 dma_domain = find_protection_domain(devid);
1289 if (dma_domain) 1601 if (dma_domain)
1290 goto out; 1602 goto out;
1291 dma_domain = dma_ops_domain_alloc(iommu); 1603 dma_domain = dma_ops_domain_alloc();
1292 if (!dma_domain) 1604 if (!dma_domain)
1293 goto out; 1605 goto out;
1294 dma_domain->target_dev = devid; 1606 dma_domain->target_dev = devid;
@@ -1298,11 +1610,15 @@ static int device_change_notifier(struct notifier_block *nb,
1298 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1610 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1299 1611
1300 break; 1612 break;
1613 case BUS_NOTIFY_DEL_DEVICE:
1614
1615 iommu_uninit_device(dev);
1616
1301 default: 1617 default:
1302 goto out; 1618 goto out;
1303 } 1619 }
1304 1620
1305 iommu_queue_inv_dev_entry(iommu, devid); 1621 iommu_flush_device(dev);
1306 iommu_completion_wait(iommu); 1622 iommu_completion_wait(iommu);
1307 1623
1308out: 1624out:
@@ -1313,6 +1629,11 @@ static struct notifier_block device_nb = {
1313 .notifier_call = device_change_notifier, 1629 .notifier_call = device_change_notifier,
1314}; 1630};
1315 1631
1632void amd_iommu_init_notifier(void)
1633{
1634 bus_register_notifier(&pci_bus_type, &device_nb);
1635}
1636
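[Annotation] amd_iommu_init_notifier() wires the driver into PCI device add/remove events through the standard notifier mechanism. A skeletal notifier in the same shape, kernel-style and illustrative only (the callback body is not the driver's):

#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pci.h>

static int sketch_notifier(struct notifier_block *nb,
                           unsigned long action, void *data)
{
        struct device *dev = data;

        switch (action) {
        case BUS_NOTIFY_ADD_DEVICE:
                dev_info(dev, "added to the PCI bus\n");
                break;
        case BUS_NOTIFY_DEL_DEVICE:
                dev_info(dev, "removed from the PCI bus\n");
                break;
        }

        return NOTIFY_OK;
}

static struct notifier_block sketch_nb = {
        .notifier_call = sketch_notifier,
};

void sketch_init_notifier(void)
{
        bus_register_notifier(&pci_bus_type, &sketch_nb);
}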
1316/***************************************************************************** 1637/*****************************************************************************
1317 * 1638 *
1318 * The next functions belong to the dma_ops mapping/unmapping code. 1639 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1320,106 +1641,46 @@ static struct notifier_block device_nb = {
1320 *****************************************************************************/ 1641 *****************************************************************************/
1321 1642
1322/* 1643/*
1323 * This function checks if the driver got a valid device from the caller to
1324 * avoid dereferencing invalid pointers.
1325 */
1326static bool check_device(struct device *dev)
1327{
1328 if (!dev || !dev->dma_mask)
1329 return false;
1330
1331 return true;
1332}
1333
1334/*
1335 * In this function the list of preallocated protection domains is traversed to
1336 * find the domain for a specific device
1337 */
1338static struct dma_ops_domain *find_protection_domain(u16 devid)
1339{
1340 struct dma_ops_domain *entry, *ret = NULL;
1341 unsigned long flags;
1342
1343 if (list_empty(&iommu_pd_list))
1344 return NULL;
1345
1346 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1347
1348 list_for_each_entry(entry, &iommu_pd_list, list) {
1349 if (entry->target_dev == devid) {
1350 ret = entry;
1351 break;
1352 }
1353 }
1354
1355 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1356
1357 return ret;
1358}
1359
1360/*
1361 * In the dma_ops path we only have the struct device. This function 1644 * In the dma_ops path we only have the struct device. This function
1362 * finds the corresponding IOMMU, the protection domain and the 1645 * finds the corresponding IOMMU, the protection domain and the
1363 * requestor id for a given device. 1646 * requestor id for a given device.
1364 * If the device is not yet associated with a domain this is also done 1647 * If the device is not yet associated with a domain this is also done
1365 * in this function. 1648 * in this function.
1366 */ 1649 */
1367static int get_device_resources(struct device *dev, 1650static struct protection_domain *get_domain(struct device *dev)
1368 struct amd_iommu **iommu,
1369 struct protection_domain **domain,
1370 u16 *bdf)
1371{ 1651{
1652 struct protection_domain *domain;
1372 struct dma_ops_domain *dma_dom; 1653 struct dma_ops_domain *dma_dom;
1373 struct pci_dev *pcidev; 1654 u16 devid = get_device_id(dev);
1374 u16 _bdf;
1375
1376 *iommu = NULL;
1377 *domain = NULL;
1378 *bdf = 0xffff;
1379 1655
1380 if (dev->bus != &pci_bus_type) 1656 if (!check_device(dev))
1381 return 0; 1657 return ERR_PTR(-EINVAL);
1382
1383 pcidev = to_pci_dev(dev);
1384 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1385 1658
1386 /* device not translated by any IOMMU in the system? */ 1659 domain = domain_for_device(dev);
1387 if (_bdf > amd_iommu_last_bdf) 1660 if (domain != NULL && !dma_ops_domain(domain))
1388 return 0; 1661 return ERR_PTR(-EBUSY);
1389 1662
1390 *bdf = amd_iommu_alias_table[_bdf]; 1663 if (domain != NULL)
1664 return domain;
1391 1665
1392 *iommu = amd_iommu_rlookup_table[*bdf]; 1666 /* Device not bound yet - bind it */
1393 if (*iommu == NULL) 1667 dma_dom = find_protection_domain(devid);
1394 return 0; 1668 if (!dma_dom)
1395 *domain = domain_for_device(*bdf); 1669 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1396 if (*domain == NULL) { 1670 attach_device(dev, &dma_dom->domain);
1397 dma_dom = find_protection_domain(*bdf); 1671 DUMP_printk("Using protection domain %d for device %s\n",
1398 if (!dma_dom) 1672 dma_dom->domain.id, dev_name(dev));
1399 dma_dom = (*iommu)->default_dom;
1400 *domain = &dma_dom->domain;
1401 attach_device(*iommu, *domain, *bdf);
1402 DUMP_printk("Using protection domain %d for device %s\n",
1403 (*domain)->id, dev_name(dev));
1404 }
1405
1406 if (domain_for_device(_bdf) == NULL)
1407 attach_device(*iommu, *domain, _bdf);
1408 1673
1409 return 1; 1674 return &dma_dom->domain;
1410} 1675}
1411 1676
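[Annotation] get_domain() collapses the old three-way get_device_resources() result into a single pointer using the kernel's ERR_PTR convention: -EINVAL means "not translated by any IOMMU" (callers fall back to a direct mapping), any other error aborts, and a plain pointer is success. The convention, re-implemented in user space purely for illustration:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* user-space model of ERR_PTR()/PTR_ERR()/IS_ERR() */
static void *err_ptr(long err)     { return (void *)err; }
static long ptr_err(const void *p) { return (long)p; }
static int is_err(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static int the_domain = 42;

static void *get_domain_sketch(int translated, int busy)
{
        if (!translated)
                return err_ptr(-EINVAL); /* not handled by any IOMMU */
        if (busy)
                return err_ptr(-EBUSY);  /* bound to a non-dma-ops domain */
        return &the_domain;
}

int main(void)
{
        void *d = get_domain_sketch(0, 0);

        if (ptr_err(d) == -EINVAL)
                printf("fall back to the physical address\n");
        else if (is_err(d))
                printf("error %ld\n", ptr_err(d));
        else
                printf("domain found\n");
        return 0;
}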
1412static void update_device_table(struct protection_domain *domain) 1677static void update_device_table(struct protection_domain *domain)
1413{ 1678{
1414 unsigned long flags; 1679 struct iommu_dev_data *dev_data;
1415 int i;
1416 1680
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1681 list_for_each_entry(dev_data, &domain->dev_list, list) {
1418 if (amd_iommu_pd_table[i] != domain) 1682 u16 devid = get_device_id(dev_data->dev);
1419 continue; 1683 set_dte_entry(devid, domain);
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 } 1684 }
1424} 1685}
1425 1686
@@ -1429,76 +1690,13 @@ static void update_domain(struct protection_domain *domain)
1429 return; 1690 return;
1430 1691
1431 update_device_table(domain); 1692 update_device_table(domain);
1432 flush_devices_by_domain(domain); 1693 iommu_flush_domain_devices(domain);
1433 iommu_flush_domain(domain->id); 1694 iommu_flush_tlb_pde(domain);
1434 1695
1435 domain->updated = false; 1696 domain->updated = false;
1436} 1697}
1437 1698
1438/* 1699/*
1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1442 */
1443static bool increase_address_space(struct protection_domain *domain,
1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
1449 /* address space already 64 bit large */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1470{
1471 u64 *pte, *page;
1472 int level;
1473
1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1476
1477 level = domain->mode - 1;
1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1479
1480 while (level > end_lvl) {
1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1487
1488 level -= 1;
1489
1490 pte = IOMMU_PTE_PAGE(*pte);
1491
1492 if (pte_page && level == end_lvl)
1493 *pte_page = pte;
1494
1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1497
1498 return pte;
1499}
1500
1501/*
1502 * This function fetches the PTE for a given address in the aperture 1700 * This function fetches the PTE for a given address in the aperture
1503 */ 1701 */
1504static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1702static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1528,8 +1726,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1528 * This is the generic map function. It maps one 4kb page at paddr to 1726 * This is the generic map function. It maps one 4kb page at paddr to
1529 * the given address in the DMA address space for the domain. 1727 * the given address in the DMA address space for the domain.
1530 */ 1728 */
1531static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1729static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1532 struct dma_ops_domain *dom,
1533 unsigned long address, 1730 unsigned long address,
1534 phys_addr_t paddr, 1731 phys_addr_t paddr,
1535 int direction) 1732 int direction)
@@ -1542,7 +1739,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1542 1739
1543 pte = dma_ops_get_pte(dom, address); 1740 pte = dma_ops_get_pte(dom, address);
1544 if (!pte) 1741 if (!pte)
1545 return bad_dma_address; 1742 return DMA_ERROR_CODE;
1546 1743
1547 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1744 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1548 1745
@@ -1563,8 +1760,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1563/* 1760/*
1564 * The generic unmapping function for one page in the DMA address space. 1761 * The generic unmapping function for one page in the DMA address space.
1565 */ 1762 */
1566static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1763static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1567 struct dma_ops_domain *dom,
1568 unsigned long address) 1764 unsigned long address)
1569{ 1765{
1570 struct aperture_range *aperture; 1766 struct aperture_range *aperture;
@@ -1595,7 +1791,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1595 * Must be called with the domain lock held. 1791 * Must be called with the domain lock held.
1596 */ 1792 */
1597static dma_addr_t __map_single(struct device *dev, 1793static dma_addr_t __map_single(struct device *dev,
1598 struct amd_iommu *iommu,
1599 struct dma_ops_domain *dma_dom, 1794 struct dma_ops_domain *dma_dom,
1600 phys_addr_t paddr, 1795 phys_addr_t paddr,
1601 size_t size, 1796 size_t size,
@@ -1623,7 +1818,7 @@ static dma_addr_t __map_single(struct device *dev,
1623retry: 1818retry:
1624 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1819 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1625 dma_mask); 1820 dma_mask);
1626 if (unlikely(address == bad_dma_address)) { 1821 if (unlikely(address == DMA_ERROR_CODE)) {
1627 /* 1822 /*
1628 * setting next_address here will let the address 1823 * setting next_address here will let the address
1629 * allocator only scan the new allocated range in the 1824 * allocator only scan the new allocated range in the
@@ -1631,11 +1826,11 @@ retry:
1631 */ 1826 */
1632 dma_dom->next_address = dma_dom->aperture_size; 1827 dma_dom->next_address = dma_dom->aperture_size;
1633 1828
1634 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1829 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1635 goto out; 1830 goto out;
1636 1831
1637 /* 1832 /*
1638 * aperture was sucessfully enlarged by 128 MB, try 1833 * aperture was successfully enlarged by 128 MB, try
1639 * allocation again 1834 * allocation again
1640 */ 1835 */
1641 goto retry; 1836 goto retry;
@@ -1643,8 +1838,8 @@ retry:
1643 1838
1644 start = address; 1839 start = address;
1645 for (i = 0; i < pages; ++i) { 1840 for (i = 0; i < pages; ++i) {
1646 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1841 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1647 if (ret == bad_dma_address) 1842 if (ret == DMA_ERROR_CODE)
1648 goto out_unmap; 1843 goto out_unmap;
1649 1844
1650 paddr += PAGE_SIZE; 1845 paddr += PAGE_SIZE;
@@ -1655,10 +1850,10 @@ retry:
1655 ADD_STATS_COUNTER(alloced_io_mem, size); 1850 ADD_STATS_COUNTER(alloced_io_mem, size);
1656 1851
1657 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1852 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1658 iommu_flush_tlb(iommu, dma_dom->domain.id); 1853 iommu_flush_tlb(&dma_dom->domain);
1659 dma_dom->need_flush = false; 1854 dma_dom->need_flush = false;
1660 } else if (unlikely(iommu_has_npcache(iommu))) 1855 } else if (unlikely(amd_iommu_np_cache))
1661 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1856 iommu_flush_pages(&dma_dom->domain, address, size);
1662 1857
1663out: 1858out:
1664 return address; 1859 return address;
@@ -1667,20 +1862,19 @@ out_unmap:
1667 1862
1668 for (--i; i >= 0; --i) { 1863 for (--i; i >= 0; --i) {
1669 start -= PAGE_SIZE; 1864 start -= PAGE_SIZE;
1670 dma_ops_domain_unmap(iommu, dma_dom, start); 1865 dma_ops_domain_unmap(dma_dom, start);
1671 } 1866 }
1672 1867
1673 dma_ops_free_addresses(dma_dom, address, pages); 1868 dma_ops_free_addresses(dma_dom, address, pages);
1674 1869
1675 return bad_dma_address; 1870 return DMA_ERROR_CODE;
1676} 1871}
1677 1872
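[Annotation] With bad_dma_address gone, __map_single() reports failure through the generic DMA_ERROR_CODE, which drivers are expected to test via dma_mapping_error(). The consumer side, sketched kernel-style (function and buffer names invented):

#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int sketch_map_buffer(struct device *dev, void *buf, size_t len)
{
        dma_addr_t handle;

        handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
        if (dma_mapping_error(dev, handle))
                return -ENOMEM; /* __map_single returned DMA_ERROR_CODE */

        /* ... hand 'handle' to the hardware, wait for completion ... */

        dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
        return 0;
}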
1678/* 1873/*
1679 * Does the reverse of the __map_single function. Must be called with 1874 * Does the reverse of the __map_single function. Must be called with
1680 * the domain lock held too 1875 * the domain lock held too
1681 */ 1876 */
1682static void __unmap_single(struct amd_iommu *iommu, 1877static void __unmap_single(struct dma_ops_domain *dma_dom,
1683 struct dma_ops_domain *dma_dom,
1684 dma_addr_t dma_addr, 1878 dma_addr_t dma_addr,
1685 size_t size, 1879 size_t size,
1686 int dir) 1880 int dir)
@@ -1688,7 +1882,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1688 dma_addr_t i, start; 1882 dma_addr_t i, start;
1689 unsigned int pages; 1883 unsigned int pages;
1690 1884
1691 if ((dma_addr == bad_dma_address) || 1885 if ((dma_addr == DMA_ERROR_CODE) ||
1692 (dma_addr + size > dma_dom->aperture_size)) 1886 (dma_addr + size > dma_dom->aperture_size))
1693 return; 1887 return;
1694 1888
@@ -1697,7 +1891,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1697 start = dma_addr; 1891 start = dma_addr;
1698 1892
1699 for (i = 0; i < pages; ++i) { 1893 for (i = 0; i < pages; ++i) {
1700 dma_ops_domain_unmap(iommu, dma_dom, start); 1894 dma_ops_domain_unmap(dma_dom, start);
1701 start += PAGE_SIZE; 1895 start += PAGE_SIZE;
1702 } 1896 }
1703 1897
@@ -1706,7 +1900,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1706 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1900 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1707 1901
1708 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1902 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1709 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1903 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1710 dma_dom->need_flush = false; 1904 dma_dom->need_flush = false;
1711 } 1905 }
1712} 1906}
@@ -1720,36 +1914,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1720 struct dma_attrs *attrs) 1914 struct dma_attrs *attrs)
1721{ 1915{
1722 unsigned long flags; 1916 unsigned long flags;
1723 struct amd_iommu *iommu;
1724 struct protection_domain *domain; 1917 struct protection_domain *domain;
1725 u16 devid;
1726 dma_addr_t addr; 1918 dma_addr_t addr;
1727 u64 dma_mask; 1919 u64 dma_mask;
1728 phys_addr_t paddr = page_to_phys(page) + offset; 1920 phys_addr_t paddr = page_to_phys(page) + offset;
1729 1921
1730 INC_STATS_COUNTER(cnt_map_single); 1922 INC_STATS_COUNTER(cnt_map_single);
1731 1923
1732 if (!check_device(dev)) 1924 domain = get_domain(dev);
1733 return bad_dma_address; 1925 if (PTR_ERR(domain) == -EINVAL)
1734
1735 dma_mask = *dev->dma_mask;
1736
1737 get_device_resources(dev, &iommu, &domain, &devid);
1738
1739 if (iommu == NULL || domain == NULL)
1740 /* device not handled by any AMD IOMMU */
1741 return (dma_addr_t)paddr; 1926 return (dma_addr_t)paddr;
1927 else if (IS_ERR(domain))
1928 return DMA_ERROR_CODE;
1742 1929
1743 if (!dma_ops_domain(domain)) 1930 dma_mask = *dev->dma_mask;
1744 return bad_dma_address;
1745 1931
1746 spin_lock_irqsave(&domain->lock, flags); 1932 spin_lock_irqsave(&domain->lock, flags);
1747 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1933
1934 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1748 dma_mask); 1935 dma_mask);
1749 if (addr == bad_dma_address) 1936 if (addr == DMA_ERROR_CODE)
1750 goto out; 1937 goto out;
1751 1938
1752 iommu_completion_wait(iommu); 1939 iommu_flush_complete(domain);
1753 1940
1754out: 1941out:
1755 spin_unlock_irqrestore(&domain->lock, flags); 1942 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1764,25 +1951,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1764 enum dma_data_direction dir, struct dma_attrs *attrs) 1951 enum dma_data_direction dir, struct dma_attrs *attrs)
1765{ 1952{
1766 unsigned long flags; 1953 unsigned long flags;
1767 struct amd_iommu *iommu;
1768 struct protection_domain *domain; 1954 struct protection_domain *domain;
1769 u16 devid;
1770 1955
1771 INC_STATS_COUNTER(cnt_unmap_single); 1956 INC_STATS_COUNTER(cnt_unmap_single);
1772 1957
1773 if (!check_device(dev) || 1958 domain = get_domain(dev);
1774 !get_device_resources(dev, &iommu, &domain, &devid)) 1959 if (IS_ERR(domain))
1775 /* device not handled by any AMD IOMMU */
1776 return;
1777
1778 if (!dma_ops_domain(domain))
1779 return; 1960 return;
1780 1961
1781 spin_lock_irqsave(&domain->lock, flags); 1962 spin_lock_irqsave(&domain->lock, flags);
1782 1963
1783 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1964 __unmap_single(domain->priv, dma_addr, size, dir);
1784 1965
1785 iommu_completion_wait(iommu); 1966 iommu_flush_complete(domain);
1786 1967
1787 spin_unlock_irqrestore(&domain->lock, flags); 1968 spin_unlock_irqrestore(&domain->lock, flags);
1788} 1969}
@@ -1814,9 +1995,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1814 struct dma_attrs *attrs) 1995 struct dma_attrs *attrs)
1815{ 1996{
1816 unsigned long flags; 1997 unsigned long flags;
1817 struct amd_iommu *iommu;
1818 struct protection_domain *domain; 1998 struct protection_domain *domain;
1819 u16 devid;
1820 int i; 1999 int i;
1821 struct scatterlist *s; 2000 struct scatterlist *s;
1822 phys_addr_t paddr; 2001 phys_addr_t paddr;
@@ -1825,25 +2004,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1825 2004
1826 INC_STATS_COUNTER(cnt_map_sg); 2005 INC_STATS_COUNTER(cnt_map_sg);
1827 2006
1828 if (!check_device(dev)) 2007 domain = get_domain(dev);
2008 if (PTR_ERR(domain) == -EINVAL)
2009 return map_sg_no_iommu(dev, sglist, nelems, dir);
2010 else if (IS_ERR(domain))
1829 return 0; 2011 return 0;
1830 2012
1831 dma_mask = *dev->dma_mask; 2013 dma_mask = *dev->dma_mask;
1832 2014
1833 get_device_resources(dev, &iommu, &domain, &devid);
1834
1835 if (!iommu || !domain)
1836 return map_sg_no_iommu(dev, sglist, nelems, dir);
1837
1838 if (!dma_ops_domain(domain))
1839 return 0;
1840
1841 spin_lock_irqsave(&domain->lock, flags); 2015 spin_lock_irqsave(&domain->lock, flags);
1842 2016
1843 for_each_sg(sglist, s, nelems, i) { 2017 for_each_sg(sglist, s, nelems, i) {
1844 paddr = sg_phys(s); 2018 paddr = sg_phys(s);
1845 2019
1846 s->dma_address = __map_single(dev, iommu, domain->priv, 2020 s->dma_address = __map_single(dev, domain->priv,
1847 paddr, s->length, dir, false, 2021 paddr, s->length, dir, false,
1848 dma_mask); 2022 dma_mask);
1849 2023
@@ -1854,7 +2028,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1854 goto unmap; 2028 goto unmap;
1855 } 2029 }
1856 2030
1857 iommu_completion_wait(iommu); 2031 iommu_flush_complete(domain);
1858 2032
1859out: 2033out:
1860 spin_unlock_irqrestore(&domain->lock, flags); 2034 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1863,7 +2037,7 @@ out:
1863unmap: 2037unmap:
1864 for_each_sg(sglist, s, mapped_elems, i) { 2038 for_each_sg(sglist, s, mapped_elems, i) {
1865 if (s->dma_address) 2039 if (s->dma_address)
1866 __unmap_single(iommu, domain->priv, s->dma_address, 2040 __unmap_single(domain->priv, s->dma_address,
1867 s->dma_length, dir); 2041 s->dma_length, dir);
1868 s->dma_address = s->dma_length = 0; 2042 s->dma_address = s->dma_length = 0;
1869 } 2043 }
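
For context on the unmap: rollback above, a hedged caller-side fragment (illustrative, not part of the patch): because map_sg() undoes any partial mappings itself and returns 0, a driver using the generic DMA API only needs to test for zero mapped entries.

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int example_map(struct device *dev, struct scatterlist *sgl, int nents)
{
        int mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);

        if (mapped == 0)        /* map_sg() already rolled back partial work */
                return -ENOMEM;

        /* ... hand the 'mapped' entries to the hardware ... */

        dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
        return 0;
}
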
@@ -1882,30 +2056,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1882 struct dma_attrs *attrs) 2056 struct dma_attrs *attrs)
1883{ 2057{
1884 unsigned long flags; 2058 unsigned long flags;
1885 struct amd_iommu *iommu;
1886 struct protection_domain *domain; 2059 struct protection_domain *domain;
1887 struct scatterlist *s; 2060 struct scatterlist *s;
1888 u16 devid;
1889 int i; 2061 int i;
1890 2062
1891 INC_STATS_COUNTER(cnt_unmap_sg); 2063 INC_STATS_COUNTER(cnt_unmap_sg);
1892 2064
1893 if (!check_device(dev) || 2065 domain = get_domain(dev);
1894 !get_device_resources(dev, &iommu, &domain, &devid)) 2066 if (IS_ERR(domain))
1895 return;
1896
1897 if (!dma_ops_domain(domain))
1898 return; 2067 return;
1899 2068
1900 spin_lock_irqsave(&domain->lock, flags); 2069 spin_lock_irqsave(&domain->lock, flags);
1901 2070
1902 for_each_sg(sglist, s, nelems, i) { 2071 for_each_sg(sglist, s, nelems, i) {
1903 __unmap_single(iommu, domain->priv, s->dma_address, 2072 __unmap_single(domain->priv, s->dma_address,
1904 s->dma_length, dir); 2073 s->dma_length, dir);
1905 s->dma_address = s->dma_length = 0; 2074 s->dma_address = s->dma_length = 0;
1906 } 2075 }
1907 2076
1908 iommu_completion_wait(iommu); 2077 iommu_flush_complete(domain);
1909 2078
1910 spin_unlock_irqrestore(&domain->lock, flags); 2079 spin_unlock_irqrestore(&domain->lock, flags);
1911} 2080}
@@ -1918,49 +2087,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1918{ 2087{
1919 unsigned long flags; 2088 unsigned long flags;
1920 void *virt_addr; 2089 void *virt_addr;
1921 struct amd_iommu *iommu;
1922 struct protection_domain *domain; 2090 struct protection_domain *domain;
1923 u16 devid;
1924 phys_addr_t paddr; 2091 phys_addr_t paddr;
1925 u64 dma_mask = dev->coherent_dma_mask; 2092 u64 dma_mask = dev->coherent_dma_mask;
1926 2093
1927 INC_STATS_COUNTER(cnt_alloc_coherent); 2094 INC_STATS_COUNTER(cnt_alloc_coherent);
1928 2095
1929 if (!check_device(dev)) 2096 domain = get_domain(dev);
2097 if (PTR_ERR(domain) == -EINVAL) {
2098 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2099 *dma_addr = __pa(virt_addr);
2100 return virt_addr;
2101 } else if (IS_ERR(domain))
1930 return NULL; 2102 return NULL;
1931 2103
1932 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2104 dma_mask = dev->coherent_dma_mask;
1933 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2105 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2106 flag |= __GFP_ZERO;
1934 2107
1935 flag |= __GFP_ZERO;
1936 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2108 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1937 if (!virt_addr) 2109 if (!virt_addr)
1938 return NULL; 2110 return NULL;
1939 2111
1940 paddr = virt_to_phys(virt_addr); 2112 paddr = virt_to_phys(virt_addr);
1941 2113
1942 if (!iommu || !domain) {
1943 *dma_addr = (dma_addr_t)paddr;
1944 return virt_addr;
1945 }
1946
1947 if (!dma_ops_domain(domain))
1948 goto out_free;
1949
1950 if (!dma_mask) 2114 if (!dma_mask)
1951 dma_mask = *dev->dma_mask; 2115 dma_mask = *dev->dma_mask;
1952 2116
1953 spin_lock_irqsave(&domain->lock, flags); 2117 spin_lock_irqsave(&domain->lock, flags);
1954 2118
1955 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2119 *dma_addr = __map_single(dev, domain->priv, paddr,
1956 size, DMA_BIDIRECTIONAL, true, dma_mask); 2120 size, DMA_BIDIRECTIONAL, true, dma_mask);
1957 2121
1958 if (*dma_addr == bad_dma_address) { 2122 if (*dma_addr == DMA_ERROR_CODE) {
1959 spin_unlock_irqrestore(&domain->lock, flags); 2123 spin_unlock_irqrestore(&domain->lock, flags);
1960 goto out_free; 2124 goto out_free;
1961 } 2125 }
1962 2126
1963 iommu_completion_wait(iommu); 2127 iommu_flush_complete(domain);
1964 2128
1965 spin_unlock_irqrestore(&domain->lock, flags); 2129 spin_unlock_irqrestore(&domain->lock, flags);
1966 2130
@@ -1980,28 +2144,19 @@ static void free_coherent(struct device *dev, size_t size,
1980 void *virt_addr, dma_addr_t dma_addr) 2144 void *virt_addr, dma_addr_t dma_addr)
1981{ 2145{
1982 unsigned long flags; 2146 unsigned long flags;
1983 struct amd_iommu *iommu;
1984 struct protection_domain *domain; 2147 struct protection_domain *domain;
1985 u16 devid;
1986 2148
1987 INC_STATS_COUNTER(cnt_free_coherent); 2149 INC_STATS_COUNTER(cnt_free_coherent);
1988 2150
1989 if (!check_device(dev)) 2151 domain = get_domain(dev);
1990 return; 2152 if (IS_ERR(domain))
1991
1992 get_device_resources(dev, &iommu, &domain, &devid);
1993
1994 if (!iommu || !domain)
1995 goto free_mem;
1996
1997 if (!dma_ops_domain(domain))
1998 goto free_mem; 2153 goto free_mem;
1999 2154
2000 spin_lock_irqsave(&domain->lock, flags); 2155 spin_lock_irqsave(&domain->lock, flags);
2001 2156
2002 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2157 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2003 2158
2004 iommu_completion_wait(iommu); 2159 iommu_flush_complete(domain);
2005 2160
2006 spin_unlock_irqrestore(&domain->lock, flags); 2161 spin_unlock_irqrestore(&domain->lock, flags);
2007 2162
@@ -2015,22 +2170,7 @@ free_mem:
2015 */ 2170 */
2016static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2171static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2017{ 2172{
2018 u16 bdf; 2173 return check_device(dev);
2019 struct pci_dev *pcidev;
2020
2021 /* No device or no PCI device */
2022 if (!dev || dev->bus != &pci_bus_type)
2023 return 0;
2024
2025 pcidev = to_pci_dev(dev);
2026
2027 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2028
2029 /* Out of our scope? */
2030 if (bdf > amd_iommu_last_bdf)
2031 return 0;
2032
2033 return 1;
2034} 2174}
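
dma_supported() collapsing to check_device() is reached through the generic DMA API; a hedged sketch of the call path from a driver's point of view (standard API, the probe function itself is hypothetical):

#include <linux/dma-mapping.h>

static int example_probe(struct device *dev)
{
        /* dma_set_mask() consults the registered dma_map_ops; with this
         * patch ->dma_supported simply returns check_device(dev). */
        if (dma_set_mask(dev, DMA_BIT_MASK(32)))
                return -EIO;

        return 0;
}
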
2035 2175
2036/* 2176/*
@@ -2044,25 +2184,28 @@ static void prealloc_protection_domains(void)
2044{ 2184{
2045 struct pci_dev *dev = NULL; 2185 struct pci_dev *dev = NULL;
2046 struct dma_ops_domain *dma_dom; 2186 struct dma_ops_domain *dma_dom;
2047 struct amd_iommu *iommu;
2048 u16 devid; 2187 u16 devid;
2049 2188
2050 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2189 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2051 devid = calc_devid(dev->bus->number, dev->devfn); 2190
2052 if (devid > amd_iommu_last_bdf) 2191 /* Do we handle this device? */
2053 continue; 2192 if (!check_device(&dev->dev))
2054 devid = amd_iommu_alias_table[devid];
2055 if (domain_for_device(devid))
2056 continue; 2193 continue;
2057 iommu = amd_iommu_rlookup_table[devid]; 2194
2058 if (!iommu) 2195 /* Is there already any domain for it? */
2196 if (domain_for_device(&dev->dev))
2059 continue; 2197 continue;
2060 dma_dom = dma_ops_domain_alloc(iommu); 2198
2199 devid = get_device_id(&dev->dev);
2200
2201 dma_dom = dma_ops_domain_alloc();
2061 if (!dma_dom) 2202 if (!dma_dom)
2062 continue; 2203 continue;
2063 init_unity_mappings_for_device(dma_dom, devid); 2204 init_unity_mappings_for_device(dma_dom, devid);
2064 dma_dom->target_dev = devid; 2205 dma_dom->target_dev = devid;
2065 2206
2207 attach_device(&dev->dev, &dma_dom->domain);
2208
2066 list_add_tail(&dma_dom->list, &iommu_pd_list); 2209 list_add_tail(&dma_dom->list, &iommu_pd_list);
2067 } 2210 }
2068} 2211}
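
An aside on the enumeration idiom used by prealloc_protection_domains(): pci_get_device() with PCI_ANY_ID walks every PCI device and handles the reference counting itself, dropping the reference on the previous device at each step. A self-contained sketch (the walker function is hypothetical):

#include <linux/pci.h>

static void walk_all_pci_devices(void)
{
        struct pci_dev *pdev = NULL;

        /* Each call puts 'pdev' and gets the next device, so the loop
         * body holds exactly one reference at any time. */
        while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL) {
                /* inspect or claim pdev here */
        }
}
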
@@ -2080,6 +2223,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2080/* 2223/*
2081 * The function which clues the AMD IOMMU driver into dma_ops. 2224 * The function which clues the AMD IOMMU driver into dma_ops.
2082 */ 2225 */
2226
2227void __init amd_iommu_init_api(void)
2228{
2229 register_iommu(&amd_iommu_ops);
2230}
2231
2083int __init amd_iommu_init_dma_ops(void) 2232int __init amd_iommu_init_dma_ops(void)
2084{ 2233{
2085 struct amd_iommu *iommu; 2234 struct amd_iommu *iommu;
@@ -2091,7 +2240,7 @@ int __init amd_iommu_init_dma_ops(void)
2091 * protection domain will be assigned to the default one. 2240 * protection domain will be assigned to the default one.
2092 */ 2241 */
2093 for_each_iommu(iommu) { 2242 for_each_iommu(iommu) {
2094 iommu->default_dom = dma_ops_domain_alloc(iommu); 2243 iommu->default_dom = dma_ops_domain_alloc();
2095 if (iommu->default_dom == NULL) 2244 if (iommu->default_dom == NULL)
2096 return -ENOMEM; 2245 return -ENOMEM;
2097 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2246 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2101,15 +2250,12 @@ int __init amd_iommu_init_dma_ops(void)
2101 } 2250 }
2102 2251
2103 /* 2252 /*
2104 * If device isolation is enabled, pre-allocate the protection 2253 * Pre-allocate the protection domains for each device.
2105 * domains for each device.
2106 */ 2254 */
2107 if (amd_iommu_isolate) 2255 prealloc_protection_domains();
2108 prealloc_protection_domains();
2109 2256
2110 iommu_detected = 1; 2257 iommu_detected = 1;
2111 force_iommu = 1; 2258 swiotlb = 0;
2112 bad_dma_address = 0;
2113#ifdef CONFIG_GART_IOMMU 2259#ifdef CONFIG_GART_IOMMU
2114 gart_iommu_aperture_disabled = 1; 2260 gart_iommu_aperture_disabled = 1;
2115 gart_iommu_aperture = 0; 2261 gart_iommu_aperture = 0;
@@ -2118,10 +2264,6 @@ int __init amd_iommu_init_dma_ops(void)
2118 /* Make the driver finally visible to the drivers */ 2264 /* Make the driver finally visible to the drivers */
2119 dma_ops = &amd_iommu_dma_ops; 2265 dma_ops = &amd_iommu_dma_ops;
2120 2266
2121 register_iommu(&amd_iommu_ops);
2122
2123 bus_register_notifier(&pci_bus_type, &device_nb);
2124
2125 amd_iommu_stats_init(); 2267 amd_iommu_stats_init();
2126 2268
2127 return 0; 2269 return 0;
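
Note that bad_dma_address is gone as an error sentinel in favor of the generic DMA_ERROR_CODE, and swiotlb = 0 turns off the bounce-buffer fallback once the IOMMU owns DMA. A hedged driver-side sketch of the portable way to detect mapping failures (the example function is hypothetical):

#include <linux/dma-mapping.h>

static int example_map_one(struct device *dev, struct page *page)
{
        dma_addr_t handle = dma_map_page(dev, page, 0, PAGE_SIZE,
                                         DMA_TO_DEVICE);

        /* dma_mapping_error() hides the ops-specific sentinel
         * (DMA_ERROR_CODE here) from the caller. */
        if (dma_mapping_error(dev, handle))
                return -ENOMEM;

        dma_unmap_page(dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
        return 0;
}
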
@@ -2148,14 +2290,17 @@ free_domains:
2148 2290
2149static void cleanup_domain(struct protection_domain *domain) 2291static void cleanup_domain(struct protection_domain *domain)
2150{ 2292{
2293 struct iommu_dev_data *dev_data, *next;
2151 unsigned long flags; 2294 unsigned long flags;
2152 u16 devid;
2153 2295
2154 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2296 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2155 2297
2156 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2298 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2157 if (amd_iommu_pd_table[devid] == domain) 2299 struct device *dev = dev_data->dev;
2158 __detach_device(domain, devid); 2300
2301 do_detach(dev);
2302 atomic_set(&dev_data->bind, 0);
2303 }
2159 2304
2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2305 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2161} 2306}
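
cleanup_domain() now walks the domain's own dev_list instead of scanning the whole device table; the _safe iterator is essential because do_detach() unlinks the entry being visited. A self-contained sketch of the pattern with illustrative types (not the driver's own):

#include <linux/list.h>
#include <linux/slab.h>

struct item {
        struct list_head list;
};

static void drain(struct list_head *head)
{
        struct item *it, *next;

        /* 'next' is saved before the body runs, so unlinking or freeing
         * 'it' cannot break the iteration. */
        list_for_each_entry_safe(it, next, head, list) {
                list_del(&it->list);
                kfree(it);
        }
}
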
@@ -2165,6 +2310,8 @@ static void protection_domain_free(struct protection_domain *domain)
2165 if (!domain) 2310 if (!domain)
2166 return; 2311 return;
2167 2312
2313 del_domain_from_list(domain);
2314
2168 if (domain->id) 2315 if (domain->id)
2169 domain_id_free(domain->id); 2316 domain_id_free(domain->id);
2170 2317
@@ -2183,6 +2330,9 @@ static struct protection_domain *protection_domain_alloc(void)
2183 domain->id = domain_id_alloc(); 2330 domain->id = domain_id_alloc();
2184 if (!domain->id) 2331 if (!domain->id)
2185 goto out_err; 2332 goto out_err;
2333 INIT_LIST_HEAD(&domain->dev_list);
2334
2335 add_domain_to_list(domain);
2186 2336
2187 return domain; 2337 return domain;
2188 2338
@@ -2239,26 +2389,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2239static void amd_iommu_detach_device(struct iommu_domain *dom, 2389static void amd_iommu_detach_device(struct iommu_domain *dom,
2240 struct device *dev) 2390 struct device *dev)
2241{ 2391{
2242 struct protection_domain *domain = dom->priv; 2392 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2243 struct amd_iommu *iommu; 2393 struct amd_iommu *iommu;
2244 struct pci_dev *pdev;
2245 u16 devid; 2394 u16 devid;
2246 2395
2247 if (dev->bus != &pci_bus_type) 2396 if (!check_device(dev))
2248 return; 2397 return;
2249 2398
2250 pdev = to_pci_dev(dev); 2399 devid = get_device_id(dev);
2251 2400
2252 devid = calc_devid(pdev->bus->number, pdev->devfn); 2401 if (dev_data->domain != NULL)
2253 2402 detach_device(dev);
2254 if (devid > 0)
2255 detach_device(domain, devid);
2256 2403
2257 iommu = amd_iommu_rlookup_table[devid]; 2404 iommu = amd_iommu_rlookup_table[devid];
2258 if (!iommu) 2405 if (!iommu)
2259 return; 2406 return;
2260 2407
2261 iommu_queue_inv_dev_entry(iommu, devid); 2408 iommu_flush_device(dev);
2262 iommu_completion_wait(iommu); 2409 iommu_completion_wait(iommu);
2263} 2410}
2264 2411
@@ -2266,35 +2413,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2266 struct device *dev) 2413 struct device *dev)
2267{ 2414{
2268 struct protection_domain *domain = dom->priv; 2415 struct protection_domain *domain = dom->priv;
2269 struct protection_domain *old_domain; 2416 struct iommu_dev_data *dev_data;
2270 struct amd_iommu *iommu; 2417 struct amd_iommu *iommu;
2271 struct pci_dev *pdev; 2418 int ret;
2272 u16 devid; 2419 u16 devid;
2273 2420
2274 if (dev->bus != &pci_bus_type) 2421 if (!check_device(dev))
2275 return -EINVAL; 2422 return -EINVAL;
2276 2423
2277 pdev = to_pci_dev(dev); 2424 dev_data = dev->archdata.iommu;
2278
2279 devid = calc_devid(pdev->bus->number, pdev->devfn);
2280 2425
2281 if (devid >= amd_iommu_last_bdf || 2426 devid = get_device_id(dev);
2282 devid != amd_iommu_alias_table[devid])
2283 return -EINVAL;
2284 2427
2285 iommu = amd_iommu_rlookup_table[devid]; 2428 iommu = amd_iommu_rlookup_table[devid];
2286 if (!iommu) 2429 if (!iommu)
2287 return -EINVAL; 2430 return -EINVAL;
2288 2431
2289 old_domain = domain_for_device(devid); 2432 if (dev_data->domain)
2290 if (old_domain) 2433 detach_device(dev);
2291 detach_device(old_domain, devid);
2292 2434
2293 attach_device(iommu, domain, devid); 2435 ret = attach_device(dev, domain);
2294 2436
2295 iommu_completion_wait(iommu); 2437 iommu_completion_wait(iommu);
2296 2438
2297 return 0; 2439 return ret;
2298} 2440}
2299 2441
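
Both iommu_ops callbacks above now key off per-device state stored in dev->archdata.iommu rather than the removed global amd_iommu_pd_table. The shape of that state, inferred from the fields the hunks actually touch (a sketch, possibly incomplete):

#include <linux/device.h>
#include <asm/atomic.h>

struct protection_domain;       /* driver-internal, declared elsewhere */

struct iommu_dev_data {
        struct list_head list;                  /* entry in domain->dev_list */
        struct device *dev;                     /* back-pointer to the device */
        struct protection_domain *domain;       /* domain currently attached */
        atomic_t bind;                          /* attach nesting count */
};
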
2300static int amd_iommu_map_range(struct iommu_domain *dom, 2442static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2340,7 +2482,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2340 iova += PAGE_SIZE; 2482 iova += PAGE_SIZE;
2341 } 2483 }
2342 2484
2343 iommu_flush_domain(domain->id); 2485 iommu_flush_tlb_pde(domain);
2344} 2486}
2345 2487
2346static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2488static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2391,10 +2533,11 @@ static struct iommu_ops amd_iommu_ops = {
2391 2533
2392int __init amd_iommu_init_passthrough(void) 2534int __init amd_iommu_init_passthrough(void)
2393{ 2535{
2536 struct amd_iommu *iommu;
2394 struct pci_dev *dev = NULL; 2537 struct pci_dev *dev = NULL;
2395 u16 devid, devid2; 2538 u16 devid;
2396 2539
2397 /* allocate passthroug domain */ 2540 /* allocate passthrough domain */
2398 pt_domain = protection_domain_alloc(); 2541 pt_domain = protection_domain_alloc();
2399 if (!pt_domain) 2542 if (!pt_domain)
2400 return -ENOMEM; 2543 return -ENOMEM;
@@ -2402,20 +2545,17 @@ int __init amd_iommu_init_passthrough(void)
2402 pt_domain->mode |= PAGE_MODE_NONE; 2545 pt_domain->mode |= PAGE_MODE_NONE;
2403 2546
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2547 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406 2548
2407 devid = calc_devid(dev->bus->number, dev->devfn); 2549 if (!check_device(&dev->dev))
2408 if (devid > amd_iommu_last_bdf)
2409 continue; 2550 continue;
2410 2551
2411 devid2 = amd_iommu_alias_table[devid]; 2552 devid = get_device_id(&dev->dev);
2412 2553
2413 iommu = amd_iommu_rlookup_table[devid2]; 2554 iommu = amd_iommu_rlookup_table[devid];
2414 if (!iommu) 2555 if (!iommu)
2415 continue; 2556 continue;
2416 2557
2417 __attach_device(iommu, pt_domain, devid); 2558 attach_device(&dev->dev, pt_domain);
2418 __attach_device(iommu, pt_domain, devid2);
2419 } 2559 }
2420 2560
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2561 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
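
A usage note, hedged: this path runs only when iommu_pass_through is set, which in the x86 DMA glue of this era comes from booting with

        iommu=pt

Every device the driver handles then ends up in the single pt_domain; its PAGE_MODE_NONE flag disables translation, so DMA addresses equal physical addresses.
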
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d462dcc..9dc91b431470 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,10 +25,12 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,29 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs */
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * Set to true if ACPI table parsing and hardware initialization went properly
142 */
143static bool amd_iommu_initialized;
144
145/*
146 * List of protection domains - used during resume
147 */
148LIST_HEAD(amd_iommu_pd_list);
149spinlock_t amd_iommu_pd_lock;
150
138/* 151/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 152 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 153 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +170,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 170struct amd_iommu **amd_iommu_rlookup_table;
158 171
159/* 172/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap 173
167 * to know which ones are already in use. 174 * to know which ones are already in use.
168 */ 175 */
@@ -240,7 +247,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
240 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 247 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
241} 248}
242 249
243static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) 250static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244{ 251{
245 u32 ctrl; 252 u32 ctrl;
246 253
@@ -519,6 +526,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
519 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 526 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
520} 527}
521 528
529static int get_dev_entry_bit(u16 devid, u8 bit)
530{
531 int i = (bit >> 5) & 0x07;
532 int _bit = bit & 0x1f;
533
534 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
535}
536
537
538void amd_iommu_apply_erratum_63(u16 devid)
539{
540 int sysmgt;
541
542 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
543 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
544
545 if (sysmgt == 0x01)
546 set_dev_entry_bit(devid, DEV_ENTRY_IW);
547}
548
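
get_dev_entry_bit() and amd_iommu_apply_erratum_63() rely on the device table entry being eight 32-bit words, so a flat bit index splits into a word index and a bit offset. A worked sketch of that decomposition (the helper name is hypothetical; bit 0x45, for instance, lands in word 2 at offset 5):

#include <linux/types.h>

static inline int dte_test_bit(const u32 *data, u8 bit)
{
        int word = (bit >> 5) & 0x07;   /* which of the 8 u32 words */
        int off  = bit & 0x1f;          /* bit position in that word */

        return (data[word] >> off) & 1;
}
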
522/* Writes the specific IOMMU for a device into the rlookup table */ 549/* Writes the specific IOMMU for a device into the rlookup table */
523static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 550static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
524{ 551{
@@ -547,6 +574,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
547 if (flags & ACPI_DEVFLAG_LINT1) 574 if (flags & ACPI_DEVFLAG_LINT1)
548 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 575 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
549 576
577 amd_iommu_apply_erratum_63(devid);
578
550 set_iommu_for_device(iommu, devid); 579 set_iommu_for_device(iommu, devid);
551} 580}
552 581
@@ -816,7 +845,18 @@ static void __init free_iommu_all(void)
816static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 845static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
817{ 846{
818 spin_lock_init(&iommu->lock); 847 spin_lock_init(&iommu->lock);
848
849 /* Add IOMMU to internal data structures */
819 list_add_tail(&iommu->list, &amd_iommu_list); 850 list_add_tail(&iommu->list, &amd_iommu_list);
851 iommu->index = amd_iommus_present++;
852
853 if (unlikely(iommu->index >= MAX_IOMMUS)) {
854 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
855 return -ENOSYS;
856 }
857
858 /* Index is fine - add IOMMU to the array */
859 amd_iommus[iommu->index] = iommu;
820 860
821 /* 861 /*
822 * Copy data from ACPI table entry to the iommu struct 862 * Copy data from ACPI table entry to the iommu struct
@@ -846,6 +886,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
846 init_iommu_from_acpi(iommu, h); 886 init_iommu_from_acpi(iommu, h);
847 init_iommu_devices(iommu); 887 init_iommu_devices(iommu);
848 888
889 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
890 amd_iommu_np_cache = true;
891
849 return pci_enable_device(iommu->dev); 892 return pci_enable_device(iommu->dev);
850} 893}
851 894
@@ -891,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table)
891 } 934 }
892 WARN_ON(p != end); 935 WARN_ON(p != end);
893 936
937 amd_iommu_initialized = true;
938
894 return 0; 939 return 0;
895} 940}
896 941
@@ -903,7 +948,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
903 * 948 *
904 ****************************************************************************/ 949 ****************************************************************************/
905 950
906static int __init iommu_setup_msi(struct amd_iommu *iommu) 951static int iommu_setup_msi(struct amd_iommu *iommu)
907{ 952{
908 int r; 953 int r;
909 954
@@ -1154,19 +1199,10 @@ static struct sys_device device_amd_iommu = {
1154 * functions. Finally it prints some information about AMD IOMMUs and 1199 * functions. Finally it prints some information about AMD IOMMUs and
1155 * the driver state and enables the hardware. 1200 * the driver state and enables the hardware.
1156 */ 1201 */
1157int __init amd_iommu_init(void) 1202static int __init amd_iommu_init(void)
1158{ 1203{
1159 int i, ret = 0; 1204 int i, ret = 0;
1160 1205
1161
1162 if (no_iommu) {
1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1164 return 0;
1165 }
1166
1167 if (!amd_iommu_detected)
1168 return -ENODEV;
1169
1170 /* 1206 /*
1171 * First parse ACPI tables to find the largest Bus/Dev/Func 1207 * First parse ACPI tables to find the largest Bus/Dev/Func
1172 * we need to handle. Upon this information the shared data 1208 * we need to handle. Upon this information the shared data
@@ -1203,15 +1239,6 @@ int __init amd_iommu_init(void)
1203 if (amd_iommu_rlookup_table == NULL) 1239 if (amd_iommu_rlookup_table == NULL)
1204 goto free; 1240 goto free;
1205 1241
1206 /*
1207 * Protection Domain table - maps devices to protection domains
1208 * This table has the same size as the rlookup_table
1209 */
1210 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1211 get_order(rlookup_table_size));
1212 if (amd_iommu_pd_table == NULL)
1213 goto free;
1214
1215 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1242 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1216 GFP_KERNEL | __GFP_ZERO, 1243 GFP_KERNEL | __GFP_ZERO,
1217 get_order(MAX_DOMAIN_ID/8)); 1244 get_order(MAX_DOMAIN_ID/8));
@@ -1233,6 +1260,8 @@ int __init amd_iommu_init(void)
1233 */ 1260 */
1234 amd_iommu_pd_alloc_bitmap[0] = 1; 1261 amd_iommu_pd_alloc_bitmap[0] = 1;
1235 1262
1263 spin_lock_init(&amd_iommu_pd_lock);
1264
1236 /* 1265 /*
1237 * now the data structures are allocated and basically initialized 1266 * now the data structures are allocated and basically initialized
1238 * start the real acpi table scan 1267 * start the real acpi table scan
@@ -1241,6 +1270,9 @@ int __init amd_iommu_init(void)
1241 if (acpi_table_parse("IVRS", init_iommu_all) != 0) 1270 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1242 goto free; 1271 goto free;
1243 1272
1273 if (!amd_iommu_initialized)
1274 goto free;
1275
1244 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1276 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1245 goto free; 1277 goto free;
1246 1278
@@ -1252,39 +1284,43 @@ int __init amd_iommu_init(void)
1252 if (ret) 1284 if (ret)
1253 goto free; 1285 goto free;
1254 1286
1287 ret = amd_iommu_init_devices();
1288 if (ret)
1289 goto free;
1290
1255 if (iommu_pass_through) 1291 if (iommu_pass_through)
1256 ret = amd_iommu_init_passthrough(); 1292 ret = amd_iommu_init_passthrough();
1257 else 1293 else
1258 ret = amd_iommu_init_dma_ops(); 1294 ret = amd_iommu_init_dma_ops();
1295
1259 if (ret) 1296 if (ret)
1260 goto free; 1297 goto free;
1261 1298
1299 amd_iommu_init_api();
1300
1301 amd_iommu_init_notifier();
1302
1262 enable_iommus(); 1303 enable_iommus();
1263 1304
1264 if (iommu_pass_through) 1305 if (iommu_pass_through)
1265 goto out; 1306 goto out;
1266 1307
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1268 if (amd_iommu_isolate)
1269 printk("enabled\n");
1270 else
1271 printk("disabled\n");
1272
1273 if (amd_iommu_unmap_flush) 1308 if (amd_iommu_unmap_flush)
1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1309 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1275 else 1310 else
1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1311 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1277 1312
1313 x86_platform.iommu_shutdown = disable_iommus;
1278out: 1314out:
1279 return ret; 1315 return ret;
1280 1316
1281free: 1317free:
1318
1319 amd_iommu_uninit_devices();
1320
1282 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1321 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1283 get_order(MAX_DOMAIN_ID/8)); 1322 get_order(MAX_DOMAIN_ID/8));
1284 1323
1285 free_pages((unsigned long)amd_iommu_pd_table,
1286 get_order(rlookup_table_size));
1287
1288 free_pages((unsigned long)amd_iommu_rlookup_table, 1324 free_pages((unsigned long)amd_iommu_rlookup_table,
1289 get_order(rlookup_table_size)); 1325 get_order(rlookup_table_size));
1290 1326
@@ -1301,11 +1337,6 @@ free:
1301 goto out; 1337 goto out;
1302} 1338}
1303 1339
1304void amd_iommu_shutdown(void)
1305{
1306 disable_iommus();
1307}
1308
1309/**************************************************************************** 1340/****************************************************************************
1310 * 1341 *
1311 * Early detect code. This code runs at IOMMU detection time in the DMA 1342 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1320,16 +1351,16 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1320 1351
1321void __init amd_iommu_detect(void) 1352void __init amd_iommu_detect(void)
1322{ 1353{
1323 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1354 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1324 return; 1355 return;
1325 1356
1326 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1357 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1327 iommu_detected = 1; 1358 iommu_detected = 1;
1328 amd_iommu_detected = 1; 1359 amd_iommu_detected = 1;
1329#ifdef CONFIG_GART_IOMMU 1360 x86_init.iommu.iommu_init = amd_iommu_init;
1330 gart_iommu_aperture_disabled = 1; 1361
1331 gart_iommu_aperture = 0; 1362 /* Make sure ACS will be enabled */
1332#endif 1363 pci_request_acs();
1333 } 1364 }
1334} 1365}
1335 1366
@@ -1350,10 +1381,6 @@ static int __init parse_amd_iommu_dump(char *str)
1350static int __init parse_amd_iommu_options(char *str) 1381static int __init parse_amd_iommu_options(char *str)
1351{ 1382{
1352 for (; *str; ++str) { 1383 for (; *str; ++str) {
1353 if (strncmp(str, "isolate", 7) == 0)
1354 amd_iommu_isolate = true;
1355 if (strncmp(str, "share", 5) == 0)
1356 amd_iommu_isolate = false;
1357 if (strncmp(str, "fullflush", 9) == 0) 1384 if (strncmp(str, "fullflush", 9) == 0)
1358 amd_iommu_unmap_flush = true; 1385 amd_iommu_unmap_flush = true;
1359 } 1386 }
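
With isolate/share removed (device isolation is now unconditional), fullflush is the only amd_iommu= option parsed here. Example kernel command line (usage sketch):

        amd_iommu=fullflush

It sets amd_iommu_unmap_flush, so every unmap flushes the IO/TLB immediately instead of lazily, matching the "IO/TLB flush on unmap enabled" message printed at init.
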
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..f147a95fd84a 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,8 +28,10 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
34EXPORT_SYMBOL_GPL(gart_iommu_aperture);
33int gart_iommu_aperture_disabled __initdata; 35int gart_iommu_aperture_disabled __initdata;
34int gart_iommu_aperture_allowed __initdata; 36int gart_iommu_aperture_allowed __initdata;
35 37
@@ -279,7 +281,8 @@ void __init early_gart_iommu_check(void)
279 * or BIOS forget to put that in reserved. 281 * or BIOS forget to put that in reserved.
280 * try to update e820 to make that region as reserved. 282 * try to update e820 to make that region as reserved.
281 */ 283 */
282 int i, fix, slot; 284 u32 agp_aper_base = 0, agp_aper_order = 0;
285 int i, fix, slot, valid_agp = 0;
283 u32 ctl; 286 u32 ctl;
284 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 287 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
285 u64 aper_base = 0, last_aper_base = 0; 288 u64 aper_base = 0, last_aper_base = 0;
@@ -289,6 +292,8 @@ void __init early_gart_iommu_check(void)
289 return; 292 return;
290 293
291 /* This is mostly duplicate of iommu_hole_init */ 294 /* This is mostly duplicate of iommu_hole_init */
295 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
296
292 fix = 0; 297 fix = 0;
293 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 298 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
294 int bus; 299 int bus;
@@ -341,10 +346,10 @@ void __init early_gart_iommu_check(void)
341 } 346 }
342 } 347 }
343 348
344 if (!fix) 349 if (valid_agp)
345 return; 350 return;
346 351
347 /* different nodes have different setting, disable them all at first*/ 352 /* disable them all at first */
348 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 353 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
349 int bus; 354 int bus;
350 int dev_base, dev_limit; 355 int dev_base, dev_limit;
@@ -400,6 +405,7 @@ void __init gart_iommu_hole_init(void)
400 405
401 iommu_detected = 1; 406 iommu_detected = 1;
402 gart_iommu_aperture = 1; 407 gart_iommu_aperture = 1;
408 x86_init.iommu.iommu_init = gart_iommu_init;
403 409
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 410 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 411 aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,8 +462,6 @@ out:
456 462
457 if (aper_alloc) { 463 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 464 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) {
460 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 465 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 466 force_iommu ||
463 valid_agp || 467 valid_agp ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..3987e4408f75 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
61 61
62/* 62/*
63 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
64 *
65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
66 * are in the 0 ... 7 range, then we can use logical addressing which
67 * has some performance advantages (better broadcasting).
68 *
69 * If there's an APIC ID above 8, we use physical addressing.
70 */ 64 */
71unsigned int max_physical_apicid; 65unsigned int max_physical_apicid;
72 66
@@ -241,28 +235,13 @@ static int modern_apic(void)
241} 235}
242 236
243/* 237/*
244 * bare function to substitute write operation 238 * right after this call the apic becomes NOOP driven
245 * and it's _that_ fast :) 239 * so apic->write/read do nothing
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 240 */
262void apic_disable(void) 241void apic_disable(void)
263{ 242{
264 apic->read = native_apic_read_dummy; 243 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 244 apic = &apic_noop;
266} 245}
267 246
268void native_apic_wait_icr_idle(void) 247void native_apic_wait_icr_idle(void)
@@ -459,7 +438,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 438 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 439 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 440 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 441 apic_write(APIC_TMICT, 0);
463 break; 442 break;
464 case CLOCK_EVT_MODE_RESUME: 443 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 444 /* Nothing to do here */
@@ -662,7 +641,7 @@ static int __init calibrate_APIC_clock(void)
662 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 641 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
663 642
664 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 643 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
665 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 644 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
666 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 645 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
667 calibration_result); 646 calibration_result);
668 647
@@ -1356,7 +1335,7 @@ void enable_x2apic(void)
1356 1335
1357 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1336 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1358 if (!(msr & X2APIC_ENABLE)) { 1337 if (!(msr & X2APIC_ENABLE)) {
1359 pr_info("Enabling x2apic\n"); 1338 printk_once(KERN_INFO "Enabling x2apic\n");
1360 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1339 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1361 } 1340 }
1362} 1341}
@@ -1392,14 +1371,11 @@ void __init enable_IR_x2apic(void)
1392 unsigned long flags; 1371 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL; 1372 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0; 1373 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0; 1374 int dmar_table_init_ret;
1396 1375
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init(); 1376 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret) 1377 if (dmar_table_init_ret && !x2apic_supported())
1400 pr_debug("dmar_table_init() failed with %d:\n", 1378 return;
1401 dmar_table_init_ret);
1402#endif
1403 1379
1404 ioapic_entries = alloc_ioapic_entries(); 1380 ioapic_entries = alloc_ioapic_entries();
1405 if (!ioapic_entries) { 1381 if (!ioapic_entries) {
@@ -1916,14 +1892,17 @@ void __cpuinit generic_processor_info(int apicid, int version)
1916 max_physical_apicid = apicid; 1892 max_physical_apicid = apicid;
1917 1893
1918#ifdef CONFIG_X86_32 1894#ifdef CONFIG_X86_32
1919 switch (boot_cpu_data.x86_vendor) { 1895 if (num_processors > 8) {
1920 case X86_VENDOR_INTEL: 1896 switch (boot_cpu_data.x86_vendor) {
1921 if (num_processors > 8) 1897 case X86_VENDOR_INTEL:
1922 def_to_bigsmp = 1; 1898 if (!APIC_XAPIC(version)) {
1923 break; 1899 def_to_bigsmp = 0;
1924 case X86_VENDOR_AMD: 1900 break;
1925 if (max_physical_apicid >= 8) 1901 }
1902 /* If P4 and above, fall through */
1903 case X86_VENDOR_AMD:
1926 def_to_bigsmp = 1; 1904 def_to_bigsmp = 1;
1905 }
1927 } 1906 }
1928#endif 1907#endif
1929 1908
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..e3c3d820c325 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
242 } 242 }
243
244 if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
245 printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
246 return 1;
247 }
243#endif 248#endif
244 249
245 return 0; 250 return 0;
@@ -306,10 +311,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
306 if (cpumask_test_cpu(cpu, cpu_online_mask)) 311 if (cpumask_test_cpu(cpu, cpu_online_mask))
307 break; 312 break;
308 } 313 }
309 if (cpu < nr_cpu_ids) 314 return per_cpu(x86_cpu_to_apicid, cpu);
310 return per_cpu(x86_cpu_to_apicid, cpu);
311
312 return BAD_APICID;
313} 315}
314 316
315struct apic apic_physflat = { 317struct apic apic_physflat = {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..e31b9ffe25f5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver
5 * via the probe routine.
6 *
7 * Though if the apic is disabled (for some reason) we try not
8 * to uglify the caller's code and still allow calling (some) apic
9 * routines like self-ipi, etc...
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * the NOOP apic should never be
76 * enabled via the probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we were really "pedantic"
85 * we would pass read_apic_id() here
86 * but since NOOP assumes APIC ID = 0
87 * let's save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
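
apic_noop.c is a null-object replacement for the two dummy read/write stubs deleted from apic.c: instead of stubbing individual methods, apic_disable() swaps in a complete operations table whose methods do nothing. A generic sketch of the pattern (names are hypothetical, not kernel API):

#include <linux/types.h>

struct ops {
        u32  (*read)(u32 reg);
        void (*write)(u32 reg, u32 val);
};

static u32  stub_read(u32 reg)            { return 0; }
static void stub_write(u32 reg, u32 val)  { }

static struct ops stub_ops = {
        .read   = stub_read,
        .write  = stub_write,
};

/* Callers dereference 'ops' unconditionally; disabling the feature is
 * a single pointer swap, never a NULL check at each call site. */
static struct ops *ops = &stub_ops;
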
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..cb804c5091b9 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
35#endif 35#endif
36} 36}
37 37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 39{
40 return 0; 40 return 0;
41} 41}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 93 return BAD_APICID;
94} 94}
95 95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */ 96/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu) 97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{ 98{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106 return cpu_physical_id(cpu); 101 return cpu_physical_id(cpu);
107} 102}
108 103
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
110{ 105{
111 /* For clustered we don't have a good way to do this yet - hack */ 106 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL); 107 physids_promote(0xFFL, retmap);
113} 108}
114 109
115static int bigsmp_check_phys_apicid_present(int phys_apicid) 110static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -136,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
136 if (cpumask_test_cpu(cpu, cpu_online_mask)) 131 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break; 132 break;
138 } 133 }
139 if (cpu < nr_cpu_ids) 134 return bigsmp_cpu_to_logical_apicid(cpu);
140 return bigsmp_cpu_to_logical_apicid(cpu);
141
142 return BAD_APICID;
143} 135}
144 136
145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -230,7 +222,7 @@ struct apic apic_bigsmp = {
230 .apicid_to_node = bigsmp_apicid_to_node, 222 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, 223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, 225 .apicid_to_cpu_present = physid_set_mask_of_physid,
234 .setup_portio_remap = NULL, 226 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present, 227 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL, 228 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..dd2b5f264643 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
27 * 27 *
28 * http://www.unisys.com 28 * http://www.unisys.com
29 */ 29 */
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
30#include <linux/notifier.h> 33#include <linux/notifier.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/cpumask.h> 35#include <linux/cpumask.h>
@@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr)
223 mip_addr = val; 226 mip_addr = val;
224 mip = (struct mip_reg *)val; 227 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip); 228 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", 229 pr_debug("host_reg = 0x%lx\n",
227 (unsigned long)host_reg); 230 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", 231 pr_debug("mip_reg = 0x%lx\n",
229 (unsigned long)mip_reg); 232 (unsigned long)mip_reg);
230 success++; 233 success++;
231 break; 234 break;
@@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void)
401 if (!es7000_plat) 404 if (!es7000_plat)
402 return; 405 return;
403 406
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); 407 pr_info("Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); 408 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC; 409 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID; 410 es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -466,11 +469,11 @@ static const struct cpumask *es7000_target_cpus(void)
466 return cpumask_of(smp_processor_id()); 469 return cpumask_of(smp_processor_id());
467} 470}
468 471
469static unsigned long 472static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{ 473{
472 return 0; 474 return 0;
473} 475}
476
474static unsigned long es7000_check_apicid_present(int bit) 477static unsigned long es7000_check_apicid_present(int bit)
475{ 478{
476 return physid_isset(bit, phys_cpu_present_map); 479 return physid_isset(bit, phys_cpu_present_map);
@@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void)
514{ 517{
515 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 518 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
516 519
517 printk(KERN_INFO 520 pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
519 (apic_version[apic] == 0x14) ? 521 (apic_version[apic] == 0x14) ?
520 "Physical Cluster" : "Logical Cluster", 522 "Physical Cluster" : "Logical Cluster",
521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 523 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
@@ -539,14 +541,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
539 541
540static int cpu_id; 542static int cpu_id;
541 543
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) 544static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
543{ 545{
544 physid_mask_t mask; 546 physid_set_mask_of_physid(cpu_id, retmap);
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id; 547 ++cpu_id;
548
549 return mask;
550} 548}
551 549
552/* Mapping from cpu number to logical apicid */ 550/* Mapping from cpu number to logical apicid */
@@ -561,10 +559,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
561#endif 559#endif
562} 560}
563 561
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) 562static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
565{ 563{
566 /* For clustered we don't have a good way to do this yet - hack */ 564 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff); 565 physids_promote(0xFFL, retmap);
568} 566}
569 567
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid) 568static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..53243ca7816d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h> 62#include <asm/hw_irq.h>
63#include <asm/uv/uv_hub.h>
64#include <asm/uv/uv_irq.h>
65 63
66#include <asm/apic.h> 64#include <asm/apic.h>
67 65
@@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 return pin; 138 return pin;
141} 139}
142 140
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
148struct irq_cfg {
149 struct irq_pin_list *irq_2_pin;
150 cpumask_var_t domain;
151 cpumask_var_t old_domain;
152 unsigned move_cleanup_count;
153 u8 vector;
154 u8 move_in_progress : 1;
155};
156
157/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
158#ifdef CONFIG_SPARSE_IRQ 142#ifdef CONFIG_SPARSE_IRQ
159static struct irq_cfg irq_cfgx[] = { 143static struct irq_cfg irq_cfgx[] = {
@@ -209,7 +193,7 @@ int __init arch_early_irq_init(void)
209} 193}
210 194
211#ifdef CONFIG_SPARSE_IRQ 195#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 196struct irq_cfg *irq_cfg(unsigned int irq)
213{ 197{
214 struct irq_cfg *cfg = NULL; 198 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 199 struct irq_desc *desc;
@@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 345/* end for move_irq_desc */
362 346
363#else 347#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 348struct irq_cfg *irq_cfg(unsigned int irq)
365{ 349{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 350 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 351}
@@ -555,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
555 add_pin_to_irq_node(cfg, node, newapic, newpin); 539 add_pin_to_irq_node(cfg, node, newapic, newpin);
556} 540}
557 541
542static void __io_apic_modify_irq(struct irq_pin_list *entry,
543 int mask_and, int mask_or,
544 void (*final)(struct irq_pin_list *entry))
545{
546 unsigned int reg, pin;
547
548 pin = entry->pin;
549 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
550 reg &= mask_and;
551 reg |= mask_or;
552 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
553 if (final)
554 final(entry);
555}
556
558static void io_apic_modify_irq(struct irq_cfg *cfg, 557static void io_apic_modify_irq(struct irq_cfg *cfg,
559 int mask_and, int mask_or, 558 int mask_and, int mask_or,
560 void (*final)(struct irq_pin_list *entry)) 559 void (*final)(struct irq_pin_list *entry))
561{ 560{
562 int pin;
563 struct irq_pin_list *entry; 561 struct irq_pin_list *entry;
564 562
565 for_each_irq_pin(entry, cfg->irq_2_pin) { 563 for_each_irq_pin(entry, cfg->irq_2_pin)
566 unsigned int reg; 564 __io_apic_modify_irq(entry, mask_and, mask_or, final);
567 pin = entry->pin; 565}
568 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 566
569 reg &= mask_and; 567static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
570 reg |= mask_or; 568{
571 io_apic_modify(entry->apic, 0x10 + pin * 2, reg); 569 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
572 if (final) 570 IO_APIC_REDIR_MASKED, NULL);
573 final(entry); 571}
574 } 572
573static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
574{
575 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
576 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
575} 577}
576 578
577static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) 579static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
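
The hunk above factors the per-entry register read-modify-write out of io_apic_modify_irq(), so a single redirection entry can now be rewritten without walking a whole irq_cfg. A minimal sketch of a caller, assuming the helper's signature and the existing IO_APIC_REDIR_MASKED flag as shown in the hunk (the function name below is hypothetical):

static void example_mask_one_pin(struct irq_pin_list *entry)
{
	/* keep all existing bits (~0), set only the mask bit, no post-write hook */
	__io_apic_modify_irq(entry, ~0, IO_APIC_REDIR_MASKED, NULL);
}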
@@ -595,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 597 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
596} 598}
597 599
598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
599{
600 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
601 IO_APIC_REDIR_MASKED, NULL);
602}
603
604static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
605{
606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
608}
609
610static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 600static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
611{ 601{
612 struct irq_cfg *cfg = desc->chip_data; 602 struct irq_cfg *cfg = desc->chip_data;
@@ -1177,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1177 int cpu, err; 1167 int cpu, err;
1178 cpumask_var_t tmp_mask; 1168 cpumask_var_t tmp_mask;
1179 1169
1180 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1170 if (cfg->move_in_progress)
1181 return -EBUSY; 1171 return -EBUSY;
1182 1172
1183 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1173 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1237,8 +1227,7 @@ next:
1237 return err; 1227 return err;
1238} 1228}
1239 1229
1240static int 1230int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1241assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1242{ 1231{
1243 int err; 1232 int err;
1244 unsigned long flags; 1233 unsigned long flags;
@@ -1599,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1599 struct irq_desc *desc; 1588 struct irq_desc *desc;
1600 unsigned int irq; 1589 unsigned int irq;
1601 1590
1602 if (apic_verbosity == APIC_QUIET)
1603 return;
1604
1605 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1591 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1606 for (i = 0; i < nr_ioapics; i++) 1592 for (i = 0; i < nr_ioapics; i++)
1607 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1593 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1708,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base)
1708{ 1694{
1709 int i; 1695 int i;
1710 1696
1711 if (apic_verbosity == APIC_QUIET)
1712 return;
1713
1714 printk(KERN_DEBUG); 1697 printk(KERN_DEBUG);
1715 1698
1716 for (i = 0; i < 8; i++) 1699 for (i = 0; i < 8; i++)
@@ -1724,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1724 unsigned int i, v, ver, maxlvt; 1707 unsigned int i, v, ver, maxlvt;
1725 u64 icr; 1708 u64 icr;
1726 1709
1727 if (apic_verbosity == APIC_QUIET)
1728 return;
1729
1730 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1710 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1731 smp_processor_id(), hard_smp_processor_id()); 1711 smp_processor_id(), hard_smp_processor_id());
1732 v = apic_read(APIC_ID); 1712 v = apic_read(APIC_ID);
@@ -1824,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1824 printk("\n"); 1804 printk("\n");
1825} 1805}
1826 1806
1827__apicdebuginit(void) print_all_local_APICs(void) 1807__apicdebuginit(void) print_local_APICs(int maxcpu)
1828{ 1808{
1829 int cpu; 1809 int cpu;
1830 1810
1811 if (!maxcpu)
1812 return;
1813
1831 preempt_disable(); 1814 preempt_disable();
1832 for_each_online_cpu(cpu) 1815 for_each_online_cpu(cpu) {
1816 if (cpu >= maxcpu)
1817 break;
1833 smp_call_function_single(cpu, print_local_APIC, NULL, 1); 1818 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1819 }
1834 preempt_enable(); 1820 preempt_enable();
1835} 1821}
1836 1822
@@ -1839,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void)
1839 unsigned int v; 1825 unsigned int v;
1840 unsigned long flags; 1826 unsigned long flags;
1841 1827
1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) 1828 if (!nr_legacy_irqs)
1843 return; 1829 return;
1844 1830
1845 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1831 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1866,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void)
1866 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1852 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1867} 1853}
1868 1854
1869__apicdebuginit(int) print_all_ICs(void) 1855static int __initdata show_lapic = 1;
1856static __init int setup_show_lapic(char *arg)
1870{ 1857{
1858 int num = -1;
1859
1860 if (strcmp(arg, "all") == 0) {
1861 show_lapic = CONFIG_NR_CPUS;
1862 } else {
1863 get_option(&arg, &num);
1864 if (num >= 0)
1865 show_lapic = num;
1866 }
1867
1868 return 1;
1869}
1870__setup("show_lapic=", setup_show_lapic);
1871
1872__apicdebuginit(int) print_ICs(void)
1873{
1874 if (apic_verbosity == APIC_QUIET)
1875 return 0;
1876
1871 print_PIC(); 1877 print_PIC();
1872 1878
1873 /* don't print out if apic is not there */ 1879 /* don't print out if apic is not there */
1874 if (!cpu_has_apic && !apic_from_smp_config()) 1880 if (!cpu_has_apic && !apic_from_smp_config())
1875 return 0; 1881 return 0;
1876 1882
1877 print_all_local_APICs(); 1883 print_local_APICs(show_lapic);
1878 print_IO_APIC(); 1884 print_IO_APIC();
1879 1885
1880 return 0; 1886 return 0;
1881} 1887}
1882 1888
1883fs_initcall(print_all_ICs); 1889fs_initcall(print_ICs);
1884 1890
1885 1891
1886/* Where if anywhere is the i8259 connect in external int mode */ 1892/* Where if anywhere is the i8259 connect in external int mode */
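
The new show_lapic= option above bounds how many CPUs print_local_APICs() dumps. An illustrative summary of the expected command-line behaviour, given the get_option() parsing in the hunk (get_option() consumes a leading integer and otherwise leaves the value untouched):

/*
 * Illustrative only:
 *   show_lapic=4     limits print_local_APICs() to the first 4 cpus
 *   show_lapic=all   expands the dump to CONFIG_NR_CPUS
 *   (no parameter)   keeps the default of dumping cpu 0 only
 */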
@@ -2031,7 +2037,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2031 * This is broken; anything with a real cpu count has to 2037 * This is broken; anything with a real cpu count has to
2032 * circumvent this idiocy regardless. 2038 * circumvent this idiocy regardless.
2033 */ 2039 */
2034 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 2040 apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
2035 2041
2036 /* 2042 /*
2037 * Set the IOAPIC ID to the value stored in the MPC table. 2043 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2058,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2058 * system must have a unique ID or we get lots of nice 2064 * system must have a unique ID or we get lots of nice
2059 * 'stuck on smp_invalidate_needed IPI wait' messages. 2065 * 'stuck on smp_invalidate_needed IPI wait' messages.
2060 */ 2066 */
2061 if (apic->check_apicid_used(phys_id_present_map, 2067 if (apic->check_apicid_used(&phys_id_present_map,
2062 mp_ioapics[apic_id].apicid)) { 2068 mp_ioapics[apic_id].apicid)) {
2063 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2069 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2064 apic_id, mp_ioapics[apic_id].apicid); 2070 apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2079,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2073 mp_ioapics[apic_id].apicid = i; 2079 mp_ioapics[apic_id].apicid = i;
2074 } else { 2080 } else {
2075 physid_mask_t tmp; 2081 physid_mask_t tmp;
2076 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); 2082 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
2077 apic_printk(APIC_VERBOSE, "Setting %d in the " 2083 apic_printk(APIC_VERBOSE, "Setting %d in the "
2078 "phys_id_present_map\n", 2084 "phys_id_present_map\n",
2079 mp_ioapics[apic_id].apicid); 2085 mp_ioapics[apic_id].apicid);
@@ -2228,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2234 */
2229 2235
2230#ifdef CONFIG_SMP 2236#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2237void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2238{
2233 cpumask_var_t cleanup_mask; 2239 cpumask_var_t cleanup_mask;
2234 2240
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2241 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2242 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2243 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2244 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2245 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2246 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2247 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2248 free_cpumask_var(cleanup_mask);
2247 } 2249 }
@@ -2272,31 +2274,30 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2272 } 2274 }
2273} 2275}
2274 2276
2275static int
2276assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2277
2278/* 2277/*
2279 * Either sets desc->affinity to a valid value, and returns 2278 * Either sets desc->affinity to a valid value, and returns
2280 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and 2279 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2281 * leaves desc->affinity untouched. 2280 * leaves desc->affinity untouched.
2282 */ 2281 */
2283static unsigned int 2282unsigned int
2284set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 2283set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
2284 unsigned int *dest_id)
2285{ 2285{
2286 struct irq_cfg *cfg; 2286 struct irq_cfg *cfg;
2287 unsigned int irq; 2287 unsigned int irq;
2288 2288
2289 if (!cpumask_intersects(mask, cpu_online_mask)) 2289 if (!cpumask_intersects(mask, cpu_online_mask))
2290 return BAD_APICID; 2290 return -1;
2291 2291
2292 irq = desc->irq; 2292 irq = desc->irq;
2293 cfg = desc->chip_data; 2293 cfg = desc->chip_data;
2294 if (assign_irq_vector(irq, cfg, mask)) 2294 if (assign_irq_vector(irq, cfg, mask))
2295 return BAD_APICID; 2295 return -1;
2296 2296
2297 cpumask_copy(desc->affinity, mask); 2297 cpumask_copy(desc->affinity, mask);
2298 2298
2299 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2299 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2300 return 0;
2300} 2301}
2301 2302
2302static int 2303static int
@@ -2312,12 +2313,11 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2312 cfg = desc->chip_data; 2313 cfg = desc->chip_data;
2313 2314
2314 spin_lock_irqsave(&ioapic_lock, flags); 2315 spin_lock_irqsave(&ioapic_lock, flags);
2315 dest = set_desc_affinity(desc, mask); 2316 ret = set_desc_affinity(desc, mask, &dest);
2316 if (dest != BAD_APICID) { 2317 if (!ret) {
2317 /* Only the high 8 bits are valid. */ 2318 /* Only the high 8 bits are valid. */
2318 dest = SET_APIC_LOGICAL_ID(dest); 2319 dest = SET_APIC_LOGICAL_ID(dest);
2319 __target_IO_APIC_irq(irq, dest, cfg); 2320 __target_IO_APIC_irq(irq, dest, cfg);
2320 ret = 0;
2321 } 2321 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2322 spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2323
@@ -2432,8 +2432,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2432 continue; 2432 continue;
2433 2433
2434 cfg = irq_cfg(irq); 2434 cfg = irq_cfg(irq);
2435 spin_lock(&desc->lock); 2435 raw_spin_lock(&desc->lock);
2436 if (!cfg->move_cleanup_count) 2436
2437 /*
2438 * Check if the irq migration is in progress. If so, we
2439 * haven't received the cleanup request yet for this irq.
2440 */
2441 if (cfg->move_in_progress)
2437 goto unlock; 2442 goto unlock;
2438 2443
2439 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2444 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
@@ -2452,29 +2457,40 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2452 goto unlock; 2457 goto unlock;
2453 } 2458 }
2454 __get_cpu_var(vector_irq)[vector] = -1; 2459 __get_cpu_var(vector_irq)[vector] = -1;
2455 cfg->move_cleanup_count--;
2456unlock: 2460unlock:
2457 spin_unlock(&desc->lock); 2461 raw_spin_unlock(&desc->lock);
2458 } 2462 }
2459 2463
2460 irq_exit(); 2464 irq_exit();
2461} 2465}
2462 2466
2463static void irq_complete_move(struct irq_desc **descp) 2467static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2464{ 2468{
2465 struct irq_desc *desc = *descp; 2469 struct irq_desc *desc = *descp;
2466 struct irq_cfg *cfg = desc->chip_data; 2470 struct irq_cfg *cfg = desc->chip_data;
2467 unsigned vector, me; 2471 unsigned me;
2468 2472
2469 if (likely(!cfg->move_in_progress)) 2473 if (likely(!cfg->move_in_progress))
2470 return; 2474 return;
2471 2475
2472 vector = ~get_irq_regs()->orig_ax;
2473 me = smp_processor_id(); 2476 me = smp_processor_id();
2474 2477
2475 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2478 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2476 send_cleanup_vector(cfg); 2479 send_cleanup_vector(cfg);
2477} 2480}
2481
2482static void irq_complete_move(struct irq_desc **descp)
2483{
2484 __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
2485}
2486
2487void irq_force_complete_move(int irq)
2488{
2489 struct irq_desc *desc = irq_to_desc(irq);
2490 struct irq_cfg *cfg = desc->chip_data;
2491
2492 __irq_complete_move(&desc, cfg->vector);
2493}
2478#else 2494#else
2479static inline void irq_complete_move(struct irq_desc **descp) {} 2495static inline void irq_complete_move(struct irq_desc **descp) {}
2480#endif 2496#endif
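
irq_force_complete_move() exists so the cpu-offline path can finish a pending vector move for an irq that will never fire again on the dying cpu. A hedged sketch of the intended caller; the fixup loop itself is outside this hunk, and the helper below is hypothetical:

static void example_force_move(int irq)
{
	struct irq_cfg *cfg = irq_cfg(irq);

	if (cfg && cfg->move_in_progress)
		irq_force_complete_move(irq);
}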
@@ -2490,6 +2506,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2506
2491atomic_t irq_mis_count; 2507atomic_t irq_mis_count;
2492 2508
2509/*
 2510 * IO-APIC versions below 0x20 don't support the EOI register.
2511 * For the record, here is the information about various versions:
2512 * 0Xh 82489DX
2513 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2514 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2515 * 30h-FFh Reserved
2516 *
 2517 * Some of the Intel ICH specs (ICH2 to ICH5) document the io-apic
 2518 * version as 0x2. This is a documentation error: these ICH chips
 2519 * use io-apics of version 0x20.
2520 *
 2521 * For IO-APICs with an EOI register, we use it to do an explicit EOI.
2522 * Otherwise, we simulate the EOI message manually by changing the trigger
2523 * mode to edge and then back to level, with RTE being masked during this.
 2524 */
2525static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2526{
2527 struct irq_pin_list *entry;
2528
2529 for_each_irq_pin(entry, cfg->irq_2_pin) {
2530 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2531 /*
2532 * Intr-remapping uses pin number as the virtual vector
2533 * in the RTE. Actual vector is programmed in
2534 * intr-remapping table entry. Hence for the io-apic
2535 * EOI we use the pin number.
2536 */
2537 if (irq_remapped(irq))
2538 io_apic_eoi(entry->apic, entry->pin);
2539 else
2540 io_apic_eoi(entry->apic, cfg->vector);
2541 } else {
2542 __mask_and_edge_IO_APIC_irq(entry);
2543 __unmask_and_level_IO_APIC_irq(entry);
2544 }
2545 }
2546}
2547
2548static void eoi_ioapic_irq(struct irq_desc *desc)
2549{
2550 struct irq_cfg *cfg;
2551 unsigned long flags;
2552 unsigned int irq;
2553
2554 irq = desc->irq;
2555 cfg = desc->chip_data;
2556
2557 spin_lock_irqsave(&ioapic_lock, flags);
2558 __eoi_ioapic_irq(irq, cfg);
2559 spin_unlock_irqrestore(&ioapic_lock, flags);
2560}
2561
2493static void ack_apic_level(unsigned int irq) 2562static void ack_apic_level(unsigned int irq)
2494{ 2563{
2495 struct irq_desc *desc = irq_to_desc(irq); 2564 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2594,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2594 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2595 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2596 * The idea is from Manfred Spraul. --macro
2597 *
 2598 * Also, when a cpu goes offline, fixup_irqs() forwards any interrupt
 2599 * still pending on the offlined cpu to the new destination cpu that
 2600 * is handling the corresponding interrupt. This interrupt forwarding
 2601 * is done via IPIs. Hence, in this case too a level-triggered
 2602 * io-apic interrupt will be seen as an edge interrupt in the IRR,
 2603 * and we can't rely on the cpu's EOI being broadcast to the
 2604 * IO-APICs to clear the remote IRR corresponding to the
 2605 * level-triggered interrupt. Hence, on IO-APICs supporting the EOI
 2606 * register, we do an explicit EOI to clear the remote IRR, and on
 2607 * IO-APICs which don't have an EOI register, we use the above logic
 2608 * (mask+edge followed by unmask+level) from Manfred Spraul to clear
 2609 * the remote IRR.
2528 */ 2610 */
2529 cfg = desc->chip_data; 2611 cfg = desc->chip_data;
2530 i = cfg->vector; 2612 i = cfg->vector;
@@ -2536,6 +2618,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2618 */
2537 ack_APIC_irq(); 2619 ack_APIC_irq();
2538 2620
2621 /*
 2622 * Tail end of clearing the remote IRR bit (either by delivering the
 2623 * EOI message via the io-apic EOI register write or by simulating it
 2624 * using mask+edge followed by unmask+level logic) manually when the
 2625 * level-triggered interrupt is seen as an edge-triggered interrupt
 2626 * at the cpu.
2627 */
2628 if (!(v & (1 << (i & 0x1f)))) {
2629 atomic_inc(&irq_mis_count);
2630
2631 eoi_ioapic_irq(desc);
2632 }
2633
 2539 /* Now we can move and re-enable the irq */ 2634 /* Now we can move and re-enable the irq */
2540 if (unlikely(do_unmask_irq)) { 2635 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2636 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2664,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2664 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2665 unmask_IO_APIC_irq_desc(desc);
2571 } 2666 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2667}
2582 2668
2583#ifdef CONFIG_INTR_REMAP 2669#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2670static void ir_ack_apic_edge(unsigned int irq)
2608{ 2671{
2609 ack_APIC_irq(); 2672 ack_APIC_irq();
@@ -3157,6 +3220,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3220 continue;
3158 3221
3159 desc_new = move_irq_desc(desc_new, node); 3222 desc_new = move_irq_desc(desc_new, node);
3223 cfg_new = desc_new->chip_data;
3160 3224
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3225 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3226 irq = new;
@@ -3211,7 +3275,8 @@ void destroy_irq(unsigned int irq)
3211 * MSI message composition 3275 * MSI message composition
3212 */ 3276 */
3213#ifdef CONFIG_PCI_MSI 3277#ifdef CONFIG_PCI_MSI
3214static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3278static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3279 struct msi_msg *msg, u8 hpet_id)
3215{ 3280{
3216 struct irq_cfg *cfg; 3281 struct irq_cfg *cfg;
3217 int err; 3282 int err;
@@ -3245,7 +3310,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3245 irte.dest_id = IRTE_DEST(dest); 3310 irte.dest_id = IRTE_DEST(dest);
3246 3311
3247 /* Set source-id of interrupt request */ 3312 /* Set source-id of interrupt request */
3248 set_msi_sid(&irte, pdev); 3313 if (pdev)
3314 set_msi_sid(&irte, pdev);
3315 else
3316 set_hpet_sid(&irte, hpet_id);
3249 3317
3250 modify_irte(irq, &irte); 3318 modify_irte(irq, &irte);
3251 3319
@@ -3291,8 +3359,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3291 struct msi_msg msg; 3359 struct msi_msg msg;
3292 unsigned int dest; 3360 unsigned int dest;
3293 3361
3294 dest = set_desc_affinity(desc, mask); 3362 if (set_desc_affinity(desc, mask, &dest))
3295 if (dest == BAD_APICID)
3296 return -1; 3363 return -1;
3297 3364
3298 cfg = desc->chip_data; 3365 cfg = desc->chip_data;
@@ -3324,8 +3391,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3324 if (get_irte(irq, &irte)) 3391 if (get_irte(irq, &irte))
3325 return -1; 3392 return -1;
3326 3393
3327 dest = set_desc_affinity(desc, mask); 3394 if (set_desc_affinity(desc, mask, &dest))
3328 if (dest == BAD_APICID)
3329 return -1; 3395 return -1;
3330 3396
3331 irte.vector = cfg->vector; 3397 irte.vector = cfg->vector;
@@ -3410,7 +3476,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3410 int ret; 3476 int ret;
3411 struct msi_msg msg; 3477 struct msi_msg msg;
3412 3478
3413 ret = msi_compose_msg(dev, irq, &msg); 3479 ret = msi_compose_msg(dev, irq, &msg, -1);
3414 if (ret < 0) 3480 if (ret < 0)
3415 return ret; 3481 return ret;
3416 3482
@@ -3507,8 +3573,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3507 struct msi_msg msg; 3573 struct msi_msg msg;
3508 unsigned int dest; 3574 unsigned int dest;
3509 3575
3510 dest = set_desc_affinity(desc, mask); 3576 if (set_desc_affinity(desc, mask, &dest))
3511 if (dest == BAD_APICID)
3512 return -1; 3577 return -1;
3513 3578
3514 cfg = desc->chip_data; 3579 cfg = desc->chip_data;
@@ -3543,7 +3608,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3543 int ret; 3608 int ret;
3544 struct msi_msg msg; 3609 struct msi_msg msg;
3545 3610
3546 ret = msi_compose_msg(NULL, irq, &msg); 3611 ret = msi_compose_msg(NULL, irq, &msg, -1);
3547 if (ret < 0) 3612 if (ret < 0)
3548 return ret; 3613 return ret;
3549 dmar_msi_write(irq, &msg); 3614 dmar_msi_write(irq, &msg);
@@ -3563,8 +3628,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3563 struct msi_msg msg; 3628 struct msi_msg msg;
3564 unsigned int dest; 3629 unsigned int dest;
3565 3630
3566 dest = set_desc_affinity(desc, mask); 3631 if (set_desc_affinity(desc, mask, &dest))
3567 if (dest == BAD_APICID)
3568 return -1; 3632 return -1;
3569 3633
3570 cfg = desc->chip_data; 3634 cfg = desc->chip_data;
@@ -3583,6 +3647,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3583 3647
3584#endif /* CONFIG_SMP */ 3648#endif /* CONFIG_SMP */
3585 3649
3650static struct irq_chip ir_hpet_msi_type = {
3651 .name = "IR-HPET_MSI",
3652 .unmask = hpet_msi_unmask,
3653 .mask = hpet_msi_mask,
3654#ifdef CONFIG_INTR_REMAP
3655 .ack = ir_ack_apic_edge,
3656#ifdef CONFIG_SMP
3657 .set_affinity = ir_set_msi_irq_affinity,
3658#endif
3659#endif
3660 .retrigger = ioapic_retrigger_irq,
3661};
3662
3586static struct irq_chip hpet_msi_type = { 3663static struct irq_chip hpet_msi_type = {
3587 .name = "HPET_MSI", 3664 .name = "HPET_MSI",
3588 .unmask = hpet_msi_unmask, 3665 .unmask = hpet_msi_unmask,
@@ -3594,20 +3671,36 @@ static struct irq_chip hpet_msi_type = {
3594 .retrigger = ioapic_retrigger_irq, 3671 .retrigger = ioapic_retrigger_irq,
3595}; 3672};
3596 3673
3597int arch_setup_hpet_msi(unsigned int irq) 3674int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3598{ 3675{
3599 int ret; 3676 int ret;
3600 struct msi_msg msg; 3677 struct msi_msg msg;
3601 struct irq_desc *desc = irq_to_desc(irq); 3678 struct irq_desc *desc = irq_to_desc(irq);
3602 3679
3603 ret = msi_compose_msg(NULL, irq, &msg); 3680 if (intr_remapping_enabled) {
3681 struct intel_iommu *iommu = map_hpet_to_ir(id);
3682 int index;
3683
3684 if (!iommu)
3685 return -1;
3686
3687 index = alloc_irte(iommu, irq, 1);
3688 if (index < 0)
3689 return -1;
3690 }
3691
3692 ret = msi_compose_msg(NULL, irq, &msg, id);
3604 if (ret < 0) 3693 if (ret < 0)
3605 return ret; 3694 return ret;
3606 3695
3607 hpet_msi_write(irq, &msg); 3696 hpet_msi_write(irq, &msg);
3608 desc->status |= IRQ_MOVE_PCNTXT; 3697 desc->status |= IRQ_MOVE_PCNTXT;
3609 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3698 if (irq_remapped(irq))
3610 "edge"); 3699 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3700 handle_edge_irq, "edge");
3701 else
3702 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3703 handle_edge_irq, "edge");
3611 3704
3612 return 0; 3705 return 0;
3613} 3706}
@@ -3641,8 +3734,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3641 struct irq_cfg *cfg; 3734 struct irq_cfg *cfg;
3642 unsigned int dest; 3735 unsigned int dest;
3643 3736
3644 dest = set_desc_affinity(desc, mask); 3737 if (set_desc_affinity(desc, mask, &dest))
3645 if (dest == BAD_APICID)
3646 return -1; 3738 return -1;
3647 3739
3648 cfg = desc->chip_data; 3740 cfg = desc->chip_data;
@@ -3708,75 +3800,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3800}
3709#endif /* CONFIG_HT_IRQ */ 3801#endif /* CONFIG_HT_IRQ */
3710 3802
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
 3761 * no longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
 3778#endif /* CONFIG_X86_UV */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3803int __init io_apic_get_redir_entries (int ioapic)
3781{ 3804{
3782 union IO_APIC_reg_01 reg_01; 3805 union IO_APIC_reg_01 reg_01;
@@ -3944,7 +3967,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 3967 */
3945 3968
3946 if (physids_empty(apic_id_map)) 3969 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 3970 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 3971
3949 spin_lock_irqsave(&ioapic_lock, flags); 3972 spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 3973 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3960,10 +3983,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 3983 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 3984 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 3985 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 3986 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 3987
3965 for (i = 0; i < get_physical_broadcast(); i++) { 3988 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 3989 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 3990 break;
3968 } 3991 }
3969 3992
@@ -3976,7 +3999,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 3999 apic_id = i;
3977 } 4000 }
3978 4001
3979 tmp = apic->apicid_to_cpu_present(apic_id); 4002 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 4003 physids_or(apic_id_map, apic_id_map, tmp);
3981 4004
3982 if (reg_00.bits.ID != apic_id) { 4005 if (reg_00.bits.ID != apic_id) {
@@ -4106,7 +4129,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4129 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4130 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4131 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4132 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4133 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4134 }
4112 4135
@@ -4140,18 +4163,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4163#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4164fake_ioapic_page:
4142#endif 4165#endif
4143 ioapic_phys = (unsigned long) 4166 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4167 ioapic_phys = __pa(ioapic_phys);
4146 } 4168 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4169 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4170 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4171 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4172 ioapic_phys);
4151 idx++; 4173 idx++;
4152 4174
4153 ioapic_res->start = ioapic_phys; 4175 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4176 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4177 ioapic_res++;
4156 } 4178 }
4157} 4179}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6a188a..0159a69396cb 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,8 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_t backtrace_mask __read_mostly; 42/* For reliability, we're prepared to waste bits here. */
43static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
43 44
44/* nmi_active: 45/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 46 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -360,7 +361,7 @@ void stop_apic_nmi_watchdog(void *unused)
360 */ 361 */
361 362
362static DEFINE_PER_CPU(unsigned, last_irq_sum); 363static DEFINE_PER_CPU(unsigned, last_irq_sum);
363static DEFINE_PER_CPU(local_t, alert_counter); 364static DEFINE_PER_CPU(long, alert_counter);
364static DEFINE_PER_CPU(int, nmi_touch); 365static DEFINE_PER_CPU(int, nmi_touch);
365 366
366void touch_nmi_watchdog(void) 367void touch_nmi_watchdog(void)
@@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
414 } 415 }
415 416
416 /* We can be called before check_nmi_watchdog, hence NULL check. */ 417 /* We can be called before check_nmi_watchdog, hence NULL check. */
417 if (cpumask_test_cpu(cpu, &backtrace_mask)) { 418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
419 420
420 spin_lock(&lock); 421 spin_lock(&lock);
@@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
422 show_regs(regs); 423 show_regs(regs);
423 dump_stack(); 424 dump_stack();
424 spin_unlock(&lock); 425 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, &backtrace_mask); 426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
426 427
427 rc = 1; 428 rc = 1;
428 } 429 }
@@ -437,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
437 * Ayiee, looks like this CPU is stuck ... 438 * Ayiee, looks like this CPU is stuck ...
438 * wait a few IRQs (5 seconds) before doing the oops ... 439 * wait a few IRQs (5 seconds) before doing the oops ...
439 */ 440 */
440 local_inc(&__get_cpu_var(alert_counter)); 441 __this_cpu_inc(per_cpu_var(alert_counter));
441 if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) 442 if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz)
442 /* 443 /*
443 * die_nmi will return ONLY if NOTIFY_STOP happens.. 444 * die_nmi will return ONLY if NOTIFY_STOP happens..
444 */ 445 */
@@ -446,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
446 regs, panic_on_timeout); 447 regs, panic_on_timeout);
447 } else { 448 } else {
448 __get_cpu_var(last_irq_sum) = sum; 449 __get_cpu_var(last_irq_sum) = sum;
449 local_set(&__get_cpu_var(alert_counter), 0); 450 __this_cpu_write(per_cpu_var(alert_counter), 0);
450 } 451 }
451 452
452 /* see if the nmi watchdog went off */ 453 /* see if the nmi watchdog went off */
@@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void)
558{ 559{
559 int i; 560 int i;
560 561
561 cpumask_copy(&backtrace_mask, cpu_online_mask); 562 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
562 563
563 printk(KERN_INFO "sending NMI to all CPUs:\n"); 564 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR); 565 apic->send_IPI_all(NMI_VECTOR);
565 566
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 567 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 568 for (i = 0; i < 10 * 1000; i++) {
568 if (cpumask_empty(&backtrace_mask)) 569 if (cpumask_empty(to_cpumask(backtrace_mask)))
569 break; 570 break;
570 mdelay(1); 571 mdelay(1);
571 } 572 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 9c0629ceb528..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -329,10 +329,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
329 return cpu_all_mask; 329 return cpu_all_mask;
330} 330}
331 331
332static inline unsigned long 332static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
333numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
334{ 333{
335 return physid_isset(apicid, bitmap); 334 return physid_isset(apicid, *map);
336} 335}
337 336
338static inline unsigned long numaq_check_apicid_present(int bit) 337static inline unsigned long numaq_check_apicid_present(int bit)
@@ -366,10 +365,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
366 return apic != 0 && irq == 0; 365 return apic != 0 && irq == 0;
367} 366}
368 367
369static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 368static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
370{ 369{
371 /* We don't have a good way to do this yet - hack */ 370 /* We don't have a good way to do this yet - hack */
372 return physids_promote(0xFUL); 371 return physids_promote(0xFUL, retmap);
373} 372}
374 373
375static inline int numaq_cpu_to_logical_apicid(int cpu) 374static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -397,12 +396,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
397 return logical_apicid >> 4; 396 return logical_apicid >> 4;
398} 397}
399 398
400static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 399static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
401{ 400{
402 int node = numaq_apicid_to_node(logical_apicid); 401 int node = numaq_apicid_to_node(logical_apicid);
403 int cpu = __ffs(logical_apicid & 0xf); 402 int cpu = __ffs(logical_apicid & 0xf);
404 403
405 return physid_mask_of_physid(cpu + 4*node); 404 physid_set_mask_of_physid(cpu + 4*node, retmap);
406} 405}
407 406
408/* Where the IO area was mapped on multiquad, always 0 otherwise */ 407/* Where the IO area was mapped on multiquad, always 0 otherwise */
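
The numaq hunks follow the rework visible throughout this diff: physid masks are now filled in through a result pointer instead of being returned by value, which keeps large physid_mask_t objects off the stack. A before/after sketch (the caller is hypothetical):

static void example_present_map(int apicid, physid_mask_t *retmap)
{
	/* old style: *retmap = physid_mask_of_physid(apicid); */
	physid_set_mask_of_physid(apicid, retmap);
}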
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..1a6559f6768c 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -108,7 +108,7 @@ struct apic apic_default = {
108 .apicid_to_node = default_apicid_to_node, 108 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 110 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 111 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 112 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 113 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 114 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1c..450fe2064a14 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
67 } 67 }
68#endif 68#endif
69 69
70 if (apic == &apic_flat) { 70 if (apic == &apic_flat && num_processors > 8)
71 switch (boot_cpu_data.x86_vendor) { 71 apic = &apic_physflat;
72 case X86_VENDOR_INTEL:
73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
80 }
81 72
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83 74
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..cf69c59f4910 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
148 break; 148 break;
149 } 149 }
150 150
151 if (cpu < nr_cpu_ids) 151 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152 return per_cpu(x86_cpu_to_logical_apicid, cpu);
153
154 return BAD_APICID;
155} 152}
156 153
157static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..8972f38c5ced 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
146 break; 146 break;
147 } 147 }
148 148
149 if (cpu < nr_cpu_ids) 149 return per_cpu(x86_cpu_to_apicid, cpu);
150 return per_cpu(x86_cpu_to_apicid, cpu);
151
152 return BAD_APICID;
153} 150}
154 151
155static unsigned int x2apic_phys_get_apic_id(unsigned long x) 152static unsigned int x2apic_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1e09417c992f..21db3cbea7dc 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -36,6 +36,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
36 36
37static enum uv_system_type uv_system_type; 37static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr; 38static u64 gru_start_paddr, gru_end_paddr;
39int uv_min_hub_revision_id;
40EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
39 41
40static inline bool is_GRU_range(u64 start, u64 end) 42static inline bool is_GRU_range(u64 start, u64 end)
41{ 43{
@@ -55,12 +57,19 @@ static int early_get_nodeid(void)
55 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); 57 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
56 node_id.v = *mmr; 58 node_id.v = *mmr;
57 early_iounmap(mmr, sizeof(*mmr)); 59 early_iounmap(mmr, sizeof(*mmr));
60
61 /* Currently, all blades have same revision number */
62 uv_min_hub_revision_id = node_id.s.revision;
63
58 return node_id.s.node_id; 64 return node_id.s.node_id;
59} 65}
60 66
61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 67static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
62{ 68{
69 int nodeid;
70
63 if (!strcmp(oem_id, "SGI")) { 71 if (!strcmp(oem_id, "SGI")) {
72 nodeid = early_get_nodeid();
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 73 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
65 if (!strcmp(oem_table_id, "UVL")) 74 if (!strcmp(oem_table_id, "UVL"))
66 uv_system_type = UV_LEGACY_APIC; 75 uv_system_type = UV_LEGACY_APIC;
@@ -68,7 +77,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
68 uv_system_type = UV_X2APIC; 77 uv_system_type = UV_X2APIC;
69 else if (!strcmp(oem_table_id, "UVH")) { 78 else if (!strcmp(oem_table_id, "UVH")) {
70 __get_cpu_var(x2apic_extra_bits) = 79 __get_cpu_var(x2apic_extra_bits) =
71 early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); 80 nodeid << (UV_APIC_PNODE_SHIFT - 1);
72 uv_system_type = UV_NON_UNIQUE_APIC; 81 uv_system_type = UV_NON_UNIQUE_APIC;
73 return 1; 82 return 1;
74 } 83 }
@@ -225,10 +234,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
225 if (cpumask_test_cpu(cpu, cpu_online_mask)) 234 if (cpumask_test_cpu(cpu, cpu_online_mask))
226 break; 235 break;
227 } 236 }
228 if (cpu < nr_cpu_ids) 237 return per_cpu(x86_cpu_to_apicid, cpu);
229 return per_cpu(x86_cpu_to_apicid, cpu);
230
231 return BAD_APICID;
232} 238}
233 239
234static unsigned int x2apic_get_apic_id(unsigned long x) 240static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -365,25 +371,25 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
365 371
366 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { 372 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
367 alias.v = uv_read_local_mmr(redir_addrs[i].alias); 373 alias.v = uv_read_local_mmr(redir_addrs[i].alias);
368 if (alias.s.base == 0) { 374 if (alias.s.enable && alias.s.base == 0) {
369 *size = (1UL << alias.s.m_alias); 375 *size = (1UL << alias.s.m_alias);
370 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); 376 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
371 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; 377 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
372 return; 378 return;
373 } 379 }
374 } 380 }
375 BUG(); 381 *base = *size = 0;
376} 382}
377 383
378enum map_type {map_wb, map_uc}; 384enum map_type {map_wb, map_uc};
379 385
380static __init void map_high(char *id, unsigned long base, int shift, 386static __init void map_high(char *id, unsigned long base, int pshift,
381 int max_pnode, enum map_type map_type) 387 int bshift, int max_pnode, enum map_type map_type)
382{ 388{
383 unsigned long bytes, paddr; 389 unsigned long bytes, paddr;
384 390
385 paddr = base << shift; 391 paddr = base << pshift;
386 bytes = (1UL << shift) * (max_pnode + 1); 392 bytes = (1UL << bshift) * (max_pnode + 1);
387 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 393 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
388 paddr + bytes); 394 paddr + bytes);
389 if (map_type == map_uc) 395 if (map_type == map_uc)
@@ -399,7 +405,7 @@ static __init void map_gru_high(int max_pnode)
399 405
400 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 406 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
401 if (gru.s.enable) { 407 if (gru.s.enable) {
402 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 408 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
403 gru_start_paddr = ((u64)gru.s.base << shift); 409 gru_start_paddr = ((u64)gru.s.base << shift);
404 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); 410 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
405 411
@@ -413,7 +419,7 @@ static __init void map_mmr_high(int max_pnode)
413 419
414 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 420 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
415 if (mmr.s.enable) 421 if (mmr.s.enable)
416 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); 422 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
417} 423}
418 424
419static __init void map_mmioh_high(int max_pnode) 425static __init void map_mmioh_high(int max_pnode)
@@ -423,7 +429,14 @@ static __init void map_mmioh_high(int max_pnode)
423 429
424 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 430 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
425 if (mmioh.s.enable) 431 if (mmioh.s.enable)
426 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 432 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
433 max_pnode, map_uc);
434}
435
436static __init void map_low_mmrs(void)
437{
438 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
439 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
427} 440}
428 441
429static __init void uv_rtc_init(void) 442static __init void uv_rtc_init(void)
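
map_high() now takes separate shifts for the base address (pshift) and the per-pnode block size (bshift); MMIOH passes its m_io field as the block shift, while GRU and MMR keep the two equal. A sketch of the resulting sizing arithmetic (illustrative helper, mirroring the math inside map_high()):

static unsigned long example_region_bytes(int bshift, int max_pnode)
{
	/* one block of (1UL << bshift) bytes per pnode, pnodes 0..max_pnode */
	return (1UL << bshift) * (max_pnode + 1);
}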
@@ -567,6 +580,8 @@ void __init uv_system_init(void)
567 unsigned long mmr_base, present, paddr; 580 unsigned long mmr_base, present, paddr;
568 unsigned short pnode_mask; 581 unsigned short pnode_mask;
569 582
583 map_low_mmrs();
584
570 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 585 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
571 m_val = m_n_config.s.m_skt; 586 m_val = m_n_config.s.m_skt;
572 n_val = m_n_config.s.n_skt; 587 n_val = m_n_config.s.n_skt;
@@ -624,8 +639,10 @@ void __init uv_system_init(void)
624 uv_rtc_init(); 639 uv_rtc_init();
625 640
626 for_each_present_cpu(cpu) { 641 for_each_present_cpu(cpu) {
642 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
643
627 nid = cpu_to_node(cpu); 644 nid = cpu_to_node(cpu);
628 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 645 pnode = uv_apicid_to_pnode(apicid);
629 blade = boot_pnode_to_blade(pnode); 646 blade = boot_pnode_to_blade(pnode);
630 lcpu = uv_blade_info[blade].nr_possible_cpus; 647 lcpu = uv_blade_info[blade].nr_possible_cpus;
631 uv_blade_info[blade].nr_possible_cpus++; 648 uv_blade_info[blade].nr_possible_cpus++;
@@ -636,25 +653,23 @@ void __init uv_system_init(void)
636 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 653 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
637 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 654 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
638 uv_cpu_hub_info(cpu)->m_val = m_val; 655 uv_cpu_hub_info(cpu)->m_val = m_val;
639 uv_cpu_hub_info(cpu)->n_val = m_val; 656 uv_cpu_hub_info(cpu)->n_val = n_val;
640 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 657 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
641 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 658 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
642 uv_cpu_hub_info(cpu)->pnode = pnode; 659 uv_cpu_hub_info(cpu)->pnode = pnode;
643 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 660 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
644 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 661 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
645 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 662 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
646 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 663 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
647 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 664 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
648 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 665 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
649 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 666 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
650 uv_node_to_blade[nid] = blade; 667 uv_node_to_blade[nid] = blade;
651 uv_cpu_to_blade[cpu] = blade; 668 uv_cpu_to_blade[cpu] = blade;
652 max_pnode = max(pnode, max_pnode); 669 max_pnode = max(pnode, max_pnode);
653 670
654 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 671 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
655 "lcpu %d, blade %d\n", 672 cpu, apicid, pnode, nid, lcpu, blade);
656 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
657 lcpu, blade);
658 } 673 }
659 674
660 /* Add blade/pnode info for nodes without cpus */ 675 /* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..b5b6b23bce53 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
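
The APM_IOC_SUSPEND path above is careful to drop the new mutex before sleeping in wait_event_interruptible(); holding it across the wait would serialize a concurrent suspend request behind a sleeping caller. A hedged sketch of the pattern (all names hypothetical):

static DEFINE_MUTEX(example_mutex);

static int example_queue_and_wait(wait_queue_head_t *wq, int *done)
{
	mutex_lock(&example_mutex);
	/* ... publish the request while holding the lock ... */
	mutex_unlock(&example_mutex);	/* release before sleeping */

	return wait_event_interruptible(*wq, *done != 0);
}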
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 63a88e1f987d..b0206a211b09 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -101,21 +101,17 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
101} 101}
102 102
103int 103int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, 104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset) 105 unsigned long *intr_mmr_offset)
106{ 106{
107 union uv_watchlist_u size_blade;
108 u64 watchlist; 107 u64 watchlist;
109 s64 ret; 108 s64 ret;
110 109
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /* 110 /*
115 * bios returns watchlist number or negative error number. 111 * bios returns watchlist number or negative error number.
116 */ 112 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, 113 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset, 114 mq_size, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0); 115 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS) 116 if (ret < BIOS_STATUS_SUCCESS)
121 return ret; 117 return ret;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..c202b62f3671 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
@@ -18,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
20 21
21obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
22
23obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
24obj-$(CONFIG_CPU_SUP_AMD) += amd.o 23obj-$(CONFIG_CPU_SUP_AMD) += amd.o
25obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 24obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c965e5212714..468489b57aae 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -74,6 +74,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
74 unsigned int eax, ebx, ecx, edx, sub_index; 74 unsigned int eax, ebx, ecx, edx, sub_index;
75 unsigned int ht_mask_width, core_plus_mask_width; 75 unsigned int ht_mask_width, core_plus_mask_width;
76 unsigned int core_select_mask, core_level_siblings; 76 unsigned int core_select_mask, core_level_siblings;
77 static bool printed;
77 78
78 if (c->cpuid_level < 0xb) 79 if (c->cpuid_level < 0xb)
79 return; 80 return;
@@ -127,12 +128,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
127 128
128 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 129 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
129 130
130 131 if (!printed) {
131 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 132 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
132 c->phys_proc_id); 133 c->phys_proc_id);
133 if (c->x86_max_cores > 1) 134 if (c->x86_max_cores > 1)
134 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 135 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
135 c->cpu_core_id); 136 c->cpu_core_id);
137 printed = 1;
138 }
136 return; 139 return;
137#endif 140#endif
138} 141}
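This hunk, and the detect_ht() change further down, use the same boot-time print-once idiom: the topology lines used to be emitted once per CPU, flooding the log on large systems. Where a single message suffices the kernel already provides printk_once(); a static flag is only needed when several printk() calls must be grouped, roughly:

	/* single message: */
	printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");

	/* grouped messages (sketch of the pattern in the hunk above;
	 * an unlocked flag is tolerable here only because CPU bringup
	 * is serialized): */
	static bool printed;

	if (!printed) {
		printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
		       c->phys_proc_id);
		printk(KERN_INFO "CPU: Processor Core ID: %d\n",
		       c->cpu_core_id);
		printed = true;
	}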
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..e485825130d2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid)
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same. 257 * Assumption: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */ 258 */
261#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{ 261{
264#ifdef CONFIG_PCI 262 unsigned long long value;
265 u32 t, cpn; 263 u32 nodes, cores_per_node;
266 u8 n, n_id;
267 int cpu = smp_processor_id(); 264 int cpu = smp_processor_id();
268 265
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
267 return;
268
269 /* fixup topology information only once for a core */ 269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return; 271 return;
272 272
273 /* check for multi-node processor on boot cpu */ 273 rdmsrl(MSR_FAM10H_NODE_ID, value);
274 t = read_pci_config(0, 24, 3, 0xe8); 274
275 if (!(t & (1 << 29))) 275 nodes = ((value >> 3) & 7) + 1;
276 if (nodes == 1)
276 return; 277 return;
277 278
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 279 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes;
279 281
280 /* cores per node: each internal node has half the number of cores */ 282 /* store NodeID, use llc_shared_map to store sibling info */
281 cpn = c->x86_max_cores >> 1; 283 per_cpu(cpu_llc_id, cpu) = value & 7;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306 284
307 /* fixup core id to be in range from 0 to cpn */ 285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */
308 c->cpu_core_id = c->cpu_core_id % cpn; 286 c->cpu_core_id = c->cpu_core_id % cores_per_node;
309#endif
310} 287}
311#endif 288#endif
312 289
@@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
375 node = nearby_node(apicid); 352 node = nearby_node(apicid);
376 } 353 }
377 numa_set_node(cpu, node); 354 numa_set_node(cpu, node);
378
379 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
380#endif 355#endif
381} 356}
382 357
@@ -535,7 +510,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 510 }
536 } 511 }
537 512
538 display_cacheinfo(c); 513 cpu_detect_cache_sizes(c);
539 514
540 /* Multi core CPU? */ 515 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 516 if (c->extended_cpuid_level >= 0x80000008) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 18346da8c594..4868e4a951ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -432,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
432#ifdef CONFIG_X86_HT 427#ifdef CONFIG_X86_HT
433 u32 eax, ebx, ecx, edx; 428 u32 eax, ebx, ecx, edx;
434 int index_msb, core_bits; 429 int index_msb, core_bits;
430 static bool printed;
435 431
436 if (!cpu_has(c, X86_FEATURE_HT)) 432 if (!cpu_has(c, X86_FEATURE_HT))
437 return; 433 return;
@@ -447,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
447 smp_num_siblings = (ebx & 0xff0000) >> 16; 443 smp_num_siblings = (ebx & 0xff0000) >> 16;
448 444
449 if (smp_num_siblings == 1) { 445 if (smp_num_siblings == 1) {
450 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 446 printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
451 goto out; 447 goto out;
452 } 448 }
453 449
@@ -474,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
474 ((1 << core_bits) - 1); 470 ((1 << core_bits) - 1);
475 471
476out: 472out:
477 if ((c->x86_max_cores * smp_num_siblings) > 1) { 473 if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
478 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 474 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
479 c->phys_proc_id); 475 c->phys_proc_id);
480 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 476 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
481 c->cpu_core_id); 477 c->cpu_core_id);
478 printed = 1;
482 } 479 }
483#endif 480#endif
484} 481}
@@ -659,24 +656,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 656 const struct cpu_dev *const *cdev;
660 int count = 0; 657 int count = 0;
661 658
659#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 660 printk(KERN_INFO "KERNEL supported cpus:\n");
661#endif
662
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 664 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 665
667 if (count >= X86_VENDOR_NUM) 666 if (count >= X86_VENDOR_NUM)
668 break; 667 break;
669 cpu_devs[count] = cpudev; 668 cpu_devs[count] = cpudev;
670 count++; 669 count++;
671 670
672 for (j = 0; j < 2; j++) { 671#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 672 {
674 continue; 673 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 674
676 cpudev->c_ident[j]); 675 for (j = 0; j < 2; j++) {
676 if (!cpudev->c_ident[j])
677 continue;
678 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
679 cpudev->c_ident[j]);
680 }
677 } 681 }
682#endif
678 } 683 }
679
680 early_identify_cpu(&boot_cpu_data); 684 early_identify_cpu(&boot_cpu_data);
681} 685}
682 686
@@ -837,10 +841,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 841 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 842 }
839 843
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 844 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 845 mcheck_cpu_init(c);
843#endif
844 846
845 select_idle_routine(c); 847 select_idle_routine(c);
846 848
@@ -1093,7 +1095,7 @@ static void clear_all_debug_regs(void)
1093 1095
1094void __cpuinit cpu_init(void) 1096void __cpuinit cpu_init(void)
1095{ 1097{
1096 struct orig_ist *orig_ist; 1098 struct orig_ist *oist;
1097 struct task_struct *me; 1099 struct task_struct *me;
1098 struct tss_struct *t; 1100 struct tss_struct *t;
1099 unsigned long v; 1101 unsigned long v;
@@ -1102,7 +1104,7 @@ void __cpuinit cpu_init(void)
1102 1104
1103 cpu = stack_smp_processor_id(); 1105 cpu = stack_smp_processor_id();
1104 t = &per_cpu(init_tss, cpu); 1106 t = &per_cpu(init_tss, cpu);
1105 orig_ist = &per_cpu(orig_ist, cpu); 1107 oist = &per_cpu(orig_ist, cpu);
1106 1108
1107#ifdef CONFIG_NUMA 1109#ifdef CONFIG_NUMA
1108 if (cpu != 0 && percpu_read(node_number) == 0 && 1110 if (cpu != 0 && percpu_read(node_number) == 0 &&
@@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void)
1115 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) 1117 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1116 panic("CPU#%d already initialized!\n", cpu); 1118 panic("CPU#%d already initialized!\n", cpu);
1117 1119
1118 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1120 pr_debug("Initializing CPU#%d\n", cpu);
1119 1121
1120 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1122 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1121 1123
@@ -1143,12 +1145,12 @@ void __cpuinit cpu_init(void)
1143 /* 1145 /*
1144 * set up and load the per-CPU TSS 1146 * set up and load the per-CPU TSS
1145 */ 1147 */
1146 if (!orig_ist->ist[0]) { 1148 if (!oist->ist[0]) {
1147 char *estacks = per_cpu(exception_stacks, cpu); 1149 char *estacks = per_cpu(exception_stacks, cpu);
1148 1150
1149 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1151 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1150 estacks += exception_stack_sizes[v]; 1152 estacks += exception_stack_sizes[v];
1151 orig_ist->ist[v] = t->x86_tss.ist[v] = 1153 oist->ist[v] = t->x86_tss.ist[v] =
1152 (unsigned long)estacks; 1154 (unsigned long)estacks;
1153 } 1155 }
1154 } 1156 }
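The orig_ist-to-oist rename in cpu_init() looks cosmetic but is load-bearing: the local pointer shared its name with the per-CPU variable it points at, and with the per_cpu() accessors being converted to reference the variable symbol directly (the old per_cpu__ name mangling was being dropped around this time; an assumption about motivation, not stated in this hunk), a same-named local would shadow it. Roughly:

	DEFINE_PER_CPU(struct orig_ist, orig_ist);

	void sketch(int cpu)
	{
		/* ok: the local's name differs from the per-CPU symbol */
		struct orig_ist *oist = &per_cpu(orig_ist, cpu);

		/* a local named orig_ist would make the per_cpu()
		 * argument resolve to the local under C scoping and
		 * break the expansion */
		oist->ist[0] = 0;
	}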
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index dca325c03999..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licensing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count);
36
37static DEFINE_MUTEX(cpu_debug_lock);
38
39static struct dentry *cpu_debugfs_dir;
40
41static struct cpu_debug_base cpu_base[] = {
42 { "mc", CPU_MC, 0 },
43 { "monitor", CPU_MONITOR, 0 },
44 { "time", CPU_TIME, 0 },
45 { "pmc", CPU_PMC, 1 },
46 { "platform", CPU_PLATFORM, 0 },
47 { "apic", CPU_APIC, 0 },
48 { "poweron", CPU_POWERON, 0 },
49 { "control", CPU_CONTROL, 0 },
50 { "features", CPU_FEATURES, 0 },
51 { "lastbranch", CPU_LBRANCH, 0 },
52 { "bios", CPU_BIOS, 0 },
53 { "freq", CPU_FREQ, 0 },
54 { "mtrr", CPU_MTRR, 0 },
55 { "perf", CPU_PERF, 0 },
56 { "cache", CPU_CACHE, 0 },
57 { "sysenter", CPU_SYSENTER, 0 },
58 { "therm", CPU_THERM, 0 },
59 { "misc", CPU_MISC, 0 },
60 { "debug", CPU_DEBUG, 0 },
61 { "pat", CPU_PAT, 0 },
62 { "vmx", CPU_VMX, 0 },
63 { "call", CPU_CALL, 0 },
64 { "base", CPU_BASE, 0 },
65 { "ver", CPU_VER, 0 },
66 { "conf", CPU_CONF, 0 },
67 { "smm", CPU_SMM, 0 },
68 { "svm", CPU_SVM, 0 },
69 { "osvm", CPU_OSVM, 0 },
70 { "tss", CPU_TSS, 0 },
71 { "cr", CPU_CR, 0 },
72 { "dt", CPU_DT, 0 },
73 { "registers", CPU_REG_ALL, 0 },
74};
75
76static struct cpu_file_base cpu_file[] = {
77 { "index", CPU_REG_ALL, 0 },
78 { "value", CPU_REG_ALL, 1 },
79};
80
81/* CPU Registers Range */
82static struct cpu_debug_range cpu_reg_range[] = {
83 { 0x00000000, 0x00000001, CPU_MC, },
84 { 0x00000006, 0x00000007, CPU_MONITOR, },
85 { 0x00000010, 0x00000010, CPU_TIME, },
86 { 0x00000011, 0x00000013, CPU_PMC, },
87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
88 { 0x0000001B, 0x0000001B, CPU_APIC, },
89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
94 { 0x00000079, 0x00000079, CPU_BIOS, },
95 { 0x00000088, 0x0000008A, CPU_CACHE, },
96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
98 { 0x000000C1, 0x000000C4, CPU_PMC, },
99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
100 { 0x000000E7, 0x000000E8, CPU_PERF, },
101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
102
103 { 0x00000116, 0x0000011E, CPU_CACHE, },
104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
105 { 0x00000179, 0x0000017B, CPU_MC, },
106 { 0x00000186, 0x00000189, CPU_PMC, },
107 { 0x00000198, 0x00000199, CPU_PERF, },
108 { 0x0000019A, 0x0000019A, CPU_TIME, },
109 { 0x0000019B, 0x0000019D, CPU_THERM, },
110 { 0x000001A0, 0x000001A0, CPU_MISC, },
111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
115
116 { 0x00000200, 0x0000020F, CPU_MTRR, },
117 { 0x00000250, 0x00000250, CPU_MTRR, },
118 { 0x00000258, 0x00000259, CPU_MTRR, },
119 { 0x00000268, 0x0000026F, CPU_MTRR, },
120 { 0x00000277, 0x00000277, CPU_PAT, },
121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
122
123 { 0x00000300, 0x00000311, CPU_PMC, },
124 { 0x00000345, 0x00000345, CPU_PMC, },
125 { 0x00000360, 0x00000371, CPU_PMC, },
126 { 0x0000038D, 0x00000390, CPU_PMC, },
127 { 0x000003A0, 0x000003BE, CPU_PMC, },
128 { 0x000003C0, 0x000003CD, CPU_PMC, },
129 { 0x000003E0, 0x000003E1, CPU_PMC, },
130 { 0x000003F0, 0x000003F2, CPU_PMC, },
131
132 { 0x00000400, 0x00000417, CPU_MC, },
133 { 0x00000480, 0x0000048B, CPU_VMX, },
134
135 { 0x00000600, 0x00000600, CPU_DEBUG, },
136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
138
139 { 0x000107CC, 0x000107D3, CPU_PMC, },
140
141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
142 { 0xC0000081, 0xC0000084, CPU_CALL, },
143 { 0xC0000100, 0xC0000102, CPU_BASE, },
144 { 0xC0000103, 0xC0000103, CPU_TIME, },
145
146 { 0xC0010000, 0xC0010007, CPU_PMC, },
147 { 0xC0010010, 0xC0010010, CPU_CONF, },
148 { 0xC0010015, 0xC0010015, CPU_CONF, },
149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
151 { 0xC001001F, 0xC001001F, CPU_CONF, },
152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
153 { 0xC0010044, 0xC0010048, CPU_MC, },
154 { 0xC0010050, 0xC0010056, CPU_SMM, },
155 { 0xC0010058, 0xC0010058, CPU_CONF, },
156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
157 { 0xC0010061, 0xC0010068, CPU_SMM, },
158 { 0xC0010069, 0xC001006B, CPU_SMM, },
159 { 0xC0010070, 0xC0010071, CPU_SMM, },
160 { 0xC0010111, 0xC0010113, CPU_SMM, },
161 { 0xC0010114, 0xC0010118, CPU_SVM, },
162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
163 { 0xC0011022, 0xC0011023, CPU_CONF, },
164};
165
166static int is_typeflag_valid(unsigned cpu, unsigned flag)
167{
168 int i;
169
170	/* Standard registers should always be valid */
171 if (flag >= CPU_TSS)
172 return 1;
173
174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
175 if (cpu_reg_range[i].flag == flag)
176 return 1;
177 }
178
179 /* Invalid */
180 return 0;
181}
182
183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
184 int index, unsigned flag)
185{
186 if (cpu_reg_range[index].flag == flag) {
187 *min = cpu_reg_range[index].min;
188 *max = cpu_reg_range[index].max;
189 } else
190 *max = 0;
191
192 return *max;
193}
194
195/* This function can also be called with seq = NULL for printk */
196static void print_cpu_data(struct seq_file *seq, unsigned type,
197 u32 low, u32 high)
198{
199 struct cpu_private *priv;
200 u64 val = high;
201
202 if (seq) {
203 priv = seq->private;
204 if (priv->file) {
205 val = (val << 32) | low;
206 seq_printf(seq, "0x%llx\n", val);
207 } else
208 seq_printf(seq, " %08x: %08x_%08x\n",
209 type, high, low);
210 } else
211 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
212}
213
214/* This function can also be called with seq = NULL for printk */
215static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
216{
217 unsigned msr, msr_min, msr_max;
218 struct cpu_private *priv;
219 u32 low, high;
220 int i;
221
222 if (seq) {
223 priv = seq->private;
224 if (priv->file) {
225 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
226 &low, &high))
227 print_cpu_data(seq, priv->reg, low, high);
228 return;
229 }
230 }
231
232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
234 continue;
235
236 for (msr = msr_min; msr <= msr_max; msr++) {
237 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
238 continue;
239 print_cpu_data(seq, msr, low, high);
240 }
241 }
242}
243
244static void print_tss(void *arg)
245{
246 struct pt_regs *regs = task_pt_regs(current);
247 struct seq_file *seq = arg;
248 unsigned int seg;
249
250 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
251 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
252 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
253 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
254
255 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
256 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
257 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
258 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
259
260#ifdef CONFIG_X86_64
261 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
262 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
263 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
264 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
265 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
266 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
267 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
268 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
269#endif
270
271 asm("movl %%cs,%0" : "=r" (seg));
272 seq_printf(seq, " CS\t: %04x\n", seg);
273 asm("movl %%ds,%0" : "=r" (seg));
274 seq_printf(seq, " DS\t: %04x\n", seg);
275 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
276 asm("movl %%es,%0" : "=r" (seg));
277 seq_printf(seq, " ES\t: %04x\n", seg);
278 asm("movl %%fs,%0" : "=r" (seg));
279 seq_printf(seq, " FS\t: %04x\n", seg);
280 asm("movl %%gs,%0" : "=r" (seg));
281 seq_printf(seq, " GS\t: %04x\n", seg);
282
283 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
284
285 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
286}
287
288static void print_cr(void *arg)
289{
290 struct seq_file *seq = arg;
291
292 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
293 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
294 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
295 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
296#ifdef CONFIG_X86_64
297 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
298#endif
299}
300
301static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
302{
303 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
304}
305
306static void print_dt(void *seq)
307{
308 struct desc_ptr dt;
309 unsigned long ldt;
310
311 /* IDT */
312 store_idt((struct desc_ptr *)&dt);
313 print_desc_ptr("IDT", seq, dt);
314
315 /* GDT */
316 store_gdt((struct desc_ptr *)&dt);
317 print_desc_ptr("GDT", seq, dt);
318
319 /* LDT */
320 store_ldt(ldt);
321 seq_printf(seq, " LDT\t: %016lx\n", ldt);
322
323 /* TR */
324 store_tr(ldt);
325 seq_printf(seq, " TR\t: %016lx\n", ldt);
326}
327
328static void print_dr(void *arg)
329{
330 struct seq_file *seq = arg;
331 unsigned long dr;
332 int i;
333
334 for (i = 0; i < 8; i++) {
335 /* Ignore db4, db5 */
336 if ((i == 4) || (i == 5))
337 continue;
338 get_debugreg(dr, i);
339 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
340 }
341
342 seq_printf(seq, "\n MSR\t:\n");
343}
344
345static void print_apic(void *arg)
346{
347 struct seq_file *seq = arg;
348
349#ifdef CONFIG_X86_LOCAL_APIC
350 seq_printf(seq, " LAPIC\t:\n");
351 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
352 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
353 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
354 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
355 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
356 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
357 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
358 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
359 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
360 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
361 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
362 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
363 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
364 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
365 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
366 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
367 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
368 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
386 seq_printf(seq, "\n MSR\t:\n");
387}
388
389static int cpu_seq_show(struct seq_file *seq, void *v)
390{
391 struct cpu_private *priv = seq->private;
392
393 if (priv == NULL)
394 return -EINVAL;
395
396 switch (cpu_base[priv->type].flag) {
397 case CPU_TSS:
398 smp_call_function_single(priv->cpu, print_tss, seq, 1);
399 break;
400 case CPU_CR:
401 smp_call_function_single(priv->cpu, print_cr, seq, 1);
402 break;
403 case CPU_DT:
404 smp_call_function_single(priv->cpu, print_dt, seq, 1);
405 break;
406 case CPU_DEBUG:
407 if (priv->file == CPU_INDEX_BIT)
408 smp_call_function_single(priv->cpu, print_dr, seq, 1);
409 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
410 break;
411 case CPU_APIC:
412 if (priv->file == CPU_INDEX_BIT)
413 smp_call_function_single(priv->cpu, print_apic, seq, 1);
414 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
415 break;
416
417 default:
418 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
419 break;
420 }
421 seq_printf(seq, "\n");
422
423 return 0;
424}
425
426static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
427{
428 if (*pos == 0) /* One time is enough ;-) */
429 return seq;
430
431 return NULL;
432}
433
434static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435{
436 (*pos)++;
437
438 return cpu_seq_start(seq, pos);
439}
440
441static void cpu_seq_stop(struct seq_file *seq, void *v)
442{
443}
444
445static const struct seq_operations cpu_seq_ops = {
446 .start = cpu_seq_start,
447 .next = cpu_seq_next,
448 .stop = cpu_seq_stop,
449 .show = cpu_seq_show,
450};
451
452static int cpu_seq_open(struct inode *inode, struct file *file)
453{
454 struct cpu_private *priv = inode->i_private;
455 struct seq_file *seq;
456 int err;
457
458 err = seq_open(file, &cpu_seq_ops);
459 if (!err) {
460 seq = file->private_data;
461 seq->private = priv;
462 }
463
464 return err;
465}
466
467static int write_msr(struct cpu_private *priv, u64 val)
468{
469 u32 low, high;
470
471 high = (val >> 32) & 0xffffffff;
472 low = val & 0xffffffff;
473
474 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
475 return 0;
476
477 return -EPERM;
478}
479
480static int write_cpu_register(struct cpu_private *priv, const char *buf)
481{
482 int ret = -EPERM;
483 u64 val;
484
485 ret = strict_strtoull(buf, 0, &val);
486 if (ret < 0)
487 return ret;
488
489 /* Supporting only MSRs */
490 if (priv->type < CPU_TSS_BIT)
491 return write_msr(priv, val);
492
493 return ret;
494}
495
496static ssize_t cpu_write(struct file *file, const char __user *ubuf,
497 size_t count, loff_t *off)
498{
499 struct seq_file *seq = file->private_data;
500 struct cpu_private *priv = seq->private;
501 char buf[19];
502
503 if ((priv == NULL) || (count >= sizeof(buf)))
504 return -EINVAL;
505
506 if (copy_from_user(&buf, ubuf, count))
507 return -EFAULT;
508
509 buf[count] = 0;
510
511 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
512 if (!write_cpu_register(priv, buf))
513 return count;
514
515 return -EACCES;
516}
517
518static const struct file_operations cpu_fops = {
519 .owner = THIS_MODULE,
520 .open = cpu_seq_open,
521 .read = seq_read,
522 .write = cpu_write,
523 .llseek = seq_lseek,
524 .release = seq_release,
525};
526
527static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
528 unsigned file, struct dentry *dentry)
529{
530 struct cpu_private *priv = NULL;
531
532	/* Already initialized */
533 if (file == CPU_INDEX_BIT)
534 if (per_cpu(cpu_arr[type].init, cpu))
535 return 0;
536
537 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
538 if (priv == NULL)
539 return -ENOMEM;
540
541 priv->cpu = cpu;
542 priv->type = type;
543 priv->reg = reg;
544 priv->file = file;
545 mutex_lock(&cpu_debug_lock);
546 per_cpu(priv_arr[type], cpu) = priv;
547 per_cpu(cpu_priv_count, cpu)++;
548 mutex_unlock(&cpu_debug_lock);
549
550 if (file)
551 debugfs_create_file(cpu_file[file].name, S_IRUGO,
552 dentry, (void *)priv, &cpu_fops);
553 else {
554 debugfs_create_file(cpu_base[type].name, S_IRUGO,
555 per_cpu(cpu_arr[type].dentry, cpu),
556 (void *)priv, &cpu_fops);
557 mutex_lock(&cpu_debug_lock);
558 per_cpu(cpu_arr[type].init, cpu) = 1;
559 mutex_unlock(&cpu_debug_lock);
560 }
561
562 return 0;
563}
564
565static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
566 struct dentry *dentry)
567{
568 unsigned file;
569 int err = 0;
570
571 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
572 err = cpu_create_file(cpu, type, reg, file, dentry);
573 if (err)
574 return err;
575 }
576
577 return err;
578}
579
580static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
581{
582 struct dentry *cpu_dentry = NULL;
583 unsigned reg, reg_min, reg_max;
584 int i, err = 0;
585 char reg_dir[12];
586 u32 low, high;
587
588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
590 cpu_base[type].flag))
591 continue;
592
593 for (reg = reg_min; reg <= reg_max; reg++) {
594 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
595 continue;
596
597 sprintf(reg_dir, "0x%x", reg);
598 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
599 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
600 if (err)
601 return err;
602 }
603 }
604
605 return err;
606}
607
608static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
609{
610 struct dentry *cpu_dentry = NULL;
611 unsigned type;
612 int err = 0;
613
614 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
615 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
616 continue;
617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
618 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
619
620 if (type < CPU_TSS_BIT)
621 err = cpu_init_msr(cpu, type, cpu_dentry);
622 else
623 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
624 cpu_dentry);
625 if (err)
626 return err;
627 }
628
629 return err;
630}
631
632static int cpu_init_cpu(void)
633{
634 struct dentry *cpu_dentry = NULL;
635 struct cpuinfo_x86 *cpui;
636 char cpu_dir[12];
637 unsigned cpu;
638 int err = 0;
639
640 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
641 cpui = &cpu_data(cpu);
642 if (!cpu_has(cpui, X86_FEATURE_MSR))
643 continue;
644
645 sprintf(cpu_dir, "cpu%d", cpu);
646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
647 err = cpu_init_allreg(cpu, cpu_dentry);
648
649 pr_info("cpu%d(%d) debug files %d\n",
650 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
651 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
652 pr_err("Register files count %d exceeds limit %d\n",
653 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
654 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
655 err = -ENFILE;
656 }
657 if (err)
658 return err;
659 }
660
661 return err;
662}
663
664static int __init cpu_debug_init(void)
665{
666 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
667
668 return cpu_init_cpu();
669}
670
671static void __exit cpu_debug_exit(void)
672{
673 int i, cpu;
674
675 if (cpu_debugfs_dir)
676 debugfs_remove_recursive(cpu_debugfs_dir);
677
678 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
679 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
680 kfree(per_cpu(priv_arr[i], cpu));
681}
682
683module_init(cpu_debug_init);
684module_exit(cpu_debug_exit);
685
686MODULE_AUTHOR("Jaswinder Singh Rajput");
687MODULE_DESCRIPTION("CPU Debug module");
688MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 7d5c3b0ea8da..1b1920fa7c80 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -68,9 +68,9 @@ struct acpi_cpufreq_data {
68 unsigned int cpu_feature; 68 unsigned int cpu_feature;
69}; 69};
70 70
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
72 72
73static DEFINE_PER_CPU(struct aperfmperf, old_perf); 73static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
74 74
75/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
76static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
@@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd)
190 190
191static void drv_read(struct drv_cmd *cmd) 191static void drv_read(struct drv_cmd *cmd)
192{ 192{
193 int err;
193 cmd->val = 0; 194 cmd->val = 0;
194 195
195 smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); 196 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
197 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
196} 198}
197 199
198static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
@@ -214,14 +216,14 @@ static u32 get_cur_val(const struct cpumask *mask)
214 if (unlikely(cpumask_empty(mask))) 216 if (unlikely(cpumask_empty(mask)))
215 return 0; 217 return 0;
216 218
217 switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { 219 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
218 case SYSTEM_INTEL_MSR_CAPABLE: 220 case SYSTEM_INTEL_MSR_CAPABLE:
219 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 221 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
220 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 222 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
221 break; 223 break;
222 case SYSTEM_IO_CAPABLE: 224 case SYSTEM_IO_CAPABLE:
223 cmd.type = SYSTEM_IO_CAPABLE; 225 cmd.type = SYSTEM_IO_CAPABLE;
224 perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; 226 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
225 cmd.addr.io.port = perf->control_register.address; 227 cmd.addr.io.port = perf->control_register.address;
226 cmd.addr.io.bit_width = perf->control_register.bit_width; 228 cmd.addr.io.bit_width = perf->control_register.bit_width;
227 break; 229 break;
@@ -268,8 +270,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) 270 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
269 return 0; 271 return 0;
270 272
271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); 273 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
272 per_cpu(old_perf, cpu) = perf; 274 per_cpu(acfreq_old_perf, cpu) = perf;
273 275
274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; 276 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
275 277
@@ -278,7 +280,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
278 280
279static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 281static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
280{ 282{
281 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 283 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
282 unsigned int freq; 284 unsigned int freq;
283 unsigned int cached_freq; 285 unsigned int cached_freq;
284 286
@@ -322,7 +324,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
322static int acpi_cpufreq_target(struct cpufreq_policy *policy, 324static int acpi_cpufreq_target(struct cpufreq_policy *policy,
323 unsigned int target_freq, unsigned int relation) 325 unsigned int target_freq, unsigned int relation)
324{ 326{
325 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 327 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
326 struct acpi_processor_performance *perf; 328 struct acpi_processor_performance *perf;
327 struct cpufreq_freqs freqs; 329 struct cpufreq_freqs freqs;
328 struct drv_cmd cmd; 330 struct drv_cmd cmd;
@@ -416,7 +418,7 @@ out:
416 418
417static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 419static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
418{ 420{
419 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 421 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
420 422
421 dprintk("acpi_cpufreq_verify\n"); 423 dprintk("acpi_cpufreq_verify\n");
422 424
@@ -526,15 +528,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
526 528
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) 529static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{ 530{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf 531 /* Intel Xeon Processor 7100 Series Specification Update
532 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an 533 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause 534 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/ 535 * Both Processor Cores to Lock Up. */
533 if (c->x86_vendor == X86_VENDOR_INTEL) { 536 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) && 537 if ((c->x86 == 15) &&
535 (c->x86_model == 6) && 538 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable()) 539 (c->x86_mask == 8)) {
540 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
541 "Xeon(R) 7100 Errata AL30, processors may "
542 "lock up on frequency changes: disabling "
543 "acpi-cpufreq.\n");
537 return -ENODEV; 544 return -ENODEV;
545 }
538 } 546 }
539 return 0; 547 return 0;
540} 548}
@@ -549,13 +557,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
549 unsigned int result = 0; 557 unsigned int result = 0;
550 struct cpuinfo_x86 *c = &cpu_data(policy->cpu); 558 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
551 struct acpi_processor_performance *perf; 559 struct acpi_processor_performance *perf;
560#ifdef CONFIG_SMP
561 static int blacklisted;
562#endif
552 563
553 dprintk("acpi_cpufreq_cpu_init\n"); 564 dprintk("acpi_cpufreq_cpu_init\n");
554 565
555#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c); 567 if (blacklisted)
557 if (result) 568 return blacklisted;
558 return result; 569 blacklisted = acpi_cpufreq_blacklist(c);
570 if (blacklisted)
571 return blacklisted;
559#endif 572#endif
560 573
561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 574 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
@@ -563,7 +576,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
563 return -ENOMEM; 576 return -ENOMEM;
564 577
565 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); 578 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
566 per_cpu(drv_data, cpu) = data; 579 per_cpu(acfreq_data, cpu) = data;
567 580
568 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 581 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
569 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 582 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +727,20 @@ err_unreg:
714 acpi_processor_unregister_performance(perf, cpu); 727 acpi_processor_unregister_performance(perf, cpu);
715err_free: 728err_free:
716 kfree(data); 729 kfree(data);
717 per_cpu(drv_data, cpu) = NULL; 730 per_cpu(acfreq_data, cpu) = NULL;
718 731
719 return result; 732 return result;
720} 733}
721 734
722static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 735static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
723{ 736{
724 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 737 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
725 738
726 dprintk("acpi_cpufreq_cpu_exit\n"); 739 dprintk("acpi_cpufreq_cpu_exit\n");
727 740
728 if (data) { 741 if (data) {
729 cpufreq_frequency_table_put_attr(policy->cpu); 742 cpufreq_frequency_table_put_attr(policy->cpu);
730 per_cpu(drv_data, policy->cpu) = NULL; 743 per_cpu(acfreq_data, policy->cpu) = NULL;
731 acpi_processor_unregister_performance(data->acpi_data, 744 acpi_processor_unregister_performance(data->acpi_data,
732 policy->cpu); 745 policy->cpu);
733 kfree(data); 746 kfree(data);
@@ -738,7 +751,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
738 751
739static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 752static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
740{ 753{
741 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 754 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
742 755
743 dprintk("acpi_cpufreq_resume\n"); 756 dprintk("acpi_cpufreq_resume\n");
744 757
@@ -753,14 +766,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
753}; 766};
754 767
755static struct cpufreq_driver acpi_cpufreq_driver = { 768static struct cpufreq_driver acpi_cpufreq_driver = {
756 .verify = acpi_cpufreq_verify, 769 .verify = acpi_cpufreq_verify,
757 .target = acpi_cpufreq_target, 770 .target = acpi_cpufreq_target,
758 .init = acpi_cpufreq_cpu_init, 771 .bios_limit = acpi_processor_get_bios_limit,
759 .exit = acpi_cpufreq_cpu_exit, 772 .init = acpi_cpufreq_cpu_init,
760 .resume = acpi_cpufreq_resume, 773 .exit = acpi_cpufreq_cpu_exit,
761 .name = "acpi-cpufreq", 774 .resume = acpi_cpufreq_resume,
762 .owner = THIS_MODULE, 775 .name = "acpi-cpufreq",
763 .attr = acpi_cpufreq_attr, 776 .owner = THIS_MODULE,
777 .attr = acpi_cpufreq_attr,
764}; 778};
765 779
766static int __init acpi_cpufreq_init(void) 780static int __init acpi_cpufreq_init(void)
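The two-column hunk above is mostly re-indentation; the one functional change is the new .bios_limit hook, wired to acpi_processor_get_bios_limit(). Reassembled from the right-hand column, the resulting ops table reads:

	static struct cpufreq_driver acpi_cpufreq_driver = {
		.verify		= acpi_cpufreq_verify,
		.target		= acpi_cpufreq_target,
		.bios_limit	= acpi_processor_get_bios_limit,	/* new */
		.init		= acpi_cpufreq_cpu_init,
		.exit		= acpi_cpufreq_cpu_exit,
		.resume		= acpi_cpufreq_resume,
		.name		= "acpi-cpufreq",
		.owner		= THIS_MODULE,
		.attr		= acpi_cpufreq_attr,
	};

The powernow-k7 and powernow-k8 hunks below make the same addition to their driver tables.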
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad9..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); 813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break; 814 break;
815 case 1 ... 15: 815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V1; 816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) { 817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2; 818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]"; 819 cpuname = "C3 'Samuel 2' [C5B]";
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
885 885
886 /* Find ACPI data for processor */ 886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, 887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, 888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr); 889 NULL, (void *)&pr);
890 890
891 /* Check ACPI support for C3 state */ 891 /* Check ACPI support for C3 state */
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..cb01dac267d3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
164 } 164 }
165 165
166 /* cpuinfo and default policy values */ 166 /* cpuinfo and default policy values */
167 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 167 policy->cpuinfo.transition_latency = 200000;
168 policy->cur = busfreq * max_multiplier; 168 policy->cur = busfreq * max_multiplier;
169 169
170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
714}; 714};
715 715
716static struct cpufreq_driver powernow_driver = { 716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .exit = powernow_cpu_exit, 721 .bios_limit = acpi_processor_get_bios_limit,
722 .name = "powernow-k7", 722#endif
723 .owner = THIS_MODULE, 723 .init = powernow_cpu_init,
724 .attr = powernow_table_attr, 724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
725}; 728};
726 729
727static int __init powernow_init(void) 730static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6394aa5c7985..6e44519960c8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
1022 * set it to 1 to avoid problems in the future. 1022 * set it to 1 to avoid problems in the future.
1023 * For all others it's a BIOS bug. 1023 * For all others it's a BIOS bug.
1024 */ 1024 */
1025 if (!boot_cpu_data.x86 == 0x11) 1025 if (boot_cpu_data.x86 != 0x11)
1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1027 "latency\n"); 1027 "latency\n");
1028 max_latency = 1; 1028 max_latency = 1;
@@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1118static int powernowk8_target(struct cpufreq_policy *pol, 1118static int powernowk8_target(struct cpufreq_policy *pol,
1119 unsigned targfreq, unsigned relation) 1119 unsigned targfreq, unsigned relation)
1120{ 1120{
1121 cpumask_t oldmask; 1121 cpumask_var_t oldmask;
1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1123 u32 checkfid; 1123 u32 checkfid;
1124 u32 checkvid; 1124 u32 checkvid;
@@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1131 checkfid = data->currfid; 1131 checkfid = data->currfid;
1132 checkvid = data->currvid; 1132 checkvid = data->currvid;
1133 1133
1134 /* only run on specific CPU from here on */ 1134 /* only run on specific CPU from here on. */
1135 oldmask = current->cpus_allowed; 1135 /* This is poor form: use a workqueue or smp_call_function_single */
1136 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1136 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1137 return -ENOMEM;
1138
1139 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1140 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1137 1141
1138 if (smp_processor_id() != pol->cpu) { 1142 if (smp_processor_id() != pol->cpu) {
1139 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1143 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1193 ret = 0; 1197 ret = 0;
1194 1198
1195err_out: 1199err_out:
1196 set_cpus_allowed_ptr(current, &oldmask); 1200 set_cpus_allowed_ptr(current, oldmask);
1201 free_cpumask_var(oldmask);
1197 return ret; 1202 return ret;
1198} 1203}
1199 1204
@@ -1351,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1351 1356
1352 kfree(data->powernow_table); 1357 kfree(data->powernow_table);
1353 kfree(data); 1358 kfree(data);
1359 per_cpu(powernow_data, pol->cpu) = NULL;
1354 1360
1355 return 0; 1361 return 0;
1356} 1362}
@@ -1370,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1370 int err; 1376 int err;
1371 1377
1372 if (!data) 1378 if (!data)
1373 return -EINVAL; 1379 return 0;
1374 1380
1375 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1381 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1376 if (err) 1382 if (err)
@@ -1393,14 +1399,15 @@ static struct freq_attr *powernow_k8_attr[] = {
1393}; 1399};
1394 1400
1395static struct cpufreq_driver cpufreq_amd64_driver = { 1401static struct cpufreq_driver cpufreq_amd64_driver = {
1396 .verify = powernowk8_verify, 1402 .verify = powernowk8_verify,
1397 .target = powernowk8_target, 1403 .target = powernowk8_target,
1398 .init = powernowk8_cpu_init, 1404 .bios_limit = acpi_processor_get_bios_limit,
1399 .exit = __devexit_p(powernowk8_cpu_exit), 1405 .init = powernowk8_cpu_init,
1400 .get = powernowk8_get, 1406 .exit = __devexit_p(powernowk8_cpu_exit),
1401 .name = "powernow-k8", 1407 .get = powernowk8_get,
1402 .owner = THIS_MODULE, 1408 .name = "powernow-k8",
1403 .attr = powernow_k8_attr, 1409 .owner = THIS_MODULE,
1410 .attr = powernow_k8_attr,
1404}; 1411};
1405 1412
1406/* driver entry point for init */ 1413/* driver entry point for init */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f6..2ce8e0b5cc54 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 39
40/* speedstep_processor 40/* speedstep_processor
41 */ 41 */
42static unsigned int speedstep_processor; 42static enum speedstep_processor speedstep_processor;
43 43
44static u32 pmbase; 44static u32 pmbase;
45 45
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
232 return 0; 232 return 0;
233} 233}
234 234
235struct get_freq_data { 235static void get_freq_data(void *_speed)
236 unsigned int speed;
237 unsigned int processor;
238};
239
240static void get_freq_data(void *_data)
241{ 236{
242 struct get_freq_data *data = _data; 237 unsigned int *speed = _speed;
243 238
244 data->speed = speedstep_get_frequency(data->processor); 239 *speed = speedstep_get_frequency(speedstep_processor);
245} 240}
246 241
247static unsigned int speedstep_get(unsigned int cpu) 242static unsigned int speedstep_get(unsigned int cpu)
248{ 243{
249 struct get_freq_data data = { .processor = cpu }; 244 unsigned int speed;
250 245
251 /* You're supposed to ensure CPU is online. */ 246 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) 247 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
253 BUG(); 248 BUG();
254 249
255 dprintk("detected %u kHz as current frequency\n", data.speed); 250 dprintk("detected %u kHz as current frequency\n", speed);
256 return data.speed; 251 return speed;
257} 252}
258 253
259/** 254/**
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..ad0083abfa23 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -34,7 +34,7 @@ static int relaxed_check;
34 * GET PROCESSOR CORE SPEED IN KHZ * 34 * GET PROCESSOR CORE SPEED IN KHZ *
35 *********************************************************************/ 35 *********************************************************************/
36 36
37static unsigned int pentium3_get_frequency(unsigned int processor) 37static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
38{ 38{
39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
40 struct { 40 struct {
@@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void)
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */ 229/* Warning: may get called from smp_call_function_single. */
230unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(enum speedstep_processor processor)
231{ 231{
232 switch (processor) { 232 switch (processor) {
233 case SPEEDSTEP_CPU_PCORE: 233 case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
380 * DETECT SPEEDSTEP SPEEDS * 380 * DETECT SPEEDSTEP SPEEDS *
381 *********************************************************************/ 381 *********************************************************************/
382 382
383unsigned int speedstep_get_freqs(unsigned int processor, 383unsigned int speedstep_get_freqs(enum speedstep_processor processor,
384 unsigned int *low_speed, 384 unsigned int *low_speed,
385 unsigned int *high_speed, 385 unsigned int *high_speed,
386 unsigned int *transition_latency, 386 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
11 11
12 12
13/* processors */ 13/* processors */
14 14enum speedstep_processor {
15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ 16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ 17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ 18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected 19/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 20 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_frequency() call. */ 21 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ 22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ 23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ 24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -31,10 +31,10 @@
31 31
32 32
33/* detect a speedstep-capable processor */ 33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void); 34extern enum speedstep_processor speedstep_detect_processor(void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
42 * SPEEDSTEP_LOW; the second argument is zero so that no 42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated. 43 * cpufreq_notify_transition calls are initiated.
44 */ 44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor, 45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed, 46 unsigned int *low_speed,
47 unsigned int *high_speed, 47 unsigned int *high_speed,
48 unsigned int *transition_latency, 48 unsigned int *transition_latency,
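Converting the processor IDs from #defines to enum speedstep_processor documents intent at every call site and lets the compiler help: gcc's -Wswitch (enabled by -Wall) warns when a switch over the enum misses an enumerator, which raw integer constants cannot provide. A minimal sketch of that benefit, assuming the enum above; speedstep_cpu_name() is a hypothetical helper, not part of the patch:

static const char *speedstep_cpu_name(enum speedstep_processor p)
{
	switch (p) {
	case SPEEDSTEP_CPU_PIII_C_EARLY:
	case SPEEDSTEP_CPU_PIII_C:	return "Pentium III (Coppermine)";
	case SPEEDSTEP_CPU_PIII_T:	return "Pentium III (Tualatin)";
	case SPEEDSTEP_CPU_P4M:		return "Pentium 4-M";
	case SPEEDSTEP_CPU_PM:		return "Pentium M";
	case SPEEDSTEP_CPU_P4D:		return "desktop Pentium 4";
	case SPEEDSTEP_CPU_PCORE:	return "Core";
	}
	return "unknown";	/* reached only for out-of-range values */
}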
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..04d73c114e49 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -35,7 +35,7 @@ static int smi_cmd;
35static unsigned int smi_sig; 35static unsigned int smi_sig;
36 36
37/* info about the processor */ 37/* info about the processor */
38static unsigned int speedstep_processor; 38static enum speedstep_processor speedstep_processor;
39 39
40/* 40/*
41 * There are only two frequency states for each processor. Values 41 * There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formerly known as the GX2) */ 373 /* Handle the GX (Formerly known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index c900b73f9224..879666f4d871 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,7 +70,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
74 sched_clock_stable = 1; 73 sched_clock_stable = 1;
75 } 74 }
76 75
@@ -270,8 +269,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
270 node = cpu_to_node(cpu); 269 node = cpu_to_node(cpu);
271 } 270 }
272 numa_set_node(cpu, node); 271 numa_set_node(cpu, node);
273
274 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
275#endif 272#endif
276} 273}
277 274
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 804c40e2bc3e..fc6c8ef92dcc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
@@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 108 { 0x00, 0, 0}
106}; 109};
107 110
@@ -488,22 +491,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488#endif 491#endif
489 } 492 }
490 493
491 if (trace)
492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
493 else if (l1i)
494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
495
496 if (l1d)
497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
498 else
499 printk(KERN_CONT "\n");
500
501 if (l2)
502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
503
504 if (l3)
505 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
506
507 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 494 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
508 495
509 return l2; 496 return l2;
@@ -512,26 +499,27 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
512#ifdef CONFIG_SYSFS 499#ifdef CONFIG_SYSFS
513 500
514/* pointer to _cpuid4_info array (for each cache leaf) */ 501/* pointer to _cpuid4_info array (for each cache leaf) */
515static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 502static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
516#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 503#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
517 504
518#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
519static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 506static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
520{ 507{
521 struct _cpuid4_info *this_leaf, *sibling_leaf; 508 struct _cpuid4_info *this_leaf, *sibling_leaf;
522 unsigned long num_threads_sharing; 509 unsigned long num_threads_sharing;
523 int index_msb, i; 510 int index_msb, i, sibling;
524 struct cpuinfo_x86 *c = &cpu_data(cpu); 511 struct cpuinfo_x86 *c = &cpu_data(cpu);
525 512
526 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 513 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
527 struct cpuinfo_x86 *d; 514 for_each_cpu(i, c->llc_shared_map) {
528 for_each_online_cpu(i) { 515 if (!per_cpu(ici_cpuid4_info, i))
529 if (!per_cpu(cpuid4_info, i))
530 continue; 516 continue;
531 d = &cpu_data(i);
532 this_leaf = CPUID4_INFO_IDX(i, index); 517 this_leaf = CPUID4_INFO_IDX(i, index);
533 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), 518 for_each_cpu(sibling, c->llc_shared_map) {
534 d->llc_shared_map); 519 if (!cpu_online(sibling))
520 continue;
521 set_bit(sibling, this_leaf->shared_cpu_map);
522 }
535 } 523 }
536 return; 524 return;
537 } 525 }
@@ -548,7 +536,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
548 c->apicid >> index_msb) { 536 c->apicid >> index_msb) {
549 cpumask_set_cpu(i, 537 cpumask_set_cpu(i,
550 to_cpumask(this_leaf->shared_cpu_map)); 538 to_cpumask(this_leaf->shared_cpu_map));
551 if (i != cpu && per_cpu(cpuid4_info, i)) { 539 if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
552 sibling_leaf = 540 sibling_leaf =
553 CPUID4_INFO_IDX(i, index); 541 CPUID4_INFO_IDX(i, index);
554 cpumask_set_cpu(cpu, to_cpumask( 542 cpumask_set_cpu(cpu, to_cpumask(
@@ -587,8 +575,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
587 for (i = 0; i < num_cache_leaves; i++) 575 for (i = 0; i < num_cache_leaves; i++)
588 cache_remove_shared_cpu_map(cpu, i); 576 cache_remove_shared_cpu_map(cpu, i);
589 577
590 kfree(per_cpu(cpuid4_info, cpu)); 578 kfree(per_cpu(ici_cpuid4_info, cpu));
591 per_cpu(cpuid4_info, cpu) = NULL; 579 per_cpu(ici_cpuid4_info, cpu) = NULL;
592} 580}
593 581
594static int 582static int
@@ -627,15 +615,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
627 if (num_cache_leaves == 0) 615 if (num_cache_leaves == 0)
628 return -ENOENT; 616 return -ENOENT;
629 617
630 per_cpu(cpuid4_info, cpu) = kzalloc( 618 per_cpu(ici_cpuid4_info, cpu) = kzalloc(
631 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 619 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
632 if (per_cpu(cpuid4_info, cpu) == NULL) 620 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
633 return -ENOMEM; 621 return -ENOMEM;
634 622
635 smp_call_function_single(cpu, get_cpu_leaves, &retval, true); 623 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
636 if (retval) { 624 if (retval) {
637 kfree(per_cpu(cpuid4_info, cpu)); 625 kfree(per_cpu(ici_cpuid4_info, cpu));
638 per_cpu(cpuid4_info, cpu) = NULL; 626 per_cpu(ici_cpuid4_info, cpu) = NULL;
639 } 627 }
640 628
641 return retval; 629 return retval;
@@ -647,7 +635,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
647extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ 635extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
648 636
649/* pointer to kobject for cpuX/cache */ 637/* pointer to kobject for cpuX/cache */
650static DEFINE_PER_CPU(struct kobject *, cache_kobject); 638static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
651 639
652struct _index_kobject { 640struct _index_kobject {
653 struct kobject kobj; 641 struct kobject kobj;
@@ -656,8 +644,8 @@ struct _index_kobject {
656}; 644};
657 645
658/* pointer to array of kobjects for cpuX/cache/indexY */ 646/* pointer to array of kobjects for cpuX/cache/indexY */
659static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 647static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
660#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 648#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
661 649
662#define show_one_plus(file_name, object, val) \ 650#define show_one_plus(file_name, object, val) \
663static ssize_t show_##file_name \ 651static ssize_t show_##file_name \
@@ -876,10 +864,10 @@ static struct kobj_type ktype_percpu_entry = {
876 864
877static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) 865static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
878{ 866{
879 kfree(per_cpu(cache_kobject, cpu)); 867 kfree(per_cpu(ici_cache_kobject, cpu));
880 kfree(per_cpu(index_kobject, cpu)); 868 kfree(per_cpu(ici_index_kobject, cpu));
881 per_cpu(cache_kobject, cpu) = NULL; 869 per_cpu(ici_cache_kobject, cpu) = NULL;
882 per_cpu(index_kobject, cpu) = NULL; 870 per_cpu(ici_index_kobject, cpu) = NULL;
883 free_cache_attributes(cpu); 871 free_cache_attributes(cpu);
884} 872}
885 873
@@ -895,14 +883,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
895 return err; 883 return err;
896 884
897 /* Allocate all required memory */ 885 /* Allocate all required memory */
898 per_cpu(cache_kobject, cpu) = 886 per_cpu(ici_cache_kobject, cpu) =
899 kzalloc(sizeof(struct kobject), GFP_KERNEL); 887 kzalloc(sizeof(struct kobject), GFP_KERNEL);
900 if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) 888 if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
901 goto err_out; 889 goto err_out;
902 890
903 per_cpu(index_kobject, cpu) = kzalloc( 891 per_cpu(ici_index_kobject, cpu) = kzalloc(
904 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); 892 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
905 if (unlikely(per_cpu(index_kobject, cpu) == NULL)) 893 if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
906 goto err_out; 894 goto err_out;
907 895
908 return 0; 896 return 0;
@@ -926,7 +914,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
926 if (unlikely(retval < 0)) 914 if (unlikely(retval < 0))
927 return retval; 915 return retval;
928 916
929 retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), 917 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
930 &ktype_percpu_entry, 918 &ktype_percpu_entry,
931 &sys_dev->kobj, "%s", "cache"); 919 &sys_dev->kobj, "%s", "cache");
932 if (retval < 0) { 920 if (retval < 0) {
@@ -940,12 +928,12 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
940 this_object->index = i; 928 this_object->index = i;
941 retval = kobject_init_and_add(&(this_object->kobj), 929 retval = kobject_init_and_add(&(this_object->kobj),
942 &ktype_cache, 930 &ktype_cache,
943 per_cpu(cache_kobject, cpu), 931 per_cpu(ici_cache_kobject, cpu),
944 "index%1lu", i); 932 "index%1lu", i);
945 if (unlikely(retval)) { 933 if (unlikely(retval)) {
946 for (j = 0; j < i; j++) 934 for (j = 0; j < i; j++)
947 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); 935 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
948 kobject_put(per_cpu(cache_kobject, cpu)); 936 kobject_put(per_cpu(ici_cache_kobject, cpu));
949 cpuid4_cache_sysfs_exit(cpu); 937 cpuid4_cache_sysfs_exit(cpu);
950 return retval; 938 return retval;
951 } 939 }
@@ -953,7 +941,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
953 } 941 }
954 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); 942 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
955 943
956 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 944 kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
957 return 0; 945 return 0;
958} 946}
959 947
@@ -962,7 +950,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
962 unsigned int cpu = sys_dev->id; 950 unsigned int cpu = sys_dev->id;
963 unsigned long i; 951 unsigned long i;
964 952
965 if (per_cpu(cpuid4_info, cpu) == NULL) 953 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
966 return; 954 return;
967 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) 955 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
968 return; 956 return;
@@ -970,7 +958,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
970 958
971 for (i = 0; i < num_cache_leaves; i++) 959 for (i = 0; i < num_cache_leaves; i++)
972 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); 960 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
973 kobject_put(per_cpu(cache_kobject, cpu)); 961 kobject_put(per_cpu(ici_cache_kobject, cpu));
974 cpuid4_cache_sysfs_exit(cpu); 962 cpuid4_cache_sysfs_exit(cpu);
975} 963}
976 964
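The cpuid4_info -> ici_cpuid4_info renames above are mechanical: per-CPU variables now live in the ordinary symbol namespace, so short generic names risk colliding with unrelated globals, and the "ici_" (intel_cacheinfo) prefix keeps them unique. A minimal sketch of the access pattern, with leaf_for() as a hypothetical stand-in for the CPUID4_INFO_IDX() macro:

/* per-CPU pointer to this CPU's array of cache-leaf descriptors */
static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);

static struct _cpuid4_info *leaf_for(unsigned int cpu, int index)
{
	/* per_cpu() selects @cpu's instance of the variable */
	return &per_cpu(ici_cpuid4_info, cpu)[index];
}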
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 472763d92098..73734baa50f2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
74 m->finished = 0; 74 m->finished = 0;
75} 75}
76 76
77static cpumask_t mce_inject_cpumask; 77static cpumask_var_t mce_inject_cpumask;
78 78
79static int mce_raise_notify(struct notifier_block *self, 79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data) 80 unsigned long val, void *data)
@@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self,
82 struct die_args *args = (struct die_args *)data; 82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id(); 83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm); 84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) 85 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE; 86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask); 87 cpumask_clear_cpu(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION) 88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs); 89 raise_exception(m, args->regs);
90 else if (m->status) 90 else if (m->status)
@@ -148,22 +148,22 @@ static void raise_mce(struct mce *m)
148 unsigned long start; 148 unsigned long start;
149 int cpu; 149 int cpu;
150 get_online_cpus(); 150 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map; 151 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
152 cpu_clear(get_cpu(), mce_inject_cpumask); 152 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) { 153 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu); 154 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished || 155 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask); 157 cpumask_clear_cpu(cpu, mce_inject_cpumask);
158 } 158 }
159 if (!cpus_empty(mce_inject_cpumask)) 159 if (!cpumask_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); 160 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies; 161 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) { 162 while (!cpumask_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) { 163 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR 164 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n", 165 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask)); 166 *cpumask_bits(mce_inject_cpumask));
167 break; 167 break;
168 } 168 }
169 cpu_relax(); 169 cpu_relax();
@@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
210 210
211static int inject_init(void) 211static int inject_init(void)
212{ 212{
213 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
214 return -ENOMEM;
213 printk(KERN_INFO "Machine check injector initialized\n"); 215 printk(KERN_INFO "Machine check injector initialized\n");
214 mce_chrdev_ops.write = mce_write; 216 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb); 217 register_die_notifier(&mce_raise_nb);
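Switching mce_inject_cpumask from cpumask_t to cpumask_var_t matters on large configurations: with CONFIG_CPUMASK_OFFSTACK the mask becomes a pointer backed by an allocation sized for the real number of possible CPUs, instead of a static NR_CPUS-bit array. The price is that allocation can fail, which is why inject_init() now checks alloc_cpumask_var(). A minimal sketch of the pattern, under assumed demo_* names:

static cpumask_var_t demo_mask;

static int __init demo_init(void)
{
	if (!alloc_cpumask_var(&demo_mask, GFP_KERNEL))
		return -ENOMEM;			/* allocation may fail */

	cpumask_copy(demo_mask, cpu_online_mask);
	cpumask_clear_cpu(get_cpu(), demo_mask);	/* exclude this CPU */
	put_cpu();

	if (!cpumask_empty(demo_mask))
		pr_info("demo: other CPUs are online\n");

	free_cpumask_var(demo_mask);
	return 0;
}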
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b1598a9436d0..a8aacd4b513c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h>
51
49int mce_disabled __read_mostly; 52int mce_disabled __read_mostly;
50 53
51#define MISC_MCELOG_MINOR 227 54#define MISC_MCELOG_MINOR 227
@@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 88static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 89static int cpu_missing;
87 90
88static void default_decode_mce(struct mce *m) 91/*
92 * CPU/chipset specific EDAC code can register a notifier call here to print
93 * MCE errors in a human-readable form.
94 */
95ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
96EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
97
98static int default_decode_mce(struct notifier_block *nb, unsigned long val,
99 void *data)
89{ 100{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 101 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 102 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
103
104 return NOTIFY_STOP;
92} 105}
93 106
94/* 107static struct notifier_block mce_dec_nb = {
95 * CPU/chipset specific EDAC code can register a callback here to print 108 .notifier_call = default_decode_mce,
96 * MCE errors in a human-readable form: 109 .priority = -1,
97 */ 110};
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
100 111
101/* MCA banks polled by the period polling timer for corrected events */ 112/* MCA banks polled by the period polling timer for corrected events */
102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 113DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
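Replacing the single x86_mce_decode_callback pointer with an atomic notifier chain lets several decoders coexist. default_decode_mce is registered at priority -1, so any decoder registered at the default priority 0 runs first and can stop the chain before the "no decoding support" fallback prints. A sketch of what a hypothetical EDAC-style module would do; the my_* names are illustrative, not from the patch:

static int my_decode_mce(struct notifier_block *nb, unsigned long val,
			 void *data)
{
	struct mce *m = data;

	pr_emerg("decoded MCE: bank %d, status 0x%llx\n", m->bank, m->status);
	return NOTIFY_STOP;		/* handled: do not run the fallback */
}

static struct notifier_block my_mce_dec_nb = {
	.notifier_call	= my_decode_mce,	/* default priority 0 */
};

static int __init my_edac_init(void)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &my_mce_dec_nb);
	return 0;
}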
@@ -141,6 +152,9 @@ void mce_log(struct mce *mce)
141{ 152{
142 unsigned next, entry; 153 unsigned next, entry;
143 154
155 /* Emit the trace record: */
156 trace_mce_record(mce);
157
144 mce->finished = 0; 158 mce->finished = 0;
145 wmb(); 159 wmb();
146 for (;;) { 160 for (;;) {
@@ -204,9 +218,9 @@ static void print_mce(struct mce *m)
204 218
205 /* 219 /*
206 * Print out human-readable details about the MCE error, 220 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that): 221 * (if the CPU has an implementation for that)
208 */ 222 */
209 x86_mce_decode_callback(m); 223 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
210} 224}
211 225
212static void print_mce_head(void) 226static void print_mce_head(void)
@@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1136static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1123static DEFINE_PER_CPU(struct timer_list, mce_timer); 1137static DEFINE_PER_CPU(struct timer_list, mce_timer);
1124 1138
1125static void mcheck_timer(unsigned long data) 1139static void mce_start_timer(unsigned long data)
1126{ 1140{
1127 struct timer_list *t = &per_cpu(mce_timer, data); 1141 struct timer_list *t = &per_cpu(mce_timer, data);
1128 int *n; 1142 int *n;
@@ -1187,7 +1201,7 @@ int mce_notify_irq(void)
1187} 1201}
1188EXPORT_SYMBOL_GPL(mce_notify_irq); 1202EXPORT_SYMBOL_GPL(mce_notify_irq);
1189 1203
1190static int mce_banks_init(void) 1204static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1191{ 1205{
1192 int i; 1206 int i;
1193 1207
@@ -1206,7 +1220,7 @@ static int mce_banks_init(void)
1206/* 1220/*
1207 * Initialize Machine Checks for a CPU. 1221 * Initialize Machine Checks for a CPU.
1208 */ 1222 */
1209static int __cpuinit mce_cap_init(void) 1223static int __cpuinit __mcheck_cpu_cap_init(void)
1210{ 1224{
1211 unsigned b; 1225 unsigned b;
1212 u64 cap; 1226 u64 cap;
@@ -1214,7 +1228,8 @@ static int __cpuinit mce_cap_init(void)
1214 rdmsrl(MSR_IA32_MCG_CAP, cap); 1228 rdmsrl(MSR_IA32_MCG_CAP, cap);
1215 1229
1216 b = cap & MCG_BANKCNT_MASK; 1230 b = cap & MCG_BANKCNT_MASK;
1217 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1231 if (!banks)
1232 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1218 1233
1219 if (b > MAX_NR_BANKS) { 1234 if (b > MAX_NR_BANKS) {
1220 printk(KERN_WARNING 1235 printk(KERN_WARNING
@@ -1227,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
1227 WARN_ON(banks != 0 && b != banks); 1242 WARN_ON(banks != 0 && b != banks);
1228 banks = b; 1243 banks = b;
1229 if (!mce_banks) { 1244 if (!mce_banks) {
1230 int err = mce_banks_init(); 1245 int err = __mcheck_cpu_mce_banks_init();
1231 1246
1232 if (err) 1247 if (err)
1233 return err; 1248 return err;
@@ -1243,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
1243 return 0; 1258 return 0;
1244} 1259}
1245 1260
1246static void mce_init(void) 1261static void __mcheck_cpu_init_generic(void)
1247{ 1262{
1248 mce_banks_t all_banks; 1263 mce_banks_t all_banks;
1249 u64 cap; 1264 u64 cap;
@@ -1272,7 +1287,7 @@ static void mce_init(void)
1272} 1287}
1273 1288
1274/* Add per CPU specific workarounds here */ 1289/* Add per CPU specific workarounds here */
1275static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1290static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1276{ 1291{
1277 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1292 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1278 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1293 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1340,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1340 return 0; 1355 return 0;
1341} 1356}
1342 1357
1343static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1358static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1344{ 1359{
1345 if (c->x86 != 5) 1360 if (c->x86 != 5)
1346 return; 1361 return;
@@ -1354,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1354 } 1369 }
1355} 1370}
1356 1371
1357static void mce_cpu_features(struct cpuinfo_x86 *c) 1372static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1358{ 1373{
1359 switch (c->x86_vendor) { 1374 switch (c->x86_vendor) {
1360 case X86_VENDOR_INTEL: 1375 case X86_VENDOR_INTEL:
@@ -1368,18 +1383,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1368 } 1383 }
1369} 1384}
1370 1385
1371static void mce_init_timer(void) 1386static void __mcheck_cpu_init_timer(void)
1372{ 1387{
1373 struct timer_list *t = &__get_cpu_var(mce_timer); 1388 struct timer_list *t = &__get_cpu_var(mce_timer);
1374 int *n = &__get_cpu_var(mce_next_interval); 1389 int *n = &__get_cpu_var(mce_next_interval);
1375 1390
1391 setup_timer(t, mce_start_timer, smp_processor_id());
1392
1376 if (mce_ignore_ce) 1393 if (mce_ignore_ce)
1377 return; 1394 return;
1378 1395
1379 *n = check_interval * HZ; 1396 *n = check_interval * HZ;
1380 if (!*n) 1397 if (!*n)
1381 return; 1398 return;
1382 setup_timer(t, mcheck_timer, smp_processor_id());
1383 t->expires = round_jiffies(jiffies + *n); 1399 t->expires = round_jiffies(jiffies + *n);
1384 add_timer_on(t, smp_processor_id()); 1400 add_timer_on(t, smp_processor_id());
1385} 1401}
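Moving setup_timer() ahead of the mce_ignore_ce and zero-interval checks means the per-CPU timer is always initialized even when polling is disabled, so code such as mce_cpu_restart() can del_timer_sync() it unconditionally. A sketch of the rule this hunk enforces, with demo_init_poll_timer() as an assumed name:

static void demo_init_poll_timer(struct timer_list *t, unsigned long cpu,
				 int interval)
{
	setup_timer(t, mce_start_timer, cpu);	/* always initialize first */

	if (!interval)
		return;			/* polling disabled: leave unarmed */

	t->expires = round_jiffies(jiffies + interval);
	add_timer_on(t, cpu);
}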
@@ -1399,27 +1415,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1399 * Called for each booted CPU to set up machine checks. 1415 * Called for each booted CPU to set up machine checks.
1400 * Must be called with preempt off: 1416 * Must be called with preempt off:
1401 */ 1417 */
1402void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1418void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1403{ 1419{
1404 if (mce_disabled) 1420 if (mce_disabled)
1405 return; 1421 return;
1406 1422
1407 mce_ancient_init(c); 1423 __mcheck_cpu_ancient_init(c);
1408 1424
1409 if (!mce_available(c)) 1425 if (!mce_available(c))
1410 return; 1426 return;
1411 1427
1412 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1428 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1413 mce_disabled = 1; 1429 mce_disabled = 1;
1414 return; 1430 return;
1415 } 1431 }
1416 1432
1417 machine_check_vector = do_machine_check; 1433 machine_check_vector = do_machine_check;
1418 1434
1419 mce_init(); 1435 __mcheck_cpu_init_generic();
1420 mce_cpu_features(c); 1436 __mcheck_cpu_init_vendor(c);
1421 mce_init_timer(); 1437 __mcheck_cpu_init_timer();
1422 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1438 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1439
1423} 1440}
1424 1441
1425/* 1442/*
@@ -1639,6 +1656,15 @@ static int __init mcheck_enable(char *str)
1639} 1656}
1640__setup("mce", mcheck_enable); 1657__setup("mce", mcheck_enable);
1641 1658
1659int __init mcheck_init(void)
1660{
1661 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1662
1663 mcheck_intel_therm_init();
1664
1665 return 0;
1666}
1667
1642/* 1668/*
1643 * Sysfs support 1669 * Sysfs support
1644 */ 1670 */
@@ -1647,7 +1673,7 @@ __setup("mce", mcheck_enable);
1647 * Disable machine checks on suspend and shutdown. We can't really handle 1673 * Disable machine checks on suspend and shutdown. We can't really handle
1648 * them later. 1674 * them later.
1649 */ 1675 */
1650static int mce_disable(void) 1676static int mce_disable_error_reporting(void)
1651{ 1677{
1652 int i; 1678 int i;
1653 1679
@@ -1662,12 +1688,12 @@ static int mce_disable(void)
1662 1688
1663static int mce_suspend(struct sys_device *dev, pm_message_t state) 1689static int mce_suspend(struct sys_device *dev, pm_message_t state)
1664{ 1690{
1665 return mce_disable(); 1691 return mce_disable_error_reporting();
1666} 1692}
1667 1693
1668static int mce_shutdown(struct sys_device *dev) 1694static int mce_shutdown(struct sys_device *dev)
1669{ 1695{
1670 return mce_disable(); 1696 return mce_disable_error_reporting();
1671} 1697}
1672 1698
1673/* 1699/*
@@ -1677,8 +1703,8 @@ static int mce_shutdown(struct sys_device *dev)
1677 */ 1703 */
1678static int mce_resume(struct sys_device *dev) 1704static int mce_resume(struct sys_device *dev)
1679{ 1705{
1680 mce_init(); 1706 __mcheck_cpu_init_generic();
1681 mce_cpu_features(&current_cpu_data); 1707 __mcheck_cpu_init_vendor(&current_cpu_data);
1682 1708
1683 return 0; 1709 return 0;
1684} 1710}
@@ -1688,8 +1714,8 @@ static void mce_cpu_restart(void *data)
1688 del_timer_sync(&__get_cpu_var(mce_timer)); 1714 del_timer_sync(&__get_cpu_var(mce_timer));
1689 if (!mce_available(&current_cpu_data)) 1715 if (!mce_available(&current_cpu_data))
1690 return; 1716 return;
1691 mce_init(); 1717 __mcheck_cpu_init_generic();
1692 mce_init_timer(); 1718 __mcheck_cpu_init_timer();
1693} 1719}
1694 1720
1695/* Reinit MCEs after user configuration changes */ 1721/* Reinit MCEs after user configuration changes */
@@ -1715,7 +1741,7 @@ static void mce_enable_ce(void *all)
1715 cmci_reenable(); 1741 cmci_reenable();
1716 cmci_recheck(); 1742 cmci_recheck();
1717 if (all) 1743 if (all)
1718 mce_init_timer(); 1744 __mcheck_cpu_init_timer();
1719} 1745}
1720 1746
1721static struct sysdev_class mce_sysclass = { 1747static struct sysdev_class mce_sysclass = {
@@ -1903,7 +1929,7 @@ error2:
1903 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1929 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1904error: 1930error:
1905 while (--i >= 0) 1931 while (--i >= 0)
1906 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1932 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1907 1933
1908 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1934 sysdev_unregister(&per_cpu(mce_dev, cpu));
1909 1935
@@ -1928,13 +1954,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1928} 1954}
1929 1955
1930/* Make sure there are no machine checks on offlined CPUs. */ 1956/* Make sure there are no machine checks on offlined CPUs. */
1931static void mce_disable_cpu(void *h) 1957static void __cpuinit mce_disable_cpu(void *h)
1932{ 1958{
1933 unsigned long action = *(unsigned long *)h; 1959 unsigned long action = *(unsigned long *)h;
1934 int i; 1960 int i;
1935 1961
1936 if (!mce_available(&current_cpu_data)) 1962 if (!mce_available(&current_cpu_data))
1937 return; 1963 return;
1964
1938 if (!(action & CPU_TASKS_FROZEN)) 1965 if (!(action & CPU_TASKS_FROZEN))
1939 cmci_clear(); 1966 cmci_clear();
1940 for (i = 0; i < banks; i++) { 1967 for (i = 0; i < banks; i++) {
@@ -1945,7 +1972,7 @@ static void mce_disable_cpu(void *h)
1945 } 1972 }
1946} 1973}
1947 1974
1948static void mce_reenable_cpu(void *h) 1975static void __cpuinit mce_reenable_cpu(void *h)
1949{ 1976{
1950 unsigned long action = *(unsigned long *)h; 1977 unsigned long action = *(unsigned long *)h;
1951 int i; 1978 int i;
@@ -1990,9 +2017,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1990 break; 2017 break;
1991 case CPU_DOWN_FAILED: 2018 case CPU_DOWN_FAILED:
1992 case CPU_DOWN_FAILED_FROZEN: 2019 case CPU_DOWN_FAILED_FROZEN:
1993 t->expires = round_jiffies(jiffies + 2020 if (!mce_ignore_ce && check_interval) {
2021 t->expires = round_jiffies(jiffies +
1994 __get_cpu_var(mce_next_interval)); 2022 __get_cpu_var(mce_next_interval));
1995 add_timer_on(t, cpu); 2023 add_timer_on(t, cpu);
2024 }
1996 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2025 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1997 break; 2026 break;
1998 case CPU_POST_DEAD: 2027 case CPU_POST_DEAD:
@@ -2024,7 +2053,7 @@ static __init void mce_init_banks(void)
2024 } 2053 }
2025} 2054}
2026 2055
2027static __init int mce_init_device(void) 2056static __init int mcheck_init_device(void)
2028{ 2057{
2029 int err; 2058 int err;
2030 int i = 0; 2059 int i = 0;
@@ -2052,7 +2081,7 @@ static __init int mce_init_device(void)
2052 return err; 2081 return err;
2053} 2082}
2054 2083
2055device_initcall(mce_init_device); 2084device_initcall(mcheck_init_device);
2056 2085
2057/* 2086/*
2058 * Old style boot options parsing. Only for compatibility. 2087 * Old style boot options parsing. Only for compatibility.
@@ -2100,7 +2129,7 @@ static int fake_panic_set(void *data, u64 val)
2100DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2129DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2101 fake_panic_set, "%llu\n"); 2130 fake_panic_set, "%llu\n");
2102 2131
2103static int __init mce_debugfs_init(void) 2132static int __init mcheck_debugfs_init(void)
2104{ 2133{
2105 struct dentry *dmce, *ffake_panic; 2134 struct dentry *dmce, *ffake_panic;
2106 2135
@@ -2114,5 +2143,5 @@ static int __init mce_debugfs_init(void)
2114 2143
2115 return 0; 2144 return 0;
2116} 2145}
2117late_initcall(mce_debugfs_init); 2146late_initcall(mcheck_debugfs_init);
2118#endif 2147#endif
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..81c499eceb21 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,14 +256,34 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259/* Thermal monitoring depends on APIC, ACPI and clock modulation */
260static int intel_thermal_supported(struct cpuinfo_x86 *c)
261{
262 if (!cpu_has_apic)
263 return 0;
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return 0;
266 return 1;
267}
268
269void __init mcheck_intel_therm_init(void)
270{
271 /*
272 * This function is called only on the boot CPU. It saves the initial
273 * thermal LVT value on the BSP and later uses that value to restore
274 * the thermal LVT entry that the BIOS programmed on the APs.
275 */
276 if (intel_thermal_supported(&boot_cpu_data))
277 lvtthmr_init = apic_read(APIC_LVTTHMR);
278}
279
257void intel_init_thermal(struct cpuinfo_x86 *c) 280void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 281{
259 unsigned int cpu = smp_processor_id(); 282 unsigned int cpu = smp_processor_id();
260 int tm2 = 0; 283 int tm2 = 0;
261 u32 l, h; 284 u32 l, h;
262 285
263 /* Thermal monitoring depends on ACPI and clock modulation*/ 286 if (!intel_thermal_supported(c))
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return; 287 return;
266 288
267 /* 289 /*
@@ -270,7 +292,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 292 * since it might be delivered via SMI already:
271 */ 293 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 294 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 295
296 /*
297 * The initial value of the thermal LVT entries on all APs always
298 * reads 0x10000 because the APs are woken up by the BSP issuing an
299 * INIT-SIPI-SIPI sequence to them, and the LVT registers are reset
300 * to 0s except for the mask bits, which are set to 1s when the APs
301 * receive the INIT IPI. Always restore the value the BIOS programmed,
302 * based on the copy saved on the BSP, since the BIOS sets the same
303 * value for all threads/cores.
304 */
305 apic_write(APIC_LVTTHMR, lvtthmr_init);
306
307 h = lvtthmr_init;
308
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 309 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 310 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 311 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
@@ -312,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
312 l = apic_read(APIC_LVTTHMR); 347 l = apic_read(APIC_LVTTHMR);
313 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 348 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
314 349
315 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 350 printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
316 cpu, tm2 ? "TM2" : "TM1"); 351 tm2 ? "TM2" : "TM1");
317 352
318 /* enable thermal throttle processing */ 353 /* enable thermal throttle processing */
319 atomic_set(&therm_throt_en, 1); 354 atomic_set(&therm_throt_en, 1);
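The lvtthmr_init logic relies on boot ordering: mcheck_intel_therm_init() runs on the boot CPU, whose LVTTHMR still holds the BIOS-programmed value, and intel_init_thermal() later restores that snapshot on the APs, whose entries the INIT IPI reset to the masked value 0x10000. A compilable sketch of the save/restore pair, with demo_* names assumed:

static u32 demo_lvtthmr __read_mostly;

static void __init demo_save_boot_lvtthmr(void)
{
	/* BSP only: LVTTHMR still holds the BIOS-programmed value */
	demo_lvtthmr = apic_read(APIC_LVTTHMR);
}

static void demo_restore_lvtthmr(void)
{
	/* APs boot with LVTTHMR reset to 0x10000 (masked) by INIT;
	 * re-seed it from the BSP snapshot before interpreting it */
	apic_write(APIC_LVTTHMR, demo_lvtthmr);
}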
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 6987af786c02..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -875,7 +875,7 @@ int __init mtrr_cleanup(unsigned address_bits)
875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
876 876
877 range_sums = sum_ranges(range, nr_range); 877 range_sums = sum_ranges(range, nr_range);
878 printk(KERN_INFO "total RAM coverred: %ldM\n", 878 printk(KERN_INFO "total RAM covered: %ldM\n",
879 range_sums >> (20 - PAGE_SHIFT)); 879 range_sums >> (20 - PAGE_SHIFT));
880 880
881 if (mtrr_chunk_size && mtrr_gran_size) { 881 if (mtrr_chunk_size && mtrr_gran_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 3c1b12d461d1..e006e56f699c 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -4,6 +4,7 @@
4#include <linux/proc_fs.h> 4#include <linux/proc_fs.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h>
7#include <linux/init.h> 8#include <linux/init.h>
8 9
9#define LINE_SIZE 80 10#define LINE_SIZE 80
@@ -133,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
133 return -EINVAL; 134 return -EINVAL;
134 135
135 base = simple_strtoull(line + 5, &ptr, 0); 136 base = simple_strtoull(line + 5, &ptr, 0);
136 while (isspace(*ptr)) 137 ptr = skip_spaces(ptr);
137 ptr++;
138 138
139 if (strncmp(ptr, "size=", 5)) 139 if (strncmp(ptr, "size=", 5))
140 return -EINVAL; 140 return -EINVAL;
@@ -142,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
142 size = simple_strtoull(ptr + 5, &ptr, 0); 142 size = simple_strtoull(ptr + 5, &ptr, 0);
143 if ((base & 0xfff) || (size & 0xfff)) 143 if ((base & 0xfff) || (size & 0xfff))
144 return -EINVAL; 144 return -EINVAL;
145 while (isspace(*ptr)) 145 ptr = skip_spaces(ptr);
146 ptr++;
147 146
148 if (strncmp(ptr, "type=", 5)) 147 if (strncmp(ptr, "type=", 5))
149 return -EINVAL; 148 return -EINVAL;
150 ptr += 5; 149 ptr = skip_spaces(ptr + 5);
151 while (isspace(*ptr))
152 ptr++;
153 150
154 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 151 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
155 if (strcmp(ptr, mtrr_strings[i])) 152 if (strcmp(ptr, mtrr_strings[i]))
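skip_spaces() (declared in <linux/string.h>) returns a pointer to the first non-whitespace character, collapsing the three open-coded isspace() loops above into one call each. A sketch of the resulting parsing idiom, with demo_parse_key() as a hypothetical helper:

static char *demo_parse_key(char *ptr, const char *key, size_t keylen)
{
	ptr = skip_spaces(ptr);		/* was: while (isspace(*ptr)) ptr++; */
	if (strncmp(ptr, key, keylen))
		return NULL;		/* expected key not present */
	return skip_spaces(ptr + keylen);	/* start of the value */
}

Used as demo_parse_key(line, "size=", 5), mirroring the mtrr_write() flow.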
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..8c1c07073ccc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -77,6 +77,18 @@ struct cpu_hw_events {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
@@ -102,6 +114,8 @@ struct x86_pmu {
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
155 return hw_event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
173static u64 intel_pmu_event_map(int hw_event) 228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[hw_event]; 230 return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 245 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 246 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 247
193static const u64 nehalem_hw_cache_event_ids 248static __initconst u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX] 249 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX] 250 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
281 }, 336 },
282}; 337};
283 338
284static const u64 core2_hw_cache_event_ids 339static __initconst u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX] 340 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX] 341 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
372 }, 427 },
373}; 428};
374 429
375static const u64 atom_hw_cache_event_ids 430static __initconst u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX] 431 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX] 432 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
481 return hw_event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static __initconst u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX] 540 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX] 541 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 987 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 989
990 hwc->idx = -1;
991
935 /* 992 /*
936 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
937 */ 994 */
@@ -1229,7 +1286,7 @@ x86_perf_event_set_period(struct perf_event *event,
1229 return 0; 1286 return 0;
1230 1287
1231 /* 1288 /*
1232 * If we are way outside a reasoable range then just skip forward: 1289 * If we are way outside a reasonable range then just skip forward:
1233 */ 1290 */
1234 if (unlikely(left <= -period)) { 1291 if (unlikely(left <= -period)) {
1235 left = period; 1292 left = period;
@@ -1286,6 +1343,13 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1286 bits |= 0x2; 1343 bits |= 0x2;
1287 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 1344 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1288 bits |= 0x1; 1345 bits |= 0x1;
1346
1347 /*
1348 * ANY bit is supported in v3 and up
1349 */
1350 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
1351 bits |= 0x4;
1352
1289 bits <<= (idx * 4); 1353 bits <<= (idx * 4);
1290 mask = 0xfULL << (idx * 4); 1354 mask = 0xfULL << (idx * 4);
1291 1355
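Each fixed counter owns a 4-bit field in IA32_FIXED_CTR_CTRL: bit 0 enables ring-0 counting, bit 1 ring-3, bit 2 the AnyThread qualifier (architectural perfmon v3 and later, hence the version check above), and bit 3 the PMI. A sketch composing one field under those SDM-documented encodings, with demo_fixed_ctrl_field() as an assumed name:

static u64 demo_fixed_ctrl_field(bool os, bool usr, bool any, int idx)
{
	u64 bits = 0;

	if (os)
		bits |= 0x1;	/* count at CPL 0 */
	if (usr)
		bits |= 0x2;	/* count at CPL > 0 */
	if (any)
		bits |= 0x4;	/* count on both hardware threads (v3+) */

	return bits << (idx * 4);	/* 4 control bits per fixed counter */
}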
@@ -1334,8 +1398,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1334 x86_pmu_enable_event(hwc, idx); 1398 x86_pmu_enable_event(hwc, idx);
1335} 1399}
1336 1400
1337static int 1401static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{ 1402{
1340 unsigned int hw_event; 1403 unsigned int hw_event;
1341 1404
@@ -1349,6 +1412,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1349 if (!x86_pmu.num_events_fixed) 1412 if (!x86_pmu.num_events_fixed)
1350 return -1; 1413 return -1;
1351 1414
1415 /*
1416 * fixed counters do not take all possible filters
1417 */
1418 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1419 return -1;
1420
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1421 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1422 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1423 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1429,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1360} 1429}
1361 1430
1362/* 1431/*
1363 * Find a PMC slot for the freshly enabled / scheduled in event: 1432 * generic counter allocator: get next free counter
1364 */ 1433 */
1365static int x86_pmu_enable(struct perf_event *event) 1434static int
1435gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1436{
1437 int idx;
1438
1439 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1440 return idx == x86_pmu.num_events ? -1 : idx;
1441}
1442
1443/*
1444 * intel-specific counter allocator: check event constraints
1445 */
1446static int
1447intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1448{
1449 const struct event_constraint *event_constraint;
1450 int i, code;
1451
1452 if (!event_constraints)
1453 goto skip;
1454
1455 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1456
1457 for_each_event_constraint(event_constraint, event_constraints) {
1458 if (code == event_constraint->code) {
1459 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1460 if (!test_and_set_bit(i, cpuc->used_mask))
1461 return i;
1462 }
1463 return -1;
1464 }
1465 }
1466skip:
1467 return gen_get_event_idx(cpuc, hwc);
1468}
1469
1470static int
1471x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1472{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw;
1369 int idx; 1473 int idx;
1370 1474
1371 idx = fixed_mode_idx(event, hwc); 1475 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1476 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1477 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1478 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1479 return -EAGAIN;
1376 1480
1377 hwc->config_base = 0; 1481 hwc->config_base = 0;
1378 hwc->event_base = 0; 1482 hwc->event_base = 0;
1379 hwc->idx = idx; 1483 hwc->idx = idx;
1380 } else if (idx >= 0) { 1484 } else if (idx >= 0) {
1381 /* 1485 /*
@@ -1396,20 +1500,35 @@ static int x86_pmu_enable(struct perf_event *event)
1396 } else { 1500 } else {
1397 idx = hwc->idx; 1501 idx = hwc->idx;
1398 /* Try to get the previous generic event again */ 1502 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1503 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1504try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1505 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_events); 1506 if (idx == -1)
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN; 1507 return -EAGAIN;
1405 1508
1406 set_bit(idx, cpuc->used_mask); 1509 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1510 hwc->idx = idx;
1408 } 1511 }
1409 hwc->config_base = x86_pmu.eventsel; 1512 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr; 1513 hwc->event_base = x86_pmu.perfctr;
1411 } 1514 }
1412 1515
1516 return idx;
1517}
1518
1519/*
1520 * Find a PMC slot for the freshly enabled / scheduled in event:
1521 */
1522static int x86_pmu_enable(struct perf_event *event)
1523{
1524 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1525 struct hw_perf_event *hwc = &event->hw;
1526 int idx;
1527
1528 idx = x86_schedule_event(cpuc, hwc);
1529 if (idx < 0)
1530 return idx;
1531
1413 perf_events_lapic_init(); 1532 perf_events_lapic_init();
1414 1533
1415 x86_pmu.disable(hwc, idx); 1534 x86_pmu.disable(hwc, idx);
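Factoring the slot search out of x86_pmu_enable() into x86_schedule_event() makes it reusable: the same placement logic can be dry-run against a scratch cpu_hw_events to validate event groups, as the hunks further below do. Roughly, placement proceeds in three steps, sketched here in simplified form; demo_place() is illustrative, not the kernel function:

static int demo_place(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
{
	int idx = fixed_mode_idx(hwc);		/* 1. fixed-purpose counter? */

	if (idx >= 0 && !test_and_set_bit(idx, cpuc->used_mask))
		return idx;

	idx = hwc->idx;				/* 2. previously used slot? */
	if (idx >= 0 && !test_and_set_bit(idx, cpuc->used_mask))
		return idx;

	return x86_pmu.get_event_idx(cpuc, hwc); /* 3. vendor allocator */
}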
@@ -1520,6 +1639,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1520 1639
1521 data.period = event->hw.last_period; 1640 data.period = event->hw.last_period;
1522 data.addr = 0; 1641 data.addr = 0;
1642 data.raw = NULL;
1523 regs.ip = 0; 1643 regs.ip = 0;
1524 1644
1525 /* 1645 /*
@@ -1637,6 +1757,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1637 u64 val; 1757 u64 val;
1638 1758
1639 data.addr = 0; 1759 data.addr = 0;
1760 data.raw = NULL;
1640 1761
1641 cpuc = &__get_cpu_var(cpu_hw_events); 1762 cpuc = &__get_cpu_var(cpu_hw_events);
1642 1763
@@ -1682,6 +1803,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1682 u64 ack, status; 1803 u64 ack, status;
1683 1804
1684 data.addr = 0; 1805 data.addr = 0;
1806 data.raw = NULL;
1685 1807
1686 cpuc = &__get_cpu_var(cpu_hw_events); 1808 cpuc = &__get_cpu_var(cpu_hw_events);
1687 1809
@@ -1745,6 +1867,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1745 u64 val; 1867 u64 val;
1746 1868
1747 data.addr = 0; 1869 data.addr = 0;
1870 data.raw = NULL;
1748 1871
1749 cpuc = &__get_cpu_var(cpu_hw_events); 1872 cpuc = &__get_cpu_var(cpu_hw_events);
1750 1873
@@ -1852,7 +1975,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1975 .priority = 1
1853}; 1976};
1854 1977
1855static struct x86_pmu p6_pmu = { 1978static __initconst struct x86_pmu p6_pmu = {
1856 .name = "p6", 1979 .name = "p6",
1857 .handle_irq = p6_pmu_handle_irq, 1980 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all, 1981 .disable_all = p6_pmu_disable_all,
@@ -1877,9 +2000,10 @@ static struct x86_pmu p6_pmu = {
1877 */ 2000 */
1878 .event_bits = 32, 2001 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1, 2002 .event_mask = (1ULL << 32) - 1,
2003 .get_event_idx = intel_get_event_idx,
1880}; 2004};
1881 2005
1882static struct x86_pmu intel_pmu = { 2006static __initconst struct x86_pmu intel_pmu = {
1883 .name = "Intel", 2007 .name = "Intel",
1884 .handle_irq = intel_pmu_handle_irq, 2008 .handle_irq = intel_pmu_handle_irq,
1885 .disable_all = intel_pmu_disable_all, 2009 .disable_all = intel_pmu_disable_all,
@@ -1900,9 +2024,10 @@ static struct x86_pmu intel_pmu = {
1900 .max_period = (1ULL << 31) - 1, 2024 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts, 2025 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts, 2026 .disable_bts = intel_pmu_disable_bts,
2027 .get_event_idx = intel_get_event_idx,
1903}; 2028};
1904 2029
1905static struct x86_pmu amd_pmu = { 2030static __initconst struct x86_pmu amd_pmu = {
1906 .name = "AMD", 2031 .name = "AMD",
1907 .handle_irq = amd_pmu_handle_irq, 2032 .handle_irq = amd_pmu_handle_irq,
1908 .disable_all = amd_pmu_disable_all, 2033 .disable_all = amd_pmu_disable_all,
@@ -1920,9 +2045,10 @@ static struct x86_pmu amd_pmu = {
1920 .apic = 1, 2045 .apic = 1,
1921 /* use highest bit to detect overflow */ 2046 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1, 2047 .max_period = (1ULL << 47) - 1,
2048 .get_event_idx = gen_get_event_idx,
1923}; 2049};
1924 2050
1925static int p6_pmu_init(void) 2051static __init int p6_pmu_init(void)
1926{ 2052{
1927 switch (boot_cpu_data.x86_model) { 2053 switch (boot_cpu_data.x86_model) {
1928 case 1: 2054 case 1:
@@ -1932,10 +2058,12 @@ static int p6_pmu_init(void)
1932 case 7: 2058 case 7:
1933 case 8: 2059 case 8:
1934 case 11: /* Pentium III */ 2060 case 11: /* Pentium III */
2061 event_constraints = intel_p6_event_constraints;
1935 break; 2062 break;
1936 case 9: 2063 case 9:
1937 case 13: 2064 case 13:
1938 /* Pentium M */ 2065 /* Pentium M */
2066 event_constraints = intel_p6_event_constraints;
1939 break; 2067 break;
1940 default: 2068 default:
1941 pr_cont("unsupported p6 CPU model %d ", 2069 pr_cont("unsupported p6 CPU model %d ",
@@ -1945,16 +2073,10 @@ static int p6_pmu_init(void)
1945 2073
1946 x86_pmu = p6_pmu; 2074 x86_pmu = p6_pmu;
1947 2075
1948 if (!cpu_has_apic) {
1949 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1950 pr_info("no hardware sampling interrupt available.\n");
1951 x86_pmu.apic = 0;
1952 }
1953
1954 return 0; 2076 return 0;
1955} 2077}
1956 2078
1957static int intel_pmu_init(void) 2079static __init int intel_pmu_init(void)
1958{ 2080{
1959 union cpuid10_edx edx; 2081 union cpuid10_edx edx;
1960 union cpuid10_eax eax; 2082 union cpuid10_eax eax;
@@ -2007,12 +2129,14 @@ static int intel_pmu_init(void)
2007 sizeof(hw_cache_event_ids)); 2129 sizeof(hw_cache_event_ids));
2008 2130
2009 pr_cont("Core2 events, "); 2131 pr_cont("Core2 events, ");
2132 event_constraints = intel_core_event_constraints;
2010 break; 2133 break;
2011 default: 2134 default:
2012 case 26: 2135 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2136 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids)); 2137 sizeof(hw_cache_event_ids));
2015 2138
2139 event_constraints = intel_nehalem_event_constraints;
2016 pr_cont("Nehalem/Corei7 events, "); 2140 pr_cont("Nehalem/Corei7 events, ");
2017 break; 2141 break;
2018 case 28: 2142 case 28:
@@ -2025,7 +2149,7 @@ static int intel_pmu_init(void)
2025 return 0; 2149 return 0;
2026} 2150}
2027 2151
2028static int amd_pmu_init(void) 2152static __init int amd_pmu_init(void)
2029{ 2153{
2030 /* Performance-monitoring supported from K7 and later: */ 2154 /* Performance-monitoring supported from K7 and later: */
2031 if (boot_cpu_data.x86 < 6) 2155 if (boot_cpu_data.x86 < 6)
@@ -2040,6 +2164,16 @@ static int amd_pmu_init(void)
2040 return 0; 2164 return 0;
2041} 2165}
2042 2166
2167static void __init pmu_check_apic(void)
2168{
2169 if (cpu_has_apic)
2170 return;
2171
2172 x86_pmu.apic = 0;
2173 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
2174 pr_info("no hardware sampling interrupt available.\n");
2175}
2176
2043void __init init_hw_perf_events(void) 2177void __init init_hw_perf_events(void)
2044{ 2178{
2045 int err; 2179 int err;
@@ -2061,6 +2195,8 @@ void __init init_hw_perf_events(void)
2061 return; 2195 return;
2062 } 2196 }
2063 2197
2198 pmu_check_apic();
2199
2064 pr_cont("%s PMU driver.\n", x86_pmu.name); 2200 pr_cont("%s PMU driver.\n", x86_pmu.name);
2065 2201
2066 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 2202 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2105,11 +2241,47 @@ static const struct pmu pmu = {
2105 .unthrottle = x86_pmu_unthrottle, 2241 .unthrottle = x86_pmu_unthrottle,
2106}; 2242};
2107 2243
2244static int
2245validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2246{
2247 struct hw_perf_event fake_event = event->hw;
2248
2249 if (event->pmu && event->pmu != &pmu)
2250 return 0;
2251
2252 return x86_schedule_event(cpuc, &fake_event) >= 0;
2253}
2254
2255static int validate_group(struct perf_event *event)
2256{
2257 struct perf_event *sibling, *leader = event->group_leader;
2258 struct cpu_hw_events fake_pmu;
2259
2260 memset(&fake_pmu, 0, sizeof(fake_pmu));
2261
2262 if (!validate_event(&fake_pmu, leader))
2263 return -ENOSPC;
2264
2265 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2266 if (!validate_event(&fake_pmu, sibling))
2267 return -ENOSPC;
2268 }
2269
2270 if (!validate_event(&fake_pmu, event))
2271 return -ENOSPC;
2272
2273 return 0;
2274}
2275
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 2276const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 2277{
2110 int err; 2278 int err;
2111 2279
2112 err = __hw_perf_event_init(event); 2280 err = __hw_perf_event_init(event);
2281 if (!err) {
2282 if (event->group_leader != event)
2283 err = validate_group(event);
2284 }
2113 if (err) { 2285 if (err) {
2114 if (event->destroy) 2286 if (event->destroy)
2115 event->destroy(event); 2287 event->destroy(event);
@@ -2132,7 +2304,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2132 2304
2133static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 2305static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2134static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 2306static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2135static DEFINE_PER_CPU(int, in_nmi_frame); 2307static DEFINE_PER_CPU(int, in_ignored_frame);
2136 2308
2137 2309
2138static void 2310static void
@@ -2148,8 +2320,9 @@ static void backtrace_warning(void *data, char *msg)
2148 2320
2149static int backtrace_stack(void *data, char *name) 2321static int backtrace_stack(void *data, char *name)
2150{ 2322{
2151 per_cpu(in_nmi_frame, smp_processor_id()) = 2323 per_cpu(in_ignored_frame, smp_processor_id()) =
2152 x86_is_stack_id(NMI_STACK, name); 2324 x86_is_stack_id(NMI_STACK, name) ||
2325 x86_is_stack_id(DEBUG_STACK, name);
2153 2326
2154 return 0; 2327 return 0;
2155} 2328}
@@ -2158,7 +2331,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2158{ 2331{
2159 struct perf_callchain_entry *entry = data; 2332 struct perf_callchain_entry *entry = data;
2160 2333
2161 if (per_cpu(in_nmi_frame, smp_processor_id())) 2334 if (per_cpu(in_ignored_frame, smp_processor_id()))
2162 return; 2335 return;
2163 2336
2164 if (reliable) 2337 if (reliable)
@@ -2170,6 +2343,7 @@ static const struct stacktrace_ops backtrace_ops = {
2170 .warning_symbol = backtrace_warning_symbol, 2343 .warning_symbol = backtrace_warning_symbol,
2171 .stack = backtrace_stack, 2344 .stack = backtrace_stack,
2172 .address = backtrace_address, 2345 .address = backtrace_address,
2346 .walk_stack = print_context_stack_bp,
2173}; 2347};
2174 2348
2175#include "../dumpstack.h" 2349#include "../dumpstack.h"
@@ -2180,7 +2354,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2180 callchain_store(entry, PERF_CONTEXT_KERNEL); 2354 callchain_store(entry, PERF_CONTEXT_KERNEL);
2181 callchain_store(entry, regs->ip); 2355 callchain_store(entry, regs->ip);
2182 2356
2183 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 2357 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2184} 2358}
2185 2359
2186/* 2360/*
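
The validate_event()/validate_group() hunk above dry-runs counter scheduling before a new event may join a group: the leader, every sibling, and the candidate event are each scheduled against a zeroed scratch cpu_hw_events, and the group is rejected with -ENOSPC if any of them fails to claim a counter. A minimal userspace model of that dry run (hypothetical simplified types, not the kernel's):

#include <stdio.h>

#define NUM_COUNTERS 4

/* scratch "fake PMU": just a bitmask of counters already claimed */
struct fake_pmu { unsigned used_mask; };

/* Claim a free counter index allowed by @constraint, or return -1. */
static int schedule_event(struct fake_pmu *pmu, unsigned constraint)
{
	for (int idx = 0; idx < NUM_COUNTERS; idx++) {
		unsigned bit = 1u << idx;
		if ((constraint & bit) && !(pmu->used_mask & bit)) {
			pmu->used_mask |= bit;
			return idx;
		}
	}
	return -1;
}

/* Dry-run a whole group; non-zero means "does not fit" (-ENOSPC). */
static int validate_group(const unsigned *constraints, int n)
{
	struct fake_pmu fake = { 0 };	/* like memset(&fake_pmu, 0, ...) */

	for (int i = 0; i < n; i++)
		if (schedule_event(&fake, constraints[i]) < 0)
			return -1;
	return 0;
}

int main(void)
{
	/* Two events both pinned to counter 0 cannot coexist in a group. */
	unsigned group[] = { 0x1, 0x1 };

	printf("group fits: %s\n", validate_group(group, 2) ? "no" : "yes");
	return 0;
}

The point of the scratch structure is that validation never touches the live per-cpu scheduling state; a rejected group leaves the real PMU untouched.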
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..898df9719afb 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 712 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 713 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 715 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 716 return;
717 wd_ops = &k7_wd_ops; 717 wd_ops = &k7_wd_ops;
718 break; 718 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..83e5e628de73 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 116{
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 119
123 cpu = iminor(file->f_path.dentry->d_inode); 120 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 121 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 122 return -ENXIO; /* No such CPU */
126 goto out; 123
127 }
128 c = &cpu_data(cpu); 124 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 125 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 126 return -EIO; /* CPUID not supported */
131out: 127
132 unlock_kernel(); 128 return 0;
133 return ret;
134} 129}
135 130
136/* 131/*
@@ -192,7 +187,8 @@ static int __init cpuid_init(void)
192 int i, err = 0; 187 int i, err = 0;
193 i = 0; 188 i = 0;
194 189
195 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { 190 if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
191 "cpu/cpuid", &cpuid_fops)) {
196 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", 192 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
197 CPUID_MAJOR); 193 CPUID_MAJOR);
198 err = -EBUSY; 194 err = -EBUSY;
@@ -221,7 +217,7 @@ out_class:
221 } 217 }
222 class_destroy(cpuid_class); 218 class_destroy(cpuid_class);
223out_chrdev: 219out_chrdev:
224 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 220 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
225out: 221out:
226 return err; 222 return err;
227} 223}
@@ -233,7 +229,7 @@ static void __exit cpuid_exit(void)
233 for_each_online_cpu(cpu) 229 for_each_online_cpu(cpu)
234 cpuid_device_destroy(cpu); 230 cpuid_device_destroy(cpu);
235 class_destroy(cpuid_class); 231 class_destroy(cpuid_class);
236 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 232 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
237 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 233 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
238} 234}
239 235
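
The cpuid.c change above does two things: it drops the big kernel lock from open(), and it switches from register_chrdev() to __register_chrdev(CPUID_MAJOR, 0, NR_CPUS, ...), which claims only minors 0..NR_CPUS-1 of the major rather than the whole minor range. A sketch of the narrowed-range idea, using hypothetical helpers rather than the real chardev API:

#include <stdio.h>
#include <stdbool.h>

struct chrdev_region { unsigned major, baseminor, count; };

/* A device node is ours only if its minor is inside the window. */
static bool region_claims(const struct chrdev_region *r,
			  unsigned major, unsigned minor)
{
	return major == r->major &&
	       minor >= r->baseminor && minor < r->baseminor + r->count;
}

int main(void)
{
	/* assumes an 8-CPU box; 203 stands in for the cpuid major */
	struct chrdev_region cpuid = { 203, 0, 8 };

	printf("minor 3 claimed: %d\n", region_claims(&cpuid, 203, 3)); /* 1 */
	printf("minor 9 claimed: %d\n", region_claims(&cpuid, 203, 9)); /* 0 */
	return 0;
}

Minors outside the registered window stay free for other users of the same major, which the old whole-range registration made impossible.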
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..a4849c10a77e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h> 30#include <asm/x86_init.h>
31
32 31
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 33
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
106#endif 105#endif
107 106
108#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
109 pci_iommu_shutdown(); 108 x86_platform.iommu_shutdown();
110#endif 109#endif
111 110
112 crash_save_cpu(regs, safe_smp_processor_id()); 111 crash_save_cpu(regs, safe_smp_processor_id());
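
crash.c above no longer calls pci_iommu_shutdown() directly; it goes through x86_platform.iommu_shutdown(), a per-platform hook that can default to a no-op and be overridden by whichever IOMMU driver actually initialized. An illustrative sketch of that indirection (the names below are made up for the example):

#include <stdio.h>

struct platform_ops {
	void (*iommu_shutdown)(void);
};

static void iommu_shutdown_noop(void) { }	/* safe default */

static void gart_iommu_shutdown(void)
{
	printf("flushing and disabling the GART IOMMU\n");
}

static struct platform_ops x86_platform = {
	.iommu_shutdown = iommu_shutdown_noop,
};

static void machine_crash_shutdown(void)
{
	x86_platform.iommu_shutdown();	/* always safe to call */
}

int main(void)
{
	machine_crash_shutdown();	/* default: does nothing */
	x86_platform.iommu_shutdown = gart_iommu_shutdown;	/* driver init */
	machine_crash_shutdown();	/* platform override runs */
	return 0;
}

The crash path stays generic: it never needs to know which IOMMU, if any, is present.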
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index f7cdb3b457aa..cd97ce18c29d 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -16,6 +16,22 @@ static void *kdump_buf_page;
16/* Stores the physical address of elf header of crash image. */ 16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18 18
19static inline bool is_crashed_pfn_valid(unsigned long pfn)
20{
21#ifndef CONFIG_X86_PAE
22 /*
23 * non-PAE kdump kernel executed from a PAE one will crop high pte
24 * bits and poke unwanted space counting again from address 0, we
25 * don't want that. pte must fit into unsigned long. In fact the
26 * test checks high 12 bits for being zero (pfn will be shifted left
27 * by PAGE_SHIFT).
28 */
29 return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
30#else
31 return true;
32#endif
33}
34
19/** 35/**
20 * copy_oldmem_page - copy one page from "oldmem" 36 * copy_oldmem_page - copy one page from "oldmem"
21 * @pfn: page frame number to be copied 37 * @pfn: page frame number to be copied
@@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
41 if (!csize) 57 if (!csize)
42 return 0; 58 return 0;
43 59
60 if (!is_crashed_pfn_valid(pfn))
61 return -EFAULT;
62
44 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 63 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
45 64
46 if (!userbuf) { 65 if (!userbuf) {
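
The is_crashed_pfn_valid() check above works by round-trip: on a non-PAE kernel a pte is 32 bits wide, so packing a pfn into a pte and extracting it again silently drops any pfn at or above 4 GB, and comparing the result with the original pfn is exactly the validity test. A self-contained model of the same arithmetic:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define PAGE_SHIFT 12

static bool is_crashed_pfn_valid(uint64_t pfn)
{
	/* a 32-bit non-PAE "pte": the shift truncates high bits */
	uint32_t pte = (uint32_t)(pfn << PAGE_SHIFT);

	/* pfn survives the round trip only if its high 12 bits are zero */
	return (uint64_t)(pte >> PAGE_SHIFT) == pfn;
}

int main(void)
{
	printf("%d\n", is_crashed_pfn_valid(0x000fffff));  /* 1: below 4GB */
	printf("%d\n", is_crashed_pfn_valid(0x00100000));  /* 0: at 4GB    */
	return 0;
}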
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ef42a038f1a6..1c47390dd0e5 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -265,13 +265,13 @@ struct ds_context {
265 int cpu; 265 int cpu;
266}; 266};
267 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context); 268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269 269
270 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu) 271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{ 272{
273 struct ds_context **p_context = 273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
277 277
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..6d817554780a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,6 +109,32 @@ print_context_stack(struct thread_info *tinfo,
109 } 109 }
110 return bp; 110 return bp;
111} 111}
112EXPORT_SYMBOL_GPL(print_context_stack);
113
114unsigned long
115print_context_stack_bp(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long bp,
117 const struct stacktrace_ops *ops, void *data,
118 unsigned long *end, int *graph)
119{
120 struct stack_frame *frame = (struct stack_frame *)bp;
121 unsigned long *ret_addr = &frame->return_address;
122
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr;
125
126 if (!__kernel_text_address(addr))
127 break;
128
129 ops->address(data, addr, 1);
130 frame = frame->next_frame;
131 ret_addr = &frame->return_address;
132 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
133 }
134
135 return (unsigned long)frame;
136}
137EXPORT_SYMBOL_GPL(print_context_stack_bp);
112 138
113 139
114static void 140static void
@@ -141,10 +167,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
141} 167}
142 168
143static const struct stacktrace_ops print_trace_ops = { 169static const struct stacktrace_ops print_trace_ops = {
144 .warning = print_trace_warning, 170 .warning = print_trace_warning,
145 .warning_symbol = print_trace_warning_symbol, 171 .warning_symbol = print_trace_warning_symbol,
146 .stack = print_trace_stack, 172 .stack = print_trace_stack,
147 .address = print_trace_address, 173 .address = print_trace_address,
174 .walk_stack = print_context_stack,
148}; 175};
149 176
150void 177void
@@ -188,7 +215,7 @@ void dump_stack(void)
188} 215}
189EXPORT_SYMBOL(dump_stack); 216EXPORT_SYMBOL(dump_stack);
190 217
191static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 218static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
192static int die_owner = -1; 219static int die_owner = -1;
193static unsigned int die_nest_count; 220static unsigned int die_nest_count;
194 221
@@ -207,11 +234,11 @@ unsigned __kprobes long oops_begin(void)
207 /* racy, but better than risking deadlock. */ 234 /* racy, but better than risking deadlock. */
208 raw_local_irq_save(flags); 235 raw_local_irq_save(flags);
209 cpu = smp_processor_id(); 236 cpu = smp_processor_id();
210 if (!__raw_spin_trylock(&die_lock)) { 237 if (!arch_spin_trylock(&die_lock)) {
211 if (cpu == die_owner) 238 if (cpu == die_owner)
212 /* nested oops. should stop eventually */; 239 /* nested oops. should stop eventually */;
213 else 240 else
214 __raw_spin_lock(&die_lock); 241 arch_spin_lock(&die_lock);
215 } 242 }
216 die_nest_count++; 243 die_nest_count++;
217 die_owner = cpu; 244 die_owner = cpu;
@@ -231,7 +258,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
231 die_nest_count--; 258 die_nest_count--;
232 if (!die_nest_count) 259 if (!die_nest_count)
233 /* Nest count reaches zero, release the lock. */ 260 /* Nest count reaches zero, release the lock. */
234 __raw_spin_unlock(&die_lock); 261 arch_spin_unlock(&die_lock);
235 raw_local_irq_restore(flags); 262 raw_local_irq_restore(flags);
236 oops_exit(); 263 oops_exit();
237 264
@@ -268,11 +295,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 295
269 show_registers(regs); 296 show_registers(regs);
270#ifdef CONFIG_X86_32 297#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 298 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 299 sp = regs->sp;
275 ss = regs->ss & 0xffff; 300 ss = regs->ss & 0xffff;
301 } else {
302 sp = kernel_stack_pointer(regs);
303 savesegment(ss, ss);
276 } 304 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 305 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 306 print_symbol("%s", regs->ip);
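
print_context_stack_bp(), added above, is a stricter walker than print_context_stack(): instead of scanning every word on the stack, it follows the frame-pointer chain and reports only return addresses that pass __kernel_text_address(), so every entry it emits is reliable. A minimal model with simplified types:

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;		/* saved bp of the caller */
	unsigned long return_address;
};

static void walk_stack_bp(struct stack_frame *frame)
{
	while (frame && frame->return_address) {
		/* the kernel additionally checks __kernel_text_address() */
		printf("  [<%016lx>]\n", frame->return_address);
		frame = frame->next_frame;
	}
}

int main(void)
{
	struct stack_frame f2 = { NULL, 0xffffffff81000010UL };
	struct stack_frame f1 = { &f2,  0xffffffff81000020UL };

	walk_stack_bp(&f1);	/* prints the two linked return addresses */
	return 0;
}

This is what lets perf callchains skip the "maybe a return address" guesses that the scanning walker produces.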
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 81086c227ab7..4fd1420faffa 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,12 +14,6 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void 17extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 18show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl); 19 unsigned long *stack, unsigned long bp, char *log_lvl);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..ae775ca47b25 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
@@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 35
36 if (!stack) { 36 if (!stack) {
37 unsigned long dummy; 37 unsigned long dummy;
38
38 stack = &dummy; 39 stack = &dummy;
39 if (task && task != current) 40 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 41 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 58
58 context = (struct thread_info *) 59 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 60 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 61 bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 62
63 stack = (unsigned long *)context->previous_esp; 63 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 64 if (!stack)
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace);
72 72
73void 73void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 75 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 76{
77 unsigned long *stack; 77 unsigned long *stack;
78 int i; 78 int i;
@@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip)
156 156
157 return ud2 == 0x0b0f; 157 return ud2 == 0x0b0f;
158} 158}
159
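
With the new walk_stack member in struct stacktrace_ops, dump_trace() above no longer hardcodes print_context_stack(): each caller installs the walker it wants, so the oops path keeps the conservative scanner while perf uses the frame-pointer walker. An illustrative sketch of the indirection:

#include <stdio.h>

struct stacktrace_ops {
	/* walker chosen per caller instead of hardcoded in dump_trace() */
	void (*walk_stack)(const char *who);
};

static void scan_all_words(const char *who)    { printf("%s: scan walker\n", who); }
static void follow_frame_ptrs(const char *who) { printf("%s: bp walker\n", who); }

static void dump_trace(const struct stacktrace_ops *ops, const char *who)
{
	ops->walk_stack(who);
}

int main(void)
{
	struct stacktrace_ops oops_ops = { scan_all_words };
	struct stacktrace_ops perf_ops = { follow_frame_ptrs };

	dump_trace(&oops_ops, "oops");
	dump_trace(&perf_ops, "perf");
	return 0;
}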
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..0ad9597073f5 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,26 +10,28 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33 35
34int x86_is_stack_id(int id, char *name) 36int x86_is_stack_id(int id, char *name)
35{ 37{
@@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name)
37} 39}
38 40
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 42 unsigned *usedp, char **idp)
41{ 43{
42 unsigned k; 44 unsigned k;
43 45
@@ -101,6 +103,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
101 return NULL; 103 return NULL;
102} 104}
103 105
106static inline int
107in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
108 unsigned long *irq_stack_end)
109{
110 return (stack >= irq_stack && stack < irq_stack_end);
111}
112
113/*
 114 * We are returning from the irq stack and going back to the previous one.
115 * If the previous stack is also in the irq stack, then bp in the first
116 * frame of the irq stack points to the previous, interrupted one.
117 * Otherwise we have another level of indirection: We first save
118 * the bp of the previous stack, then we switch the stack to the irq one
119 * and save a new bp that links to the previous one.
120 * (See save_args())
121 */
122static inline unsigned long
123fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
124 unsigned long *irq_stack, unsigned long *irq_stack_end)
125{
126#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp;
128
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end))
130 return (unsigned long)frame->next_frame;
131#endif
132 return bp;
133}
134
104/* 135/*
105 * x86-64 can have up to three kernel stacks: 136 * x86-64 can have up to three kernel stacks:
106 * process stack 137 * process stack
@@ -157,8 +188,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
157 if (ops->stack(data, id) < 0) 188 if (ops->stack(data, id) < 0)
158 break; 189 break;
159 190
160 bp = print_context_stack(tinfo, stack, bp, ops, 191 bp = ops->walk_stack(tinfo, stack, bp, ops,
161 data, estack_end, &graph); 192 data, estack_end, &graph);
162 ops->stack(data, "<EOE>"); 193 ops->stack(data, "<EOE>");
163 /* 194 /*
164 * We link to the next stack via the 195 * We link to the next stack via the
@@ -173,7 +204,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 irq_stack = irq_stack_end - 204 irq_stack = irq_stack_end -
174 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); 205 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
175 206
176 if (stack >= irq_stack && stack < irq_stack_end) { 207 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
177 if (ops->stack(data, "IRQ") < 0) 208 if (ops->stack(data, "IRQ") < 0)
178 break; 209 break;
179 bp = print_context_stack(tinfo, stack, bp, 210 bp = print_context_stack(tinfo, stack, bp,
@@ -184,6 +215,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
184 * pointer (index -1 to end) in the IRQ stack: 215 * pointer (index -1 to end) in the IRQ stack:
185 */ 216 */
186 stack = (unsigned long *) (irq_stack_end[-1]); 217 stack = (unsigned long *) (irq_stack_end[-1]);
218 bp = fixup_bp_irq_link(bp, stack, irq_stack,
219 irq_stack_end);
187 irq_stack_end = NULL; 220 irq_stack_end = NULL;
188 ops->stack(data, "EOI"); 221 ops->stack(data, "EOI");
189 continue; 222 continue;
@@ -202,21 +235,24 @@ EXPORT_SYMBOL(dump_trace);
202 235
203void 236void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 237show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 238 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 239{
240 unsigned long *irq_stack_end;
241 unsigned long *irq_stack;
207 unsigned long *stack; 242 unsigned long *stack;
243 int cpu;
208 int i; 244 int i;
209 const int cpu = smp_processor_id(); 245
210 unsigned long *irq_stack_end = 246 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 247 cpu = smp_processor_id();
212 unsigned long *irq_stack = 248
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 249 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
250 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 251
215 /* 252 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 253 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 254 * back trace for this cpu:
218 */ 255 */
219
220 if (sp == NULL) { 256 if (sp == NULL) {
221 if (task) 257 if (task)
222 sp = (unsigned long *)task->thread.sp; 258 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +276,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 276 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 277 touch_nmi_watchdog();
242 } 278 }
279 preempt_enable();
280
243 printk("\n"); 281 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 282 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 283}
@@ -303,4 +341,3 @@ int is_valid_bugaddr(unsigned long ip)
303 341
304 return ud2 == 0x0b0f; 342 return ud2 == 0x0b0f;
305} 343}
306
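
The fixup_bp_irq_link() helper added above handles the extra indirection described in its comment: when the interrupted stack is not itself the irq stack, the first irq-stack frame is only a link frame whose next_frame holds the real saved bp, so the unwinder must dereference one extra level before leaving the irq stack. A simplified model:

#include <stdio.h>
#include <stdbool.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static unsigned long fixup_bp_irq_link(unsigned long bp, bool prev_in_irq_stack)
{
	struct stack_frame *frame = (struct stack_frame *)bp;

	if (!prev_in_irq_stack)			/* bp points at a link frame */
		return (unsigned long)frame->next_frame;
	return bp;				/* bp already points at a real frame */
}

int main(void)
{
	struct stack_frame task_frame = { NULL, 0x1000 };	/* on task stack */
	struct stack_frame link_frame = { &task_frame, 0 };	/* on irq stack */

	unsigned long bp = fixup_bp_irq_link((unsigned long)&link_frame, false);
	printf("resumes at frame with ra=%#lx\n",
	       ((struct stack_frame *)bp)->return_address);
	return 0;
}

Without this fixup the walker would treat the link frame's zero return address as the end of the trace and the task-stack portion of the callchain would be lost.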
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 230687ba5ba5..a966b753e496 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -732,7 +732,7 @@ core_initcall(e820_mark_nvs_memory);
732/* 732/*
733 * Early reserved memory areas. 733 * Early reserved memory areas.
734 */ 734 */
735#define MAX_EARLY_RES 20 735#define MAX_EARLY_RES 32
736 736
737struct early_res { 737struct early_res {
738 u64 start, end; 738 u64 start, end;
@@ -740,7 +740,16 @@ struct early_res {
740 char overlap_ok; 740 char overlap_ok;
741}; 741};
742static struct early_res early_res[MAX_EARLY_RES] __initdata = { 742static struct early_res early_res[MAX_EARLY_RES] __initdata = {
743 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 743 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
744#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
745 /*
746 * But first pinch a few for the stack/trampoline stuff
747 * FIXME: Don't need the extra page at 4K, but need to fix
748 * trampoline before removing it. (see the GDT stuff)
749 */
750 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
751#endif
752
744 {} 753 {}
745}; 754};
746 755
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index ad5bd988fb79..cdcfb122f256 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -454,8 +454,10 @@ void __init efi_init(void)
454 if (add_efi_memmap) 454 if (add_efi_memmap)
455 do_add_efi_memmap(); 455 do_add_efi_memmap();
456 456
457#ifdef CONFIG_X86_32
457 x86_platform.get_wallclock = efi_get_time; 458 x86_platform.get_wallclock = efi_get_time;
458 x86_platform.set_wallclock = efi_set_rtc_mmss; 459 x86_platform.set_wallclock = efi_set_rtc_mmss;
460#endif
459 461
460 /* Setup for EFI runtime service */ 462 /* Setup for EFI runtime service */
461 reboot_type = BOOT_EFI; 463 reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..44a8e0dc6737 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,26 +717,69 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
711 */ 727 */
712#define PTREGSCALL(name) \ 728#define PTREGSCALL0(name) \
713 ALIGN; \ 729 ALIGN; \
714ptregs_##name: \ 730ptregs_##name: \
715 leal 4(%esp),%eax; \ 731 leal 4(%esp),%eax; \
716 jmp sys_##name; 732 jmp sys_##name;
717 733
718PTREGSCALL(iopl) 734#define PTREGSCALL1(name) \
719PTREGSCALL(fork) 735 ALIGN; \
720PTREGSCALL(clone) 736ptregs_##name: \
721PTREGSCALL(vfork) 737 leal 4(%esp),%edx; \
722PTREGSCALL(execve) 738 movl (PT_EBX+4)(%esp),%eax; \
723PTREGSCALL(sigaltstack) 739 jmp sys_##name;
724PTREGSCALL(sigreturn) 740
725PTREGSCALL(rt_sigreturn) 741#define PTREGSCALL2(name) \
726PTREGSCALL(vm86) 742 ALIGN; \
727PTREGSCALL(vm86old) 743ptregs_##name: \
744 leal 4(%esp),%ecx; \
745 movl (PT_ECX+4)(%esp),%edx; \
746 movl (PT_EBX+4)(%esp),%eax; \
747 jmp sys_##name;
748
749#define PTREGSCALL3(name) \
750 ALIGN; \
751ptregs_##name: \
752 leal 4(%esp),%eax; \
753 pushl %eax; \
754 movl PT_EDX(%eax),%ecx; \
755 movl PT_ECX(%eax),%edx; \
756 movl PT_EBX(%eax),%eax; \
757 call sys_##name; \
758 addl $4,%esp; \
759 ret
760
761PTREGSCALL1(iopl)
762PTREGSCALL0(fork)
763PTREGSCALL0(vfork)
764PTREGSCALL3(execve)
765PTREGSCALL2(sigaltstack)
766PTREGSCALL0(sigreturn)
767PTREGSCALL0(rt_sigreturn)
768PTREGSCALL2(vm86)
769PTREGSCALL1(vm86old)
770
771/* Clone is an oddball. The 4th arg is in %edi */
772 ALIGN;
773ptregs_clone:
774 leal 4(%esp),%eax
775 pushl %eax
776 pushl PT_EDI(%eax)
777 movl PT_EDX(%eax),%ecx
778 movl PT_ECX(%eax),%edx
779 movl PT_EBX(%eax),%eax
780 call sys_clone
781 addl $8,%esp
782 ret
728 783
729.macro FIXUP_ESPFIX_STACK 784.macro FIXUP_ESPFIX_STACK
730/* 785/*
@@ -814,6 +869,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 869ENDPROC(common_interrupt)
815 CFI_ENDPROC 870 CFI_ENDPROC
816 871
872/*
873 * Irq entries should be protected against kprobes
874 */
875 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 876#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 877ENTRY(name) \
819 RING0_INT_FRAME; \ 878 RING0_INT_FRAME; \
@@ -980,16 +1039,16 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1039 jmp error_code
981 CFI_ENDPROC 1040 CFI_ENDPROC
982END(spurious_interrupt_bug) 1041END(spurious_interrupt_bug)
1042/*
1043 * End of kprobes section
1044 */
1045 .popsection
983 1046
984ENTRY(kernel_thread_helper) 1047ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1048 pushl $0 # fake return address for unwinder
986 CFI_STARTPROC 1049 CFI_STARTPROC
987 movl %edx,%eax 1050 movl %edi,%eax
988 push %edx 1051 call *%esi
989 CFI_ADJUST_CFA_OFFSET 4
990 call *%ebx
991 push %eax
992 CFI_ADJUST_CFA_OFFSET 4
993 call do_exit 1052 call do_exit
994 ud2 # padding for call trace 1053 ud2 # padding for call trace
995 CFI_ENDPROC 1054 CFI_ENDPROC
@@ -1185,17 +1244,14 @@ END(ftrace_graph_caller)
1185 1244
1186.globl return_to_handler 1245.globl return_to_handler
1187return_to_handler: 1246return_to_handler:
1188 pushl $0
1189 pushl %eax 1247 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1248 pushl %edx
1192 movl %ebp, %eax 1249 movl %ebp, %eax
1193 call ftrace_return_to_handler 1250 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1251 movl %eax, %ecx
1195 popl %edx 1252 popl %edx
1196 popl %ecx
1197 popl %eax 1253 popl %eax
1198 ret 1254 jmp *%ecx
1199#endif 1255#endif
1200 1256
1201.section .rodata,"a" 1257.section .rodata,"a"
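
The PTREGSCALL0..3 macros above replace the single old PTREGSCALL stub, which only passed a pt_regs pointer. The arity-specific stubs additionally reload the saved user registers (%ebx, %ecx, %edx, and %edi for clone) into the argument slots the kernel's regparm(3) calling convention expects, appending the pt_regs pointer as the final argument. A userspace model of an arity-1 stub, with hypothetical simplified types:

#include <stdio.h>

/* saved user registers; the real struct pt_regs has many more fields */
struct pt_regs { unsigned long bx, cx, dx, di; };

static long sys_iopl(unsigned long level, struct pt_regs *regs)
{
	printf("iopl(level=%lu), frame=%p\n", level, (void *)regs);
	return 0;
}

/* arity-1 stub: the first syscall argument lives in the saved %ebx slot */
static long ptregs_iopl(struct pt_regs *regs)
{
	return sys_iopl(regs->bx, regs);
}

int main(void)
{
	struct pt_regs regs = { .bx = 3 };	/* user asked for IOPL 3 */

	return (int)ptregs_iopl(&regs);
}

clone gets open-coded rather than macro-generated because its fourth argument sits in %edi and therefore must travel on the stack alongside the pt_regs pointer.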
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f8f358..0697ff139837 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
969#endif 977#endif
970apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
971 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
972apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
973 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
974 982
975#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
976apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1068,10 +1076,10 @@ ENTRY(\sym)
1068 TRACE_IRQS_OFF 1076 TRACE_IRQS_OFF
1069 movq %rsp,%rdi /* pt_regs pointer */ 1077 movq %rsp,%rdi /* pt_regs pointer */
1070 xorl %esi,%esi /* no error code */ 1078 xorl %esi,%esi /* no error code */
1071 PER_CPU(init_tss, %rbp) 1079 PER_CPU(init_tss, %r12)
1072 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1073 call \do_sym 1081 call \do_sym
1074 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1075 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1076 CFI_ENDPROC 1084 CFI_ENDPROC
1077END(\sym) 1085END(\sym)
@@ -1158,63 +1166,20 @@ bad_gs:
1158 jmp 2b 1166 jmp 2b
1159 .previous 1167 .previous
1160 1168
1161/* 1169ENTRY(kernel_thread_helper)
1162 * Create a kernel thread.
1163 *
1164 * C extern interface:
1165 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1166 *
1167 * asm input arguments:
1168 * rdi: fn, rsi: arg, rdx: flags
1169 */
1170ENTRY(kernel_thread)
1171 CFI_STARTPROC
1172 FAKE_STACK_FRAME $child_rip
1173 SAVE_ALL
1174
1175 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1176 movq %rdx,%rdi
1177 orq kernel_thread_flags(%rip),%rdi
1178 movq $-1, %rsi
1179 movq %rsp, %rdx
1180
1181 xorl %r8d,%r8d
1182 xorl %r9d,%r9d
1183
1184 # clone now
1185 call do_fork
1186 movq %rax,RAX(%rsp)
1187 xorl %edi,%edi
1188
1189 /*
1190 * It isn't worth to check for reschedule here,
1191 * so internally to the x86_64 port you can rely on kernel_thread()
1192 * not to reschedule the child before returning, this avoids the need
1193 * of hacks for example to fork off the per-CPU idle tasks.
1194 * [Hopefully no generic code relies on the reschedule -AK]
1195 */
1196 RESTORE_ALL
1197 UNFAKE_STACK_FRAME
1198 ret
1199 CFI_ENDPROC
1200END(kernel_thread)
1201
1202ENTRY(child_rip)
1203 pushq $0 # fake return address 1170 pushq $0 # fake return address
1204 CFI_STARTPROC 1171 CFI_STARTPROC
1205 /* 1172 /*
1206 * Here we are in the child and the registers are set as they were 1173 * Here we are in the child and the registers are set as they were
1207 * at kernel_thread() invocation in the parent. 1174 * at kernel_thread() invocation in the parent.
1208 */ 1175 */
1209 movq %rdi, %rax 1176 call *%rsi
1210 movq %rsi, %rdi
1211 call *%rax
1212 # exit 1177 # exit
1213 mov %eax, %edi 1178 mov %eax, %edi
1214 call do_exit 1179 call do_exit
1215 ud2 # padding for call trace 1180 ud2 # padding for call trace
1216 CFI_ENDPROC 1181 CFI_ENDPROC
1217END(child_rip) 1182END(kernel_thread_helper)
1218 1183
1219/* 1184/*
1220 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1491,12 +1456,17 @@ error_kernelspace:
1491 leaq irq_return(%rip),%rcx 1456 leaq irq_return(%rip),%rcx
1492 cmpq %rcx,RIP+8(%rsp) 1457 cmpq %rcx,RIP+8(%rsp)
1493 je error_swapgs 1458 je error_swapgs
1494 movl %ecx,%ecx /* zero extend */ 1459 movl %ecx,%eax /* zero extend */
1495 cmpq %rcx,RIP+8(%rsp) 1460 cmpq %rax,RIP+8(%rsp)
1496 je error_swapgs 1461 je bstep_iret
1497 cmpq $gs_change,RIP+8(%rsp) 1462 cmpq $gs_change,RIP+8(%rsp)
1498 je error_swapgs 1463 je error_swapgs
1499 jmp error_sti 1464 jmp error_sti
1465
1466bstep_iret:
1467 /* Fix truncated RIP */
1468 movq %rcx,RIP+8(%rsp)
1469 jmp error_swapgs
1500END(error_entry) 1470END(error_entry)
1501 1471
1502 1472
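
The error_entry change above appears to handle a debug trap landing on the sysret return path, where the reported RIP can come back zero-extended to 32 bits: when the saved RIP matches the zero-extended %ecx, the new bstep_iret label rewrites it from the full %rcx before taking the swapgs exit. The same comparison in C terms (a model, not the kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rcx = 0xffffffff81234567ULL;	/* real return address */
	uint64_t saved_rip = (uint32_t)rcx;	/* what the trap reported */

	if (saved_rip == (uint32_t)rcx)		/* movl %ecx,%eax; cmpq */
		saved_rip = rcx;		/* bstep_iret: fix truncated RIP */

	printf("resume at %#llx\n", (unsigned long long)saved_rip);
	return 0;
}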
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 944e9820b4b5..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -353,15 +355,15 @@ int __init ftrace_dyn_arch_init(void *data)
353 355
354 switch (faulted) { 356 switch (faulted) {
355 case 0: 357 case 0:
356 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 358 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
357 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 359 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
358 break; 360 break;
359 case 1: 361 case 1:
360 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 362 pr_info("converting mcount calls to 66 66 66 66 90\n");
361 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 363 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
362 break; 364 break;
363 case 2: 365 case 2:
364 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 366 pr_info("converting mcount calls to jmp . + 5\n");
365 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 367 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
366 break; 368 break;
367 } 369 }
@@ -485,82 +487,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
485 487
486#ifdef CONFIG_FTRACE_SYSCALLS 488#ifdef CONFIG_FTRACE_SYSCALLS
487 489
488extern unsigned long __start_syscalls_metadata[];
489extern unsigned long __stop_syscalls_metadata[];
490extern unsigned long *sys_call_table; 490extern unsigned long *sys_call_table;
491 491
492static struct syscall_metadata **syscalls_metadata; 492unsigned long __init arch_syscall_addr(int nr)
493
494static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
495{
496 struct syscall_metadata *start;
497 struct syscall_metadata *stop;
498 char str[KSYM_SYMBOL_LEN];
499
500
501 start = (struct syscall_metadata *)__start_syscalls_metadata;
502 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
503 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
504
505 for ( ; start < stop; start++) {
506 if (start->name && !strcmp(start->name, str))
507 return start;
508 }
509 return NULL;
510}
511
512struct syscall_metadata *syscall_nr_to_meta(int nr)
513{
514 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
515 return NULL;
516
517 return syscalls_metadata[nr];
518}
519
520int syscall_name_to_nr(char *name)
521{ 493{
522 int i; 494 return (unsigned long)(&sys_call_table)[nr];
523
524 if (!syscalls_metadata)
525 return -1;
526
527 for (i = 0; i < NR_syscalls; i++) {
528 if (syscalls_metadata[i]) {
529 if (!strcmp(syscalls_metadata[i]->name, name))
530 return i;
531 }
532 }
533 return -1;
534}
535
536void set_syscall_enter_id(int num, int id)
537{
538 syscalls_metadata[num]->enter_id = id;
539}
540
541void set_syscall_exit_id(int num, int id)
542{
543 syscalls_metadata[num]->exit_id = id;
544}
545
546static int __init arch_init_ftrace_syscalls(void)
547{
548 int i;
549 struct syscall_metadata *meta;
550 unsigned long **psys_syscall_table = &sys_call_table;
551
552 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
553 NR_syscalls, GFP_KERNEL);
554 if (!syscalls_metadata) {
555 WARN_ON(1);
556 return -ENOMEM;
557 }
558
559 for (i = 0; i < NR_syscalls; i++) {
560 meta = find_syscall_meta(psys_syscall_table[i]);
561 syscalls_metadata[i] = meta;
562 }
563 return 0;
564} 495}
565arch_initcall(arch_init_ftrace_syscalls);
566#endif 496#endif
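
The ftrace rework above deletes the per-arch name-matching machinery (find_syscall_meta() and friends move to generic code); the architecture now only has to provide arch_syscall_addr(), and on x86 the address of syscall nr is simply the nr-th entry of sys_call_table. The same lookup, modeled in userspace:

#include <stdio.h>

static long sys_read(void)  { return 0; }
static long sys_write(void) { return 1; }

/* flat dispatch table, indexed directly by syscall number */
static long (*sys_call_table[])(void) = { sys_read, sys_write };

static unsigned long arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

int main(void)
{
	printf("syscall 1 lives at %#lx\n", arch_syscall_addr(1));
	return 0;
}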
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public License
8 * as published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/ioport.h>
14#include <linux/io.h>
15#include <asm/msr.h>
16#include <asm/geode.h>
17
18static struct {
19 char *name;
20 u32 msr;
21 int size;
22 u32 base;
23} lbars[] = {
24 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
25 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
26 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
27 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
28};
29
30static void __init init_lbars(void)
31{
32 u32 lo, hi;
33 int i;
34
35 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
36 rdmsr(lbars[i].msr, lo, hi);
37 if (hi & 0x01)
38 lbars[i].base = lo & 0x0000ffff;
39
40 if (lbars[i].base == 0)
41 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
42 lbars[i].name);
43 }
44}
45
46int geode_get_dev_base(unsigned int dev)
47{
48 BUG_ON(dev >= ARRAY_SIZE(lbars));
49 return lbars[dev].base;
50}
51EXPORT_SYMBOL_GPL(geode_get_dev_base);
52
53/* === GPIO API === */
54
55void geode_gpio_set(u32 gpio, unsigned int reg)
56{
57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
58
59 if (!base)
60 return;
61
62 /* low bank register */
63 if (gpio & 0xFFFF)
64 outl(gpio & 0xFFFF, base + reg);
65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
69}
70EXPORT_SYMBOL_GPL(geode_gpio_set);
71
72void geode_gpio_clear(u32 gpio, unsigned int reg)
73{
74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
75
76 if (!base)
77 return;
78
79 /* low bank register */
80 if (gpio & 0xFFFF)
81 outl((gpio & 0xFFFF) << 16, base + reg);
82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
86}
87EXPORT_SYMBOL_GPL(geode_gpio_clear);
88
89int geode_gpio_isset(u32 gpio, unsigned int reg)
90{
91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
93
94 if (!base)
95 return 0;
96
97 /* low bank register */
98 if (gpio & 0xFFFF) {
99 val = inl(base + reg) & (gpio & 0xFFFF);
100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
111}
112EXPORT_SYMBOL_GPL(geode_gpio_isset);
113
114void geode_gpio_set_irq(unsigned int group, unsigned int irq)
115{
116 u32 lo, hi;
117
118 if (group > 7 || irq > 15)
119 return;
120
121 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
122
123 lo &= ~(0xF << (group * 4));
124 lo |= (irq & 0xF) << (group * 4);
125
126 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
127}
128EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
129
130void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
131{
132 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
133 u32 offset, shift, val;
134
135 if (gpio >= 24)
136 offset = GPIO_MAP_W;
137 else if (gpio >= 16)
138 offset = GPIO_MAP_Z;
139 else if (gpio >= 8)
140 offset = GPIO_MAP_Y;
141 else
142 offset = GPIO_MAP_X;
143
144 shift = (gpio % 8) * 4;
145
146 val = inl(base + offset);
147
148 /* Clear whatever was there before */
149 val &= ~(0xF << shift);
150
151 /* And set the new value */
152
153 val |= ((pair & 7) << shift);
154
155 /* Set the PME bit if this is a PME event */
156
157 if (pme)
158 val |= (1 << (shift + 3));
159
160 outl(val, base + offset);
161}
162EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
163
164int geode_has_vsa2(void)
165{
166 static int has_vsa2 = -1;
167
168 if (has_vsa2 == -1) {
169 u16 val;
170
171 /*
172 * The VSA has virtual registers that we can query for a
173 * signature.
174 */
175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
177
178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
180 }
181
182 return has_vsa2;
183}
184EXPORT_SYMBOL_GPL(geode_has_vsa2);
185
186static int __init geode_southbridge_init(void)
187{
188 if (!is_geode())
189 return -ENODEV;
190
191 init_lbars();
192 (void) mfgpt_timer_setup();
193 return 0;
194}
195
196postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 4f8e2507e8f3..5051b94c9069 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,8 +29,6 @@ static void __init i386_default_early_setup(void)
29 29
30void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
31{ 31{
32 reserve_trampoline_memory();
33
34 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
35 33
36#ifdef CONFIG_BLK_DEV_INITRD 34#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd778fd9..b5a9896ca1e7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_trampoline_memory();
102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 102
105#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b55ee4ff509f..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..ad80a1c718c6 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */
37u8 hpet_msi_disable;
38
36#ifdef CONFIG_PCI_MSI 39#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 40static unsigned long hpet_num_timers;
38#endif 41#endif
@@ -47,12 +50,12 @@ struct hpet_dev {
47 char name[10]; 50 char name[10];
48}; 51};
49 52
50unsigned long hpet_readl(unsigned long a) 53inline unsigned int hpet_readl(unsigned int a)
51{ 54{
52 return readl(hpet_virt_address + a); 55 return readl(hpet_virt_address + a);
53} 56}
54 57
55static inline void hpet_writel(unsigned long d, unsigned long a) 58static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 59{
57 writel(d, hpet_virt_address + a); 60 writel(d, hpet_virt_address + a);
58} 61}
@@ -167,7 +170,7 @@ do { \
167 170
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 171static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 172
170static void hpet_reserve_platform_timers(unsigned long id) 173static void hpet_reserve_platform_timers(unsigned int id)
171{ 174{
172 struct hpet __iomem *hpet = hpet_virt_address; 175 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 176 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +208,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 208
206} 209}
207#else 210#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 211static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 212#endif
210 213
211/* 214/*
@@ -246,7 +249,7 @@ static void hpet_reset_counter(void)
246 249
247static void hpet_start_counter(void) 250static void hpet_start_counter(void)
248{ 251{
249 unsigned long cfg = hpet_readl(HPET_CFG); 252 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 253 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 254 hpet_writel(cfg, HPET_CFG);
252} 255}
@@ -271,7 +274,7 @@ static void hpet_resume_counter(void)
271 274
272static void hpet_enable_legacy_int(void) 275static void hpet_enable_legacy_int(void)
273{ 276{
274 unsigned long cfg = hpet_readl(HPET_CFG); 277 unsigned int cfg = hpet_readl(HPET_CFG);
275 278
276 cfg |= HPET_CFG_LEGACY; 279 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 280 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +317,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 317static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 318 struct clock_event_device *evt, int timer)
316{ 319{
317 unsigned long cfg, cmp, now; 320 unsigned int cfg, cmp, now;
318 uint64_t delta; 321 uint64_t delta;
319 322
320 switch (mode) { 323 switch (mode) {
@@ -323,7 +326,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 326 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 327 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 328 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 329 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 330 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 331 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 332 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +342,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 342 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 343 * Publication # 24674)
341 */ 344 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 345 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 346 hpet_start_counter();
344 hpet_print_config(); 347 hpet_print_config();
345 break; 348 break;
@@ -383,13 +386,24 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 386 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 387
385 /* 388 /*
386 * We need to read back the CMP register to make sure that 389 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 390 * implementations (ATI chipsets) which seem to delay the
388 * counter. 391 * transfer of the compare register into the internal compare
392 * logic. With small deltas this might actually be too late as
393 * the counter could already be higher than the compare value
394 * at that point and we would wait for the next hpet interrupt
395 * forever. We found out that reading the CMP register back
396 * forces the transfer so we can rely on the comparison with
397 * the counter register below. If the read back from the
398 * compare register does not match the value we programmed
 399 * then we might have a real hardware problem. We cannot do
400 * much about it here, but at least alert the user/admin with
401 * a prominent warning.
389 */ 402 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 403 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
404 KERN_WARNING "hpet: compare register read back failed.\n");
391 405
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 406 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 407}
394 408
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 409static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +429,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 429void hpet_msi_unmask(unsigned int irq)
416{ 430{
417 struct hpet_dev *hdev = get_irq_data(irq); 431 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 432 unsigned int cfg;
419 433
420 /* unmask it */ 434 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 435 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +439,7 @@ void hpet_msi_unmask(unsigned int irq)
425 439
426void hpet_msi_mask(unsigned int irq) 440void hpet_msi_mask(unsigned int irq)
427{ 441{
428 unsigned long cfg; 442 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 443 struct hpet_dev *hdev = get_irq_data(irq);
430 444
431 /* mask it */ 445 /* mask it */
@@ -467,7 +481,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 481
468static int hpet_setup_msi_irq(unsigned int irq) 482static int hpet_setup_msi_irq(unsigned int irq)
469{ 483{
470 if (arch_setup_hpet_msi(irq)) { 484 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 485 destroy_irq(irq);
472 return -EINVAL; 486 return -EINVAL;
473 } 487 }
@@ -584,6 +598,11 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 598 unsigned int num_timers_used = 0;
585 int i; 599 int i;
586 600
601 if (hpet_msi_disable)
602 return;
603
604 if (boot_cpu_has(X86_FEATURE_ARAT))
605 return;
587 id = hpet_readl(HPET_ID); 606 id = hpet_readl(HPET_ID);
588 607
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 608 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +617,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 617
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 618 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 619 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 620 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 621
603 /* Only consider HPET timer with MSI support */ 622 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 623 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +832,7 @@ static int hpet_clocksource_register(void)
813 */ 832 */
814int __init hpet_enable(void) 833int __init hpet_enable(void)
815{ 834{
816 unsigned long id; 835 unsigned int id;
817 int i; 836 int i;
818 837
819 if (!is_hpet_capable()) 838 if (!is_hpet_capable())
@@ -872,10 +891,8 @@ int __init hpet_enable(void)
872 891
873 if (id & HPET_ID_LEGSUP) { 892 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 893 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 894 return 1;
877 } 895 }
878 hpet_msi_capability_lookup(0);
879 return 0; 896 return 0;
880 897
881out_nohpet: 898out_nohpet:
@@ -908,9 +925,20 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 925 if (!hpet_virt_address)
909 return -ENODEV; 926 return -ENODEV;
910 927
928 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
929 hpet_msi_capability_lookup(2);
930 else
931 hpet_msi_capability_lookup(0);
932
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 933 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 934 hpet_print_config();
913 935
936 if (hpet_msi_disable)
937 return 0;
938
939 if (boot_cpu_has(X86_FEATURE_ARAT))
940 return 0;
941
914 for_each_online_cpu(cpu) { 942 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 943 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 944 }
@@ -925,7 +953,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 953void hpet_disable(void)
926{ 954{
927 if (is_hpet_capable()) { 955 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 956 unsigned int cfg = hpet_readl(HPET_CFG);
929 957
930 if (hpet_legacy_int_enabled) { 958 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 959 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +993,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 993static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 994static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 995static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 996static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 997static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 998static unsigned long hpet_pie_limit;
971 999
972static rtc_irq_handler irq_handler; 1000static rtc_irq_handler irq_handler;
@@ -1017,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1045 */
1018int hpet_rtc_timer_init(void) 1046int hpet_rtc_timer_init(void)
1019{ 1047{
1020 unsigned long cfg, cnt, delta, flags; 1048 unsigned int cfg, cnt, delta;
1049 unsigned long flags;
1021 1050
1022 if (!is_hpet_enabled()) 1051 if (!is_hpet_enabled())
1023 return 0; 1052 return 0;
@@ -1027,7 +1056,7 @@ int hpet_rtc_timer_init(void)
1027 1056
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1057 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1058 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1059 hpet_default_delta = clc;
1031 } 1060 }
1032 1061
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1062 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1142,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1142 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1143 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1144 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1145 hpet_pie_delta = clc;
1117 } 1146 }
1118 return 1; 1147 return 1;
1119} 1148}
@@ -1127,7 +1156,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1156
1128static void hpet_rtc_timer_reinit(void) 1157static void hpet_rtc_timer_reinit(void)
1129{ 1158{
1130 unsigned long cfg, delta; 1159 unsigned int cfg, delta;
1131 int lost_ints = -1; 1160 int lost_ints = -1;
1132 1161
1133 if (unlikely(!hpet_rtc_flags)) { 1162 if (unlikely(!hpet_rtc_flags)) {
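The hpet.c hunks above narrow the register-value variables from unsigned long to unsigned int: HPET registers are architecturally 32 bits wide, so hpet_readl() traffics in u32 and the explicit (u32) cast in the expiry check becomes redundant. The check itself relies on wraparound-safe modular arithmetic. A minimal user-space sketch of that idiom (counter_passed() is a hypothetical stand-in for the in-kernel comparison, not part of this patch):

#include <stdint.h>
#include <stdio.h>

/*
 * Wraparound-safe "has the 32-bit HPET counter reached cnt yet?" test,
 * the same idiom as the comparison in the first hunk above: subtract
 * in unsigned 32-bit arithmetic, then reinterpret the result as
 * signed. This stays correct when the counter wraps past 2^32.
 */
static int counter_passed(uint32_t counter, uint32_t cnt)
{
	return (int32_t)(counter - cnt) >= 0;
}

int main(void)
{
	uint32_t counter = 0xfffffff0u;    /* just short of the wrap  */
	uint32_t cnt     = counter + 0x20; /* comparator past the wrap */

	printf("%d\n", counter_passed(counter, cnt)); /* 0: not reached */
	printf("%d\n", counter_passed(cnt + 5, cnt)); /* 1: reached     */
	return 0;
}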
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..05d5fec64a94
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,554 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register for each cpus
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type)
236{
237 /* Len */
238 switch (x86_len) {
239 case X86_BREAKPOINT_LEN_1:
240 *gen_len = HW_BREAKPOINT_LEN_1;
241 break;
242 case X86_BREAKPOINT_LEN_2:
243 *gen_len = HW_BREAKPOINT_LEN_2;
244 break;
245 case X86_BREAKPOINT_LEN_4:
246 *gen_len = HW_BREAKPOINT_LEN_4;
247 break;
248#ifdef CONFIG_X86_64
249 case X86_BREAKPOINT_LEN_8:
250 *gen_len = HW_BREAKPOINT_LEN_8;
251 break;
252#endif
253 default:
254 return -EINVAL;
255 }
256
257 /* Type */
258 switch (x86_type) {
259 case X86_BREAKPOINT_EXECUTE:
260 *gen_type = HW_BREAKPOINT_X;
261 break;
262 case X86_BREAKPOINT_WRITE:
263 *gen_type = HW_BREAKPOINT_W;
264 break;
265 case X86_BREAKPOINT_RW:
266 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
267 break;
268 default:
269 return -EINVAL;
270 }
271
272 return 0;
273}
274
275
276static int arch_build_bp_info(struct perf_event *bp)
277{
278 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
279
280 info->address = bp->attr.bp_addr;
281
282 /* Len */
283 switch (bp->attr.bp_len) {
284 case HW_BREAKPOINT_LEN_1:
285 info->len = X86_BREAKPOINT_LEN_1;
286 break;
287 case HW_BREAKPOINT_LEN_2:
288 info->len = X86_BREAKPOINT_LEN_2;
289 break;
290 case HW_BREAKPOINT_LEN_4:
291 info->len = X86_BREAKPOINT_LEN_4;
292 break;
293#ifdef CONFIG_X86_64
294 case HW_BREAKPOINT_LEN_8:
295 info->len = X86_BREAKPOINT_LEN_8;
296 break;
297#endif
298 default:
299 return -EINVAL;
300 }
301
302 /* Type */
303 switch (bp->attr.bp_type) {
304 case HW_BREAKPOINT_W:
305 info->type = X86_BREAKPOINT_WRITE;
306 break;
307 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
308 info->type = X86_BREAKPOINT_RW;
309 break;
310 case HW_BREAKPOINT_X:
311 info->type = X86_BREAKPOINT_EXECUTE;
312 break;
313 default:
314 return -EINVAL;
315 }
316
317 return 0;
318}
319/*
320 * Validate the arch-specific HW Breakpoint register settings
321 */
322int arch_validate_hwbkpt_settings(struct perf_event *bp,
323 struct task_struct *tsk)
324{
325 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
326 unsigned int align;
327 int ret;
328
329
330 ret = arch_build_bp_info(bp);
331 if (ret)
332 return ret;
333
334 ret = -EINVAL;
335
336 if (info->type == X86_BREAKPOINT_EXECUTE)
337 /*
338 * Ptrace-refactoring code
339 * For now, we'll allow instruction breakpoint only for user-space
340 * addresses
341 */
342 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
343 info->len != X86_BREAKPOINT_EXECUTE)
344 return ret;
345
346 switch (info->len) {
347 case X86_BREAKPOINT_LEN_1:
348 align = 0;
349 break;
350 case X86_BREAKPOINT_LEN_2:
351 align = 1;
352 break;
353 case X86_BREAKPOINT_LEN_4:
354 align = 3;
355 break;
356#ifdef CONFIG_X86_64
357 case X86_BREAKPOINT_LEN_8:
358 align = 7;
359 break;
360#endif
361 default:
362 return ret;
363 }
364
365 ret = arch_store_info(bp);
366
367 if (ret < 0)
368 return ret;
369 /*
370 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len.
372 */
373 if (info->address & align)
374 return -EINVAL;
375
376 /* Check that the virtual address is in the proper range */
377 if (tsk) {
378 if (!arch_check_va_in_userspace(info->address, info->len))
379 return -EFAULT;
380 } else {
381 if (!arch_check_va_in_kernelspace(info->address, info->len))
382 return -EFAULT;
383 }
384
385 return 0;
386}
387
388/*
389 * Dump the debug register contents to the user.
390 * We can't dump our per-cpu values because they
391 * may contain a cpu-wide breakpoint, something that
392 * doesn't belong to the current task.
393 *
394 * TODO: include non-ptrace user breakpoints (perf)
395 */
396void aout_dump_debugregs(struct user *dump)
397{
398 int i;
399 int dr7 = 0;
400 struct perf_event *bp;
401 struct arch_hw_breakpoint *info;
402 struct thread_struct *thread = &current->thread;
403
404 for (i = 0; i < HBP_NUM; i++) {
405 bp = thread->ptrace_bps[i];
406
407 if (bp && !bp->attr.disabled) {
408 dump->u_debugreg[i] = bp->attr.bp_addr;
409 info = counter_arch_bp(bp);
410 dr7 |= encode_dr7(i, info->len, info->type);
411 } else {
412 dump->u_debugreg[i] = 0;
413 }
414 }
415
416 dump->u_debugreg[4] = 0;
417 dump->u_debugreg[5] = 0;
418 dump->u_debugreg[6] = current->thread.debugreg6;
419
420 dump->u_debugreg[7] = dr7;
421}
422EXPORT_SYMBOL_GPL(aout_dump_debugregs);
423
424/*
425 * Release the user breakpoints used by ptrace
426 */
427void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
428{
429 int i;
430 struct thread_struct *t = &tsk->thread;
431
432 for (i = 0; i < HBP_NUM; i++) {
433 unregister_hw_breakpoint(t->ptrace_bps[i]);
434 t->ptrace_bps[i] = NULL;
435 }
436}
437
438void hw_breakpoint_restore(void)
439{
440 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
441 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
442 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
443 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
444 set_debugreg(current->thread.debugreg6, 6);
445 set_debugreg(__get_cpu_var(cpu_dr7), 7);
446}
447EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
448
449/*
450 * Handle debug exception notifications.
451 *
452 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
453 *
454 * NOTIFY_DONE returned if one of the following conditions is true.
455 * i) When the causative address is from user-space and the exception
456 * is a valid one, i.e. not triggered as a result of lazy debug register
457 * switching
458 * ii) When more bits than trap<n> are set in the DR6 register (such
459 * as BD, BS or BT) indicating that more than one debug condition is
460 * met and requires some more action in do_debug().
461 *
462 * NOTIFY_STOP returned for all other cases
463 *
464 */
465static int __kprobes hw_breakpoint_handler(struct die_args *args)
466{
467 int i, cpu, rc = NOTIFY_STOP;
468 struct perf_event *bp;
469 unsigned long dr7, dr6;
470 unsigned long *dr6_p;
471
472 /* The DR6 value is pointed to by args->err */
473 dr6_p = (unsigned long *)ERR_PTR(args->err);
474 dr6 = *dr6_p;
475
476 /* Do an early return if no trap bits are set in DR6 */
477 if ((dr6 & DR_TRAP_BITS) == 0)
478 return NOTIFY_DONE;
479
480 get_debugreg(dr7, 7);
481 /* Disable breakpoints during exception handling */
482 set_debugreg(0UL, 7);
483 /*
484 * Assert that local interrupts are disabled
485 * Reset the DRn bits in the virtualized register value.
486 * The ptrace trigger routine will add in whatever is needed.
487 */
488 current->thread.debugreg6 &= ~DR_TRAP_BITS;
489 cpu = get_cpu();
490
491 /* Handle all the breakpoints that were triggered */
492 for (i = 0; i < HBP_NUM; ++i) {
493 if (likely(!(dr6 & (DR_TRAP0 << i))))
494 continue;
495
496 /*
497 * The counter may be concurrently released but that can only
498 * occur from a call_rcu() path. We can then safely fetch
499 * the breakpoint, use its callback, touch its counter
500 * while we are in an rcu_read_lock() path.
501 */
502 rcu_read_lock();
503
504 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling
510 */
511 (*dr6_p) &= ~(DR_TRAP0 << i);
512 /*
513 * bp can be NULL due to lazy debug register switching
514 * or due to concurrent perf counter removing.
515 */
516 if (!bp) {
517 rcu_read_unlock();
518 break;
519 }
520
521 perf_bp_event(bp, args->regs);
522
523 rcu_read_unlock();
524 }
525 if (dr6 & (~DR_TRAP_BITS))
526 rc = NOTIFY_DONE;
527
528 set_debugreg(dr7, 7);
529 put_cpu();
530
531 return rc;
532}
533
534/*
535 * Handle debug exception notifications.
536 */
537int __kprobes hw_breakpoint_exceptions_notify(
538 struct notifier_block *unused, unsigned long val, void *data)
539{
540 if (val != DIE_DEBUG)
541 return NOTIFY_DONE;
542
543 return hw_breakpoint_handler(data);
544}
545
546void hw_breakpoint_pmu_read(struct perf_event *bp)
547{
548 /* TODO */
549}
550
551void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
552{
553 /* TODO */
554}
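For reference, a standalone sketch of the DR7 bit packing that __encode_dr7() above implements. The DR_* values are copied from the asm/debugreg.h of this era and should be treated as assumptions in this self-contained form; len and type here are the raw 4-bit DR7 field patterns (the X86_BREAKPOINT_* tag bits are masked off by the & 0xf, exactly as in the new file):

#include <stdio.h>

/* Values as in asm/debugreg.h (assumed for a standalone build). */
#define DR_CONTROL_SHIFT	16	/* len/type fields start at bit 16  */
#define DR_CONTROL_SIZE		4	/* four control bits per breakpoint */
#define DR_ENABLE_SIZE		2	/* two enable bits per breakpoint   */
#define DR_GLOBAL_ENABLE	0x2	/* G<n>: globally enable slot n     */
#define DR_GLOBAL_SLOWDOWN	0x200	/* GE bit                           */

/* Same packing as encode_dr7() in the new file above. */
static unsigned long my_encode_dr7(int drnum, unsigned int len, unsigned int type)
{
	unsigned long bp_info = (len | type) & 0xf;

	bp_info <<= DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE;
	bp_info |= (unsigned long)DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);

	return bp_info | DR_GLOBAL_SLOWDOWN;
}

int main(void)
{
	/* Slot 1, raw len bits 0xc (4 bytes), raw type bits 0x1 (write). */
	printf("dr7 = %#lx\n", my_encode_dr7(1, 0xc, 0x1));	/* 0xd00208 */
	return 0;
}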
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..8eec0ec59af2 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
103 * on system-call entry - see also fork() and the signal handling 103 * on system-call entry - see also fork() and the signal handling
104 * code. 104 * code.
105 */ 105 */
106static int do_iopl(unsigned int level, struct pt_regs *regs) 106long sys_iopl(unsigned int level, struct pt_regs *regs)
107{ 107{
108 unsigned int old = (regs->flags >> 12) & 3; 108 unsigned int old = (regs->flags >> 12) & 3;
109 struct thread_struct *t = &current->thread;
109 110
110 if (level > 3) 111 if (level > 3)
111 return -EINVAL; 112 return -EINVAL;
@@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
115 return -EPERM; 116 return -EPERM;
116 } 117 }
117 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); 118 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
118
119 return 0;
120}
121
122#ifdef CONFIG_X86_32
123long sys_iopl(struct pt_regs *regs)
124{
125 unsigned int level = regs->bx;
126 struct thread_struct *t = &current->thread;
127 int rc;
128
129 rc = do_iopl(level, regs);
130 if (rc < 0)
131 goto out;
132
133 t->iopl = level << 12; 119 t->iopl = level << 12;
134 set_iopl_mask(t->iopl); 120 set_iopl_mask(t->iopl);
135out: 121
136 return rc; 122 return 0;
137}
138#else
139asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
140{
141 return do_iopl(level, regs);
142} 123}
143#endif
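The consolidated sys_iopl() manipulates the 2-bit IOPL field at bits 12-13 of EFLAGS. A small sketch of the field arithmetic used by the hunk (the helper names are illustrative, not kernel API):

#include <stdio.h>

#define X86_EFLAGS_IOPL	0x00003000UL	/* IOPL field, bits 12-13 */

/* Read the I/O privilege level, as the 'old' computation in the hunk. */
static unsigned int eflags_get_iopl(unsigned long flags)
{
	return (flags >> 12) & 3;
}

/* Replace the IOPL field, as the unified sys_iopl() does. */
static unsigned long eflags_set_iopl(unsigned long flags, unsigned int level)
{
	return (flags & ~X86_EFLAGS_IOPL) | ((unsigned long)level << 12);
}

int main(void)
{
	unsigned long flags = 0x246;	/* a typical user-mode EFLAGS value */

	flags = eflags_set_iopl(flags, 3);
	printf("iopl=%u flags=%#lx\n", eflags_get_iopl(flags), flags);
	return 0;
}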
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 391206199515..91fd0c70a18a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -63,19 +63,19 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT"); 66 seq_printf(p, "%*s: ", prec, "PMI");
67 for_each_online_cpu(j) 67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n"); 69 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j) 71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v)
149 if (!desc) 149 if (!desc)
150 return 0; 150 return 0;
151 151
152 spin_lock_irqsave(&desc->lock, flags); 152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j) 153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j); 154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action; 155 action = desc->action;
@@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v)
170 170
171 seq_putc(p, '\n'); 171 seq_putc(p, '\n');
172out: 172out:
173 spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0; 174 return 0;
175} 175}
176 176
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -244,7 +244,6 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
244 __func__, smp_processor_id(), vector, irq); 244 __func__, smp_processor_id(), vector, irq);
245 } 245 }
246 246
247 run_local_timers();
248 irq_exit(); 247 irq_exit();
249 248
250 set_irq_regs(old_regs); 249 set_irq_regs(old_regs);
@@ -252,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
252} 251}
253 252
254/* 253/*
255 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
256 */ 255 */
257void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
258{ 257{
259 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
260 259
@@ -264,15 +263,104 @@ void smp_generic_interrupt(struct pt_regs *regs)
264 263
265 irq_enter(); 264 irq_enter();
266 265
267 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
268 267
269 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
270 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
271 270
272 run_local_timers();
273 irq_exit(); 271 irq_exit();
274 272
275 set_irq_regs(old_regs); 273 set_irq_regs(old_regs);
276} 274}
277 275
278EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
296 /* interrupts are disabled at this point */
297 raw_spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
307 * Complete the irq move. This cpu is going down and for
308 * non intr-remapping case, we can't wait till this interrupt
309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 raw_spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
338 * We can remove mdelay() and then send spurious interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 raw_spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
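The retrigger loop at the end of the new fixup_irqs() indexes the local APIC's IRR, a 256-bit bitmap spread across eight 32-bit registers spaced 0x10 bytes apart. A sketch of that offset/mask arithmetic (the APIC_IRR value is taken from asm/apicdef.h and is an assumption in this standalone form):

#include <stdio.h>

#define APIC_IRR	0x200	/* base offset of the IRR register bank */

/*
 * Vector V lives at bit (V % 32) of the 32-bit IRR register at
 * APIC_IRR + (V / 32) * 0x10 -- the same arithmetic as the
 * apic_read() in the retrigger loop of fixup_irqs() above.
 */
static unsigned int irr_reg_offset(unsigned int vector)
{
	return APIC_IRR + (vector / 32) * 0x10;
}

static unsigned int irr_bit_mask(unsigned int vector)
{
	return 1u << (vector % 32);
}

int main(void)
{
	unsigned int vector = 0x41;	/* an arbitrary external vector */

	printf("vector %#x -> reg %#x, bit mask %#x\n",
	       vector, irr_reg_offset(vector), irr_bit_mask(vector));
	return 0;
}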
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupts are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..d5932226614f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
200 /* self generated IPI for local APIC timer */ 200 /* self generated IPI for local APIC timer */
201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
202 202
203 /* generic IPI for platform specific use */ 203 /* IPI for X86 platform specific use */
204 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 204 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
205 205
206 /* IPI vectors for APIC spurious and error interrupts */ 206 /* IPI vectors for APIC spurious and error interrupts */
207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..bfba6019d762 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,7 +42,9 @@
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h>
45 46
47#include <asm/debugreg.h>
46#include <asm/apicdef.h> 48#include <asm/apicdef.h>
47#include <asm/system.h> 49#include <asm/system.h>
48 50
@@ -85,10 +87,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
85 gdb_regs[GDB_DS] = regs->ds; 87 gdb_regs[GDB_DS] = regs->ds;
86 gdb_regs[GDB_ES] = regs->es; 88 gdb_regs[GDB_ES] = regs->es;
87 gdb_regs[GDB_CS] = regs->cs; 89 gdb_regs[GDB_CS] = regs->cs;
88 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 90 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 91 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp; 92 if (user_mode_vm(regs)) {
93 gdb_regs[GDB_SS] = regs->ss;
94 gdb_regs[GDB_SP] = regs->sp;
95 } else {
96 gdb_regs[GDB_SS] = __KERNEL_DS;
97 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
98 }
92#else 99#else
93 gdb_regs[GDB_R8] = regs->r8; 100 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 101 gdb_regs[GDB_R9] = regs->r9;
@@ -101,7 +108,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 108 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 109 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 110 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp; 111 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
105#endif 112#endif
106} 113}
107 114
@@ -198,41 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
198 205
199static struct hw_breakpoint { 206static struct hw_breakpoint {
200 unsigned enabled; 207 unsigned enabled;
201 unsigned type;
202 unsigned len;
203 unsigned long addr; 208 unsigned long addr;
209 int len;
210 int type;
211 struct perf_event **pev;
204} breakinfo[4]; 212} breakinfo[4];
205 213
206static void kgdb_correct_hw_break(void) 214static void kgdb_correct_hw_break(void)
207{ 215{
208 unsigned long dr7;
209 int correctit = 0;
210 int breakbit;
211 int breakno; 216 int breakno;
212 217
213 get_debugreg(dr7, 7);
214 for (breakno = 0; breakno < 4; breakno++) { 218 for (breakno = 0; breakno < 4; breakno++) {
215 breakbit = 2 << (breakno << 1); 219 struct perf_event *bp;
216 if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { 220 struct arch_hw_breakpoint *info;
217 correctit = 1; 221 int val;
218 dr7 |= breakbit; 222 int cpu = raw_smp_processor_id();
219 dr7 &= ~(0xf0000 << (breakno << 2)); 223 if (!breakinfo[breakno].enabled)
220 dr7 |= ((breakinfo[breakno].len << 2) | 224 continue;
221 breakinfo[breakno].type) << 225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
222 ((breakno << 2) + 16); 226 info = counter_arch_bp(bp);
223 if (breakno >= 0 && breakno <= 3) 227 if (bp->attr.disabled != 1)
224 set_debugreg(breakinfo[breakno].addr, breakno); 228 continue;
225 229 bp->attr.bp_addr = breakinfo[breakno].addr;
226 } else { 230 bp->attr.bp_len = breakinfo[breakno].len;
227 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 231 bp->attr.bp_type = breakinfo[breakno].type;
228 correctit = 1; 232 info->address = breakinfo[breakno].addr;
229 dr7 &= ~breakbit; 233 info->len = breakinfo[breakno].len;
230 dr7 &= ~(0xf0000 << (breakno << 2)); 234 info->type = breakinfo[breakno].type;
231 } 235 val = arch_install_hw_breakpoint(bp);
232 } 236 if (!val)
237 bp->attr.disabled = 0;
233 } 238 }
234 if (correctit) 239 hw_breakpoint_restore();
235 set_debugreg(dr7, 7); 240}
241
242static int hw_break_reserve_slot(int breakno)
243{
244 int cpu;
245 int cnt = 0;
246 struct perf_event **pevent;
247
248 for_each_online_cpu(cpu) {
249 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
251 if (dbg_reserve_bp_slot(*pevent))
252 goto fail;
253 }
254
255 return 0;
256
257fail:
258 for_each_online_cpu(cpu) {
259 cnt--;
260 if (!cnt)
261 break;
262 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
263 dbg_release_bp_slot(*pevent);
264 }
265 return -1;
266}
267
268static int hw_break_release_slot(int breakno)
269{
270 struct perf_event **pevent;
271 int cpu;
272
273 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent))
276 /*
277 * The debugger is responsible for handling the retry on
278 * remove failure.
279 */
280 return -1;
281 }
282 return 0;
236} 283}
237 284
238static int 285static int
@@ -246,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
246 if (i == 4) 293 if (i == 4)
247 return -1; 294 return -1;
248 295
296 if (hw_break_release_slot(i)) {
297 printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
298 return -1;
299 }
249 breakinfo[i].enabled = 0; 300 breakinfo[i].enabled = 0;
250 301
251 return 0; 302 return 0;
@@ -254,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
254static void kgdb_remove_all_hw_break(void) 305static void kgdb_remove_all_hw_break(void)
255{ 306{
256 int i; 307 int i;
308 int cpu = raw_smp_processor_id();
309 struct perf_event *bp;
257 310
258 for (i = 0; i < 4; i++) 311 for (i = 0; i < 4; i++) {
259 memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); 312 if (!breakinfo[i].enabled)
313 continue;
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1)
316 continue;
317 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1;
319 }
260} 320}
261 321
262static int 322static int
263kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) 323kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
264{ 324{
265 unsigned type;
266 int i; 325 int i;
267 326
268 for (i = 0; i < 4; i++) 327 for (i = 0; i < 4; i++)
@@ -273,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
273 332
274 switch (bptype) { 333 switch (bptype) {
275 case BP_HARDWARE_BREAKPOINT: 334 case BP_HARDWARE_BREAKPOINT:
276 type = 0; 335 len = 1;
277 len = 1; 336 breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
278 break; 337 break;
279 case BP_WRITE_WATCHPOINT: 338 case BP_WRITE_WATCHPOINT:
280 type = 1; 339 breakinfo[i].type = X86_BREAKPOINT_WRITE;
281 break; 340 break;
282 case BP_ACCESS_WATCHPOINT: 341 case BP_ACCESS_WATCHPOINT:
283 type = 3; 342 breakinfo[i].type = X86_BREAKPOINT_RW;
284 break; 343 break;
285 default: 344 default:
286 return -1; 345 return -1;
287 } 346 }
288 347 switch (len) {
289 if (len == 1 || len == 2 || len == 4) 348 case 1:
290 breakinfo[i].len = len - 1; 349 breakinfo[i].len = X86_BREAKPOINT_LEN_1;
291 else 350 break;
351 case 2:
352 breakinfo[i].len = X86_BREAKPOINT_LEN_2;
353 break;
354 case 4:
355 breakinfo[i].len = X86_BREAKPOINT_LEN_4;
356 break;
357#ifdef CONFIG_X86_64
358 case 8:
359 breakinfo[i].len = X86_BREAKPOINT_LEN_8;
360 break;
361#endif
362 default:
292 return -1; 363 return -1;
293 364 }
294 breakinfo[i].enabled = 1;
295 breakinfo[i].addr = addr; 365 breakinfo[i].addr = addr;
296 breakinfo[i].type = type; 366 if (hw_break_reserve_slot(i)) {
367 breakinfo[i].addr = 0;
368 return -1;
369 }
370 breakinfo[i].enabled = 1;
297 371
298 return 0; 372 return 0;
299} 373}
@@ -308,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
308 */ 382 */
309void kgdb_disable_hw_debug(struct pt_regs *regs) 383void kgdb_disable_hw_debug(struct pt_regs *regs)
310{ 384{
385 int i;
386 int cpu = raw_smp_processor_id();
387 struct perf_event *bp;
388
311 /* Disable hardware debugging while we are in kgdb: */ 389 /* Disable hardware debugging while we are in kgdb: */
312 set_debugreg(0UL, 7); 390 set_debugreg(0UL, 7);
391 for (i = 0; i < 4; i++) {
392 if (!breakinfo[i].enabled)
393 continue;
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1)
396 continue;
397 arch_uninstall_hw_breakpoint(bp);
398 bp->attr.disabled = 1;
399 }
313} 400}
314 401
315/** 402/**
@@ -373,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
373 struct pt_regs *linux_regs) 460 struct pt_regs *linux_regs)
374{ 461{
375 unsigned long addr; 462 unsigned long addr;
376 unsigned long dr6;
377 char *ptr; 463 char *ptr;
378 int newPC; 464 int newPC;
379 465
@@ -395,25 +481,10 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
395 /* set the trace bit if we're stepping */ 481 /* set the trace bit if we're stepping */
396 if (remcomInBuffer[0] == 's') { 482 if (remcomInBuffer[0] == 's') {
397 linux_regs->flags |= X86_EFLAGS_TF; 483 linux_regs->flags |= X86_EFLAGS_TF;
398 kgdb_single_step = 1;
399 atomic_set(&kgdb_cpu_doing_single_step, 484 atomic_set(&kgdb_cpu_doing_single_step,
400 raw_smp_processor_id()); 485 raw_smp_processor_id());
401 } 486 }
402 487
403 get_debugreg(dr6, 6);
404 if (!(dr6 & 0x4000)) {
405 int breakno;
406
407 for (breakno = 0; breakno < 4; breakno++) {
408 if (dr6 & (1 << breakno) &&
409 breakinfo[breakno].type == 0) {
410 /* Set restore flag: */
411 linux_regs->flags |= X86_EFLAGS_RF;
412 break;
413 }
414 }
415 }
416 set_debugreg(0UL, 6);
417 kgdb_correct_hw_break(); 488 kgdb_correct_hw_break();
418 489
419 return 0; 490 return 0;
@@ -434,6 +505,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 505 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 506 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 507 args->err, "c", "", regs);
508 /*
509 * Reset the BS bit in dr6 (pointed to by args->err) to
510 * denote completion of processing
511 */
512 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 513
438 return NOTIFY_STOP; 514 return NOTIFY_STOP;
439} 515}
@@ -476,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
476 break; 552 break;
477 553
478 case DIE_DEBUG: 554 case DIE_DEBUG:
479 if (atomic_read(&kgdb_cpu_doing_single_step) == 555 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
480 raw_smp_processor_id()) {
481 if (user_mode(regs)) 556 if (user_mode(regs))
482 return single_step_cont(regs, args); 557 return single_step_cont(regs, args);
483 break; 558 break;
@@ -530,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
530 */ 605 */
531int kgdb_arch_init(void) 606int kgdb_arch_init(void)
532{ 607{
533 return register_die_notifier(&kgdb_notifier); 608 int i, cpu;
609 int ret;
610 struct perf_event_attr attr;
611 struct perf_event **pevent;
612
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /*
617 * Pre-allocate the hw breakpoint structures in the non-atomic
618 * portion of kgdb because this operation requires mutexes to
619 * complete.
620 */
621 attr.bp_addr = (unsigned long)kgdb_arch_init;
622 attr.type = PERF_TYPE_BREAKPOINT;
623 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1;
626 for (i = 0; i < 4; i++) {
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
630 breakinfo[i].pev = NULL;
631 kgdb_arch_exit();
632 return -1;
633 }
634 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
636 pevent[0]->hw.sample_period = 1;
637 if (pevent[0]->destroy != NULL) {
638 pevent[0]->destroy = NULL;
639 release_bp_slot(*pevent);
640 }
641 }
642 }
643 return ret;
534} 644}
535 645
536/** 646/**
@@ -541,6 +651,13 @@ int kgdb_arch_init(void)
541 */ 651 */
542void kgdb_arch_exit(void) 652void kgdb_arch_exit(void)
543{ 653{
654 int i;
655 for (i = 0; i < 4; i++) {
656 if (breakinfo[i].pev) {
657 unregister_wide_hw_breakpoint(breakinfo[i].pev);
658 breakinfo[i].pev = NULL;
659 }
660 }
544 unregister_die_notifier(&kgdb_notifier); 661 unregister_die_notifier(&kgdb_notifier);
545} 662}
546 663
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..5b8c7505b3bc 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
51 52
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/alternative.h> 57#include <asm/alternative.h>
58#include <asm/insn.h>
59#include <asm/debugreg.h>
57 60
58void jprobe_return_end(void); 61void jprobe_return_end(void);
59 62
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 63DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 64DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 65
63#ifdef CONFIG_X86_64 66#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 67
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 68#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 97 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 99};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 100#undef W
154 101
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 102struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
244 } 191 }
245} 192}
246 193
194/* Recover the probed instruction at addr for further analysis. */
195static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
196{
197 struct kprobe *kp;
198 kp = get_kprobe((void *)addr);
199 if (!kp)
200 return -EINVAL;
201
202 /*
203 * Basically, kp->ainsn.insn has the original instruction.
204 * However, a RIP-relative instruction cannot be single-stepped
205 * at a different place, so fix_riprel() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn.
208 *
209 * On the other hand, kp->opcode has a copy of the first byte of
210 * the probed instruction, which is overwritten by int3. And since
211 * the instruction at kp->addr is not modified by kprobes except
212 * for the first byte, we can recover the original instruction
213 * from it and kp->opcode.
214 */
215 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
216 buf[0] = kp->opcode;
217 return 0;
218}
219
220/* Dummy buffers for kallsyms_lookup */
221static char __dummy_buf[KSYM_NAME_LEN];
222
223/* Check if paddr is at an instruction boundary */
224static int __kprobes can_probe(unsigned long paddr)
225{
226 int ret;
227 unsigned long addr, offset = 0;
228 struct insn insn;
229 kprobe_opcode_t buf[MAX_INSN_SIZE];
230
231 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
232 return 0;
233
234 /* Decode instructions */
235 addr = paddr - offset;
236 while (addr < paddr) {
237 kernel_insn_init(&insn, (void *)addr);
238 insn_get_opcode(&insn);
239
240 /*
241 * Check if the instruction has been modified by another
242 * kprobe, in which case we replace the breakpoint by the
243 * original instruction in our buffer.
244 */
245 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
246 ret = recover_probed_instruction(buf, addr);
247 if (ret)
248 /*
249 * Another debugging subsystem might insert
250 * this breakpoint. In that case, we can't
251 * recover it.
252 */
253 return 0;
254 kernel_insn_init(&insn, buf);
255 }
256 insn_get_length(&insn);
257 addr += insn.length;
258 }
259
260 return (addr == paddr);
261}
262
247/* 263/*
248 * Returns non-zero if opcode modifies the interrupt flag. 264 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 265 */
@@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
277static void __kprobes fix_riprel(struct kprobe *p) 293static void __kprobes fix_riprel(struct kprobe *p)
278{ 294{
279#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
280 u8 *insn = p->ainsn.insn; 296 struct insn insn;
281 s64 disp; 297 kernel_insn_init(&insn, p->ainsn.insn);
282 int need_modrm;
283
284 /* Skip legacy instruction prefixes. */
285 while (1) {
286 switch (*insn) {
287 case 0x66:
288 case 0x67:
289 case 0x2e:
290 case 0x3e:
291 case 0x26:
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 }
301 break;
302 }
303 298
304 /* Skip REX instruction prefix. */ 299 if (insn_rip_relative(&insn)) {
305 if (is_REX_prefix(insn)) 300 s64 newdisp;
306 ++insn; 301 u8 *disp;
307 302 insn_get_displacement(&insn);
308 if (*insn == 0x0f) { 303 /*
309 /* Two-byte opcode. */ 304 * The copied instruction uses the %rip-relative addressing
310 ++insn; 305 * mode. Adjust the displacement for the difference between
311 need_modrm = test_bit(*insn, 306 * the original location of this instruction and the location
312 (unsigned long *)twobyte_has_modrm); 307 * of the copy that will actually be run. The tricky bit here
313 } else 308 * is making sure that the sign extension happens correctly in
314 /* One-byte opcode. */ 309 * this calculation, since we need a signed 32-bit result to
315 need_modrm = test_bit(*insn, 310 * be sign-extended to 64 bits when it's added to the %rip
316 (unsigned long *)onebyte_has_modrm); 311 * value and yield the same 64-bit result that the sign-
317 312 * extension of the original signed 32-bit displacement would
318 if (need_modrm) { 313 * have given.
319 u8 modrm = *++insn; 314 */
320 if ((modrm & 0xc7) == 0x05) { 315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
321 /* %rip+disp32 addressing mode */ 316 (u8 *) p->ainsn.insn;
322 /* Displacement follows ModRM byte. */ 317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
323 ++insn; 318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
324 /* 319 *(s32 *) disp = (s32) newdisp;
325 * The copied instruction uses the %rip-relative
326 * addressing mode. Adjust the displacement for the
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 320 }
343#endif 321#endif
344} 322}
@@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 337
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 338int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 339{
340 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 342 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 343 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 344 if (!p->ainsn.insn)
@@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 452{
473 switch (kcb->kprobe_status) { 453 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 454 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 455 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb); 456 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb); 457 set_current_kprobe(p, regs, kcb);
@@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
491 kcb->kprobe_status = KPROBE_REENTER; 460 kcb->kprobe_status = KPROBE_REENTER;
492 break; 461 break;
493 case KPROBE_HIT_SS: 462 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 463 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 464 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 465 * codepath should strictly reside in .kprobes.text section.
497 return 0; 466 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 467 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 468 */
500 * to, or just after, single-stepping of a probed 469 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 470 p->addr);
502 * reside in .kprobes.text section. Raise a warning 471 dump_kprobe(p);
503 * to highlight this peculiar case. 472 BUG();
504 */
505 }
506 default: 473 default:
507 /* impossible cases */ 474 /* impossible cases */
508 WARN_ON(1); 475 WARN_ON(1);
@@ -514,7 +481,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
514 481
515/* 482/*
516 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 483 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
517 * remain disabled thorough out this function. 484 * remain disabled throughout this function.
518 */ 485 */
519static int __kprobes kprobe_handler(struct pt_regs *regs) 486static int __kprobes kprobe_handler(struct pt_regs *regs)
520{ 487{
@@ -851,7 +818,7 @@ no_change:
851 818
852/* 819/*
853 * Interrupts are disabled on entry as trap1 is an interrupt gate and they 820 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
854 * remain disabled thoroughout this function. 821 * remain disabled throughout this function.
855 */ 822 */
856static int __kprobes post_kprobe_handler(struct pt_regs *regs) 823static int __kprobes post_kprobe_handler(struct pt_regs *regs)
857{ 824{
@@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 934 ret = NOTIFY_STOP;
968 break; 935 break;
969 case DIE_DEBUG: 936 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 937 if (post_kprobe_handler(args->regs)) {
938 /*
 939 * Reset the BS bit in dr6 (pointed to by args->err) to
 940 * denote completion of processing

941 */
942 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 943 ret = NOTIFY_STOP;
944 }
972 break; 945 break;
973 case DIE_GPF: 946 case DIE_GPF:
974 /* 947 /*
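The DIE_DEBUG hunk above leans on a convention from the new hw-breakpoint code: the notifier's err field carries a pointer to the saved dr6 value, and each handler clears the bits it has consumed so later handlers skip them. A minimal notifier sketch under that assumption (not the full kprobes handler):

    #include <linux/kdebug.h>
    #include <linux/notifier.h>
    #include <linux/err.h>
    #include <asm/debugreg.h>

    static int debug_notify_sketch(struct die_args *args)
    {
            /* args->err smuggles a pointer to the saved dr6, hence ERR_PTR() */
            unsigned long *dr6_p = (unsigned long *)ERR_PTR(args->err);

            if (!(*dr6_p & DR_STEP))
                    return NOTIFY_DONE;     /* not a single-step trap */

            /* ... process the single-step event ... */

            *dr6_p &= ~DR_STEP;             /* mark it consumed for later handlers */
            return NOTIFY_STOP;
    }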
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 03657e784fd8..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -200,6 +201,7 @@ void machine_kexec(struct kimage *image)
200 201
201 /* Interrupts aren't acceptable while we reboot */ 202 /* Interrupts aren't acceptable while we reboot */
202 local_irq_disable(); 203 local_irq_disable();
204 hw_breakpoint_disable();
203 205
204 if (image->preserve_context) { 206 if (image->preserve_context) {
205#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
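hw_breakpoint_disable() keeps a stale hardware breakpoint from firing after control jumps into the relocated kexec code, where no debug handler exists. A sketch of what disabling amounts to on x86, zeroing the control register and the four address registers; the real helper is provided by asm/debugreg.h:

    static inline void hw_breakpoint_disable_sketch(void)
    {
            /* clear dr7 first so no armed breakpoint can trigger */
            set_debugreg(0UL, 7);

            /* then wipe the four breakpoint address registers */
            set_debugreg(0UL, 0);
            set_debugreg(0UL, 1);
            set_debugreg(0UL, 2);
            set_debugreg(0UL, 3);
    }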
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/debugreg.h>
21 22
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 23static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 24 unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
282 283
283 /* Interrupts aren't acceptable while we reboot */ 284 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 285 local_irq_disable();
286 hw_breakpoint_disable();
285 287
286 if (image->preserve_context) { 288 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 289#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
3 *
4 * Copyright (C) 2006, Advanced Micro Devices, Inc.
5 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of version 2 of the GNU General Public License
9 * as published by the Free Software Foundation.
10 *
11 * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
12 */
13
14/*
15 * We are using the 32.768kHz input clock - it's the only one that has the
16 * ranges we find desirable. The following table lists the suitable
17 * divisors and the associated Hz, minimum interval and the maximum interval:
18 *
19 * Divisor Hz Min Delta (s) Max Delta (s)
20 * 1 32768 .00048828125 2.000
21 * 2 16384 .0009765625 4.000
22 * 4 8192 .001953125 8.000
23 * 8 4096 .00390625 16.000
24 * 16 2048 .0078125 32.000
25 * 32 1024 .015625 64.000
26 * 64 512 .03125 128.000
27 * 128 256 .0625 256.000
28 * 256 128 .125 512.000
29 */
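Every row follows from the 32.768 kHz input and the 16-bit counter: Hz = 32768 / divisor, the minimum delta is 16 ticks, and the maximum is a full 65536-tick wrap. A small userspace sketch that reproduces the table:

    #include <stdio.h>

    int main(void)
    {
            for (int divisor = 1; divisor <= 256; divisor *= 2) {
                    double hz = 32768.0 / divisor;
                    /* divisor, Hz, min delta (16 ticks), max delta (full wrap) */
                    printf("%7d %7.0f %15.11f %9.3f\n",
                           divisor, hz, 16.0 / hz, 65536.0 / hz);
            }
            return 0;
    }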
30
31#include <linux/kernel.h>
32#include <linux/interrupt.h>
33#include <linux/module.h>
34#include <asm/geode.h>
35
36#define MFGPT_DEFAULT_IRQ 7
37
38static struct mfgpt_timer_t {
39 unsigned int avail:1;
40} mfgpt_timers[MFGPT_MAX_TIMERS];
41
42/* Selected from the table above */
43
44#define MFGPT_DIVISOR 16
45#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
46#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
47#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
48
49/* Allow for disabling of MFGPTs */
50static int disable;
51static int __init mfgpt_disable(char *s)
52{
53 disable = 1;
54 return 1;
55}
56__setup("nomfgpt", mfgpt_disable);
57
58/* Reset the MFGPT timers. This is required by some broken BIOSes which already
 59 * do the same and leave the system in an unstable state. At least TinyBIOS
 60 * 0.98 is affected (0.99 is OK with the MFGPT workaround left off).
61 */
62static int __init mfgpt_fix(char *s)
63{
64 u32 val, dummy;
65
 66 /* The following undocumented bit resets the MFGPT timers */
67 val = 0xFF; dummy = 0;
68 wrmsr(MSR_MFGPT_SETUP, val, dummy);
69 return 1;
70}
71__setup("mfgptfix", mfgpt_fix);
72
73/*
74 * Check whether any MFGPTs are available for the kernel to use. In most
75 * cases, firmware that uses AMD's VSA code will claim all timers during
76 * bootup; we certainly don't want to take them if they're already in use.
77 * In other cases (such as with VSAless OpenFirmware), the system firmware
78 * leaves timers available for us to use.
79 */
80
81
82static int timers = -1;
83
84static void geode_mfgpt_detect(void)
85{
86 int i;
87 u16 val;
88
89 timers = 0;
90
91 if (disable) {
92 printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
93 goto done;
94 }
95
96 if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
97 printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
98 goto done;
99 }
100
101 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
102 val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
103 if (!(val & MFGPT_SETUP_SETUP)) {
104 mfgpt_timers[i].avail = 1;
105 timers++;
106 }
107 }
108
109done:
110 printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
111}
112
113int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
114{
115 u32 msr, mask, value, dummy;
116 int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
117
118 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
119 return -EIO;
120
121 /*
122 * The register maps for these are described in sections 6.17.1.x of
123 * the AMD Geode CS5536 Companion Device Data Book.
124 */
125 switch (event) {
126 case MFGPT_EVENT_RESET:
127 /*
128 * XXX: According to the docs, we cannot reset timers above
129 * 6; that is, resets for 7 and 8 will be ignored. Is this
130 * a problem? -dilinger
131 */
132 msr = MSR_MFGPT_NR;
133 mask = 1 << (timer + 24);
134 break;
135
136 case MFGPT_EVENT_NMI:
137 msr = MSR_MFGPT_NR;
138 mask = 1 << (timer + shift);
139 break;
140
141 case MFGPT_EVENT_IRQ:
142 msr = MSR_MFGPT_IRQ;
143 mask = 1 << (timer + shift);
144 break;
145
146 default:
147 return -EIO;
148 }
149
150 rdmsr(msr, value, dummy);
151
152 if (enable)
153 value |= mask;
154 else
155 value &= ~mask;
156
157 wrmsr(msr, value, dummy);
158 return 0;
159}
160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
161
162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
163{
164 u32 zsel, lpc, dummy;
165 int shift;
166
167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
168 return -EIO;
169
170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
 172 * 2, and we must not use or change it.
 173 * XXX: Likewise, two Linux drivers might clash if the second overwrites
 174 * the IRQ of the first. This can only happen when forcing an IRQ;
 175 * calling this with *irq==0 is safe. Currently there _are_ no two drivers.
177 */
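    /*
     * Worked example of the shift computed below, under the register
     * layout this function assumes: MSR_PIC_ZSEL_LOW holds eight 4-bit
     * IRQ fields, CMP1 of timers 0-3 in nibbles 0-3 and CMP2 in nibbles
     * 4-7, with timers 4-7 sharing the nibbles of 0-3 (the "timer % 4").
     * E.g. timer 6, CMP2: shift = (4 + 6 % 4) * 4 = 24, so its IRQ is
     * (zsel >> 24) & 0xF, and a value of 2 there belongs to VSA.
     */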
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
181 return -EIO;
182
183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
188
189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
190 if (*irq < 1 || *irq == 2 || *irq > 15)
191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
199 if (enable) {
200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
202 }
203
204 return 0;
205}
206
207static int mfgpt_get(int timer)
208{
209 mfgpt_timers[timer].avail = 0;
210 printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
211 return timer;
212}
213
214int geode_mfgpt_alloc_timer(int timer, int domain)
215{
216 int i;
217
218 if (timers == -1) {
219 /* timers haven't been detected yet */
220 geode_mfgpt_detect();
221 }
222
223 if (!timers)
224 return -1;
225
226 if (timer >= MFGPT_MAX_TIMERS)
227 return -1;
228
229 if (timer < 0) {
230 /* Try to find an available timer */
231 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
232 if (mfgpt_timers[i].avail)
233 return mfgpt_get(i);
234
235 if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
236 break;
237 }
238 } else {
239 /* If they requested a specific timer, try to honor that */
240 if (mfgpt_timers[timer].avail)
241 return mfgpt_get(timer);
242 }
243
244 /* No timers available - too bad */
245 return -1;
246}
247EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
248
249
250#ifdef CONFIG_GEODE_MFGPT_TIMER
251
252/*
 253 * The MFGPT timers on the CS5536 provide us with suitable timers to use
 254 * as clock event sources - not as good as an HPET or APIC, but certainly
255 * better than the PIT. This isn't a general purpose MFGPT driver, but
256 * a simplified one designed specifically to act as a clock event source.
257 * For full details about the MFGPT, please consult the CS5536 data sheet.
258 */
259
260#include <linux/clocksource.h>
261#include <linux/clockchips.h>
262
263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
264static u16 mfgpt_event_clock;
265
266static int irq;
267static int __init mfgpt_setup(char *str)
268{
269 get_option(&str, &irq);
270 return 1;
271}
272__setup("mfgpt_irq=", mfgpt_setup);
273
274static void mfgpt_disable_timer(u16 clock)
275{
276 /* avoid races by clearing CMP1 and CMP2 unconditionally */
277 geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
278 MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
279}
280
281static int mfgpt_next_event(unsigned long, struct clock_event_device *);
282static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
283
284static struct clock_event_device mfgpt_clockevent = {
285 .name = "mfgpt-timer",
286 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event,
289 .rating = 250,
290 .cpumask = cpu_all_mask,
291 .shift = 32
292};
293
294static void mfgpt_start_timer(u16 delta)
295{
296 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
297 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
298
299 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
300 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
301}
302
303static void mfgpt_set_mode(enum clock_event_mode mode,
304 struct clock_event_device *evt)
305{
306 mfgpt_disable_timer(mfgpt_event_clock);
307
308 if (mode == CLOCK_EVT_MODE_PERIODIC)
309 mfgpt_start_timer(MFGPT_PERIODIC);
310
311 mfgpt_tick_mode = mode;
312}
313
314static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
315{
316 mfgpt_start_timer(delta);
317 return 0;
318}
319
320static irqreturn_t mfgpt_tick(int irq, void *dev_id)
321{
322 u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
323
324 /* See if the interrupt was for us */
325 if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
326 return IRQ_NONE;
327
328 /* Turn off the clock (and clear the event) */
329 mfgpt_disable_timer(mfgpt_event_clock);
330
331 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
332 return IRQ_HANDLED;
333
334 /* Clear the counter */
335 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
336
337 /* Restart the clock in periodic mode */
338
339 if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
340 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
341 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
342 }
343
344 mfgpt_clockevent.event_handler(&mfgpt_clockevent);
345 return IRQ_HANDLED;
346}
347
348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer"
352};
353
354int __init mfgpt_timer_setup(void)
355{
356 int timer, ret;
357 u16 val;
358
359 timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
360 if (timer < 0) {
361 printk(KERN_ERR
362 "mfgpt-timer: Could not allocate a MFPGT timer\n");
363 return -ENODEV;
364 }
365
366 mfgpt_event_clock = timer;
367
368 /* Set up the IRQ on the MFGPT side */
369 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
370 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
371 return -EIO;
372 }
373
374 /* And register it with the kernel */
375 ret = setup_irq(irq, &mfgptirq);
376
377 if (ret) {
378 printk(KERN_ERR
379 "mfgpt-timer: Unable to set up the interrupt.\n");
380 goto err;
381 }
382
383 /* Set the clock scale and enable the event mode for CMP2 */
384 val = MFGPT_SCALE | (3 << 8);
385
386 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
387
388 /* Set up the clock event */
389 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
390 mfgpt_clockevent.shift);
391 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
392 &mfgpt_clockevent);
393 mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
394 &mfgpt_clockevent);
395
396 printk(KERN_INFO
397 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
398 timer, irq);
399 clockevents_register_device(&mfgpt_clockevent);
400
401 return 0;
402
403err:
404 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
405 printk(KERN_ERR
406 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
407 return -EIO;
408}
409
410#endif
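For reference, the clockevent scaling in mfgpt_timer_setup() works out as follows, a sketch of the arithmetic div_sc() and clockevent_delta2ns() perform with MFGPT_HZ = 2048 and shift = 32:

    /*
     * mult = (MFGPT_HZ << shift) / NSEC_PER_SEC
     *      = (2048ULL << 32) / 1000000000 = 8796
     *
     * delta2ns(d) is roughly (d << shift) / mult:
     *      0xF    ticks -> ~7.3 ms  (min_delta_ns, i.e. 15/2048 s)
     *      0xFFFE ticks -> ~32 s    (max_delta_ns)
     *
     * which matches the divisor-16 row of the table at the top of the file.
     */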
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 366baa179913..e1af7c055c7d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
16#include <linux/firmware.h> 19#include <linux/firmware.h>
17#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
18#include <linux/uaccess.h> 21#include <linux/uaccess.h>
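The pr_fmt() define added above is what lets the conversions below drop their hand-written "microcode: " prefixes: the pr_<level> macros expand through pr_fmt(), roughly as sketched here (KBUILD_MODNAME is "microcode" for this file):

    /* from include/linux/kernel.h, approximately: */
    #define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)

    /* so that, with the define above, */
    pr_err("CPU%d: update failed\n", cpu);
    /* prints "microcode: CPU3: update failed" for cpu == 3 */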
@@ -76,12 +79,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
76 79
77 memset(csig, 0, sizeof(*csig)); 80 memset(csig, 0, sizeof(*csig));
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " 82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86); 83 "supported\n", cpu, c->x86);
81 return -1; 84 return -1;
82 } 85 }
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 88 return 0;
86} 89}
87 90
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 106 i++;
104 } 107 }
105 108
106 if (!equiv_cpu_id) { 109 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 110 return 0;
110 }
111 111
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 112 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 113 return 0;
117 }
118 114
119 /* ucode might be chipset specific -- currently we don't support this */ 115 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 117 pr_err("CPU%d: loading of chipset specific code not yet supported\n",
122 "specific code not yet supported\n", cpu); 118 cpu);
123 return 0; 119 return 0;
124 } 120 }
125 121
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
148 144
149 /* check current patch id and patch's id for match */ 145 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 146 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 148 cpu, mc_amd->hdr.patch_id);
153 return -1; 149 return -1;
154 } 150 }
155 151
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 153 uci->cpu_sig.rev = rev;
160 154
161 return 0; 155 return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 172 return NULL;
179 173
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 175 pr_err("error: invalid type field in container file section header\n");
182 "container file section header\n");
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 180
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 181 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 182 pr_err("error: size mismatch\n");
193 return NULL; 183 return NULL;
194 } 184 }
195 185
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 208 size = buf_pos[2];
219 209
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 211 pr_err("error: invalid type field in container file section header\n");
222 "container file section header\n");
223 return 0; 212 return 0;
224 } 213 }
225 214
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 216 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 217 pr_err("failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 218 return 0;
231 } 219 }
232 220
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 247
260 offset = install_equiv_cpu_table(ucode_ptr); 248 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 249 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 250 pr_err("failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 251 return UCODE_ERROR;
265 } 252 }
266 253
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
291 if (!leftover) { 278 if (!leftover) {
292 vfree(uci->mc); 279 vfree(uci->mc);
293 uci->mc = new_mc; 280 uci->mc = new_mc;
294 pr_debug("microcode: CPU%d found a matching microcode " 281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
295 "update with version 0x%x (current=0x%x)\n",
296 cpu, new_rev, uci->cpu_sig.rev); 282 cpu, new_rev, uci->cpu_sig.rev);
297 } else { 283 } else {
298 vfree(new_mc); 284 vfree(new_mc);
@@ -317,6 +303,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
317 return UCODE_NFOUND; 303 return UCODE_NFOUND;
318 } 304 }
319 305
306 if (*(u32 *)firmware->data != UCODE_MAGIC) {
307 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
308 *(u32 *)firmware->data);
309 return UCODE_ERROR;
310 }
311
320 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 312 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
321 313
322 release_firmware(firmware); 314 release_firmware(firmware);
@@ -327,8 +319,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
327static enum ucode_state 319static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size) 320request_microcode_user(int cpu, const void __user *buf, size_t size)
329{ 321{
330 printk(KERN_INFO "microcode: AMD microcode update via " 322 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
331 "/dev/cpu/microcode not supported\n");
332 return UCODE_ERROR; 323 return UCODE_ERROR;
333} 324}
334 325
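The new UCODE_MAGIC test is cheap insurance: request_firmware() hands back whatever userspace installed, and a valid AMD container starts with the magic u32. A standalone sketch of the same guard (the size check is an added precaution, not part of the hunk above):

    static int container_ok(const struct firmware *fw)
    {
            /* a blob shorter than the magic itself is malformed too */
            if (fw->size < sizeof(u32))
                    return 0;
            return *(const u32 *)fw->data == UCODE_MAGIC;
    }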
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..cceb5bc3c3c2 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,10 +70,12 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/platform_device.h> 76#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 77#include <linux/miscdevice.h>
75#include <linux/capability.h> 78#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 79#include <linux/kernel.h>
78#include <linux/module.h> 80#include <linux/module.h>
79#include <linux/mutex.h> 81#include <linux/mutex.h>
@@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 203
202static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 205{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 207}
207 208
@@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
211 ssize_t ret = -EINVAL; 212 ssize_t ret = -EINVAL;
212 213
213 if ((len >> PAGE_SHIFT) > totalram_pages) { 214 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); 215 pr_err("too much data (max %ld pages)\n", totalram_pages);
215 return ret; 216 return ret;
216 } 217 }
217 218
@@ -246,7 +247,7 @@ static int __init microcode_dev_init(void)
246 247
247 error = misc_register(&microcode_dev); 248 error = misc_register(&microcode_dev);
248 if (error) { 249 if (error) {
249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); 250 pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
250 return error; 251 return error;
251 } 252 }
252 253
@@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
361 if (!uci->mc) 362 if (!uci->mc)
362 return UCODE_NFOUND; 363 return UCODE_NFOUND;
363 364
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu); 365 pr_debug("CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu); 366 apply_microcode_on_target(cpu);
366 367
367 return UCODE_OK; 368 return UCODE_OK;
@@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 382 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
382 383
383 if (ustate == UCODE_OK) { 384 if (ustate == UCODE_OK) {
384 pr_debug("microcode: CPU%d updated upon init\n", cpu); 385 pr_debug("CPU%d updated upon init\n", cpu);
385 apply_microcode_on_target(cpu); 386 apply_microcode_on_target(cpu);
386 } 387 }
387 388
@@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
408 if (!cpu_online(cpu)) 409 if (!cpu_online(cpu))
409 return 0; 410 return 0;
410 411
411 pr_debug("microcode: CPU%d added\n", cpu); 412 pr_debug("CPU%d added\n", cpu);
412 413
413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 414 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
414 if (err) 415 if (err)
@@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
427 if (!cpu_online(cpu)) 428 if (!cpu_online(cpu))
428 return 0; 429 return 0;
429 430
430 pr_debug("microcode: CPU%d removed\n", cpu); 431 pr_debug("CPU%d removed\n", cpu);
431 microcode_fini_cpu(cpu); 432 microcode_fini_cpu(cpu);
432 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 433 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
433 return 0; 434 return 0;
@@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
475 microcode_update_cpu(cpu); 476 microcode_update_cpu(cpu);
476 case CPU_DOWN_FAILED: 477 case CPU_DOWN_FAILED:
477 case CPU_DOWN_FAILED_FROZEN: 478 case CPU_DOWN_FAILED_FROZEN:
478 pr_debug("microcode: CPU%d added\n", cpu); 479 pr_debug("CPU%d added\n", cpu);
479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 480 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
480 pr_err("microcode: Failed to create group for CPU%d\n", cpu); 481 pr_err("Failed to create group for CPU%d\n", cpu);
481 break; 482 break;
482 case CPU_DOWN_PREPARE: 483 case CPU_DOWN_PREPARE:
483 case CPU_DOWN_PREPARE_FROZEN: 484 case CPU_DOWN_PREPARE_FROZEN:
484 /* Suspend is in progress, only remove the interface */ 485 /* Suspend is in progress, only remove the interface */
485 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 486 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
486 pr_debug("microcode: CPU%d removed\n", cpu); 487 pr_debug("CPU%d removed\n", cpu);
487 break; 488 break;
488 case CPU_DEAD: 489 case CPU_DEAD:
489 case CPU_UP_CANCELED_FROZEN: 490 case CPU_UP_CANCELED_FROZEN:
@@ -509,7 +510,7 @@ static int __init microcode_init(void)
509 microcode_ops = init_amd_microcode(); 510 microcode_ops = init_amd_microcode();
510 511
511 if (!microcode_ops) { 512 if (!microcode_ops) {
512 pr_err("microcode: no support for this CPU vendor\n"); 513 pr_err("no support for this CPU vendor\n");
513 return -ENODEV; 514 return -ENODEV;
514 } 515 }
515 516
@@ -540,8 +541,7 @@ static int __init microcode_init(void)
540 register_hotcpu_notifier(&mc_cpu_notifier); 541 register_hotcpu_notifier(&mc_cpu_notifier);
541 542
542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 543 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
543 " <tigran@aivazian.fsnet.co.uk>," 544 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
544 " Peter Oruba\n");
545 545
546 return 0; 546 return 0;
547} 547}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..ebd193e476ca 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/firmware.h> 76#include <linux/firmware.h>
74#include <linux/uaccess.h> 77#include <linux/uaccess.h>
75#include <linux/kernel.h> 78#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
146 149
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) { 151 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel " 152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
150 "processor\n", cpu_num);
151 return -1; 153 return -1;
152 } 154 }
153 155
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
165 /* get the current revision from MSR 0x8B */ 167 /* get the current revision from MSR 0x8B */
166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
167 169
168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
169 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
170 172
171 return 0; 173 return 0;
172} 174}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
194 data_size = get_datasize(mc_header); 196 data_size = get_datasize(mc_header);
195 197
196 if (data_size + MC_HEADER_SIZE > total_size) { 198 if (data_size + MC_HEADER_SIZE > total_size) {
197 printk(KERN_ERR "microcode: error! " 199 pr_err("error! Bad data size in microcode data file\n");
198 "Bad data size in microcode data file\n");
199 return -EINVAL; 200 return -EINVAL;
200 } 201 }
201 202
202 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 203 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
203 printk(KERN_ERR "microcode: error! " 204 pr_err("error! Unknown microcode update format\n");
204 "Unknown microcode update format\n");
205 return -EINVAL; 205 return -EINVAL;
206 } 206 }
207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
208 if (ext_table_size) { 208 if (ext_table_size) {
209 if ((ext_table_size < EXT_HEADER_SIZE) 209 if ((ext_table_size < EXT_HEADER_SIZE)
210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
211 printk(KERN_ERR "microcode: error! " 211 pr_err("error! Small exttable size in microcode data file\n");
212 "Small exttable size in microcode data file\n");
213 return -EINVAL; 212 return -EINVAL;
214 } 213 }
215 ext_header = mc + MC_HEADER_SIZE + data_size; 214 ext_header = mc + MC_HEADER_SIZE + data_size;
216 if (ext_table_size != exttable_size(ext_header)) { 215 if (ext_table_size != exttable_size(ext_header)) {
217 printk(KERN_ERR "microcode: error! " 216 pr_err("error! Bad exttable size in microcode data file\n");
218 "Bad exttable size in microcode data file\n");
219 return -EFAULT; 217 return -EFAULT;
220 } 218 }
221 ext_sigcount = ext_header->count; 219 ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
230 while (i--) 228 while (i--)
231 ext_table_sum += ext_tablep[i]; 229 ext_table_sum += ext_tablep[i];
232 if (ext_table_sum) { 230 if (ext_table_sum) {
233 printk(KERN_WARNING "microcode: aborting, " 231 pr_warning("aborting, bad extended signature table checksum\n");
234 "bad extended signature table checksum\n");
235 return -EINVAL; 232 return -EINVAL;
236 } 233 }
237 } 234 }
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
242 while (i--) 239 while (i--)
243 orig_sum += ((int *)mc)[i]; 240 orig_sum += ((int *)mc)[i];
244 if (orig_sum) { 241 if (orig_sum) {
245 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 242 pr_err("aborting, bad checksum\n");
246 return -EINVAL; 243 return -EINVAL;
247 } 244 }
248 if (!ext_table_size) 245 if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
255 - (mc_header->sig + mc_header->pf + mc_header->cksum) 252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
256 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
257 if (sum) { 254 if (sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 255 pr_err("aborting, bad checksum\n");
259 return -EINVAL; 256 return -EINVAL;
260 } 257 }
261 } 258 }
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 324 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
328 325
329 if (val[1] != mc_intel->hdr.rev) { 326 if (val[1] != mc_intel->hdr.rev) {
330 printk(KERN_ERR "microcode: CPU%d update " 327 pr_err("CPU%d update to revision 0x%x failed\n",
331 "to revision 0x%x failed\n", 328 cpu_num, mc_intel->hdr.rev);
332 cpu_num, mc_intel->hdr.rev);
333 return -1; 329 return -1;
334 } 330 }
335 printk(KERN_INFO "microcode: CPU%d updated to revision " 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n",
336 "0x%x, date = %04x-%02x-%02x \n",
337 cpu_num, val[1], 332 cpu_num, val[1],
338 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
339 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 357
363 mc_size = get_totalsize(&mc_header); 358 mc_size = get_totalsize(&mc_header);
364 if (!mc_size || mc_size > leftover) { 359 if (!mc_size || mc_size > leftover) {
365 printk(KERN_ERR "microcode: error!" 360 pr_err("error! Bad data in microcode data file\n");
366 "Bad data in microcode data file\n");
367 break; 361 break;
368 } 362 }
369 363
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 vfree(uci->mc); 399 vfree(uci->mc);
406 uci->mc = (struct microcode_intel *)new_mc; 400 uci->mc = (struct microcode_intel *)new_mc;
407 401
408 pr_debug("microcode: CPU%d found a matching microcode update with" 402 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
409 " version 0x%x (current=0x%x)\n", 403 cpu, new_rev, uci->cpu_sig.rev);
410 cpu, new_rev, uci->cpu_sig.rev);
411out: 404out:
412 return state; 405 return state;
413} 406}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
429 c->x86, c->x86_model, c->x86_mask); 422 c->x86, c->x86_model, c->x86_mask);
430 423
431 if (request_firmware(&firmware, name, device)) { 424 if (request_firmware(&firmware, name, device)) {
432 pr_debug("microcode: data file %s load failed\n", name); 425 pr_debug("data file %s load failed\n", name);
433 return UCODE_NFOUND; 426 return UCODE_NFOUND;
434 } 427 }
435 428
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 35a57c963df9..40b54ceb68b5 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -945,9 +945,6 @@ void __init early_reserve_e820_mpc_new(void)
945{ 945{
946 if (enable_update_mptable && alloc_mptable) { 946 if (enable_update_mptable && alloc_mptable) {
947 u64 startt = 0; 947 u64 startt = 0;
948#ifdef CONFIG_X86_TRAMPOLINE
949 startt = TRAMPOLINE_BASE;
950#endif
951 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); 948 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
952 } 949 }
953} 950}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..206735ac8cbd 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -172,23 +172,18 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
172 172
173static int msr_open(struct inode *inode, struct file *file) 173static int msr_open(struct inode *inode, struct file *file)
174{ 174{
175 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 175 unsigned int cpu;
176 struct cpuinfo_x86 *c = &cpu_data(cpu); 176 struct cpuinfo_x86 *c;
177 int ret = 0;
178 177
179 lock_kernel();
180 cpu = iminor(file->f_path.dentry->d_inode); 178 cpu = iminor(file->f_path.dentry->d_inode);
179 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
180 return -ENXIO; /* No such CPU */
181 181
182 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
183 ret = -ENXIO; /* No such CPU */
184 goto out;
185 }
186 c = &cpu_data(cpu); 182 c = &cpu_data(cpu);
187 if (!cpu_has(c, X86_FEATURE_MSR)) 183 if (!cpu_has(c, X86_FEATURE_MSR))
188 ret = -EIO; /* MSR not supported */ 184 return -EIO; /* MSR not supported */
189out: 185
190 unlock_kernel(); 186 return 0;
191 return ret;
192} 187}
193 188
194/* 189/*
@@ -251,7 +246,7 @@ static int __init msr_init(void)
251 int i, err = 0; 246 int i, err = 0;
252 i = 0; 247 i = 0;
253 248
254 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { 249 if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
255 printk(KERN_ERR "msr: unable to get major %d for msr\n", 250 printk(KERN_ERR "msr: unable to get major %d for msr\n",
256 MSR_MAJOR); 251 MSR_MAJOR);
257 err = -EBUSY; 252 err = -EBUSY;
@@ -279,7 +274,7 @@ out_class:
279 msr_device_destroy(i); 274 msr_device_destroy(i);
280 class_destroy(msr_class); 275 class_destroy(msr_class);
281out_chrdev: 276out_chrdev:
282 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 277 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
283out: 278out:
284 return err; 279 return err;
285} 280}
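Unlike the old register_chrdev(), __register_chrdev() takes an explicit minor range, so the driver now owns exactly minors 0 through NR_CPUS-1 of MSR_MAJOR instead of the whole major. A usage sketch of the symmetric pair used above:

    /* claim minors [0, NR_CPUS) of MSR_MAJOR for the per-CPU msr nodes */
    if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops))
            return -EBUSY;

    /* ... */

    /* release exactly the same range on exit */
    __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");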
@@ -290,7 +285,7 @@ static void __exit msr_exit(void)
290 for_each_online_cpu(cpu) 285 for_each_online_cpu(cpu)
291 msr_device_destroy(cpu); 286 msr_device_destroy(cpu);
292 class_destroy(msr_class); 287 class_destroy(msr_class);
293 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 288 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
294 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 289 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
295} 290}
296 291
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 4006c522adc7..9d1d263f786f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -212,7 +212,7 @@ static int __init olpc_init(void)
212 unsigned char *romsig; 212 unsigned char *romsig;
213 213
214 /* The ioremap check is dangerous; limit what we run it on */ 214 /* The ioremap check is dangerous; limit what we run it on */
215 if (!is_geode() || geode_has_vsa2()) 215 if (!is_geode() || cs5535_has_vsa2())
216 return 0; 216 return 0;
217 217
218 spin_lock_init(&ec_lock); 218 spin_lock_init(&ec_lock);
@@ -244,7 +244,7 @@ static int __init olpc_init(void)
244 (unsigned char *) &olpc_platform_info.ecver, 1); 244 (unsigned char *) &olpc_platform_info.ecver, 1);
245 245
246 /* check to see if the VSA exists */ 246 /* check to see if the VSA exists */
247 if (geode_has_vsa2()) 247 if (cs5535_has_vsa2())
248 olpc_platform_info.flags |= OLPC_F_VSA; 248 olpc_platform_info.flags |= OLPC_F_VSA;
249 249
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..676b8c77a976 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,9 +8,9 @@
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
9 9
10static inline void 10static inline void
11default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) 11default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
12{ 12{
13 __raw_spin_lock(lock); 13 arch_spin_lock(lock);
14} 14}
15 15
16struct pv_lock_ops pv_lock_ops = { 16struct pv_lock_ops pv_lock_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..2bbde6078143 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h> 32#include <linux/crash_dump.h>
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include <linux/bitops.h> 34#include <linux/bitmap.h>
35#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
36#include <linux/pci.h> 36#include <linux/pci.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -211,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
211 212
212 spin_lock_irqsave(&tbl->it_lock, flags); 213 spin_lock_irqsave(&tbl->it_lock, flags);
213 214
214 iommu_area_reserve(tbl->it_map, index, npages); 215 bitmap_set(tbl->it_map, index, npages);
215 216
216 spin_unlock_irqrestore(&tbl->it_lock, flags); 217 spin_unlock_irqrestore(&tbl->it_lock, flags);
217} 218}
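bitmap_set() and bitmap_clear() are the generic replacements for the removed iommu-helper wrappers; both take the bitmap, a start bit, and a bit count. A minimal usage sketch:

    #include <linux/bitmap.h>

    DECLARE_BITMAP(map, 1024);

    bitmap_set(map, 16, 4);     /* mark TCE entries 16..19 as reserved */
    bitmap_clear(map, 16, 4);   /* and release them again */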
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -305,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
305 303
306 spin_lock_irqsave(&tbl->it_lock, flags); 304 spin_lock_irqsave(&tbl->it_lock, flags);
307 305
308 iommu_area_free(tbl->it_map, entry, npages); 306 bitmap_clear(tbl->it_map, entry, npages);
309 307
310 spin_unlock_irqrestore(&tbl->it_lock, flags); 308 spin_unlock_irqrestore(&tbl->it_lock, flags);
311} 309}
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
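The do/while replaces a jump straight to the root bus: a Calgary table may hang off an intermediate bridge, so every level is checked on the way up. A trace of the lookup for a device on bus 4 behind a bridge whose bus 1 owns the table (illustrative bus numbers):

    /* pbus = bus 4: pci_iommu() returns NULL     -> climb to parent */
    /* pbus = bus 1: tbl != NULL, it_busno == 1   -> match, stop     */
    /* the old while (pbus->parent) walked past bus 1 to the root    */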
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
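Together with the pci-dma.c hunk further down, this turns IOMMU bring-up into a hook: whichever detector wins installs its init function, and pci_iommu_init() calls whatever was installed. A sketch of the pattern, assuming the x86_init layout this series introduces (the default is a no-op stub):

    /* at detection time, exactly one hardware IOMMU detector wins: */
    x86_init.iommu.iommu_init = calgary_iommu_init;

    /* later, from pci_iommu_init(): */
    x86_init.iommu.iommu_init();    /* iommu_init_noop() if nothing was detected */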
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d20009b4e6ef..75e14e21f61a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
11#include <asm/gart.h> 11#include <asm/gart.h>
12#include <asm/calgary.h> 12#include <asm/calgary.h>
13#include <asm/amd_iommu.h> 13#include <asm/amd_iommu.h>
14#include <asm/x86_init.h>
14 15
15static int forbid_dac __read_mostly; 16static int forbid_dac __read_mostly;
16 17
17struct dma_map_ops *dma_ops; 18struct dma_map_ops *dma_ops = &nommu_dma_ops;
18EXPORT_SYMBOL(dma_ops); 19EXPORT_SYMBOL(dma_ops);
19 20
20static int iommu_sac_force __read_mostly; 21static int iommu_sac_force __read_mostly;
@@ -42,15 +43,10 @@ int iommu_detected __read_mostly = 0;
42 */ 43 */
43int iommu_pass_through __read_mostly; 44int iommu_pass_through __read_mostly;
44 45
45dma_addr_t bad_dma_address __read_mostly = 0; 46/* Dummy device used for NULL arguments (normally ISA). */
46EXPORT_SYMBOL(bad_dma_address);
47
48/* Dummy device used for NULL arguments (normally ISA). Better would
49 be probably a smaller DMA mask, but this is bug-to-bug compatible
50 to older i386. */
51struct device x86_dma_fallback_dev = { 47struct device x86_dma_fallback_dev = {
52 .init_name = "fallback device", 48 .init_name = "fallback device",
53 .coherent_dma_mask = DMA_BIT_MASK(32), 49 .coherent_dma_mask = ISA_DMA_BIT_MASK,
54 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, 50 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
55}; 51};
56EXPORT_SYMBOL(x86_dma_fallback_dev); 52EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -128,19 +124,18 @@ void __init pci_iommu_alloc(void)
128 /* free the range so iommu could get some range less than 4G */ 124 /* free the range so iommu could get some range less than 4G */
129 dma32_free_bootmem(); 125 dma32_free_bootmem();
130#endif 126#endif
127 if (pci_swiotlb_detect())
128 goto out;
131 129
132 /*
133 * The order of these functions is important for
134 * fall-back/fail-over reasons
135 */
136 gart_iommu_hole_init(); 130 gart_iommu_hole_init();
137 131
138 detect_calgary(); 132 detect_calgary();
139 133
140 detect_intel_iommu(); 134 detect_intel_iommu();
141 135
136 /* needs to be called after gart_iommu_hole_init */
142 amd_iommu_detect(); 137 amd_iommu_detect();
143 138out:
144 pci_swiotlb_init(); 139 pci_swiotlb_init();
145} 140}
146 141
@@ -216,7 +211,7 @@ static __init int iommu_setup(char *p)
216 if (!strncmp(p, "allowdac", 8)) 211 if (!strncmp(p, "allowdac", 8))
217 forbid_dac = 0; 212 forbid_dac = 0;
218 if (!strncmp(p, "nodac", 5)) 213 if (!strncmp(p, "nodac", 5))
219 forbid_dac = -1; 214 forbid_dac = 1;
220 if (!strncmp(p, "usedac", 6)) { 215 if (!strncmp(p, "usedac", 6)) {
221 forbid_dac = -1; 216 forbid_dac = -1;
222 return 1; 217 return 1;
@@ -291,27 +286,19 @@ static int __init pci_iommu_init(void)
291#ifdef CONFIG_PCI 286#ifdef CONFIG_PCI
292 dma_debug_add_bus(&pci_bus_type); 287 dma_debug_add_bus(&pci_bus_type);
293#endif 288#endif
289 x86_init.iommu.iommu_init();
294 290
295 calgary_iommu_init(); 291 if (swiotlb) {
296 292 printk(KERN_INFO "PCI-DMA: "
297 intel_iommu_init(); 293 "Using software bounce buffering for IO (SWIOTLB)\n");
294 swiotlb_print_info();
295 } else
296 swiotlb_free();
298 297
299 amd_iommu_init();
300
301 gart_iommu_init();
302
303 no_iommu_init();
304 return 0; 298 return 0;
305} 299}
306
307void pci_iommu_shutdown(void)
308{
309 gart_iommu_shutdown();
310
311 amd_iommu_shutdown();
312}
313/* Must execute after PCI subsystem */ 300/* Must execute after PCI subsystem */
314fs_initcall(pci_iommu_init); 301rootfs_initcall(pci_iommu_init);
315 302
316#ifdef CONFIG_PCI 303#ifdef CONFIG_PCI
317/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ 304/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a7f1b64f86e0..34de53b46f87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -23,7 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/topology.h> 24#include <linux/topology.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/bitops.h> 26#include <linux/bitmap.h>
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
@@ -39,6 +39,7 @@
39#include <asm/swiotlb.h> 39#include <asm/swiotlb.h>
40#include <asm/dma.h> 40#include <asm/dma.h>
41#include <asm/k8.h> 41#include <asm/k8.h>
42#include <asm/x86_init.h>
42 43
43static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 44static unsigned long iommu_bus_base; /* GART remapping area (physical) */
44static unsigned long iommu_size; /* size of remapping area bytes */ 45static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
46 47
47static u32 *iommu_gatt_base; /* Remapping table */ 48static u32 *iommu_gatt_base; /* Remapping table */
48 49
50static dma_addr_t bad_dma_addr;
51
49/* 52/*
50 * If this is disabled the IOMMU will use an optimized flushing strategy 53 * If this is disabled the IOMMU will use an optimized flushing strategy
51 * of only flushing when an mapping is reused. With it true the GART is 54 * of only flushing when an mapping is reused. With it true the GART is
@@ -92,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
92 95
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 96 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 97 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 98 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 99 PAGE_SIZE) >> PAGE_SHIFT;
97 100
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 101 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -123,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
123 unsigned long flags; 126 unsigned long flags;
124 127
125 spin_lock_irqsave(&iommu_bitmap_lock, flags); 128 spin_lock_irqsave(&iommu_bitmap_lock, flags);
126 iommu_area_free(iommu_gart_bitmap, offset, size); 129 bitmap_clear(iommu_gart_bitmap, offset, size);
127 if (offset >= next_bit) 130 if (offset >= next_bit)
128 next_bit = offset + size; 131 next_bit = offset + size;
129 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 132 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
216 if (panic_on_overflow) 219 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size); 220 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir); 221 iommu_full(dev, size, dir);
219 return bad_dma_address; 222 return bad_dma_addr;
220 } 223 }
221 224
222 for (i = 0; i < npages; i++) { 225 for (i = 0; i < npages; i++) {
@@ -294,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
294 int i; 297 int i;
295 298
296#ifdef CONFIG_IOMMU_DEBUG 299#ifdef CONFIG_IOMMU_DEBUG
297 printk(KERN_DEBUG "dma_map_sg overflow\n"); 300 pr_debug("dma_map_sg overflow\n");
298#endif 301#endif
299 302
300 for_each_sg(sg, s, nents, i) { 303 for_each_sg(sg, s, nents, i) {
@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
302 305
303 if (nonforced_iommu(dev, addr, s->length)) { 306 if (nonforced_iommu(dev, addr, s->length)) {
304 addr = dma_map_area(dev, addr, s->length, dir, 0); 307 addr = dma_map_area(dev, addr, s->length, dir, 0);
305 if (addr == bad_dma_address) { 308 if (addr == bad_dma_addr) {
306 if (i > 0) 309 if (i > 0)
307 gart_unmap_sg(dev, sg, i, dir, NULL); 310 gart_unmap_sg(dev, sg, i, dir, NULL);
308 nents = 0; 311 nents = 0;
@@ -389,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
389 if (!dev) 392 if (!dev)
390 dev = &x86_dma_fallback_dev; 393 dev = &x86_dma_fallback_dev;
391 394
392 out = 0; 395 out = 0;
393 start = 0; 396 start = 0;
394 start_sg = sgmap = sg; 397 start_sg = sg;
395 seg_size = 0; 398 sgmap = sg;
396 max_seg_size = dma_get_max_seg_size(dev); 399 seg_size = 0;
397 ps = NULL; /* shut up gcc */ 400 max_seg_size = dma_get_max_seg_size(dev);
401 ps = NULL; /* shut up gcc */
402
398 for_each_sg(sg, s, nents, i) { 403 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 404 dma_addr_t addr = sg_phys(s);
400 405
@@ -417,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
417 sgmap, pages, need) < 0) 422 sgmap, pages, need) < 0)
418 goto error; 423 goto error;
419 out++; 424 out++;
420 seg_size = 0; 425
421 sgmap = sg_next(sgmap); 426 seg_size = 0;
422 pages = 0; 427 sgmap = sg_next(sgmap);
423 start = i; 428 pages = 0;
424 start_sg = s; 429 start = i;
430 start_sg = s;
425 } 431 }
426 } 432 }
427 433
@@ -455,7 +461,7 @@ error:
455 461
456 iommu_full(dev, pages << PAGE_SHIFT, dir); 462 iommu_full(dev, pages << PAGE_SHIFT, dir);
457 for_each_sg(sg, s, nents, i) 463 for_each_sg(sg, s, nents, i)
458 s->dma_address = bad_dma_address; 464 s->dma_address = bad_dma_addr;
459 return 0; 465 return 0;
460} 466}
461 467
@@ -479,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
479 DMA_BIDIRECTIONAL, align_mask); 485 DMA_BIDIRECTIONAL, align_mask);
480 486
481 flush_gart(); 487 flush_gart();
482 if (paddr != bad_dma_address) { 488 if (paddr != bad_dma_addr) {
483 *dma_addr = paddr; 489 *dma_addr = paddr;
484 return page_address(page); 490 return page_address(page);
485 } 491 }
@@ -499,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
499 free_pages((unsigned long)vaddr, get_order(size)); 505 free_pages((unsigned long)vaddr, get_order(size));
500} 506}
501 507
508static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
509{
510 return (dma_addr == bad_dma_addr);
511}
512
502static int no_agp; 513static int no_agp;
503 514
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 515static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -515,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
515 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 526 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
516 527
517 if (iommu_size < 64*1024*1024) { 528 if (iommu_size < 64*1024*1024) {
518 printk(KERN_WARNING 529 pr_warning(
519 "PCI-DMA: Warning: Small IOMMU %luMB." 530 "PCI-DMA: Warning: Small IOMMU %luMB."
520 " Consider increasing the AGP aperture in BIOS\n", 531 " Consider increasing the AGP aperture in BIOS\n",
521 iommu_size >> 20); 532 iommu_size >> 20);
@@ -570,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
570 aperture_alloc = aper_alloc; 581 aperture_alloc = aper_alloc;
571} 582}
572 583
573static int gart_resume(struct sys_device *dev) 584static void gart_fixup_northbridges(struct sys_device *dev)
574{ 585{
575 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 586 int i;
576 587
577 if (fix_up_north_bridges) { 588 if (!fix_up_north_bridges)
578 int i; 589 return;
579 590
580 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 591 pr_info("PCI-DMA: Restoring GART aperture settings\n");
581 592
582 for (i = 0; i < num_k8_northbridges; i++) { 593 for (i = 0; i < num_k8_northbridges; i++) {
583 struct pci_dev *dev = k8_northbridges[i]; 594 struct pci_dev *dev = k8_northbridges[i];
584 595
585 /* 596 /*
586 * Don't enable translations just yet. That is the next 597 * Don't enable translations just yet. That is the next
587 * step. Restore the pre-suspend aperture settings. 598 * step. Restore the pre-suspend aperture settings.
588 */ 599 */
589 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 600 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
590 aperture_order << 1); 601 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
591 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
592 aperture_alloc >> 25);
593 }
594 } 602 }
603}
604
605static int gart_resume(struct sys_device *dev)
606{
607 pr_info("PCI-DMA: Resuming GART IOMMU\n");
608
609 gart_fixup_northbridges(dev);
595 610
596 enable_gart_translations(); 611 enable_gart_translations();
597 612
@@ -604,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
604} 619}
605 620
606static struct sysdev_class gart_sysdev_class = { 621static struct sysdev_class gart_sysdev_class = {
607 .name = "gart", 622 .name = "gart",
608 .suspend = gart_suspend, 623 .suspend = gart_suspend,
609 .resume = gart_resume, 624 .resume = gart_resume,
610 625
611}; 626};
612 627
613static struct sys_device device_gart = { 628static struct sys_device device_gart = {
614 .id = 0, 629 .cls = &gart_sysdev_class,
615 .cls = &gart_sysdev_class,
616}; 630};
617 631
618/* 632/*
@@ -627,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
627 void *gatt; 641 void *gatt;
628 int i, error; 642 int i, error;
629 643
630 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 644 pr_info("PCI-DMA: Disabling AGP.\n");
645
631 aper_size = aper_base = info->aper_size = 0; 646 aper_size = aper_base = info->aper_size = 0;
632 dev = NULL; 647 dev = NULL;
633 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
@@ -645,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
645 } 660 }
646 if (!aper_base) 661 if (!aper_base)
647 goto nommu; 662 goto nommu;
663
648 info->aper_base = aper_base; 664 info->aper_base = aper_base;
649 info->aper_size = aper_size >> 20; 665 info->aper_size = aper_size >> 20;
650 666
@@ -667,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
667 683
668 flush_gart(); 684 flush_gart();
669 685
670 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 686 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
671 aper_base, aper_size>>10); 687 aper_base, aper_size>>10);
672 688
673 return 0; 689 return 0;
674 690
675 nommu: 691 nommu:
676 /* Should not happen anymore */ 692 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 693 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 "falling back to iommu=soft.\n"); 694 "falling back to iommu=soft.\n");
679 return -1; 695 return -1;
680} 696}
@@ -686,14 +702,16 @@ static struct dma_map_ops gart_dma_ops = {
686 .unmap_page = gart_unmap_page, 702 .unmap_page = gart_unmap_page,
687 .alloc_coherent = gart_alloc_coherent, 703 .alloc_coherent = gart_alloc_coherent,
688 .free_coherent = gart_free_coherent, 704 .free_coherent = gart_free_coherent,
705 .mapping_error = gart_mapping_error,
689}; 706};
690 707
691void gart_iommu_shutdown(void) 708static void gart_iommu_shutdown(void)
692{ 709{
693 struct pci_dev *dev; 710 struct pci_dev *dev;
694 int i; 711 int i;
695 712
696 if (no_agp && (dma_ops != &gart_dma_ops)) 713 /* don't shut it down if AGP is installed */
714 if (!no_agp)
697 return; 715 return;
698 716
699 for (i = 0; i < num_k8_northbridges; i++) { 717 for (i = 0; i < num_k8_northbridges; i++) {
@@ -708,7 +726,7 @@ void gart_iommu_shutdown(void)
708 } 726 }
709} 727}
710 728
711void __init gart_iommu_init(void) 729int __init gart_iommu_init(void)
712{ 730{
713 struct agp_kern_info info; 731 struct agp_kern_info info;
714 unsigned long iommu_start; 732 unsigned long iommu_start;
@@ -718,7 +736,7 @@ void __init gart_iommu_init(void)
718 long i; 736 long i;
719 737
720 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 738 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
721 return; 739 return 0;
722 740
723#ifndef CONFIG_AGP_AMD64 741#ifndef CONFIG_AGP_AMD64
724 no_agp = 1; 742 no_agp = 1;
@@ -730,35 +748,28 @@ void __init gart_iommu_init(void)
730 (agp_copy_info(agp_bridge, &info) < 0); 748 (agp_copy_info(agp_bridge, &info) < 0);
731#endif 749#endif
732 750
733 if (swiotlb)
734 return;
735
736 /* Did we detect a different HW IOMMU? */
737 if (iommu_detected && !gart_iommu_aperture)
738 return;
739
740 if (no_iommu || 751 if (no_iommu ||
741 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 752 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
742 !gart_iommu_aperture || 753 !gart_iommu_aperture ||
743 (no_agp && init_k8_gatt(&info) < 0)) { 754 (no_agp && init_k8_gatt(&info) < 0)) {
744 if (max_pfn > MAX_DMA32_PFN) { 755 if (max_pfn > MAX_DMA32_PFN) {
745 printk(KERN_WARNING "More than 4GB of memory " 756 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
746 "but GART IOMMU not available.\n"); 757 pr_warning("falling back to iommu=soft.\n");
747 printk(KERN_WARNING "falling back to iommu=soft.\n");
748 } 758 }
749 return; 759 return 0;
750 } 760 }
751 761
752 /* need to map that range */ 762 /* need to map that range */
753 aper_size = info.aper_size << 20; 763 aper_size = info.aper_size << 20;
754 aper_base = info.aper_base; 764 aper_base = info.aper_base;
755 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 765 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
766
756 if (end_pfn > max_low_pfn_mapped) { 767 if (end_pfn > max_low_pfn_mapped) {
757 start_pfn = (aper_base>>PAGE_SHIFT); 768 start_pfn = (aper_base>>PAGE_SHIFT);
758 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 769 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
759 } 770 }
760 771
761 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 772 pr_info("PCI-DMA: using GART IOMMU.\n");
762 iommu_size = check_iommu_size(info.aper_base, aper_size); 773 iommu_size = check_iommu_size(info.aper_base, aper_size);
763 iommu_pages = iommu_size >> PAGE_SHIFT; 774 iommu_pages = iommu_size >> PAGE_SHIFT;
764 775
@@ -773,8 +784,7 @@ void __init gart_iommu_init(void)
773 784
774 ret = dma_debug_resize_entries(iommu_pages); 785 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret) 786 if (ret)
776 printk(KERN_DEBUG 787 pr_debug("PCI-DMA: Cannot trace all the entries\n");
777 "PCI-DMA: Cannot trace all the entries\n");
778 } 788 }
779#endif 789#endif
780 790
@@ -782,17 +792,16 @@ void __init gart_iommu_init(void)
782 * Out of IOMMU space handling. 792 * Out of IOMMU space handling.
783 * Reserve some invalid pages at the beginning of the GART. 793 * Reserve some invalid pages at the beginning of the GART.
784 */ 794 */
785 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 795 bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
786 796
787 agp_memory_reserved = iommu_size; 797 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
788 printk(KERN_INFO
789 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
790 iommu_size >> 20); 798 iommu_size >> 20);
791 799
792 iommu_start = aper_size - iommu_size; 800 agp_memory_reserved = iommu_size;
793 iommu_bus_base = info.aper_base + iommu_start; 801 iommu_start = aper_size - iommu_size;
794 bad_dma_address = iommu_bus_base; 802 iommu_bus_base = info.aper_base + iommu_start;
795 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 803 bad_dma_addr = iommu_bus_base;
804 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
796 805
797 /* 806 /*
798 * Unmap the IOMMU part of the GART. The alias of the page is 807 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -814,7 +823,7 @@ void __init gart_iommu_init(void)
814 * the pages as Not-Present: 823 * the pages as Not-Present:
815 */ 824 */
816 wbinvd(); 825 wbinvd();
817 826
818 /* 827 /*
819 * Now all caches are flushed and we can safely enable 828 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility 829 * GART hardware. Doing it early leaves the possibility
@@ -838,6 +847,10 @@ void __init gart_iommu_init(void)
838 847
839 flush_gart(); 848 flush_gart();
840 dma_ops = &gart_dma_ops; 849 dma_ops = &gart_dma_ops;
850 x86_platform.iommu_shutdown = gart_iommu_shutdown;
851 swiotlb = 0;
852
853 return 0;
841} 854}
842 855
843void __init gart_parse_options(char *p) 856void __init gart_parse_options(char *p)
@@ -856,7 +869,7 @@ void __init gart_parse_options(char *p)
856#endif 869#endif
857 if (isdigit(*p) && get_option(&p, &arg)) 870 if (isdigit(*p) && get_option(&p, &arg))
858 iommu_size = arg; 871 iommu_size = arg;
859 if (!strncmp(p, "fullflush", 8)) 872 if (!strncmp(p, "fullflush", 9))
860 iommu_fullflush = 1; 873 iommu_fullflush = 1;
861 if (!strncmp(p, "nofullflush", 11)) 874 if (!strncmp(p, "nofullflush", 11))
862 iommu_fullflush = 0; 875 iommu_fullflush = 0;
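A subtle fix in gart_parse_options() above: strncmp() with n = 8 compared only the prefix "fullflus", so a mistyped option could still enable full flushing; n = 9 covers the whole option name. A small userspace illustration of the difference:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* n == 8 stops after "fullflus": the bogus option matches */
		printf("%d\n", strncmp("fullflusX", "fullflush", 8));		/* 0 */
		/* n == 9 compares the whole option name: no match */
		printf("%d\n", strncmp("fullflusX", "fullflush", 9) != 0);	/* 1 */
		return 0;
	}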
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..22be12b60a8f 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
33 dma_addr_t bus = page_to_phys(page) + offset; 33 dma_addr_t bus = page_to_phys(page) + offset;
34 WARN_ON(size == 0); 34 WARN_ON(size == 0);
35 if (!check_addr("map_single", dev, bus, size)) 35 if (!check_addr("map_single", dev, bus, size))
36 return bad_dma_address; 36 return DMA_ERROR_CODE;
37 flush_write_buffers(); 37 flush_write_buffers();
38 return bus; 38 return bus;
39} 39}
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
103 .sync_sg_for_device = nommu_sync_sg_for_device, 103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1, 104 .is_phys = 1,
105}; 105};
106
107void __init no_iommu_init(void)
108{
109 if (dma_ops)
110 return;
111
112 force_iommu = 0; /* no HW IOMMU */
113 dma_ops = &nommu_dma_ops;
114}
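With no_iommu_init() gone, the nommu path reports mapping failure through the generic DMA_ERROR_CODE, and the GART hunk above wires a .mapping_error hook behind dma_mapping_error(). The driver-side contract is unchanged; a minimal sketch of the standard check, assuming an already-bound struct device:

	#include <linux/dma-mapping.h>

	static int map_one_page(struct device *dev, struct page *page)
	{
		dma_addr_t handle = dma_map_page(dev, page, 0, PAGE_SIZE,
						 DMA_TO_DEVICE);

		/* dispatches to the ops' .mapping_error hook,
		 * e.g. gart_mapping_error() above */
		if (dma_mapping_error(dev, handle))
			return -EIO;

		dma_unmap_page(dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
		return 0;
	}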
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..7d2829dde20e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
42 .dma_supported = NULL, 42 .dma_supported = NULL,
43}; 43};
44 44
45void __init pci_swiotlb_init(void) 45/*
46 * pci_swiotlb_detect - set swiotlb to 1 if necessary
47 *
48 * This returns non-zero if we are forced to use swiotlb (by the boot
49 * option).
50 */
51int __init pci_swiotlb_detect(void)
46{ 52{
53 int use_swiotlb = swiotlb | swiotlb_force;
54
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 55 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) 57 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
50 swiotlb = 1; 58 swiotlb = 1;
51#endif 59#endif
52 if (swiotlb_force) 60 if (swiotlb_force)
53 swiotlb = 1; 61 swiotlb = 1;
62
63 return use_swiotlb;
64}
65
66void __init pci_swiotlb_init(void)
67{
54 if (swiotlb) { 68 if (swiotlb) {
55 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); 69 swiotlb_init(0);
56 swiotlb_init();
57 dma_ops = &swiotlb_dma_ops; 70 dma_ops = &swiotlb_dma_ops;
58 } 71 }
59} 72}
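Splitting pci_swiotlb_detect() out of pci_swiotlb_init() lets the caller order the probes: a forced swiotlb can short-circuit hardware IOMMU detection, and the bounce-buffer setup happens once at the end. A hedged sketch of a consumer of the split (extern declarations assumed; the actual pci_iommu_alloc() sequence may differ):

	void __init iommu_probe_sketch(void)
	{
		/* non-zero when swiotlb is forced by the boot option */
		if (pci_swiotlb_detect())
			goto done;

		gart_iommu_hole_init();	/* probe HW IOMMUs only if not forced */
	done:
		pci_swiotlb_init();	/* no-op unless something set swiotlb = 1 */
	}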
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..c9b3522b6b46 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,11 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h>
14#include <linux/utsname.h>
12#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 17#include <asm/system.h>
14#include <asm/apic.h> 18#include <asm/apic.h>
15#include <asm/syscalls.h> 19#include <asm/syscalls.h>
@@ -17,6 +21,7 @@
17#include <asm/uaccess.h> 21#include <asm/uaccess.h>
18#include <asm/i387.h> 22#include <asm/i387.h>
19#include <asm/ds.h> 23#include <asm/ds.h>
24#include <asm/debugreg.h>
20 25
21unsigned long idle_halt; 26unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 27EXPORT_SYMBOL(idle_halt);
@@ -87,30 +92,30 @@ void exit_thread(void)
87 } 92 }
88} 93}
89 94
90void flush_thread(void) 95void show_regs_common(void)
91{ 96{
92 struct task_struct *tsk = current; 97 const char *board, *product;
93 98
94#ifdef CONFIG_X86_64 99 board = dmi_get_system_info(DMI_BOARD_NAME);
95 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { 100 if (!board)
96 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); 101 board = "";
97 if (test_tsk_thread_flag(tsk, TIF_IA32)) { 102 product = dmi_get_system_info(DMI_PRODUCT_NAME);
98 clear_tsk_thread_flag(tsk, TIF_IA32); 103 if (!product)
99 } else { 104 product = "";
100 set_tsk_thread_flag(tsk, TIF_IA32);
101 current_thread_info()->status |= TS_COMPAT;
102 }
103 }
104#endif
105 105
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 106 printk(KERN_CONT "\n");
107 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
108 current->pid, current->comm, print_tainted(),
109 init_utsname()->release,
110 (int)strcspn(init_utsname()->version, " "),
111 init_utsname()->version, board, product);
112}
113
114void flush_thread(void)
115{
116 struct task_struct *tsk = current;
107 117
108 tsk->thread.debugreg0 = 0; 118 flush_ptrace_hw_breakpoint(tsk);
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 119 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 120 /*
116 * Forget coprocessor state.. 121 * Forget coprocessor state..
@@ -192,16 +197,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 197 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 198 update_debugctlmsr(next->debugctlmsr);
194 199
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 200 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 201 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 202 /* prev and next are different */
@@ -224,6 +219,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
224 */ 219 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 220 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 } 221 }
222 propagate_user_return_notify(prev_p, next_p);
227} 223}
228 224
229int sys_fork(struct pt_regs *regs) 225int sys_fork(struct pt_regs *regs)
@@ -247,6 +243,78 @@ int sys_vfork(struct pt_regs *regs)
247 NULL, NULL); 243 NULL, NULL);
248} 244}
249 245
246long
247sys_clone(unsigned long clone_flags, unsigned long newsp,
248 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
249{
250 if (!newsp)
251 newsp = regs->sp;
252 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
253}
254
255/*
256 * This gets run with %si containing the
257 * function to call, and %di containing
258 * the "args".
259 */
260extern void kernel_thread_helper(void);
261
262/*
263 * Create a kernel thread
264 */
265int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
266{
267 struct pt_regs regs;
268
269 memset(&regs, 0, sizeof(regs));
270
271 regs.si = (unsigned long) fn;
272 regs.di = (unsigned long) arg;
273
274#ifdef CONFIG_X86_32
275 regs.ds = __USER_DS;
276 regs.es = __USER_DS;
277 regs.fs = __KERNEL_PERCPU;
278 regs.gs = __KERNEL_STACK_CANARY;
279#else
280 regs.ss = __KERNEL_DS;
281#endif
282
283 regs.orig_ax = -1;
284 regs.ip = (unsigned long) kernel_thread_helper;
285 regs.cs = __KERNEL_CS | get_kernel_rpl();
286 regs.flags = X86_EFLAGS_IF | 0x2;
287
288 /* Ok, create the new process.. */
289 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
290}
291EXPORT_SYMBOL(kernel_thread);
292
293/*
294 * sys_execve() executes a new program.
295 */
296long sys_execve(char __user *name, char __user * __user *argv,
297 char __user * __user *envp, struct pt_regs *regs)
298{
299 long error;
300 char *filename;
301
302 filename = getname(name);
303 error = PTR_ERR(filename);
304 if (IS_ERR(filename))
305 return error;
306 error = do_execve(filename, argv, envp, regs);
307
308#ifdef CONFIG_X86_32
309 if (error == 0) {
310 /* Make sure we don't return using sysenter.. */
311 set_thread_flag(TIF_IRET);
312 }
313#endif
314
315 putname(filename);
316 return error;
317}
250 318
251/* 319/*
252 * Idle related variables and functions 320 * Idle related variables and functions
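kernel_thread(), sys_clone() and sys_execve() now live once in the shared process.c, with the thread function in %si and its argument in %di on both 32- and 64-bit (the old 32-bit variant used %bx/%dx). A usage sketch; the worker and flags here are illustrative, not from the patch, and kernel_thread() itself ORs in CLONE_VM | CLONE_UNTRACED:

	static int worker_sketch(void *data)
	{
		pr_info("worker started, arg=%p\n", data);
		return 0;
	}

	static void spawn_sketch(void)
	{
		kernel_thread(worker_sketch, NULL,
			      CLONE_FS | CLONE_FILES | SIGCHLD);
	}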
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..37ad1e046aae 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/user.h> 24#include <linux/user.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/utsname.h>
27#include <linux/delay.h> 26#include <linux/delay.h>
28#include <linux/reboot.h> 27#include <linux/reboot.h>
29#include <linux/init.h> 28#include <linux/init.h>
@@ -35,7 +34,6 @@
35#include <linux/tick.h> 34#include <linux/tick.h>
36#include <linux/percpu.h> 35#include <linux/percpu.h>
37#include <linux/prctl.h> 36#include <linux/prctl.h>
38#include <linux/dmi.h>
39#include <linux/ftrace.h> 37#include <linux/ftrace.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/io.h> 39#include <linux/io.h>
@@ -58,6 +56,7 @@
58#include <asm/idle.h> 56#include <asm/idle.h>
59#include <asm/syscalls.h> 57#include <asm/syscalls.h>
60#include <asm/ds.h> 58#include <asm/ds.h>
59#include <asm/debugreg.h>
61 60
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 62
@@ -127,39 +126,29 @@ void __show_regs(struct pt_regs *regs, int all)
127 unsigned long d0, d1, d2, d3, d6, d7; 126 unsigned long d0, d1, d2, d3, d6, d7;
128 unsigned long sp; 127 unsigned long sp;
129 unsigned short ss, gs; 128 unsigned short ss, gs;
130 const char *board;
131 129
132 if (user_mode_vm(regs)) { 130 if (user_mode_vm(regs)) {
133 sp = regs->sp; 131 sp = regs->sp;
134 ss = regs->ss & 0xffff; 132 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 133 gs = get_user_gs(regs);
136 } else { 134 } else {
137 sp = (unsigned long) (&regs->sp); 135 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 136 savesegment(ss, ss);
139 savesegment(gs, gs); 137 savesegment(gs, gs);
140 } 138 }
141 139
142 printk("\n"); 140 show_regs_common();
143 141
144 board = dmi_get_system_info(DMI_PRODUCT_NAME); 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
145 if (!board)
146 board = "";
147 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
148 task_pid_nr(current), current->comm,
149 print_tainted(), init_utsname()->release,
150 (int)strcspn(init_utsname()->version, " "),
151 init_utsname()->version, board);
152
153 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
154 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
155 smp_processor_id()); 144 smp_processor_id());
156 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
157 146
158 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
159 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
160 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
161 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
162 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
163 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
164 153
165 if (!all) 154 if (!all)
@@ -169,61 +158,28 @@ void __show_regs(struct pt_regs *regs, int all)
169 cr2 = read_cr2(); 158 cr2 = read_cr2();
170 cr3 = read_cr3(); 159 cr3 = read_cr3();
171 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
172 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
173 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
174 163
175 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
176 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
177 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
178 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
179 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
180 d0, d1, d2, d3); 169 d0, d1, d2, d3);
181 170
182 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
183 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
184 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
185 d6, d7); 174 d6, d7);
186} 175}
187 176
188void show_regs(struct pt_regs *regs) 177void show_regs(struct pt_regs *regs)
189{ 178{
190 __show_regs(regs, 1); 179 show_registers(regs);
191 show_trace(NULL, regs, &regs->sp, regs->bp); 180 show_trace(NULL, regs, &regs->sp, regs->bp);
192} 181}
193 182
194/*
195 * This gets run with %bx containing the
196 * function to call, and %dx containing
197 * the "args".
198 */
199extern void kernel_thread_helper(void);
200
201/*
202 * Create a kernel thread
203 */
204int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
205{
206 struct pt_regs regs;
207
208 memset(&regs, 0, sizeof(regs));
209
210 regs.bx = (unsigned long) fn;
211 regs.dx = (unsigned long) arg;
212
213 regs.ds = __USER_DS;
214 regs.es = __USER_DS;
215 regs.fs = __KERNEL_PERCPU;
216 regs.gs = __KERNEL_STACK_CANARY;
217 regs.orig_ax = -1;
218 regs.ip = (unsigned long) kernel_thread_helper;
219 regs.cs = __KERNEL_CS | get_kernel_rpl();
220 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
221
222 /* Ok, create the new process.. */
223 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
224}
225EXPORT_SYMBOL(kernel_thread);
226
227void release_thread(struct task_struct *dead_task) 183void release_thread(struct task_struct *dead_task)
228{ 184{
229 BUG_ON(dead_task->mm); 185 BUG_ON(dead_task->mm);
@@ -259,7 +215,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 215
260 task_user_gs(p) = get_user_gs(regs); 216 task_user_gs(p) = get_user_gs(regs);
261 217
218 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 219 tsk = current;
220 err = -ENOMEM;
221
222 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
223
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 224 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 225 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 226 IO_BITMAP_BYTES, GFP_KERNEL);
@@ -430,46 +391,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
430 return prev_p; 391 return prev_p;
431} 392}
432 393
433int sys_clone(struct pt_regs *regs)
434{
435 unsigned long clone_flags;
436 unsigned long newsp;
437 int __user *parent_tidptr, *child_tidptr;
438
439 clone_flags = regs->bx;
440 newsp = regs->cx;
441 parent_tidptr = (int __user *)regs->dx;
442 child_tidptr = (int __user *)regs->di;
443 if (!newsp)
444 newsp = regs->sp;
445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
446}
447
448/*
449 * sys_execve() executes a new program.
450 */
451int sys_execve(struct pt_regs *regs)
452{
453 int error;
454 char *filename;
455
456 filename = getname((char __user *) regs->bx);
457 error = PTR_ERR(filename);
458 if (IS_ERR(filename))
459 goto out;
460 error = do_execve(filename,
461 (char __user * __user *) regs->cx,
462 (char __user * __user *) regs->dx,
463 regs);
464 if (error == 0) {
465 /* Make sure we don't return using sysenter.. */
466 set_thread_flag(TIF_IRET);
467 }
468 putname(filename);
469out:
470 return error;
471}
472
473#define top_esp (THREAD_SIZE - sizeof(unsigned long)) 394#define top_esp (THREAD_SIZE - sizeof(unsigned long))
474#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 395#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
475 396
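__show_regs() above now asks kernel_stack_pointer(regs) for the kernel-mode case instead of open-coding &regs->sp. On 32-bit, a trap from kernel mode does not push sp/ss, so the pre-trap stack pointer is the address of the slot where sp would have been saved. Roughly, as a sketch (the real helper lives in <asm/ptrace.h> and may handle more cases):

	static inline unsigned long kernel_stack_pointer_sketch(struct pt_regs *regs)
	{
		/* 32-bit kernel-mode frames end just before sp/ss, so the
		 * old stack pointer is the location of the unsaved sp slot */
		return (unsigned long)&regs->sp;
	}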
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b683170..41a26a82470a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/user.h> 27#include <linux/user.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/utsname.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/module.h> 30#include <linux/module.h>
32#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
38#include <linux/uaccess.h> 37#include <linux/uaccess.h>
39#include <linux/io.h> 38#include <linux/io.h>
40#include <linux/ftrace.h> 39#include <linux/ftrace.h>
41#include <linux/dmi.h>
42 40
43#include <asm/pgtable.h> 41#include <asm/pgtable.h>
44#include <asm/system.h> 42#include <asm/system.h>
@@ -52,14 +50,13 @@
52#include <asm/idle.h> 50#include <asm/idle.h>
53#include <asm/syscalls.h> 51#include <asm/syscalls.h>
54#include <asm/ds.h> 52#include <asm/ds.h>
53#include <asm/debugreg.h>
55 54
56asmlinkage extern void ret_from_fork(void); 55asmlinkage extern void ret_from_fork(void);
57 56
58DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
59static DEFINE_PER_CPU(unsigned char, is_idle); 58static DEFINE_PER_CPU(unsigned char, is_idle);
60 59
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 60static ATOMIC_NOTIFIER_HEAD(idle_notifier);
64 61
65void idle_notifier_register(struct notifier_block *n) 62void idle_notifier_register(struct notifier_block *n)
@@ -162,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all)
162 unsigned long d0, d1, d2, d3, d6, d7; 159 unsigned long d0, d1, d2, d3, d6, d7;
163 unsigned int fsindex, gsindex; 160 unsigned int fsindex, gsindex;
164 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
165 const char *board; 162
166 163 show_regs_common();
167 printk("\n"); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
168 print_modules();
169 board = dmi_get_system_info(DMI_PRODUCT_NAME);
170 if (!board)
171 board = "";
172 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
173 current->pid, current->comm, print_tainted(),
174 init_utsname()->release,
175 (int)strcspn(init_utsname()->version, " "),
176 init_utsname()->version, board);
177 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
178 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
179 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
180 regs->sp, regs->flags); 167 regs->sp, regs->flags);
181 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
182 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
183 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
184 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
185 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
186 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
187 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
188 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
189 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
190 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
191 178
192 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -207,27 +194,26 @@ void __show_regs(struct pt_regs *regs, int all)
207 cr3 = read_cr3(); 194 cr3 = read_cr3();
208 cr4 = read_cr4(); 195 cr4 = read_cr4();
209 196
210 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
211 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
212 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
213 es, cr0); 200 es, cr0);
214 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
215 cr4); 202 cr4);
216 203
217 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
218 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
219 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
220 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
221 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
222 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
223 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
224 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
225} 212}
226 213
227void show_regs(struct pt_regs *regs) 214void show_regs(struct pt_regs *regs)
228{ 215{
229 printk(KERN_INFO "CPU %d:", smp_processor_id()); 216 show_registers(regs);
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 217 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 218}
233 219
@@ -285,8 +271,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
285 *childregs = *regs; 271 *childregs = *regs;
286 272
287 childregs->ax = 0; 273 childregs->ax = 0;
288 childregs->sp = sp; 274 if (user_mode(regs))
289 if (sp == ~0UL) 275 childregs->sp = sp;
276 else
290 childregs->sp = (unsigned long)childregs; 277 childregs->sp = (unsigned long)childregs;
291 278
292 p->thread.sp = (unsigned long) childregs; 279 p->thread.sp = (unsigned long) childregs;
@@ -297,12 +284,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
297 284
298 p->thread.fs = me->thread.fs; 285 p->thread.fs = me->thread.fs;
299 p->thread.gs = me->thread.gs; 286 p->thread.gs = me->thread.gs;
287 p->thread.io_bitmap_ptr = NULL;
300 288
301 savesegment(gs, p->thread.gsindex); 289 savesegment(gs, p->thread.gsindex);
302 savesegment(fs, p->thread.fsindex); 290 savesegment(fs, p->thread.fsindex);
303 savesegment(es, p->thread.es); 291 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 292 savesegment(ds, p->thread.ds);
305 293
294 err = -ENOMEM;
295 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
296
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 297 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 298 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 299 if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +332,46 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 332 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 333 p->thread.io_bitmap_max = 0;
343 } 334 }
335
344 return err; 336 return err;
345} 337}
346 338
347void 339static void
348start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 340start_thread_common(struct pt_regs *regs, unsigned long new_ip,
341 unsigned long new_sp,
342 unsigned int _cs, unsigned int _ss, unsigned int _ds)
349{ 343{
350 loadsegment(fs, 0); 344 loadsegment(fs, 0);
351 loadsegment(es, 0); 345 loadsegment(es, _ds);
352 loadsegment(ds, 0); 346 loadsegment(ds, _ds);
353 load_gs_index(0); 347 load_gs_index(0);
354 regs->ip = new_ip; 348 regs->ip = new_ip;
355 regs->sp = new_sp; 349 regs->sp = new_sp;
356 percpu_write(old_rsp, new_sp); 350 percpu_write(old_rsp, new_sp);
357 regs->cs = __USER_CS; 351 regs->cs = _cs;
358 regs->ss = __USER_DS; 352 regs->ss = _ss;
359 regs->flags = 0x200; 353 regs->flags = X86_EFLAGS_IF;
360 set_fs(USER_DS); 354 set_fs(USER_DS);
361 /* 355 /*
362 * Free the old FP and other extended state 356 * Free the old FP and other extended state
363 */ 357 */
364 free_thread_xstate(current); 358 free_thread_xstate(current);
365} 359}
366EXPORT_SYMBOL_GPL(start_thread); 360
361void
362start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
363{
364 start_thread_common(regs, new_ip, new_sp,
365 __USER_CS, __USER_DS, 0);
366}
367
368#ifdef CONFIG_IA32_EMULATION
369void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
370{
371 start_thread_common(regs, new_ip, new_sp,
372 __USER32_CS, __USER32_DS, __USER32_DS);
373}
374#endif
367 375
368/* 376/*
369 * switch_to(x,y) should switch tasks from x to y. 377 * switch_to(x,y) should switch tasks from x to y.
@@ -495,26 +503,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 503 */
496 if (preload_fpu) 504 if (preload_fpu)
497 __math_state_restore(); 505 __math_state_restore();
498 return prev_p;
499}
500 506
501/* 507 return prev_p;
502 * sys_execve() executes a new program.
503 */
504asmlinkage
505long sys_execve(char __user *name, char __user * __user *argv,
506 char __user * __user *envp, struct pt_regs *regs)
507{
508 long error;
509 char *filename;
510
511 filename = getname(name);
512 error = PTR_ERR(filename);
513 if (IS_ERR(filename))
514 return error;
515 error = do_execve(filename, argv, envp, regs);
516 putname(filename);
517 return error;
518} 508}
519 509
520void set_personality_64bit(void) 510void set_personality_64bit(void)
@@ -531,13 +521,15 @@ void set_personality_64bit(void)
531 current->personality &= ~READ_IMPLIES_EXEC; 521 current->personality &= ~READ_IMPLIES_EXEC;
532} 522}
533 523
534asmlinkage long 524void set_personality_ia32(void)
535sys_clone(unsigned long clone_flags, unsigned long newsp,
536 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
537{ 525{
538 if (!newsp) 526 /* inherit personality from parent */
539 newsp = regs->sp; 527
540 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 528 /* Make sure to be in 32bit mode */
529 set_thread_flag(TIF_IA32);
530
531 /* Prepare the first "return" to user space */
532 current_thread_info()->status |= TS_COMPAT;
541} 533}
542 534
543unsigned long get_wchan(struct task_struct *p) 535unsigned long get_wchan(struct task_struct *p)
@@ -664,3 +656,8 @@ long sys_arch_prctl(int code, unsigned long addr)
664 return do_arch_prctl(current, code, addr); 656 return do_arch_prctl(current, code, addr);
665} 657}
666 658
659unsigned long KSTK_ESP(struct task_struct *task)
660{
661 return (test_tsk_thread_flag(task, TIF_IA32)) ?
662 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
663}
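The new out-of-line KSTK_ESP() returns the saved user stack pointer, distinguishing compat (TIF_IA32) tasks from native 64-bit ones; its value is what /proc/<pid>/stat exposes as kstkesp (field 29 in the usual layout). A small userspace reader; note that many configurations report 0 here for a running task:

	#include <stdio.h>

	int main(void)
	{
		unsigned long kstkesp = 0;
		FILE *f = fopen("/proc/self/stat", "r");
		int i;

		if (!f)
			return 1;
		/* skip "pid (comm) state", then fields 4..28 */
		fscanf(f, "%*d (%*[^)]) %*c");
		for (i = 4; i <= 28; i++)
			fscanf(f, "%*s");
		fscanf(f, "%lu", &kstkesp);	/* field 29: kstkesp */
		printf("kstkesp = %#lx\n", kstkesp);
		fclose(f);
		return 0;
	}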
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..017d937639fe 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/perf_event.h>
26#include <linux/hw_breakpoint.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
34#include <asm/prctl.h> 36#include <asm/prctl.h>
35#include <asm/proto.h> 37#include <asm/proto.h>
36#include <asm/ds.h> 38#include <asm/ds.h>
39#include <asm/hw_breakpoint.h>
37 40
38#include "tls.h" 41#include "tls.h"
39 42
@@ -49,6 +52,118 @@ enum x86_regset {
49 REGSET_IOPERM32, 52 REGSET_IOPERM32,
50}; 53};
51 54
55struct pt_regs_offset {
56 const char *name;
57 int offset;
58};
59
60#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
61#define REG_OFFSET_END {.name = NULL, .offset = 0}
62
63static const struct pt_regs_offset regoffset_table[] = {
64#ifdef CONFIG_X86_64
65 REG_OFFSET_NAME(r15),
66 REG_OFFSET_NAME(r14),
67 REG_OFFSET_NAME(r13),
68 REG_OFFSET_NAME(r12),
69 REG_OFFSET_NAME(r11),
70 REG_OFFSET_NAME(r10),
71 REG_OFFSET_NAME(r9),
72 REG_OFFSET_NAME(r8),
73#endif
74 REG_OFFSET_NAME(bx),
75 REG_OFFSET_NAME(cx),
76 REG_OFFSET_NAME(dx),
77 REG_OFFSET_NAME(si),
78 REG_OFFSET_NAME(di),
79 REG_OFFSET_NAME(bp),
80 REG_OFFSET_NAME(ax),
81#ifdef CONFIG_X86_32
82 REG_OFFSET_NAME(ds),
83 REG_OFFSET_NAME(es),
84 REG_OFFSET_NAME(fs),
85 REG_OFFSET_NAME(gs),
86#endif
87 REG_OFFSET_NAME(orig_ax),
88 REG_OFFSET_NAME(ip),
89 REG_OFFSET_NAME(cs),
90 REG_OFFSET_NAME(flags),
91 REG_OFFSET_NAME(sp),
92 REG_OFFSET_NAME(ss),
93 REG_OFFSET_END,
94};
95
96/**
97 * regs_query_register_offset() - query register offset from its name
98 * @name: the name of a register
99 *
100 * regs_query_register_offset() returns the offset of a register in struct
101 * pt_regs from its name. If the name is invalid, this returns -EINVAL.
102 */
103int regs_query_register_offset(const char *name)
104{
105 const struct pt_regs_offset *roff;
106 for (roff = regoffset_table; roff->name != NULL; roff++)
107 if (!strcmp(roff->name, name))
108 return roff->offset;
109 return -EINVAL;
110}
111
112/**
113 * regs_query_register_name() - query register name from its offset
114 * @offset: the offset of a register in struct pt_regs.
115 *
116 * regs_query_register_name() returns the name of a register from its
117 * offset in struct pt_regs. If the @offset is invalid, this returns NULL.
118 */
119const char *regs_query_register_name(unsigned int offset)
120{
121 const struct pt_regs_offset *roff;
122 for (roff = regoffset_table; roff->name != NULL; roff++)
123 if (roff->offset == offset)
124 return roff->name;
125 return NULL;
126}
127
128static const int arg_offs_table[] = {
129#ifdef CONFIG_X86_32
130 [0] = offsetof(struct pt_regs, ax),
131 [1] = offsetof(struct pt_regs, dx),
132 [2] = offsetof(struct pt_regs, cx)
133#else /* CONFIG_X86_64 */
134 [0] = offsetof(struct pt_regs, di),
135 [1] = offsetof(struct pt_regs, si),
136 [2] = offsetof(struct pt_regs, dx),
137 [3] = offsetof(struct pt_regs, cx),
138 [4] = offsetof(struct pt_regs, r8),
139 [5] = offsetof(struct pt_regs, r9)
140#endif
141};
142
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns the @n-th argument of a function call.
149 * Since the kernel stack usually changes right after function entry, this
150 * must be used at function entry. If the @n-th argument is NOT on the
151 * kernel stack or in pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
52/* 167/*
53 * does not yet catch signals sent when the child dies. 168 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 169 * in exit.c or in signal.c.
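The register offset/name queries added above are a plain table walk over offsetof() entries built by the stringizing REG_OFFSET_NAME macro. The same pattern as a self-contained userspace program, with a cut-down register struct standing in for pt_regs:

	#include <stddef.h>
	#include <stdio.h>
	#include <string.h>

	struct pt_regs_demo { unsigned long bx, cx, dx, si, di; };

	struct reg_offset { const char *name; int offset; };

	#define REG_OFFSET_NAME(r) { .name = #r, .offset = offsetof(struct pt_regs_demo, r) }
	#define REG_OFFSET_END     { .name = NULL, .offset = 0 }

	static const struct reg_offset table[] = {
		REG_OFFSET_NAME(bx), REG_OFFSET_NAME(cx), REG_OFFSET_NAME(dx),
		REG_OFFSET_NAME(si), REG_OFFSET_NAME(di), REG_OFFSET_END,
	};

	static int query_register_offset(const char *name)
	{
		const struct reg_offset *roff;

		for (roff = table; roff->name != NULL; roff++)
			if (!strcmp(roff->name, name))
				return roff->offset;
		return -1;		/* the kernel version returns -EINVAL */
	}

	int main(void)
	{
		printf("dx is at offset %d\n", query_register_offset("dx"));
		return 0;
	}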
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 252 return 0;
138} 253}
139 254
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 255#else /* CONFIG_X86_64 */
146 256
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 257#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 376 return 0;
267} 377}
268 378
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 379#endif /* CONFIG_X86_32 */
279 380
280static unsigned long get_flags(struct task_struct *task) 381static unsigned long get_flags(struct task_struct *task)
@@ -408,14 +509,14 @@ static int genregs_get(struct task_struct *target,
408{ 509{
409 if (kbuf) { 510 if (kbuf) {
410 unsigned long *k = kbuf; 511 unsigned long *k = kbuf;
411 while (count > 0) { 512 while (count >= sizeof(*k)) {
412 *k++ = getreg(target, pos); 513 *k++ = getreg(target, pos);
413 count -= sizeof(*k); 514 count -= sizeof(*k);
414 pos += sizeof(*k); 515 pos += sizeof(*k);
415 } 516 }
416 } else { 517 } else {
417 unsigned long __user *u = ubuf; 518 unsigned long __user *u = ubuf;
418 while (count > 0) { 519 while (count >= sizeof(*u)) {
419 if (__put_user(getreg(target, pos), u++)) 520 if (__put_user(getreg(target, pos), u++))
420 return -EFAULT; 521 return -EFAULT;
421 count -= sizeof(*u); 522 count -= sizeof(*u);
@@ -434,14 +535,14 @@ static int genregs_set(struct task_struct *target,
434 int ret = 0; 535 int ret = 0;
435 if (kbuf) { 536 if (kbuf) {
436 const unsigned long *k = kbuf; 537 const unsigned long *k = kbuf;
437 while (count > 0 && !ret) { 538 while (count >= sizeof(*k) && !ret) {
438 ret = putreg(target, pos, *k++); 539 ret = putreg(target, pos, *k++);
439 count -= sizeof(*k); 540 count -= sizeof(*k);
440 pos += sizeof(*k); 541 pos += sizeof(*k);
441 } 542 }
442 } else { 543 } else {
443 const unsigned long __user *u = ubuf; 544 const unsigned long __user *u = ubuf;
444 while (count > 0 && !ret) { 545 while (count >= sizeof(*u) && !ret) {
445 unsigned long word; 546 unsigned long word;
446 ret = __get_user(word, u++); 547 ret = __get_user(word, u++);
447 if (ret) 548 if (ret)
@@ -454,99 +555,237 @@ static int genregs_set(struct task_struct *target,
454 return ret; 555 return ret;
455} 556}
456 557
558static void ptrace_triggered(struct perf_event *bp, int nmi,
559 struct perf_sample_data *data,
560 struct pt_regs *regs)
561{
562 int i;
563 struct thread_struct *thread = &(current->thread);
564
565 /*
566 * Store in the virtual DR6 register the fact that the breakpoint
567 * was hit so the thread's debugger will see it.
568 */
569 for (i = 0; i < HBP_NUM; i++) {
570 if (thread->ptrace_bps[i] == bp)
571 break;
572 }
573
574 thread->debugreg6 |= (DR_TRAP0 << i);
575}
576
457/* 577/*
458 * This function is trivial and will be inlined by the compiler. 578 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 579 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 580 *
461 */ 581 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 582static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 583{
464 switch (n) { 584 int i;
465 case 0: return child->thread.debugreg0; 585 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 586 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 587
468 case 3: return child->thread.debugreg3; 588 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 589 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 590 info = counter_arch_bp(bp[i]);
591 dr7 |= encode_dr7(i, info->len, info->type);
592 }
471 } 593 }
472 return 0; 594
595 return dr7;
473} 596}
474 597
475static int ptrace_set_debugreg(struct task_struct *child, 598static int
476 int n, unsigned long data) 599ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
600 struct task_struct *tsk, int disabled)
477{ 601{
478 int i; 602 int err;
603 int gen_len, gen_type;
604 struct perf_event_attr attr;
479 605
480 if (unlikely(n == 4 || n == 5)) 606 /*
481 return -EIO; 607 * We should have at least an inactive breakpoint at this
608 * slot. It means the user is writing dr7 without having
609 * written the address register first.
610 */
611 if (!bp)
612 return -EINVAL;
482 613
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 614 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 615 if (err)
616 return err;
485 617
486 switch (n) { 618 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 619 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 620 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 621 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 622
492 case 6: 623 return modify_user_hw_breakpoint(bp, &attr);
493 if ((data & ~0xffffffffUL) != 0) 624}
494 return -EIO;
495 child->thread.debugreg6 = data;
496 break;
497 625
498 case 7: 626/*
627 * Handle ptrace writes to debug register 7.
628 */
629static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
630{
631 struct thread_struct *thread = &(tsk->thread);
632 unsigned long old_dr7;
633 int i, orig_ret = 0, rc = 0;
634 int enabled, second_pass = 0;
635 unsigned len, type;
636 struct perf_event *bp;
637
638 data &= ~DR_CONTROL_RESERVED;
639 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
640restore:
641 /*
642 * Loop through all the hardware breakpoints, making the
643 * appropriate changes to each.
644 */
645 for (i = 0; i < HBP_NUM; i++) {
646 enabled = decode_dr7(data, i, &len, &type);
647 bp = thread->ptrace_bps[i];
648
649 if (!enabled) {
650 if (bp) {
651 /*
652 * Don't unregister the breakpoints right away,
653 * unless all register_user_hw_breakpoint()
654 * requests have succeeded. This prevents
655 * any window of opportunity for debug
656 * register grabbing by other users.
657 */
658 if (!second_pass)
659 continue;
660
661 rc = ptrace_modify_breakpoint(bp, len, type,
662 tsk, 1);
663 if (rc)
664 break;
665 }
666 continue;
667 }
668
669 rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
670 if (rc)
671 break;
672 }
673 /*
674 * Make a second pass to free the remaining unused breakpoints
675 * or to restore the original breakpoints if an error occurred.
676 */
677 if (!second_pass) {
678 second_pass = 1;
679 if (rc < 0) {
680 orig_ret = rc;
681 data = old_dr7;
682 }
683 goto restore;
684 }
685 return ((orig_ret < 0) ? orig_ret : rc);
686}
687
688/*
689 * Handle PTRACE_PEEKUSR calls for the debug register area.
690 */
691static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
692{
693 struct thread_struct *thread = &(tsk->thread);
694 unsigned long val = 0;
695
696 if (n < HBP_NUM) {
697 struct perf_event *bp;
698 bp = thread->ptrace_bps[n];
699 if (!bp)
700 return 0;
701 val = bp->hw.info.address;
702 } else if (n == 6) {
703 val = thread->debugreg6;
704 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps);
706 }
707 return val;
708}
709
710static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
711 unsigned long addr)
712{
713 struct perf_event *bp;
714 struct thread_struct *t = &tsk->thread;
715 struct perf_event_attr attr;
716
717 if (!t->ptrace_bps[nr]) {
718 hw_breakpoint_init(&attr);
499 /* 719 /*
500 * Sanity-check data. Take one half-byte at once with 720 * Put a stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 721 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 722 */
529#ifdef CONFIG_X86_32 723 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 724 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 725 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 726 attr.disabled = 1;
533#endif 727
534 data &= ~DR_CONTROL_RESERVED; 728 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 729
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 730 /*
537 return -EIO; 731 * CHECKME: the previous code returned -EIO if the addr wasn't
538 child->thread.debugreg7 = data; 732 * a valid task virtual addr. The new one will return -EINVAL in
539 if (data) 733 * this case.
540 set_tsk_thread_flag(child, TIF_DEBUG); 734 * -EINVAL may be what we want for in-kernel breakpoint users,
541 else 735 * but -EIO looks better for ptrace, since we refuse a register
542 clear_tsk_thread_flag(child, TIF_DEBUG); 736 * write for the user. And anyway this is the previous
543 break; 737 * behaviour.
738 */
739 if (IS_ERR(bp))
740 return PTR_ERR(bp);
741
742 t->ptrace_bps[nr] = bp;
743 } else {
744 int err;
745
746 bp = t->ptrace_bps[nr];
747
748 attr = bp->attr;
749 attr.bp_addr = addr;
750 err = modify_user_hw_breakpoint(bp, &attr);
751 if (err)
752 return err;
544 } 753 }
545 754
755
546 return 0; 756 return 0;
547} 757}
548 758
549/* 759/*
760 * Handle PTRACE_POKEUSR calls for the debug register area.
761 */
762int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
763{
764 struct thread_struct *thread = &(tsk->thread);
765 int rc = 0;
766
767 /* There are no DR4 or DR5 registers */
768 if (n == 4 || n == 5)
769 return -EIO;
770
771 if (n == 6) {
772 thread->debugreg6 = val;
773 goto ret_path;
774 }
775 if (n < HBP_NUM) {
776 rc = ptrace_set_breakpoint_addr(tsk, n, val);
777 if (rc)
778 return rc;
779 }
780 /* All that's left is DR7 */
781 if (n == 7)
782 rc = ptrace_write_dr7(tsk, val);
783
784ret_path:
785 return rc;
786}
787
788/*
550 * These access the current or another (stopped) task's io permission 789 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 790 * bitmap for debugging or core dump.
552 */ 791 */
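The rewrite above keeps the debug-register ptrace ABI: DR0..DR3 and DR7 are still poked through PTRACE_POKEUSER at the struct user u_debugreg offsets, with the kernel now materializing them as perf-event breakpoints (ptrace_set_breakpoint_addr(), ptrace_write_dr7()). A self-contained x86 demo that sets a 4-byte write watchpoint on a traced child; error checking is trimmed, and 0xd0001 encodes L0=1, R/W0=01 (write), LEN0=11 (4 bytes):

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>
	#include <signal.h>
	#include <unistd.h>
	#include <sys/ptrace.h>
	#include <sys/user.h>
	#include <sys/wait.h>

	static volatile int watched;		/* the child's write trips DR0 */

	int main(void)
	{
		pid_t pid = fork();

		if (pid == 0) {			/* child */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			raise(SIGSTOP);		/* wait for the watchpoint setup */
			watched = 42;		/* 4-byte write -> SIGTRAP */
			_exit(0);
		}

		waitpid(pid, NULL, 0);		/* child stopped itself */

		ptrace(PTRACE_POKEUSER, pid,
		       offsetof(struct user, u_debugreg[0]),
		       (void *)(uintptr_t)&watched);
		ptrace(PTRACE_POKEUSER, pid,
		       offsetof(struct user, u_debugreg[7]), (void *)0xd0001UL);

		ptrace(PTRACE_CONT, pid, NULL, NULL);
		waitpid(pid, NULL, 0);		/* stopped by the SIGTRAP */

		/* DR6 carries DR_TRAP0, just as ptrace_triggered() records it */
		printf("DR6 = %#lx\n",
		       ptrace(PTRACE_PEEKUSER, pid,
			      offsetof(struct user, u_debugreg[6]), NULL));

		ptrace(PTRACE_CONT, pid, NULL, NULL);
		waitpid(pid, NULL, 0);		/* child exits */
		return 0;
	}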
@@ -1219,14 +1458,14 @@ static int genregs32_get(struct task_struct *target,
1219{ 1458{
1220 if (kbuf) { 1459 if (kbuf) {
1221 compat_ulong_t *k = kbuf; 1460 compat_ulong_t *k = kbuf;
1222 while (count > 0) { 1461 while (count >= sizeof(*k)) {
1223 getreg32(target, pos, k++); 1462 getreg32(target, pos, k++);
1224 count -= sizeof(*k); 1463 count -= sizeof(*k);
1225 pos += sizeof(*k); 1464 pos += sizeof(*k);
1226 } 1465 }
1227 } else { 1466 } else {
1228 compat_ulong_t __user *u = ubuf; 1467 compat_ulong_t __user *u = ubuf;
1229 while (count > 0) { 1468 while (count >= sizeof(*u)) {
1230 compat_ulong_t word; 1469 compat_ulong_t word;
1231 getreg32(target, pos, &word); 1470 getreg32(target, pos, &word);
1232 if (__put_user(word, u++)) 1471 if (__put_user(word, u++))
@@ -1247,14 +1486,14 @@ static int genregs32_set(struct task_struct *target,
1247 int ret = 0; 1486 int ret = 0;
1248 if (kbuf) { 1487 if (kbuf) {
1249 const compat_ulong_t *k = kbuf; 1488 const compat_ulong_t *k = kbuf;
1250 while (count > 0 && !ret) { 1489 while (count >= sizeof(*k) && !ret) {
1251 ret = putreg32(target, pos, *k++); 1490 ret = putreg32(target, pos, *k++);
1252 count -= sizeof(*k); 1491 count -= sizeof(*k);
1253 pos += sizeof(*k); 1492 pos += sizeof(*k);
1254 } 1493 }
1255 } else { 1494 } else {
1256 const compat_ulong_t __user *u = ubuf; 1495 const compat_ulong_t __user *u = ubuf;
1257 while (count > 0 && !ret) { 1496 while (count >= sizeof(*u) && !ret) {
1258 compat_ulong_t word; 1497 compat_ulong_t word;
1259 ret = __get_user(word, u++); 1498 ret = __get_user(word, u++);
1260 if (ret) 1499 if (ret)
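
The guard change above matters when the byte count is not a multiple of the word size: with count > 0 the loop would still enter on a trailing partial word and copy a full one, while count >= sizeof(*k) stops after the last whole word. A stand-alone sketch of the pattern (helper name ours):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy only whole 32-bit words; ignore a trailing partial word. */
    static size_t copy_words(uint32_t *dst, const uint32_t *src, size_t count)
    {
        size_t copied = 0;

        while (count >= sizeof(*dst)) {
            *dst++ = *src++;
            count  -= sizeof(*dst);
            copied += sizeof(*dst);
        }
        return copied;
    }
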
@@ -1437,21 +1676,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1437#endif 1676#endif
1438} 1677}
1439 1678
1440void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, 1679static void fill_sigtrap_info(struct task_struct *tsk,
1441 int error_code, int si_code) 1680 struct pt_regs *regs,
1681 int error_code, int si_code,
1682 struct siginfo *info)
1442{ 1683{
1443 struct siginfo info;
1444
1445 tsk->thread.trap_no = 1; 1684 tsk->thread.trap_no = 1;
1446 tsk->thread.error_code = error_code; 1685 tsk->thread.error_code = error_code;
1447 1686
1448 memset(&info, 0, sizeof(info)); 1687 memset(info, 0, sizeof(*info));
1449 info.si_signo = SIGTRAP; 1688 info->si_signo = SIGTRAP;
1450 info.si_code = si_code; 1689 info->si_code = si_code;
1690 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
1691}
1451 1692
1452 /* User-mode ip? */ 1693void user_single_step_siginfo(struct task_struct *tsk,
1453 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1694 struct pt_regs *regs,
1695 struct siginfo *info)
1696{
1697 fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
1698}
1454 1699
1700void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1701 int error_code, int si_code)
1702{
1703 struct siginfo info;
1704
1705 fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
1455 /* Send us the fake SIGTRAP */ 1706 /* Send us the fake SIGTRAP */
1456 force_sig_info(SIGTRAP, &info, tsk); 1707 force_sig_info(SIGTRAP, &info, tsk);
1457} 1708}
@@ -1516,29 +1767,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1516 1767
1517asmregparm void syscall_trace_leave(struct pt_regs *regs) 1768asmregparm void syscall_trace_leave(struct pt_regs *regs)
1518{ 1769{
1770 bool step;
1771
1519 if (unlikely(current->audit_context)) 1772 if (unlikely(current->audit_context))
1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1773 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1521 1774
1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1775 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1523 trace_sys_exit(regs, regs->ax); 1776 trace_sys_exit(regs, regs->ax);
1524 1777
1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1526 tracehook_report_syscall_exit(regs, 0);
1527
1528 /* 1778 /*
1529 * If TIF_SYSCALL_EMU is set, we only get here because of 1779 * If TIF_SYSCALL_EMU is set, we only get here because of
1530 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1780 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1531 * We already reported this syscall instruction in 1781 * We already reported this syscall instruction in
1532 * syscall_trace_enter(), so don't do any more now. 1782 * syscall_trace_enter().
1533 */
1534 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1535 return;
1536
1537 /*
1538 * If we are single-stepping, synthesize a trap to follow the
1539 * system call instruction.
1540 */ 1783 */
1541 if (test_thread_flag(TIF_SINGLESTEP) && 1784 step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
1542 tracehook_consider_fatal_signal(current, SIGTRAP)) 1785 !test_thread_flag(TIF_SYSCALL_EMU);
1543 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1786 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1787 tracehook_report_syscall_exit(regs, step);
1544} 1788}
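
Compressed into plain booleans, the new exit path notifies the tracer once through tracehook_report_syscall_exit(), suppressing the step argument for PTRACE_SYSEMU since that case was already reported at entry. A stand-alone sketch (names ours):

    #include <stdbool.h>

    /* Decide whether to notify the tracer at syscall exit, and whether
     * the notification should look like a single-step trap. */
    static bool report_syscall_exit(bool singlestep, bool sysemu,
                                    bool systrace, bool *step)
    {
        *step = singlestep && !sysemu;  /* SYSEMU was reported at entry */
        return *step || systrace;
    }
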
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..12e9feaa2f7a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494
495/*
496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on
497 * floppy DMA. Disable HPET MSI on such platforms.
498 */
499static void force_disable_hpet_msi(struct pci_dev *unused)
500{
501 hpet_msi_disable = 1;
502}
503
504DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
505 force_disable_hpet_msi);
506
494#endif 507#endif
495 508
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 509#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
@@ -499,6 +512,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 512{
500 struct pci_dev *nb_ht; 513 struct pci_dev *nb_ht;
501 unsigned int devfn; 514 unsigned int devfn;
515 u32 node;
502 u32 val; 516 u32 val;
503 517
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 518 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +521,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 521 return;
508 522
509 pci_read_config_dword(nb_ht, 0x60, &val); 523 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 524 node = val & 7;
525 /*
526 * Some hardware may return an invalid node ID,
527 * so check it first:
528 */
529 if (node_online(node))
530 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 531 pci_dev_put(nb_ht);
512} 532}
513 533
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a1a3cdda06e1..704bddcdf64d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
23# include <linux/ctype.h> 23# include <linux/ctype.h>
24# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
25#else 25#else
26# include <asm/iommu.h> 26# include <asm/x86_init.h>
27#endif 27#endif
28 28
29/* 29/*
@@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
204 }, 204 },
205 }, 205 },
206 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
207 .callback = set_bios_reboot,
208 .ident = "Dell OptiPlex 760",
209 .matches = {
210 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
211 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
212 DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
213 },
214 },
206 { /* Handle problems with rebooting on Dell 2400's */ 215 { /* Handle problems with rebooting on Dell 2400's */
207 .callback = set_bios_reboot, 216 .callback = set_bios_reboot,
208 .ident = "Dell PowerEdge 2400", 217 .ident = "Dell PowerEdge 2400",
@@ -259,6 +268,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 268 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
260 }, 269 },
261 }, 270 },
271 { /* Handle problems with rebooting on ASUS P4S800 */
272 .callback = set_bios_reboot,
273 .ident = "ASUS P4S800",
274 .matches = {
275 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
276 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
277 },
278 },
262 { } 279 { }
263}; 280};
264 281
@@ -436,6 +453,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
436 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), 453 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
437 }, 454 },
438 }, 455 },
456 { /* Handle problems with rebooting on Apple Macmini3,1 */
457 .callback = set_pci_reboot,
458 .ident = "Apple Macmini3,1",
459 .matches = {
460 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
462 },
463 },
439 { } 464 { }
440}; 465};
441 466
@@ -614,7 +639,7 @@ void native_machine_shutdown(void)
614#endif 639#endif
615 640
616#ifdef CONFIG_X86_64 641#ifdef CONFIG_X86_64
617 pci_iommu_shutdown(); 642 x86_platform.iommu_shutdown();
618#endif 643#endif
619} 644}
620 645
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..fda313ebbb03 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -12,7 +12,7 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/geode.h> 15#include <linux/cs5535.h>
16 16
17static void cs5530a_warm_reset(struct pci_dev *dev) 17static void cs5530a_warm_reset(struct pci_dev *dev)
18{ 18{
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f9b1f4e5ab74..3499b4fabc94 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,6 +73,7 @@
73 73
74#include <asm/mtrr.h> 74#include <asm/mtrr.h>
75#include <asm/apic.h> 75#include <asm/apic.h>
76#include <asm/trampoline.h>
76#include <asm/e820.h> 77#include <asm/e820.h>
77#include <asm/mpspec.h> 78#include <asm/mpspec.h>
78#include <asm/setup.h> 79#include <asm/setup.h>
@@ -110,6 +111,7 @@
110#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
111#include <asm/numa_64.h> 112#include <asm/numa_64.h>
112#endif 113#endif
114#include <asm/mce.h>
113 115
114/* 116/*
115 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -248,7 +250,7 @@ EXPORT_SYMBOL(edd);
248 * from boot_params into a safe place. 250 * from boot_params into a safe place.
249 * 251 *
250 */ 252 */
251static inline void copy_edd(void) 253static inline void __init copy_edd(void)
252{ 254{
253 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 255 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
254 sizeof(edd.mbr_signature)); 256 sizeof(edd.mbr_signature));
@@ -257,7 +259,7 @@ static inline void copy_edd(void)
257 edd.edd_info_nr = boot_params.eddbuf_entries; 259 edd.edd_info_nr = boot_params.eddbuf_entries;
258} 260}
259#else 261#else
260static inline void copy_edd(void) 262static inline void __init copy_edd(void)
261{ 263{
262} 264}
263#endif 265#endif
@@ -634,18 +636,33 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
634 }, 636 },
635 }, 637 },
636 { 638 {
639 .callback = dmi_low_memory_corruption,
640 .ident = "Phoenix/MSC BIOS",
641 .matches = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
643 },
644 },
637 /* 645 /*
638 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 646 * AMI BIOS with low memory corruption was found on Intel DG45ID and
639 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 647 * DG45FC boards.
648 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
 640 * match only DMI_BOARD_NAME and see if there are more bad products 649 * match only DMI_BOARD_NAME and see if there are more bad products
641 * with this vendor. 650 * with this vendor.
642 */ 651 */
652 {
643 .callback = dmi_low_memory_corruption, 653 .callback = dmi_low_memory_corruption,
644 .ident = "AMI BIOS", 654 .ident = "AMI BIOS",
645 .matches = { 655 .matches = {
646 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), 656 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
647 }, 657 },
648 }, 658 },
659 {
660 .callback = dmi_low_memory_corruption,
661 .ident = "AMI BIOS",
662 .matches = {
663 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
664 },
665 },
649#endif 666#endif
650 {} 667 {}
651}; 668};
@@ -884,6 +901,13 @@ void __init setup_arch(char **cmdline_p)
884 901
885 reserve_brk(); 902 reserve_brk();
886 903
904 /*
905 * Find and reserve possible boot-time SMP configuration:
906 */
907 find_smp_config();
908
909 reserve_trampoline_memory();
910
887#ifdef CONFIG_ACPI_SLEEP 911#ifdef CONFIG_ACPI_SLEEP
888 /* 912 /*
889 * Reserve low memory region for sleep support. 913 * Reserve low memory region for sleep support.
@@ -930,11 +954,6 @@ void __init setup_arch(char **cmdline_p)
930 954
931 early_acpi_boot_init(); 955 early_acpi_boot_init();
932 956
933 /*
934 * Find and reserve possible boot-time SMP configuration:
935 */
936 find_smp_config();
937
938#ifdef CONFIG_ACPI_NUMA 957#ifdef CONFIG_ACPI_NUMA
939 /* 958 /*
940 * Parse SRAT to discover nodes. 959 * Parse SRAT to discover nodes.
@@ -1021,6 +1040,8 @@ void __init setup_arch(char **cmdline_p)
1021#endif 1040#endif
1022#endif 1041#endif
1023 x86_init.oem.banner(); 1042 x86_init.oem.banner();
1043
1044 mcheck_init();
1024} 1045}
1025 1046
1026#ifdef CONFIG_X86_32 1047#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..35abcb8b00e9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/module.h> 4#include <linux/module.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,9 +22,9 @@
20#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
21 23
22#ifdef CONFIG_DEBUG_PER_CPU_MAPS 24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
 23# define DBG(x...) printk(KERN_DEBUG x) 25# define DBG(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
24#else 26#else
 25# define DBG(x...) 27# define DBG(fmt, ...) do { if (0) pr_debug(fmt, ##__VA_ARGS__); } while (0)
26#endif 28#endif
27 29
28DEFINE_PER_CPU(int, cpu_number); 30DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
116 } else { 118 } else {
117 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 119 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
118 size, align, goal); 120 size, align, goal);
119 pr_debug("per cpu data for cpu%d %lu bytes on node%d at " 121 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
120 "%016lx\n", cpu, size, node, __pa(ptr)); 122 cpu, size, node, __pa(ptr));
121 } 123 }
122 return ptr; 124 return ptr;
123#else 125#else
@@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void)
198 pcpu_cpu_distance, 200 pcpu_cpu_distance,
199 pcpu_fc_alloc, pcpu_fc_free); 201 pcpu_fc_alloc, pcpu_fc_free);
200 if (rc < 0) 202 if (rc < 0)
201 pr_warning("PERCPU: %s allocator failed (%d), " 203 pr_warning("%s allocator failed (%d), falling back to page size\n",
202 "falling back to page size\n",
203 pcpu_fc_names[pcpu_chosen_fc], rc); 204 pcpu_fc_names[pcpu_chosen_fc], rc);
204 } 205 }
205 if (rc < 0) 206 if (rc < 0)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..4fd173cd8e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -544,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
544} 545}
545#endif /* CONFIG_X86_32 */ 546#endif /* CONFIG_X86_32 */
546 547
547#ifdef CONFIG_X86_32 548long
548int sys_sigaltstack(struct pt_regs *regs)
549{
550 const stack_t __user *uss = (const stack_t __user *)regs->bx;
551 stack_t __user *uoss = (stack_t __user *)regs->cx;
552
553 return do_sigaltstack(uss, uoss, regs->sp);
554}
555#else /* !CONFIG_X86_32 */
556asmlinkage long
557sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 549sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
558 struct pt_regs *regs) 550 struct pt_regs *regs)
559{ 551{
560 return do_sigaltstack(uss, uoss, regs->sp); 552 return do_sigaltstack(uss, uoss, regs->sp);
561} 553}
562#endif /* CONFIG_X86_32 */
563 554
564/* 555/*
565 * Do a signal return; undo the signal stack. 556 * Do a signal return; undo the signal stack.
@@ -799,15 +790,6 @@ static void do_signal(struct pt_regs *regs)
799 790
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 791 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 792 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 793 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 794 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 795 /*
@@ -872,6 +854,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
872 if (current->replacement_session_keyring) 854 if (current->replacement_session_keyring)
873 key_replace_session_keyring(); 855 key_replace_session_keyring();
874 } 856 }
857 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
858 fire_user_return_notifiers();
875 859
876#ifdef CONFIG_X86_32 860#ifdef CONFIG_X86_32
877 clear_thread_flag(TIF_IRET); 861 clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d915d956e66d..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -198,7 +198,6 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
198{ 198{
199 ack_APIC_irq(); 199 ack_APIC_irq();
200 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 run_local_timers();
202 /* 201 /*
203 * KVM uses this interrupt to force a cpu out of guest mode 202 * KVM uses this interrupt to force a cpu out of guest mode
204 */ 203 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..678d0b8c26f3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -671,6 +671,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
671 complete(&c_idle->done); 671 complete(&c_idle->done);
672} 672}
673 673
674/* reduce the number of lines printed when booting a large cpu count system */
675static void __cpuinit announce_cpu(int cpu, int apicid)
676{
677 static int current_node = -1;
678 int node = cpu_to_node(cpu);
679
680 if (system_state == SYSTEM_BOOTING) {
681 if (node != current_node) {
682 if (current_node > (-1))
683 pr_cont(" Ok.\n");
684 current_node = node;
685 pr_info("Booting Node %3d, Processors ", node);
686 }
687 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
688 return;
689 } else
690 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
691 node, cpu, apicid);
692}
693
674/* 694/*
675 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 695 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
676 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 696 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -687,7 +707,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 707 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
688 }; 708 };
689 709
690 INIT_WORK(&c_idle.work, do_fork_idle); 710 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
691 711
692 alternatives_smp_switch(1); 712 alternatives_smp_switch(1);
693 713
@@ -713,6 +733,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
713 733
714 if (IS_ERR(c_idle.idle)) { 734 if (IS_ERR(c_idle.idle)) {
715 printk("failed fork for CPU %d\n", cpu); 735 printk("failed fork for CPU %d\n", cpu);
736 destroy_work_on_stack(&c_idle.work);
716 return PTR_ERR(c_idle.idle); 737 return PTR_ERR(c_idle.idle);
717 } 738 }
718 739
@@ -736,9 +757,8 @@ do_rest:
736 /* start_ip had better be page-aligned! */ 757 /* start_ip had better be page-aligned! */
737 start_ip = setup_trampoline(); 758 start_ip = setup_trampoline();
738 759
739 /* So we see what's up */ 760 /* So we see what's up */
740 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", 761 announce_cpu(cpu, apicid);
741 cpu, apicid, start_ip);
742 762
743 /* 763 /*
744 * This grunge runs the startup process for 764 * This grunge runs the startup process for
@@ -787,21 +807,17 @@ do_rest:
787 udelay(100); 807 udelay(100);
788 } 808 }
789 809
790 if (cpumask_test_cpu(cpu, cpu_callin_mask)) { 810 if (cpumask_test_cpu(cpu, cpu_callin_mask))
791 /* number CPUs logically, starting from 1 (BSP is 0) */ 811 pr_debug("CPU%d: has booted.\n", cpu);
792 pr_debug("OK.\n"); 812 else {
793 printk(KERN_INFO "CPU%d: ", cpu);
794 print_cpu_info(&cpu_data(cpu));
795 pr_debug("CPU has booted.\n");
796 } else {
797 boot_error = 1; 813 boot_error = 1;
798 if (*((volatile unsigned char *)trampoline_base) 814 if (*((volatile unsigned char *)trampoline_base)
799 == 0xA5) 815 == 0xA5)
800 /* trampoline started but...? */ 816 /* trampoline started but...? */
801 printk(KERN_ERR "Stuck ??\n"); 817 pr_err("CPU%d: Stuck ??\n", cpu);
802 else 818 else
803 /* trampoline code not run */ 819 /* trampoline code not run */
804 printk(KERN_ERR "Not responding.\n"); 820 pr_err("CPU%d: Not responding.\n", cpu);
805 if (apic->inquire_remote_apic) 821 if (apic->inquire_remote_apic)
806 apic->inquire_remote_apic(apicid); 822 apic->inquire_remote_apic(apicid);
807 } 823 }
@@ -831,6 +847,7 @@ do_rest:
831 smpboot_restore_warm_reset_vector(); 847 smpboot_restore_warm_reset_vector();
832 } 848 }
833 849
850 destroy_work_on_stack(&c_idle.work);
834 return boot_error; 851 return boot_error;
835} 852}
836 853
@@ -1250,16 +1267,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1267void cpu_disable_common(void)
1251{ 1268{
1252 int cpu = smp_processor_id(); 1269 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1270
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1271 remove_siblinginfo(cpu);
1264 1272
1265 /* It's now safe to remove this processor from the online map */ 1273 /* It's now safe to remove this processor from the online map */
@@ -1300,14 +1308,16 @@ void native_cpu_die(unsigned int cpu)
1300 for (i = 0; i < 10; i++) { 1308 for (i = 0; i < 10; i++) {
1301 /* They ack this in play_dead by setting CPU_DEAD */ 1309 /* They ack this in play_dead by setting CPU_DEAD */
1302 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1310 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1303 printk(KERN_INFO "CPU %d is now offline\n", cpu); 1311 if (system_state == SYSTEM_RUNNING)
1312 pr_info("CPU %u is now offline\n", cpu);
1313
1304 if (1 == num_online_cpus()) 1314 if (1 == num_online_cpus())
1305 alternatives_smp_switch(0); 1315 alternatives_smp_switch(0);
1306 return; 1316 return;
1307 } 1317 }
1308 msleep(100); 1318 msleep(100);
1309 } 1319 }
1310 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1320 pr_err("CPU %u didn't die...\n", cpu);
1311} 1321}
1312 1322
1313void play_dead_common(void) 1323void play_dead_common(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..922eefbb3f6c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning, 56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol, 57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 58 .stack = save_stack_stack,
59 .address = save_stack_address, 59 .address = save_stack_address,
60 .walk_stack = print_context_stack,
60}; 61};
61 62
62static const struct stacktrace_ops save_stack_ops_nosched = { 63static const struct stacktrace_ops save_stack_ops_nosched = {
63 .warning = save_stack_warning, 64 .warning = save_stack_warning,
64 .warning_symbol = save_stack_warning_symbol, 65 .warning_symbol = save_stack_warning_symbol,
65 .stack = save_stack_stack, 66 .stack = save_stack_stack,
66 .address = save_stack_address_nosched, 67 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack,
67}; 69};
68 70
69/* 71/*
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..dee1ff7cba58 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,31 +24,6 @@
24 24
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
28 unsigned long prot, unsigned long flags,
29 unsigned long fd, unsigned long pgoff)
30{
31 int error = -EBADF;
32 struct file *file = NULL;
33 struct mm_struct *mm = current->mm;
34
35 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
36 if (!(flags & MAP_ANONYMOUS)) {
37 file = fget(fd);
38 if (!file)
39 goto out;
40 }
41
42 down_write(&mm->mmap_sem);
43 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
44 up_write(&mm->mmap_sem);
45
46 if (file)
47 fput(file);
48out:
49 return error;
50}
51
52/* 27/*
 53 * Perform the select(nd, in, out, ex, tv) and mmap() system 28 * Perform the select(nd, in, out, ex, tv) and mmap() system
 54 * calls. Linux/i386 didn't use to be able to handle more than 29 * calls. Linux/i386 didn't use to be able to handle more than
@@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
77 if (a.offset & ~PAGE_MASK) 52 if (a.offset & ~PAGE_MASK)
78 goto out; 53 goto out;
79 54
80 err = sys_mmap2(a.addr, a.len, a.prot, a.flags, 55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
81 a.fd, a.offset >> PAGE_SHIFT); 56 a.fd, a.offset >> PAGE_SHIFT);
82out: 57out:
83 return err; 58 return err;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..8aa2057efd12 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
23 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file;
27
28 error = -EINVAL; 26 error = -EINVAL;
29 if (off & ~PAGE_MASK) 27 if (off & ~PAGE_MASK)
30 goto out; 28 goto out;
31 29
32 error = -EBADF; 30 error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
33 file = NULL;
34 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
35 if (!(flags & MAP_ANONYMOUS)) {
36 file = fget(fd);
37 if (!file)
38 goto out;
39 }
40 down_write(&current->mm->mmap_sem);
41 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
42 up_write(&current->mm->mmap_sem);
43
44 if (file)
45 fput(file);
46out: 31out:
47 return error; 32 return error;
48} 33}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..15228b5d3eb7 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap_pgoff
195 .long sys_truncate64 195 .long sys_truncate64
196 .long sys_ftruncate64 196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */ 197 .long sys_stat64 /* 195 */
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dcb00d278512..be2573448ed9 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs)
38#ifdef CONFIG_FRAME_POINTER 38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long)); 39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else 40#else
41 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp =
42 (unsigned long *)kernel_stack_pointer(regs);
42 /* 43 /*
43 * Return address is either directly at stack pointer 44 * Return address is either directly at stack pointer
44 * or above a saved flags. Eflags has bits 22-31 zero, 45 * or above a saved flags. Eflags has bits 22-31 zero,
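
The heuristic in that comment can be shown in isolation: a saved EFLAGS word always has bits 22-31 clear, while kernel text addresses do not, so a single shift tells them apart. A hedged sketch (helper name ours, simplified from profile_pc):

    static unsigned long guess_pc(const unsigned long *sp)
    {
            if (sp[0] >> 22)        /* high bits set: a return address */
                    return sp[0];
            return sp[1];           /* sp[0] was EFLAGS; PC sits above it */
    }
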
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index af21e5556900..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -23,8 +23,6 @@
23static struct bau_control **uv_bau_table_bases __read_mostly; 23static struct bau_control **uv_bau_table_bases __read_mostly;
24static int uv_bau_retry_limit __read_mostly; 24static int uv_bau_retry_limit __read_mostly;
25 25
26/* position of pnode (which is nasid>>1): */
27static int uv_nshift __read_mostly;
28/* base pnode in this partition */ 26/* base pnode in this partition */
29static int uv_partition_base_pnode __read_mostly; 27static int uv_partition_base_pnode __read_mostly;
30 28
@@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode)
723 BUG_ON(!adp); 721 BUG_ON(!adp);
724 722
725 pa = uv_gpa(adp); /* need the real nasid*/ 723 pa = uv_gpa(adp); /* need the real nasid*/
726 n = pa >> uv_nshift; 724 n = uv_gpa_to_pnode(pa);
727 m = pa & uv_mmask; 725 m = pa & uv_mmask;
728 726
729 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 727 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
778 * need the pnode of where the memory was really allocated 776 * need the pnode of where the memory was really allocated
779 */ 777 */
780 pa = uv_gpa(pqp); 778 pa = uv_gpa(pqp);
781 pn = pa >> uv_nshift; 779 pn = uv_gpa_to_pnode(pa);
782 uv_write_global_mmr64(pnode, 780 uv_write_global_mmr64(pnode,
783 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 781 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
784 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 782 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
@@ -841,8 +839,7 @@ static int __init uv_bau_init(void)
841 GFP_KERNEL, cpu_to_node(cur_cpu)); 839 GFP_KERNEL, cpu_to_node(cur_cpu));
842 840
843 uv_bau_retry_limit = 1; 841 uv_bau_retry_limit = 1;
844 uv_nshift = uv_hub_info->n_val; 842 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
845 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
846 nblades = uv_num_possible_blades(); 843 nblades = uv_num_possible_blades();
847 844
848 uv_bau_table_bases = (struct bau_control **) 845 uv_bau_table_bases = (struct bau_control **)
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 699f7eeb896a..c652ef62742d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,22 +3,28 @@
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
7#define __trampinit
8#define __trampinitdata
9#else
10#define __trampinit __cpuinit
11#define __trampinitdata __cpuinitdata
12#endif
13
6/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
7unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base;
8 16
9void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
10{ 18{
11#ifdef CONFIG_X86_32 19 unsigned long mem;
12 /* 20
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */ 21 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, 22 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
21 "TRAMPOLINE"); 23 if (mem == -1L)
24 panic("Cannot allocate trampoline\n");
25
26 trampoline_base = __va(mem);
27 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
22} 28}
23 29
24/* 30/*
@@ -26,7 +32,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 32 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 33 * has made sure it's suitably aligned.
28 */ 34 */
29unsigned long __cpuinit setup_trampoline(void) 35unsigned long __trampinit setup_trampoline(void)
30{ 36{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 38 return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 596d54c660a5..3af2dff58b21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,8 +32,12 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP
36.section .rodata, "a", @progbits
37#else
35/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ 38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
36__CPUINITRODATA 39__CPUINITRODATA
40#endif
37.code16 41.code16
38 42
39ENTRY(trampoline_data) 43ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */ 537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 539 return;
540 540
541 /* DR6 may or may not be cleared by the CPU */
542 set_debugreg(0, 6);
541 /* 543 /*
542 * The processor cleared BTF, so don't mark that we need it set. 544 * The processor cleared BTF, so don't mark that we need it set.
543 */ 545 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 546 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 547 tsk->thread.debugctlmsr = 0;
546 548
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 549 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 550 tsk->thread.debugreg6 = dr6;
551
552 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
553 SIGTRAP) == NOTIFY_STOP)
549 return; 554 return;
550 555
551 /* It's safe to allow irq's after DR6 has been saved */ 556 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 557 preempt_conditional_sti(regs);
553 558
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 559 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 560 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 561 error_code, 1);
557 goto clear_dr7; 562 return;
558 } 563 }
559 564
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 565 /*
569 * Single-stepping through TF: make sure we ignore any events in 566 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 567 * kernel space, but re-enable TF when returning to user mode.
568 *
569 * We already checked v86 mode above, so we can check for kernel mode
570 * by just checking the CPL of CS.
571 */ 571 */
572 if (condition & DR_STEP) { 572 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 573 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 574 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
575 regs->flags &= ~X86_EFLAGS_TF;
575 } 576 }
576 577 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 578 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 579 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 580 preempt_conditional_cli(regs);
588 return;
589 581
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 582 return;
604} 583}
605 584
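
The si_code that send_sigtrap() forwards is derived from the virtualized DR6 value; a simplified sketch of the mapping get_si_code() performs, assuming the DR_* and TRAP_* constants from asm/debugreg.h and the signal headers:

    /* Map DR6 status bits to a SIGTRAP si_code (simplified sketch). */
    static int si_code_from_dr6(unsigned long dr6)
    {
            if (dr6 & DR_STEP)                      /* single-step trap */
                    return TRAP_TRACE;
            if (dr6 & (DR_TRAP0 | DR_TRAP1 | DR_TRAP2 | DR_TRAP3))
                    return TRAP_HWBKPT;             /* hw break/watchpoint */
            return TRAP_BRKPT;
    }
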
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cd982f48e23e..597683aa5ba0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason)
763{ 763{
764 if (!tsc_unstable) { 764 if (!tsc_unstable) {
765 tsc_unstable = 1; 765 tsc_unstable = 1;
766 sched_clock_stable = 0;
766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 767 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
767 /* Change only the rating, when not registered */ 768 /* Change only the rating, when not registered */
768 if (clocksource_tsc.mult) 769 if (clocksource_tsc.mult)
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..0aa5fed8b9e6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count;
33 * we want to have the fastest, inlined, non-debug version 33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
39static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
@@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void)
62 * previous TSC that was measured (possibly on 62 * previous TSC that was measured (possibly on
63 * another CPU) and update the previous TSC timestamp. 63 * another CPU) and update the previous TSC timestamp.
64 */ 64 */
65 __raw_spin_lock(&sync_lock); 65 arch_spin_lock(&sync_lock);
66 prev = last_tsc; 66 prev = last_tsc;
67 rdtsc_barrier(); 67 rdtsc_barrier();
68 now = get_cycles(); 68 now = get_cycles();
69 rdtsc_barrier(); 69 rdtsc_barrier();
70 last_tsc = now; 70 last_tsc = now;
71 __raw_spin_unlock(&sync_lock); 71 arch_spin_unlock(&sync_lock);
72 72
73 /* 73 /*
74 * Be nice every now and then (and also check whether 74 * Be nice every now and then (and also check whether
@@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void)
87 * we saw a time-warp of the TSC going backwards: 87 * we saw a time-warp of the TSC going backwards:
88 */ 88 */
89 if (unlikely(prev > now)) { 89 if (unlikely(prev > now)) {
90 __raw_spin_lock(&sync_lock); 90 arch_spin_lock(&sync_lock);
91 max_warp = max(max_warp, prev - now); 91 max_warp = max(max_warp, prev - now);
92 nr_warps++; 92 nr_warps++;
93 __raw_spin_unlock(&sync_lock); 93 arch_spin_unlock(&sync_lock);
94 } 94 }
95 } 95 }
96 WARN(!(now-start), 96 WARN(!(now-start),
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..ece73d8e3240 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
12#include <linux/irq.h> 13#include <linux/irq.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 16#include <asm/uv/uv_irq.h>
17#include <asm/uv/uv_hub.h>
18
19/* MMR offset and pnode of hub sourcing interrupts for a given irq */
20struct uv_irq_2_mmr_pnode{
21 struct rb_node list;
22 unsigned long offset;
23 int pnode;
24 int irq;
25};
26
27static spinlock_t uv_irq_lock;
28static struct rb_root uv_irq_root;
29
30static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 31
17static void uv_noop(unsigned int irq) 32static void uv_noop(unsigned int irq)
18{ 33{
@@ -39,25 +54,213 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 54 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 55 .eoi = uv_ack_apic,
41 .end = uv_noop, 56 .end = uv_noop,
57 .set_affinity = uv_set_irq_affinity,
42}; 58};
43 59
44/* 60/*
61 * Add offset and pnode information of the hub sourcing interrupts to the
62 * rb tree for a specific irq.
63 */
64static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
65{
66 struct rb_node **link = &uv_irq_root.rb_node;
67 struct rb_node *parent = NULL;
68 struct uv_irq_2_mmr_pnode *n;
69 struct uv_irq_2_mmr_pnode *e;
70 unsigned long irqflags;
71
72 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
73 uv_blade_to_memory_nid(blade));
74 if (!n)
75 return -ENOMEM;
76
77 n->irq = irq;
78 n->offset = offset;
79 n->pnode = uv_blade_to_pnode(blade);
80 spin_lock_irqsave(&uv_irq_lock, irqflags);
81 /* Find the right place in the rbtree: */
82 while (*link) {
83 parent = *link;
84 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
85
86 if (unlikely(irq == e->irq)) {
87 /* irq entry exists */
88 e->pnode = uv_blade_to_pnode(blade);
89 e->offset = offset;
90 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
91 kfree(n);
92 return 0;
93 }
94
95 if (irq < e->irq)
96 link = &(*link)->rb_left;
97 else
98 link = &(*link)->rb_right;
99 }
100
101 /* Insert the node into the rbtree. */
102 rb_link_node(&n->list, parent, link);
103 rb_insert_color(&n->list, &uv_irq_root);
104
105 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
106 return 0;
107}
108
109/* Retrieve offset and pnode information from the rb tree for a specific irq */
110int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
111{
112 struct uv_irq_2_mmr_pnode *e;
113 struct rb_node *n;
114 unsigned long irqflags;
115
116 spin_lock_irqsave(&uv_irq_lock, irqflags);
117 n = uv_irq_root.rb_node;
118 while (n) {
119 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
120
121 if (e->irq == irq) {
122 *offset = e->offset;
123 *pnode = e->pnode;
124 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
125 return 0;
126 }
127
128 if (irq < e->irq)
129 n = n->rb_left;
130 else
131 n = n->rb_right;
132 }
133 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
134 return -1;
135}
136
137/*
138 * Re-target the irq to the specified CPU and enable the specified MMR located
139 * on the specified blade to allow the sending of MSIs to the specified CPU.
140 */
141static int
142arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 unsigned long mmr_offset, int restrict)
144{
145 const struct cpumask *eligible_cpu = cpumask_of(cpu);
146 struct irq_desc *desc = irq_to_desc(irq);
147 struct irq_cfg *cfg;
148 int mmr_pnode;
149 unsigned long mmr_value;
150 struct uv_IO_APIC_route_entry *entry;
151 int err;
152
153 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
154 sizeof(unsigned long));
155
156 cfg = irq_cfg(irq);
157
158 err = assign_irq_vector(irq, cfg, eligible_cpu);
159 if (err != 0)
160 return err;
161
162 if (restrict == UV_AFFINITY_CPU)
163 desc->status |= IRQ_NO_BALANCING;
164 else
165 desc->status |= IRQ_MOVE_PCNTXT;
166
167 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
168 irq_name);
169
170 mmr_value = 0;
171 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
172 entry->vector = cfg->vector;
173 entry->delivery_mode = apic->irq_delivery_mode;
174 entry->dest_mode = apic->irq_dest_mode;
175 entry->polarity = 0;
176 entry->trigger = 0;
177 entry->mask = 0;
178 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
179
180 mmr_pnode = uv_blade_to_pnode(mmr_blade);
181 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
182
183 if (cfg->move_in_progress)
184 send_cleanup_vector(cfg);
185
186 return irq;
187}
188
189/*
190 * Disable the specified MMR located on the specified blade so that MSIs are
191 * longer allowed to be sent.
192 */
193static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
194{
195 unsigned long mmr_value;
196 struct uv_IO_APIC_route_entry *entry;
197
198 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
199 sizeof(unsigned long));
200
201 mmr_value = 0;
202 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
203 entry->mask = 1;
204
205 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
206}
207
208static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
209{
210 struct irq_desc *desc = irq_to_desc(irq);
211 struct irq_cfg *cfg = desc->chip_data;
212 unsigned int dest;
213 unsigned long mmr_value;
214 struct uv_IO_APIC_route_entry *entry;
215 unsigned long mmr_offset;
216 unsigned mmr_pnode;
217
218 if (set_desc_affinity(desc, mask, &dest))
219 return -1;
220
221 mmr_value = 0;
222 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
223
224 entry->vector = cfg->vector;
225 entry->delivery_mode = apic->irq_delivery_mode;
226 entry->dest_mode = apic->irq_dest_mode;
227 entry->polarity = 0;
228 entry->trigger = 0;
229 entry->mask = 0;
230 entry->dest = dest;
231
232 /* Get previously stored MMR and pnode of hub sourcing interrupts */
233 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
234 return -1;
235
236 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
237
238 if (cfg->move_in_progress)
239 send_cleanup_vector(cfg);
240
241 return 0;
242}
243
244/*
45 * Set up a mapping of an available irq and vector, and enable the specified 245 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 246 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 247 * interrupt is raised.
48 */ 248 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 249int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 250 unsigned long mmr_offset, int restrict)
51{ 251{
52 int irq; 252 int irq, ret;
53 int ret; 253
254 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 255
55 irq = create_irq();
56 if (irq <= 0) 256 if (irq <= 0)
57 return -EBUSY; 257 return -EBUSY;
58 258
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 259 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 260 restrict);
261 if (ret == irq)
262 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
263 else
61 destroy_irq(irq); 264 destroy_irq(irq);
62 265
63 return ret; 266 return ret;
@@ -71,9 +274,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 274 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 275 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 276 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 277void uv_teardown_irq(unsigned int irq)
75{ 278{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 279 struct uv_irq_2_mmr_pnode *e;
280 struct rb_node *n;
281 unsigned long irqflags;
282
283 spin_lock_irqsave(&uv_irq_lock, irqflags);
284 n = uv_irq_root.rb_node;
285 while (n) {
286 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
287 if (e->irq == irq) {
288 arch_disable_uv_irq(e->pnode, e->offset);
289 rb_erase(n, &uv_irq_root);
290 kfree(e);
291 break;
292 }
293 if (irq < e->irq)
294 n = n->rb_left;
295 else
296 n = n->rb_right;
297 }
298 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 299 destroy_irq(irq);
78} 300}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 301EXPORT_SYMBOL_GPL(uv_teardown_irq);
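
A hypothetical caller of the reworked pair, following the new signatures above (cpu, mmr_blade and mmr_offset are placeholders; error handling elided):

    int irq = uv_setup_irq("uv-sample", cpu, mmr_blade, mmr_offset,
                           UV_AFFINITY_CPU);

    if (irq > 0) {
            /* ... use the interrupt ... */
            uv_teardown_irq(irq);   /* pnode/offset come from the rb tree */
    }
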
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..2b75ef638dbc 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
74 */ 74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 75static struct uv_rtc_timer_head **blade_info __read_mostly;
76 76
77static int uv_rtc_enable; 77static int uv_rtc_evt_enable;
78 78
79/* 79/*
80 * Hardware interface routines 80 * Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 93 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 96}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 118 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 120
121 /* Set configuration */ 121 /* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 125
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 126 if (uv_read_rtc(NULL) <= expires)
127 return 0;
128
129 return !uv_intr_pending(pnode);
127} 130}
128 131
129/* 132/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 226
224 next_cpu = head->next_cpu; 227 next_cpu = head->next_cpu;
225 *t = expires; 228 *t = expires;
229
226 /* Will this one be next to go off? */ 230 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 231 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 232 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 235 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 236 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 237 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 238 return -ETIME;
235 } 239 }
236 } 240 }
237 241
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 248 *
245 * Returns 1 if this timer was pending. 249 * Returns 1 if this timer was pending.
246 */ 250 */
247static int uv_rtc_unset_timer(int cpu) 251static int uv_rtc_unset_timer(int cpu, int force)
248{ 252{
249 int pnode = uv_cpu_to_pnode(cpu); 253 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 254 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
256 260
257 spin_lock_irqsave(&head->lock, flags); 261 spin_lock_irqsave(&head->lock, flags);
258 262
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 263 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 264 rc = 1;
261 265
262 *t = ULLONG_MAX; 266 if (rc) {
263 267 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 268 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 269 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 270 uv_rtc_find_next_timer(head, pnode);
271 }
267 272
268 spin_unlock_irqrestore(&head->lock, flags); 273 spin_unlock_irqrestore(&head->lock, flags);
269 274
@@ -277,10 +282,21 @@ static int uv_rtc_unset_timer(int cpu)
277 282
278/* 283/*
279 * Read the RTC. 284 * Read the RTC.
285 *
286 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
 287 * cachelines of its own page. This allows faster simultaneous reads
288 * from a given socket.
280 */ 289 */
281static cycle_t uv_read_rtc(struct clocksource *cs) 290static cycle_t uv_read_rtc(struct clocksource *cs)
282{ 291{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC); 292 unsigned long offset;
293
294 if (uv_get_min_hub_revision_id() == 1)
295 offset = 0;
296 else
297 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
298
299 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
284} 300}
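
For scale, with the usual x86 constants (L1_CACHE_BYTES == 64 and PAGE_SIZE == 4096, assumed here rather than taken from this diff) the offset arithmetic above gives hub processor 0 offset 0, processor 5 offset (5 * 64) % 4096 == 320, and wraps back to 0 at processor 64, so concurrent readers on a socket each hit their own cache line of the replicated RTC page.
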
285 301
286/* 302/*
@@ -310,32 +326,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 326 break;
311 case CLOCK_EVT_MODE_UNUSED: 327 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 328 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 329 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 330 break;
315 } 331 }
316} 332}
317 333
318static void uv_rtc_interrupt(void) 334static void uv_rtc_interrupt(void)
319{ 335{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 336 int cpu = smp_processor_id();
337 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 338
323 if (!ced || !ced->event_handler) 339 if (!ced || !ced->event_handler)
324 return; 340 return;
325 341
326 if (uv_rtc_unset_timer(cpu) != 1) 342 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 343 return;
328 344
329 ced->event_handler(ced); 345 ced->event_handler(ced);
330} 346}
331 347
332static int __init uv_enable_rtc(char *str) 348static int __init uv_enable_evt_rtc(char *str)
333{ 349{
334 uv_rtc_enable = 1; 350 uv_rtc_evt_enable = 1;
335 351
336 return 1; 352 return 1;
337} 353}
338__setup("uvrtc", uv_enable_rtc); 354__setup("uvrtcevt", uv_enable_evt_rtc);
339 355
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 356static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 357{
@@ -350,27 +366,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 366{
351 int rc; 367 int rc;
352 368
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 369 if (!is_uv_system())
354 return -ENODEV; 370 return -ENODEV;
355 371
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 372 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 373 clocksource_uv.shift);
360 374
375 /* If single blade, prefer tsc */
376 if (uv_num_possible_blades() == 1)
377 clocksource_uv.rating = 250;
378
361 rc = clocksource_register(&clocksource_uv); 379 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 380 if (rc)
363 generic_interrupt_extension = NULL; 381 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
382 else
383 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
384 sn_rtc_cycles_per_second/(unsigned long)1E6);
385
386 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 387 return rc;
365 }
366 388
367 /* Setup and register clockevents */ 389 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 390 rc = uv_rtc_allocate_timers();
369 if (rc) { 391 if (rc)
370 clocksource_unregister(&clocksource_uv); 392 goto error;
371 generic_interrupt_extension = NULL; 393
372 return rc; 394 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 395
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 396 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 397 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +404,19 @@ static __init int uv_rtc_setup_clock(void)
383 404
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 405 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 406 if (rc) {
386 clocksource_unregister(&clocksource_uv); 407 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 408 uv_rtc_deallocate_timers();
409 goto error;
389 } 410 }
390 411
412 printk(KERN_INFO "UV RTC clockevents registered\n");
413
414 return 0;
415
416error:
417 clocksource_unregister(&clocksource_uv);
418 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
419
391 return rc; 420 return rc;
392} 421}
393arch_initcall(uv_rtc_setup_clock); 422arch_initcall(uv_rtc_setup_clock);
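
The per-cacheline read offset above is worth a worked example. A stand-alone
user-space sketch, not part of the patch: L1_CACHE_BYTES of 64 and PAGE_SIZE
of 4096 are assumptions, and since UVH_RTC is page aligned (also an
assumption here) and the offset stays below PAGE_SIZE, OR-ing the offset
into the MMR address behaves like an add.

#include <stdio.h>

#define L1_CACHE_BYTES 64UL
#define PAGE_SIZE      4096UL

static unsigned long rtc_read_offset(unsigned int blade_processor_id,
				     int min_hub_rev)
{
	if (min_hub_rev == 1)		/* pre-2.0 HUBs: single register copy */
		return 0;
	/* one cacheline-spaced copy per core, wrapping within the page */
	return (blade_processor_id * L1_CACHE_BYTES) % PAGE_SIZE;
}

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("core %u reads at UVH_RTC + 0x%lx\n",
		       cpu, rtc_read_offset(cpu, 2));
	return 0;
}
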
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 1498efa964b6..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 183 return;
184 } 184 }
185 185
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 186 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 188 /*
189 * Validate version 189 * Validate version
@@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 486}
487 487
488static struct irq_chip cobalt_irq_type = { 488static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 489 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 490 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 491 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 492 .enable = enable_cobalt_irq,
@@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 523}
524 524
525static struct irq_chip piix4_master_irq_type = { 525static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 526 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 527 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 528 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 529 .end = end_piix4_master_irq,
@@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = {
531 531
532 532
533static struct irq_chip piix4_virtual_irq_type = { 533static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 534 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq, 535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq, 536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq, 537 .disable = disable_8259A_irq,
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..5ffb5622f793 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -197,9 +197,8 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200int sys_vm86old(struct pt_regs *regs) 200int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 202 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 203 * this avoids wasting of stack space.
205 * This remains on the stack until we 204 * This remains on the stack until we
@@ -227,7 +226,7 @@ out:
227} 226}
228 227
229 228
230int sys_vm86(struct pt_regs *regs) 229int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
231{ 230{
232 struct kernel_vm86_struct info; /* declare this _on top_, 231 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 232 * this avoids wasting of stack space.
@@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs)
239 struct vm86plus_struct __user *v86; 238 struct vm86plus_struct __user *v86;
240 239
241 tsk = current; 240 tsk = current;
242 switch (regs->bx) { 241 switch (cmd) {
243 case VM86_REQUEST_IRQ: 242 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 243 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 244 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 245 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); 246 ret = do_vm86_irq_handling(cmd, (int)arg);
248 goto out; 247 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 248 case VM86_PLUS_INSTALL_CHECK:
250 /* 249 /*
@@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs)
261 ret = -EPERM; 260 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 261 if (tsk->thread.saved_sp0)
263 goto out; 262 goto out;
264 v86 = (struct vm86plus_struct __user *)regs->cx; 263 v86 = (struct vm86plus_struct __user *)arg;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 265 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 266 sizeof(info.regs));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 31e6f6cfe53e..d430e4c30193 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -648,7 +648,7 @@ static inline int __init activate_vmi(void)
648 648
649 pv_info.paravirt_enabled = 1; 649 pv_info.paravirt_enabled = 1;
650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
651 pv_info.name = "vmi"; 651 pv_info.name = "vmi [deprecated]";
652 652
653 pv_init_ops.patch = vmi_patch; 653 pv_init_ops.patch = vmi_patch;
654 654
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..74c92bb194df 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 226 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 227 evt->cpumask = cpumask_of(cpu);
228 228
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 230 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 231 clockevents_register_device(evt);
232} 232}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index eeb4f5fbd86f..f92a0da608cb 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -319,9 +319,7 @@ SECTIONS
319 __brk_limit = .; 319 __brk_limit = .;
320 } 320 }
321 321
322 .end : AT(ADDR(.end) - LOAD_OFFSET) { 322 _end = .;
323 _end = .;
324 }
325 323
326 STABS_DEBUG 324 STABS_DEBUG
327 DWARF_DEBUG 325 DWARF_DEBUG
@@ -333,6 +331,9 @@ SECTIONS
333 331
334 332
335#ifdef CONFIG_X86_32 333#ifdef CONFIG_X86_32
334/*
335 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
336 */
336. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 337. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
337 "kernel image bigger than KERNEL_IMAGE_SIZE"); 338 "kernel image bigger than KERNEL_IMAGE_SIZE");
338#else 339#else
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..9055e5872ff0 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
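
update_vsyscall() now takes the multiplier as a parameter because
timekeeping applies NTP corrections to its own copy of the clocksource
mult; the vsyscall gtod copy must mirror that adjusted value rather than
the raw clock->mult. A hedged user-space sketch of the (mult, shift)
fixed-point conversion the vread path relies on (values are illustrative):

#include <stdio.h>
#include <stdint.h>

/* same fixed-point form the clocksource code uses: ns = (cyc * mult) >> shift */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	/* hypothetical 1 GHz clock: mult/shift picked so 1 cycle == 1 ns */
	uint32_t shift = 24, mult = 1u << 24;

	printf("1000 cycles = %llu ns\n",
	       (unsigned long long)cyc2ns(1000, mult, shift));
	return 0;
}
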
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..619f7f88b8cc 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -17,8 +17,6 @@
17EXPORT_SYMBOL(mcount); 17EXPORT_SYMBOL(mcount);
18#endif 18#endif
19 19
20EXPORT_SYMBOL(kernel_thread);
21
22EXPORT_SYMBOL(__get_user_1); 20EXPORT_SYMBOL(__get_user_1);
23EXPORT_SYMBOL(__get_user_2); 21EXPORT_SYMBOL(__get_user_2);
24EXPORT_SYMBOL(__get_user_4); 22EXPORT_SYMBOL(__get_user_4);
@@ -30,9 +28,8 @@ EXPORT_SYMBOL(__put_user_8);
30 28
31EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic);
32EXPORT_SYMBOL(__copy_user_nocache); 30EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 31EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 32EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 33
37EXPORT_SYMBOL(copy_page); 34EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 35EXPORT_SYMBOL(clear_page);
@@ -57,4 +54,6 @@ EXPORT_SYMBOL(__memcpy);
57 54
58EXPORT_SYMBOL(empty_zero_page); 55EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 56EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 57#ifndef CONFIG_PARAVIRT
58EXPORT_SYMBOL(native_load_gs_index);
59#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 861b8b54e172..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -15,10 +15,13 @@
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/pat.h> 16#include <asm/pat.h>
17#include <asm/tsc.h> 17#include <asm/tsc.h>
18#include <asm/iommu.h>
18 19
19void __cpuinit x86_init_noop(void) { } 20void __cpuinit x86_init_noop(void) { }
20void __init x86_init_uint_noop(unsigned int unused) { } 21void __init x86_init_uint_noop(unsigned int unused) { }
21void __init x86_init_pgd_noop(pgd_t *unused) { } 22void __init x86_init_pgd_noop(pgd_t *unused) { }
23int __init iommu_init_noop(void) { return 0; }
24void iommu_shutdown_noop(void) { }
22 25
23/* 26/*
24 * The platform setup functions are preset with the default functions 27 * The platform setup functions are preset with the default functions
@@ -63,6 +66,10 @@ struct x86_init_ops x86_init __initdata = {
63 .tsc_pre_init = x86_init_noop, 66 .tsc_pre_init = x86_init_noop,
64 .timer_init = hpet_time_init, 67 .timer_init = hpet_time_init,
65 }, 68 },
69
70 .iommu = {
71 .iommu_init = iommu_init_noop,
72 },
66}; 73};
67 74
68struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -73,5 +80,6 @@ struct x86_platform_ops x86_platform = {
73 .calibrate_tsc = native_calibrate_tsc, 80 .calibrate_tsc = native_calibrate_tsc,
74 .get_wallclock = mach_get_cmos_time, 81 .get_wallclock = mach_get_cmos_time,
75 .set_wallclock = mach_set_rtc_mmss, 82 .set_wallclock = mach_set_rtc_mmss,
83 .iommu_shutdown = iommu_shutdown_noop,
76 .is_untracked_pat_range = is_ISA_range, 84 .is_untracked_pat_range = is_ISA_range,
77}; 85};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..4cd498332466 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
31 ---help--- 32 ---help---
32 Support hosting fully virtualized guest machines using hardware 33 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 34 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..7e8faea4651e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -75,6 +75,8 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define No64 (1<<28)
78/* Source 2 operand type */ 80/* Source 2 operand type */
79#define Src2None (0<<29) 81#define Src2None (0<<29)
80#define Src2CL (1<<29) 82#define Src2CL (1<<29)
@@ -92,19 +94,23 @@ static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 94 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 99 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 104 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 109 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 114 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -133,7 +139,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 139 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 140 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 141 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 142 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
143 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 144 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 145 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 146 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -158,7 +165,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 165 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 166 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 167 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 168 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 169 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 170 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 171 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +192,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 192 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 193 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 194 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 195 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 196 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 197 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 198 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,7 +205,7 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 205 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 206 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 207 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 208 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 209 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 211 /* 0xF0 - 0xF7 */
@@ -244,11 +251,13 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 251 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 253 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 254 ImplicitOps | Stack, ImplicitOps | Stack,
255 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 256 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 257 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 258 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 259 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 261 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 262 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 263 ModRM, 0,
@@ -613,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 622{
614 int rc = 0; 623 int rc = 0;
615 624
625 /* x86 instructions are limited to 15 bytes. */
626 if (eip + size - ctxt->decode.eip_orig > 15)
627 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 628 eip += ctxt->cs_base;
617 while (size--) { 629 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 630 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -871,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 883 /* Shadow copy of register state. Committed on successful emulation. */
872 884
873 memset(c, 0, sizeof(struct decode_cache)); 885 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 886 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 887 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 888 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 889
@@ -962,6 +974,11 @@ done_prefixes:
962 } 974 }
963 } 975 }
964 976
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
979 return -1;
980 }
981
965 if (c->d & Group) { 982 if (c->d & Group) {
966 group = c->d & GroupMask; 983 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 984 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1186,6 +1203,69 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1186 return rc; 1203 return rc;
1187} 1204}
1188 1205
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{
1208 struct decode_cache *c = &ctxt->decode;
1209 struct kvm_segment segment;
1210
1211 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1212
1213 c->src.val = segment.selector;
1214 emulate_push(ctxt);
1215}
1216
1217static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1218 struct x86_emulate_ops *ops, int seg)
1219{
1220 struct decode_cache *c = &ctxt->decode;
1221 unsigned long selector;
1222 int rc;
1223
1224 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1225 if (rc != 0)
1226 return rc;
1227
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
1229 return rc;
1230}
1231
1232static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1233{
1234 struct decode_cache *c = &ctxt->decode;
1235 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1236 int reg = VCPU_REGS_RAX;
1237
1238 while (reg <= VCPU_REGS_RDI) {
1239 (reg == VCPU_REGS_RSP) ?
1240 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1241
1242 emulate_push(ctxt);
1243 ++reg;
1244 }
1245}
1246
1247static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops)
1249{
1250 struct decode_cache *c = &ctxt->decode;
1251 int rc = 0;
1252 int reg = VCPU_REGS_RDI;
1253
1254 while (reg >= VCPU_REGS_RAX) {
1255 if (reg == VCPU_REGS_RSP) {
1256 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1257 c->op_bytes);
1258 --reg;
1259 }
1260
1261 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1262 if (rc != 0)
1263 break;
1264 --reg;
1265 }
1266 return rc;
1267}
1268
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1269static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1270 struct x86_emulate_ops *ops)
1191{ 1271{
@@ -1707,18 +1787,45 @@ special_insn:
1707 add: /* add */ 1787 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1788 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1789 break;
1790 case 0x06: /* push es */
1791 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1792 break;
1793 case 0x07: /* pop es */
1794 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1795 if (rc != 0)
1796 goto done;
1797 break;
1710 case 0x08 ... 0x0d: 1798 case 0x08 ... 0x0d:
1711 or: /* or */ 1799 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1800 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1801 break;
1802 case 0x0e: /* push cs */
1803 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1804 break;
1714 case 0x10 ... 0x15: 1805 case 0x10 ... 0x15:
1715 adc: /* adc */ 1806 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1807 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1808 break;
1809 case 0x16: /* push ss */
1810 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1811 break;
1812 case 0x17: /* pop ss */
1813 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1814 if (rc != 0)
1815 goto done;
1816 break;
1718 case 0x18 ... 0x1d: 1817 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1818 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1819 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1820 break;
1821 case 0x1e: /* push ds */
1822 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1823 break;
1824 case 0x1f: /* pop ds */
1825 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1826 if (rc != 0)
1827 goto done;
1828 break;
1722 case 0x20 ... 0x25: 1829 case 0x20 ... 0x25:
1723 and: /* and */ 1830 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1831 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1857,14 @@ special_insn:
1750 if (rc != 0) 1857 if (rc != 0)
1751 goto done; 1858 goto done;
1752 break; 1859 break;
1860 case 0x60: /* pusha */
1861 emulate_pusha(ctxt);
1862 break;
1863 case 0x61: /* popa */
1864 rc = emulate_popa(ctxt, ops);
1865 if (rc != 0)
1866 goto done;
1867 break;
1753 case 0x63: /* movsxd */ 1868 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 1869 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 1870 goto cannot_emulate;
@@ -1761,7 +1876,7 @@ special_insn:
1761 break; 1876 break;
1762 case 0x6c: /* insb */ 1877 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 1878 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1879 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 1880 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 1881 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 1882 c->rep_prefix ?
@@ -1777,7 +1892,7 @@ special_insn:
1777 return 0; 1892 return 0;
1778 case 0x6e: /* outsb */ 1893 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 1894 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1895 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 1896 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 1897 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 1898 c->rep_prefix ?
@@ -2070,7 +2185,7 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2185 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2186 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2187 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2189 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2190 port) != 0) {
2076 c->eip = saved_eip; 2191 c->eip = saved_eip;
@@ -2297,6 +2412,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2412 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2413 c->dst.type = OP_NONE;
2299 break; 2414 break;
2415 case 0xa0: /* push fs */
2416 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2417 break;
2418 case 0xa1: /* pop fs */
2419 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2420 if (rc != 0)
2421 goto done;
2422 break;
2300 case 0xa3: 2423 case 0xa3:
2301 bt: /* bt */ 2424 bt: /* bt */
2302 c->dst.type = OP_NONE; 2425 c->dst.type = OP_NONE;
@@ -2308,6 +2431,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2431 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2432 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2433 break;
2434 case 0xa8: /* push gs */
2435 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2436 break;
2437 case 0xa9: /* pop gs */
2438 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2439 if (rc != 0)
2440 goto done;
2441 break;
2311 case 0xab: 2442 case 0xab:
2312 bts: /* bts */ 2443 bts: /* bts */
2313 /* only subword offset */ 2444 /* only subword offset */
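
The 15-byte fetch guard added to do_insn_fetch() above reflects an
architectural limit: x86 never encodes an instruction longer than 15
bytes, so a fetch past that point from the original RIP means the decoder
is lost. A stand-alone sketch of the bound check (names are hypothetical):

#include <stdio.h>

#define X86_MAX_INSN_BYTES 15

static int fetch_allowed(unsigned long eip_orig, unsigned long eip,
			 unsigned int size)
{
	/* mirrors the patch: eip + size - eip_orig > 15 is unhandleable */
	return eip + size - eip_orig <= X86_MAX_INSN_BYTES;
}

int main(void)
{
	printf("%d\n", fetch_allowed(0x1000, 0x100e, 1));	/* 15th byte: 1 */
	printf("%d\n", fetch_allowed(0x1000, 0x100f, 1));	/* 16th byte: 0 */
	return 0;
}
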
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 82ad523b4901..15578f180e59 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -29,6 +29,8 @@
29 * Based on QEMU and Xen. 29 * Based on QEMU and Xen.
30 */ 30 */
31 31
32#define pr_fmt(fmt) "pit: " fmt
33
32#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
33 35
34#include "irq.h" 36#include "irq.h"
@@ -116,7 +118,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
116 * itself with the initial count and continues counting 118 * itself with the initial count and continues counting
117 * from there. 119 * from there.
118 */ 120 */
119 remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); 121 remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
120 elapsed = ps->pit_timer.period - ktime_to_ns(remaining); 122 elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
121 elapsed = mod_64(elapsed, ps->pit_timer.period); 123 elapsed = mod_64(elapsed, ps->pit_timer.period);
122 124
@@ -262,7 +264,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
262 264
263static void destroy_pit_timer(struct kvm_timer *pt) 265static void destroy_pit_timer(struct kvm_timer *pt)
264{ 266{
265 pr_debug("pit: execute del timer!\n"); 267 pr_debug("execute del timer!\n");
266 hrtimer_cancel(&pt->timer); 268 hrtimer_cancel(&pt->timer);
267} 269}
268 270
@@ -284,7 +286,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284 286
285 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 287 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
286 288
287 pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); 289 pr_debug("create pit timer, interval is %llu nsec\n", interval);
288 290
289 /* TODO The new value only affected after the retriggered */ 291 /* TODO The new value only affected after the retriggered */
290 hrtimer_cancel(&pt->timer); 292 hrtimer_cancel(&pt->timer);
@@ -309,7 +311,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
309 311
310 WARN_ON(!mutex_is_locked(&ps->lock)); 312 WARN_ON(!mutex_is_locked(&ps->lock));
311 313
312 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 314 pr_debug("load_count val is %d, channel is %d\n", val, channel);
313 315
314 /* 316 /*
315 * The largest possible initial count is 0; this is equivalent 317 * The largest possible initial count is 0; this is equivalent
@@ -395,8 +397,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
395 mutex_lock(&pit_state->lock); 397 mutex_lock(&pit_state->lock);
396 398
397 if (val != 0) 399 if (val != 0)
398 pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", 400 pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
399 (unsigned int)addr, len, val); 401 (unsigned int)addr, len, val);
400 402
401 if (addr == 3) { 403 if (addr == 3) {
402 channel = val >> 6; 404 channel = val >> 6;
@@ -465,6 +467,9 @@ static int pit_ioport_read(struct kvm_io_device *this,
465 return -EOPNOTSUPP; 467 return -EOPNOTSUPP;
466 468
467 addr &= KVM_PIT_CHANNEL_MASK; 469 addr &= KVM_PIT_CHANNEL_MASK;
470 if (addr == 3)
471 return 0;
472
468 s = &pit_state->channels[addr]; 473 s = &pit_state->channels[addr];
469 474
470 mutex_lock(&pit_state->lock); 475 mutex_lock(&pit_state->lock);
@@ -688,10 +693,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 693 struct kvm_vcpu *vcpu;
689 int i; 694 int i;
690 695
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 696 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 697 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 698
696 /* 699 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 700 * Provides NMI watchdog support via Virtual Wire mode.
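
The dropped "pit: " literals above all come from the pr_fmt() definition:
the kernel's pr_* macros pass their format string through pr_fmt(), so one
define at the top of the file (placed before the includes that pull in the
printk machinery, as the hunk does) prefixes every message. A simplified
user-space mock of that mechanism, not the kernel macro itself:

#include <stdio.h>

#define pr_fmt(fmt) "pit: " fmt
#define pr_debug(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	/* prints: pit: create pit timer, interval is 1000000 nsec */
	pr_debug("create pit timer, interval is %llu nsec\n", 1000000ULL);
	return 0;
}
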
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..d057c0cbd245 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 39 if (s != &s->pics_state->pics[0])
40 irq += 8; 40 irq += 8;
41 /*
 42 * We are dropping the lock while calling ack notifiers since ack
 43 * notifier callbacks for assigned devices call into the PIC recursively.
 44 * Other interrupts may be delivered to the PIC while the lock is dropped,
 45 * but this is safe since the PIC state is already updated at this stage.
46 */
47 spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock);
42} 50}
43 51
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 184static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 185{
178 s->isr |= 1 << irq; 186 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 187 /*
185 * We don't clear a level sensitive interrupt here 188 * We don't clear a level sensitive interrupt here
186 */ 189 */
187 if (!(s->elcr & (1 << irq))) 190 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 191 s->irr &= ~(1 << irq);
192
193 if (s->auto_eoi) {
194 if (s->rotate_on_auto_eoi)
195 s->priority_add = (irq + 1) & 7;
196 pic_clear_isr(s, irq);
197 }
198
189} 199}
190 200
191int kvm_pic_read_irq(struct kvm *kvm) 201int kvm_pic_read_irq(struct kvm *kvm)
@@ -225,22 +235,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
225 235
226void kvm_pic_reset(struct kvm_kpic_state *s) 236void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 237{
228 int irq, irqbase, n; 238 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 239 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 240 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
241 u8 irr = s->irr, isr = s->imr;
231 242
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 243 s->last_irr = 0;
245 s->irr = 0; 244 s->irr = 0;
246 s->imr = 0; 245 s->imr = 0;
@@ -256,6 +255,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 255 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 256 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 257 s->init4 = 0;
258
259 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
260 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
261 if (irr & (1 << irq) || isr & (1 << irq)) {
262 pic_clear_isr(s, irq);
263 }
264 }
259} 265}
260 266
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 267static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 304 priority = get_priority(s, s->isr);
299 if (priority != 8) { 305 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 306 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 307 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 308 s->priority_add = (irq + 1) & 7;
309 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 310 pic_update_irq(s->pics_state);
305 } 311 }
306 break; 312 break;
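
The unlock/relock around kvm_notify_acked_irq() above is the standard
shape for invoking a callback that may re-take the lock you hold. A hedged
pthread sketch of that pattern, not the PIC code itself:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void ack_notifier(void)
{
	/* may recurse into code that takes `lock` again */
	pthread_mutex_lock(&lock);
	puts("notifier re-entered the protected region safely");
	pthread_mutex_unlock(&lock);
}

static void clear_isr(void)
{
	pthread_mutex_lock(&lock);
	/* ... update state under the lock first ... */
	pthread_mutex_unlock(&lock);	/* drop before the callback */
	ack_notifier();
	pthread_mutex_lock(&lock);	/* re-take; state may have changed */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	clear_isr();
	return 0;
}
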
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..be399e207d57 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -71,6 +71,7 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
@@ -85,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 86
86static inline int irqchip_in_kernel(struct kvm *kvm) 87static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 88{
88 return pic_irqchip(kvm) != NULL; 89 int ret;
90
91 ret = (pic_irqchip(kvm) != NULL);
92 smp_rmb();
93 return ret;
89} 94}
90 95
91void kvm_pic_reset(struct kvm_kpic_state *s); 96void kvm_pic_reset(struct kvm_kpic_state *s);
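
The smp_rmb() added to irqchip_in_kernel() presumably pairs with a write
barrier on the path that creates and publishes the PIC, so a reader that
sees a non-NULL pointer also sees a fully initialized irqchip. A
C11-atomics sketch of that publish/observe ordering (this is an
interpretation of intent, not kernel code):

#include <stdatomic.h>
#include <stdio.h>

struct pic { int initialized; };

static struct pic the_pic;
static _Atomic(struct pic *) vpic;

static void create_pic(void)			/* writer side */
{
	the_pic.initialized = 1;		/* init everything first */
	atomic_store_explicit(&vpic, &the_pic, memory_order_release);
}

static int irqchip_in_kernel_sketch(void)	/* reader side */
{
	struct pic *p = atomic_load_explicit(&vpic, memory_order_acquire);
	return p != NULL;			/* acquire plays smp_rmb()'s role */
}

int main(void)
{
	create_pic();
	printf("in kernel: %d\n", irqchip_in_kernel_sketch());
	return 0;
}
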
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 7024224f0fc8..ba8c045da782 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,7 +32,6 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
37#include "irq.h" 36#include "irq.h"
38#include "trace.h" 37#include "trace.h"
@@ -374,6 +373,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
374 if (unlikely(!apic_enabled(apic))) 373 if (unlikely(!apic_enabled(apic)))
375 break; 374 break;
376 375
376 if (trig_mode) {
377 apic_debug("level trig mode for vector %d", vector);
378 apic_set_vector(vector, apic->regs + APIC_TMR);
379 } else
380 apic_clear_vector(vector, apic->regs + APIC_TMR);
381
377 result = !apic_test_and_set_irr(vector, apic); 382 result = !apic_test_and_set_irr(vector, apic);
378 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 383 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
379 trig_mode, vector, !result); 384 trig_mode, vector, !result);
@@ -384,11 +389,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
384 break; 389 break;
385 } 390 }
386 391
387 if (trig_mode) {
388 apic_debug("level trig mode for vector %d", vector);
389 apic_set_vector(vector, apic->regs + APIC_TMR);
390 } else
391 apic_clear_vector(vector, apic->regs + APIC_TMR);
392 kvm_vcpu_kick(vcpu); 392 kvm_vcpu_kick(vcpu);
393 break; 393 break;
394 394
@@ -471,11 +471,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 471 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 472 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 473 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 475 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 476}
480 477
481static void apic_send_ipi(struct kvm_lapic *apic) 478static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +501,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 501 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 502 irq.vector);
506 503
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 504 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 505}
511 506
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 507static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -521,7 +516,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
521 if (apic_get_reg(apic, APIC_TMICT) == 0) 516 if (apic_get_reg(apic, APIC_TMICT) == 0)
522 return 0; 517 return 0;
523 518
524 remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); 519 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
525 if (ktime_to_ns(remaining) < 0) 520 if (ktime_to_ns(remaining) < 0)
526 remaining = ktime_set(0, 0); 521 remaining = ktime_set(0, 0);
527 522
@@ -1156,6 +1151,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1156 hrtimer_cancel(&apic->lapic_timer.timer); 1151 hrtimer_cancel(&apic->lapic_timer.timer);
1157 update_divide_count(apic); 1152 update_divide_count(apic);
1158 start_apic_timer(apic); 1153 start_apic_timer(apic);
1154 apic->irr_pending = true;
1159} 1155}
1160 1156
1161void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1157void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 685a4ffac8e6..89a49fb46a27 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -477,7 +477,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
477 477
478 addr = gfn_to_hva(kvm, gfn); 478 addr = gfn_to_hva(kvm, gfn);
479 if (kvm_is_error_hva(addr)) 479 if (kvm_is_error_hva(addr))
480 return page_size; 480 return PT_PAGE_TABLE_LEVEL;
481 481
482 down_read(&current->mm->mmap_sem); 482 down_read(&current->mm->mmap_sem);
483 vma = find_vma(current->mm, addr); 483 vma = find_vma(current->mm, addr);
@@ -515,11 +515,9 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
515 if (host_level == PT_PAGE_TABLE_LEVEL) 515 if (host_level == PT_PAGE_TABLE_LEVEL)
516 return host_level; 516 return host_level;
517 517
518 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { 518 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level)
519
520 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 519 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
521 break; 520 break;
522 }
523 521
524 return level - 1; 522 return level - 1;
525} 523}
@@ -748,7 +746,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
748 return write_protected; 746 return write_protected;
749} 747}
750 748
751static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) 749static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
750 unsigned long data)
752{ 751{
753 u64 *spte; 752 u64 *spte;
754 int need_tlb_flush = 0; 753 int need_tlb_flush = 0;
@@ -763,7 +762,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
763 return need_tlb_flush; 762 return need_tlb_flush;
764} 763}
765 764
766static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) 765static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
766 unsigned long data)
767{ 767{
768 int need_flush = 0; 768 int need_flush = 0;
769 u64 *spte, new_spte; 769 u64 *spte, new_spte;
@@ -799,9 +799,10 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
799 return 0; 799 return 0;
800} 800}
801 801
802static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, u64 data, 802static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
803 unsigned long data,
803 int (*handler)(struct kvm *kvm, unsigned long *rmapp, 804 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
804 u64 data)) 805 unsigned long data))
805{ 806{
806 int i, j; 807 int i, j;
807 int retval = 0; 808 int retval = 0;
@@ -846,10 +847,11 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
846 847
847void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 848void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
848{ 849{
849 kvm_handle_hva(kvm, hva, (u64)&pte, kvm_set_pte_rmapp); 850 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
850} 851}
851 852
852static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) 853static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
854 unsigned long data)
853{ 855{
854 u64 *spte; 856 u64 *spte;
855 int young = 0; 857 int young = 0;
@@ -2785,7 +2787,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2785 if (r) 2787 if (r)
2786 goto out; 2788 goto out;
2787 2789
2788 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2790 er = emulate_instruction(vcpu, cr2, error_code, 0);
2789 2791
2790 switch (er) { 2792 switch (er) {
2791 case EMULATE_DONE: 2793 case EMULATE_DONE:
@@ -2796,6 +2798,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2796 case EMULATE_FAIL: 2798 case EMULATE_FAIL:
2797 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2799 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2798 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2800 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2801 vcpu->run->internal.ndata = 0;
2799 return 0; 2802 return 0;
2800 default: 2803 default:
2801 BUG(); 2804 BUG();
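
The data parameter changes from u64 to unsigned long throughout because
kvm_set_spte_hva() smuggles a host pointer (&pte) through it, and unsigned
long matches pointer width on both 32-bit and 64-bit builds where u64
invites warning-prone casts on 32-bit. A small stand-alone illustration
with hypothetical names:

#include <stdio.h>

typedef int (*rmapp_fn)(unsigned long *rmapp, unsigned long data);

static int set_pte_handler(unsigned long *rmapp, unsigned long data)
{
	unsigned long pte = *(unsigned long *)data;	/* recover the pointer */
	(void)rmapp;
	printf("new pte value: %#lx\n", pte);
	return 0;
}

static int handle_hva(unsigned long data, rmapp_fn handler)
{
	unsigned long rmapp = 0;
	return handler(&rmapp, data);
}

int main(void)
{
	unsigned long pte = 0xdeadbeefUL;

	return handle_hva((unsigned long)&pte, set_pte_handler);
}
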
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 72558f8ff3f5..ede2131a9225 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -150,7 +150,9 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 150 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 151 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 152
153 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); 153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
154 goto not_present;
155
154 trace_kvm_mmu_paging_element(pte, walker->level); 156 trace_kvm_mmu_paging_element(pte, walker->level);
155 157
156 if (!is_present_gpte(pte)) 158 if (!is_present_gpte(pte))
@@ -455,8 +457,6 @@ out_unlock:
455static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
456{ 458{
457 struct kvm_shadow_walk_iterator iterator; 459 struct kvm_shadow_walk_iterator iterator;
458 pt_element_t gpte;
459 gpa_t pte_gpa = -1;
460 int level; 460 int level;
461 u64 *sptep; 461 u64 *sptep;
462 int need_flush = 0; 462 int need_flush = 0;
@@ -467,14 +467,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 467 level = iterator.level;
468 sptep = iterator.sptep; 468 sptep = iterator.sptep;
469 469
470 /* FIXME: properly handle invlpg on large guest pages */
471 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
472 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
473 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
474 struct kvm_mmu_page *sp = page_header(__pa(sptep));
475
476 pte_gpa = (sp->gfn << PAGE_SHIFT);
477 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
478 473
479 if (is_shadow_present_pte(*sptep)) { 474 if (is_shadow_present_pte(*sptep)) {
480 rmap_remove(vcpu->kvm, sptep); 475 rmap_remove(vcpu->kvm, sptep);
@@ -493,18 +488,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
493 if (need_flush) 488 if (need_flush)
494 kvm_flush_remote_tlbs(vcpu->kvm); 489 kvm_flush_remote_tlbs(vcpu->kvm);
495 spin_unlock(&vcpu->kvm->mmu_lock); 490 spin_unlock(&vcpu->kvm->mmu_lock);
496
497 if (pte_gpa == -1)
498 return;
499 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
500 sizeof(pt_element_t)))
501 return;
502 if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
503 if (mmu_topup_memory_caches(vcpu))
504 return;
505 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
506 sizeof(pt_element_t), 0);
507 }
508} 491}
509 492
510static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
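
Checking the kvm_read_guest() return value matters because a guest
page-table read can fail on an unmapped gpa; treating a failed read as a
not-present entry avoids consuming uninitialized stack data. A minimal
sketch of the pattern, with a mocked read function standing in for the
real API:

#include <stdio.h>
#include <string.h>

static int read_guest(void *dst, const void *src, size_t len)
{
	if (!src)			/* stands in for an unmapped gpa */
		return -1;
	memcpy(dst, src, len);
	return 0;
}

int main(void)
{
	unsigned long pte;

	if (read_guest(&pte, NULL, sizeof(pte))) {
		puts("not present");	/* the walker's goto not_present */
		return 0;
	}
	printf("pte %#lx\n", pte);
	return 0;
}
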
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c17404add91f..1d9b33843c80 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
49#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 50
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 51#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 52#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
53 54
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 55#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 56
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 57static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 59 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 77 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 78 u64 vmcb_msrpm;
87 79
80 /* A VMEXIT is required but not yet emulated */
81 bool exit_required;
82
88 /* cache for intercepts of the guest */ 83 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 84 u16 intercept_cr_read;
90 u16 intercept_cr_write; 85 u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
112 u32 *msrpm; 107 u32 *msrpm;
113 108
114 struct nested_state nested; 109 struct nested_state nested;
110
111 bool nmi_singlestep;
115}; 112};
116 113
117/* enable NPT for AMD64 and X86 with PAE */ 114/* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 283 struct vcpu_svm *svm = to_svm(vcpu);
287 284
288 if (!svm->next_rip) { 285 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 286 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 287 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 288 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 289 return;
@@ -316,75 +313,79 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 313 cpu_svm_disable();
317} 314}
318 315
319static void svm_hardware_enable(void *garbage) 316static int svm_hardware_enable(void *garbage)
320{ 317{
321 318
322 struct svm_cpu_data *svm_data; 319 struct svm_cpu_data *sd;
323 uint64_t efer; 320 uint64_t efer;
324 struct descriptor_table gdt_descr; 321 struct descriptor_table gdt_descr;
325 struct desc_struct *gdt; 322 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 323 int me = raw_smp_processor_id();
327 324
325 rdmsrl(MSR_EFER, efer);
326 if (efer & EFER_SVME)
327 return -EBUSY;
328
328 if (!has_svm()) { 329 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 330 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 331 me);
332 return -EINVAL;
331 } 333 }
332 svm_data = per_cpu(svm_data, me); 334 sd = per_cpu(svm_data, me);
333 335
334 if (!svm_data) { 336 if (!sd) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 337 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 338 me);
337 return; 339 return -EINVAL;
338 } 340 }
339 341
340 svm_data->asid_generation = 1; 342 sd->asid_generation = 1;
341 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 343 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
342 svm_data->next_asid = svm_data->max_asid + 1; 344 sd->next_asid = sd->max_asid + 1;
343 345
344 kvm_get_gdt(&gdt_descr); 346 kvm_get_gdt(&gdt_descr);
345 gdt = (struct desc_struct *)gdt_descr.base; 347 gdt = (struct desc_struct *)gdt_descr.base;
346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 348 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 349
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 350 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 351
351 wrmsrl(MSR_VM_HSAVE_PA, 352 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
352 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 353
354 return 0;
353} 355}
354 356
355static void svm_cpu_uninit(int cpu) 357static void svm_cpu_uninit(int cpu)
356{ 358{
357 struct svm_cpu_data *svm_data 359 struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
358 = per_cpu(svm_data, raw_smp_processor_id());
359 360
360 if (!svm_data) 361 if (!sd)
361 return; 362 return;
362 363
363 per_cpu(svm_data, raw_smp_processor_id()) = NULL; 364 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
364 __free_page(svm_data->save_area); 365 __free_page(sd->save_area);
365 kfree(svm_data); 366 kfree(sd);
366} 367}
367 368
368static int svm_cpu_init(int cpu) 369static int svm_cpu_init(int cpu)
369{ 370{
370 struct svm_cpu_data *svm_data; 371 struct svm_cpu_data *sd;
371 int r; 372 int r;
372 373
373 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); 374 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
374 if (!svm_data) 375 if (!sd)
375 return -ENOMEM; 376 return -ENOMEM;
376 svm_data->cpu = cpu; 377 sd->cpu = cpu;
377 svm_data->save_area = alloc_page(GFP_KERNEL); 378 sd->save_area = alloc_page(GFP_KERNEL);
378 r = -ENOMEM; 379 r = -ENOMEM;
379 if (!svm_data->save_area) 380 if (!sd->save_area)
380 goto err_1; 381 goto err_1;
381 382
382 per_cpu(svm_data, cpu) = svm_data; 383 per_cpu(svm_data, cpu) = sd;
383 384
384 return 0; 385 return 0;
385 386
386err_1: 387err_1:
387 kfree(svm_data); 388 kfree(sd);
388 return r; 389 return r;
389 390
390} 391}
@@ -476,7 +477,7 @@ static __init int svm_hardware_setup(void)
476 kvm_enable_efer_bits(EFER_SVME); 477 kvm_enable_efer_bits(EFER_SVME);
477 } 478 }
478 479
479 for_each_online_cpu(cpu) { 480 for_each_possible_cpu(cpu) {
480 r = svm_cpu_init(cpu); 481 r = svm_cpu_init(cpu);
481 if (r) 482 if (r)
482 goto err; 483 goto err;
@@ -510,7 +511,7 @@ static __exit void svm_hardware_unsetup(void)
510{ 511{
511 int cpu; 512 int cpu;
512 513
513 for_each_online_cpu(cpu) 514 for_each_possible_cpu(cpu)
514 svm_cpu_uninit(cpu); 515 svm_cpu_uninit(cpu);
515 516
516 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 517 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -625,11 +626,12 @@ static void init_vmcb(struct vcpu_svm *svm)
 	save->rip = 0x0000fff0;
 	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
-	/*
-	 * cr0 val on cpu init should be 0x60000010, we enable cpu
-	 * cache by default. the orderly way is to enable cache in bios.
+	/* This is the guest-visible cr0 value.
+	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
-	save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
+	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+	kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+
 	save->cr4 = X86_CR4_PAE;
 	/* rdx = ?? */
 
@@ -644,8 +646,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 	control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
 					 INTERCEPT_CR3_MASK);
 	save->g_pat = 0x0007040600070406ULL;
-	/* enable caching because the QEMU Bios doesn't enable it */
-	save->cr0 = X86_CR0_ET;
 	save->cr3 = 0;
 	save->cr4 = 0;
 	}
@@ -654,6 +654,11 @@ static void init_vmcb(struct vcpu_svm *svm)
 	svm->nested.vmcb = 0;
 	svm->vcpu.arch.hflags = 0;
 
+	if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+		control->pause_filter_count = 3000;
+		control->intercept |= (1ULL << INTERCEPT_PAUSE);
+	}
+
 	enable_gif(svm);
 }
 
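The two lines above arm SVM's pause filter: the CPU counts PAUSE executions in quick succession and raises a #VMEXIT with exit code SVM_EXIT_PAUSE once the count is used up; the pause_interception() handler added later in this patch then yields the vcpu with kvm_vcpu_on_spin(). Conceptually the hardware behaves roughly like the following sketch (guest_executes_pause() is a stand-in, not a real function):

	/* Sketch of the hardware-side behaviour, not real code. */
	if (guest_executes_pause()) {
		if (--vmcb->control.pause_filter_count == 0)
			vmexit(SVM_EXIT_PAUSE);	/* lands in pause_interception() */
	}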
@@ -758,14 +763,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	int i;
 
 	if (unlikely(cpu != vcpu->cpu)) {
-		u64 tsc_this, delta;
+		u64 delta;
 
 		/*
 		 * Make sure that the guest sees a monotonically
 		 * increasing TSC.
 		 */
-		rdtscll(tsc_this);
-		delta = vcpu->arch.host_tsc - tsc_this;
+		delta = vcpu->arch.host_tsc - native_read_tsc();
 		svm->vmcb->control.tsc_offset += delta;
 		if (is_nested(svm))
 			svm->nested.hsave->control.tsc_offset += delta;
@@ -787,7 +791,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 
-	rdtscll(vcpu->arch.host_tsc);
+	vcpu->arch.host_tsc = native_read_tsc();
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1045,7 +1049,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
 	svm->vmcb->control.intercept_exceptions &=
 		~((1 << DB_VECTOR) | (1 << BP_VECTOR));
 
-	if (vcpu->arch.singlestep)
+	if (svm->nmi_singlestep)
 		svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
 
 	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1064,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
 		vcpu->guest_debug = 0;
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-	int old_debug = vcpu->guest_debug;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	vcpu->guest_debug = dbg->control;
-
-	update_db_intercept(vcpu);
-
 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 		svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
 	else
 		svm->vmcb->save.dr7 = vcpu->arch.dr7;
 
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-	else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
-		svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
-	return 0;
+	update_db_intercept(vcpu);
 }
 
 static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1096,16 +1090,16 @@ static void save_host_msrs(struct kvm_vcpu *vcpu)
 #endif
 }
 
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 {
-	if (svm_data->next_asid > svm_data->max_asid) {
-		++svm_data->asid_generation;
-		svm_data->next_asid = 1;
+	if (sd->next_asid > sd->max_asid) {
+		++sd->asid_generation;
+		sd->next_asid = 1;
 		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
 	}
 
-	svm->asid_generation = svm_data->asid_generation;
-	svm->vmcb->control.asid = svm_data->next_asid++;
+	svm->asid_generation = sd->asid_generation;
+	svm->vmcb->control.asid = sd->next_asid++;
 }
 
 static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
@@ -1180,7 +1174,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 	}
 }
 
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int pf_interception(struct vcpu_svm *svm)
 {
 	u64 fault_address;
 	u32 error_code;
@@ -1194,17 +1188,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
 
-static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int db_interception(struct vcpu_svm *svm)
 {
+	struct kvm_run *kvm_run = svm->vcpu.run;
+
 	if (!(svm->vcpu.guest_debug &
 	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
-		!svm->vcpu.arch.singlestep) {
+		!svm->nmi_singlestep) {
 		kvm_queue_exception(&svm->vcpu, DB_VECTOR);
 		return 1;
 	}
 
-	if (svm->vcpu.arch.singlestep) {
-		svm->vcpu.arch.singlestep = false;
+	if (svm->nmi_singlestep) {
+		svm->nmi_singlestep = false;
 		if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
 			svm->vmcb->save.rflags &=
 				~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,25 +1219,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int bp_interception(struct vcpu_svm *svm)
 {
+	struct kvm_run *kvm_run = svm->vcpu.run;
+
 	kvm_run->exit_reason = KVM_EXIT_DEBUG;
 	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
 	kvm_run->debug.arch.exception = BP_VECTOR;
 	return 0;
 }
 
-static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int ud_interception(struct vcpu_svm *svm)
 {
 	int er;
 
-	er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+	er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
 	if (er != EMULATE_DONE)
 		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 	return 1;
 }
 
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nm_interception(struct vcpu_svm *svm)
 {
 	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
 	if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1251,7 +1249,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int mc_interception(struct vcpu_svm *svm)
 {
 	/*
 	 * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1262,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int shutdown_interception(struct vcpu_svm *svm)
 {
+	struct kvm_run *kvm_run = svm->vcpu.run;
+
 	/*
 	 * VMCB is undefined after a SHUTDOWN intercept
 	 * so reinitialize it.
@@ -1277,7 +1277,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 0;
 }
 
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int io_interception(struct vcpu_svm *svm)
 {
 	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
 	int size, in, string;
@@ -1291,7 +1291,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
 	if (string) {
 		if (emulate_instruction(&svm->vcpu,
-					kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+					0, 0, 0) == EMULATE_DO_MMIO)
 			return 0;
 		return 1;
 	}
@@ -1301,33 +1301,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
 
 	skip_emulated_instruction(&svm->vcpu);
-	return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
+	return kvm_emulate_pio(&svm->vcpu, in, size, port);
 }
 
-static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nmi_interception(struct vcpu_svm *svm)
 {
 	return 1;
 }
 
-static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int intr_interception(struct vcpu_svm *svm)
 {
 	++svm->vcpu.stat.irq_exits;
 	return 1;
 }
 
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nop_on_interception(struct vcpu_svm *svm)
 {
 	return 1;
 }
 
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int halt_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
 	skip_emulated_instruction(&svm->vcpu);
 	return kvm_emulate_halt(&svm->vcpu);
 }
 
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmmcall_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
@@ -1378,8 +1378,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 
 	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
 
-	if (nested_svm_exit_handled(svm)) {
-		nsvm_printk("VMexit -> INTR\n");
+	if (svm->nested.intercept & 1ULL) {
+		/*
+		 * The #vmexit can't be emulated here directly because this
+		 * code path runs with irqs and preemption disabled. A
+		 * #vmexit emulation might sleep. Only signal request for
+		 * the #vmexit here.
+		 */
+		svm->nested.exit_required = true;
+		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
 		return 1;
 	}
 
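Deferring the emulated #vmexit like this is a mark-now-act-later pattern: setting the flag is safe in atomic context, while the work that may sleep runs later on the regular exit path. The consuming side, added further down in this same patch, has this shape:

	/* Consumer (see handle_exit() below): */
	if (unlikely(svm->nested.exit_required)) {
		nested_svm_vmexit(svm);		/* may sleep; safe here */
		svm->nested.exit_required = false;
		return 1;
	}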
@@ -1390,10 +1397,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
 {
 	struct page *page;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
-
 	if (is_error_page(page))
 		goto error;
 
@@ -1532,14 +1536,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
 	}
 	default: {
 		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
-		nsvm_printk("exit code: 0x%x\n", exit_code);
 		if (svm->nested.intercept & exit_bits)
 			vmexit = NESTED_EXIT_DONE;
 	}
 	}
 
 	if (vmexit == NESTED_EXIT_DONE) {
-		nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
 		nested_svm_vmexit(svm);
 	}
 
@@ -1584,6 +1586,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
 
+	trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+				       vmcb->control.exit_info_1,
+				       vmcb->control.exit_info_2,
+				       vmcb->control.exit_int_info,
+				       vmcb->control.exit_int_info_err);
+
 	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
 	if (!nested_vmcb)
 		return 1;
@@ -1617,6 +1625,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
 	nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
 	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+	/*
+	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+	 * to make sure that we do not lose injected events. So check event_inj
+	 * here and copy it to exit_int_info if it is valid.
+	 * Exit_int_info and event_inj can't be both valid because the case
+	 * below only happens on a VMRUN instruction intercept which has
+	 * no valid exit_int_info set.
+	 */
+	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+		struct vmcb_control_area *nc = &nested_vmcb->control;
+
+		nc->exit_int_info     = vmcb->control.event_inj;
+		nc->exit_int_info_err = vmcb->control.event_inj_err;
+	}
+
 	nested_vmcb->control.tlb_ctl = 0;
 	nested_vmcb->control.event_inj = 0;
 	nested_vmcb->control.event_inj_err = 0;
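As a concrete illustration of what the copy above preserves, assume the EVTINJ encoding from the SVM architecture manual (vector in bits 7:0, type in bits 10:8, valid in bit 31; the SVM_EVTINJ_* constants come from asm/svm.h and are not part of this hunk). If L1's VMRUN was intercepted while it was injecting external interrupt vector 0x20, the value that must survive into the nested VMCB's exit_int_info looks like:

	/* Sketch, assuming the EVTINJ field layout: */
	u32 event_inj = SVM_EVTINJ_VALID	/* bit 31: injection pending */
		      | SVM_EVTINJ_TYPE_INTR	/* type 0: external interrupt */
		      | 0x20;			/* vector */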
@@ -1628,10 +1652,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* Restore the original control entries */
 	copy_vmcb_control_area(vmcb, hsave);
 
-	/* Kill any pending exceptions */
-	if (svm->vcpu.arch.exception.pending == true)
-		nsvm_printk("WARNING: Pending Exception\n");
-
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
@@ -1702,6 +1722,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	/* nested_vmcb is our indicator if nested SVM is activated */
 	svm->nested.vmcb = svm->vmcb->save.rax;
 
+	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+			       nested_vmcb->save.rip,
+			       nested_vmcb->control.int_ctl,
+			       nested_vmcb->control.event_inj,
+			       nested_vmcb->control.nested_ctl);
+
 	/* Clear internal status */
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1789,28 +1815,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->nested.intercept            = nested_vmcb->control.intercept;
 
 	force_new_asid(&svm->vcpu);
-	svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
-	svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
 	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
-	if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
-		nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
-				nested_vmcb->control.int_ctl);
-	}
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
 	else
 		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
-	nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
-			nested_vmcb->control.exit_int_info,
-			nested_vmcb->control.int_state);
-
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
 	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
-	if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
-		nsvm_printk("Injecting Event: 0x%x\n",
-				nested_vmcb->control.event_inj);
 	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
 	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
@@ -1837,7 +1850,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
 }
 
-static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmload_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
 
@@ -1857,7 +1870,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmsave_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
 
@@ -1877,10 +1890,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmrun_interception(struct vcpu_svm *svm)
 {
-	nsvm_printk("VMrun\n");
-
 	if (nested_svm_check_permissions(svm))
 		return 1;
 
@@ -1907,7 +1918,7 @@ failed:
 	return 1;
 }
 
-static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int stgi_interception(struct vcpu_svm *svm)
 {
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1920,7 +1931,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int clgi_interception(struct vcpu_svm *svm)
 {
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1937,10 +1948,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpga_interception(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
-	nsvm_printk("INVLPGA\n");
+
+	trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
+			  vcpu->arch.regs[VCPU_REGS_RAX]);
 
 	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
 	kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1963,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int invalid_op_interception(struct vcpu_svm *svm,
-				   struct kvm_run *kvm_run)
+static int skinit_interception(struct vcpu_svm *svm)
 {
+	trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+
 	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 	return 1;
 }
 
-static int task_switch_interception(struct vcpu_svm *svm,
-				    struct kvm_run *kvm_run)
+static int invalid_op_interception(struct vcpu_svm *svm)
+{
+	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+	return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm)
 {
 	u16 tss_selector;
 	int reason;
@@ -2008,14 +2027,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
 
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cpuid_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 	kvm_emulate_cpuid(&svm->vcpu);
 	return 1;
 }
 
-static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int iret_interception(struct vcpu_svm *svm)
 {
 	++svm->vcpu.stat.nmi_window_exits;
 	svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2023,26 +2042,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpg_interception(struct vcpu_svm *svm)
 {
-	if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
 		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
 	return 1;
 }
 
-static int emulate_on_interception(struct vcpu_svm *svm,
-				   struct kvm_run *kvm_run)
+static int emulate_on_interception(struct vcpu_svm *svm)
 {
-	if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
+	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
 		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
 	return 1;
 }
 
-static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cr8_write_interception(struct vcpu_svm *svm)
 {
+	struct kvm_run *kvm_run = svm->vcpu.run;
+
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
-	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+	emulate_instruction(&svm->vcpu, 0, 0, 0);
 	if (irqchip_in_kernel(svm->vcpu.kvm)) {
 		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
 		return 1;
@@ -2128,7 +2148,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	return 0;
 }
 
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int rdmsr_interception(struct vcpu_svm *svm)
 {
 	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
 	u64 data;
@@ -2221,7 +2241,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 	return 0;
 }
 
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int wrmsr_interception(struct vcpu_svm *svm)
 {
 	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
 	u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2237,17 +2257,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int msr_interception(struct vcpu_svm *svm)
 {
 	if (svm->vmcb->control.exit_info_1)
-		return wrmsr_interception(svm, kvm_run);
+		return wrmsr_interception(svm);
 	else
-		return rdmsr_interception(svm, kvm_run);
+		return rdmsr_interception(svm);
 }
 
-static int interrupt_window_interception(struct vcpu_svm *svm,
-					 struct kvm_run *kvm_run)
+static int interrupt_window_interception(struct vcpu_svm *svm)
 {
+	struct kvm_run *kvm_run = svm->vcpu.run;
+
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	/*
@@ -2265,8 +2286,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
 	return 1;
 }
 
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
-				  struct kvm_run *kvm_run) = {
+static int pause_interception(struct vcpu_svm *svm)
+{
+	kvm_vcpu_on_spin(&(svm->vcpu));
+	return 1;
+}
+
+static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR0]			= emulate_on_interception,
 	[SVM_EXIT_READ_CR3]			= emulate_on_interception,
 	[SVM_EXIT_READ_CR4]			= emulate_on_interception,
@@ -2301,6 +2327,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]				= iret_interception,
 	[SVM_EXIT_INVD]				= emulate_on_interception,
+	[SVM_EXIT_PAUSE]			= pause_interception,
 	[SVM_EXIT_HLT]				= halt_interception,
 	[SVM_EXIT_INVLPG]			= invlpg_interception,
 	[SVM_EXIT_INVLPGA]			= invlpga_interception,
@@ -2314,26 +2341,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_VMSAVE]			= vmsave_interception,
 	[SVM_EXIT_STGI]				= stgi_interception,
 	[SVM_EXIT_CLGI]				= clgi_interception,
-	[SVM_EXIT_SKINIT]			= invalid_op_interception,
+	[SVM_EXIT_SKINIT]			= skinit_interception,
 	[SVM_EXIT_WBINVD]			= emulate_on_interception,
 	[SVM_EXIT_MONITOR]			= invalid_op_interception,
 	[SVM_EXIT_MWAIT]			= invalid_op_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
 };
 
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
+	struct kvm_run *kvm_run = vcpu->run;
 	u32 exit_code = svm->vmcb->control.exit_code;
 
 	trace_kvm_exit(exit_code, svm->vmcb->save.rip);
 
+	if (unlikely(svm->nested.exit_required)) {
+		nested_svm_vmexit(svm);
+		svm->nested.exit_required = false;
+
+		return 1;
+	}
+
 	if (is_nested(svm)) {
 		int vmexit;
 
-		nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
-			    exit_code, svm->vmcb->control.exit_info_1,
-			    svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
+		trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+					svm->vmcb->control.exit_info_1,
+					svm->vmcb->control.exit_info_2,
+					svm->vmcb->control.exit_int_info,
+					svm->vmcb->control.exit_int_info_err);
 
 		vmexit = nested_svm_exit_special(svm);
@@ -2383,15 +2420,15 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	return svm_exit_handlers[exit_code](svm, kvm_run);
+	return svm_exit_handlers[exit_code](svm);
 }
 
 static void reload_tss(struct kvm_vcpu *vcpu)
 {
 	int cpu = raw_smp_processor_id();
 
-	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-	svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
+	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
 	load_TR_desc();
 }
 
@@ -2399,12 +2436,12 @@ static void pre_svm_run(struct vcpu_svm *svm)
 {
 	int cpu = raw_smp_processor_id();
 
-	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
+	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 	/* FIXME: handle wraparound of asid_generation */
-	if (svm->asid_generation != svm_data->asid_generation)
-		new_asid(svm, svm_data);
+	if (svm->asid_generation != sd->asid_generation)
+		new_asid(svm, sd);
 }
 
 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2460,20 +2497,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 		!(svm->vcpu.arch.hflags & HF_NMI_MASK);
 }
 
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (masked) {
+		svm->vcpu.arch.hflags |= HF_NMI_MASK;
+		svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+	} else {
+		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+		svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+	}
+}
+
 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
-	return (vmcb->save.rflags & X86_EFLAGS_IF) &&
-		!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		gif_set(svm) &&
-		!(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
+	int ret;
+
+	if (!gif_set(svm) ||
+	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+		return 0;
+
+	ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+
+	if (is_nested(svm))
+		return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
+
+	return ret;
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	nsvm_printk("Trying to open IRQ window\n");
 
 	nested_svm_intr(svm);
 
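The new get/set pair gives common code symmetric control over NMI blocking, and setting the mask also toggles the IRET intercept because IRET is what ends NMI blocking on SVM. A hedged sketch of a generic caller saving and restoring the state (the events structure here is an assumption for illustration):

	/* Sketch: migrate NMI masking through the new kvm_x86_ops hooks. */
	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);	/* save */
	/* ... */
	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);	/* restore */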
@@ -2498,7 +2562,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	/* Something prevents NMI from being injected. Single step over
 	   possible problem (IRET or exception injection or interrupt
 	   shadow) */
-	vcpu->arch.singlestep = true;
+	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 	update_db_intercept(vcpu);
 }
@@ -2588,13 +2652,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 #define R "e"
 #endif
 
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u16 fs_selector;
 	u16 gs_selector;
 	u16 ldt_selector;
 
+	/*
+	 * A vmexit emulation is required before the vcpu can be executed
+	 * again.
+	 */
+	if (unlikely(svm->nested.exit_required))
+		return;
+
 	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
 	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
 	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2893,6 +2964,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.queue_exception = svm_queue_exception,
 	.interrupt_allowed = svm_interrupt_allowed,
 	.nmi_allowed = svm_nmi_allowed,
+	.get_nmi_mask = svm_get_nmi_mask,
+	.set_nmi_mask = svm_set_nmi_mask,
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
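With kvm_run gone from every handler signature, exit dispatch reduces to a plain table lookup, and the few handlers that still need the run structure fetch it from the vcpu themselves. In sketch form (both lines are taken from elsewhere in this patch):

	/* Dispatch: */
	return svm_exit_handlers[exit_code](svm);

	/* Inside a handler that reports back to userspace: */
	struct kvm_run *kvm_run = svm->vcpu.run;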
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..816e0449db0b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -349,6 +349,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
 		  __entry->coalesced ? " (coalesced)" : "")
 );
 
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmrun,
+	    TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+		     __u32 event_inj, bool npt),
+	    TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	rip		)
+		__field(	__u64,	vmcb		)
+		__field(	__u64,	nested_rip	)
+		__field(	__u32,	int_ctl		)
+		__field(	__u32,	event_inj	)
+		__field(	bool,	npt		)
+	),
+
+	TP_fast_assign(
+		__entry->rip		= rip;
+		__entry->vmcb		= vmcb;
+		__entry->nested_rip	= nested_rip;
+		__entry->int_ctl	= int_ctl;
+		__entry->event_inj	= event_inj;
+		__entry->npt		= npt;
+	),
+
+	TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
+		  "event_inj: 0x%08x npt: %s\n",
+		  __entry->rip, __entry->vmcb, __entry->nested_rip,
+		  __entry->int_ctl, __entry->event_inj,
+		  __entry->npt ? "on" : "off")
+);
+
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT(kvm_nested_vmexit,
+	    TP_PROTO(__u64 rip, __u32 exit_code,
+		     __u64 exit_info1, __u64 exit_info2,
+		     __u32 exit_int_info, __u32 exit_int_info_err),
+	    TP_ARGS(rip, exit_code, exit_info1, exit_info2,
+		    exit_int_info, exit_int_info_err),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	rip			)
+		__field(	__u32,	exit_code		)
+		__field(	__u64,	exit_info1		)
+		__field(	__u64,	exit_info2		)
+		__field(	__u32,	exit_int_info		)
+		__field(	__u32,	exit_int_info_err	)
+	),
+
+	TP_fast_assign(
+		__entry->rip			= rip;
+		__entry->exit_code		= exit_code;
+		__entry->exit_info1		= exit_info1;
+		__entry->exit_info2		= exit_info2;
+		__entry->exit_int_info		= exit_int_info;
+		__entry->exit_int_info_err	= exit_int_info_err;
+	),
+	TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  __entry->rip,
+		  ftrace_print_symbols_seq(p, __entry->exit_code,
+					   kvm_x86_ops->exit_reasons_str),
+		  __entry->exit_info1, __entry->exit_info2,
+		  __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+	    TP_PROTO(__u32 exit_code,
+		     __u64 exit_info1, __u64 exit_info2,
+		     __u32 exit_int_info, __u32 exit_int_info_err),
+	    TP_ARGS(exit_code, exit_info1, exit_info2,
+		    exit_int_info, exit_int_info_err),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	exit_code		)
+		__field(	__u64,	exit_info1		)
+		__field(	__u64,	exit_info2		)
+		__field(	__u32,	exit_int_info		)
+		__field(	__u32,	exit_int_info_err	)
+	),
+
+	TP_fast_assign(
+		__entry->exit_code		= exit_code;
+		__entry->exit_info1		= exit_info1;
+		__entry->exit_info2		= exit_info2;
+		__entry->exit_int_info		= exit_int_info;
+		__entry->exit_int_info_err	= exit_int_info_err;
+	),
+
+	TP_printk("reason: %s ext_inf1: 0x%016llx "
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  ftrace_print_symbols_seq(p, __entry->exit_code,
+					   kvm_x86_ops->exit_reasons_str),
+		  __entry->exit_info1, __entry->exit_info2,
+		  __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+	    TP_PROTO(__u64 rip),
+	    TP_ARGS(rip),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	rip	)
+	),
+
+	TP_fast_assign(
+		__entry->rip	=	rip
+	),
+
+	TP_printk("rip: 0x%016llx\n", __entry->rip)
+);
+
+/*
+ * Tracepoint for the INVLPGA instruction
+ */
+TRACE_EVENT(kvm_invlpga,
+	    TP_PROTO(__u64 rip, int asid, u64 address),
+	    TP_ARGS(rip, asid, address),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	rip	)
+		__field(	int,	asid	)
+		__field(	__u64,	address	)
+	),
+
+	TP_fast_assign(
+		__entry->rip		= rip;
+		__entry->asid		= asid;
+		__entry->address	= address;
+	),
+
+	TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+		  __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for the SKINIT instruction
+ */
+TRACE_EVENT(kvm_skinit,
+	    TP_PROTO(__u64 rip, __u32 slb),
+	    TP_ARGS(rip, slb),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	rip	)
+		__field(	__u32,	slb	)
+	),
+
+	TP_fast_assign(
+		__entry->rip	= rip;
+		__entry->slb	= slb;
+	),
+
+	TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+		  __entry->rip, __entry->slb)
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
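Each TRACE_EVENT above generates a trace_<name>() inline that compiles to a no-op unless the event is enabled, plus an event under the kvm tracing directory. Usage from C is just a call at the interesting spot, as the svm.c hunks earlier in this patch do:

	/* Firing a tracepoint defined above (call site added in svm.c): */
	trace_kvm_invlpga(svm->vmcb->save.rip,
			  vcpu->arch.regs[VCPU_REGS_RCX],	/* asid */
			  vcpu->arch.regs[VCPU_REGS_RAX]);	/* address */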
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42caba1..d4918d6fc924 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,12 +61,37 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * ple_gap:    upper bound on the amount of time between two successive
+ *             executions of PAUSE in a loop. Also indicates whether ple is
+ *             enabled. According to test, this time is usually smaller than
+ *             41 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ *             in a PAUSE loop. Tests indicate that most spinlocks are held
+ *             for less than 2^12 cycles.
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer SDM volume 3b section 21.6.13 & 22.1.3.
+ */
+#define KVM_VMX_DEFAULT_PLE_GAP    41
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
+module_param(ple_gap, int, S_IRUGO);
+
+static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, int, S_IRUGO);
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
 	char data[0];
 };
 
+struct shared_msr_entry {
+	unsigned index;
+	u64 data;
+	u64 mask;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu vcpu;
 	struct list_head local_vcpus_link;
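The two parameters only matter once the secondary execution controls advertise Pause-Loop Exiting; they are then written into the corresponding VMCS fields during vcpu setup. A sketch of that step, assuming the PLE_GAP/PLE_WINDOW field names from the SDM (the actual wiring is in a part of this patch not shown here):

	/* Sketch: program the PLE controls when the CPU supports them. */
	if (ple_gap) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmcs_write32(PLE_WINDOW, ple_window);
	}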
@@ -74,13 +99,12 @@ struct vcpu_vmx {
 	int launched;
 	u8 fail;
 	u32 idt_vectoring_info;
-	struct kvm_msr_entry *guest_msrs;
-	struct kvm_msr_entry *host_msrs;
+	struct shared_msr_entry *guest_msrs;
 	int nmsrs;
 	int save_nmsrs;
-	int msr_offset_efer;
 #ifdef CONFIG_X86_64
-	int msr_offset_kernel_gs_base;
+	u64 msr_host_kernel_gs_base;
+	u64 msr_guest_kernel_gs_base;
 #endif
 	struct vmcs *vmcs;
 	struct {
@@ -88,7 +112,6 @@ struct vcpu_vmx {
 		u16 fs_sel, gs_sel, ldt_sel;
 		int gs_ldt_reload_needed;
 		int fs_reload_needed;
-		int guest_efer_loaded;
 	} host_state;
 	struct {
 		int vm86_active;
@@ -107,7 +130,6 @@ struct vcpu_vmx {
 	} rmode;
 	int vpid;
 	bool emulation_required;
-	enum emulation_result invalid_state_emulation_result;
 
 	/* Support for vnmi-less CPUs */
 	int soft_vnmi_blocked;
@@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field {
 	VMX_SEGMENT_FIELD(LDTR),
 };
 
+static u64 host_efer;
+
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
 /*
@@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
  */
 static const u32 vmx_msr_index[] = {
 #ifdef CONFIG_X86_64
-	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 #endif
 	MSR_EFER, MSR_K6_STAR,
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
-	int i;
-
-	for (i = 0; i < n; ++i)
-		wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
-	int i;
-
-	for (i = 0; i < n; ++i)
-		rdmsrl(e[i].index, e[i].data);
-}
-
 static inline int is_page_fault(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
 		SECONDARY_EXEC_UNRESTRICTED_GUEST;
 }
 
+static inline int cpu_has_vmx_ple(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return flexpriority_enabled &&
@@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 	int i;
 
 	for (i = 0; i < vmx->nmsrs; ++i)
-		if (vmx->guest_msrs[i].index == msr)
+		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
 			return i;
 	return -1;
 }
@@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 		      : : "a" (&operand), "c" (ext) : "cc", "memory");
 }
 
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
 
@@ -570,17 +584,12 @@ static void reload_tss(void)
 	load_TR_desc();
 }
 
-static void load_transition_efer(struct vcpu_vmx *vmx)
+static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 {
-	int efer_offset = vmx->msr_offset_efer;
-	u64 host_efer;
 	u64 guest_efer;
 	u64 ignore_bits;
 
-	if (efer_offset < 0)
-		return;
-	host_efer = vmx->host_msrs[efer_offset].data;
-	guest_efer = vmx->guest_msrs[efer_offset].data;
+	guest_efer = vmx->vcpu.arch.shadow_efer;
 
 	/*
 	 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
@@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
 	if (guest_efer & EFER_LMA)
 		ignore_bits &= ~(u64)EFER_SCE;
 #endif
-	if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
-		return;
-
-	vmx->host_state.guest_efer_loaded = 1;
 	guest_efer &= ~ignore_bits;
 	guest_efer |= host_efer & ignore_bits;
-	wrmsrl(MSR_EFER, guest_efer);
-	vmx->vcpu.stat.efer_reload++;
-}
-
-static void reload_host_efer(struct vcpu_vmx *vmx)
-{
-	if (vmx->host_state.guest_efer_loaded) {
-		vmx->host_state.guest_efer_loaded = 0;
-		load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
-	}
+	vmx->guest_msrs[efer_offset].data = guest_efer;
+	vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+	return true;
 }
 
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int i;
 
 	if (vmx->host_state.loaded)
 		return;
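A worked example of the ignore_bits logic: EFER.NX is emulated and, outside long mode, EFER.SCE is invisible to the guest, so those bits never force an MSR switch on their own. In long mode SCE is live, so it is removed from ignore_bits and a difference there does require switching EFER:

	/* Sketch with illustrative values (not from the patch):
	 *   host  EFER = SCE | LME | LMA | NX
	 *   guest EFER =       LME | LMA | NX	(guest cleared SCE)
	 * 32-bit guest: SCE is in ignore_bits, masked values match,
	 *               the guest runs on the host EFER, mask says "no write".
	 * long mode:    SCE matters, masked values differ, and the
	 *               shared-MSR code must actually write MSR_EFER.
	 */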
@@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 #endif
 
 #ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu))
-		save_msrs(vmx->host_msrs +
-			  vmx->msr_offset_kernel_gs_base, 1);
-
+	if (is_long_mode(&vmx->vcpu)) {
+		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+	}
 #endif
-	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-	load_transition_efer(vmx);
+	for (i = 0; i < vmx->save_nmsrs; ++i)
+		kvm_set_shared_msr(vmx->guest_msrs[i].index,
+				   vmx->guest_msrs[i].data,
+				   vmx->guest_msrs[i].mask);
 }
 
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 		local_irq_restore(flags);
 	}
 	reload_tss();
-	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-	reload_host_efer(vmx);
+#ifdef CONFIG_X86_64
+	if (is_long_mode(&vmx->vcpu)) {
+		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+	}
+#endif
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
-#ifdef CONFIG_X86_64
 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 {
-	struct kvm_msr_entry tmp;
+	struct shared_msr_entry tmp;
 
 	tmp = vmx->guest_msrs[to];
 	vmx->guest_msrs[to] = vmx->guest_msrs[from];
 	vmx->guest_msrs[from] = tmp;
-	tmp = vmx->host_msrs[to];
-	vmx->host_msrs[to] = vmx->host_msrs[from];
-	vmx->host_msrs[from] = tmp;
 }
-#endif
 
 /*
  * Set up the vmcs to automatically save and restore system
@@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
  */
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
-	int save_nmsrs;
+	int save_nmsrs, index;
 	unsigned long *msr_bitmap;
 
 	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu)) {
-		int index;
-
 		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
 		if (index >= 0)
 			move_msr_up(vmx, index, save_nmsrs++);
@@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		index = __find_msr_index(vmx, MSR_CSTAR);
 		if (index >= 0)
 			move_msr_up(vmx, index, save_nmsrs++);
-		index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
 		/*
 		 * MSR_K6_STAR is only needed on long mode guests, and only
 		 * if efer.sce is enabled.
@@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 			move_msr_up(vmx, index, save_nmsrs++);
 	}
 #endif
-	vmx->save_nmsrs = save_nmsrs;
+	index = __find_msr_index(vmx, MSR_EFER);
+	if (index >= 0 && update_transition_efer(vmx, index))
+		move_msr_up(vmx, index, save_nmsrs++);
 
-#ifdef CONFIG_X86_64
-	vmx->msr_offset_kernel_gs_base =
-		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
-	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+	vmx->save_nmsrs = save_nmsrs;
 
 	if (cpu_has_vmx_msr_bitmap()) {
 		if (is_long_mode(&vmx->vcpu))
@@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
 	u64 data;
-	struct kvm_msr_entry *msr;
+	struct shared_msr_entry *msr;
 
 	if (!pdata) {
 		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	case MSR_GS_BASE:
 		data = vmcs_readl(GUEST_GS_BASE);
 		break;
+	case MSR_KERNEL_GS_BASE:
+		vmx_load_host_state(to_vmx(vcpu));
+		data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+		break;
+#endif
 	case MSR_EFER:
 		return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
 	case MSR_IA32_TSC:
 		data = guest_read_tsc();
 		break;
@@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	default:
+		vmx_load_host_state(to_vmx(vcpu));
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1025static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1026{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1027 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1028 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1029 u64 host_tsc;
1033 int ret = 0; 1030 int ret = 0;
1034 1031
@@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1041 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1042 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1043 break;
1044 case MSR_KERNEL_GS_BASE:
1045 vmx_load_host_state(vmx);
1046 vmx->msr_guest_kernel_gs_base = data;
1047 break;
1047#endif 1048#endif
1048 case MSR_IA32_SYSENTER_CS: 1049 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1050 vmcs_write32(GUEST_SYSENTER_CS, data);
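The MSR_KERNEL_GS_BASE handling added above takes the MSR out of the VMCS autoload area: the guest value is cached in vmx->msr_guest_kernel_gs_base and the expensive rdmsr/wrmsr swap happens once per host/guest transition instead of on every context switch. Both accessors call vmx_load_host_state() first so the cached field is the authoritative copy while it is read or written. A hedged sketch of the swap itself (the exact field and helper names in the save/restore paths may differ):

        /* Sketch, assuming a msr_host_kernel_gs_base field alongside
         * the guest one; runs when switching the CPU to guest state. */
        static void load_guest_kernel_gs_base(struct vcpu_vmx *vmx)
        {
                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
        }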
@@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
         }
 }
 
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-        int old_debug = vcpu->guest_debug;
-        unsigned long flags;
-
-        vcpu->guest_debug = dbg->control;
-        if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
-                vcpu->guest_debug = 0;
-
         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
                 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
         else
                 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
 
-        flags = vmcs_readl(GUEST_RFLAGS);
-        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-                flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-        else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
-                flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-        vmcs_writel(GUEST_RFLAGS, flags);
-
         update_exception_bitmap(vcpu);
-
-        return 0;
 }
 
 static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void)
         /* locked but not enabled */
 }
 
-static void hardware_enable(void *garbage)
+static int hardware_enable(void *garbage)
 {
         int cpu = raw_smp_processor_id();
         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
         u64 old;
 
+        if (read_cr4() & X86_CR4_VMXE)
+                return -EBUSY;
+
         INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
         if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage)
         asm volatile (ASM_VMX_VMXON_RAX
                       : : "a"(&phys_addr), "m"(phys_addr)
                       : "memory", "cc");
+
+        ept_sync_global();
+
+        return 0;
 }
 
 static void vmclear_local_vcpus(void)
@@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                         SECONDARY_EXEC_WBINVD_EXITING |
                         SECONDARY_EXEC_ENABLE_VPID |
                         SECONDARY_EXEC_ENABLE_EPT |
-                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
+                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
+                        SECONDARY_EXEC_PAUSE_LOOP_EXITING;
                 if (adjust_vmx_controls(min2, opt2,
                                         MSR_IA32_VMX_PROCBASED_CTLS2,
                                         &_cpu_based_2nd_exec_control) < 0)
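A note on the two changes above: hardware_enable() now reports -EBUSY instead of silently proceeding when CR4.VMXE is already set (some other VMX user owns the CPU), and SECONDARY_EXEC_PAUSE_LOOP_EXITING is requested only as an optional control, so CPUs without PLE simply lose the bit. Conceptually, PLE hardware watches PAUSE-loop spinning; a sketch of the detection rule it implements (values in TSC cycles; ple_gap and ple_window are module parameters defined in an earlier part of this patch, not shown here):

        /* Sketch of the PLE trigger condition, not kernel code: an exit
         * fires when PAUSEs keep arriving within ple_gap cycles of each
         * other for longer than ple_window cycles in total. */
        static int ple_should_exit(u64 now, u64 last_pause, u64 window_start,
                                   u64 gap, u64 window)
        {
                if (now - last_pause > gap)
                        return 0;       /* loop broken; hardware resets window */
                return now - window_start > window;
        }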
@@ -1344,15 +1337,17 @@ static void free_kvm_area(void)
 {
         int cpu;
 
-        for_each_online_cpu(cpu)
+        for_each_possible_cpu(cpu) {
                 free_vmcs(per_cpu(vmxarea, cpu));
+                per_cpu(vmxarea, cpu) = NULL;
+        }
 }
 
 static __init int alloc_kvm_area(void)
 {
         int cpu;
 
-        for_each_online_cpu(cpu) {
+        for_each_possible_cpu(cpu) {
                 struct vmcs *vmcs;
 
                 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1389,9 @@ static __init int hardware_setup(void)
         if (enable_ept && !cpu_has_vmx_ept_2m_page())
                 kvm_disable_largepages();
 
+        if (!cpu_has_vmx_ple())
+                ple_gap = 0;
+
         return alloc_kvm_area();
 }
 
@@ -1536,8 +1534,16 @@ continue_rmode:
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
-        struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+        struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+        if (!msr)
+                return;
 
+        /*
+         * Force kernel_gs_base reloading before EFER changes, as control
+         * of this msr depends on is_long_mode().
+         */
+        vmx_load_host_state(to_vmx(vcpu));
         vcpu->arch.shadow_efer = efer;
         if (!msr)
                 return;
@@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                 vmcs_write64(EPT_POINTER, eptp);
                 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
                         vcpu->kvm->arch.ept_identity_map_addr;
+                ept_load_pdptrs(vcpu);
         }
 
         vmx_flush_tlb(vcpu);
@@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                         ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                 if (vmx->vpid == 0)
                         exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-                if (!enable_ept)
+                if (!enable_ept) {
                         exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+                        enable_unrestricted_guest = 0;
+                }
                 if (!enable_unrestricted_guest)
                         exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+                if (!ple_gap)
+                        exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
         }
 
+        if (ple_gap) {
+                vmcs_write32(PLE_GAP, ple_gap);
+                vmcs_write32(PLE_WINDOW, ple_window);
+        }
+
         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
@@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                 if (wrmsr_safe(index, data_low, data_high) < 0)
                         continue;
                 data = data_low | ((u64)data_high << 32);
-                vmx->host_msrs[j].index = index;
-                vmx->host_msrs[j].reserved = 0;
-                vmx->host_msrs[j].data = data;
-                vmx->guest_msrs[j] = vmx->host_msrs[j];
+                vmx->guest_msrs[j].index = i;
+                vmx->guest_msrs[j].data = 0;
+                vmx->guest_msrs[j].mask = -1ull;
                 ++vmx->nmsrs;
         }
 
@@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
         if (vmx->vpid != 0)
                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-        vmx->vcpu.arch.cr0 = 0x60000010;
+        vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
         vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
         vmx_set_cr4(&vmx->vcpu, 0);
         vmx_set_efer(&vmx->vcpu, 0);
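The CR0 reset-value rewrite above is purely cosmetic; the named bits OR together to the old magic number, which is the architectural post-RESET CR0:

        /* X86_CR0_ET = 1 << 4  = 0x00000010
         * X86_CR0_NW = 1 << 29 = 0x20000000
         * X86_CR0_CD = 1 << 30 = 0x40000000
         * so X86_CR0_NW | X86_CR0_CD | X86_CR0_ET == 0x60000010. */
        BUILD_BUG_ON((X86_CR0_NW | X86_CR0_CD | X86_CR0_ET) != 0x60000010);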
@@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
                         GUEST_INTR_STATE_NMI));
 }
 
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+        if (!cpu_has_virtual_nmis())
+                return to_vmx(vcpu)->soft_vnmi_blocked;
+        else
+                return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+                          GUEST_INTR_STATE_NMI);
+}
+
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        if (!cpu_has_virtual_nmis()) {
+                if (vmx->soft_vnmi_blocked != masked) {
+                        vmx->soft_vnmi_blocked = masked;
+                        vmx->vnmi_blocked_time = 0;
+                }
+        } else {
+                if (masked)
+                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                      GUEST_INTR_STATE_NMI);
+                else
+                        vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                        GUEST_INTR_STATE_NMI);
+        }
+}
+
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
         return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
          * Cause the #SS fault with 0 error code in VM86 mode.
          */
         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-                if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
+                if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
                         return 1;
         /*
          * Forward all other exceptions that are valid in real mode.
@@ -2710,15 +2753,16 @@ static void kvm_machine_check(void)
 #endif
 }
 
-static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_machine_check(struct kvm_vcpu *vcpu)
 {
         /* already handled by vcpu_run */
         return 1;
 }
 
-static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_exception(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
+        struct kvm_run *kvm_run = vcpu->run;
         u32 intr_info, ex_no, error_code;
         unsigned long cr2, rip, dr6;
         u32 vect_info;
@@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
         if (is_machine_check(intr_info))
-                return handle_machine_check(vcpu, kvm_run);
+                return handle_machine_check(vcpu);
 
         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-            !is_page_fault(intr_info))
-                printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
-                       "intr info 0x%x\n", __func__, vect_info, intr_info);
+            !is_page_fault(intr_info)) {
+                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+                vcpu->run->internal.ndata = 2;
+                vcpu->run->internal.data[0] = vect_info;
+                vcpu->run->internal.data[1] = intr_info;
+                return 0;
+        }
 
         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
                 return 1;  /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         }
 
         if (is_invalid_opcode(intr_info)) {
-                er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+                er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
                 if (er != EMULATE_DONE)
                         kvm_queue_exception(vcpu, UD_VECTOR);
                 return 1;
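handle_exception() now turns simultaneous-exception weirdness into a structured exit instead of a printk. A hedged userspace sketch of consuming the new KVM_EXIT_INTERNAL_ERROR report (run is the mmap'ed vcpu run structure; variable names are illustrative):

        struct kvm_run *run = vcpu_run_area;    /* from mmap(vcpu_fd) */

        if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR) {
                __u32 i;

                fprintf(stderr, "internal error, suberror %u\n",
                        run->internal.suberror);
                for (i = 0; i < run->internal.ndata; i++)
                        fprintf(stderr, "  data[%u] = 0x%llx\n", i,
                                (unsigned long long)run->internal.data[i]);
        }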
@@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 0;
 }
 
-static int handle_external_interrupt(struct kvm_vcpu *vcpu,
-                                     struct kvm_run *kvm_run)
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
 {
         ++vcpu->stat.irq_exits;
         return 1;
 }
 
-static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
 {
-        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
         return 0;
 }
 
-static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_io(struct kvm_vcpu *vcpu)
 {
         unsigned long exit_qualification;
         int size, in, string;
@@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         string = (exit_qualification & 16) != 0;
 
         if (string) {
-                if (emulate_instruction(vcpu,
-                                        kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+                if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
                         return 0;
                 return 1;
         }
@@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         port = exit_qualification >> 16;
 
         skip_emulated_instruction(vcpu);
-        return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
+        return kvm_emulate_pio(vcpu, in, size, port);
 }
 
 static void
@@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
         hypercall[2] = 0xc1;
 }
 
-static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_cr(struct kvm_vcpu *vcpu)
 {
         unsigned long exit_qualification, val;
         int cr;
@@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                                 return 1;
                         if (cr8_prev <= cr8)
                                 return 1;
-                        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+                        vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
                         return 0;
                 }
         };
@@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         default:
                 break;
         }
-        kvm_run->exit_reason = 0;
+        vcpu->run->exit_reason = 0;
         pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
                   (int)(exit_qualification >> 4) & 3, cr);
         return 0;
 }
 
-static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_dr(struct kvm_vcpu *vcpu)
 {
         unsigned long exit_qualification;
         unsigned long val;
@@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
          * guest debugging itself.
          */
         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
-                kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
-                kvm_run->debug.arch.dr7 = dr;
-                kvm_run->debug.arch.pc =
+                vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+                vcpu->run->debug.arch.dr7 = dr;
+                vcpu->run->debug.arch.pc =
                         vmcs_readl(GUEST_CS_BASE) +
                         vmcs_readl(GUEST_RIP);
-                kvm_run->debug.arch.exception = DB_VECTOR;
-                kvm_run->exit_reason = KVM_EXIT_DEBUG;
+                vcpu->run->debug.arch.exception = DB_VECTOR;
+                vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                 return 0;
         } else {
                 vcpu->arch.dr7 &= ~DR7_GD;
@@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 1;
 }
 
-static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_cpuid(struct kvm_vcpu *vcpu)
 {
         kvm_emulate_cpuid(vcpu);
         return 1;
 }
 
-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_rdmsr(struct kvm_vcpu *vcpu)
 {
         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
         u64 data;
@@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 1;
 }
 
-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
         u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
@@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 1;
 }
 
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
-                                      struct kvm_run *kvm_run)
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 {
         return 1;
 }
 
-static int handle_interrupt_window(struct kvm_vcpu *vcpu,
-                                   struct kvm_run *kvm_run)
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 {
         u32 cpu_based_vm_exec_control;
 
@@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
          * possible
          */
         if (!irqchip_in_kernel(vcpu->kvm) &&
-            kvm_run->request_interrupt_window &&
+            vcpu->run->request_interrupt_window &&
             !kvm_cpu_has_interrupt(vcpu)) {
-                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+                vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
                 return 0;
         }
         return 1;
 }
 
-static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_halt(struct kvm_vcpu *vcpu)
 {
         skip_emulated_instruction(vcpu);
         return kvm_emulate_halt(vcpu);
 }
 
-static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
         skip_emulated_instruction(vcpu);
         kvm_emulate_hypercall(vcpu);
         return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
         kvm_queue_exception(vcpu, UD_VECTOR);
         return 1;
 }
 
-static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
@@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 1;
 }
 
-static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
         skip_emulated_instruction(vcpu);
         /* TODO: Add support for VT-d/pass-through device */
         return 1;
 }
 
-static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
         unsigned long exit_qualification;
         enum emulation_result er;
@@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
         offset = exit_qualification & 0xffful;
 
-        er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+        er = emulate_instruction(vcpu, 0, 0, 0);
 
         if (er != EMULATE_DONE) {
                 printk(KERN_ERR
@@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 1;
 }
 
-static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_task_switch(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         unsigned long exit_qualification;
@@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 1;
 }
 
-static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
         unsigned long exit_qualification;
         gpa_t gpa;
@@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                         vmcs_readl(GUEST_LINEAR_ADDRESS));
                 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
                         (long unsigned int)exit_qualification);
-                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-                kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+                vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
                 return 0;
         }
 
@@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
         }
 }
 
-static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
         u64 sptes[4];
         int nr_sptes, i;
@@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
                 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
 
-        kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-        kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+        vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+        vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
 
         return 0;
 }
 
-static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
         u32 cpu_based_vm_exec_control;
 
@@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         return 1;
 }
 
-static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
-                                       struct kvm_run *kvm_run)
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         enum emulation_result err = EMULATE_DONE;
-
-        local_irq_enable();
-        preempt_enable();
+        int ret = 1;
 
         while (!guest_state_valid(vcpu)) {
-                err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+                err = emulate_instruction(vcpu, 0, 0, 0);
 
-                if (err == EMULATE_DO_MMIO)
-                        break;
+                if (err == EMULATE_DO_MMIO) {
+                        ret = 0;
+                        goto out;
+                }
 
                 if (err != EMULATE_DONE) {
                         kvm_report_emulation_failure(vcpu, "emulation failure");
-                        break;
+                        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                        vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+                        vcpu->run->internal.ndata = 0;
+                        ret = 0;
+                        goto out;
                 }
 
                 if (signal_pending(current))
-                        break;
+                        goto out;
                 if (need_resched())
                         schedule();
         }
 
-        preempt_disable();
-        local_irq_disable();
+        vmx->emulation_required = 0;
+out:
+        return ret;
+}
 
-        vmx->invalid_state_emulation_result = err;
+/*
+ * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
+ * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+        skip_emulated_instruction(vcpu);
+        kvm_vcpu_on_spin(vcpu);
+
+        return 1;
 }
 
 /*
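handle_pause() addresses lock-holder preemption: a vcpu spinning on a contended lock burns its whole time slice while the lock holder is descheduled, so a PLE exit gives the CPU back. At this stage of the series kvm_vcpu_on_spin() is, to the best of my reading, a short sleep that hopes the holder gets scheduled; roughly (a sketch, not the exact virt/kvm/kvm_main.c body):

        /* Sketch: back off ~100us so a preempted sibling vcpu can run. */
        void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
        {
                ktime_t expires = ktime_add_ns(ktime_get(), 100000UL);
                DEFINE_WAIT(wait);

                prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
                schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
                finish_wait(&vcpu->wq, &wait);
        }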
@@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
  * may resume. Otherwise they set the kvm_run parameter to indicate what needs
  * to be done to userspace and return 0.
  */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
-                                      struct kvm_run *kvm_run) = {
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
@@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
+        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers =
  * The guest has exited. See if we can fix it or if we need userspace
  * assistance.
  */
-static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
         trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
 
-        /* If we need to emulate an MMIO from handle_invalid_guest_state
-         * we just return 0 */
-        if (vmx->emulation_required && emulate_invalid_guest_state) {
-                if (guest_state_valid(vcpu))
-                        vmx->emulation_required = 0;
-                return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
-        }
+        /* If guest state is invalid, start emulating */
+        if (vmx->emulation_required && emulate_invalid_guest_state)
+                return handle_invalid_guest_state(vcpu);
 
         /* Access CR3 don't cause VMExit in paging mode, so we need
          * to sync with guest real CR3. */
@@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 
         if (unlikely(vmx->fail)) {
-                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-                kvm_run->fail_entry.hardware_entry_failure_reason
+                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+                vcpu->run->fail_entry.hardware_entry_failure_reason
                         = vmcs_read32(VM_INSTRUCTION_ERROR);
                 return 0;
         }
@@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
         if (exit_reason < kvm_vmx_max_exit_handlers
             && kvm_vmx_exit_handlers[exit_reason])
-                return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
+                return kvm_vmx_exit_handlers[exit_reason](vcpu);
         else {
-                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-                kvm_run->hw.hardware_exit_reason = exit_reason;
+                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+                vcpu->run->hw.hardware_exit_reason = exit_reason;
         }
         return 0;
 }
@@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 #define Q "l"
 #endif
 
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-        if (enable_ept && is_paging(vcpu)) {
-                vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
-                ept_load_pdptrs(vcpu);
-        }
         /* Record the guest's net vcpu time for enforced NMI injections. */
         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
                 vmx->entry_time = ktime_get();
 
-        /* Handle invalid guest state instead of entering VMX */
-        if (vmx->emulation_required && emulate_invalid_guest_state) {
-                handle_invalid_guest_state(vcpu, kvm_run);
+        /* Don't enter VMX if guest state is invalid, let the exit handler
+           start emulation until we arrive back to a valid state */
+        if (vmx->emulation_required && emulate_invalid_guest_state)
                 return;
-        }
 
         if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
         __clear_bit(vmx->vpid, vmx_vpid_bitmap);
         spin_unlock(&vmx_vpid_lock);
         vmx_free_vmcs(vcpu);
-        kfree(vmx->host_msrs);
         kfree(vmx->guest_msrs);
         kvm_vcpu_uninit(vcpu);
         kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                 goto uninit_vcpu;
         }
 
-        vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-        if (!vmx->host_msrs)
-                goto free_guest_msrs;
-
         vmx->vmcs = alloc_vmcs();
         if (!vmx->vmcs)
                 goto free_msrs;
@@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 free_vmcs:
         free_vmcs(vmx->vmcs);
 free_msrs:
-        kfree(vmx->host_msrs);
-free_guest_msrs:
         kfree(vmx->guest_msrs);
 uninit_vcpu:
         kvm_vcpu_uninit(&vmx->vcpu);
@@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
         .queue_exception = vmx_queue_exception,
         .interrupt_allowed = vmx_interrupt_allowed,
         .nmi_allowed = vmx_nmi_allowed,
+        .get_nmi_mask = vmx_get_nmi_mask,
+        .set_nmi_mask = vmx_set_nmi_mask,
         .enable_nmi_window = enable_nmi_window,
         .enable_irq_window = enable_irq_window,
         .update_cr8_intercept = update_cr8_intercept,
@@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-        int r;
+        int r, i;
+
+        rdmsrl_safe(MSR_EFER, &host_efer);
+
+        for (i = 0; i < NR_VMX_MSR; ++i)
+                kvm_define_shared_msr(i, vmx_msr_index[i]);
 
         vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
         if (!vmx_io_bitmap_a)
@@ -4049,8 +4099,6 @@ static int __init vmx_init(void)
         if (bypass_guest_pf)
                 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
-        ept_sync_global();
-
         return 0;
 
 out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9b9695322f56..a1e1bc9d412d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,13 @@
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 #include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/kvm.h>
 #undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+#include <asm/debugreg.h>
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
@@ -87,6 +89,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+        int nr;
+        struct kvm_shared_msr {
+                u32 msr;
+                u64 value;
+        } msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+        struct user_return_notifier urn;
+        bool registered;
+        u64 current_value[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "pf_fixed", VCPU_STAT(pf_fixed) },
         { "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +144,72 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { NULL }
 };
 
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+        unsigned slot;
+        struct kvm_shared_msr *global;
+        struct kvm_shared_msrs *locals
+                = container_of(urn, struct kvm_shared_msrs, urn);
+
+        for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+                global = &shared_msrs_global.msrs[slot];
+                if (global->value != locals->current_value[slot]) {
+                        wrmsrl(global->msr, global->value);
+                        locals->current_value[slot] = global->value;
+                }
+        }
+        locals->registered = false;
+        user_return_notifier_unregister(urn);
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+        int cpu;
+        u64 value;
+
+        if (slot >= shared_msrs_global.nr)
+                shared_msrs_global.nr = slot + 1;
+        shared_msrs_global.msrs[slot].msr = msr;
+        rdmsrl_safe(msr, &value);
+        shared_msrs_global.msrs[slot].value = value;
+        for_each_online_cpu(cpu)
+                per_cpu(shared_msrs, cpu).current_value[slot] = value;
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+        unsigned i;
+        struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
+
+        for (i = 0; i < shared_msrs_global.nr; ++i)
+                locals->current_value[i] = shared_msrs_global.msrs[i].value;
+}
+
+void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+{
+        struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+        if (((value ^ smsr->current_value[slot]) & mask) == 0)
+                return;
+        smsr->current_value[slot] = value;
+        wrmsrl(shared_msrs_global.msrs[slot].msr, value);
+        if (!smsr->registered) {
+                smsr->urn.on_user_return = kvm_on_user_return;
+                user_return_notifier_register(&smsr->urn);
+                smsr->registered = true;
+        }
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
+static void drop_user_return_notifiers(void *ignore)
+{
+        struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+        if (smsr->registered)
+                kvm_on_user_return(&smsr->urn);
+}
+
 unsigned long segment_base(u16 selector)
 {
         struct descriptor_table gdt;
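The block above is the core of the patch: instead of unconditionally restoring host MSRs on every heavyweight vcpu_put(), the guest value is written eagerly and the host value is put back at most once, when the thread actually returns to userspace, via the new user-return-notifier hook. The pattern, as a self-contained sketch with plain memory standing in for wrmsrl() (hypothetical names):

        /* Deferred-restore pattern behind kvm_set_shared_msr() (sketch). */
        struct shadow {
                unsigned long long host;     /* shared_msrs_global value */
                unsigned long long current;  /* per-cpu copy of the MSR  */
                int armed;                   /* notifier registered?     */
        };

        static void set_guest(struct shadow *s, unsigned long long guest,
                              unsigned long long mask)
        {
                if (((guest ^ s->current) & mask) == 0)
                        return;          /* relevant bits already match */
                s->current = guest;      /* stands in for wrmsrl()      */
                s->armed = 1;            /* user_return_notifier_register() */
        }

        static void on_user_return(struct shadow *s)
        {
                if (s->current != s->host)
                        s->current = s->host;   /* restore host value once */
                s->armed = 0;
        }

A guest-to-guest switch between vcpus that want the same MSR values thus costs no MSR writes at all.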
@@ -484,16 +571,19 @@ static inline u32 bit(int bitno)
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  *
  * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
+ * capabilities of the host cpu. This capabilities test skips MSRs that are
+ * kvm-specific. Those are put in the beginning of the list.
  */
+
+#define KVM_SAVE_MSRS_BEGIN     2
 static u32 msrs_to_save[] = {
+        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
         MSR_K6_STAR,
 #ifdef CONFIG_X86_64
         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-        MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-        MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+        MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
@@ -580,7 +670,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
         static int version;
         struct pvclock_wall_clock wc;
-        struct timespec now, sys, boot;
+        struct timespec boot;
 
         if (!wall_clock)
                 return;
@@ -595,9 +685,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
          * wall clock specified here. guest system time equals host
          * system time for us, thus we must fill in host boot time here.
          */
-        now = current_kernel_time();
-        ktime_get_ts(&sys);
-        boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+        getboottime(&boot);
 
         wc.sec = boot.tv_sec;
         wc.nsec = boot.tv_nsec;
@@ -672,12 +760,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
         local_irq_save(flags);
         kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
         ktime_get_ts(&ts);
+        monotonic_to_bootbased(&ts);
         local_irq_restore(flags);
 
         /* With all the info we got, fill in the values */
 
         vcpu->hv_clock.system_time = ts.tv_nsec +
-                                     (NSEC_PER_SEC * (u64)ts.tv_sec);
+                                     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+
         /*
          * The interface expects us to write an even number signaling that the
          * update is finished. Since the guest won't see the intermediate
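The timekeeping changes above replace an open-coded boot-time computation with the timekeeping core's own bookkeeping:

        /* What the old code computed, as arithmetic:
         *   now  = current_kernel_time();   (wall clock)
         *   sys  = ktime_get_ts();          (monotonic clock)
         *   boot = now - sys;               (wall time at boot)
         * getboottime(&boot) returns this same quantity directly.
         * monotonic_to_bootbased(&ts) then adds the accumulated suspend
         * time to ts, and kvmclock_offset (new in this patch) lets
         * userspace shift the guest clock; see KVM_SET_CLOCK below. */

The net effect is that kvmclock accounts for host suspend instead of silently losing that time.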
@@ -835,6 +925,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
         return 0;
 }
 
+static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+{
+        struct kvm *kvm = vcpu->kvm;
+        int lm = is_long_mode(vcpu);
+        u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+                : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+        u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+                : kvm->arch.xen_hvm_config.blob_size_32;
+        u32 page_num = data & ~PAGE_MASK;
+        u64 page_addr = data & PAGE_MASK;
+        u8 *page;
+        int r;
+
+        r = -E2BIG;
+        if (page_num >= blob_size)
+                goto out;
+        r = -ENOMEM;
+        page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+        if (!page)
+                goto out;
+        r = -EFAULT;
+        if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
+                goto out_free;
+        if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+                goto out_free;
+        r = 0;
+out_free:
+        kfree(page);
+out:
+        return r;
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
         switch (msr) {
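xen_hvm_config() implements the Xen HVM hypercall-page protocol on top of a userspace-configured MSR (see KVM_XEN_HVM_CONFIG below): the value the guest writes encodes both the destination and which page of the userspace-supplied blob it wants, and the host copies that page of machine code into guest memory. A hedged sketch of the guest-side write, assuming the usual Xen HVM encoding:

        /* Guest asks for page 'i' of the hypercall blob to be copied to
         * the page-aligned guest physical address 'gpa' (sketch). */
        wrmsrl(xen_hvm_config_msr, (gpa & PAGE_MASK) | i);

Note the host-side bounds check above: page_num is taken from the low bits and must be below blob_size before anything is copied.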
@@ -950,6 +1072,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
950 "0x%x data 0x%llx\n", msr, data); 1072 "0x%x data 0x%llx\n", msr, data);
951 break; 1073 break;
952 default: 1074 default:
1075 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1076 return xen_hvm_config(vcpu, data);
953 if (!ignore_msrs) { 1077 if (!ignore_msrs) {
954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1078 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
955 msr, data); 1079 msr, data);
@@ -1224,6 +1348,9 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_PIT2:
         case KVM_CAP_PIT_STATE2:
         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+        case KVM_CAP_XEN_HVM:
+        case KVM_CAP_ADJUST_CLOCK:
+        case KVM_CAP_VCPU_EVENTS:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
@@ -1238,8 +1365,8 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_NR_MEMSLOTS:
                 r = KVM_MEMORY_SLOTS;
                 break;
-        case KVM_CAP_PV_MMU:
-                r = !tdp_enabled;
+        case KVM_CAP_PV_MMU:    /* obsolete */
+                r = 0;
                 break;
         case KVM_CAP_IOMMU:
                 r = iommu_found();
@@ -1326,6 +1453,12 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
         kvm_x86_ops->vcpu_load(vcpu, cpu);
+        if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
+                unsigned long khz = cpufreq_quick_get(cpu);
+                if (!khz)
+                        khz = tsc_khz;
+                per_cpu(cpu_tsc_khz, cpu) = khz;
+        }
         kvm_request_guest_time_update(vcpu);
 }
 
@@ -1692,7 +1825,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
         unsigned bank_num = mcg_cap & 0xff, bank;
 
         r = -EINVAL;
-        if (!bank_num)
+        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
                 goto out;
         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
                 goto out;
@@ -1759,6 +1892,65 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
         return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+                                               struct kvm_vcpu_events *events)
+{
+        vcpu_load(vcpu);
+
+        events->exception.injected = vcpu->arch.exception.pending;
+        events->exception.nr = vcpu->arch.exception.nr;
+        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+        events->exception.error_code = vcpu->arch.exception.error_code;
+
+        events->interrupt.injected = vcpu->arch.interrupt.pending;
+        events->interrupt.nr = vcpu->arch.interrupt.nr;
+        events->interrupt.soft = vcpu->arch.interrupt.soft;
+
+        events->nmi.injected = vcpu->arch.nmi_injected;
+        events->nmi.pending = vcpu->arch.nmi_pending;
+        events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+
+        events->sipi_vector = vcpu->arch.sipi_vector;
+
+        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
+                         | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+
+        vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+                                              struct kvm_vcpu_events *events)
+{
+        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
+                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+                return -EINVAL;
+
+        vcpu_load(vcpu);
+
+        vcpu->arch.exception.pending = events->exception.injected;
+        vcpu->arch.exception.nr = events->exception.nr;
+        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+        vcpu->arch.exception.error_code = events->exception.error_code;
+
+        vcpu->arch.interrupt.pending = events->interrupt.injected;
+        vcpu->arch.interrupt.nr = events->interrupt.nr;
+        vcpu->arch.interrupt.soft = events->interrupt.soft;
+        if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
+                kvm_pic_clear_isr_ack(vcpu->kvm);
+
+        vcpu->arch.nmi_injected = events->nmi.injected;
+        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
+                vcpu->arch.nmi_pending = events->nmi.pending;
+        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+
+        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
+                vcpu->arch.sipi_vector = events->sipi_vector;
+
+        vcpu_put(vcpu);
+
+        return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
 {
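These two helpers back the new KVM_GET/SET_VCPU_EVENTS ioctls wired up below; together with the vmx_get/set_nmi_mask hooks earlier in this diff, they let userspace (e.g. for live migration) capture and restore pending exceptions, interrupts, NMI state and the SIPI vector. A hedged userspace usage sketch (vcpu_fd from KVM_CREATE_VCPU; error handling abbreviated):

        struct kvm_vcpu_events ev;

        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev) < 0)
                err(1, "KVM_GET_VCPU_EVENTS");
        /* ... transport 'ev' to the destination ... */
        ev.flags = KVM_VCPUEVENT_VALID_NMI_PENDING
                 | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
        if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev) < 0)
                err(1, "KVM_SET_VCPU_EVENTS");

On the set side only the two flag bits shown are accepted in this version; they gate whether nmi.pending and sipi_vector are applied.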
@@ -1769,6 +1961,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 
         switch (ioctl) {
         case KVM_GET_LAPIC: {
+                r = -EINVAL;
+                if (!vcpu->arch.apic)
+                        goto out;
                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
                 r = -ENOMEM;
@@ -1784,6 +1979,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                 break;
         }
         case KVM_SET_LAPIC: {
+                r = -EINVAL;
+                if (!vcpu->arch.apic)
+                        goto out;
                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
                 r = -ENOMEM;
                 if (!lapic)
@@ -1910,6 +2108,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
                 break;
         }
+        case KVM_GET_VCPU_EVENTS: {
+                struct kvm_vcpu_events events;
+
+                kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+                r = -EFAULT;
+                if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+                        break;
+                r = 0;
+                break;
+        }
+        case KVM_SET_VCPU_EVENTS: {
+                struct kvm_vcpu_events events;
+
+                r = -EFAULT;
+                if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+                        break;
+
+                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+                break;
+        }
         default:
                 r = -EINVAL;
         }
@@ -2038,9 +2257,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
                         sizeof(struct kvm_pic_state));
                 break;
         case KVM_IRQCHIP_IOAPIC:
-                memcpy(&chip->chip.ioapic,
-                        ioapic_irqchip(kvm),
-                        sizeof(struct kvm_ioapic_state));
+                r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
                 break;
         default:
                 r = -EINVAL;
@@ -2070,11 +2287,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
                 spin_unlock(&pic_irqchip(kvm)->lock);
                 break;
         case KVM_IRQCHIP_IOAPIC:
-                mutex_lock(&kvm->irq_lock);
-                memcpy(ioapic_irqchip(kvm),
-                        &chip->chip.ioapic,
-                        sizeof(struct kvm_ioapic_state));
-                mutex_unlock(&kvm->irq_lock);
+                r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
                 break;
         default:
                 r = -EINVAL;
@@ -2182,7 +2395,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 {
         struct kvm *kvm = filp->private_data;
         void __user *argp = (void __user *)arg;
-        int r = -EINVAL;
+        int r = -ENOTTY;
         /*
          * This union makes it completely explicit to gcc-3.x
          * that these two variables' stack usage should be
@@ -2244,25 +2457,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
                 if (r)
                         goto out;
                 break;
-        case KVM_CREATE_IRQCHIP:
+        case KVM_CREATE_IRQCHIP: {
+                struct kvm_pic *vpic;
+
+                mutex_lock(&kvm->lock);
+                r = -EEXIST;
+                if (kvm->arch.vpic)
+                        goto create_irqchip_unlock;
                 r = -ENOMEM;
-                kvm->arch.vpic = kvm_create_pic(kvm);
-                if (kvm->arch.vpic) {
+                vpic = kvm_create_pic(kvm);
+                if (vpic) {
                         r = kvm_ioapic_init(kvm);
                         if (r) {
-                                kfree(kvm->arch.vpic);
-                                kvm->arch.vpic = NULL;
-                                goto out;
+                                kfree(vpic);
+                                goto create_irqchip_unlock;
                         }
                 } else
-                        goto out;
+                        goto create_irqchip_unlock;
+                smp_wmb();
+                kvm->arch.vpic = vpic;
+                smp_wmb();
                 r = kvm_setup_default_irq_routing(kvm);
                 if (r) {
+                        mutex_lock(&kvm->irq_lock);
                         kfree(kvm->arch.vpic);
                         kfree(kvm->arch.vioapic);
-                        goto out;
+                        kvm->arch.vpic = NULL;
+                        kvm->arch.vioapic = NULL;
+                        mutex_unlock(&kvm->irq_lock);
                 }
+        create_irqchip_unlock:
+                mutex_unlock(&kvm->lock);
                 break;
+        }
         case KVM_CREATE_PIT:
                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
                 goto create_pit;
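The KVM_CREATE_IRQCHIP rework above closes a race: two concurrent ioctls could both allocate a PIC, and readers test kvm->arch.vpic locklessly through irqchip_in_kernel(). Creation is therefore serialized on kvm->lock and the pointer is published only after the PIC and IOAPIC are fully initialized:

        /* Publication ordering (sketch of the barriers' intent): */
        smp_wmb();              /* init stores become visible first       */
        kvm->arch.vpic = vpic;  /* lockless readers may see it from here  */
        smp_wmb();

A lockless reader that observes a non-NULL vpic is then guaranteed to observe an initialized one.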
@@ -2292,10 +2519,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
                         goto out;
                 if (irqchip_in_kernel(kvm)) {
                         __s32 status;
-                        mutex_lock(&kvm->irq_lock);
                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                         irq_event.irq, irq_event.level);
-                        mutex_unlock(&kvm->irq_lock);
                         if (ioctl == KVM_IRQ_LINE_STATUS) {
                                 irq_event.status = status;
                                 if (copy_to_user(argp, &irq_event,
@@ -2421,6 +2646,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
                 r = 0;
                 break;
         }
+        case KVM_XEN_HVM_CONFIG: {
+                r = -EFAULT;
+                if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
+                                   sizeof(struct kvm_xen_hvm_config)))
+                        goto out;
+                r = -EINVAL;
+                if (kvm->arch.xen_hvm_config.flags)
+                        goto out;
+                r = 0;
+                break;
+        }
+        case KVM_SET_CLOCK: {
+                struct timespec now;
+                struct kvm_clock_data user_ns;
+                u64 now_ns;
+                s64 delta;
+
+                r = -EFAULT;
+                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+                        goto out;
+
+                r = -EINVAL;
+                if (user_ns.flags)
+                        goto out;
+
+                r = 0;
+                ktime_get_ts(&now);
+                now_ns = timespec_to_ns(&now);
+                delta = user_ns.clock - now_ns;
+                kvm->arch.kvmclock_offset = delta;
+                break;
+        }
+        case KVM_GET_CLOCK: {
+                struct timespec now;
+                struct kvm_clock_data user_ns;
+                u64 now_ns;
+
+                ktime_get_ts(&now);
+                now_ns = timespec_to_ns(&now);
+                user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+                user_ns.flags = 0;
+
+                r = -EFAULT;
+                if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+                        goto out;
+                r = 0;
+                break;
+        }
+
         default:
                 ;
         }
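KVM_SET_CLOCK does not program a clock directly; as the code above shows, it stores only the signed delta between the requested kvmclock value and the host's current monotonic nanoseconds, and kvm_write_guest_time() adds that kvmclock_offset back in. A hedged userspace sketch, advancing the guest clock by five seconds (vm_fd from KVM_CREATE_VM):

        struct kvm_clock_data cd;

        if (ioctl(vm_fd, KVM_GET_CLOCK, &cd) < 0)
                err(1, "KVM_GET_CLOCK");
        cd.clock += 5ULL * 1000000000ULL;       /* nanoseconds */
        cd.flags = 0;                           /* must be zero */
        if (ioctl(vm_fd, KVM_SET_CLOCK, &cd) < 0)
                err(1, "KVM_SET_CLOCK");

Because only the offset is stored, the guest clock keeps ticking with the host between the two calls.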
@@ -2433,7 +2707,8 @@ static void kvm_init_msr_list(void)
         u32 dummy[2];
         unsigned i, j;
 
-        for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+        /* skip the first msrs in the list. KVM-specific */
+        for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
                         continue;
                 if (j < i)
@@ -2757,13 +3032,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
 }
 
 int emulate_instruction(struct kvm_vcpu *vcpu,
-                        struct kvm_run *run,
                         unsigned long cr2,
                         u16 error_code,
                         int emulation_type)
 {
         int r, shadow_mask;
         struct decode_cache *c;
+        struct kvm_run *run = vcpu->run;
 
         kvm_clear_exception_queue(vcpu);
         vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2783,7 +3058,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2783 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3058 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2784 3059
2785 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3060 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2786 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3061 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2787 vcpu->arch.emulate_ctxt.mode = 3062 vcpu->arch.emulate_ctxt.mode =
2788 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3063 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2789 ? X86EMUL_MODE_REAL : cs_l 3064 ? X86EMUL_MODE_REAL : cs_l
@@ -2861,7 +3136,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2861 return EMULATE_DO_MMIO; 3136 return EMULATE_DO_MMIO;
2862 } 3137 }
2863 3138
2864 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3139 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2865 3140
2866 if (vcpu->mmio_is_write) { 3141 if (vcpu->mmio_is_write) {
2867 vcpu->mmio_needed = 0; 3142 vcpu->mmio_needed = 0;
@@ -2969,8 +3244,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2969 return r; 3244 return r;
2970} 3245}
2971 3246
2972int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3247int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2973 int size, unsigned port)
2974{ 3248{
2975 unsigned long val; 3249 unsigned long val;
2976 3250
@@ -2999,7 +3273,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2999} 3273}
3000EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3274EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3001 3275
3002int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3276int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3003 int size, unsigned long count, int down, 3277 int size, unsigned long count, int down,
3004 gva_t address, int rep, unsigned port) 3278 gva_t address, int rep, unsigned port)
3005{ 3279{
@@ -3072,9 +3346,6 @@ static void bounce_off(void *info)
3072 /* nothing */ 3346 /* nothing */
3073} 3347}
3074 3348
3075static unsigned int ref_freq;
3076static unsigned long tsc_khz_ref;
3077
3078static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3349static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3079 void *data) 3350 void *data)
3080{ 3351{
@@ -3083,14 +3354,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3083 struct kvm_vcpu *vcpu; 3354 struct kvm_vcpu *vcpu;
3084 int i, send_ipi = 0; 3355 int i, send_ipi = 0;
3085 3356
3086 if (!ref_freq)
3087 ref_freq = freq->old;
3088
3089 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3357 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3090 return 0; 3358 return 0;
3091 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3359 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3092 return 0; 3360 return 0;
3093 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3361 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3094 3362
3095 spin_lock(&kvm_lock); 3363 spin_lock(&kvm_lock);
3096 list_for_each_entry(kvm, &vm_list, vm_list) { 3364 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3127,9 +3395,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3127 .notifier_call = kvmclock_cpufreq_notifier 3395 .notifier_call = kvmclock_cpufreq_notifier
3128}; 3396};
3129 3397
3398static void kvm_timer_init(void)
3399{
3400 int cpu;
3401
3402 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3403 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3404 CPUFREQ_TRANSITION_NOTIFIER);
3405 for_each_online_cpu(cpu) {
3406 unsigned long khz = cpufreq_get(cpu);
3407 if (!khz)
3408 khz = tsc_khz;
3409 per_cpu(cpu_tsc_khz, cpu) = khz;
3410 }
3411 } else {
3412 for_each_possible_cpu(cpu)
3413 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3414 }
3415}
3416
3130int kvm_arch_init(void *opaque) 3417int kvm_arch_init(void *opaque)
3131{ 3418{
3132 int r, cpu; 3419 int r;
3133 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3420 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3134 3421
3135 if (kvm_x86_ops) { 3422 if (kvm_x86_ops) {
@@ -3161,13 +3448,7 @@ int kvm_arch_init(void *opaque)
3161 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3448 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3162 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3449 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3163 3450
3164 for_each_possible_cpu(cpu) 3451 kvm_timer_init();
3165 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3166 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3167 tsc_khz_ref = tsc_khz;
3168 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3169 CPUFREQ_TRANSITION_NOTIFIER);
3170 }
3171 3452
3172 return 0; 3453 return 0;
3173 3454
@@ -3295,7 +3576,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3295 unsigned long *rflags) 3576 unsigned long *rflags)
3296{ 3577{
3297 kvm_lmsw(vcpu, msw); 3578 kvm_lmsw(vcpu, msw);
3298 *rflags = kvm_x86_ops->get_rflags(vcpu); 3579 *rflags = kvm_get_rflags(vcpu);
3299} 3580}
3300 3581
3301unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3582unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
@@ -3333,7 +3614,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3333 switch (cr) { 3614 switch (cr) {
3334 case 0: 3615 case 0:
3335 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3616 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3336 *rflags = kvm_x86_ops->get_rflags(vcpu); 3617 *rflags = kvm_get_rflags(vcpu);
3337 break; 3618 break;
3338 case 2: 3619 case 2:
3339 vcpu->arch.cr2 = val; 3620 vcpu->arch.cr2 = val;
@@ -3453,18 +3734,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3453 * 3734 *
3454 * No need to exit to userspace if we already have an interrupt queued. 3735 * No need to exit to userspace if we already have an interrupt queued.
3455 */ 3736 */
3456static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3737static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3457 struct kvm_run *kvm_run)
3458{ 3738{
3459 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3739 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3460 kvm_run->request_interrupt_window && 3740 vcpu->run->request_interrupt_window &&
3461 kvm_arch_interrupt_allowed(vcpu)); 3741 kvm_arch_interrupt_allowed(vcpu));
3462} 3742}
3463 3743
3464static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3744static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3465 struct kvm_run *kvm_run)
3466{ 3745{
3467 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3746 struct kvm_run *kvm_run = vcpu->run;
3747
3748 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3468 kvm_run->cr8 = kvm_get_cr8(vcpu); 3749 kvm_run->cr8 = kvm_get_cr8(vcpu);
3469 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3750 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3470 if (irqchip_in_kernel(vcpu->kvm)) 3751 if (irqchip_in_kernel(vcpu->kvm))
@@ -3525,7 +3806,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3525 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3806 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3526} 3807}
3527 3808
3528static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3809static void inject_pending_event(struct kvm_vcpu *vcpu)
3529{ 3810{
3530 /* try to reinject previous events if any */ 3811 /* try to reinject previous events if any */
3531 if (vcpu->arch.exception.pending) { 3812 if (vcpu->arch.exception.pending) {
@@ -3561,11 +3842,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3561 } 3842 }
3562} 3843}
3563 3844
3564static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3845static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3565{ 3846{
3566 int r; 3847 int r;
3567 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3848 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3568 kvm_run->request_interrupt_window; 3849 vcpu->run->request_interrupt_window;
3569 3850
3570 if (vcpu->requests) 3851 if (vcpu->requests)
3571 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3852 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3586,12 +3867,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3586 kvm_x86_ops->tlb_flush(vcpu); 3867 kvm_x86_ops->tlb_flush(vcpu);
3587 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3868 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3588 &vcpu->requests)) { 3869 &vcpu->requests)) {
3589 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3870 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3590 r = 0; 3871 r = 0;
3591 goto out; 3872 goto out;
3592 } 3873 }
3593 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3874 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3594 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3875 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3595 r = 0; 3876 r = 0;
3596 goto out; 3877 goto out;
3597 } 3878 }
@@ -3615,7 +3896,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3615 goto out; 3896 goto out;
3616 } 3897 }
3617 3898
3618 inject_pending_event(vcpu, kvm_run); 3899 inject_pending_event(vcpu);
3619 3900
3620 /* enable NMI/IRQ window open exits if needed */ 3901 /* enable NMI/IRQ window open exits if needed */
3621 if (vcpu->arch.nmi_pending) 3902 if (vcpu->arch.nmi_pending)
@@ -3641,16 +3922,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3641 } 3922 }
3642 3923
3643 trace_kvm_entry(vcpu->vcpu_id); 3924 trace_kvm_entry(vcpu->vcpu_id);
3644 kvm_x86_ops->run(vcpu, kvm_run); 3925 kvm_x86_ops->run(vcpu);
3645 3926
3646 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 3927 /*
3647 set_debugreg(current->thread.debugreg0, 0); 3928 * If the guest has used debug registers, at least dr7
3648 set_debugreg(current->thread.debugreg1, 1); 3929 * will be disabled while returning to the host.
3649 set_debugreg(current->thread.debugreg2, 2); 3930 * If we don't have active breakpoints in the host, we don't
3650 set_debugreg(current->thread.debugreg3, 3); 3931 * care about the messed up debug address registers. But if
3651 set_debugreg(current->thread.debugreg6, 6); 3932 * we have some of them active, restore the old state.
3652 set_debugreg(current->thread.debugreg7, 7); 3933 */
3653 } 3934 if (hw_breakpoint_active())
3935 hw_breakpoint_restore();
3654 3936
3655 set_bit(KVM_REQ_KICK, &vcpu->requests); 3937 set_bit(KVM_REQ_KICK, &vcpu->requests);
3656 local_irq_enable(); 3938 local_irq_enable();
@@ -3682,13 +3964,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3682 3964
3683 kvm_lapic_sync_from_vapic(vcpu); 3965 kvm_lapic_sync_from_vapic(vcpu);
3684 3966
3685 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3967 r = kvm_x86_ops->handle_exit(vcpu);
3686out: 3968out:
3687 return r; 3969 return r;
3688} 3970}
3689 3971
3690 3972
3691static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3973static int __vcpu_run(struct kvm_vcpu *vcpu)
3692{ 3974{
3693 int r; 3975 int r;
3694 3976
@@ -3708,7 +3990,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3708 r = 1; 3990 r = 1;
3709 while (r > 0) { 3991 while (r > 0) {
3710 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3992 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3711 r = vcpu_enter_guest(vcpu, kvm_run); 3993 r = vcpu_enter_guest(vcpu);
3712 else { 3994 else {
3713 up_read(&vcpu->kvm->slots_lock); 3995 up_read(&vcpu->kvm->slots_lock);
3714 kvm_vcpu_block(vcpu); 3996 kvm_vcpu_block(vcpu);
@@ -3736,14 +4018,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3736 if (kvm_cpu_has_pending_timer(vcpu)) 4018 if (kvm_cpu_has_pending_timer(vcpu))
3737 kvm_inject_pending_timer_irqs(vcpu); 4019 kvm_inject_pending_timer_irqs(vcpu);
3738 4020
3739 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4021 if (dm_request_for_irq_injection(vcpu)) {
3740 r = -EINTR; 4022 r = -EINTR;
3741 kvm_run->exit_reason = KVM_EXIT_INTR; 4023 vcpu->run->exit_reason = KVM_EXIT_INTR;
3742 ++vcpu->stat.request_irq_exits; 4024 ++vcpu->stat.request_irq_exits;
3743 } 4025 }
3744 if (signal_pending(current)) { 4026 if (signal_pending(current)) {
3745 r = -EINTR; 4027 r = -EINTR;
3746 kvm_run->exit_reason = KVM_EXIT_INTR; 4028 vcpu->run->exit_reason = KVM_EXIT_INTR;
3747 ++vcpu->stat.signal_exits; 4029 ++vcpu->stat.signal_exits;
3748 } 4030 }
3749 if (need_resched()) { 4031 if (need_resched()) {
@@ -3754,7 +4036,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3754 } 4036 }
3755 4037
3756 up_read(&vcpu->kvm->slots_lock); 4038 up_read(&vcpu->kvm->slots_lock);
3757 post_kvm_run_save(vcpu, kvm_run); 4039 post_kvm_run_save(vcpu);
3758 4040
3759 vapic_exit(vcpu); 4041 vapic_exit(vcpu);
3760 4042
@@ -3787,15 +4069,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3787 if (r) 4069 if (r)
3788 goto out; 4070 goto out;
3789 } 4071 }
3790#if CONFIG_HAS_IOMEM
3791 if (vcpu->mmio_needed) { 4072 if (vcpu->mmio_needed) {
3792 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4073 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3793 vcpu->mmio_read_completed = 1; 4074 vcpu->mmio_read_completed = 1;
3794 vcpu->mmio_needed = 0; 4075 vcpu->mmio_needed = 0;
3795 4076
3796 down_read(&vcpu->kvm->slots_lock); 4077 down_read(&vcpu->kvm->slots_lock);
3797 r = emulate_instruction(vcpu, kvm_run, 4078 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3798 vcpu->arch.mmio_fault_cr2, 0,
3799 EMULTYPE_NO_DECODE); 4079 EMULTYPE_NO_DECODE);
3800 up_read(&vcpu->kvm->slots_lock); 4080 up_read(&vcpu->kvm->slots_lock);
3801 if (r == EMULATE_DO_MMIO) { 4081 if (r == EMULATE_DO_MMIO) {
@@ -3806,12 +4086,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3806 goto out; 4086 goto out;
3807 } 4087 }
3808 } 4088 }
3809#endif
3810 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4089 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3811 kvm_register_write(vcpu, VCPU_REGS_RAX, 4090 kvm_register_write(vcpu, VCPU_REGS_RAX,
3812 kvm_run->hypercall.ret); 4091 kvm_run->hypercall.ret);
3813 4092
3814 r = __vcpu_run(vcpu, kvm_run); 4093 r = __vcpu_run(vcpu);
3815 4094
3816out: 4095out:
3817 if (vcpu->sigset_active) 4096 if (vcpu->sigset_active)
@@ -3845,13 +4124,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3845#endif 4124#endif
3846 4125
3847 regs->rip = kvm_rip_read(vcpu); 4126 regs->rip = kvm_rip_read(vcpu);
3848 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4127 regs->rflags = kvm_get_rflags(vcpu);
3849
3850 /*
3851 * Don't leak debug flags in case they were set for guest debugging
3852 */
3853 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3854 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3855 4128
3856 vcpu_put(vcpu); 4129 vcpu_put(vcpu);
3857 4130
@@ -3879,12 +4152,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3879 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4152 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3880 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4153 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3881 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4154 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3882
3883#endif 4155#endif
3884 4156
3885 kvm_rip_write(vcpu, regs->rip); 4157 kvm_rip_write(vcpu, regs->rip);
3886 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4158 kvm_set_rflags(vcpu, regs->rflags);
3887
3888 4159
3889 vcpu->arch.exception.pending = false; 4160 vcpu->arch.exception.pending = false;
3890 4161
@@ -4051,7 +4322,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4051 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4322 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4052} 4323}
4053 4324
4054static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 4325static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
4055 struct desc_struct *seg_desc) 4326 struct desc_struct *seg_desc)
4056{ 4327{
4057 u32 base_addr = get_desc_base(seg_desc); 4328 u32 base_addr = get_desc_base(seg_desc);
@@ -4103,7 +4374,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4103{ 4374{
4104 return (seg != VCPU_SREG_LDTR) && 4375 return (seg != VCPU_SREG_LDTR) &&
4105 (seg != VCPU_SREG_TR) && 4376 (seg != VCPU_SREG_TR) &&
4106 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4377 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4107} 4378}
4108 4379
4109int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4380int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
@@ -4131,7 +4402,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4131{ 4402{
4132 tss->cr3 = vcpu->arch.cr3; 4403 tss->cr3 = vcpu->arch.cr3;
4133 tss->eip = kvm_rip_read(vcpu); 4404 tss->eip = kvm_rip_read(vcpu);
4134 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4405 tss->eflags = kvm_get_rflags(vcpu);
4135 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4406 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4136 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4407 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4137 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4408 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4155,7 +4426,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4155 kvm_set_cr3(vcpu, tss->cr3); 4426 kvm_set_cr3(vcpu, tss->cr3);
4156 4427
4157 kvm_rip_write(vcpu, tss->eip); 4428 kvm_rip_write(vcpu, tss->eip);
4158 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4429 kvm_set_rflags(vcpu, tss->eflags | 2);
4159 4430
4160 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4431 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4161 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4432 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4193,7 +4464,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4193 struct tss_segment_16 *tss) 4464 struct tss_segment_16 *tss)
4194{ 4465{
4195 tss->ip = kvm_rip_read(vcpu); 4466 tss->ip = kvm_rip_read(vcpu);
4196 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4467 tss->flag = kvm_get_rflags(vcpu);
4197 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4468 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4198 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4469 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4199 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4470 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4208,14 +4479,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4208 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4479 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4209 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4480 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4210 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4481 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4211 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4212} 4482}
4213 4483
4214static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4484static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4215 struct tss_segment_16 *tss) 4485 struct tss_segment_16 *tss)
4216{ 4486{
4217 kvm_rip_write(vcpu, tss->ip); 4487 kvm_rip_write(vcpu, tss->ip);
4218 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4488 kvm_set_rflags(vcpu, tss->flag | 2);
4219 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4489 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4220 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4490 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4221 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4491 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4361,8 +4631,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4361 } 4631 }
4362 4632
4363 if (reason == TASK_SWITCH_IRET) { 4633 if (reason == TASK_SWITCH_IRET) {
4364 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4634 u32 eflags = kvm_get_rflags(vcpu);
4365 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4635 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4366 } 4636 }
4367 4637
4368 /* set back link to prev task only if NT bit is set in eflags 4638 /* set back link to prev task only if NT bit is set in eflags
@@ -4370,11 +4640,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4370 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4640 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4371 old_tss_sel = 0xffff; 4641 old_tss_sel = 0xffff;
4372 4642
4373 /* set back link to prev task only if NT bit is set in eflags
4374 note that old_tss_sel is not used afetr this point */
4375 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4376 old_tss_sel = 0xffff;
4377
4378 if (nseg_desc.type & 8) 4643 if (nseg_desc.type & 8)
4379 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4644 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4380 old_tss_base, &nseg_desc); 4645 old_tss_base, &nseg_desc);
@@ -4383,8 +4648,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4383 old_tss_base, &nseg_desc); 4648 old_tss_base, &nseg_desc);
4384 4649
4385 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4650 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4386 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4651 u32 eflags = kvm_get_rflags(vcpu);
4387 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4652 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4388 } 4653 }
4389 4654
4390 if (reason != TASK_SWITCH_IRET) { 4655 if (reason != TASK_SWITCH_IRET) {
@@ -4436,8 +4701,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4436 4701
4437 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4702 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4438 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4703 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4439 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4704 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4440 load_pdptrs(vcpu, vcpu->arch.cr3); 4705 load_pdptrs(vcpu, vcpu->arch.cr3);
4706 mmu_reset_needed = 1;
4707 }
4441 4708
4442 if (mmu_reset_needed) 4709 if (mmu_reset_needed)
4443 kvm_mmu_reset_context(vcpu); 4710 kvm_mmu_reset_context(vcpu);
@@ -4478,12 +4745,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4478int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4745int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4479 struct kvm_guest_debug *dbg) 4746 struct kvm_guest_debug *dbg)
4480{ 4747{
4748 unsigned long rflags;
4481 int i, r; 4749 int i, r;
4482 4750
4483 vcpu_load(vcpu); 4751 vcpu_load(vcpu);
4484 4752
4485 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4753 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4486 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4754 r = -EBUSY;
4755 if (vcpu->arch.exception.pending)
4756 goto unlock_out;
4757 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4758 kvm_queue_exception(vcpu, DB_VECTOR);
4759 else
4760 kvm_queue_exception(vcpu, BP_VECTOR);
4761 }
4762
4763 /*
4764 * Read rflags as long as potentially injected trace flags are still
4765 * filtered out.
4766 */
4767 rflags = kvm_get_rflags(vcpu);
4768
4769 vcpu->guest_debug = dbg->control;
4770 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4771 vcpu->guest_debug = 0;
4772
4773 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4487 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4774 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4488 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4775 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4489 vcpu->arch.switch_db_regs = 4776 vcpu->arch.switch_db_regs =
@@ -4494,13 +4781,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4494 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4781 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4495 } 4782 }
4496 4783
4497 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4784 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4785 vcpu->arch.singlestep_cs =
4786 get_segment_selector(vcpu, VCPU_SREG_CS);
4787 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4788 }
4789
4790 /*
4791 * Trigger an rflags update that will inject or remove the trace
4792 * flags.
4793 */
4794 kvm_set_rflags(vcpu, rflags);
4795
4796 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4498 4797
4499 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4798 r = 0;
4500 kvm_queue_exception(vcpu, DB_VECTOR);
4501 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4502 kvm_queue_exception(vcpu, BP_VECTOR);
4503 4799
4800unlock_out:
4504 vcpu_put(vcpu); 4801 vcpu_put(vcpu);
4505 4802
4506 return r; 4803 return r;
@@ -4701,14 +4998,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4701 return kvm_x86_ops->vcpu_reset(vcpu); 4998 return kvm_x86_ops->vcpu_reset(vcpu);
4702} 4999}
4703 5000
4704void kvm_arch_hardware_enable(void *garbage) 5001int kvm_arch_hardware_enable(void *garbage)
4705{ 5002{
4706 kvm_x86_ops->hardware_enable(garbage); 5003 /*
5004 * Since this may be called from a hotplug notifcation,
5005 * we can't get the CPU frequency directly.
5006 */
5007 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5008 int cpu = raw_smp_processor_id();
5009 per_cpu(cpu_tsc_khz, cpu) = 0;
5010 }
5011
5012 kvm_shared_msr_cpu_online();
5013
5014 return kvm_x86_ops->hardware_enable(garbage);
4707} 5015}
4708 5016
4709void kvm_arch_hardware_disable(void *garbage) 5017void kvm_arch_hardware_disable(void *garbage)
4710{ 5018{
4711 kvm_x86_ops->hardware_disable(garbage); 5019 kvm_x86_ops->hardware_disable(garbage);
5020 drop_user_return_notifiers(garbage);
4712} 5021}
4713 5022
4714int kvm_arch_hardware_setup(void) 5023int kvm_arch_hardware_setup(void)
@@ -4762,12 +5071,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4762 GFP_KERNEL); 5071 GFP_KERNEL);
4763 if (!vcpu->arch.mce_banks) { 5072 if (!vcpu->arch.mce_banks) {
4764 r = -ENOMEM; 5073 r = -ENOMEM;
4765 goto fail_mmu_destroy; 5074 goto fail_free_lapic;
4766 } 5075 }
4767 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5076 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4768 5077
4769 return 0; 5078 return 0;
4770 5079fail_free_lapic:
5080 kvm_free_lapic(vcpu);
4771fail_mmu_destroy: 5081fail_mmu_destroy:
4772 kvm_mmu_destroy(vcpu); 5082 kvm_mmu_destroy(vcpu);
4773fail_free_pio_data: 5083fail_free_pio_data:
@@ -4778,6 +5088,7 @@ fail:
4778 5088
4779void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5089void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4780{ 5090{
5091 kfree(vcpu->arch.mce_banks);
4781 kvm_free_lapic(vcpu); 5092 kvm_free_lapic(vcpu);
4782 down_read(&vcpu->kvm->slots_lock); 5093 down_read(&vcpu->kvm->slots_lock);
4783 kvm_mmu_destroy(vcpu); 5094 kvm_mmu_destroy(vcpu);
@@ -4946,8 +5257,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4946 return kvm_x86_ops->interrupt_allowed(vcpu); 5257 return kvm_x86_ops->interrupt_allowed(vcpu);
4947} 5258}
4948 5259
5260unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5261{
5262 unsigned long rflags;
5263
5264 rflags = kvm_x86_ops->get_rflags(vcpu);
5265 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5266 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5267 return rflags;
5268}
5269EXPORT_SYMBOL_GPL(kvm_get_rflags);
5270
5271void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5272{
5273 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5274 vcpu->arch.singlestep_cs ==
5275 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5276 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5277 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5278 kvm_x86_ops->set_rflags(vcpu, rflags);
5279}
5280EXPORT_SYMBOL_GPL(kvm_set_rflags);
5281
4949EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5282EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4950EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5283EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5284EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4952EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5285EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4953EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5288EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5289EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5290EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5291EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5292EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 000000000000..8df89f0a3fe6
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 85f5db95c60f..cffd754f3039 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,14 +2,27 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5obj-$(CONFIG_SMP) := msr.o 5inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
6inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
7quiet_cmd_inat_tables = GEN $@
8 cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
9
10$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
11 $(call cmd,inat_tables)
12
13$(obj)/inat.o: $(obj)/inat-tables.c
14
15clean-files := inat-tables.c
16
17obj-$(CONFIG_SMP) += msr-smp.o
6 18
7lib-y := delay.o 19lib-y := delay.o
8lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-$(CONFIG_KPROBES) += insn.o inat.o
11 24
12obj-y += msr-reg.o msr-reg-export.o 25obj-y += msr.o msr-reg.o msr-reg-export.o
13 26
14ifeq ($(CONFIG_X86_32),y) 27ifeq ($(CONFIG_X86_32),y)
15 obj-y += atomic64_32.o 28 obj-y += atomic64_32.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85ea..cf889d4e076a 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -65,7 +65,7 @@
65 .endm 65 .endm
66 66
67/* Standard copy_to_user with segment limit checking */ 67/* Standard copy_to_user with segment limit checking */
68ENTRY(copy_to_user) 68ENTRY(_copy_to_user)
69 CFI_STARTPROC 69 CFI_STARTPROC
70 GET_THREAD_INFO(%rax) 70 GET_THREAD_INFO(%rax)
71 movq %rdi,%rcx 71 movq %rdi,%rcx
@@ -75,10 +75,10 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user) 78ENDPROC(_copy_to_user)
79 79
80/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
81ENTRY(copy_from_user) 81ENTRY(_copy_from_user)
82 CFI_STARTPROC 82 CFI_STARTPROC
83 GET_THREAD_INFO(%rax) 83 GET_THREAD_INFO(%rax)
84 movq %rsi,%rcx 84 movq %rsi,%rcx
@@ -88,7 +88,7 @@ ENTRY(copy_from_user)
88 jae bad_from_user 88 jae bad_from_user
89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
90 CFI_ENDPROC 90 CFI_ENDPROC
91ENDPROC(copy_from_user) 91ENDPROC(_copy_from_user)
92 92
93ENTRY(copy_user_generic) 93ENTRY(copy_user_generic)
94 CFI_STARTPROC 94 CFI_STARTPROC
@@ -96,12 +96,6 @@ ENTRY(copy_user_generic)
96 CFI_ENDPROC 96 CFI_ENDPROC
97ENDPROC(copy_user_generic) 97ENDPROC(copy_user_generic)
98 98
99ENTRY(__copy_from_user_inatomic)
100 CFI_STARTPROC
101 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
102 CFI_ENDPROC
103ENDPROC(__copy_from_user_inatomic)
104
105 .section .fixup,"ax" 99 .section .fixup,"ax"
106 /* must zero dest */ 100 /* must zero dest */
107ENTRY(bad_from_user) 101ENTRY(bad_from_user)
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 000000000000..46fc4ee09fc4
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,90 @@
1/*
2 * x86 instruction attribute tables
3 *
4 * Written by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 */
21#include <asm/insn.h>
22
23/* Attribute tables are generated from opcode map */
24#include "inat-tables.c"
25
26/* Attribute search APIs */
27insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
28{
29 return inat_primary_table[opcode];
30}
31
32insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
33 insn_attr_t esc_attr)
34{
35 const insn_attr_t *table;
36 insn_attr_t lpfx_attr;
37 int n, m = 0;
38
39 n = inat_escape_id(esc_attr);
40 if (last_pfx) {
41 lpfx_attr = inat_get_opcode_attribute(last_pfx);
42 m = inat_last_prefix_id(lpfx_attr);
43 }
44 table = inat_escape_tables[n][0];
45 if (!table)
46 return 0;
47 if (inat_has_variant(table[opcode]) && m) {
48 table = inat_escape_tables[n][m];
49 if (!table)
50 return 0;
51 }
52 return table[opcode];
53}
54
55insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
56 insn_attr_t grp_attr)
57{
58 const insn_attr_t *table;
59 insn_attr_t lpfx_attr;
60 int n, m = 0;
61
62 n = inat_group_id(grp_attr);
63 if (last_pfx) {
64 lpfx_attr = inat_get_opcode_attribute(last_pfx);
65 m = inat_last_prefix_id(lpfx_attr);
66 }
67 table = inat_group_tables[n][0];
68 if (!table)
69 return inat_group_common_attribute(grp_attr);
70 if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
71 table = inat_group_tables[n][m];
72 if (!table)
73 return inat_group_common_attribute(grp_attr);
74 }
75 return table[X86_MODRM_REG(modrm)] |
76 inat_group_common_attribute(grp_attr);
77}
78
79insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
80 insn_byte_t vex_p)
81{
82 const insn_attr_t *table;
83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
84 return 0;
85 table = inat_avx_tables[vex_m][vex_p];
86 if (!table)
87 return 0;
88 return table[opcode];
89}
90
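
The three search APIs above chain together: the primary table classifies the first opcode byte, escape tables refine multi-byte opcodes, and group tables refine ModRM-encoded opcode groups. A sketch of resolving a two-byte 0x0f opcode with no last prefix, assuming the generated inat-tables.c is in place; lookup_twobyte_attr is a hypothetical helper, not part of this file:

#include <asm/inat.h>
#include <asm/insn.h>

/* Hypothetical helper, for illustration only. */
static insn_attr_t lookup_twobyte_attr(insn_byte_t op2, insn_byte_t modrm)
{
	/* 0x0f carries an escape attribute in the primary table */
	insn_attr_t attr = inat_get_opcode_attribute(0x0f);

	if (inat_is_escape(attr))
		attr = inat_get_escape_attribute(op2, 0, attr);	/* 0 = no last prefix */
	if (inat_is_group(attr))
		attr = inat_get_group_attribute(modrm, 0, attr);
	return attr;
}
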
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 000000000000..9f33b984d0ef
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,516 @@
1/*
2 * x86 instruction analysis
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004, 2009
19 */
20
21#include <linux/string.h>
22#include <asm/inat.h>
23#include <asm/insn.h>
24
25#define get_next(t, insn) \
26 ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
27
28#define peek_next(t, insn) \
29 ({t r; r = *(t*)insn->next_byte; r; })
30
31#define peek_nbyte_next(t, insn, n) \
32 ({t r; r = *(t*)((insn)->next_byte + n); r; })
33
34/**
35 * insn_init() - initialize struct insn
36 * @insn: &struct insn to be initialized
37 * @kaddr: address (in kernel memory) of instruction (or copy thereof)
38 * @x86_64: !0 for 64-bit kernel or 64-bit app
39 */
40void insn_init(struct insn *insn, const void *kaddr, int x86_64)
41{
42 memset(insn, 0, sizeof(*insn));
43 insn->kaddr = kaddr;
44 insn->next_byte = kaddr;
45 insn->x86_64 = x86_64 ? 1 : 0;
46 insn->opnd_bytes = 4;
47 if (x86_64)
48 insn->addr_bytes = 8;
49 else
50 insn->addr_bytes = 4;
51}
52
53/**
54 * insn_get_prefixes - scan x86 instruction prefix bytes
55 * @insn: &struct insn containing instruction
56 *
57 * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
58 * to point to the (first) opcode. No effect if @insn->prefixes.got
59 * is already set.
60 */
61void insn_get_prefixes(struct insn *insn)
62{
63 struct insn_field *prefixes = &insn->prefixes;
64 insn_attr_t attr;
65 insn_byte_t b, lb;
66 int i, nb;
67
68 if (prefixes->got)
69 return;
70
71 nb = 0;
72 lb = 0;
73 b = peek_next(insn_byte_t, insn);
74 attr = inat_get_opcode_attribute(b);
75 while (inat_is_legacy_prefix(attr)) {
76 /* Skip if same prefix */
77 for (i = 0; i < nb; i++)
78 if (prefixes->bytes[i] == b)
79 goto found;
80 if (nb == 4)
81 /* Invalid instruction */
82 break;
83 prefixes->bytes[nb++] = b;
84 if (inat_is_address_size_prefix(attr)) {
85 /* address size switches 2/4 or 4/8 */
86 if (insn->x86_64)
87 insn->addr_bytes ^= 12;
88 else
89 insn->addr_bytes ^= 6;
90 } else if (inat_is_operand_size_prefix(attr)) {
 91 /* operand size switches 2/4 */
92 insn->opnd_bytes ^= 6;
93 }
94found:
95 prefixes->nbytes++;
96 insn->next_byte++;
97 lb = b;
98 b = peek_next(insn_byte_t, insn);
99 attr = inat_get_opcode_attribute(b);
100 }
101 /* Set the last prefix */
102 if (lb && lb != insn->prefixes.bytes[3]) {
103 if (unlikely(insn->prefixes.bytes[3])) {
104 /* Swap the last prefix */
105 b = insn->prefixes.bytes[3];
106 for (i = 0; i < nb; i++)
107 if (prefixes->bytes[i] == lb)
108 prefixes->bytes[i] = b;
109 }
110 insn->prefixes.bytes[3] = lb;
111 }
112
113 /* Decode REX prefix */
114 if (insn->x86_64) {
115 b = peek_next(insn_byte_t, insn);
116 attr = inat_get_opcode_attribute(b);
117 if (inat_is_rex_prefix(attr)) {
118 insn->rex_prefix.value = b;
119 insn->rex_prefix.nbytes = 1;
120 insn->next_byte++;
121 if (X86_REX_W(b))
122 /* REX.W overrides opnd_size */
123 insn->opnd_bytes = 8;
124 }
125 }
126 insn->rex_prefix.got = 1;
127
128 /* Decode VEX prefix */
129 b = peek_next(insn_byte_t, insn);
130 attr = inat_get_opcode_attribute(b);
131 if (inat_is_vex_prefix(attr)) {
132 insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
133 if (!insn->x86_64) {
134 /*
 135 * In 32-bit mode, if the [7:6] bits (mod bits of
136 * ModRM) on the second byte are not 11b, it is
137 * LDS or LES.
138 */
139 if (X86_MODRM_MOD(b2) != 3)
140 goto vex_end;
141 }
142 insn->vex_prefix.bytes[0] = b;
143 insn->vex_prefix.bytes[1] = b2;
144 if (inat_is_vex3_prefix(attr)) {
145 b2 = peek_nbyte_next(insn_byte_t, insn, 2);
146 insn->vex_prefix.bytes[2] = b2;
147 insn->vex_prefix.nbytes = 3;
148 insn->next_byte += 3;
149 if (insn->x86_64 && X86_VEX_W(b2))
150 /* VEX.W overrides opnd_size */
151 insn->opnd_bytes = 8;
152 } else {
153 insn->vex_prefix.nbytes = 2;
154 insn->next_byte += 2;
155 }
156 }
157vex_end:
158 insn->vex_prefix.got = 1;
159
160 prefixes->got = 1;
161 return;
162}
163
164/**
165 * insn_get_opcode - collect opcode(s)
166 * @insn: &struct insn containing instruction
167 *
168 * Populates @insn->opcode, updates @insn->next_byte to point past the
 169 * opcode byte(s), and sets @insn->attr (except for groups).
170 * If necessary, first collects any preceding (prefix) bytes.
171 * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
172 * is already 1.
173 */
174void insn_get_opcode(struct insn *insn)
175{
176 struct insn_field *opcode = &insn->opcode;
177 insn_byte_t op, pfx;
178 if (opcode->got)
179 return;
180 if (!insn->prefixes.got)
181 insn_get_prefixes(insn);
182
183 /* Get first opcode */
184 op = get_next(insn_byte_t, insn);
185 opcode->bytes[0] = op;
186 opcode->nbytes = 1;
187
188 /* Check if there is VEX prefix or not */
189 if (insn_is_avx(insn)) {
190 insn_byte_t m, p;
191 m = insn_vex_m_bits(insn);
192 p = insn_vex_p_bits(insn);
193 insn->attr = inat_get_avx_attribute(op, m, p);
194 if (!inat_accept_vex(insn->attr))
195 insn->attr = 0; /* This instruction is bad */
196 goto end; /* VEX has only 1 byte for opcode */
197 }
198
199 insn->attr = inat_get_opcode_attribute(op);
200 while (inat_is_escape(insn->attr)) {
201 /* Get escaped opcode */
202 op = get_next(insn_byte_t, insn);
203 opcode->bytes[opcode->nbytes++] = op;
204 pfx = insn_last_prefix(insn);
205 insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
206 }
207 if (inat_must_vex(insn->attr))
208 insn->attr = 0; /* This instruction is bad */
209end:
210 opcode->got = 1;
211}
212
213/**
214 * insn_get_modrm - collect ModRM byte, if any
215 * @insn: &struct insn containing instruction
216 *
217 * Populates @insn->modrm and updates @insn->next_byte to point past the
218 * ModRM byte, if any. If necessary, first collects the preceding bytes
219 * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
220 */
221void insn_get_modrm(struct insn *insn)
222{
223 struct insn_field *modrm = &insn->modrm;
224 insn_byte_t pfx, mod;
225 if (modrm->got)
226 return;
227 if (!insn->opcode.got)
228 insn_get_opcode(insn);
229
230 if (inat_has_modrm(insn->attr)) {
231 mod = get_next(insn_byte_t, insn);
232 modrm->value = mod;
233 modrm->nbytes = 1;
234 if (inat_is_group(insn->attr)) {
235 pfx = insn_last_prefix(insn);
236 insn->attr = inat_get_group_attribute(mod, pfx,
237 insn->attr);
238 }
239 }
240
241 if (insn->x86_64 && inat_is_force64(insn->attr))
242 insn->opnd_bytes = 8;
243 modrm->got = 1;
244}
245
246
247/**
248 * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
249 * @insn: &struct insn containing instruction
250 *
251 * If necessary, first collects the instruction up to and including the
252 * ModRM byte. No effect if @insn->x86_64 is 0.
253 */
254int insn_rip_relative(struct insn *insn)
255{
256 struct insn_field *modrm = &insn->modrm;
257
258 if (!insn->x86_64)
259 return 0;
260 if (!modrm->got)
261 insn_get_modrm(insn);
262 /*
263 * For rip-relative instructions, the mod field (top 2 bits)
264 * is zero and the r/m field (bottom 3 bits) is 0x5.
265 */
266 return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
267}
268
269/**
270 * insn_get_sib() - Get the SIB byte of instruction
271 * @insn: &struct insn containing instruction
272 *
273 * If necessary, first collects the instruction up to and including the
274 * ModRM byte.
275 */
276void insn_get_sib(struct insn *insn)
277{
278 insn_byte_t modrm;
279
280 if (insn->sib.got)
281 return;
282 if (!insn->modrm.got)
283 insn_get_modrm(insn);
284 if (insn->modrm.nbytes) {
285 modrm = (insn_byte_t)insn->modrm.value;
286 if (insn->addr_bytes != 2 &&
287 X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
288 insn->sib.value = get_next(insn_byte_t, insn);
289 insn->sib.nbytes = 1;
290 }
291 }
292 insn->sib.got = 1;
293}
294
295
296/**
297 * insn_get_displacement() - Get the displacement of instruction
298 * @insn: &struct insn containing instruction
299 *
300 * If necessary, first collects the instruction up to and including the
301 * SIB byte.
 302 * The displacement value is sign-extended.
303 */
304void insn_get_displacement(struct insn *insn)
305{
306 insn_byte_t mod, rm, base;
307
308 if (insn->displacement.got)
309 return;
310 if (!insn->sib.got)
311 insn_get_sib(insn);
312 if (insn->modrm.nbytes) {
313 /*
314 * Interpreting the modrm byte:
315 * mod = 00 - no displacement fields (exceptions below)
316 * mod = 01 - 1-byte displacement field
317 * mod = 10 - displacement field is 4 bytes, or 2 bytes if
318 * address size = 2 (0x67 prefix in 32-bit mode)
319 * mod = 11 - no memory operand
320 *
321 * If address size = 2...
322 * mod = 00, r/m = 110 - displacement field is 2 bytes
323 *
324 * If address size != 2...
325 * mod != 11, r/m = 100 - SIB byte exists
326 * mod = 00, SIB base = 101 - displacement field is 4 bytes
327 * mod = 00, r/m = 101 - rip-relative addressing, displacement
328 * field is 4 bytes
329 */
330 mod = X86_MODRM_MOD(insn->modrm.value);
331 rm = X86_MODRM_RM(insn->modrm.value);
332 base = X86_SIB_BASE(insn->sib.value);
333 if (mod == 3)
334 goto out;
335 if (mod == 1) {
336 insn->displacement.value = get_next(char, insn);
337 insn->displacement.nbytes = 1;
338 } else if (insn->addr_bytes == 2) {
339 if ((mod == 0 && rm == 6) || mod == 2) {
340 insn->displacement.value =
341 get_next(short, insn);
342 insn->displacement.nbytes = 2;
343 }
344 } else {
345 if ((mod == 0 && rm == 5) || mod == 2 ||
346 (mod == 0 && base == 5)) {
347 insn->displacement.value = get_next(int, insn);
348 insn->displacement.nbytes = 4;
349 }
350 }
351 }
352out:
353 insn->displacement.got = 1;
354}
355
356/* Decode moffset16/32/64 */
357static void __get_moffset(struct insn *insn)
358{
359 switch (insn->addr_bytes) {
360 case 2:
361 insn->moffset1.value = get_next(short, insn);
362 insn->moffset1.nbytes = 2;
363 break;
364 case 4:
365 insn->moffset1.value = get_next(int, insn);
366 insn->moffset1.nbytes = 4;
367 break;
368 case 8:
369 insn->moffset1.value = get_next(int, insn);
370 insn->moffset1.nbytes = 4;
371 insn->moffset2.value = get_next(int, insn);
372 insn->moffset2.nbytes = 4;
373 break;
374 }
375 insn->moffset1.got = insn->moffset2.got = 1;
376}
377
378/* Decode imm v32(Iz) */
379static void __get_immv32(struct insn *insn)
380{
381 switch (insn->opnd_bytes) {
382 case 2:
383 insn->immediate.value = get_next(short, insn);
384 insn->immediate.nbytes = 2;
385 break;
386 case 4:
387 case 8:
388 insn->immediate.value = get_next(int, insn);
389 insn->immediate.nbytes = 4;
390 break;
391 }
392}
393
394/* Decode imm v64(Iv/Ov) */
395static void __get_immv(struct insn *insn)
396{
397 switch (insn->opnd_bytes) {
398 case 2:
399 insn->immediate1.value = get_next(short, insn);
400 insn->immediate1.nbytes = 2;
401 break;
402 case 4:
403 insn->immediate1.value = get_next(int, insn);
404 insn->immediate1.nbytes = 4;
405 break;
406 case 8:
407 insn->immediate1.value = get_next(int, insn);
408 insn->immediate1.nbytes = 4;
409 insn->immediate2.value = get_next(int, insn);
410 insn->immediate2.nbytes = 4;
411 break;
412 }
413 insn->immediate1.got = insn->immediate2.got = 1;
414}
415
416/* Decode ptr16:16/32(Ap) */
417static void __get_immptr(struct insn *insn)
418{
419 switch (insn->opnd_bytes) {
420 case 2:
421 insn->immediate1.value = get_next(short, insn);
422 insn->immediate1.nbytes = 2;
423 break;
424 case 4:
425 insn->immediate1.value = get_next(int, insn);
426 insn->immediate1.nbytes = 4;
427 break;
428 case 8:
 429 /* ptr16:64 does not exist (no segment) */
430 return;
431 }
432 insn->immediate2.value = get_next(unsigned short, insn);
433 insn->immediate2.nbytes = 2;
434 insn->immediate1.got = insn->immediate2.got = 1;
435}
436
437/**
438 * insn_get_immediate() - Get the immediates of instruction
439 * @insn: &struct insn containing instruction
440 *
441 * If necessary, first collects the instruction up to and including the
442 * displacement bytes.
 443 * Basically, most immediates are sign-extended. The unsigned value can be
 444 * obtained by bit-masking with ((1 << (nbytes * 8)) - 1).
445 */
446void insn_get_immediate(struct insn *insn)
447{
448 if (insn->immediate.got)
449 return;
450 if (!insn->displacement.got)
451 insn_get_displacement(insn);
452
453 if (inat_has_moffset(insn->attr)) {
454 __get_moffset(insn);
455 goto done;
456 }
457
458 if (!inat_has_immediate(insn->attr))
459 /* no immediates */
460 goto done;
461
462 switch (inat_immediate_size(insn->attr)) {
463 case INAT_IMM_BYTE:
464 insn->immediate.value = get_next(char, insn);
465 insn->immediate.nbytes = 1;
466 break;
467 case INAT_IMM_WORD:
468 insn->immediate.value = get_next(short, insn);
469 insn->immediate.nbytes = 2;
470 break;
471 case INAT_IMM_DWORD:
472 insn->immediate.value = get_next(int, insn);
473 insn->immediate.nbytes = 4;
474 break;
475 case INAT_IMM_QWORD:
476 insn->immediate1.value = get_next(int, insn);
477 insn->immediate1.nbytes = 4;
478 insn->immediate2.value = get_next(int, insn);
479 insn->immediate2.nbytes = 4;
480 break;
481 case INAT_IMM_PTR:
482 __get_immptr(insn);
483 break;
484 case INAT_IMM_VWORD32:
485 __get_immv32(insn);
486 break;
487 case INAT_IMM_VWORD:
488 __get_immv(insn);
489 break;
490 default:
491 break;
492 }
493 if (inat_has_second_immediate(insn->attr)) {
494 insn->immediate2.value = get_next(char, insn);
495 insn->immediate2.nbytes = 1;
496 }
497done:
498 insn->immediate.got = 1;
499}
500
501/**
502 * insn_get_length() - Get the length of instruction
503 * @insn: &struct insn containing instruction
504 *
505 * If necessary, first collects the instruction up to and including the
506 * immediates bytes.
507 */
508void insn_get_length(struct insn *insn)
509{
510 if (insn->length)
511 return;
512 if (!insn->immediate.got)
513 insn_get_immediate(insn);
514 insn->length = (unsigned char)((unsigned long)insn->next_byte
515 - (unsigned long)insn->kaddr);
516}
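
Each insn_get_*() routine above is lazy and idempotent: it first collects whatever earlier fields it depends on and records completion in the matching .got flag, so a caller can ask directly for the field it wants. A sketch of the intended call pattern in a 64-bit kernel context; probe_insn_len is a hypothetical caller of the kind kprobes can now be built on:

#include <linux/kernel.h>
#include <asm/insn.h>

/* Hypothetical caller, for illustration only. */
static int probe_insn_len(const void *kaddr)
{
	struct insn insn;

	insn_init(&insn, kaddr, 1);	/* 1 = decode as x86-64 */
	insn_get_length(&insn);		/* pulls in prefixes..immediates as needed */
	if (insn_rip_relative(&insn))
		pr_debug("rip-relative, disp %d\n", insn.displacement.value);
	return insn.length;
}
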
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
new file mode 100644
index 000000000000..a6b1b86d2253
--- /dev/null
+++ b/arch/x86/lib/msr-smp.c
@@ -0,0 +1,204 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6static void __rdmsr_on_cpu(void *info)
7{
8 struct msr_info *rv = info;
9 struct msr *reg;
10 int this_cpu = raw_smp_processor_id();
11
12 if (rv->msrs)
13 reg = per_cpu_ptr(rv->msrs, this_cpu);
14 else
15 reg = &rv->reg;
16
17 rdmsr(rv->msr_no, reg->l, reg->h);
18}
19
20static void __wrmsr_on_cpu(void *info)
21{
22 struct msr_info *rv = info;
23 struct msr *reg;
24 int this_cpu = raw_smp_processor_id();
25
26 if (rv->msrs)
27 reg = per_cpu_ptr(rv->msrs, this_cpu);
28 else
29 reg = &rv->reg;
30
31 wrmsr(rv->msr_no, reg->l, reg->h);
32}
33
34int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
35{
36 int err;
37 struct msr_info rv;
38
39 memset(&rv, 0, sizeof(rv));
40
41 rv.msr_no = msr_no;
42 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
43 *l = rv.reg.l;
44 *h = rv.reg.h;
45
46 return err;
47}
48EXPORT_SYMBOL(rdmsr_on_cpu);
49
50int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
51{
52 int err;
53 struct msr_info rv;
54
55 memset(&rv, 0, sizeof(rv));
56
57 rv.msr_no = msr_no;
58 rv.reg.l = l;
59 rv.reg.h = h;
60 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
61
62 return err;
63}
64EXPORT_SYMBOL(wrmsr_on_cpu);
65
66static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
67 struct msr *msrs,
68 void (*msr_func) (void *info))
69{
70 struct msr_info rv;
71 int this_cpu;
72
73 memset(&rv, 0, sizeof(rv));
74
75 rv.msrs = msrs;
76 rv.msr_no = msr_no;
77
78 this_cpu = get_cpu();
79
80 if (cpumask_test_cpu(this_cpu, mask))
81 msr_func(&rv);
82
83 smp_call_function_many(mask, msr_func, &rv, 1);
84 put_cpu();
85}
86
87/* rdmsr on a bunch of CPUs
88 *
89 * @mask: which CPUs
90 * @msr_no: which MSR
91 * @msrs: array of MSR values
92 *
93 */
94void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
95{
96 __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
97}
98EXPORT_SYMBOL(rdmsr_on_cpus);
99
100/*
101 * wrmsr on a bunch of CPUs
102 *
103 * @mask: which CPUs
104 * @msr_no: which MSR
105 * @msrs: array of MSR values
106 *
107 */
108void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
109{
110 __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
111}
112EXPORT_SYMBOL(wrmsr_on_cpus);
113
114/* These "safe" variants are slower and should be used when the target MSR
115 may not actually exist. */
116static void __rdmsr_safe_on_cpu(void *info)
117{
118 struct msr_info *rv = info;
119
120 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
121}
122
123static void __wrmsr_safe_on_cpu(void *info)
124{
125 struct msr_info *rv = info;
126
127 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
128}
129
130int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
131{
132 int err;
133 struct msr_info rv;
134
135 memset(&rv, 0, sizeof(rv));
136
137 rv.msr_no = msr_no;
138 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
139 *l = rv.reg.l;
140 *h = rv.reg.h;
141
142 return err ? err : rv.err;
143}
144EXPORT_SYMBOL(rdmsr_safe_on_cpu);
145
146int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
147{
148 int err;
149 struct msr_info rv;
150
151 memset(&rv, 0, sizeof(rv));
152
153 rv.msr_no = msr_no;
154 rv.reg.l = l;
155 rv.reg.h = h;
156 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
157
158 return err ? err : rv.err;
159}
160EXPORT_SYMBOL(wrmsr_safe_on_cpu);
161
162/*
163 * These variants are significantly slower, but allows control over
164 * the entire 32-bit GPR set.
165 */
166static void __rdmsr_safe_regs_on_cpu(void *info)
167{
168 struct msr_regs_info *rv = info;
169
170 rv->err = rdmsr_safe_regs(rv->regs);
171}
172
173static void __wrmsr_safe_regs_on_cpu(void *info)
174{
175 struct msr_regs_info *rv = info;
176
177 rv->err = wrmsr_safe_regs(rv->regs);
178}
179
180int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
181{
182 int err;
183 struct msr_regs_info rv;
184
185 rv.regs = regs;
186 rv.err = -EIO;
187 err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
188
189 return err ? err : rv.err;
190}
191EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
192
193int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
194{
195 int err;
196 struct msr_regs_info rv;
197
198 rv.regs = regs;
199 rv.err = -EIO;
200 err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
201
202 return err ? err : rv.err;
203}
204EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
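
Note the return convention of the safe variants: the smp_call_function_single() error wins if the IPI itself failed; otherwise the rdmsr_safe()/wrmsr_safe() result is passed through, so a missing MSR surfaces as a non-zero return instead of a #GP. A usage sketch, assuming MSR_IA32_APERF from msr-index.h; read_aperf_on is a hypothetical helper:

#include <asm/msr.h>

/* Hypothetical helper, for illustration only. */
static int read_aperf_on(unsigned int cpu, u64 *val)
{
	u32 lo, hi;
	int err = rdmsr_safe_on_cpu(cpu, MSR_IA32_APERF, &lo, &hi);

	if (!err)
		*val = ((u64)hi << 32) | lo;
	return err;	/* non-zero: IPI failed or the MSR faulted */
}
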
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 33a1e3ca22d8..8f8eebdca7d4 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,226 +1,23 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/preempt.h> 2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h> 3#include <asm/msr.h>
5 4
6struct msr_info { 5struct msr *msrs_alloc(void)
7 u32 msr_no;
8 struct msr reg;
9 struct msr *msrs;
10 int off;
11 int err;
12};
13
14static void __rdmsr_on_cpu(void *info)
15{
16 struct msr_info *rv = info;
17 struct msr *reg;
18 int this_cpu = raw_smp_processor_id();
19
20 if (rv->msrs)
21 reg = &rv->msrs[this_cpu - rv->off];
22 else
23 reg = &rv->reg;
24
25 rdmsr(rv->msr_no, reg->l, reg->h);
26}
27
28static void __wrmsr_on_cpu(void *info)
29{
30 struct msr_info *rv = info;
31 struct msr *reg;
32 int this_cpu = raw_smp_processor_id();
33
34 if (rv->msrs)
35 reg = &rv->msrs[this_cpu - rv->off];
36 else
37 reg = &rv->reg;
38
39 wrmsr(rv->msr_no, reg->l, reg->h);
40}
41
42int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
43{
44 int err;
45 struct msr_info rv;
46
47 memset(&rv, 0, sizeof(rv));
48
49 rv.msr_no = msr_no;
50 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
51 *l = rv.reg.l;
52 *h = rv.reg.h;
53
54 return err;
55}
56EXPORT_SYMBOL(rdmsr_on_cpu);
57
58int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
59{
60 int err;
61 struct msr_info rv;
62
63 memset(&rv, 0, sizeof(rv));
64
65 rv.msr_no = msr_no;
66 rv.reg.l = l;
67 rv.reg.h = h;
68 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
69
70 return err;
71}
72EXPORT_SYMBOL(wrmsr_on_cpu);
73
74/* rdmsr on a bunch of CPUs
75 *
76 * @mask: which CPUs
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{
83 struct msr_info rv;
84 int this_cpu;
85
86 memset(&rv, 0, sizeof(rv));
87
88 rv.off = cpumask_first(mask);
89 rv.msrs = msrs;
90 rv.msr_no = msr_no;
91
92 this_cpu = get_cpu();
93
94 if (cpumask_test_cpu(this_cpu, mask))
95 __rdmsr_on_cpu(&rv);
96
97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
98 put_cpu();
99}
100EXPORT_SYMBOL(rdmsr_on_cpus);
101
102/*
103 * wrmsr on a bunch of CPUs
104 *
105 * @mask: which CPUs
106 * @msr_no: which MSR
107 * @msrs: array of MSR values
108 *
109 */
110void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
111{
112 struct msr_info rv;
113 int this_cpu;
114
115 memset(&rv, 0, sizeof(rv));
116
117 rv.off = cpumask_first(mask);
118 rv.msrs = msrs;
119 rv.msr_no = msr_no;
120
121 this_cpu = get_cpu();
122
123 if (cpumask_test_cpu(this_cpu, mask))
124 __wrmsr_on_cpu(&rv);
125
126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
127 put_cpu();
128}
129EXPORT_SYMBOL(wrmsr_on_cpus);
130
131/* These "safe" variants are slower and should be used when the target MSR
132 may not actually exist. */
133static void __rdmsr_safe_on_cpu(void *info)
134{
135 struct msr_info *rv = info;
136
137 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
138}
139
140static void __wrmsr_safe_on_cpu(void *info)
141{
142 struct msr_info *rv = info;
143
144 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
145}
146
147int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
148{ 6{
149 int err; 7 struct msr *msrs = NULL;
150 struct msr_info rv;
151 8
152 memset(&rv, 0, sizeof(rv)); 9 msrs = alloc_percpu(struct msr);
10 if (!msrs) {
11 pr_warning("%s: error allocating msrs\n", __func__);
12 return NULL;
13 }
153 14
154 rv.msr_no = msr_no; 15 return msrs;
155 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
156 *l = rv.reg.l;
157 *h = rv.reg.h;
158
159 return err ? err : rv.err;
160} 16}
161EXPORT_SYMBOL(rdmsr_safe_on_cpu); 17EXPORT_SYMBOL(msrs_alloc);
162 18
163int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 19void msrs_free(struct msr *msrs)
164{ 20{
165 int err; 21 free_percpu(msrs);
166 struct msr_info rv;
167
168 memset(&rv, 0, sizeof(rv));
169
170 rv.msr_no = msr_no;
171 rv.reg.l = l;
172 rv.reg.h = h;
173 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
174
175 return err ? err : rv.err;
176}
177EXPORT_SYMBOL(wrmsr_safe_on_cpu);
178
179/*
180 * These variants are significantly slower, but allow control over
181 * the entire 32-bit GPR set.
182 */
183struct msr_regs_info {
184 u32 *regs;
185 int err;
186};
187
188static void __rdmsr_safe_regs_on_cpu(void *info)
189{
190 struct msr_regs_info *rv = info;
191
192 rv->err = rdmsr_safe_regs(rv->regs);
193}
194
195static void __wrmsr_safe_regs_on_cpu(void *info)
196{
197 struct msr_regs_info *rv = info;
198
199 rv->err = wrmsr_safe_regs(rv->regs);
200}
201
202int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
203{
204 int err;
205 struct msr_regs_info rv;
206
207 rv.regs = regs;
208 rv.err = -EIO;
209 err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
210
211 return err ? err : rv.err;
212}
213EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
214
215int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
216{
217 int err;
218 struct msr_regs_info rv;
219
220 rv.regs = regs;
221 rv.err = -EIO;
222 err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
223
224 return err ? err : rv.err;
225} 22}
226EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); 23EXPORT_SYMBOL(msrs_free);
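The new allocator pair is meant to feed the rdmsr_on_cpus()/wrmsr_on_cpus()
batch helpers that moved out to msr-smp.c above. A minimal usage sketch,
with the MSR number as a placeholder:

	/* Sketch: snapshot MSR 0x10 (TSC) on all online CPUs. */
	static void example_snapshot(void)
	{
		struct msr *msrs = msrs_alloc();

		if (!msrs)
			return;

		rdmsr_on_cpus(cpu_online_mask, 0x10, msrs);
		/* ... consume the per-CPU l/h values here ... */
		msrs_free(msrs);
	}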
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f118d462acc..e218d5df85ff 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user);
874 * data to the requested size using zero bytes. 874 * data to the requested size using zero bytes.
875 */ 875 */
876unsigned long 876unsigned long
877copy_from_user(void *to, const void __user *from, unsigned long n) 877_copy_from_user(void *to, const void __user *from, unsigned long n)
878{ 878{
879 if (access_ok(VERIFY_READ, from, n)) 879 if (access_ok(VERIFY_READ, from, n))
880 n = __copy_from_user(to, from, n); 880 n = __copy_from_user(to, from, n);
@@ -882,4 +882,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
882 memset(to, 0, n); 882 memset(to, 0, n);
883 return n; 883 return n;
884} 884}
885EXPORT_SYMBOL(copy_from_user); 885EXPORT_SYMBOL(_copy_from_user);
886
887void copy_from_user_overflow(void)
888{
889 WARN(1, "Buffer overflow detected!\n");
890}
891EXPORT_SYMBOL(copy_from_user_overflow);
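The rename to _copy_from_user() together with the new copy_from_user_overflow()
points at a compile-time bounds check layered on top; the wrapper below is a
sketch of that arrangement as it would sit in the uaccess headers, not a
verbatim quote of them:

	/* Sketch: copy_from_user() becomes an inline whose destination size
	 * is checked with __builtin_object_size(); overflows the compiler
	 * can prove are diverted to the WARN in copy_from_user_overflow(). */
	static inline unsigned long
	copy_from_user(void *to, const void __user *from, unsigned long n)
	{
		int sz = __builtin_object_size(to, 0);

		if (likely(sz == -1 || sz >= n))
			n = _copy_from_user(to, from, n);
		else
			copy_from_user_overflow();

		return n;
	}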
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 000000000000..a793da5e560e
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,893 @@
1# x86 Opcode Maps
2#
3#<Opcode maps>
4# Table: table-name
5# Referrer: escaped-name
6# AVXcode: avx-code
7# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
8# (or)
9# opcode: escape # escaped-name
10# EndTable
11#
12#<group maps>
13# GrpTable: GrpXXX
14# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
15# EndTable
16#
17# AVX Superscripts
18# (VEX): this opcode can accept VEX prefix.
19# (oVEX): this opcode requires VEX prefix.
20# (o128): this opcode only supports 128bit VEX.
21# (o256): this opcode only supports 256bit VEX.
22#
23
24Table: one byte opcode
25Referrer:
26AVXcode:
27# 0x00 - 0x0f
2800: ADD Eb,Gb
2901: ADD Ev,Gv
3002: ADD Gb,Eb
3103: ADD Gv,Ev
3204: ADD AL,Ib
3305: ADD rAX,Iz
3406: PUSH ES (i64)
3507: POP ES (i64)
3608: OR Eb,Gb
3709: OR Ev,Gv
380a: OR Gb,Eb
390b: OR Gv,Ev
400c: OR AL,Ib
410d: OR rAX,Iz
420e: PUSH CS (i64)
430f: escape # 2-byte escape
44# 0x10 - 0x1f
4510: ADC Eb,Gb
4611: ADC Ev,Gv
4712: ADC Gb,Eb
4813: ADC Gv,Ev
4914: ADC AL,Ib
5015: ADC rAX,Iz
5116: PUSH SS (i64)
5217: POP SS (i64)
5318: SBB Eb,Gb
5419: SBB Ev,Gv
551a: SBB Gb,Eb
561b: SBB Gv,Ev
571c: SBB AL,Ib
581d: SBB rAX,Iz
591e: PUSH DS (i64)
601f: POP DS (i64)
61# 0x20 - 0x2f
6220: AND Eb,Gb
6321: AND Ev,Gv
6422: AND Gb,Eb
6523: AND Gv,Ev
6624: AND AL,Ib
6725: AND rAX,Iz
6826: SEG=ES (Prefix)
6927: DAA (i64)
7028: SUB Eb,Gb
7129: SUB Ev,Gv
722a: SUB Gb,Eb
732b: SUB Gv,Ev
742c: SUB AL,Ib
752d: SUB rAX,Iz
762e: SEG=CS (Prefix)
772f: DAS (i64)
78# 0x30 - 0x3f
7930: XOR Eb,Gb
8031: XOR Ev,Gv
8132: XOR Gb,Eb
8233: XOR Gv,Ev
8334: XOR AL,Ib
8435: XOR rAX,Iz
8536: SEG=SS (Prefix)
8637: AAA (i64)
8738: CMP Eb,Gb
8839: CMP Ev,Gv
893a: CMP Gb,Eb
903b: CMP Gv,Ev
913c: CMP AL,Ib
923d: CMP rAX,Iz
933e: SEG=DS (Prefix)
943f: AAS (i64)
95# 0x40 - 0x4f
9640: INC eAX (i64) | REX (o64)
9741: INC eCX (i64) | REX.B (o64)
9842: INC eDX (i64) | REX.X (o64)
9943: INC eBX (i64) | REX.XB (o64)
10044: INC eSP (i64) | REX.R (o64)
10145: INC eBP (i64) | REX.RB (o64)
10246: INC eSI (i64) | REX.RX (o64)
10347: INC eDI (i64) | REX.RXB (o64)
10448: DEC eAX (i64) | REX.W (o64)
10549: DEC eCX (i64) | REX.WB (o64)
1064a: DEC eDX (i64) | REX.WX (o64)
1074b: DEC eBX (i64) | REX.WXB (o64)
1084c: DEC eSP (i64) | REX.WR (o64)
1094d: DEC eBP (i64) | REX.WRB (o64)
1104e: DEC eSI (i64) | REX.WRX (o64)
1114f: DEC eDI (i64) | REX.WRXB (o64)
112# 0x50 - 0x5f
11350: PUSH rAX/r8 (d64)
11451: PUSH rCX/r9 (d64)
11552: PUSH rDX/r10 (d64)
11653: PUSH rBX/r11 (d64)
11754: PUSH rSP/r12 (d64)
11855: PUSH rBP/r13 (d64)
11956: PUSH rSI/r14 (d64)
12057: PUSH rDI/r15 (d64)
12158: POP rAX/r8 (d64)
12259: POP rCX/r9 (d64)
1235a: POP rDX/r10 (d64)
1245b: POP rBX/r11 (d64)
1255c: POP rSP/r12 (d64)
1265d: POP rBP/r13 (d64)
1275e: POP rSI/r14 (d64)
1285f: POP rDI/r15 (d64)
129# 0x60 - 0x6f
13060: PUSHA/PUSHAD (i64)
13161: POPA/POPAD (i64)
13262: BOUND Gv,Ma (i64)
13363: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
13464: SEG=FS (Prefix)
13565: SEG=GS (Prefix)
13666: Operand-Size (Prefix)
13767: Address-Size (Prefix)
13868: PUSH Iz (d64)
13969: IMUL Gv,Ev,Iz
1406a: PUSH Ib (d64)
1416b: IMUL Gv,Ev,Ib
1426c: INS/INSB Yb,DX
1436d: INS/INSW/INSD Yz,DX
1446e: OUTS/OUTSB DX,Xb
1456f: OUTS/OUTSW/OUTSD DX,Xz
146# 0x70 - 0x7f
14770: JO Jb
14871: JNO Jb
14972: JB/JNAE/JC Jb
15073: JNB/JAE/JNC Jb
15174: JZ/JE Jb
15275: JNZ/JNE Jb
15376: JBE/JNA Jb
15477: JNBE/JA Jb
15578: JS Jb
15679: JNS Jb
1577a: JP/JPE Jb
1587b: JNP/JPO Jb
1597c: JL/JNGE Jb
1607d: JNL/JGE Jb
1617e: JLE/JNG Jb
1627f: JNLE/JG Jb
163# 0x80 - 0x8f
16480: Grp1 Eb,Ib (1A)
16581: Grp1 Ev,Iz (1A)
16682: Grp1 Eb,Ib (1A),(i64)
16783: Grp1 Ev,Ib (1A)
16884: TEST Eb,Gb
16985: TEST Ev,Gv
17086: XCHG Eb,Gb
17187: XCHG Ev,Gv
17288: MOV Eb,Gb
17389: MOV Ev,Gv
1748a: MOV Gb,Eb
1758b: MOV Gv,Ev
1768c: MOV Ev,Sw
1778d: LEA Gv,M
1788e: MOV Sw,Ew
1798f: Grp1A (1A) | POP Ev (d64)
180# 0x90 - 0x9f
18190: NOP | PAUSE (F3) | XCHG r8,rAX
18291: XCHG rCX/r9,rAX
18392: XCHG rDX/r10,rAX
18493: XCHG rBX/r11,rAX
18594: XCHG rSP/r12,rAX
18695: XCHG rBP/r13,rAX
18796: XCHG rSI/r14,rAX
18897: XCHG rDI/r15,rAX
18998: CBW/CWDE/CDQE
19099: CWD/CDQ/CQO
1919a: CALLF Ap (i64)
1929b: FWAIT/WAIT
1939c: PUSHF/D/Q Fv (d64)
1949d: POPF/D/Q Fv (d64)
1959e: SAHF
1969f: LAHF
197# 0xa0 - 0xaf
198a0: MOV AL,Ob
199a1: MOV rAX,Ov
200a2: MOV Ob,AL
201a3: MOV Ov,rAX
202a4: MOVS/B Xb,Yb
203a5: MOVS/W/D/Q Xv,Yv
204a6: CMPS/B Xb,Yb
205a7: CMPS/W/D Xv,Yv
206a8: TEST AL,Ib
207a9: TEST rAX,Iz
208aa: STOS/B Yb,AL
209ab: STOS/W/D/Q Yv,rAX
210ac: LODS/B AL,Xb
211ad: LODS/W/D/Q rAX,Xv
212ae: SCAS/B AL,Yb
213af: SCAS/W/D/Q rAX,Xv
214# 0xb0 - 0xbf
215b0: MOV AL/R8L,Ib
216b1: MOV CL/R9L,Ib
217b2: MOV DL/R10L,Ib
218b3: MOV BL/R11L,Ib
219b4: MOV AH/R12L,Ib
220b5: MOV CH/R13L,Ib
221b6: MOV DH/R14L,Ib
222b7: MOV BH/R15L,Ib
223b8: MOV rAX/r8,Iv
224b9: MOV rCX/r9,Iv
225ba: MOV rDX/r10,Iv
226bb: MOV rBX/r11,Iv
227bc: MOV rSP/r12,Iv
228bd: MOV rBP/r13,Iv
229be: MOV rSI/r14,Iv
230bf: MOV rDI/r15,Iv
231# 0xc0 - 0xcf
232c0: Grp2 Eb,Ib (1A)
233c1: Grp2 Ev,Ib (1A)
234c2: RETN Iw (f64)
235c3: RETN
236c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
237c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
238c6: Grp11 Eb,Ib (1A)
239c7: Grp11 Ev,Iz (1A)
240c8: ENTER Iw,Ib
241c9: LEAVE (d64)
242ca: RETF Iw
243cb: RETF
244cc: INT3
245cd: INT Ib
246ce: INTO (i64)
247cf: IRET/D/Q
248# 0xd0 - 0xdf
249d0: Grp2 Eb,1 (1A)
250d1: Grp2 Ev,1 (1A)
251d2: Grp2 Eb,CL (1A)
252d3: Grp2 Ev,CL (1A)
253d4: AAM Ib (i64)
254d5: AAD Ib (i64)
255d6:
256d7: XLAT/XLATB
257d8: ESC
258d9: ESC
259da: ESC
260db: ESC
261dc: ESC
262dd: ESC
263de: ESC
264df: ESC
265# 0xe0 - 0xef
266e0: LOOPNE/LOOPNZ Jb (f64)
267e1: LOOPE/LOOPZ Jb (f64)
268e2: LOOP Jb (f64)
269e3: JrCXZ Jb (f64)
270e4: IN AL,Ib
271e5: IN eAX,Ib
272e6: OUT Ib,AL
273e7: OUT Ib,eAX
274e8: CALL Jz (f64)
275e9: JMP-near Jz (f64)
276ea: JMP-far Ap (i64)
277eb: JMP-short Jb (f64)
278ec: IN AL,DX
279ed: IN eAX,DX
280ee: OUT DX,AL
281ef: OUT DX,eAX
282# 0xf0 - 0xff
283f0: LOCK (Prefix)
284f1:
285f2: REPNE (Prefix)
286f3: REP/REPE (Prefix)
287f4: HLT
288f5: CMC
289f6: Grp3_1 Eb (1A)
290f7: Grp3_2 Ev (1A)
291f8: CLC
292f9: STC
293fa: CLI
294fb: STI
295fc: CLD
296fd: STD
297fe: Grp4 (1A)
298ff: Grp5 (1A)
299EndTable
300
301Table: 2-byte opcode (0x0f)
302Referrer: 2-byte escape
303AVXcode: 1
304# 0x0f 0x00-0x0f
30500: Grp6 (1A)
30601: Grp7 (1A)
30702: LAR Gv,Ew
30803: LSL Gv,Ew
30904:
31005: SYSCALL (o64)
31106: CLTS
31207: SYSRET (o64)
31308: INVD
31409: WBINVD
3150a:
3160b: UD2 (1B)
3170c:
3180d: NOP Ev | GrpP
3190e: FEMMS
320# 3DNow! uses the last imm byte as opcode extension.
3210f: 3DNow! Pq,Qq,Ib
322# 0x0f 0x10-0x1f
32310: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
32411: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
32512: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
32613: movlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
32714: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
32815: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
32916: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlhps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
33017: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
33118: Grp16 (1A)
33219:
3331a:
3341b:
3351c:
3361d:
3371e:
3381f: NOP Ev
339# 0x0f 0x20-0x2f
34020: MOV Rd,Cd
34121: MOV Rd,Dd
34222: MOV Cd,Rd
34323: MOV Dd,Rd
34424:
34525:
34626:
34727:
34828: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
34929: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
3502a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
3512b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
3522c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
3532d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Ppi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
3542e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
3552f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
356# 0x0f 0x30-0x3f
35730: WRMSR
35831: RDTSC
35932: RDMSR
36033: RDPMC
36134: SYSENTER
36235: SYSEXIT
36336:
36437: GETSEC
36538: escape # 3-byte escape 1
36639:
3673a: escape # 3-byte escape 2
3683b:
3693c:
3703d:
3713e:
3723f:
373# 0x0f 0x40-0x4f
37440: CMOVO Gv,Ev
37541: CMOVNO Gv,Ev
37642: CMOVB/C/NAE Gv,Ev
37743: CMOVAE/NB/NC Gv,Ev
37844: CMOVE/Z Gv,Ev
37945: CMOVNE/NZ Gv,Ev
38046: CMOVBE/NA Gv,Ev
38147: CMOVA/NBE Gv,Ev
38248: CMOVS Gv,Ev
38349: CMOVNS Gv,Ev
3844a: CMOVP/PE Gv,Ev
3854b: CMOVNP/PO Gv,Ev
3864c: CMOVL/NGE Gv,Ev
3874d: CMOVNL/GE Gv,Ev
3884e: CMOVLE/NG Gv,Ev
3894f: CMOVNLE/G Gv,Ev
390# 0x0f 0x50-0x5f
39150: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
39251: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
39352: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
39453: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
39554: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
39655: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
39756: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
39857: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
39958: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
40059: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
4015a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
4025b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
4035c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
4045d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
4055e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
4065f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
407# 0x0f 0x60-0x6f
40860: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
40961: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
41062: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
41163: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
41264: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
41365: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
41466: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
41567: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
41668: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
41769: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
4186a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
4196b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
4206c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
4216d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
4226e: movd/q Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
4236f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
424# 0x0f 0x70-0x7f
42570: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw Vdq,Wdq,Ib (F2),(VEX),(o128)
42671: Grp12 (1A)
42772: Grp13 (1A)
42873: Grp14 (1A)
42974: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
43075: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
43176: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
43277: emms/vzeroupper/vzeroall (VEX)
43378: VMREAD Ed/q,Gd/q
43479: VMWRITE Gd/q,Ed/q
4357a:
4367b:
4377c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
4387d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
4397e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
4407f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
441# 0x0f 0x80-0x8f
44280: JO Jz (f64)
44381: JNO Jz (f64)
44482: JB/JNAE/JC Jz (f64)
44583: JNB/JAE/JNC Jz (f64)
44684: JZ/JE Jz (f64)
44785: JNZ/JNE Jz (f64)
44886: JBE/JNA Jz (f64)
44987: JNBE/JA Jz (f64)
45088: JS Jz (f64)
45189: JNS Jz (f64)
4528a: JP/JPE Jz (f64)
4538b: JNP/JPO Jz (f64)
4548c: JL/JNGE Jz (f64)
4558d: JNL/JGE Jz (f64)
4568e: JLE/JNG Jz (f64)
4578f: JNLE/JG Jz (f64)
458# 0x0f 0x90-0x9f
45990: SETO Eb
46091: SETNO Eb
46192: SETB/C/NAE Eb
46293: SETAE/NB/NC Eb
46394: SETE/Z Eb
46495: SETNE/NZ Eb
46596: SETBE/NA Eb
46697: SETA/NBE Eb
46798: SETS Eb
46899: SETNS Eb
4699a: SETP/PE Eb
4709b: SETNP/PO Eb
4719c: SETL/NGE Eb
4729d: SETNL/GE Eb
4739e: SETLE/NG Eb
4749f: SETNLE/G Eb
475# 0x0f 0xa0-0xaf
476a0: PUSH FS (d64)
477a1: POP FS (d64)
478a2: CPUID
479a3: BT Ev,Gv
480a4: SHLD Ev,Gv,Ib
481a5: SHLD Ev,Gv,CL
482a6: GrpPDLK
483a7: GrpRNG
484a8: PUSH GS (d64)
485a9: POP GS (d64)
486aa: RSM
487ab: BTS Ev,Gv
488ac: SHRD Ev,Gv,Ib
489ad: SHRD Ev,Gv,CL
490ae: Grp15 (1A),(1C)
491af: IMUL Gv,Ev
492# 0x0f 0xb0-0xbf
493b0: CMPXCHG Eb,Gb
494b1: CMPXCHG Ev,Gv
495b2: LSS Gv,Mp
496b3: BTR Ev,Gv
497b4: LFS Gv,Mp
498b5: LGS Gv,Mp
499b6: MOVZX Gv,Eb
500b7: MOVZX Gv,Ew
501b8: JMPE | POPCNT Gv,Ev (F3)
502b9: Grp10 (1A)
503ba: Grp8 Ev,Ib (1A)
504bb: BTC Ev,Gv
505bc: BSF Gv,Ev
506bd: BSR Gv,Ev
507be: MOVSX Gv,Eb
508bf: MOVSX Gv,Ew
509# 0x0f 0xc0-0xcf
510c0: XADD Eb,Gb
511c1: XADD Ev,Gv
512c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
513c3: movnti Md/q,Gd/q
514c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
515c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
516c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
517c7: Grp9 (1A)
518c8: BSWAP RAX/EAX/R8/R8D
519c9: BSWAP RCX/ECX/R9/R9D
520ca: BSWAP RDX/EDX/R10/R10D
521cb: BSWAP RBX/EBX/R11/R11D
522cc: BSWAP RSP/ESP/R12/R12D
523cd: BSWAP RBP/EBP/R13/R13D
524ce: BSWAP RSI/ESI/R14/R14D
525cf: BSWAP RDI/EDI/R15/R15D
526# 0x0f 0xd0-0xdf
527d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
528d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
529d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
530d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
531d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
532d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
533d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
534d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
535d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
536d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
537da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
538db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
539dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
540dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
541de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
542df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
543# 0x0f 0xe0-0xef
544e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
545e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
546e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
547e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
548e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
549e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
550e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
551e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
552e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
553e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
554ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
555eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
556ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
557ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
558ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
559ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
560# 0x0f 0xf0-0xff
561f0: lddqu Vdq,Mdq (F2),(VEX)
562f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
563f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
564f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
565f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
566f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
567f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
568f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
569f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
570f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
571fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
572fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
573fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
574fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
575fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
576ff:
577EndTable
578
579Table: 3-byte opcode 1 (0x0f 0x38)
580Referrer: 3-byte escape 1
581AVXcode: 2
582# 0x0f 0x38 0x00-0x0f
58300: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
58401: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
58502: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
58603: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
58704: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
58805: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
58906: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
59007: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
59108: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
59209: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
5930a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
5940b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
5950c: vpermilps /r (66),(oVEX)
5960d: vpermilpd /r (66),(oVEX)
5970e: vtestps /r (66),(oVEX)
5980f: vtestpd /r (66),(oVEX)
599# 0x0f 0x38 0x10-0x1f
60010: pblendvb Vdq,Wdq (66)
60111:
60212:
60313:
60414: blendvps Vdq,Wdq (66)
60515: blendvpd Vdq,Wdq (66)
60616:
60717: ptest Vdq,Wdq (66),(VEX)
60818: vbroadcastss /r (66),(oVEX)
60919: vbroadcastsd /r (66),(oVEX),(o256)
6101a: vbroadcastf128 /r (66),(oVEX),(o256)
6111b:
6121c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
6131d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
6141e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
6151f:
616# 0x0f 0x38 0x20-0x2f
61720: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
61821: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
61922: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
62023: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
62124: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
62225: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
62326:
62427:
62528: pmuldq Vdq,Wdq (66),(VEX),(o128)
62629: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
6272a: movntdqa Vdq,Mdq (66),(VEX),(o128)
6282b: packusdw Vdq,Wdq (66),(VEX),(o128)
6292c: vmaskmovps(ld) /r (66),(oVEX)
6302d: vmaskmovpd(ld) /r (66),(oVEX)
6312e: vmaskmovps(st) /r (66),(oVEX)
6322f: vmaskmovpd(st) /r (66),(oVEX)
633# 0x0f 0x38 0x30-0x3f
63430: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
63531: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
63632: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
63733: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
63834: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
63935: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
64036:
64137: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
64238: pminsb Vdq,Wdq (66),(VEX),(o128)
64339: pminsd Vdq,Wdq (66),(VEX),(o128)
6443a: pminuw Vdq,Wdq (66),(VEX),(o128)
6453b: pminud Vdq,Wdq (66),(VEX),(o128)
6463c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
6473d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
6483e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
6493f: pmaxud Vdq,Wdq (66),(VEX),(o128)
650# 0x0f 0x38 0x40-0x8f
65140: pmulld Vdq,Wdq (66),(VEX),(o128)
65241: phminposuw Vdq,Wdq (66),(VEX),(o128)
65380: INVEPT Gd/q,Mdq (66)
65481: INVVPID Gd/q,Mdq (66)
655# 0x0f 0x38 0x90-0xbf (FMA)
65696: vfmaddsub132pd/ps /r (66),(VEX)
65797: vfmsubadd132pd/ps /r (66),(VEX)
65898: vfmadd132pd/ps /r (66),(VEX)
65999: vfmadd132sd/ss /r (66),(VEX),(o128)
6609a: vfmsub132pd/ps /r (66),(VEX)
6619b: vfmsub132sd/ss /r (66),(VEX),(o128)
6629c: vfnmadd132pd/ps /r (66),(VEX)
6639d: vfnmadd132sd/ss /r (66),(VEX),(o128)
6649e: vfnmsub132pd/ps /r (66),(VEX)
6659f: vfnmsub132sd/ss /r (66),(VEX),(o128)
666a6: vfmaddsub213pd/ps /r (66),(VEX)
667a7: vfmsubadd213pd/ps /r (66),(VEX)
668a8: vfmadd213pd/ps /r (66),(VEX)
669a9: vfmadd213sd/ss /r (66),(VEX),(o128)
670aa: vfmsub213pd/ps /r (66),(VEX)
671ab: vfmsub213sd/ss /r (66),(VEX),(o128)
672ac: vfnmadd213pd/ps /r (66),(VEX)
673ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
674ae: vfnmsub213pd/ps /r (66),(VEX)
675af: vfnmsub213sd/ss /r (66),(VEX),(o128)
676b6: vfmaddsub231pd/ps /r (66),(VEX)
677b7: vfmsubadd231pd/ps /r (66),(VEX)
678b8: vfmadd231pd/ps /r (66),(VEX)
679b9: vfmadd231sd/ss /r (66),(VEX),(o128)
680ba: vfmsub231pd/ps /r (66),(VEX)
681bb: vfmsub231sd/ss /r (66),(VEX),(o128)
682bc: vfnmadd231pd/ps /r (66),(VEX)
683bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
684be: vfnmsub231pd/ps /r (66),(VEX)
685bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
686# 0x0f 0x38 0xc0-0xff
687db: aesimc Vdq,Wdq (66),(VEX),(o128)
688dc: aesenc Vdq,Wdq (66),(VEX),(o128)
689dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
690de: aesdec Vdq,Wdq (66),(VEX),(o128)
691df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
692f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
693f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
694EndTable
695
696Table: 3-byte opcode 2 (0x0f 0x3a)
697Referrer: 3-byte escape 2
698AVXcode: 3
699# 0x0f 0x3a 0x00-0xff
70004: vpermilps /r,Ib (66),(oVEX)
70105: vpermilpd /r,Ib (66),(oVEX)
70206: vperm2f128 /r,Ib (66),(oVEX),(o256)
70308: roundps Vdq,Wdq,Ib (66),(VEX)
70409: roundpd Vdq,Wdq,Ib (66),(VEX)
7050a: roundss Vss,Wss,Ib (66),(VEX),(o128)
7060b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
7070c: blendps Vdq,Wdq,Ib (66),(VEX)
7080d: blendpd Vdq,Wdq,Ib (66),(VEX)
7090e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
7100f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
71114: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
71215: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
71316: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
71417: extractps Ed,Vdq,Ib (66),(VEX),(o128)
71518: vinsertf128 /r,Ib (66),(oVEX),(o256)
71619: vextractf128 /r,Ib (66),(oVEX),(o256)
71720: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
71821: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
71922: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
72040: dpps Vdq,Wdq,Ib (66),(VEX)
72141: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
72242: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
72344: pclmulqdq Vdq,Wdq,Ib (66),(VEX),(o128)
7244a: vblendvps /r,Ib (66),(oVEX)
7254b: vblendvpd /r,Ib (66),(oVEX)
7264c: vpblendvb /r,Ib (66),(oVEX),(o128)
72760: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
72861: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
72962: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
73063: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
731df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
732EndTable
733
734GrpTable: Grp1
7350: ADD
7361: OR
7372: ADC
7383: SBB
7394: AND
7405: SUB
7416: XOR
7427: CMP
743EndTable
744
745GrpTable: Grp1A
7460: POP
747EndTable
748
749GrpTable: Grp2
7500: ROL
7511: ROR
7522: RCL
7533: RCR
7544: SHL/SAL
7555: SHR
7566:
7577: SAR
758EndTable
759
760GrpTable: Grp3_1
7610: TEST Eb,Ib
7621:
7632: NOT Eb
7643: NEG Eb
7654: MUL AL,Eb
7665: IMUL AL,Eb
7676: DIV AL,Eb
7687: IDIV AL,Eb
769EndTable
770
771GrpTable: Grp3_2
7720: TEST Ev,Iz
7731:
7742: NOT Ev
7753: NEG Ev
7764: MUL rAX,Ev
7775: IMUL rAX,Ev
7786: DIV rAX,Ev
7797: IDIV rAX,Ev
780EndTable
781
782GrpTable: Grp4
7830: INC Eb
7841: DEC Eb
785EndTable
786
787GrpTable: Grp5
7880: INC Ev
7891: DEC Ev
7902: CALLN Ev (f64)
7913: CALLF Ep
7924: JMPN Ev (f64)
7935: JMPF Ep
7946: PUSH Ev (d64)
7957:
796EndTable
797
798GrpTable: Grp6
7990: SLDT Rv/Mw
8001: STR Rv/Mw
8012: LLDT Ew
8023: LTR Ew
8034: VERR Ew
8045: VERW Ew
805EndTable
806
807GrpTable: Grp7
8080: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
8091: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
8102: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
8113: LIDT Ms
8124: SMSW Mw/Rv
8135:
8146: LMSW Ew
8157: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
816EndTable
817
818GrpTable: Grp8
8194: BT
8205: BTS
8216: BTR
8227: BTC
823EndTable
824
825GrpTable: Grp9
8261: CMPXCHG8B/16B Mq/Mdq
8276: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
8287: VMPTRST Mq
829EndTable
830
831GrpTable: Grp10
832EndTable
833
834GrpTable: Grp11
8350: MOV
836EndTable
837
838GrpTable: Grp12
8392: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
8404: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
8416: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
842EndTable
843
844GrpTable: Grp13
8452: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
8464: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
8476: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
848EndTable
849
850GrpTable: Grp14
8512: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
8523: psrldq Udq,Ib (66),(11B),(VEX),(o128)
8536: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
8547: pslldq Udq,Ib (66),(11B),(VEX),(o128)
855EndTable
856
857GrpTable: Grp15
8580: fxsave
8591: fxrstor
8602: ldmxcsr (VEX)
8613: stmxcsr (VEX)
8624: XSAVE
8635: XRSTOR | lfence (11B)
8646: mfence (11B)
8657: clflush | sfence (11B)
866EndTable
867
868GrpTable: Grp16
8690: prefetch NTA
8701: prefetch T0
8712: prefetch T1
8723: prefetch T2
873EndTable
874
875# AMD's Prefetch Group
876GrpTable: GrpP
8770: PREFETCH
8781: PREFETCHW
879EndTable
880
881GrpTable: GrpPDLK
8820: MONTMUL
8831: XSHA1
8842: XSHA2
885EndTable
886
887GrpTable: GrpRNG
8880: xstore-rng
8891: xcrypt-ecb
8902: xcrypt-cbc
8914: xcrypt-cfb
8925: xcrypt-ofb
893EndTable
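This table is consumed by a build-time generator that emits the instruction
attribute tables for the in-kernel decoder. As a rough illustration of the
entry format described in the header comment, a standalone parser for a
single line could look like this (plain C sketch, not the kernel's actual
generator script):

	#include <stdio.h>
	#include <string.h>

	/* Sketch: split an entry such as "15: ADC rAX,Iz" into its opcode
	 * byte and the '|'-separated mnemonic variants. */
	static void parse_entry(const char *line)
	{
		unsigned int opcode;
		char rest[256], *variant, *save;

		if (sscanf(line, "%x: %255[^\n]", &opcode, rest) != 2)
			return;	/* comment, blank line, or table directive */

		for (variant = strtok_r(rest, "|", &save); variant;
		     variant = strtok_r(NULL, "|", &save))
			printf("opcode %#04x -> %s\n", opcode, variant);
	}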
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61b41ca3b5a2..d0474ad2a6e5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs)
35 35
36 return 0; 36 return 0;
37} 37}
38
39#ifdef CONFIG_X86_64
40/*
41 * Need to define our own search_extable on X86_64 to work around
42 * a B stepping K8 bug.
43 */
44const struct exception_table_entry *
45search_extable(const struct exception_table_entry *first,
46 const struct exception_table_entry *last,
47 unsigned long value)
48{
49 /* B stepping K8 bug */
50 if ((value >> 32) == 0)
51 value |= 0xffffffffUL << 32;
52
53 while (first <= last) {
54 const struct exception_table_entry *mid;
55 long diff;
56
57 mid = (last - first) / 2 + first;
58 diff = mid->insn - value;
59 if (diff == 0)
60 return mid;
61 else if (diff < 0)
62 first = mid+1;
63 else
64 last = mid-1;
65 }
66 return NULL;
67}
68#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee9028cf0..f62777940dfb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
38 * Returns 0 if mmiotrace is disabled, or if the fault is not 38 * Returns 0 if mmiotrace is disabled, or if the fault is not
39 * handled by mmiotrace: 39 * handled by mmiotrace:
40 */ 40 */
41static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 41static inline int __kprobes
42kmmio_fault(struct pt_regs *regs, unsigned long addr)
42{ 43{
43 if (unlikely(is_kmmio_active())) 44 if (unlikely(is_kmmio_active()))
44 if (kmmio_handler(regs, addr) == 1) 45 if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
46 return 0; 47 return 0;
47} 48}
48 49
49static inline int notify_page_fault(struct pt_regs *regs) 50static inline int __kprobes notify_page_fault(struct pt_regs *regs)
50{ 51{
51 int ret = 0; 52 int ret = 0;
52 53
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
240 * 241 *
241 * Handle a fault on the vmalloc or module mapping area 242 * Handle a fault on the vmalloc or module mapping area
242 */ 243 */
243static noinline int vmalloc_fault(unsigned long address) 244static noinline __kprobes int vmalloc_fault(unsigned long address)
244{ 245{
245 unsigned long pgd_paddr; 246 unsigned long pgd_paddr;
246 pmd_t *pmd_k; 247 pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
357 * 358 *
358 * This assumes no large pages in there. 359 * This assumes no large pages in there.
359 */ 360 */
360static noinline int vmalloc_fault(unsigned long address) 361static noinline __kprobes int vmalloc_fault(unsigned long address)
361{ 362{
362 pgd_t *pgd, *pgd_ref; 363 pgd_t *pgd, *pgd_ref;
363 pud_t *pud, *pud_ref; 364 pud_t *pud, *pud_ref;
@@ -658,7 +659,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
658 show_fault_oops(regs, error_code, address); 659 show_fault_oops(regs, error_code, address);
659 660
660 stackend = end_of_stack(tsk); 661 stackend = end_of_stack(tsk);
661 if (*stackend != STACK_END_MAGIC) 662 if (tsk != &init_task && *stackend != STACK_END_MAGIC)
662 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 663 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
663 664
664 tsk->thread.cr2 = address; 665 tsk->thread.cr2 = address;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
860 * There are no security implications to leaving a stale TLB when 861 * There are no security implications to leaving a stale TLB when
861 * increasing the permissions on a page. 862 * increasing the permissions on a page.
862 */ 863 */
863static noinline int 864static noinline __kprobes int
864spurious_fault(unsigned long error_code, unsigned long address) 865spurious_fault(unsigned long error_code, unsigned long address)
865{ 866{
866 pgd_t *pgd; 867 pgd_t *pgd;
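The recurring change in this file is the __kprobes annotation on every
function reachable from the page-fault path, so that a probe placed on one
of them cannot fault recursively. In outline, the generic machinery works
like this (a sketch with a hypothetical helper name, not part of this patch):

	/* __kprobes places the function in a dedicated text section ... */
	#define __kprobes	__attribute__((__section__(".kprobes.text")))

	/* ... which the kprobes core checks before arming a probe: */
	static int addr_in_kprobes_text(unsigned long addr)	/* hypothetical */
	{
		return addr >= (unsigned long)__kprobes_text_start &&
		       addr <  (unsigned long)__kprobes_text_end;
	}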
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c973f8e2a6cf..9a0c258a86be 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -892,8 +892,7 @@ void __init mem_init(void)
892 reservedpages << (PAGE_SHIFT-10), 892 reservedpages << (PAGE_SHIFT-10),
893 datasize >> 10, 893 datasize >> 10,
894 initsize >> 10, 894 initsize >> 10,
895 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 895 totalhigh_pages << (PAGE_SHIFT-10));
896 );
897 896
898 printk(KERN_INFO "virtual kernel memory layout:\n" 897 printk(KERN_INFO "virtual kernel memory layout:\n"
899 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 898 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5198b9bb34ef..69ddfbd91135 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -49,6 +49,7 @@
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h> 51#include <asm/init.h>
52#include <linux/bootmem.h>
52 53
53static unsigned long dma_reserve __initdata; 54static unsigned long dma_reserve __initdata;
54 55
@@ -616,6 +617,21 @@ void __init paging_init(void)
616 */ 617 */
617#ifdef CONFIG_MEMORY_HOTPLUG 618#ifdef CONFIG_MEMORY_HOTPLUG
618/* 619/*
620 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
621 * updating.
622 */
623static void update_end_of_memory_vars(u64 start, u64 size)
624{
625 unsigned long end_pfn = PFN_UP(start + size);
626
627 if (end_pfn > max_pfn) {
628 max_pfn = end_pfn;
629 max_low_pfn = end_pfn;
630 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
631 }
632}
633
634/*
619 * Memory is added always to NORMAL zone. This means you will never get 635 * Memory is added always to NORMAL zone. This means you will never get
620 * additional DMA/DMA32 memory. 636 * additional DMA/DMA32 memory.
621 */ 637 */
@@ -634,6 +650,9 @@ int arch_add_memory(int nid, u64 start, u64 size)
634 ret = __add_pages(nid, zone, start_pfn, nr_pages); 650 ret = __add_pages(nid, zone, start_pfn, nr_pages);
635 WARN_ON_ONCE(ret); 651 WARN_ON_ONCE(ret);
636 652
653 /* update max_pfn, max_low_pfn and high_memory */
654 update_end_of_memory_vars(start, size);
655
637 return ret; 656 return ret;
638} 657}
639EXPORT_SYMBOL_GPL(arch_add_memory); 658EXPORT_SYMBOL_GPL(arch_add_memory);
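A quick worked example of update_end_of_memory_vars() with 4 KiB pages
(illustrative numbers only): hot-adding 1 GiB at the 4 GiB boundary gives

	start   = 0x100000000;		/* 4 GiB  */
	size    = 0x040000000;		/* 1 GiB  */
	end_pfn = PFN_UP(start + size);	/* (0x140000000 + 0xfff) >> 12 = 0x140000 */
	/* max_pfn and max_low_pfn grow to 0x140000; high_memory lands at
	 * __va(0x140000000), one byte past the last hot-added page. */

so later users of max_pfn and high_memory see the hot-added range.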
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 1bf9e08ed733..e404ffe30210 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -133,8 +133,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
133 (unsigned long long)phys_addr, 133 (unsigned long long)phys_addr,
134 (unsigned long long)(phys_addr + size), 134 (unsigned long long)(phys_addr + size),
135 prot_val, new_prot_val); 135 prot_val, new_prot_val);
136 free_memtype(phys_addr, phys_addr + size); 136 goto err_free_memtype;
137 return NULL;
138 } 137 }
139 prot_val = new_prot_val; 138 prot_val = new_prot_val;
140 } 139 }
@@ -160,26 +159,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
160 */ 159 */
161 area = get_vm_area_caller(size, VM_IOREMAP, caller); 160 area = get_vm_area_caller(size, VM_IOREMAP, caller);
162 if (!area) 161 if (!area)
163 return NULL; 162 goto err_free_memtype;
164 area->phys_addr = phys_addr; 163 area->phys_addr = phys_addr;
165 vaddr = (unsigned long) area->addr; 164 vaddr = (unsigned long) area->addr;
166 165
167 if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { 166 if (kernel_map_sync_memtype(phys_addr, size, prot_val))
168 free_memtype(phys_addr, phys_addr + size); 167 goto err_free_area;
169 free_vm_area(area);
170 return NULL;
171 }
172 168
173 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { 169 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
174 free_memtype(phys_addr, phys_addr + size); 170 goto err_free_area;
175 free_vm_area(area);
176 return NULL;
177 }
178 171
179 ret_addr = (void __iomem *) (vaddr + offset); 172 ret_addr = (void __iomem *) (vaddr + offset);
180 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 173 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
181 174
182 return ret_addr; 175 return ret_addr;
176err_free_area:
177 free_vm_area(area);
178err_free_memtype:
179 free_memtype(phys_addr, phys_addr + size);
180 return NULL;
183} 181}
184 182
185/** 183/**
@@ -246,30 +244,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
246} 244}
247EXPORT_SYMBOL(ioremap_cache); 245EXPORT_SYMBOL(ioremap_cache);
248 246
249static void __iomem *ioremap_default(resource_size_t phys_addr,
250 unsigned long size)
251{
252 unsigned long flags;
253 void __iomem *ret;
254 int err;
255
256 /*
257 * - WB for WB-able memory and no other conflicting mappings
258 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
259 * - Inherit from conflicting mappings otherwise
260 */
261 err = reserve_memtype(phys_addr, phys_addr + size,
262 _PAGE_CACHE_WB, &flags);
263 if (err < 0)
264 return NULL;
265
266 ret = __ioremap_caller(phys_addr, size, flags,
267 __builtin_return_address(0));
268
269 free_memtype(phys_addr, phys_addr + size);
270 return ret;
271}
272
273void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, 247void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
274 unsigned long prot_val) 248 unsigned long prot_val)
275{ 249{
@@ -345,7 +319,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
345 if (page_is_ram(start >> PAGE_SHIFT)) 319 if (page_is_ram(start >> PAGE_SHIFT))
346 return __va(phys); 320 return __va(phys);
347 321
348 addr = (void __force *)ioremap_default(start, PAGE_SIZE); 322 addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
349 if (addr) 323 if (addr)
350 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 324 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
351 325
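The __ioremap_caller() rework above is the standard goto-unwind idiom: each
failure point jumps to a label that releases exactly what has been acquired
so far, in reverse order, so the cleanup code exists only once. The shape of
the pattern, with hypothetical helpers:

	/* Sketch only: acquire_a/acquire_b/configure are placeholders. */
	void *setup(void)
	{
		void *a = acquire_a();
		void *b;

		if (!a)
			return NULL;
		b = acquire_b();
		if (!b)
			goto err_release_a;
		if (configure(a, b) < 0)
			goto err_release_b;
		return b;

	err_release_b:
		release_b(b);
	err_release_a:
		release_a(a);
		return NULL;
	}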
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 4901d0dafda6..af3b6c8a436f 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -106,26 +106,25 @@ void kmemcheck_error_recall(void)
106 106
107 switch (e->type) { 107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS: 108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " 109 printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ? 110 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)", 111 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address); 112 (void *) e->address);
114 113
115 printk(KERN_INFO); 114 printk(KERN_WARNING);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i) 115 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]); 116 printk(KERN_CONT "%02x", e->memory_copy[i]);
118 printk("\n"); 117 printk(KERN_CONT "\n");
119 118
120 printk(KERN_INFO); 119 printk(KERN_WARNING);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) { 120 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) 121 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]); 122 printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
124 else 123 else
125 printk(" ?"); 124 printk(KERN_CONT " ?");
126 } 125 }
127 printk("\n"); 126 printk(KERN_CONT "\n");
128 printk(KERN_INFO "%*c\n", 2 + 2 127 printk(KERN_WARNING "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); 128 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break; 129 break;
131 case KMEMCHECK_ERROR_BUG: 130 case KMEMCHECK_ERROR_BUG:
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..536fb6823366 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -5,6 +5,8 @@
5 * 2008 Pekka Paalanen <pq@iki.fi> 5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/list.h> 10#include <linux/list.h>
9#include <linux/rculist.h> 11#include <linux/rculist.h>
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
@@ -136,7 +138,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
136 pte_t *pte = lookup_address(f->page, &level); 138 pte_t *pte = lookup_address(f->page, &level);
137 139
138 if (!pte) { 140 if (!pte) {
139 pr_err("kmmio: no pte for page 0x%08lx\n", f->page); 141 pr_err("no pte for page 0x%08lx\n", f->page);
140 return -1; 142 return -1;
141 } 143 }
142 144
@@ -148,7 +150,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
148 clear_pte_presence(pte, clear, &f->old_presence); 150 clear_pte_presence(pte, clear, &f->old_presence);
149 break; 151 break;
150 default: 152 default:
151 pr_err("kmmio: unexpected page level 0x%x.\n", level); 153 pr_err("unexpected page level 0x%x.\n", level);
152 return -1; 154 return -1;
153 } 155 }
154 156
@@ -170,13 +172,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
170static int arm_kmmio_fault_page(struct kmmio_fault_page *f) 172static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
171{ 173{
172 int ret; 174 int ret;
173 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); 175 WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
174 if (f->armed) { 176 if (f->armed) {
175 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", 177 pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
176 f->page, f->count, !!f->old_presence); 178 f->page, f->count, !!f->old_presence);
177 } 179 }
178 ret = clear_page_presence(f, true); 180 ret = clear_page_presence(f, true);
179 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); 181 WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
182 f->page);
180 f->armed = true; 183 f->armed = true;
181 return ret; 184 return ret;
182} 185}
@@ -203,7 +206,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
203 */ 206 */
204/* 207/*
205 * Interrupts are disabled on entry as trap3 is an interrupt gate 208 * Interrupts are disabled on entry as trap3 is an interrupt gate
206 * and they remain disabled thorough out this function. 209 * and they remain disabled throughout this function.
207 */ 210 */
208int kmmio_handler(struct pt_regs *regs, unsigned long addr) 211int kmmio_handler(struct pt_regs *regs, unsigned long addr)
209{ 212{
@@ -240,24 +243,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
240 * condition needs handling by do_page_fault(), the 243 * condition needs handling by do_page_fault(), the
241 * page really not being present is the most common. 244 * page really not being present is the most common.
242 */ 245 */
243 pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", 246 pr_debug("secondary hit for 0x%08lx CPU %d.\n",
244 addr, smp_processor_id()); 247 addr, smp_processor_id());
245 248
246 if (!faultpage->old_presence) 249 if (!faultpage->old_presence)
247 pr_info("kmmio: unexpected secondary hit for " 250 pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
248 "address 0x%08lx on CPU %d.\n", addr, 251 addr, smp_processor_id());
249 smp_processor_id());
250 } else { 252 } else {
251 /* 253 /*
252 * Prevent overwriting already in-flight context. 254 * Prevent overwriting already in-flight context.
253 * This should not happen, let's hope disarming at 255 * This should not happen, let's hope disarming at
254 * least prevents a panic. 256 * least prevents a panic.
255 */ 257 */
256 pr_emerg("kmmio: recursive probe hit on CPU %d, " 258 pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
257 "for address 0x%08lx. Ignoring.\n", 259 smp_processor_id(), addr);
258 smp_processor_id(), addr); 260 pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
259 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
260 ctx->addr);
261 disarm_kmmio_fault_page(faultpage); 261 disarm_kmmio_fault_page(faultpage);
262 } 262 }
263 goto no_kmmio_ctx; 263 goto no_kmmio_ctx;
@@ -302,7 +302,7 @@ no_kmmio:
302 302
303/* 303/*
304 * Interrupts are disabled on entry as trap1 is an interrupt gate 304 * Interrupts are disabled on entry as trap1 is an interrupt gate
305 * and they remain disabled thorough out this function. 305 * and they remain disabled throughout this function.
306 * This must always get called as the pair to kmmio_handler(). 306 * This must always get called as the pair to kmmio_handler().
307 */ 307 */
308static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) 308static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
@@ -316,8 +316,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
316 * something external causing them (f.e. using a debugger while 316 * something external causing them (f.e. using a debugger while
317 * mmio tracing enabled), or erroneous behaviour 317 * mmio tracing enabled), or erroneous behaviour
318 */ 318 */
319 pr_warning("kmmio: unexpected debug trap on CPU %d.\n", 319 pr_warning("unexpected debug trap on CPU %d.\n",
320 smp_processor_id()); 320 smp_processor_id());
321 goto out; 321 goto out;
322 } 322 }
323 323
@@ -425,7 +425,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
425 list_add_rcu(&p->list, &kmmio_probes); 425 list_add_rcu(&p->list, &kmmio_probes);
426 while (size < size_lim) { 426 while (size < size_lim) {
427 if (add_kmmio_fault_page(p->addr + size)) 427 if (add_kmmio_fault_page(p->addr + size))
428 pr_err("kmmio: Unable to set page fault.\n"); 428 pr_err("Unable to set page fault.\n");
429 size += PAGE_SIZE; 429 size += PAGE_SIZE;
430 } 430 }
431out: 431out:
@@ -490,7 +490,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
490 * 2. remove_kmmio_fault_pages() 490 * 2. remove_kmmio_fault_pages()
491 * Remove the pages from kmmio_page_table. 491 * Remove the pages from kmmio_page_table.
492 * 3. rcu_free_kmmio_fault_pages() 492 * 3. rcu_free_kmmio_fault_pages()
493 * Actally free the kmmio_fault_page structs as with RCU. 493 * Actually free the kmmio_fault_page structs as with RCU.
494 */ 494 */
495void unregister_kmmio_probe(struct kmmio_probe *p) 495void unregister_kmmio_probe(struct kmmio_probe *p)
496{ 496{
@@ -511,7 +511,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
511 511
512 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 512 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
513 if (!drelease) { 513 if (!drelease) {
514 pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); 514 pr_crit("leaking kmmio_fault_page objects.\n");
515 return; 515 return;
516 } 516 }
517 drelease->release_list = release_list; 517 drelease->release_list = release_list;
@@ -538,10 +538,17 @@ static int
538kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) 538kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
539{ 539{
540 struct die_args *arg = args; 540 struct die_args *arg = args;
541 unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
541 542
542 if (val == DIE_DEBUG && (arg->err & DR_STEP)) 543 if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
543 if (post_kmmio_handler(arg->err, arg->regs) == 1) 544 if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
545 /*
546 * Reset the BS bit in dr6 (pointed to by args->err) to
547 * denote completion of processing
548 */
549 *dr6_p &= ~DR_STEP;
544 return NOTIFY_STOP; 550 return NOTIFY_STOP;
551 }
545 552
546 return NOTIFY_DONE; 553 return NOTIFY_DONE;
547} 554}
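The pr_fmt() define at the top of the file is what allows the literal
"kmmio: " prefixes to be dropped from the individual messages: the pr_*()
macros paste pr_fmt() in front of the format string at compile time.
Conceptually (simplified from the generic kernel.h definitions):

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
	#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)

	pr_err("no pte for page 0x%08lx\n", f->page);
	/* -> printk(KERN_ERR "kmmio" ": " "no pte for page 0x%08lx\n", ...) */

WARN_ONCE() does not route its message through pr_fmt(), which is why the
patch spells pr_fmt(...) out explicitly at the two WARN_ONCE() call sites.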
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 132772a8ec57..34a3291ca103 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -19,6 +19,9 @@
19 * 19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi. 20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */ 21 */
22
23#define pr_fmt(fmt) "mmiotrace: " fmt
24
22#define DEBUG 1 25#define DEBUG 1
23 26
24#include <linux/module.h> 27#include <linux/module.h>
@@ -36,8 +39,6 @@
36 39
37#include "pf_in.h" 40#include "pf_in.h"
38 41
39#define NAME "mmiotrace: "
40
41struct trap_reason { 42struct trap_reason {
42 unsigned long addr; 43 unsigned long addr;
43 unsigned long ip; 44 unsigned long ip;
@@ -96,17 +97,18 @@ static void print_pte(unsigned long address)
96 pte_t *pte = lookup_address(address, &level); 97 pte_t *pte = lookup_address(address, &level);
97 98
98 if (!pte) { 99 if (!pte) {
99 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", 100 pr_err("Error in %s: no pte for page 0x%08lx\n",
100 __func__, address); 101 __func__, address);
101 return; 102 return;
102 } 103 }
103 104
104 if (level == PG_LEVEL_2M) { 105 if (level == PG_LEVEL_2M) {
105 pr_emerg(NAME "4MB pages are not currently supported: " 106 pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
106 "0x%08lx\n", address); 107 address);
107 BUG(); 108 BUG();
108 } 109 }
109 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, 110 pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
111 address,
110 (unsigned long long)pte_val(*pte), 112 (unsigned long long)pte_val(*pte),
111 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); 113 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
112} 114}
@@ -118,22 +120,21 @@ static void print_pte(unsigned long address)
118static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) 120static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
119{ 121{
120 const struct trap_reason *my_reason = &get_cpu_var(pf_reason); 122 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
121 pr_emerg(NAME "unexpected fault for address: 0x%08lx, " 123 pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
122 "last fault for address: 0x%08lx\n", 124 addr, my_reason->addr);
123 addr, my_reason->addr);
124 print_pte(addr); 125 print_pte(addr);
125 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); 126 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
126 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); 127 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
127#ifdef __i386__ 128#ifdef __i386__
128 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", 129 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
129 regs->ax, regs->bx, regs->cx, regs->dx); 130 regs->ax, regs->bx, regs->cx, regs->dx);
130 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 131 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
131 regs->si, regs->di, regs->bp, regs->sp); 132 regs->si, regs->di, regs->bp, regs->sp);
132#else 133#else
133 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", 134 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
134 regs->ax, regs->cx, regs->dx); 135 regs->ax, regs->cx, regs->dx);
135 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", 136 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
136 regs->si, regs->di, regs->bp, regs->sp); 137 regs->si, regs->di, regs->bp, regs->sp);
137#endif 138#endif
138 put_cpu_var(pf_reason); 139 put_cpu_var(pf_reason);
139 BUG(); 140 BUG();
@@ -213,7 +214,7 @@ static void post(struct kmmio_probe *p, unsigned long condition,
213 /* this should always return the active_trace count to 0 */ 214 /* this should always return the active_trace count to 0 */
214 my_reason->active_traces--; 215 my_reason->active_traces--;
215 if (my_reason->active_traces) { 216 if (my_reason->active_traces) {
216 pr_emerg(NAME "unexpected post handler"); 217 pr_emerg("unexpected post handler");
217 BUG(); 218 BUG();
218 } 219 }
219 220
@@ -244,7 +245,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
244 }; 245 };
245 246
246 if (!trace) { 247 if (!trace) {
247 pr_err(NAME "kmalloc failed in ioremap\n"); 248 pr_err("kmalloc failed in ioremap\n");
248 return; 249 return;
249 } 250 }
250 251
@@ -282,8 +283,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
282 if (!is_enabled()) /* recheck and proper locking in *_core() */ 283 if (!is_enabled()) /* recheck and proper locking in *_core() */
283 return; 284 return;
284 285
285 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", 286 pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
286 (unsigned long long)offset, size, addr); 287 (unsigned long long)offset, size, addr);
287 if ((filter_offset) && (offset != filter_offset)) 288 if ((filter_offset) && (offset != filter_offset))
288 return; 289 return;
289 ioremap_trace_core(offset, size, addr); 290 ioremap_trace_core(offset, size, addr);
@@ -301,7 +302,7 @@ static void iounmap_trace_core(volatile void __iomem *addr)
301 struct remap_trace *tmp; 302 struct remap_trace *tmp;
302 struct remap_trace *found_trace = NULL; 303 struct remap_trace *found_trace = NULL;
303 304
304 pr_debug(NAME "Unmapping %p.\n", addr); 305 pr_debug("Unmapping %p.\n", addr);
305 306
306 spin_lock_irq(&trace_lock); 307 spin_lock_irq(&trace_lock);
307 if (!is_enabled()) 308 if (!is_enabled())
@@ -363,9 +364,8 @@ static void clear_trace_list(void)
363 * Caller also ensures is_enabled() cannot change. 364 * Caller also ensures is_enabled() cannot change.
364 */ 365 */
365 list_for_each_entry(trace, &trace_list, list) { 366 list_for_each_entry(trace, &trace_list, list) {
366 pr_notice(NAME "purging non-iounmapped " 367 pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
367 "trace @0x%08lx, size 0x%lx.\n", 368 trace->probe.addr, trace->probe.len);
368 trace->probe.addr, trace->probe.len);
369 if (!nommiotrace) 369 if (!nommiotrace)
370 unregister_kmmio_probe(&trace->probe); 370 unregister_kmmio_probe(&trace->probe);
371 } 371 }
@@ -387,7 +387,7 @@ static void enter_uniprocessor(void)
387 387
388 if (downed_cpus == NULL && 388 if (downed_cpus == NULL &&
389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { 389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
390 pr_notice(NAME "Failed to allocate mask\n"); 390 pr_notice("Failed to allocate mask\n");
391 goto out; 391 goto out;
392 } 392 }
393 393
@@ -395,20 +395,19 @@ static void enter_uniprocessor(void)
395 cpumask_copy(downed_cpus, cpu_online_mask); 395 cpumask_copy(downed_cpus, cpu_online_mask);
396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); 396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
397 if (num_online_cpus() > 1) 397 if (num_online_cpus() > 1)
398 pr_notice(NAME "Disabling non-boot CPUs...\n"); 398 pr_notice("Disabling non-boot CPUs...\n");
399 put_online_cpus(); 399 put_online_cpus();
400 400
401 for_each_cpu(cpu, downed_cpus) { 401 for_each_cpu(cpu, downed_cpus) {
402 err = cpu_down(cpu); 402 err = cpu_down(cpu);
403 if (!err) 403 if (!err)
404 pr_info(NAME "CPU%d is down.\n", cpu); 404 pr_info("CPU%d is down.\n", cpu);
405 else 405 else
406 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); 406 pr_err("Error taking CPU%d down: %d\n", cpu, err);
407 } 407 }
408out: 408out:
409 if (num_online_cpus() > 1) 409 if (num_online_cpus() > 1)
410 pr_warning(NAME "multiple CPUs still online, " 410 pr_warning("multiple CPUs still online, may miss events.\n");
411 "may miss events.\n");
412} 411}
413 412
414/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, 413/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
@@ -420,13 +419,13 @@ static void __ref leave_uniprocessor(void)
420 419
421 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) 420 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
422 return; 421 return;
423 pr_notice(NAME "Re-enabling CPUs...\n"); 422 pr_notice("Re-enabling CPUs...\n");
424 for_each_cpu(cpu, downed_cpus) { 423 for_each_cpu(cpu, downed_cpus) {
425 err = cpu_up(cpu); 424 err = cpu_up(cpu);
426 if (!err) 425 if (!err)
427 pr_info(NAME "enabled CPU%d.\n", cpu); 426 pr_info("enabled CPU%d.\n", cpu);
428 else 427 else
429 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); 428 pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
430 } 429 }
431} 430}
432 431
@@ -434,8 +433,8 @@ static void __ref leave_uniprocessor(void)
434static void enter_uniprocessor(void) 433static void enter_uniprocessor(void)
435{ 434{
436 if (num_online_cpus() > 1) 435 if (num_online_cpus() > 1)
437 pr_warning(NAME "multiple CPUs are online, may miss events. " 436 pr_warning("multiple CPUs are online, may miss events. "
438 "Suggest booting with maxcpus=1 kernel argument.\n"); 437 "Suggest booting with maxcpus=1 kernel argument.\n");
439} 438}
440 439
441static void leave_uniprocessor(void) 440static void leave_uniprocessor(void)
@@ -450,13 +449,13 @@ void enable_mmiotrace(void)
450 goto out; 449 goto out;
451 450
452 if (nommiotrace) 451 if (nommiotrace)
453 pr_info(NAME "MMIO tracing disabled.\n"); 452 pr_info("MMIO tracing disabled.\n");
454 kmmio_init(); 453 kmmio_init();
455 enter_uniprocessor(); 454 enter_uniprocessor();
456 spin_lock_irq(&trace_lock); 455 spin_lock_irq(&trace_lock);
457 atomic_inc(&mmiotrace_enabled); 456 atomic_inc(&mmiotrace_enabled);
458 spin_unlock_irq(&trace_lock); 457 spin_unlock_irq(&trace_lock);
459 pr_info(NAME "enabled.\n"); 458 pr_info("enabled.\n");
460out: 459out:
461 mutex_unlock(&mmiotrace_mutex); 460 mutex_unlock(&mmiotrace_mutex);
462} 461}
@@ -475,7 +474,7 @@ void disable_mmiotrace(void)
475 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 474 clear_trace_list(); /* guarantees: no more kmmio callbacks */
476 leave_uniprocessor(); 475 leave_uniprocessor();
477 kmmio_cleanup(); 476 kmmio_cleanup();
478 pr_info(NAME "disabled.\n"); 477 pr_info("disabled.\n");
479out: 478out:
480 mutex_unlock(&mmiotrace_mutex); 479 mutex_unlock(&mmiotrace_mutex);
481} 480}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index a81b7e73275d..ae9648eb1c7f 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -356,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
356 * - _PAGE_CACHE_UC_MINUS 356 * - _PAGE_CACHE_UC_MINUS
357 * - _PAGE_CACHE_UC 357 * - _PAGE_CACHE_UC
358 * 358 *
359 * req_type will have a special case value '-1', when requester want to inherit
360 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
361 *
362 * If new_type is NULL, function will return an error if it cannot reserve the 359 * If new_type is NULL, function will return an error if it cannot reserve the
363 * region with req_type. If new_type is non-NULL, function will return 360 * region with req_type. If new_type is non-NULL, function will return
364 * available type in new_type in case of no error. In case of any error 361 * available type in new_type in case of no error. In case of any error
@@ -378,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
378 if (!pat_enabled) { 375 if (!pat_enabled) {
379 /* This is identical to page table setting without PAT */ 376 /* This is identical to page table setting without PAT */
380 if (new_type) { 377 if (new_type) {
381 if (req_type == -1) 378 if (req_type == _PAGE_CACHE_WC)
382 *new_type = _PAGE_CACHE_WB;
383 else if (req_type == _PAGE_CACHE_WC)
384 *new_type = _PAGE_CACHE_UC_MINUS; 379 *new_type = _PAGE_CACHE_UC_MINUS;
385 else 380 else
386 *new_type = req_type & _PAGE_CACHE_MASK; 381 *new_type = req_type & _PAGE_CACHE_MASK;
@@ -709,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
709 if (!range_is_allowed(pfn, size)) 704 if (!range_is_allowed(pfn, size))
710 return 0; 705 return 0;
711 706
712 if (file->f_flags & O_SYNC) { 707 if (file->f_flags & O_DSYNC)
713 flags = _PAGE_CACHE_UC_MINUS; 708 flags = _PAGE_CACHE_UC_MINUS;
714 }
715 709
716#ifdef CONFIG_X86_32 710#ifdef CONFIG_X86_32
717 /* 711 /*
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 6f8aa33031c7..9324f13492d5 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -267,6 +267,8 @@ int __init get_memcfg_from_srat(void)
267 e820_register_active_regions(chunk->nid, chunk->start_pfn, 267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn)); 268 min(chunk->end_pfn, max_pfn));
269 } 269 }
 270 /* for out-of-order entries in SRAT */
271 sort_node_map();
270 272
271 for_each_online_node(nid) { 273 for_each_online_node(nid) {
272 unsigned long start = node_start_pfn[nid]; 274 unsigned long start = node_start_pfn[nid];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 34aa438d60b6..28c68762648f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
136 apicid_to_node[apic_id] = node; 136 apicid_to_node[apic_id] = node;
137 node_set(node, cpu_nodes_parsed); 137 node_set(node, cpu_nodes_parsed);
138 acpi_numa = 1; 138 acpi_numa = 1;
139 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 139 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
140 pxm, apic_id, node); 140 pxm, apic_id, node);
141} 141}
142 142
@@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
170 apicid_to_node[apic_id] = node; 170 apicid_to_node[apic_id] = node;
171 node_set(node, cpu_nodes_parsed); 171 node_set(node, cpu_nodes_parsed);
172 acpi_numa = 1; 172 acpi_numa = 1;
173 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 173 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
174 pxm, apic_id, node); 174 pxm, apic_id, node);
175} 175}
176 176
@@ -229,9 +229,11 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); 229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
230 } 230 }
231 231
232 if (changed) 232 if (changed) {
233 node_set(node, cpu_nodes_parsed);
233 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 234 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
234 nd->start, nd->end); 235 nd->start, nd->end);
236 }
235} 237}
236 238
237/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 239/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -317,7 +319,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
317 unsigned long s = nodes[i].start >> PAGE_SHIFT; 319 unsigned long s = nodes[i].start >> PAGE_SHIFT;
318 unsigned long e = nodes[i].end >> PAGE_SHIFT; 320 unsigned long e = nodes[i].end >> PAGE_SHIFT;
319 pxmram += e - s; 321 pxmram += e - s;
320 pxmram -= absent_pages_in_range(s, e); 322 pxmram -= __absent_pages_in_range(i, s, e);
321 if ((long)pxmram < 0) 323 if ((long)pxmram < 0)
322 pxmram = 0; 324 pxmram = 0;
323 } 325 }
@@ -373,6 +375,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
373 for_each_node_mask(i, nodes_parsed) 375 for_each_node_mask(i, nodes_parsed)
374 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 376 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
375 nodes[i].end >> PAGE_SHIFT); 377 nodes[i].end >> PAGE_SHIFT);
 378 /* for out-of-order entries in SRAT */
379 sort_node_map();
376 if (!nodes_cover_memory(nodes)) { 380 if (!nodes_cover_memory(nodes)) {
377 bad_srat(); 381 bad_srat();
378 return -1; 382 return -1;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df5..8565d944f7cf 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
1/* 1/*
2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> 2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
3 */ 3 */
4
5#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
6
4#include <linux/module.h> 7#include <linux/module.h>
5#include <linux/io.h> 8#include <linux/io.h>
6#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
7 10
8#define MODULE_NAME "testmmiotrace"
9
10static unsigned long mmio_address; 11static unsigned long mmio_address;
11module_param(mmio_address, ulong, 0); 12module_param(mmio_address, ulong, 0);
12MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " 13MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
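
The hunk above swaps the hand-rolled MODULE_NAME prefix for the kernel's pr_fmt() hook, which pr_info() and friends splice into the format string at compile time; the same conversion drives the pr_* changes at the top of this section. A minimal userspace sketch of the mechanism (KBUILD_MODNAME is normally supplied by kbuild and is hardcoded here):

#include <stdio.h>

#define KBUILD_MODNAME "testmmiotrace"          /* kbuild defines this */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_info("write test.\n");   /* prints "testmmiotrace: write test." */
	return 0;
}
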
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
30static void do_write_test(void __iomem *p) 31static void do_write_test(void __iomem *p)
31{ 32{
32 unsigned int i; 33 unsigned int i;
33 pr_info(MODULE_NAME ": write test.\n"); 34 pr_info("write test.\n");
34 mmiotrace_printk("Write test.\n"); 35 mmiotrace_printk("Write test.\n");
35 36
36 for (i = 0; i < 256; i++) 37 for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
47{ 48{
48 unsigned int i; 49 unsigned int i;
49 unsigned errs[3] = { 0 }; 50 unsigned errs[3] = { 0 };
50 pr_info(MODULE_NAME ": read test.\n"); 51 pr_info("read test.\n");
51 mmiotrace_printk("Read test.\n"); 52 mmiotrace_printk("Read test.\n");
52 53
53 for (i = 0; i < 256; i++) 54 for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
68 69
69static void do_read_far_test(void __iomem *p) 70static void do_read_far_test(void __iomem *p)
70{ 71{
71 pr_info(MODULE_NAME ": read far test.\n"); 72 pr_info("read far test.\n");
72 mmiotrace_printk("Read far test.\n"); 73 mmiotrace_printk("Read far test.\n");
73 74
74 ioread32(p + read_far); 75 ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
78{ 79{
79 void __iomem *p = ioremap_nocache(mmio_address, size); 80 void __iomem *p = ioremap_nocache(mmio_address, size);
80 if (!p) { 81 if (!p) {
81 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 82 pr_err("could not ioremap, aborting.\n");
82 return; 83 return;
83 } 84 }
84 mmiotrace_printk("ioremap returned %p.\n", p); 85 mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
94 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
95 96
96 if (mmio_address == 0) { 97 if (mmio_address == 0) {
97 pr_err(MODULE_NAME ": you have to use the module argument " 98 pr_err("you have to use the module argument mmio_address.\n");
98 "mmio_address.\n"); 99 pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
99 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
100 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
101 return -ENXIO; 100 return -ENXIO;
102 } 101 }
103 102
104 pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " 103 pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
105 "address space, and writing 16 kB of rubbish in there.\n", 104 "and writing 16 kB of rubbish in there.\n",
106 size >> 10, mmio_address); 105 size >> 10, mmio_address);
107 do_test(size); 106 do_test(size);
108 pr_info(MODULE_NAME ": All done.\n"); 107 pr_info("All done.\n");
109 return 0; 108 return 0;
110} 109}
111 110
112static void __exit cleanup(void) 111static void __exit cleanup(void)
113{ 112{
114 pr_debug(MODULE_NAME ": unloaded.\n"); 113 pr_debug("unloaded.\n");
115} 114}
116 115
117module_init(init); 116module_init(init);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 044897be021f..3855096c59b8 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -41,10 +41,11 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
41} 41}
42 42
43static struct stacktrace_ops backtrace_ops = { 43static struct stacktrace_ops backtrace_ops = {
44 .warning = backtrace_warning, 44 .warning = backtrace_warning,
45 .warning_symbol = backtrace_warning_symbol, 45 .warning_symbol = backtrace_warning_symbol,
46 .stack = backtrace_stack, 46 .stack = backtrace_stack,
47 .address = backtrace_address, 47 .address = backtrace_address,
48 .walk_stack = print_context_stack,
48}; 49};
49 50
50struct frame_head { 51struct frame_head {
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cb88b1a0bd5f..3347f696edc7 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -222,7 +222,7 @@ static void nmi_cpu_switch(void *dummy)
222 222
223 /* move to next set */ 223 /* move to next set */
224 si += model->num_counters; 224 si += model->num_counters;
225 if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) 225 if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
226 per_cpu(switch_index, cpu) = 0; 226 per_cpu(switch_index, cpu) = 0;
227 else 227 else
228 per_cpu(switch_index, cpu) = si; 228 per_cpu(switch_index, cpu) = si;
@@ -598,6 +598,7 @@ static int __init ppro_init(char **cpu_type)
598 case 15: case 23: 598 case 15: case 23:
599 *cpu_type = "i386/core_2"; 599 *cpu_type = "i386/core_2";
600 break; 600 break;
601 case 0x2e:
601 case 26: 602 case 26:
602 spec = &op_arch_perfmon_spec; 603 spec = &op_arch_perfmon_spec;
603 *cpu_type = "i386/core_i7"; 604 *cpu_type = "i386/core_i7";
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index d49202e740ea..39fba37f702f 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,3 +15,8 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
18obj-$(CONFIG_X86_64) += bus_numa.o
19
20ifeq ($(CONFIG_PCI_DEBUG),y)
21EXTRA_CFLAGS += -DDEBUG
22endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1014eb4bfc37..959e548a7039 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -7,6 +7,7 @@
7#include <asm/pci_x86.h> 7#include <asm/pci_x86.h>
8 8
9struct pci_root_info { 9struct pci_root_info {
10 struct acpi_device *bridge;
10 char *name; 11 char *name;
11 unsigned int res_num; 12 unsigned int res_num;
12 struct resource *res; 13 struct resource *res;
@@ -58,6 +59,30 @@ bus_has_transparent_bridge(struct pci_bus *bus)
58 return false; 59 return false;
59} 60}
60 61
62static void
63align_resource(struct acpi_device *bridge, struct resource *res)
64{
65 int align = (res->flags & IORESOURCE_MEM) ? 16 : 4;
66
67 /*
68 * Host bridge windows are not BARs, but the decoders on the PCI side
69 * that claim this address space have starting alignment and length
70 * constraints, so fix any obvious BIOS goofs.
71 */
72 if (!IS_ALIGNED(res->start, align)) {
73 dev_printk(KERN_DEBUG, &bridge->dev,
74 "host bridge window %pR invalid; "
75 "aligning start to %d-byte boundary\n", res, align);
76 res->start &= ~(align - 1);
77 }
78 if (!IS_ALIGNED(res->end + 1, align)) {
79 dev_printk(KERN_DEBUG, &bridge->dev,
80 "host bridge window %pR invalid; "
81 "aligning end to %d-byte boundary\n", res, align);
82 res->end = ALIGN(res->end, align) - 1;
83 }
84}
85
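
A standalone sketch of the align_resource() fixup above applied to a hypothetical mis-sized I/O window; the 4- and 16-byte granularities come from the align computation, and IS_ALIGNED/ALIGN are simplified stand-ins for the kernel macros:

#include <stdio.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)
#define ALIGN(x, a)      (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long start = 0x4001, end = 0x4ffe;  /* BIOS-reported window */
	int align = 4;                               /* I/O window granularity */

	if (!IS_ALIGNED(start, align))
		start &= ~(align - 1);               /* 0x4001 -> 0x4000 */
	if (!IS_ALIGNED(end + 1, align))
		end = ALIGN(end, align) - 1;         /* 0x4ffe -> 0x4fff */
	printf("fixed window [%#lx-%#lx]\n", start, end);
	return 0;
}
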
61static acpi_status 86static acpi_status
62setup_resource(struct acpi_resource *acpi_res, void *data) 87setup_resource(struct acpi_resource *acpi_res, void *data)
63{ 88{
@@ -91,11 +116,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
91 start = addr.minimum + addr.translation_offset; 116 start = addr.minimum + addr.translation_offset;
92 end = start + addr.address_length - 1; 117 end = start + addr.address_length - 1;
93 if (info->res_num >= max_root_bus_resources) { 118 if (info->res_num >= max_root_bus_resources) {
94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " 119 if (pci_probe & PCI_USE__CRS)
95 "from %s for %s due to _CRS returning more than " 120 printk(KERN_WARNING "PCI: Failed to allocate "
96 "%d resource descriptors\n", (unsigned long) start, 121 "0x%lx-0x%lx from %s for %s due to _CRS "
97 (unsigned long) end, root->name, info->name, 122 "returning more than %d resource descriptors\n",
98 max_root_bus_resources); 123 (unsigned long) start, (unsigned long) end,
124 root->name, info->name, max_root_bus_resources);
99 return AE_OK; 125 return AE_OK;
100 } 126 }
101 127
@@ -105,14 +131,28 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
105 res->start = start; 131 res->start = start;
106 res->end = end; 132 res->end = end;
107 res->child = NULL; 133 res->child = NULL;
134 align_resource(info->bridge, res);
135
136 if (!(pci_probe & PCI_USE__CRS)) {
137 dev_printk(KERN_DEBUG, &info->bridge->dev,
138 "host bridge window %pR (ignored)\n", res);
139 return AE_OK;
140 }
108 141
109 if (insert_resource(root, res)) { 142 if (insert_resource(root, res)) {
110 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 143 dev_err(&info->bridge->dev,
111 "from %s for %s\n", (unsigned long) res->start, 144 "can't allocate host bridge window %pR\n", res);
112 (unsigned long) res->end, root->name, info->name);
113 } else { 145 } else {
114 info->bus->resource[info->res_num] = res; 146 info->bus->resource[info->res_num] = res;
115 info->res_num++; 147 info->res_num++;
148 if (addr.translation_offset)
149 dev_info(&info->bridge->dev, "host bridge window %pR "
150 "(PCI address [%#llx-%#llx])\n",
151 res, res->start - addr.translation_offset,
152 res->end - addr.translation_offset);
153 else
154 dev_info(&info->bridge->dev,
155 "host bridge window %pR\n", res);
116 } 156 }
117 return AE_OK; 157 return AE_OK;
118} 158}
@@ -124,6 +164,12 @@ get_current_resources(struct acpi_device *device, int busnum,
124 struct pci_root_info info; 164 struct pci_root_info info;
125 size_t size; 165 size_t size;
126 166
167 if (!(pci_probe & PCI_USE__CRS))
168 dev_info(&device->dev,
169 "ignoring host bridge windows from ACPI; "
170 "boot with \"pci=use_crs\" to use them\n");
171
172 info.bridge = device;
127 info.bus = bus; 173 info.bus = bus;
128 info.res_num = 0; 174 info.res_num = 0;
129 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 175 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
@@ -163,8 +209,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
163#endif 209#endif
164 210
165 if (domain && !pci_domains_supported) { 211 if (domain && !pci_domains_supported) {
166 printk(KERN_WARNING "PCI: Multiple domains not supported " 212 printk(KERN_WARNING "pci_bus %04x:%02x: "
167 "(dom %d, bus %d)\n", domain, busnum); 213 "ignored (multiple domains not supported)\n",
214 domain, busnum);
168 return NULL; 215 return NULL;
169 } 216 }
170 217
@@ -188,7 +235,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
188 */ 235 */
189 sd = kzalloc(sizeof(*sd), GFP_KERNEL); 236 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
190 if (!sd) { 237 if (!sd) {
191 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); 238 printk(KERN_WARNING "pci_bus %04x:%02x: "
239 "ignored (out of memory)\n", domain, busnum);
192 return NULL; 240 return NULL;
193 } 241 }
194 242
@@ -209,9 +257,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
209 } else { 257 } else {
210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); 258 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) { 259 if (bus) {
212 if (pci_probe & PCI_USE__CRS) 260 get_current_resources(device, busnum, domain, bus);
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus); 261 bus->subordinate = pci_scan_child_bus(bus);
216 } 262 }
217 } 263 }
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 572ee9782f2a..95ecbd495955 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -6,10 +6,10 @@
6 6
7#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 8#include <asm/pci-direct.h>
9#include <asm/mpspec.h>
10#include <linux/cpumask.h>
11#endif 9#endif
12 10
11#include "bus_numa.h"
12
13/* 13/*
14 * This discovers the pcibus <-> node mapping on AMD K8. 14 * This discovers the pcibus <-> node mapping on AMD K8.
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
@@ -17,67 +17,6 @@
17 17
18#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
19 19
20/*
21 * sub bus (transparent) will use entres from 3 to store extra from root,
22 * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
23 */
24#define RES_NUM 16
25struct pci_root_info {
26 char name[12];
27 unsigned int res_num;
28 struct resource res[RES_NUM];
29 int bus_min;
30 int bus_max;
31 int node;
32 int link;
33};
34
35/* 4 at this time, it may become to 32 */
36#define PCI_ROOT_NR 4
37static int pci_root_num;
38static struct pci_root_info pci_root_info[PCI_ROOT_NR];
39
40void x86_pci_root_bus_res_quirks(struct pci_bus *b)
41{
42 int i;
43 int j;
44 struct pci_root_info *info;
45
46 /* don't go for it if _CRS is used already */
47 if (b->resource[0] != &ioport_resource ||
48 b->resource[1] != &iomem_resource)
49 return;
50
51 /* if only one root bus, don't need to anything */
52 if (pci_root_num < 2)
53 return;
54
55 for (i = 0; i < pci_root_num; i++) {
56 if (pci_root_info[i].bus_min == b->number)
57 break;
58 }
59
60 if (i == pci_root_num)
61 return;
62
63 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
64 b->number);
65
66 info = &pci_root_info[i];
67 for (j = 0; j < info->res_num; j++) {
68 struct resource *res;
69 struct resource *root;
70
71 res = &info->res[j];
72 b->resource[j] = res;
73 if (res->flags & IORESOURCE_IO)
74 root = &ioport_resource;
75 else
76 root = &iomem_resource;
77 insert_resource(root, res);
78 }
79}
80
81#define RANGE_NUM 16 20#define RANGE_NUM 16
82 21
83struct res_range { 22struct res_range {
@@ -130,52 +69,6 @@ static void __init update_range(struct res_range *range, size_t start,
130 } 69 }
131} 70}
132 71
133static void __init update_res(struct pci_root_info *info, size_t start,
134 size_t end, unsigned long flags, int merge)
135{
136 int i;
137 struct resource *res;
138
139 if (!merge)
140 goto addit;
141
142 /* try to merge it with old one */
143 for (i = 0; i < info->res_num; i++) {
144 size_t final_start, final_end;
145 size_t common_start, common_end;
146
147 res = &info->res[i];
148 if (res->flags != flags)
149 continue;
150
151 common_start = max((size_t)res->start, start);
152 common_end = min((size_t)res->end, end);
153 if (common_start > common_end + 1)
154 continue;
155
156 final_start = min((size_t)res->start, start);
157 final_end = max((size_t)res->end, end);
158
159 res->start = final_start;
160 res->end = final_end;
161 return;
162 }
163
164addit:
165
166 /* need to add that */
167 if (info->res_num >= RES_NUM)
168 return;
169
170 res = &info->res[info->res_num];
171 res->name = info->name;
172 res->flags = flags;
173 res->start = start;
174 res->end = end;
175 res->child = NULL;
176 info->res_num++;
177}
178
179struct pci_hostbridge_probe { 72struct pci_hostbridge_probe {
180 u32 bus; 73 u32 bus;
181 u32 slot; 74 u32 slot;
@@ -230,7 +123,6 @@ static int __init early_fill_mp_bus_info(void)
230 int j; 123 int j;
231 unsigned bus; 124 unsigned bus;
232 unsigned slot; 125 unsigned slot;
233 int found;
234 int node; 126 int node;
235 int link; 127 int link;
236 int def_node; 128 int def_node;
@@ -247,7 +139,7 @@ static int __init early_fill_mp_bus_info(void)
247 if (!early_pci_allowed()) 139 if (!early_pci_allowed())
248 return -1; 140 return -1;
249 141
250 found = 0; 142 found_all_numa_early = 0;
251 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 143 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
252 u32 id; 144 u32 id;
253 u16 device; 145 u16 device;
@@ -261,12 +153,12 @@ static int __init early_fill_mp_bus_info(void)
261 device = (id>>16) & 0xffff; 153 device = (id>>16) & 0xffff;
262 if (pci_probes[i].vendor == vendor && 154 if (pci_probes[i].vendor == vendor &&
263 pci_probes[i].device == device) { 155 pci_probes[i].device == device) {
264 found = 1; 156 found_all_numa_early = 1;
265 break; 157 break;
266 } 158 }
267 } 159 }
268 160
269 if (!found) 161 if (!found_all_numa_early)
270 return 0; 162 return 0;
271 163
272 pci_root_num = 0; 164 pci_root_num = 0;
@@ -488,7 +380,7 @@ static int __init early_fill_mp_bus_info(void)
488 info = &pci_root_info[i]; 380 info = &pci_root_info[i];
489 res_num = info->res_num; 381 res_num = info->res_num;
490 busnum = info->bus_min; 382 busnum = info->bus_min;
491 printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", 383 printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
492 info->bus_min, info->bus_max, info->node, info->link); 384 info->bus_min, info->bus_max, info->node, info->link);
493 for (j = 0; j < res_num; j++) { 385 for (j = 0; j < res_num; j++) {
494 res = &info->res[j]; 386 res = &info->res[j];
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
new file mode 100644
index 000000000000..f939d603adfa
--- /dev/null
+++ b/arch/x86/pci/bus_numa.c
@@ -0,0 +1,101 @@
1#include <linux/init.h>
2#include <linux/pci.h>
3
4#include "bus_numa.h"
5
6int pci_root_num;
7struct pci_root_info pci_root_info[PCI_ROOT_NR];
8int found_all_numa_early;
9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b)
11{
12 int i;
13 int j;
14 struct pci_root_info *info;
15
16 /* don't go for it if _CRS is used already */
17 if (b->resource[0] != &ioport_resource ||
18 b->resource[1] != &iomem_resource)
19 return;
20
21 if (!pci_root_num)
22 return;
23
 24 /* for AMD, if only one root bus, don't need to do anything */
25 if (pci_root_num < 2 && found_all_numa_early)
26 return;
27
28 for (i = 0; i < pci_root_num; i++) {
29 if (pci_root_info[i].bus_min == b->number)
30 break;
31 }
32
33 if (i == pci_root_num)
34 return;
35
36 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
37 b->number);
38
39 info = &pci_root_info[i];
40 for (j = 0; j < info->res_num; j++) {
41 struct resource *res;
42 struct resource *root;
43
44 res = &info->res[j];
45 b->resource[j] = res;
46 if (res->flags & IORESOURCE_IO)
47 root = &ioport_resource;
48 else
49 root = &iomem_resource;
50 insert_resource(root, res);
51 }
52}
53
54void __devinit update_res(struct pci_root_info *info, size_t start,
55 size_t end, unsigned long flags, int merge)
56{
57 int i;
58 struct resource *res;
59
60 if (start > end)
61 return;
62
63 if (!merge)
64 goto addit;
65
66 /* try to merge it with old one */
67 for (i = 0; i < info->res_num; i++) {
68 size_t final_start, final_end;
69 size_t common_start, common_end;
70
71 res = &info->res[i];
72 if (res->flags != flags)
73 continue;
74
75 common_start = max((size_t)res->start, start);
76 common_end = min((size_t)res->end, end);
77 if (common_start > common_end + 1)
78 continue;
79
80 final_start = min((size_t)res->start, start);
81 final_end = max((size_t)res->end, end);
82
83 res->start = final_start;
84 res->end = final_end;
85 return;
86 }
87
88addit:
89
90 /* need to add that */
91 if (info->res_num >= RES_NUM)
92 return;
93
94 res = &info->res[info->res_num];
95 res->name = info->name;
96 res->flags = flags;
97 res->start = start;
98 res->end = end;
99 res->child = NULL;
100 info->res_num++;
101}
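
A standalone sketch of the merge test in update_res() above: because the guard is common_start > common_end + 1, ranges that overlap or merely touch are coalesced, while truly disjoint ranges fall through to addit (values here hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned long res_start = 0x1000, res_end = 0x1fff;  /* existing */
	unsigned long start = 0x2000, end = 0x2fff;          /* incoming */
	unsigned long common_start = start > res_start ? start : res_start;
	unsigned long common_end = end < res_end ? end : res_end;

	if (!(common_start > common_end + 1)) {              /* touching */
		res_start = res_start < start ? res_start : start;
		res_end = res_end > end ? res_end : end;
	}
	printf("merged [%#lx-%#lx]\n", res_start, res_end);  /* 0x1000-0x2fff */
	return 0;
}
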
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
new file mode 100644
index 000000000000..adbc23fe82ac
--- /dev/null
+++ b/arch/x86/pci/bus_numa.h
@@ -0,0 +1,27 @@
1#ifdef CONFIG_X86_64
2
3/*
4 * sub bus (transparent) will use entres from 3 to store extra from
5 * root, so need to make sure we have enough slot there, Should we
6 * increase PCI_BUS_NUM_RESOURCES?
7 */
8#define RES_NUM 16
9struct pci_root_info {
10 char name[12];
11 unsigned int res_num;
12 struct resource res[RES_NUM];
13 int bus_min;
14 int bus_max;
15 int node;
16 int link;
17};
18
19/* 4 at this time, it may grow to 32 */
20#define PCI_ROOT_NR 4
21extern int pci_root_num;
22extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
23extern int found_all_numa_early;
24
25extern void update_res(struct pci_root_info *info, size_t start,
26 size_t end, unsigned long flags, int merge);
27#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1331fcf26143..d2552c68e94d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -410,8 +410,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
410 return bus; 410 return bus;
411} 411}
412 412
413extern u8 pci_cache_line_size;
414
415int __init pcibios_init(void) 413int __init pcibios_init(void)
416{ 414{
417 struct cpuinfo_x86 *c = &boot_cpu_data; 415 struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -422,15 +420,19 @@ int __init pcibios_init(void)
422 } 420 }
423 421
424 /* 422 /*
425 * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 423 * Set PCI cacheline size to that of the CPU if the CPU has reported it.
426 * and P4. It's also good for 386/486s (which actually have 16) 424 * (For older CPUs that don't support cpuid, we set it to 32 bytes.)
425 * It's also good for 386/486s (which actually have 16)
427 * as quite a few PCI devices do not support smaller values. 426 * as quite a few PCI devices do not support smaller values.
428 */ 427 */
429 pci_cache_line_size = 32 >> 2; 428 if (c->x86_clflush_size > 0) {
430 if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) 429 pci_dfl_cache_line_size = c->x86_clflush_size >> 2;
431 pci_cache_line_size = 64 >> 2; /* K7 & K8 */ 430 printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
432 else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) 431 pci_dfl_cache_line_size << 2);
433 pci_cache_line_size = 128 >> 2; /* P4 */ 432 } else {
433 pci_dfl_cache_line_size = 32 >> 2;
434 printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
435 }
434 436
435 pcibios_resource_survey(); 437 pcibios_resource_survey();
436 438
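
The register behind pci_dfl_cache_line_size counts 32-bit dwords, hence the >> 2 above; e.g. a 64-byte CLFLUSH line reported by CPUID becomes 16 units. A sketch of the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned int x86_clflush_size = 64;        /* bytes, from CPUID */
	unsigned int dfl = x86_clflush_size >> 2;  /* 16 dword units */

	printf("PCI: pci_cache_line_size set to %d bytes\n", dfl << 2);
	return 0;
}
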
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index aaf26ae58cd5..d1067d539bee 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
12 u32 v; 12 u32 v;
13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
14 v = inl(0xcfc); 14 v = inl(0xcfc);
15 if (v != 0xffffffff)
16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
17 return v; 15 return v;
18} 16}
19 17
@@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
22 u8 v; 20 u8 v;
23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 21 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
24 v = inb(0xcfc + (offset&3)); 22 v = inb(0xcfc + (offset&3));
25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
26 return v; 23 return v;
27} 24}
28 25
@@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
31 u16 v; 28 u16 v;
32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 29 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
33 v = inw(0xcfc + (offset&2)); 30 v = inw(0xcfc + (offset&2));
34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
35 return v; 31 return v;
36} 32}
37 33
38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 34void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
39 u32 val) 35 u32 val)
40{ 36{
41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 37 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
43 outl(val, 0xcfc); 38 outl(val, 0xcfc);
44} 39}
45 40
46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 41void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
47{ 42{
48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 43 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
50 outb(val, 0xcfc + (offset&3)); 44 outb(val, 0xcfc + (offset&3));
51} 45}
52 46
53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) 47void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
54{ 48{
55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
57 outw(val, 0xcfc + (offset&2)); 50 outw(val, 0xcfc + (offset&2));
58} 51}
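
All of the helpers above build the same configuration mechanism #1 address: enable bit 31, bus in bits 23:16, device in 15:11, function in 10:8, and register offset in 7:0, written to port 0xcf8 with data transferred through 0xcfc. A standalone sketch of the encoding:

#include <stdio.h>

static unsigned int conf1_addr(unsigned char bus, unsigned char slot,
			       unsigned char func, unsigned char offset)
{
	return 0x80000000u | (bus << 16) | (slot << 11) | (func << 8) | offset;
}

int main(void)
{
	/* BAR0 (offset 0x10) of device 2, function 0 on bus 0 */
	printf("%#x\n", conf1_addr(0, 2, 0, 0x10));   /* 0x80001010 */
	return 0;
}
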
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b22d13b0c71d..5dc9e8c63fcd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -129,7 +129,9 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
129 continue; 129 continue;
130 if (!r->start || 130 if (!r->start ||
131 pci_claim_resource(dev, idx) < 0) { 131 pci_claim_resource(dev, idx) < 0) {
132 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 132 dev_info(&dev->dev,
133 "can't reserve window %pR\n",
134 r);
133 /* 135 /*
134 * Something is wrong with the region. 136 * Something is wrong with the region.
135 * Invalidate the resource to prevent 137 * Invalidate the resource to prevent
@@ -144,16 +146,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
144 } 146 }
145} 147}
146 148
149struct pci_check_idx_range {
150 int start;
151 int end;
152};
153
147static void __init pcibios_allocate_resources(int pass) 154static void __init pcibios_allocate_resources(int pass)
148{ 155{
149 struct pci_dev *dev = NULL; 156 struct pci_dev *dev = NULL;
150 int idx, disabled; 157 int idx, disabled, i;
151 u16 command; 158 u16 command;
152 struct resource *r; 159 struct resource *r;
153 160
161 struct pci_check_idx_range idx_range[] = {
162 { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END },
163#ifdef CONFIG_PCI_IOV
164 { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END },
165#endif
166 };
167
154 for_each_pci_dev(dev) { 168 for_each_pci_dev(dev) {
155 pci_read_config_word(dev, PCI_COMMAND, &command); 169 pci_read_config_word(dev, PCI_COMMAND, &command);
156 for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { 170 for (i = 0; i < ARRAY_SIZE(idx_range); i++)
171 for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) {
157 r = &dev->resource[idx]; 172 r = &dev->resource[idx];
158 if (r->parent) /* Already allocated */ 173 if (r->parent) /* Already allocated */
159 continue; 174 continue;
@@ -164,12 +179,12 @@ static void __init pcibios_allocate_resources(int pass)
164 else 179 else
165 disabled = !(command & PCI_COMMAND_MEMORY); 180 disabled = !(command & PCI_COMMAND_MEMORY);
166 if (pass == disabled) { 181 if (pass == disabled) {
167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", 182 dev_dbg(&dev->dev,
168 (unsigned long long) r->start, 183 "BAR %d: reserving %pr (d=%d, p=%d)\n",
169 (unsigned long long) r->end, 184 idx, r, disabled, pass);
170 r->flags, disabled, pass);
171 if (pci_claim_resource(dev, idx) < 0) { 185 if (pci_claim_resource(dev, idx) < 0) {
172 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 186 dev_info(&dev->dev,
187 "can't reserve %pR\n", r);
173 /* We'll assign a new address later */ 188 /* We'll assign a new address later */
174 r->end -= r->start; 189 r->end -= r->start;
175 r->start = 0; 190 r->start = 0;
@@ -182,7 +197,7 @@ static void __init pcibios_allocate_resources(int pass)
182 /* Turn the ROM off, leave the resource region, 197 /* Turn the ROM off, leave the resource region,
183 * but keep it unregistered. */ 198 * but keep it unregistered. */
184 u32 reg; 199 u32 reg;
185 dev_dbg(&dev->dev, "disabling ROM\n"); 200 dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
186 r->flags &= ~IORESOURCE_ROM_ENABLE; 201 r->flags &= ~IORESOURCE_ROM_ENABLE;
187 pci_read_config_dword(dev, 202 pci_read_config_dword(dev,
188 dev->rom_base_reg, &reg); 203 dev->rom_base_reg, &reg);
@@ -282,6 +297,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
282 return -EINVAL; 297 return -EINVAL;
283 298
284 prot = pgprot_val(vma->vm_page_prot); 299 prot = pgprot_val(vma->vm_page_prot);
300
301 /*
302 * Return error if pat is not enabled and write_combine is requested.
303 * Caller can followup with UC MINUS request and add a WC mtrr if there
304 * is a free mtrr slot.
305 */
306 if (!pat_enabled && write_combine)
307 return -EINVAL;
308
285 if (pat_enabled && write_combine) 309 if (pat_enabled && write_combine)
286 prot |= _PAGE_CACHE_WC; 310 prot |= _PAGE_CACHE_WC;
287 else if (pat_enabled || boot_cpu_data.x86 > 3) 311 else if (pat_enabled || boot_cpu_data.x86 > 3)
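
A standalone sketch of the new mapping-type decision in pci_mmap_page_range(): write-combining is honored only with PAT, and the WC-without-PAT case now fails fast so a caller can retry with a UC- request (adding a WC MTRR if a slot is free), as the comment suggests. The helper name here is hypothetical:

#include <stdio.h>

static const char *mmap_cache_type(int pat_enabled, int write_combine)
{
	if (!pat_enabled && write_combine)
		return NULL;                       /* -EINVAL in the kernel */
	if (pat_enabled && write_combine)
		return "WC";
	return "UC-";
}

int main(void)
{
	const char *t = mmap_cache_type(0, 1);

	printf("%s\n", t ? t : "error: retry without write_combine");
	return 0;
}
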
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 602c172d3bd5..b19d1e54201e 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,48 +15,98 @@
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/sfi_acpi.h> 16#include <linux/sfi_acpi.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/sort.h> 18#include <linux/dmi.h>
19#include <asm/e820.h> 19#include <asm/e820.h>
20#include <asm/pci_x86.h> 20#include <asm/pci_x86.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22 22
23#define PREFIX "PCI: " 23#define PREFIX "PCI: "
24 24
25/* aperture is up to 256MB but BIOS may reserve less */
26#define MMCONFIG_APER_MIN (2 * 1024*1024)
27#define MMCONFIG_APER_MAX (256 * 1024*1024)
28
29/* Indicate if the mmcfg resources have been placed into the resource table. */ 25/* Indicate if the mmcfg resources have been placed into the resource table. */
30static int __initdata pci_mmcfg_resources_inserted; 26static int __initdata pci_mmcfg_resources_inserted;
31 27
32static __init int extend_mmcfg(int num) 28LIST_HEAD(pci_mmcfg_list);
29
30static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
33{ 31{
34 struct acpi_mcfg_allocation *new; 32 if (cfg->res.parent)
35 int new_num = pci_mmcfg_config_num + num; 33 release_resource(&cfg->res);
34 list_del(&cfg->list);
35 kfree(cfg);
36}
36 37
37 new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); 38static __init void free_all_mmcfg(void)
38 if (!new) 39{
39 return -1; 40 struct pci_mmcfg_region *cfg, *tmp;
40 41
41 if (pci_mmcfg_config) { 42 pci_mmcfg_arch_free();
42 memcpy(new, pci_mmcfg_config, 43 list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list)
43 sizeof(pci_mmcfg_config[0]) * new_num); 44 pci_mmconfig_remove(cfg);
44 kfree(pci_mmcfg_config); 45}
46
47static __init void list_add_sorted(struct pci_mmcfg_region *new)
48{
49 struct pci_mmcfg_region *cfg;
50
51 /* keep list sorted by segment and starting bus number */
52 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
53 if (cfg->segment > new->segment ||
54 (cfg->segment == new->segment &&
55 cfg->start_bus >= new->start_bus)) {
56 list_add_tail(&new->list, &cfg->list);
57 return;
58 }
45 } 59 }
46 pci_mmcfg_config = new; 60 list_add_tail(&new->list, &pci_mmcfg_list);
61}
47 62
48 return 0; 63static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
64 int end, u64 addr)
65{
66 struct pci_mmcfg_region *new;
67 int num_buses;
68 struct resource *res;
69
70 if (addr == 0)
71 return NULL;
72
73 new = kzalloc(sizeof(*new), GFP_KERNEL);
74 if (!new)
75 return NULL;
76
77 new->address = addr;
78 new->segment = segment;
79 new->start_bus = start;
80 new->end_bus = end;
81
82 list_add_sorted(new);
83
84 num_buses = end - start + 1;
85 res = &new->res;
86 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
87 res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
88 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
89 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
90 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
91 res->name = new->name;
92
93 printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at "
94 "%pR (base %#lx)\n", segment, start, end, &new->res,
95 (unsigned long) addr);
96
97 return new;
49} 98}
50 99
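
Each bus decodes 1 MB of extended configuration space (32 devices x 8 functions x 4 KB), so PCI_MMCFG_BUS_OFFSET(bus) reduces to bus << 20 and the resource span computed in pci_mmconfig_add() follows directly. A sketch with a hypothetical base address:

#include <stdio.h>

#define PCI_MMCFG_BUS_OFFSET(bus) ((unsigned long long)(bus) << 20)

int main(void)
{
	unsigned long long addr = 0xe0000000ULL;     /* hypothetical base */
	int start = 0, end = 255;
	int num_buses = end - start + 1;

	/* [0xe0000000-0xefffffff]: the full 256 MB aperture */
	printf("[%#llx-%#llx]\n",
	       addr + PCI_MMCFG_BUS_OFFSET(start),
	       addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1);
	return 0;
}
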
51static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) 100struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
52{ 101{
53 int i = pci_mmcfg_config_num; 102 struct pci_mmcfg_region *cfg;
54 103
55 pci_mmcfg_config_num++; 104 list_for_each_entry(cfg, &pci_mmcfg_list, list)
56 pci_mmcfg_config[i].address = addr; 105 if (cfg->segment == segment &&
57 pci_mmcfg_config[i].pci_segment = segment; 106 cfg->start_bus <= bus && bus <= cfg->end_bus)
58 pci_mmcfg_config[i].start_bus_number = start; 107 return cfg;
59 pci_mmcfg_config[i].end_bus_number = end; 108
109 return NULL;
60} 110}
61 111
62static const char __init *pci_mmcfg_e7520(void) 112static const char __init *pci_mmcfg_e7520(void)
@@ -68,11 +118,9 @@ static const char __init *pci_mmcfg_e7520(void)
68 if (win == 0x0000 || win == 0xf000) 118 if (win == 0x0000 || win == 0xf000)
69 return NULL; 119 return NULL;
70 120
71 if (extend_mmcfg(1) == -1) 121 if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL)
72 return NULL; 122 return NULL;
73 123
74 fill_one_mmcfg(win << 16, 0, 0, 255);
75
76 return "Intel Corporation E7520 Memory Controller Hub"; 124 return "Intel Corporation E7520 Memory Controller Hub";
77} 125}
78 126
@@ -114,11 +162,9 @@ static const char __init *pci_mmcfg_intel_945(void)
114 if ((pciexbar & mask) >= 0xf0000000U) 162 if ((pciexbar & mask) >= 0xf0000000U)
115 return NULL; 163 return NULL;
116 164
117 if (extend_mmcfg(1) == -1) 165 if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL)
118 return NULL; 166 return NULL;
119 167
120 fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
121
122 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; 168 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
123} 169}
124 170
@@ -127,7 +173,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
127 u32 low, high, address; 173 u32 low, high, address;
128 u64 base, msr; 174 u64 base, msr;
129 int i; 175 int i;
130 unsigned segnbits = 0, busnbits; 176 unsigned segnbits = 0, busnbits, end_bus;
131 177
132 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) 178 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
133 return NULL; 179 return NULL;
@@ -161,11 +207,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
161 busnbits = 8; 207 busnbits = 8;
162 } 208 }
163 209
164 if (extend_mmcfg(1 << segnbits) == -1) 210 end_bus = (1 << busnbits) - 1;
165 return NULL;
166
167 for (i = 0; i < (1 << segnbits); i++) 211 for (i = 0; i < (1 << segnbits); i++)
168 fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); 212 if (pci_mmconfig_add(i, 0, end_bus,
213 base + (1<<28) * i) == NULL) {
214 free_all_mmcfg();
215 return NULL;
216 }
169 217
170 return "AMD Family 10h NB"; 218 return "AMD Family 10h NB";
171} 219}
@@ -190,7 +238,7 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
190 /* 238 /*
191 * do check if amd fam10h already took over 239 * do check if amd fam10h already took over
192 */ 240 */
193 if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) 241 if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked)
194 return NULL; 242 return NULL;
195 243
196 mcp55_checked = true; 244 mcp55_checked = true;
@@ -213,16 +261,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
213 if (!(extcfg & extcfg_enable_mask)) 261 if (!(extcfg & extcfg_enable_mask))
214 continue; 262 continue;
215 263
216 if (extend_mmcfg(1) == -1)
217 continue;
218
219 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; 264 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
220 base = extcfg & extcfg_base_mask[size_index]; 265 base = extcfg & extcfg_base_mask[size_index];
221 /* base could > 4G */ 266 /* base could > 4G */
222 base <<= extcfg_base_lshift; 267 base <<= extcfg_base_lshift;
223 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; 268 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
224 end = start + extcfg_sizebus[size_index] - 1; 269 end = start + extcfg_sizebus[size_index] - 1;
225 fill_one_mmcfg(base, 0, start, end); 270 if (pci_mmconfig_add(0, start, end, base) == NULL)
271 continue;
226 mcp55_mmconf_found++; 272 mcp55_mmconf_found++;
227 } 273 }
228 274
@@ -253,45 +299,27 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
253 0x0369, pci_mmcfg_nvidia_mcp55 }, 299 0x0369, pci_mmcfg_nvidia_mcp55 },
254}; 300};
255 301
256static int __init cmp_mmcfg(const void *x1, const void *x2)
257{
258 const typeof(pci_mmcfg_config[0]) *m1 = x1;
259 const typeof(pci_mmcfg_config[0]) *m2 = x2;
260 int start1, start2;
261
262 start1 = m1->start_bus_number;
263 start2 = m2->start_bus_number;
264
265 return start1 - start2;
266}
267
268static void __init pci_mmcfg_check_end_bus_number(void) 302static void __init pci_mmcfg_check_end_bus_number(void)
269{ 303{
270 int i; 304 struct pci_mmcfg_region *cfg, *cfgx;
271 typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
272
273 /* sort them at first */
274 sort(pci_mmcfg_config, pci_mmcfg_config_num,
275 sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
276 305
277 /* last one*/ 306 /* last one*/
278 if (pci_mmcfg_config_num > 0) { 307 cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list);
279 i = pci_mmcfg_config_num - 1; 308 if (cfg)
280 cfg = &pci_mmcfg_config[i]; 309 if (cfg->end_bus < cfg->start_bus)
281 if (cfg->end_bus_number < cfg->start_bus_number) 310 cfg->end_bus = 255;
282 cfg->end_bus_number = 255;
283 }
284 311
285 /* don't overlap please */ 312 if (list_is_singular(&pci_mmcfg_list))
286 for (i = 0; i < pci_mmcfg_config_num - 1; i++) { 313 return;
287 cfg = &pci_mmcfg_config[i];
288 cfgx = &pci_mmcfg_config[i+1];
289 314
290 if (cfg->end_bus_number < cfg->start_bus_number) 315 /* don't overlap please */
291 cfg->end_bus_number = 255; 316 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
317 if (cfg->end_bus < cfg->start_bus)
318 cfg->end_bus = 255;
292 319
293 if (cfg->end_bus_number >= cfgx->start_bus_number) 320 cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
294 cfg->end_bus_number = cfgx->start_bus_number - 1; 321 if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus)
322 cfg->end_bus = cfgx->start_bus - 1;
295 } 323 }
296} 324}
297 325
@@ -306,8 +334,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
306 if (!raw_pci_ops) 334 if (!raw_pci_ops)
307 return 0; 335 return 0;
308 336
309 pci_mmcfg_config_num = 0; 337 free_all_mmcfg();
310 pci_mmcfg_config = NULL;
311 338
312 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { 339 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
313 bus = pci_mmcfg_probes[i].bus; 340 bus = pci_mmcfg_probes[i].bus;
@@ -322,45 +349,22 @@ static int __init pci_mmcfg_check_hostbridge(void)
322 name = pci_mmcfg_probes[i].probe(); 349 name = pci_mmcfg_probes[i].probe();
323 350
324 if (name) 351 if (name)
325 printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", 352 printk(KERN_INFO PREFIX "%s with MMCONFIG support\n",
326 name); 353 name);
327 } 354 }
328 355
329 /* some end_bus_number is crazy, fix it */ 356 /* some end_bus_number is crazy, fix it */
330 pci_mmcfg_check_end_bus_number(); 357 pci_mmcfg_check_end_bus_number();
331 358
332 return pci_mmcfg_config_num != 0; 359 return !list_empty(&pci_mmcfg_list);
333} 360}
334 361
335static void __init pci_mmcfg_insert_resources(void) 362static void __init pci_mmcfg_insert_resources(void)
336{ 363{
337#define PCI_MMCFG_RESOURCE_NAME_LEN 24 364 struct pci_mmcfg_region *cfg;
338 int i;
339 struct resource *res;
340 char *names;
341 unsigned num_buses;
342
343 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
344 pci_mmcfg_config_num, GFP_KERNEL);
345 if (!res) {
346 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
347 return;
348 }
349 365
350 names = (void *)&res[pci_mmcfg_config_num]; 366 list_for_each_entry(cfg, &pci_mmcfg_list, list)
351 for (i = 0; i < pci_mmcfg_config_num; i++, res++) { 367 insert_resource(&iomem_resource, &cfg->res);
352 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
353 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
354 res->name = names;
355 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
356 "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
357 cfg->start_bus_number, cfg->end_bus_number);
358 res->start = cfg->address + (cfg->start_bus_number << 20);
359 res->end = res->start + (num_buses << 20) - 1;
360 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
361 insert_resource(&iomem_resource, res);
362 names += PCI_MMCFG_RESOURCE_NAME_LEN;
363 }
364 368
365 /* Mark that the resources have been inserted. */ 369 /* Mark that the resources have been inserted. */
366 pci_mmcfg_resources_inserted = 1; 370 pci_mmcfg_resources_inserted = 1;
@@ -437,11 +441,12 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
437typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); 441typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
438 442
439static int __init is_mmconf_reserved(check_reserved_t is_reserved, 443static int __init is_mmconf_reserved(check_reserved_t is_reserved,
440 u64 addr, u64 size, int i, 444 struct pci_mmcfg_region *cfg, int with_e820)
441 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
442{ 445{
446 u64 addr = cfg->res.start;
447 u64 size = resource_size(&cfg->res);
443 u64 old_size = size; 448 u64 old_size = size;
444 int valid = 0; 449 int valid = 0, num_buses;
445 450
446 while (!is_reserved(addr, addr + size, E820_RESERVED)) { 451 while (!is_reserved(addr, addr + size, E820_RESERVED)) {
447 size >>= 1; 452 size >>= 1;
@@ -450,19 +455,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
450 } 455 }
451 456
452 if (size >= (16UL<<20) || size == old_size) { 457 if (size >= (16UL<<20) || size == old_size) {
453 printk(KERN_NOTICE 458 printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n",
454 "PCI: MCFG area at %Lx reserved in %s\n", 459 &cfg->res,
455 addr, with_e820?"E820":"ACPI motherboard resources"); 460 with_e820 ? "E820" : "ACPI motherboard resources");
456 valid = 1; 461 valid = 1;
457 462
458 if (old_size != size) { 463 if (old_size != size) {
459 /* update end_bus_number */ 464 /* update end_bus */
460 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); 465 cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
461 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " 466 num_buses = cfg->end_bus - cfg->start_bus + 1;
462 "segment %hu buses %u - %u\n", 467 cfg->res.end = cfg->res.start +
463 i, (unsigned long)cfg->address, cfg->pci_segment, 468 PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
464 (unsigned int)cfg->start_bus_number, 469 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
465 (unsigned int)cfg->end_bus_number); 470 "PCI MMCONFIG %04x [bus %02x-%02x]",
471 cfg->segment, cfg->start_bus, cfg->end_bus);
472 printk(KERN_INFO PREFIX
473 "MMCONFIG for %04x [bus%02x-%02x] "
474 "at %pR (base %#lx) (size reduced!)\n",
475 cfg->segment, cfg->start_bus, cfg->end_bus,
476 &cfg->res, (unsigned long) cfg->address);
466 } 477 }
467 } 478 }
468 479
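
A simplified sketch of the probe loop above: the claimed size is halved until the reservation check passes, and end_bus is then recomputed from whatever survived (the real code additionally accepts only a result of at least 16 MB or the full original size):

#include <stdio.h>

int main(void)
{
	unsigned long long size = 256ULL << 20;      /* claimed aperture */
	unsigned long long reserved = 128ULL << 20;  /* actually reserved */
	int start_bus = 0, end_bus;

	while (size > reserved)                      /* stand-in for is_reserved() */
		size >>= 1;
	end_bus = start_bus + (int)((size >> 20) - 1);
	printf("buses %02x-%02x\n", start_bus, end_bus); /* 00-7f */
	return 0;
}
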
@@ -471,45 +482,26 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
471 482
472static void __init pci_mmcfg_reject_broken(int early) 483static void __init pci_mmcfg_reject_broken(int early)
473{ 484{
474 typeof(pci_mmcfg_config[0]) *cfg; 485 struct pci_mmcfg_region *cfg;
475 int i;
476 486
477 if ((pci_mmcfg_config_num == 0) || 487 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
478 (pci_mmcfg_config == NULL) ||
479 (pci_mmcfg_config[0].address == 0))
480 return;
481
482 for (i = 0; i < pci_mmcfg_config_num; i++) {
483 int valid = 0; 488 int valid = 0;
484 u64 addr, size;
485
486 cfg = &pci_mmcfg_config[i];
487 addr = cfg->start_bus_number;
488 addr <<= 20;
489 addr += cfg->address;
490 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
491 size <<= 20;
492 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
493 "segment %hu buses %u - %u\n",
494 i, (unsigned long)cfg->address, cfg->pci_segment,
495 (unsigned int)cfg->start_bus_number,
496 (unsigned int)cfg->end_bus_number);
497 489
498 if (!early && !acpi_disabled) 490 if (!early && !acpi_disabled)
499 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); 491 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
500 492
501 if (valid) 493 if (valid)
502 continue; 494 continue;
503 495
504 if (!early) 496 if (!early)
505 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 497 printk(KERN_ERR FW_BUG PREFIX
506 " reserved in ACPI motherboard resources\n", 498 "MMCONFIG at %pR not reserved in "
507 cfg->address); 499 "ACPI motherboard resources\n", &cfg->res);
508 500
509 /* Don't try to do this check unless configuration 501 /* Don't try to do this check unless configuration
510 type 1 is available. how about type 2 ?*/ 502 type 1 is available. how about type 2 ?*/
511 if (raw_pci_ops) 503 if (raw_pci_ops)
512 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); 504 valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
513 505
514 if (!valid) 506 if (!valid)
515 goto reject; 507 goto reject;
@@ -518,34 +510,41 @@ static void __init pci_mmcfg_reject_broken(int early)
518 return; 510 return;
519 511
520reject: 512reject:
521 printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); 513 printk(KERN_INFO PREFIX "not using MMCONFIG\n");
522 pci_mmcfg_arch_free(); 514 free_all_mmcfg();
523 kfree(pci_mmcfg_config);
524 pci_mmcfg_config = NULL;
525 pci_mmcfg_config_num = 0;
526} 515}
527 516
528static int __initdata known_bridge; 517static int __initdata known_bridge;
529 518
530static int acpi_mcfg_64bit_base_addr __initdata = FALSE; 519static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
520 struct acpi_mcfg_allocation *cfg)
521{
522 int year;
531 523
532/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 524 if (cfg->address < 0xFFFFFFFF)
533struct acpi_mcfg_allocation *pci_mmcfg_config; 525 return 0;
534int pci_mmcfg_config_num;
535 526
536static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
537{
538 if (!strcmp(mcfg->header.oem_id, "SGI")) 527 if (!strcmp(mcfg->header.oem_id, "SGI"))
539 acpi_mcfg_64bit_base_addr = TRUE; 528 return 0;
540 529
541 return 0; 530 if (mcfg->header.revision >= 1) {
531 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
532 year >= 2010)
533 return 0;
534 }
535
536 printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
537 "is above 4GB, ignored\n", cfg->pci_segment,
538 cfg->start_bus_number, cfg->end_bus_number, cfg->address);
539 return -EINVAL;
542} 540}
543 541
544static int __init pci_parse_mcfg(struct acpi_table_header *header) 542static int __init pci_parse_mcfg(struct acpi_table_header *header)
545{ 543{
546 struct acpi_table_mcfg *mcfg; 544 struct acpi_table_mcfg *mcfg;
545 struct acpi_mcfg_allocation *cfg_table, *cfg;
547 unsigned long i; 546 unsigned long i;
548 int config_size; 547 int entries;
549 548
550 if (!header) 549 if (!header)
551 return -EINVAL; 550 return -EINVAL;
@@ -553,38 +552,33 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
553 mcfg = (struct acpi_table_mcfg *)header; 552 mcfg = (struct acpi_table_mcfg *)header;
554 553
555 /* how many config structures do we have */ 554 /* how many config structures do we have */
556 pci_mmcfg_config_num = 0; 555 free_all_mmcfg();
556 entries = 0;
557 i = header->length - sizeof(struct acpi_table_mcfg); 557 i = header->length - sizeof(struct acpi_table_mcfg);
558 while (i >= sizeof(struct acpi_mcfg_allocation)) { 558 while (i >= sizeof(struct acpi_mcfg_allocation)) {
559 ++pci_mmcfg_config_num; 559 entries++;
560 i -= sizeof(struct acpi_mcfg_allocation); 560 i -= sizeof(struct acpi_mcfg_allocation);
561 }; 561 };
562 if (pci_mmcfg_config_num == 0) { 562 if (entries == 0) {
563 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); 563 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
564 return -ENODEV; 564 return -ENODEV;
565 } 565 }
566 566
567 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); 567 cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
568 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); 568 for (i = 0; i < entries; i++) {
569 if (!pci_mmcfg_config) { 569 cfg = &cfg_table[i];
570 printk(KERN_WARNING PREFIX 570 if (acpi_mcfg_check_entry(mcfg, cfg)) {
571 "No memory for MCFG config tables\n"); 571 free_all_mmcfg();
572 return -ENOMEM;
573 }
574
575 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
576
577 acpi_mcfg_oem_check(mcfg);
578
579 for (i = 0; i < pci_mmcfg_config_num; ++i) {
580 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
581 !acpi_mcfg_64bit_base_addr) {
582 printk(KERN_ERR PREFIX
583 "MMCONFIG not in low 4GB of memory\n");
584 kfree(pci_mmcfg_config);
585 pci_mmcfg_config_num = 0;
586 return -ENODEV; 572 return -ENODEV;
587 } 573 }
574
575 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
576 cfg->end_bus_number, cfg->address) == NULL) {
577 printk(KERN_WARNING PREFIX
578 "no memory for MCFG entries\n");
579 free_all_mmcfg();
580 return -ENOMEM;
581 }
588 } 582 }
589 583
590 return 0; 584 return 0;
@@ -614,9 +608,7 @@ static void __init __pci_mmcfg_init(int early)
614 608
615 pci_mmcfg_reject_broken(early); 609 pci_mmcfg_reject_broken(early);
616 610
617 if ((pci_mmcfg_config_num == 0) || 611 if (list_empty(&pci_mmcfg_list))
618 (pci_mmcfg_config == NULL) ||
619 (pci_mmcfg_config[0].address == 0))
620 return; 612 return;
621 613
622 if (pci_mmcfg_arch_init()) 614 if (pci_mmcfg_arch_init())
@@ -648,9 +640,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
648 */ 640 */
649 if ((pci_mmcfg_resources_inserted == 1) || 641 if ((pci_mmcfg_resources_inserted == 1) ||
650 (pci_probe & PCI_PROBE_MMCONF) == 0 || 642 (pci_probe & PCI_PROBE_MMCONF) == 0 ||
651 (pci_mmcfg_config_num == 0) || 643 list_empty(&pci_mmcfg_list))
652 (pci_mmcfg_config == NULL) ||
653 (pci_mmcfg_config[0].address == 0))
654 return 1; 644 return 1;
655 645
656 /* 646 /*
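
For reference, the list-based storage this file now converts to lives in
arch/x86/include/asm/pci_x86.h and mmconfig-shared.c. A minimal sketch of
the region struct and its constructor as introduced elsewhere in this
series (the real pci_mmconfig_add() also fills in and sorts a struct
resource for the region; details are abbreviated here):

	struct pci_mmcfg_region {
		struct list_head list;
		struct resource res;
		u64 address;		/* MMCONFIG aperture base */
		char __iomem *virt;	/* mapping, set by arch init */
		u16 segment;
		u8 start_bus;
		u8 end_bus;
	};

	LIST_HEAD(pci_mmcfg_list);

	static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment,
			int start, int end, u64 addr)
	{
		struct pci_mmcfg_region *new;

		if (addr == 0)
			return NULL;

		new = kzalloc(sizeof(*new), GFP_KERNEL);
		if (!new)
			return NULL;

		new->address = addr;
		new->segment = segment;
		new->start_bus = start;
		new->end_bus = end;
		list_add(&new->list, &pci_mmcfg_list);

		return new;
	}
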
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f10a7e94a84c..90d5fd476ed4 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -27,18 +27,10 @@ static int mmcfg_last_accessed_cpu;
27 */ 27 */
28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) 28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
29{ 29{
30 struct acpi_mcfg_allocation *cfg; 30 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
31 int cfg_num;
32
33 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
34 cfg = &pci_mmcfg_config[cfg_num];
35 if (cfg->pci_segment == seg &&
36 (cfg->start_bus_number <= bus) &&
37 (cfg->end_bus_number >= bus))
38 return cfg->address;
39 }
40 31
41 /* Fall back to type 0 */ 32 if (cfg)
33 return cfg->address;
42 return 0; 34 return 0;
43} 35}
44 36
@@ -47,7 +39,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
47 */ 39 */
48static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn) 40static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
49{ 41{
50 u32 dev_base = base | (bus << 20) | (devfn << 12); 42 u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12);
51 int cpu = smp_processor_id(); 43 int cpu = smp_processor_id();
52 if (dev_base != mmcfg_last_accessed_device || 44 if (dev_base != mmcfg_last_accessed_device ||
53 cpu != mmcfg_last_accessed_cpu) { 45 cpu != mmcfg_last_accessed_cpu) {
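
PCI_MMCFG_BUS_OFFSET() simply names the previously open-coded "bus << 20"
shift: each bus occupies 1 MB of the MMCONFIG aperture (256 devfns times
4 KB of config space each). A sketch of the helper as added to
asm/pci_x86.h by this series, with hypothetical example values:

	#define PCI_MMCFG_BUS_OFFSET(bus)	((bus) << 20)

	/* e.g. config space of bus 3, device 2, function 1:
	 * offset = PCI_MMCFG_BUS_OFFSET(3) | (PCI_DEVFN(2, 1) << 12)
	 *        = 0x300000 | (0x11 << 12) = 0x311000
	 */
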
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 94349f8b2f96..e783841bd1d7 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -12,38 +12,15 @@
12#include <asm/e820.h> 12#include <asm/e820.h>
13#include <asm/pci_x86.h> 13#include <asm/pci_x86.h>
14 14
15/* Static virtual mapping of the MMCONFIG aperture */ 15#define PREFIX "PCI: "
16struct mmcfg_virt {
17 struct acpi_mcfg_allocation *cfg;
18 char __iomem *virt;
19};
20static struct mmcfg_virt *pci_mmcfg_virt;
21
22static char __iomem *get_virt(unsigned int seg, unsigned bus)
23{
24 struct acpi_mcfg_allocation *cfg;
25 int cfg_num;
26
27 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
28 cfg = pci_mmcfg_virt[cfg_num].cfg;
29 if (cfg->pci_segment == seg &&
30 (cfg->start_bus_number <= bus) &&
31 (cfg->end_bus_number >= bus))
32 return pci_mmcfg_virt[cfg_num].virt;
33 }
34
35 /* Fall back to type 0 */
36 return NULL;
37}
38 16
39static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 17static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
40{ 18{
41 char __iomem *addr; 19 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
42 20
43 addr = get_virt(seg, bus); 21 if (cfg && cfg->virt)
44 if (!addr) 22 return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
45 return NULL; 23 return NULL;
46 return addr + ((bus << 20) | (devfn << 12));
47} 24}
48 25
49static int pci_mmcfg_read(unsigned int seg, unsigned int bus, 26static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
@@ -109,42 +86,30 @@ static struct pci_raw_ops pci_mmcfg = {
109 .write = pci_mmcfg_write, 86 .write = pci_mmcfg_write,
110}; 87};
111 88
112static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) 89static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg)
113{ 90{
114 void __iomem *addr; 91 void __iomem *addr;
115 u64 start, size; 92 u64 start, size;
93 int num_buses;
116 94
117 start = cfg->start_bus_number; 95 start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
118 start <<= 20; 96 num_buses = cfg->end_bus - cfg->start_bus + 1;
119 start += cfg->address; 97 size = PCI_MMCFG_BUS_OFFSET(num_buses);
120 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
121 size <<= 20;
122 addr = ioremap_nocache(start, size); 98 addr = ioremap_nocache(start, size);
123 if (addr) { 99 if (addr)
124 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", 100 addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
125 start, start + size - 1);
126 addr -= cfg->start_bus_number << 20;
127 }
128 return addr; 101 return addr;
129} 102}
130 103
131int __init pci_mmcfg_arch_init(void) 104int __init pci_mmcfg_arch_init(void)
132{ 105{
133 int i; 106 struct pci_mmcfg_region *cfg;
134 pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) *
135 pci_mmcfg_config_num, GFP_KERNEL);
136 if (pci_mmcfg_virt == NULL) {
137 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
138 return 0;
139 }
140 107
141 for (i = 0; i < pci_mmcfg_config_num; ++i) { 108 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
142 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; 109 cfg->virt = mcfg_ioremap(cfg);
143 pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); 110 if (!cfg->virt) {
144 if (!pci_mmcfg_virt[i].virt) { 111 printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
145 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " 112 &cfg->res);
146 "segment %d\n",
147 pci_mmcfg_config[i].pci_segment);
148 pci_mmcfg_arch_free(); 113 pci_mmcfg_arch_free();
149 return 0; 114 return 0;
150 } 115 }
@@ -155,19 +120,12 @@ int __init pci_mmcfg_arch_init(void)
155 120
156void __init pci_mmcfg_arch_free(void) 121void __init pci_mmcfg_arch_free(void)
157{ 122{
158 int i; 123 struct pci_mmcfg_region *cfg;
159
160 if (pci_mmcfg_virt == NULL)
161 return;
162 124
163 for (i = 0; i < pci_mmcfg_config_num; ++i) { 125 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
164 if (pci_mmcfg_virt[i].virt) { 126 if (cfg->virt) {
165 iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); 127 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
166 pci_mmcfg_virt[i].virt = NULL; 128 cfg->virt = NULL;
167 pci_mmcfg_virt[i].cfg = NULL;
168 } 129 }
169 } 130 }
170
171 kfree(pci_mmcfg_virt);
172 pci_mmcfg_virt = NULL;
173} 131}
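
Both the 32-bit and 64-bit variants now delegate the segment/bus match to
a shared pci_mmconfig_lookup(). A sketch of that helper, assuming the
pci_mmcfg_list storage outlined earlier:

	struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
	{
		struct pci_mmcfg_region *cfg;

		list_for_each_entry(cfg, &pci_mmcfg_list, list)
			if (cfg->segment == segment &&
			    cfg->start_bus <= bus && bus <= cfg->end_bus)
				return cfg;

		return NULL;
	}
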
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 8aa85f17667e..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
18#include <asm/mce.h> 18#include <asm/mce.h>
19#include <asm/xcr.h> 19#include <asm/xcr.h>
20#include <asm/suspend.h> 20#include <asm/suspend.h>
21#include <asm/debugreg.h>
21 22
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23static struct saved_context saved_context; 24static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
142#endif 143#endif
143 load_TR_desc(); /* This does ltr */ 144 load_TR_desc(); /* This does ltr */
144 load_LDT(&current->active_mm->context); /* This does lldt */ 145 load_LDT(&current->active_mm->context); /* This does lldt */
145
146 /*
147 * Now maybe reload the debug registers
148 */
149 if (current->thread.debugreg7) {
150#ifdef CONFIG_X86_32
151 set_debugreg(current->thread.debugreg0, 0);
152 set_debugreg(current->thread.debugreg1, 1);
153 set_debugreg(current->thread.debugreg2, 2);
154 set_debugreg(current->thread.debugreg3, 3);
155 /* no 4 and 5 */
156 set_debugreg(current->thread.debugreg6, 6);
157 set_debugreg(current->thread.debugreg7, 7);
158#else
159 /* CONFIG_X86_64 */
160 loaddebug(&current->thread, 0);
161 loaddebug(&current->thread, 1);
162 loaddebug(&current->thread, 2);
163 loaddebug(&current->thread, 3);
164 /* no 4 and 5 */
165 loaddebug(&current->thread, 6);
166 loaddebug(&current->thread, 7);
167#endif
168 }
169
170} 146}
171 147
172/** 148/**
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644
index 000000000000..f82082677337
--- /dev/null
+++ b/arch/x86/tools/Makefile
@@ -0,0 +1,31 @@
1PHONY += posttest
2
3ifeq ($(KBUILD_VERBOSE),1)
4 posttest_verbose = -v
5else
6 posttest_verbose =
7endif
8
9ifeq ($(CONFIG_64BIT),y)
10 posttest_64bit = -y
11else
12 posttest_64bit = -n
13endif
14
15distill_awk = $(srctree)/arch/x86/tools/distill.awk
16chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
17
18quiet_cmd_posttest = TEST $@
19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
20
21posttest: $(obj)/test_get_len vmlinux
22 $(call cmd,posttest)
23
24hostprogs-y := test_get_len
25
26# -I is needed for generated C source and for C source in the kernel tree.
27HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
28
29# Dependencies are also needed.
30$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
31
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644
index 000000000000..fd1ab80be0de
--- /dev/null
+++ b/arch/x86/tools/chkobjdump.awk
@@ -0,0 +1,33 @@
1# GNU objdump version checker
2#
3# Usage:
4# objdump -v | awk -f chkobjdump.awk
5BEGIN {
6 # objdump version 2.19 or later is OK for the test.
7 od_ver = 2;
8 od_sver = 19;
9}
10
11/^GNU objdump/ {
12 verstr = ""
13 for (i = 3; i <= NF; i++)
14 if (match($(i), "^[0-9]")) {
15 verstr = $(i);
16 break;
17 }
18 if (verstr == "") {
19 printf("Warning: Failed to find objdump version number.\n");
20 exit 0;
21 }
22 split(verstr, ver, ".");
23 if (ver[1] > od_ver ||
24 (ver[1] == od_ver && ver[2] >= od_sver)) {
25 exit 1;
26 } else {
27 printf("Warning: objdump version %s is older than %d.%d\n",
28 verstr, od_ver, od_sver);
29 print("Warning: Skipping posttest.");
30		# Logic is inverted: the test is just skipped, without an error.
31 exit 0;
32 }
33}
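
Note the deliberately inverted exit status: exit code 1 means objdump is
new enough, because the Makefile above runs
"($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || <run posttest>", and the
right-hand side of || only executes when the version check "fails". The
version predicate itself, restated in C for clarity (a sketch):

	/* "new enough" means GNU objdump 2.19 or later */
	static int objdump_is_new_enough(int major, int minor)
	{
		return major > 2 || (major == 2 && minor >= 19);
	}
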
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644
index 000000000000..c13c0ee48ab4
--- /dev/null
+++ b/arch/x86/tools/distill.awk
@@ -0,0 +1,47 @@
1#!/bin/awk -f
2# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
3# Distills the disassembly as follows:
4# - Removes all lines except the disassembled instructions.
5# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
6# into a single line.
7# - Removes bad (or prefix-only) instructions
8
9BEGIN {
10 prev_addr = ""
11 prev_hex = ""
12 prev_mnemonic = ""
13 bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
14 fwait_expr = "^9b "
15 fwait_str="9b\tfwait"
16}
17
18/^ *[0-9a-f]+ <[^>]*>:/ {
19 # Symbol entry
20 printf("%s%s\n", $2, $1)
21}
22
23/^ *[0-9a-f]+:/ {
24 if (split($0, field, "\t") < 3) {
25 # This is a continuation of the same insn.
26 prev_hex = prev_hex field[2]
27 } else {
28 # Skip bad instructions
29 if (match(prev_mnemonic, bad_expr))
30 prev_addr = ""
31 # Split fwait from other f* instructions
32 if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
33 printf "%s\t%s\n", prev_addr, fwait_str
34 sub(fwait_expr, "", prev_hex)
35 }
36 if (prev_addr != "")
37 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
38 prev_addr = field[1]
39 prev_hex = field[2]
40 prev_mnemonic = field[3]
41 }
42}
43
44END {
45 if (prev_addr != "")
46 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
47}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644
index 000000000000..eaf11f52fc0b
--- /dev/null
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -0,0 +1,378 @@
1#!/bin/awk -f
2# gen-insn-attr-x86.awk: Instruction attribute table generator
3# Written by Masami Hiramatsu <mhiramat@redhat.com>
4#
5# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
6
7# Awk implementation sanity check
8function check_awk_implement() {
9 if (sprintf("%x", 0) != "0")
10 return "Your awk has a printf-format problem."
11 return ""
12}
13
14# Clear working vars
15function clear_vars() {
16 delete table
17 delete lptable2
18 delete lptable1
19 delete lptable3
20 eid = -1 # escape id
21 gid = -1 # group id
22 aid = -1 # AVX id
23 tname = ""
24}
25
26BEGIN {
27 # Implementation error checking
28 awkchecked = check_awk_implement()
29 if (awkchecked != "") {
30 print "Error: " awkchecked > "/dev/stderr"
31 print "Please try to use gawk." > "/dev/stderr"
32 exit 1
33 }
34
35 # Setup generating tables
36 print "/* x86 opcode map generated from x86-opcode-map.txt */"
37 print "/* Do not change this code. */\n"
38 ggid = 1
39 geid = 1
40 gaid = 0
41 delete etable
42 delete gtable
43 delete atable
44
45 opnd_expr = "^[A-Za-z/]"
46 ext_expr = "^\\("
47 sep_expr = "^\\|$"
48 group_expr = "^Grp[0-9A-Za-z]+"
49
50 imm_expr = "^[IJAO][a-z]"
51 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
52 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
53 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
54 imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
55 imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
56 imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
57 imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
58 imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
59 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
60 imm_flag["Ob"] = "INAT_MOFFSET"
61 imm_flag["Ov"] = "INAT_MOFFSET"
62
63 modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
64 force64_expr = "\\([df]64\\)"
65 rex_expr = "^REX(\\.[XRWB]+)*"
66 fpu_expr = "^ESC" # TODO
67
68 lprefix1_expr = "\\(66\\)"
69 lprefix2_expr = "\\(F3\\)"
70 lprefix3_expr = "\\(F2\\)"
71 max_lprefix = 4
72
73 vexok_expr = "\\(VEX\\)"
74 vexonly_expr = "\\(oVEX\\)"
75
76 prefix_expr = "\\(Prefix\\)"
77 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
78 prefix_num["REPNE"] = "INAT_PFX_REPNE"
79 prefix_num["REP/REPE"] = "INAT_PFX_REPE"
80 prefix_num["LOCK"] = "INAT_PFX_LOCK"
81 prefix_num["SEG=CS"] = "INAT_PFX_CS"
82 prefix_num["SEG=DS"] = "INAT_PFX_DS"
83 prefix_num["SEG=ES"] = "INAT_PFX_ES"
84 prefix_num["SEG=FS"] = "INAT_PFX_FS"
85 prefix_num["SEG=GS"] = "INAT_PFX_GS"
86 prefix_num["SEG=SS"] = "INAT_PFX_SS"
87 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
88 prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
89 prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
90
91 clear_vars()
92}
93
94function semantic_error(msg) {
95 print "Semantic error at " NR ": " msg > "/dev/stderr"
96 exit 1
97}
98
99function debug(msg) {
100 print "DEBUG: " msg
101}
102
103function array_size(arr, i,c) {
104 c = 0
105 for (i in arr)
106 c++
107 return c
108}
109
110/^Table:/ {
111 print "/* " $0 " */"
112 if (tname != "")
113 semantic_error("Hit Table: before EndTable:.");
114}
115
116/^Referrer:/ {
117 if (NF != 1) {
118 # escape opcode table
119 ref = ""
120 for (i = 2; i <= NF; i++)
121 ref = ref $i
122 eid = escape[ref]
123 tname = sprintf("inat_escape_table_%d", eid)
124 }
125}
126
127/^AVXcode:/ {
128 if (NF != 1) {
129 # AVX/escape opcode table
130 aid = $2
131 if (gaid <= aid)
132 gaid = aid + 1
133 if (tname == "") # AVX only opcode table
134 tname = sprintf("inat_avx_table_%d", $2)
135 }
136 if (aid == -1 && eid == -1) # primary opcode table
137 tname = "inat_primary_table"
138}
139
140/^GrpTable:/ {
141 print "/* " $0 " */"
142 if (!($2 in group))
143 semantic_error("No group: " $2 )
144 gid = group[$2]
145 tname = "inat_group_table_" gid
146}
147
148function print_table(tbl,name,fmt,n)
149{
150 print "const insn_attr_t " name " = {"
151 for (i = 0; i < n; i++) {
152 id = sprintf(fmt, i)
153 if (tbl[id])
154 print " [" id "] = " tbl[id] ","
155 }
156 print "};"
157}
158
159/^EndTable/ {
160 if (gid != -1) {
161 # print group tables
162 if (array_size(table) != 0) {
163 print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
164 "0x%x", 8)
165 gtable[gid,0] = tname
166 }
167 if (array_size(lptable1) != 0) {
168 print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
169 "0x%x", 8)
170 gtable[gid,1] = tname "_1"
171 }
172 if (array_size(lptable2) != 0) {
173 print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
174 "0x%x", 8)
175 gtable[gid,2] = tname "_2"
176 }
177 if (array_size(lptable3) != 0) {
178 print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
179 "0x%x", 8)
180 gtable[gid,3] = tname "_3"
181 }
182 } else {
183 # print primary/escaped tables
184 if (array_size(table) != 0) {
185 print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
186 "0x%02x", 256)
187 etable[eid,0] = tname
188 if (aid >= 0)
189 atable[aid,0] = tname
190 }
191 if (array_size(lptable1) != 0) {
192 print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
193 "0x%02x", 256)
194 etable[eid,1] = tname "_1"
195 if (aid >= 0)
196 atable[aid,1] = tname "_1"
197 }
198 if (array_size(lptable2) != 0) {
199 print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
200 "0x%02x", 256)
201 etable[eid,2] = tname "_2"
202 if (aid >= 0)
203 atable[aid,2] = tname "_2"
204 }
205 if (array_size(lptable3) != 0) {
206 print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
207 "0x%02x", 256)
208 etable[eid,3] = tname "_3"
209 if (aid >= 0)
210 atable[aid,3] = tname "_3"
211 }
212 }
213 print ""
214 clear_vars()
215}
216
217function add_flags(old,new) {
218 if (old && new)
219 return old " | " new
220 else if (old)
221 return old
222 else
223 return new
224}
225
226# convert operands to flags.
227function convert_operands(count,opnd, i,j,imm,mod)
228{
229 imm = null
230 mod = null
231 for (j = 1; j <= count; j++) {
232 i = opnd[j]
233 if (match(i, imm_expr) == 1) {
234 if (!imm_flag[i])
235 semantic_error("Unknown imm opnd: " i)
236 if (imm) {
237 if (i != "Ib")
238 semantic_error("Second IMM error")
239 imm = add_flags(imm, "INAT_SCNDIMM")
240 } else
241 imm = imm_flag[i]
242 } else if (match(i, modrm_expr))
243 mod = "INAT_MODRM"
244 }
245 return add_flags(imm, mod)
246}
247
248/^[0-9a-f]+\:/ {
249 if (NR == 1)
250 next
251 # get index
252 idx = "0x" substr($1, 1, index($1,":") - 1)
253 if (idx in table)
254 semantic_error("Redefine " idx " in " tname)
255
256 # check if escaped opcode
257 if ("escape" == $2) {
258 if ($3 != "#")
259 semantic_error("No escaped name")
260 ref = ""
261 for (i = 4; i <= NF; i++)
262 ref = ref $i
263 if (ref in escape)
264 semantic_error("Redefine escape (" ref ")")
265 escape[ref] = geid
266 geid++
267 table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
268 next
269 }
270
271 variant = null
272 # converts
273 i = 2
274 while (i <= NF) {
275 opcode = $(i++)
276 delete opnds
277 ext = null
278 flags = null
279 opnd = null
280 # parse one opcode
281 if (match($i, opnd_expr)) {
282 opnd = $i
283 count = split($(i++), opnds, ",")
284 flags = convert_operands(count, opnds)
285 }
286 if (match($i, ext_expr))
287 ext = $(i++)
288 if (match($i, sep_expr))
289 i++
290 else if (i < NF)
291 semantic_error($i " is not a separator")
292
293 # check if group opcode
294 if (match(opcode, group_expr)) {
295 if (!(opcode in group)) {
296 group[opcode] = ggid
297 ggid++
298 }
299 flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
300 }
301 # check force(or default) 64bit
302 if (match(ext, force64_expr))
303 flags = add_flags(flags, "INAT_FORCE64")
304
305 # check REX prefix
306 if (match(opcode, rex_expr))
307 flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
308
309 # check coprocessor escape : TODO
310 if (match(opcode, fpu_expr))
311 flags = add_flags(flags, "INAT_MODRM")
312
313 # check VEX only code
314 if (match(ext, vexonly_expr))
315 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
316
317		# check VEX-capable (VEXOK) code
318 if (match(ext, vexok_expr))
319 flags = add_flags(flags, "INAT_VEXOK")
320
321 # check prefixes
322 if (match(ext, prefix_expr)) {
323 if (!prefix_num[opcode])
324 semantic_error("Unknown prefix: " opcode)
325 flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
326 }
327 if (length(flags) == 0)
328 continue
329 # check if last prefix
330 if (match(ext, lprefix1_expr)) {
331 lptable1[idx] = add_flags(lptable1[idx],flags)
332 variant = "INAT_VARIANT"
333 } else if (match(ext, lprefix2_expr)) {
334 lptable2[idx] = add_flags(lptable2[idx],flags)
335 variant = "INAT_VARIANT"
336 } else if (match(ext, lprefix3_expr)) {
337 lptable3[idx] = add_flags(lptable3[idx],flags)
338 variant = "INAT_VARIANT"
339 } else {
340 table[idx] = add_flags(table[idx],flags)
341 }
342 }
343 if (variant)
344 table[idx] = add_flags(table[idx],variant)
345}
346
347END {
348 if (awkchecked != "")
349 exit 1
350 # print escape opcode map's array
351 print "/* Escape opcode map array */"
352 print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \
353 "[INAT_LSTPFX_MAX + 1] = {"
354 for (i = 0; i < geid; i++)
355 for (j = 0; j < max_lprefix; j++)
356 if (etable[i,j])
357 print " ["i"]["j"] = "etable[i,j]","
358 print "};\n"
359 # print group opcode map's array
360 print "/* Group opcode map array */"
361 print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\
362 "[INAT_LSTPFX_MAX + 1] = {"
363 for (i = 0; i < ggid; i++)
364 for (j = 0; j < max_lprefix; j++)
365 if (gtable[i,j])
366 print " ["i"]["j"] = "gtable[i,j]","
367 print "};\n"
368 # print AVX opcode map's array
369 print "/* AVX opcode map array */"
370 print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\
371 "[INAT_LSTPFX_MAX + 1] = {"
372 for (i = 0; i < gaid; i++)
373 for (j = 0; j < max_lprefix; j++)
374 if (atable[i,j])
375 print " ["i"]["j"] = "atable[i,j]","
376 print "};"
377}
378
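
The generated inat-tables.c is included by arch/x86/lib/inat.c, which
resolves an opcode's attributes by walking the arrays printed above. A
simplified sketch of that lookup (names such as lpfx_id and opcode2 are
placeholders; the real inat.c also falls back to the prefix-less table
when no last-prefix variant exists):

	insn_attr_t attr = inat_primary_table[opcode];

	if (inat_is_escape(attr)) {
		/* two-/three-byte opcode: pick the escape table by
		 * escape id and last legacy prefix, then index it by
		 * the second opcode byte */
		const insn_attr_t *table;

		table = inat_escape_tables[inat_escape_id(attr)][lpfx_id];
		if (table)
			attr = table[opcode2];
	}
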
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644
index 000000000000..bee8d6ac2691
--- /dev/null
+++ b/arch/x86/tools/test_get_len.c
@@ -0,0 +1,173 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2009
17 */
18
19#include <stdlib.h>
20#include <stdio.h>
21#include <string.h>
22#include <assert.h>
23#include <unistd.h>
24
25#define unlikely(cond) (cond)
26
27#include <asm/insn.h>
28#include <inat.c>
29#include <insn.c>
30
31/*
32 * Test of instruction analysis in general and insn_get_length() in
33 * particular. See if insn_get_length() and the disassembler agree
34 * on the length of each instruction in an ELF disassembly.
35 *
36 * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
37 */
38
39const char *prog;
40static int verbose;
41static int x86_64;
42
43static void usage(void)
44{
45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
46 " %s [-y|-n] [-v] \n", prog);
47 fprintf(stderr, "\t-y 64bit mode\n");
48 fprintf(stderr, "\t-n 32bit mode\n");
49 fprintf(stderr, "\t-v verbose mode\n");
50 exit(1);
51}
52
53static void malformed_line(const char *line, int line_nr)
54{
55 fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
56 exit(3);
57}
58
59static void dump_field(FILE *fp, const char *name, const char *indent,
60 struct insn_field *field)
61{
62 fprintf(fp, "%s.%s = {\n", indent, name);
63 fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
64 indent, field->value, field->bytes[0], field->bytes[1],
65 field->bytes[2], field->bytes[3]);
66 fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
67 field->got, field->nbytes);
68}
69
70static void dump_insn(FILE *fp, struct insn *insn)
71{
72 fprintf(fp, "Instruction = { \n");
73 dump_field(fp, "prefixes", "\t", &insn->prefixes);
74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
76 dump_field(fp, "opcode", "\t", &insn->opcode);
77 dump_field(fp, "modrm", "\t", &insn->modrm);
78 dump_field(fp, "sib", "\t", &insn->sib);
79 dump_field(fp, "displacement", "\t", &insn->displacement);
80 dump_field(fp, "immediate1", "\t", &insn->immediate1);
81 dump_field(fp, "immediate2", "\t", &insn->immediate2);
82 fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
83 insn->attr, insn->opnd_bytes, insn->addr_bytes);
84 fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
85 insn->length, insn->x86_64, insn->kaddr);
86}
87
88static void parse_args(int argc, char **argv)
89{
90 int c;
91 prog = argv[0];
92 while ((c = getopt(argc, argv, "ynv")) != -1) {
93 switch (c) {
94 case 'y':
95 x86_64 = 1;
96 break;
97 case 'n':
98 x86_64 = 0;
99 break;
100 case 'v':
101 verbose = 1;
102 break;
103 default:
104 usage();
105 }
106 }
107}
108
109#define BUFSIZE 256
110
111int main(int argc, char **argv)
112{
113 char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
114 unsigned char insn_buf[16];
115 struct insn insn;
116 int insns = 0;
117 int warnings = 0;
118
119 parse_args(argc, argv);
120
121 while (fgets(line, BUFSIZE, stdin)) {
122 char copy[BUFSIZE], *s, *tab1, *tab2;
123 int nb = 0;
124 unsigned int b;
125
126 if (line[0] == '<') {
127 /* Symbol line */
128 strcpy(sym, line);
129 continue;
130 }
131
132 insns++;
133 memset(insn_buf, 0, 16);
134 strcpy(copy, line);
135 tab1 = strchr(copy, '\t');
136 if (!tab1)
137 malformed_line(line, insns);
138 s = tab1 + 1;
139 s += strspn(s, " ");
140 tab2 = strchr(s, '\t');
141 if (!tab2)
142 malformed_line(line, insns);
143 *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
144 while (s < tab2) {
145 if (sscanf(s, "%x", &b) == 1) {
146 insn_buf[nb++] = (unsigned char) b;
147 s += 3;
148 } else
149 break;
150 }
151 /* Decode an instruction */
152 insn_init(&insn, insn_buf, x86_64);
153 insn_get_length(&insn);
154 if (insn.length != nb) {
155 warnings++;
156 fprintf(stderr, "Warning: %s found difference at %s\n",
157 prog, sym);
158 fprintf(stderr, "Warning: %s", line);
159 fprintf(stderr, "Warning: objdump says %d bytes, but "
160 "insn_get_length() says %d\n", nb,
161 insn.length);
162 if (verbose)
163 dump_insn(stderr, &insn);
164 }
165 }
166 if (warnings)
167 fprintf(stderr, "Warning: decoded and checked %d"
168 " instructions with %d warnings\n", insns, warnings);
169 else
170 fprintf(stderr, "Success: decoded and checked %d"
171 " instructions\n", insns);
172 return 0;
173}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f68b12..02b442e92007 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -393,7 +393,6 @@ static ctl_table abi_table2[] = {
393 393
394static ctl_table abi_root_table2[] = { 394static ctl_table abi_root_table2[] = {
395 { 395 {
396 .ctl_name = CTL_ABI,
397 .procname = "abi", 396 .procname = "abi",
398 .mode = 0555, 397 .mode = 0555,
399 .child = abi_table2 398 .child = abi_table2
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c5e805d4a788..36daccb68642 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -27,7 +27,9 @@
27#include <linux/page-flags.h> 27#include <linux/page-flags.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/console.h> 29#include <linux/console.h>
30#include <linux/pci.h>
30 31
32#include <xen/xen.h>
31#include <xen/interface/xen.h> 33#include <xen/interface/xen.h>
32#include <xen/interface/version.h> 34#include <xen/interface/version.h>
33#include <xen/interface/physdev.h> 35#include <xen/interface/physdev.h>
@@ -138,24 +140,23 @@ static void xen_vcpu_setup(int cpu)
138 */ 140 */
139void xen_vcpu_restore(void) 141void xen_vcpu_restore(void)
140{ 142{
141 if (have_vcpu_info_placement) { 143 int cpu;
142 int cpu;
143 144
144 for_each_online_cpu(cpu) { 145 for_each_online_cpu(cpu) {
145 bool other_cpu = (cpu != smp_processor_id()); 146 bool other_cpu = (cpu != smp_processor_id());
146 147
147 if (other_cpu && 148 if (other_cpu &&
148 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) 149 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
149 BUG(); 150 BUG();
150 151
151 xen_vcpu_setup(cpu); 152 xen_setup_runstate_info(cpu);
152 153
153 if (other_cpu && 154 if (have_vcpu_info_placement)
154 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) 155 xen_vcpu_setup(cpu);
155 BUG();
156 }
157 156
158 BUG_ON(!have_vcpu_info_placement); 157 if (other_cpu &&
158 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
159 BUG();
159 } 160 }
160} 161}
161 162
@@ -178,6 +179,7 @@ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
178static void xen_cpuid(unsigned int *ax, unsigned int *bx, 179static void xen_cpuid(unsigned int *ax, unsigned int *bx,
179 unsigned int *cx, unsigned int *dx) 180 unsigned int *cx, unsigned int *dx)
180{ 181{
182 unsigned maskebx = ~0;
181 unsigned maskecx = ~0; 183 unsigned maskecx = ~0;
182 unsigned maskedx = ~0; 184 unsigned maskedx = ~0;
183 185
@@ -185,9 +187,16 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
185 * Mask out inconvenient features, to try and disable as many 187 * Mask out inconvenient features, to try and disable as many
186 * unsupported kernel subsystems as possible. 188 * unsupported kernel subsystems as possible.
187 */ 189 */
188 if (*ax == 1) { 190 switch (*ax) {
191 case 1:
189 maskecx = cpuid_leaf1_ecx_mask; 192 maskecx = cpuid_leaf1_ecx_mask;
190 maskedx = cpuid_leaf1_edx_mask; 193 maskedx = cpuid_leaf1_edx_mask;
194 break;
195
196 case 0xb:
197 /* Suppress extended topology stuff */
198 maskebx = 0;
199 break;
191 } 200 }
192 201
193 asm(XEN_EMULATE_PREFIX "cpuid" 202 asm(XEN_EMULATE_PREFIX "cpuid"
@@ -197,6 +206,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
197 "=d" (*dx) 206 "=d" (*dx)
198 : "0" (*ax), "2" (*cx)); 207 : "0" (*ax), "2" (*cx));
199 208
209 *bx &= maskebx;
200 *cx &= maskecx; 210 *cx &= maskecx;
201 *dx &= maskedx; 211 *dx &= maskedx;
202} 212}
@@ -1075,6 +1085,8 @@ asmlinkage void __init xen_start_kernel(void)
1075 * Set up some pagetable state before starting to set any ptes. 1085 * Set up some pagetable state before starting to set any ptes.
1076 */ 1086 */
1077 1087
1088 xen_init_mmu_ops();
1089
1078 /* Prevent unwanted bits from being set in PTEs. */ 1090 /* Prevent unwanted bits from being set in PTEs. */
1079 __supported_pte_mask &= ~_PAGE_GLOBAL; 1091 __supported_pte_mask &= ~_PAGE_GLOBAL;
1080 if (!xen_initial_domain()) 1092 if (!xen_initial_domain())
@@ -1097,7 +1109,6 @@ asmlinkage void __init xen_start_kernel(void)
1097 */ 1109 */
1098 xen_setup_stackprotector(); 1110 xen_setup_stackprotector();
1099 1111
1100 xen_init_mmu_ops();
1101 xen_init_irq_ops(); 1112 xen_init_irq_ops();
1102 xen_init_cpuid_mask(); 1113 xen_init_cpuid_mask();
1103 1114
@@ -1140,9 +1151,13 @@ asmlinkage void __init xen_start_kernel(void)
1140 1151
1141 /* keep using Xen gdt for now; no urgent need to change it */ 1152 /* keep using Xen gdt for now; no urgent need to change it */
1142 1153
1154#ifdef CONFIG_X86_32
1143 pv_info.kernel_rpl = 1; 1155 pv_info.kernel_rpl = 1;
1144 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1156 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1145 pv_info.kernel_rpl = 0; 1157 pv_info.kernel_rpl = 0;
1158#else
1159 pv_info.kernel_rpl = 0;
1160#endif
1146 1161
1147 /* set the limit of our address space */ 1162 /* set the limit of our address space */
1148 xen_reserve_top(); 1163 xen_reserve_top();
@@ -1166,10 +1181,16 @@ asmlinkage void __init xen_start_kernel(void)
1166 add_preferred_console("xenboot", 0, NULL); 1181 add_preferred_console("xenboot", 0, NULL);
1167 add_preferred_console("tty", 0, NULL); 1182 add_preferred_console("tty", 0, NULL);
1168 add_preferred_console("hvc", 0, NULL); 1183 add_preferred_console("hvc", 0, NULL);
1184 } else {
1185 /* Make sure ACS will be enabled */
1186 pci_request_acs();
1169 } 1187 }
1188
1170 1189
1171 xen_raw_console_write("about to get started...\n"); 1190 xen_raw_console_write("about to get started...\n");
1172 1191
1192 xen_setup_runstate_info(0);
1193
1173 /* Start the world */ 1194 /* Start the world */
1174#ifdef CONFIG_X86_32 1195#ifdef CONFIG_X86_32
1175 i386_start_kernel(); 1196 i386_start_kernel();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3bf7b1d250ce..bf4cd6bfe959 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -185,7 +185,7 @@ static inline unsigned p2m_index(unsigned long pfn)
185} 185}
186 186
187/* Build the parallel p2m_top_mfn structures */ 187/* Build the parallel p2m_top_mfn structures */
188static void __init xen_build_mfn_list_list(void) 188void xen_build_mfn_list_list(void)
189{ 189{
190 unsigned pfn, idx; 190 unsigned pfn, idx;
191 191
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index fe03eeed7b48..563d20504988 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -35,10 +35,10 @@
35 35
36cpumask_var_t xen_cpu_initialized_map; 36cpumask_var_t xen_cpu_initialized_map;
37 37
38static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, xen_resched_irq);
39static DEFINE_PER_CPU(int, callfunc_irq); 39static DEFINE_PER_CPU(int, xen_callfunc_irq);
40static DEFINE_PER_CPU(int, callfuncsingle_irq); 40static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
41static DEFINE_PER_CPU(int, debug_irq) = -1; 41static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
42 42
43static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 43static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
44static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 44static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -73,7 +73,7 @@ static __cpuinit void cpu_bringup(void)
73 73
74 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
75 75
76 cpu_set(cpu, cpu_online_map); 76 set_cpu_online(cpu, true);
77 percpu_write(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
78 wmb(); 78 wmb();
79 79
@@ -103,7 +103,7 @@ static int xen_smp_intr_init(unsigned int cpu)
103 NULL); 103 NULL);
104 if (rc < 0) 104 if (rc < 0)
105 goto fail; 105 goto fail;
106 per_cpu(resched_irq, cpu) = rc; 106 per_cpu(xen_resched_irq, cpu) = rc;
107 107
108 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); 108 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
109 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, 109 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
@@ -114,7 +114,7 @@ static int xen_smp_intr_init(unsigned int cpu)
114 NULL); 114 NULL);
115 if (rc < 0) 115 if (rc < 0)
116 goto fail; 116 goto fail;
117 per_cpu(callfunc_irq, cpu) = rc; 117 per_cpu(xen_callfunc_irq, cpu) = rc;
118 118
119 debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); 119 debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
120 rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, 120 rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
@@ -122,7 +122,7 @@ static int xen_smp_intr_init(unsigned int cpu)
122 debug_name, NULL); 122 debug_name, NULL);
123 if (rc < 0) 123 if (rc < 0)
124 goto fail; 124 goto fail;
125 per_cpu(debug_irq, cpu) = rc; 125 per_cpu(xen_debug_irq, cpu) = rc;
126 126
127 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); 127 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
128 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, 128 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
@@ -133,19 +133,20 @@ static int xen_smp_intr_init(unsigned int cpu)
133 NULL); 133 NULL);
134 if (rc < 0) 134 if (rc < 0)
135 goto fail; 135 goto fail;
136 per_cpu(callfuncsingle_irq, cpu) = rc; 136 per_cpu(xen_callfuncsingle_irq, cpu) = rc;
137 137
138 return 0; 138 return 0;
139 139
140 fail: 140 fail:
141 if (per_cpu(resched_irq, cpu) >= 0) 141 if (per_cpu(xen_resched_irq, cpu) >= 0)
142 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); 142 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
143 if (per_cpu(callfunc_irq, cpu) >= 0) 143 if (per_cpu(xen_callfunc_irq, cpu) >= 0)
144 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 144 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
145 if (per_cpu(debug_irq, cpu) >= 0) 145 if (per_cpu(xen_debug_irq, cpu) >= 0)
146 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 146 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
147 if (per_cpu(callfuncsingle_irq, cpu) >= 0) 147 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
148 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); 148 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
149 NULL);
149 150
150 return rc; 151 return rc;
151} 152}
@@ -295,6 +296,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
295 (unsigned long)task_stack_page(idle) - 296 (unsigned long)task_stack_page(idle) -
296 KERNEL_STACK_OFFSET + THREAD_SIZE; 297 KERNEL_STACK_OFFSET + THREAD_SIZE;
297#endif 298#endif
299 xen_setup_runstate_info(cpu);
298 xen_setup_timer(cpu); 300 xen_setup_timer(cpu);
299 xen_init_lock_cpu(cpu); 301 xen_init_lock_cpu(cpu);
300 302
@@ -348,10 +350,10 @@ static void xen_cpu_die(unsigned int cpu)
348 current->state = TASK_UNINTERRUPTIBLE; 350 current->state = TASK_UNINTERRUPTIBLE;
349 schedule_timeout(HZ/10); 351 schedule_timeout(HZ/10);
350 } 352 }
351 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); 353 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
352 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 354 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
353 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 355 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
354 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); 356 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
355 xen_uninit_lock_cpu(cpu); 357 xen_uninit_lock_cpu(cpu);
356 xen_teardown_timer(cpu); 358 xen_teardown_timer(cpu);
357 359
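
The xen_ prefixes added throughout this file are part of a tree-wide
cleanup: per-cpu variables end up as symbols in a shared per-cpu section,
so short generic names such as resched_irq or debug_irq risk colliding
once per-cpu symbols are handled globally. The rename pattern, in brief
(the second definition is hypothetical, shown only to illustrate the
clash being avoided):

	/* arch/x86/xen/smp.c */
	static DEFINE_PER_CPU(int, xen_resched_irq);

	/* another subsystem could otherwise end up emitting the
	 * identically named per-cpu symbol:
	 * static DEFINE_PER_CPU(int, resched_irq);
	 */
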
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 36a5141108df..24ded31b5aec 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -120,14 +120,14 @@ struct xen_spinlock {
120 unsigned short spinners; /* count of waiting cpus */ 120 unsigned short spinners; /* count of waiting cpus */
121}; 121};
122 122
123static int xen_spin_is_locked(struct raw_spinlock *lock) 123static int xen_spin_is_locked(struct arch_spinlock *lock)
124{ 124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126 126
127 return xl->lock != 0; 127 return xl->lock != 0;
128} 128}
129 129
130static int xen_spin_is_contended(struct raw_spinlock *lock) 130static int xen_spin_is_contended(struct arch_spinlock *lock)
131{ 131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133 133
@@ -136,7 +136,7 @@ static int xen_spin_is_contended(struct raw_spinlock *lock)
136 return xl->spinners != 0; 136 return xl->spinners != 0;
137} 137}
138 138
139static int xen_spin_trylock(struct raw_spinlock *lock) 139static int xen_spin_trylock(struct arch_spinlock *lock)
140{ 140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1; 142 u8 old = 1;
@@ -181,7 +181,7 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
181 __get_cpu_var(lock_spinners) = prev; 181 __get_cpu_var(lock_spinners) = prev;
182} 182}
183 183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) 184static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
185{ 185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev; 187 struct xen_spinlock *prev;
@@ -254,7 +254,7 @@ out:
254 return ret; 254 return ret;
255} 255}
256 256
257static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) 257static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
258{ 258{
259 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 259 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
260 unsigned timeout; 260 unsigned timeout;
@@ -291,12 +291,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
291 spin_time_accum_total(start_spin); 291 spin_time_accum_total(start_spin);
292} 292}
293 293
294static void xen_spin_lock(struct raw_spinlock *lock) 294static void xen_spin_lock(struct arch_spinlock *lock)
295{ 295{
296 __xen_spin_lock(lock, false); 296 __xen_spin_lock(lock, false);
297} 297}
298 298
299static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) 299static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
300{ 300{
301 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); 301 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
302} 302}
@@ -317,7 +317,7 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
317 } 317 }
318} 318}
319 319
320static void xen_spin_unlock(struct raw_spinlock *lock) 320static void xen_spin_unlock(struct arch_spinlock *lock)
321{ 321{
322 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 322 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
323 323
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 95be7b434724..987267f79bf5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,4 +1,5 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include <linux/clockchips.h>
2 3
3#include <xen/interface/xen.h> 4#include <xen/interface/xen.h>
4#include <xen/grant_table.h> 5#include <xen/grant_table.h>
@@ -27,6 +28,8 @@ void xen_pre_suspend(void)
27 28
28void xen_post_suspend(int suspend_cancelled) 29void xen_post_suspend(int suspend_cancelled)
29{ 30{
31 xen_build_mfn_list_list();
32
30 xen_setup_shared_info(); 33 xen_setup_shared_info();
31 34
32 if (suspend_cancelled) { 35 if (suspend_cancelled) {
@@ -44,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled)
44 47
45} 48}
46 49
50static void xen_vcpu_notify_restore(void *data)
51{
52 unsigned long reason = (unsigned long)data;
53
54 /* Boot processor notified via generic timekeeping_resume() */
55	if (smp_processor_id() == 0)
56 return;
57
58 clockevents_notify(reason, NULL);
59}
60
47void xen_arch_resume(void) 61void xen_arch_resume(void)
48{ 62{
49 /* nothing */ 63 smp_call_function(xen_vcpu_notify_restore,
64 (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
50} 65}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0a5aa44299a5..0d3f07cd1b5f 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -31,14 +31,14 @@
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
35 35
36/* snapshots of runstate info */ 36/* snapshots of runstate info */
37static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); 37static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
38 38
39/* unused ns of stolen and blocked time */ 39/* unused ns of stolen and blocked time */
40static DEFINE_PER_CPU(u64, residual_stolen); 40static DEFINE_PER_CPU(u64, xen_residual_stolen);
41static DEFINE_PER_CPU(u64, residual_blocked); 41static DEFINE_PER_CPU(u64, xen_residual_blocked);
42 42
43/* return a consistent snapshot of a 64-bit time/counter value */ 43/* return a consistent snapshot of a 64-bit time/counter value */
44static u64 get64(const u64 *p) 44static u64 get64(const u64 *p)
@@ -79,7 +79,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
79 79
80 BUG_ON(preemptible()); 80 BUG_ON(preemptible());
81 81
82 state = &__get_cpu_var(runstate); 82 state = &__get_cpu_var(xen_runstate);
83 83
84 /* 84 /*
85 * The runstate info is always updated by the hypervisor on 85 * The runstate info is always updated by the hypervisor on
@@ -97,14 +97,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
97/* return true when a vcpu could run but has no real cpu to run on */ 97/* return true when a vcpu could run but has no real cpu to run on */
98bool xen_vcpu_stolen(int vcpu) 98bool xen_vcpu_stolen(int vcpu)
99{ 99{
100 return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; 100 return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
101} 101}
102 102
103static void setup_runstate_info(int cpu) 103void xen_setup_runstate_info(int cpu)
104{ 104{
105 struct vcpu_register_runstate_memory_area area; 105 struct vcpu_register_runstate_memory_area area;
106 106
107 area.addr.v = &per_cpu(runstate, cpu); 107 area.addr.v = &per_cpu(xen_runstate, cpu);
108 108
109 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, 109 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
110 cpu, &area)) 110 cpu, &area))
@@ -122,7 +122,7 @@ static void do_stolen_accounting(void)
122 122
123 WARN_ON(state.state != RUNSTATE_running); 123 WARN_ON(state.state != RUNSTATE_running);
124 124
125 snap = &__get_cpu_var(runstate_snapshot); 125 snap = &__get_cpu_var(xen_runstate_snapshot);
126 126
127 /* work out how much time the VCPU has not been runn*ing* */ 127 /* work out how much time the VCPU has not been runn*ing* */
128 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; 128 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
@@ -133,24 +133,24 @@ static void do_stolen_accounting(void)
133 133
134 /* Add the appropriate number of ticks of stolen time, 134 /* Add the appropriate number of ticks of stolen time,
135 including any left-overs from last time. */ 135 including any left-overs from last time. */
136 stolen = runnable + offline + __get_cpu_var(residual_stolen); 136 stolen = runnable + offline + __get_cpu_var(xen_residual_stolen);
137 137
138 if (stolen < 0) 138 if (stolen < 0)
139 stolen = 0; 139 stolen = 0;
140 140
141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
142 __get_cpu_var(residual_stolen) = stolen; 142 __get_cpu_var(xen_residual_stolen) = stolen;
143 account_steal_ticks(ticks); 143 account_steal_ticks(ticks);
144 144
145 /* Add the appropriate number of ticks of blocked time, 145 /* Add the appropriate number of ticks of blocked time,
146 including any left-overs from last time. */ 146 including any left-overs from last time. */
147 blocked += __get_cpu_var(residual_blocked); 147 blocked += __get_cpu_var(xen_residual_blocked);
148 148
149 if (blocked < 0) 149 if (blocked < 0)
150 blocked = 0; 150 blocked = 0;
151 151
152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
153 __get_cpu_var(residual_blocked) = blocked; 153 __get_cpu_var(xen_residual_blocked) = blocked;
154 account_idle_ticks(ticks); 154 account_idle_ticks(ticks);
155} 155}
156 156
@@ -434,7 +434,7 @@ void xen_setup_timer(int cpu)
434 name = "<timer kasprintf failed>"; 434 name = "<timer kasprintf failed>";
435 435
436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
437 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, 437 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
438 name, NULL); 438 name, NULL);
439 439
440 evt = &per_cpu(xen_clock_events, cpu); 440 evt = &per_cpu(xen_clock_events, cpu);
@@ -442,8 +442,6 @@ void xen_setup_timer(int cpu)
442 442
443 evt->cpumask = cpumask_of(cpu); 443 evt->cpumask = cpumask_of(cpu);
444 evt->irq = irq; 444 evt->irq = irq;
445
446 setup_runstate_info(cpu);
447} 445}
448 446
449void xen_teardown_timer(int cpu) 447void xen_teardown_timer(int cpu)
@@ -494,6 +492,7 @@ __init void xen_time_init(void)
494 492
495 setup_force_cpu_cap(X86_FEATURE_TSC); 493 setup_force_cpu_cap(X86_FEATURE_TSC);
496 494
495 xen_setup_runstate_info(cpu);
497 xen_setup_timer(cpu); 496 xen_setup_timer(cpu);
498 xen_setup_cpu_clockevents(); 497 xen_setup_cpu_clockevents();
499} 498}
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 02f496a8dbaa..53adefda4275 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -96,7 +96,7 @@ ENTRY(xen_sysret32)
96 pushq $__USER32_CS 96 pushq $__USER32_CS
97 pushq %rcx 97 pushq %rcx
98 98
99 pushq $VGCF_in_syscall 99 pushq $0
1001: jmp hypercall_iret 1001: jmp hypercall_iret
101ENDPATCH(xen_sysret32) 101ENDPATCH(xen_sysret32)
102RELOC(xen_sysret32, 1b+1) 102RELOC(xen_sysret32, 1b+1)
@@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target)
151ENTRY(xen_sysenter_target) 151ENTRY(xen_sysenter_target)
152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */ 152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */
153 mov $-ENOSYS, %rax 153 mov $-ENOSYS, %rax
154 pushq $VGCF_in_syscall 154 pushq $0
155 jmp hypercall_iret 155 jmp hypercall_iret
156ENDPROC(xen_syscall32_target) 156ENDPROC(xen_syscall32_target)
157ENDPROC(xen_sysenter_target) 157ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 355fa6b99c9c..f9153a300bce 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
25 25
26void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
27void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_build_mfn_list_list(void);
28void xen_setup_machphys_mapping(void); 29void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 31void xen_ident_map_ISA(void);
@@ -41,6 +42,7 @@ void __init xen_build_dynamic_phys_to_machine(void);
41 42
42void xen_init_irq_ops(void); 43void xen_init_irq_ops(void);
43void xen_setup_timer(int cpu); 44void xen_setup_timer(int cpu);
45void xen_setup_runstate_info(int cpu);
44void xen_teardown_timer(int cpu); 46void xen_teardown_timer(int cpu);
45cycle_t xen_clocksource_read(void); 47cycle_t xen_clocksource_read(void);
46void xen_setup_cpu_clockevents(void); 48void xen_setup_cpu_clockevents(void);