aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig43
-rw-r--r--arch/x86/Kconfig.cpu19
-rw-r--r--arch/x86/Kconfig.debug23
-rw-r--r--arch/x86/Makefile4
-rw-r--r--arch/x86/Makefile_32.cpu9
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/head_64.S3
-rw-r--r--arch/x86/boot/compressed/relocs.c87
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S3
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/boot/setup.ld3
-rw-r--r--arch/x86/boot/version.c4
-rw-r--r--arch/x86/boot/video.c6
-rw-r--r--arch/x86/crypto/Makefile3
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S517
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c10
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S157
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c333
-rw-r--r--arch/x86/ia32/ia32entry.S46
-rw-r--r--arch/x86/ia32/sys_ia32.c99
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/a.out-core.h10
-rw-r--r--arch/x86/include/asm/acpi.h29
-rw-r--r--arch/x86/include/asm/alternative-asm.h10
-rw-r--r--arch/x86/include/asm/alternative.h1
-rw-r--r--arch/x86/include/asm/amd_iommu.h15
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h40
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h54
-rw-r--r--arch/x86/include/asm/apic.h21
-rw-r--r--arch/x86/include/asm/apicdef.h6
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/asm-offsets.h1
-rw-r--r--arch/x86/include/asm/bug.h4
-rw-r--r--arch/x86/include/asm/cache.h7
-rw-r--r--arch/x86/include/asm/cacheflush.h2
-rw-r--r--arch/x86/include/asm/calgary.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h218
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h234
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/debugreg.h33
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/desc_defs.h4
-rw-r--r--arch/x86/include/asm/device.h2
-rw-r--r--arch/x86/include/asm/dma-mapping.h17
-rw-r--r--arch/x86/include/asm/e820.h23
-rw-r--r--arch/x86/include/asm/elf.h21
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/gart.h9
-rw-r--r--arch/x86/include/asm/geode.h219
-rw-r--r--arch/x86/include/asm/hardirq.h8
-rw-r--r--arch/x86/include/asm/hpet.h7
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h73
-rw-r--r--arch/x86/include/asm/hw_irq.h35
-rw-r--r--arch/x86/include/asm/i387.h7
-rw-r--r--arch/x86/include/asm/inat.h220
-rw-r--r--arch/x86/include/asm/inat_types.h29
-rw-r--r--arch/x86/include/asm/insn.h184
-rw-r--r--arch/x86/include/asm/inst.h150
-rw-r--r--arch/x86/include/asm/iommu.h2
-rw-r--r--arch/x86/include/asm/irq.h3
-rw-r--r--arch/x86/include/asm/irq_vectors.h4
-rw-r--r--arch/x86/include/asm/k8.h5
-rw-r--r--arch/x86/include/asm/kvm.h34
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h35
-rw-r--r--arch/x86/include/asm/mce.h16
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mmzone_32.h2
-rw-r--r--arch/x86/include/asm/mpspec.h27
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/msr.h27
-rw-r--r--arch/x86/include/asm/olpc.h2
-rw-r--r--arch/x86/include/asm/page_types.h3
-rw-r--r--arch/x86/include/asm/paravirt.h42
-rw-r--r--arch/x86/include/asm/paravirt_types.h24
-rw-r--r--arch/x86/include/asm/pci_x86.h20
-rw-r--r--arch/x86/include/asm/perf_event.h13
-rw-r--r--arch/x86/include/asm/pgtable.h6
-rw-r--r--arch/x86/include/asm/processor.h18
-rw-r--r--arch/x86/include/asm/proto.h17
-rw-r--r--arch/x86/include/asm/ptrace.h64
-rw-r--r--arch/x86/include/asm/sections.h6
-rw-r--r--arch/x86/include/asm/sigcontext.h4
-rw-r--r--arch/x86/include/asm/spinlock.h62
-rw-r--r--arch/x86/include/asm/spinlock_types.h10
-rw-r--r--arch/x86/include/asm/stacktrace.h24
-rw-r--r--arch/x86/include/asm/string_32.h9
-rw-r--r--arch/x86/include/asm/svm.h3
-rw-r--r--arch/x86/include/asm/swiotlb.h11
-rw-r--r--arch/x86/include/asm/sys_ia32.h9
-rw-r--r--arch/x86/include/asm/syscalls.h34
-rw-r--r--arch/x86/include/asm/system.h32
-rw-r--r--arch/x86/include/asm/thread_info.h7
-rw-r--r--arch/x86/include/asm/topology.h10
-rw-r--r--arch/x86/include/asm/trampoline.h1
-rw-r--r--arch/x86/include/asm/uaccess.h1
-rw-r--r--arch/x86/include/asm/uaccess_32.h27
-rw-r--r--arch/x86/include/asm/uaccess_64.h36
-rw-r--r--arch/x86/include/asm/unistd_32.h3
-rw-r--r--arch/x86/include/asm/unistd_64.h2
-rw-r--r--arch/x86/include/asm/uv/bios.h11
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h151
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h14
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/asm/x86_init.h14
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h27
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c3
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/acpi/processor.c100
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S3
-rw-r--r--arch/x86/kernel/acpi/sleep.c26
-rw-r--r--arch/x86/kernel/amd_iommu.c1295
-rw-r--r--arch/x86/kernel/amd_iommu_init.c140
-rw-r--r--arch/x86/kernel/aperture_64.c13
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c38
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c5
-rw-r--r--arch/x86/kernel/apic/apic_noop.c200
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c18
-rw-r--r--arch/x86/kernel/apic/es7000_32.c28
-rw-r--r--arch/x86/kernel/apic/io_apic.c449
-rw-r--r--arch/x86/kernel/apic/nmi.c11
-rw-r--r--arch/x86/kernel/apic/numaq_32.c18
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c52
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/bios_uv.c8
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c15
-rw-r--r--arch/x86/kernel/cpu/amd.c57
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c44
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c34
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h24
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c2
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c9
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c34
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c155
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c45
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c53
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event.c241
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpuid.c22
-rw-r--r--arch/x86/kernel/crash.c5
-rw-r--r--arch/x86/kernel/crash_dump_32.c19
-rw-r--r--arch/x86/kernel/dumpstack.c48
-rw-r--r--arch/x86/kernel/dumpstack.h6
-rw-r--r--arch/x86/kernel/dumpstack_32.c9
-rw-r--r--arch/x86/kernel/dumpstack_64.c83
-rw-r--r--arch/x86/kernel/e820.c17
-rw-r--r--arch/x86/kernel/early_printk.c5
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/entry_32.S100
-rw-r--r--arch/x86/kernel/entry_64.S84
-rw-r--r--arch/x86/kernel/ftrace.c101
-rw-r--r--arch/x86/kernel/geode_32.c196
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S18
-rw-r--r--arch/x86/kernel/head_64.S7
-rw-r--r--arch/x86/kernel/hpet.c77
-rw-r--r--arch/x86/kernel/hw_breakpoint.c554
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c2
-rw-r--r--arch/x86/kernel/ioport.c28
-rw-r--r--arch/x86/kernel/irq.c130
-rw-r--r--arch/x86/kernel/irq_32.c45
-rw-r--r--arch/x86/kernel/irq_64.c58
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/kgdb.c21
-rw-r--r--arch/x86/kernel/kprobes.c261
-rw-r--r--arch/x86/kernel/machine_kexec_32.c8
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c410
-rw-r--r--arch/x86/kernel/microcode_amd.c99
-rw-r--r--arch/x86/kernel/microcode_core.c34
-rw-r--r--arch/x86/kernel/microcode_intel.c47
-rw-r--r--arch/x86/kernel/mpparse.c47
-rw-r--r--arch/x86/kernel/msr.c23
-rw-r--r--arch/x86/kernel/olpc.c4
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c100
-rw-r--r--arch/x86/kernel/pci-dma.c49
-rw-r--r--arch/x86/kernel/pci-gart_64.c164
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c21
-rw-r--r--arch/x86/kernel/process.c114
-rw-r--r--arch/x86/kernel/process_32.c111
-rw-r--r--arch/x86/kernel/process_64.c122
-rw-r--r--arch/x86/kernel/ptrace.c480
-rw-r--r--arch/x86/kernel/quirks.c9
-rw-r--r--arch/x86/kernel/reboot.c21
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c3
-rw-r--r--arch/x86/kernel/setup.c123
-rw-r--r--arch/x86/kernel/setup_percpu.c13
-rw-r--r--arch/x86/kernel/signal.c24
-rw-r--r--arch/x86/kernel/smpboot.c58
-rw-r--r--arch/x86/kernel/stacktrace.c18
-rw-r--r--arch/x86/kernel/sys_i386_32.c27
-rw-r--r--arch/x86/kernel/sys_x86_64.c17
-rw-r--r--arch/x86/kernel/syscall_table_32.S3
-rw-r--r--arch/x86/kernel/time.c3
-rw-r--r--arch/x86/kernel/tlb_uv.c13
-rw-r--r--arch/x86/kernel/trampoline.c30
-rw-r--r--arch/x86/kernel/trampoline_64.S4
-rw-r--r--arch/x86/kernel/traps.c73
-rw-r--r--arch/x86/kernel/tsc.c1
-rw-r--r--arch/x86/kernel/tsc_sync.c23
-rw-r--r--arch/x86/kernel/uv_irq.c238
-rw-r--r--arch/x86/kernel/uv_time.c80
-rw-r--r--arch/x86/kernel/visws_quirks.c10
-rw-r--r--arch/x86/kernel/vm86_32.c11
-rw-r--r--arch/x86/kernel/vmi_32.c2
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S45
-rw-r--r--arch/x86/kernel/vsyscall_64.c7
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c11
-rw-r--r--arch/x86/kernel/x86_init.c10
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c159
-rw-r--r--arch/x86/kvm/i8254.c16
-rw-r--r--arch/x86/kvm/i8259.c44
-rw-r--r--arch/x86/kvm/irq.h7
-rw-r--r--arch/x86/kvm/lapic.c13
-rw-r--r--arch/x86/kvm/mmu.c89
-rw-r--r--arch/x86/kvm/paging_tmpl.h37
-rw-r--r--arch/x86/kvm/svm.c359
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c450
-rw-r--r--arch/x86/kvm/x86.c578
-rw-r--r--arch/x86/lib/.gitignore1
-rw-r--r--arch/x86/lib/Makefile23
-rw-r--r--arch/x86/lib/copy_user_64.S14
-rw-r--r--arch/x86/lib/inat.c90
-rw-r--r--arch/x86/lib/insn.c516
-rw-r--r--arch/x86/lib/msr-smp.c204
-rw-r--r--arch/x86/lib/msr.c227
-rw-r--r--arch/x86/lib/usercopy_32.c10
-rw-r--r--arch/x86/lib/x86-opcode-map.txt893
-rw-r--r--arch/x86/mm/extable.c31
-rw-r--r--arch/x86/mm/fault.c13
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/init_32.c10
-rw-r--r--arch/x86/mm/init_64.c35
-rw-r--r--arch/x86/mm/ioremap.c50
-rw-r--r--arch/x86/mm/k8topology_64.c101
-rw-r--r--arch/x86/mm/kmemcheck/error.c19
-rw-r--r--arch/x86/mm/kmmio.c54
-rw-r--r--arch/x86/mm/mmio-mod.c71
-rw-r--r--arch/x86/mm/numa_32.c4
-rw-r--r--arch/x86/mm/numa_64.c252
-rw-r--r--arch/x86/mm/pageattr.c22
-rw-r--r--arch/x86/mm/pat.c23
-rw-r--r--arch/x86/mm/setup_nx.c59
-rw-r--r--arch/x86/mm/srat_32.c2
-rw-r--r--arch/x86/mm/srat_64.c37
-rw-r--r--arch/x86/mm/testmmiotrace.c29
-rw-r--r--arch/x86/mm/tlb.c3
-rw-r--r--arch/x86/oprofile/backtrace.c9
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/acpi.c74
-rw-r--r--arch/x86/pci/amd_bus.c120
-rw-r--r--arch/x86/pci/bus_numa.c101
-rw-r--r--arch/x86/pci/bus_numa.h27
-rw-r--r--arch/x86/pci/common.c20
-rw-r--r--arch/x86/pci/early.c7
-rw-r--r--arch/x86/pci/i386.c42
-rw-r--r--arch/x86/pci/intel_bus.c90
-rw-r--r--arch/x86/pci/mmconfig-shared.c356
-rw-r--r--arch/x86/pci/mmconfig_32.c16
-rw-r--r--arch/x86/pci/mmconfig_64.c88
-rw-r--r--arch/x86/power/cpu.c26
-rw-r--r--arch/x86/tools/Makefile31
-rw-r--r--arch/x86/tools/chkobjdump.awk33
-rw-r--r--arch/x86/tools/distill.awk47
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk378
-rw-r--r--arch/x86/tools/test_get_len.c173
-rw-r--r--arch/x86/vdso/vdso32-setup.c1
-rw-r--r--arch/x86/xen/enlighten.c51
-rw-r--r--arch/x86/xen/mmu.c2
-rw-r--r--arch/x86/xen/smp.c3
-rw-r--r--arch/x86/xen/spinlock.c16
-rw-r--r--arch/x86/xen/suspend.c17
-rw-r--r--arch/x86/xen/time.c7
-rw-r--r--arch/x86/xen/xen-asm_64.S4
-rw-r--r--arch/x86/xen/xen-ops.h2
304 files changed, 11856 insertions, 6536 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8da93745c087..55298e891571 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -49,7 +49,11 @@ config X86
49 select HAVE_KERNEL_GZIP 49 select HAVE_KERNEL_GZIP
50 select HAVE_KERNEL_BZIP2 50 select HAVE_KERNEL_BZIP2
51 select HAVE_KERNEL_LZMA 51 select HAVE_KERNEL_LZMA
52 select HAVE_HW_BREAKPOINT
53 select PERF_EVENTS
54 select ANON_INODES
52 select HAVE_ARCH_KMEMCHECK 55 select HAVE_ARCH_KMEMCHECK
56 select HAVE_USER_RETURN_NOTIFIER
53 57
54config OUTPUT_FORMAT 58config OUTPUT_FORMAT
55 string 59 string
@@ -86,10 +90,6 @@ config STACKTRACE_SUPPORT
86config HAVE_LATENCYTOP_SUPPORT 90config HAVE_LATENCYTOP_SUPPORT
87 def_bool y 91 def_bool y
88 92
89config FAST_CMPXCHG_LOCAL
90 bool
91 default y
92
93config MMU 93config MMU
94 def_bool y 94 def_bool y
95 95
@@ -495,7 +495,7 @@ if PARAVIRT_GUEST
495source "arch/x86/xen/Kconfig" 495source "arch/x86/xen/Kconfig"
496 496
497config VMI 497config VMI
498 bool "VMI Guest support" 498 bool "VMI Guest support (DEPRECATED)"
499 select PARAVIRT 499 select PARAVIRT
500 depends on X86_32 500 depends on X86_32
501 ---help--- 501 ---help---
@@ -504,6 +504,15 @@ config VMI
504 at the moment), by linking the kernel to a GPL-ed ROM module 504 at the moment), by linking the kernel to a GPL-ed ROM module
505 provided by the hypervisor. 505 provided by the hypervisor.
506 506
507 As of September 2009, VMware has started a phased retirement
508 of this feature from VMware's products. Please see
509 feature-removal-schedule.txt for details. If you are
510 planning to enable this option, please note that you cannot
511 live migrate a VMI enabled VM to a future VMware product,
512 which doesn't support VMI. So if you expect your kernel to
513 seamlessly migrate to newer VMware products, keep this
514 disabled.
515
507config KVM_CLOCK 516config KVM_CLOCK
508 bool "KVM paravirtualized clock" 517 bool "KVM paravirtualized clock"
509 select PARAVIRT 518 select PARAVIRT
@@ -1325,7 +1334,9 @@ config MATH_EMULATION
1325 kernel, it won't hurt. 1334 kernel, it won't hurt.
1326 1335
1327config MTRR 1336config MTRR
1328 bool "MTRR (Memory Type Range Register) support" 1337 bool
1338 default y
1339 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
1329 ---help--- 1340 ---help---
1330 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1341 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1331 the Memory Type Range Registers (MTRRs) may be used to control 1342 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1391,7 +1402,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1391 1402
1392config X86_PAT 1403config X86_PAT
1393 bool 1404 bool
1394 prompt "x86 PAT support" 1405 default y
1406 prompt "x86 PAT support" if EMBEDDED
1395 depends on MTRR 1407 depends on MTRR
1396 ---help--- 1408 ---help---
1397 Use PAT attributes to setup page level cache control. 1409 Use PAT attributes to setup page level cache control.
@@ -1438,12 +1450,8 @@ config SECCOMP
1438 1450
1439 If unsure, say Y. Only embedded should say N here. 1451 If unsure, say Y. Only embedded should say N here.
1440 1452
1441config CC_STACKPROTECTOR_ALL
1442 bool
1443
1444config CC_STACKPROTECTOR 1453config CC_STACKPROTECTOR
1445 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" 1454 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
1446 select CC_STACKPROTECTOR_ALL
1447 ---help--- 1455 ---help---
1448 This option turns on the -fstack-protector GCC feature. This 1456 This option turns on the -fstack-protector GCC feature. This
1449 feature puts, at the beginning of functions, a canary value on 1457 feature puts, at the beginning of functions, a canary value on
@@ -1601,7 +1609,7 @@ config COMPAT_VDSO
1601 depends on X86_32 || IA32_EMULATION 1609 depends on X86_32 || IA32_EMULATION
1602 ---help--- 1610 ---help---
1603 Map the 32-bit VDSO to the predictable old-style address too. 1611 Map the 32-bit VDSO to the predictable old-style address too.
1604 ---help--- 1612
1605 Say N here if you are running a sufficiently recent glibc 1613 Say N here if you are running a sufficiently recent glibc
1606 version (2.3.3 or later), to remove the high-mapped 1614 version (2.3.3 or later), to remove the high-mapped
1607 VDSO mapping and to exclusively use the randomized VDSO. 1615 VDSO mapping and to exclusively use the randomized VDSO.
@@ -2006,18 +2014,9 @@ config SCx200HR_TIMER
2006 processor goes idle (as is done by the scheduler). The 2014 processor goes idle (as is done by the scheduler). The
2007 other workaround is idle=poll boot option. 2015 other workaround is idle=poll boot option.
2008 2016
2009config GEODE_MFGPT_TIMER
2010 def_bool y
2011 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
2012 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
2013 ---help---
2014 This driver provides a clock event source based on the MFGPT
2015 timer(s) in the CS5535 and CS5536 companion chip for the geode.
2016 MFGPTs have a better resolution and max interval than the
2017 generic PIT, and are suitable for use as high-res timers.
2018
2019config OLPC 2017config OLPC
2020 bool "One Laptop Per Child support" 2018 bool "One Laptop Per Child support"
2019 select GPIOLIB
2021 default n 2020 default n
2022 ---help--- 2021 ---help---
2023 Add support for detecting the unique features of the OLPC 2022 Add support for detecting the unique features of the OLPC
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 527519b8a9f9..08e442bc3ab9 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU
301 301
302# 302#
303# Define implied options from the CPU selection here 303# Define implied options from the CPU selection here
304config X86_L1_CACHE_BYTES 304config X86_INTERNODE_CACHE_SHIFT
305 int 305 int
306 default "128" if MPSC 306 default "12" if X86_VSMP
307 default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 307 default "7" if NUMA
308 308 default X86_L1_CACHE_SHIFT
309config X86_INTERNODE_CACHE_BYTES
310 int
311 default "4096" if X86_VSMP
312 default X86_L1_CACHE_BYTES if !X86_VSMP
313 309
314config X86_CMPXCHG 310config X86_CMPXCHG
315 def_bool X86_64 || (X86_32 && !M386) 311 def_bool X86_64 || (X86_32 && !M386)
@@ -317,9 +313,9 @@ config X86_CMPXCHG
317config X86_L1_CACHE_SHIFT 313config X86_L1_CACHE_SHIFT
318 int 314 int
319 default "7" if MPENTIUM4 || MPSC 315 default "7" if MPENTIUM4 || MPSC
316 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 317 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 318 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
323 319
324config X86_XADD 320config X86_XADD
325 def_bool y 321 def_bool y
@@ -400,18 +396,19 @@ config X86_TSC
400 396
401config X86_CMPXCHG64 397config X86_CMPXCHG64
402 def_bool y 398 def_bool y
403 depends on X86_PAE || X86_64 399 depends on !M386 && !M486
404 400
405# this should be set for all -march=.. options where the compiler 401# this should be set for all -march=.. options where the compiler
406# generates cmov. 402# generates cmov.
407config X86_CMOV 403config X86_CMOV
408 def_bool y 404 def_bool y
409 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM) 405 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
410 406
411config X86_MINIMUM_CPU_FAMILY 407config X86_MINIMUM_CPU_FAMILY
412 int 408 int
413 default "64" if X86_64 409 default "64" if X86_64
414 default "6" if X86_32 && X86_P6_NOP 410 default "6" if X86_32 && X86_P6_NOP
411 default "5" if X86_32 && X86_CMPXCHG64
415 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) 412 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
416 default "3" 413 default "3"
417 414
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..bc01e3ebfeb2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
186config HAVE_MMIOTRACE_SUPPORT 186config HAVE_MMIOTRACE_SUPPORT
187 def_bool y 187 def_bool y
188 188
189config X86_DECODER_SELFTEST
190 bool "x86 instruction decoder selftest"
191 depends on DEBUG_KERNEL && KPROBES
192 ---help---
193 Perform x86 instruction decoder selftests at build time.
194 This option is useful for checking the sanity of x86 instruction
195 decoder code.
196 If unsure, say "N".
197
189# 198#
190# IO delay types: 199# IO delay types:
191# 200#
@@ -287,4 +296,18 @@ config OPTIMIZE_INLINING
287 296
288 If unsure, say N. 297 If unsure, say N.
289 298
299config DEBUG_STRICT_USER_COPY_CHECKS
300 bool "Strict copy size checks"
301 depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING
302 ---help---
303 Enabling this option turns a certain set of sanity checks for user
304 copy operations into compile time failures.
305
306 The copy_from_user() etc checks are there to help test if there
307 are sufficient security checks on the length argument of
308 the copy operation, by having gcc prove that the argument is
309 within bounds.
310
311 If unsure, or if you run an older (pre 4.4) gcc, say N.
312
290endmenu 313endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a012ee8ef803..78b32be55e9e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -76,7 +76,6 @@ ifdef CONFIG_CC_STACKPROTECTOR
76 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh 76 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
77 ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) 77 ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y)
78 stackp-y := -fstack-protector 78 stackp-y := -fstack-protector
79 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
80 KBUILD_CFLAGS += $(stackp-y) 79 KBUILD_CFLAGS += $(stackp-y)
81 else 80 else
82 $(warning stack protector enabled but no compiler support) 81 $(warning stack protector enabled but no compiler support)
@@ -156,6 +155,9 @@ all: bzImage
156KBUILD_IMAGE := $(boot)/bzImage 155KBUILD_IMAGE := $(boot)/bzImage
157 156
158bzImage: vmlinux 157bzImage: vmlinux
158ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
159 $(Q)$(MAKE) $(build)=arch/x86/tools posttest
160endif
159 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 161 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
160 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 162 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
161 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ 163 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 30e9a264f69d..1255d953c65d 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -41,11 +41,18 @@ cflags-$(CONFIG_X86_ELAN) += -march=i486
41 41
42# Geode GX1 support 42# Geode GX1 support
43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx 43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
44 44cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx)
45# add at the end to overwrite eventual tuning options from earlier 45# add at the end to overwrite eventual tuning options from earlier
46# cpu entries 46# cpu entries
47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
48 48
49# Work around the pentium-mmx code generator madness of gcc4.4.x which
50# does stack alignment by generating horrible code _before_ the mcount
51# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
52# tracer assumptions. For i686, generic, core2 this is set by the
53# compiler anyway
54cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
55
49# Bug fix for binutils: this option is required in order to keep 56# Bug fix for binutils: this option is required in order to keep
50# binutils from generating NOPL instructions against our will. 57# binutils from generating NOPL instructions against our will.
51ifneq ($(CONFIG_X86_P6_NOP),y) 58ifneq ($(CONFIG_X86_P6_NOP),y)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f8ed0658404c..f25bbd37765a 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -9,6 +9,7 @@ targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinu
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
11KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 11KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
12cflags-$(CONFIG_X86_32) := -march=i386
12cflags-$(CONFIG_X86_64) := -mcmodel=small 13cflags-$(CONFIG_X86_64) := -mcmodel=small
13KBUILD_CFLAGS += $(cflags-y) 14KBUILD_CFLAGS += $(cflags-y)
14KBUILD_CFLAGS += $(call cc-option,-ffreestanding) 15KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 077e1b69198e..faff0dc9c06a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -107,8 +107,7 @@ ENTRY(startup_32)
107 lgdt gdt(%ebp) 107 lgdt gdt(%ebp)
108 108
109 /* Enable PAE mode */ 109 /* Enable PAE mode */
110 xorl %eax, %eax 110 movl $(X86_CR4_PAE), %eax
111 orl $(X86_CR4_PAE), %eax
112 movl %eax, %cr4 111 movl %eax, %cr4
113 112
114 /* 113 /*
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index bbeb0c3fbd90..89bbf4e4d05d 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -9,6 +9,9 @@
9#include <byteswap.h> 9#include <byteswap.h>
10#define USE_BSD 10#define USE_BSD
11#include <endian.h> 11#include <endian.h>
12#include <regex.h>
13
14static void die(char *fmt, ...);
12 15
13#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 16#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
14static Elf32_Ehdr ehdr; 17static Elf32_Ehdr ehdr;
@@ -30,25 +33,47 @@ static struct section *secs;
30 * the address for which it has been compiled. Don't warn user about 33 * the address for which it has been compiled. Don't warn user about
31 * absolute relocations present w.r.t these symbols. 34 * absolute relocations present w.r.t these symbols.
32 */ 35 */
33static const char* safe_abs_relocs[] = { 36static const char abs_sym_regex[] =
34 "xen_irq_disable_direct_reloc", 37 "^(xen_irq_disable_direct_reloc$|"
35 "xen_save_fl_direct_reloc", 38 "xen_save_fl_direct_reloc$|"
36}; 39 "VDSO|"
40 "__crc_)";
41static regex_t abs_sym_regex_c;
42static int is_abs_reloc(const char *sym_name)
43{
44 return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0);
45}
37 46
38static int is_safe_abs_reloc(const char* sym_name) 47/*
48 * These symbols are known to be relative, even if the linker marks them
49 * as absolute (typically defined outside any section in the linker script.)
50 */
51static const char rel_sym_regex[] =
52 "^_end$";
53static regex_t rel_sym_regex_c;
54static int is_rel_reloc(const char *sym_name)
39{ 55{
40 int i; 56 return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0);
57}
41 58
42 for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { 59static void regex_init(void)
43 if (!strcmp(sym_name, safe_abs_relocs[i])) 60{
44 /* Match found */ 61 char errbuf[128];
45 return 1; 62 int err;
46 } 63
47 if (strncmp(sym_name, "VDSO", 4) == 0) 64 err = regcomp(&abs_sym_regex_c, abs_sym_regex,
48 return 1; 65 REG_EXTENDED|REG_NOSUB);
49 if (strncmp(sym_name, "__crc_", 6) == 0) 66 if (err) {
50 return 1; 67 regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf);
51 return 0; 68 die("%s", errbuf);
69 }
70
71 err = regcomp(&rel_sym_regex_c, rel_sym_regex,
72 REG_EXTENDED|REG_NOSUB);
73 if (err) {
74 regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf);
75 die("%s", errbuf);
76 }
52} 77}
53 78
54static void die(char *fmt, ...) 79static void die(char *fmt, ...)
@@ -131,7 +156,7 @@ static const char *rel_type(unsigned type)
131#undef REL_TYPE 156#undef REL_TYPE
132 }; 157 };
133 const char *name = "unknown type rel type name"; 158 const char *name = "unknown type rel type name";
134 if (type < ARRAY_SIZE(type_name)) { 159 if (type < ARRAY_SIZE(type_name) && type_name[type]) {
135 name = type_name[type]; 160 name = type_name[type];
136 } 161 }
137 return name; 162 return name;
@@ -448,7 +473,7 @@ static void print_absolute_relocs(void)
448 * Before warning check if this absolute symbol 473 * Before warning check if this absolute symbol
449 * relocation is harmless. 474 * relocation is harmless.
450 */ 475 */
451 if (is_safe_abs_reloc(name)) 476 if (is_abs_reloc(name) || is_rel_reloc(name))
452 continue; 477 continue;
453 478
454 if (!printed) { 479 if (!printed) {
@@ -501,21 +526,26 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
501 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; 526 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
502 r_type = ELF32_R_TYPE(rel->r_info); 527 r_type = ELF32_R_TYPE(rel->r_info);
503 /* Don't visit relocations to absolute symbols */ 528 /* Don't visit relocations to absolute symbols */
504 if (sym->st_shndx == SHN_ABS) { 529 if (sym->st_shndx == SHN_ABS &&
530 !is_rel_reloc(sym_name(sym_strtab, sym))) {
505 continue; 531 continue;
506 } 532 }
507 if (r_type == R_386_NONE || r_type == R_386_PC32) { 533 switch (r_type) {
534 case R_386_NONE:
535 case R_386_PC32:
508 /* 536 /*
509 * NONE can be ignored and and PC relative 537 * NONE can be ignored and and PC relative
510 * relocations don't need to be adjusted. 538 * relocations don't need to be adjusted.
511 */ 539 */
512 } 540 break;
513 else if (r_type == R_386_32) { 541 case R_386_32:
514 /* Visit relocations that need to be adjusted */ 542 /* Visit relocations that need to be adjusted */
515 visit(rel, sym); 543 visit(rel, sym);
516 } 544 break;
517 else { 545 default:
518 die("Unsupported relocation type: %d\n", r_type); 546 die("Unsupported relocation type: %s (%d)\n",
547 rel_type(r_type), r_type);
548 break;
519 } 549 }
520 } 550 }
521 } 551 }
@@ -571,16 +601,15 @@ static void emit_relocs(int as_text)
571 } 601 }
572 else { 602 else {
573 unsigned char buf[4]; 603 unsigned char buf[4];
574 buf[0] = buf[1] = buf[2] = buf[3] = 0;
575 /* Print a stop */ 604 /* Print a stop */
576 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); 605 fwrite("\0\0\0\0", 4, 1, stdout);
577 /* Now print each relocation */ 606 /* Now print each relocation */
578 for (i = 0; i < reloc_count; i++) { 607 for (i = 0; i < reloc_count; i++) {
579 buf[0] = (relocs[i] >> 0) & 0xff; 608 buf[0] = (relocs[i] >> 0) & 0xff;
580 buf[1] = (relocs[i] >> 8) & 0xff; 609 buf[1] = (relocs[i] >> 8) & 0xff;
581 buf[2] = (relocs[i] >> 16) & 0xff; 610 buf[2] = (relocs[i] >> 16) & 0xff;
582 buf[3] = (relocs[i] >> 24) & 0xff; 611 buf[3] = (relocs[i] >> 24) & 0xff;
583 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); 612 fwrite(buf, 4, 1, stdout);
584 } 613 }
585 } 614 }
586} 615}
@@ -598,6 +627,8 @@ int main(int argc, char **argv)
598 FILE *fp; 627 FILE *fp;
599 int i; 628 int i;
600 629
630 regex_init();
631
601 show_absolute_syms = 0; 632 show_absolute_syms = 0;
602 show_absolute_relocs = 0; 633 show_absolute_relocs = 0;
603 as_text = 0; 634 as_text = 0;
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb48782..a6f1a59a5b0c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
4 4
5#undef i386 5#undef i386
6 6
7#include <asm/cache.h>
7#include <asm/page_types.h> 8#include <asm/page_types.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
46 *(.data.*) 47 *(.data.*)
47 _edata = . ; 48 _edata = . ;
48 } 49 }
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 50 . = ALIGN(L1_CACHE_BYTES);
50 .bss : { 51 .bss : {
51 _bss = . ; 52 _bss = . ;
52 *(.bss) 53 *(.bss)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b31cc54b4641..93e689f4bd86 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -16,7 +16,7 @@
16 */ 16 */
17 17
18#include <asm/segment.h> 18#include <asm/segment.h>
19#include <linux/utsrelease.h> 19#include <generated/utsrelease.h>
20#include <asm/boot.h> 20#include <asm/boot.h>
21#include <asm/e820.h> 21#include <asm/e820.h>
22#include <asm/page_types.h> 22#include <asm/page_types.h>
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 0f6ec455a2b1..03c0683636b6 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -53,6 +53,9 @@ SECTIONS
53 53
54 /DISCARD/ : { *(.note*) } 54 /DISCARD/ : { *(.note*) }
55 55
56 /*
57 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
58 */
56 . = ASSERT(_end <= 0x8000, "Setup too big!"); 59 . = ASSERT(_end <= 0x8000, "Setup too big!");
57 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); 60 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
58 /* Necessary for the very-old-loader check to work... */ 61 /* Necessary for the very-old-loader check to work... */
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
index 2723d9b5ce43..2b15aa488ffb 100644
--- a/arch/x86/boot/version.c
+++ b/arch/x86/boot/version.c
@@ -13,8 +13,8 @@
13 */ 13 */
14 14
15#include "boot.h" 15#include "boot.h"
16#include <linux/utsrelease.h> 16#include <generated/utsrelease.h>
17#include <linux/compile.h> 17#include <generated/compile.h>
18 18
19const char kernel_version[] = 19const char kernel_version[] =
20 UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") " 20 UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..f767164cd5df 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
27 27
28 boot_params.screen_info.orig_x = oreg.dl; 28 boot_params.screen_info.orig_x = oreg.dl;
29 boot_params.screen_info.orig_y = oreg.dh; 29 boot_params.screen_info.orig_y = oreg.dh;
30
31 if (oreg.ch & 0x20)
32 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
33
34 if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
35 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
30} 36}
31 37
32static void store_video_mode(void) 38static void store_video_mode(void)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index cfb0010fa940..1a58ad89fdf7 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
15obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
15 16
16obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o 17obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
17 18
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
24salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
25 26
26aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
28
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb0566e83319..20bb0e1ac681 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#include <linux/linkage.h> 18#include <linux/linkage.h>
19#include <asm/inst.h>
19 20
20.text 21.text
21 22
@@ -122,103 +123,72 @@ ENTRY(aesni_set_key)
122 movups 0x10(%rsi), %xmm2 # other user key 123 movups 0x10(%rsi), %xmm2 # other user key
123 movaps %xmm2, (%rcx) 124 movaps %xmm2, (%rcx)
124 add $0x10, %rcx 125 add $0x10, %rcx
125 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 126 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
126 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
127 call _key_expansion_256a 127 call _key_expansion_256a
128 # aeskeygenassist $0x1, %xmm0, %xmm1 128 AESKEYGENASSIST 0x1 %xmm0 %xmm1
129 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
130 call _key_expansion_256b 129 call _key_expansion_256b
131 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 130 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
132 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
133 call _key_expansion_256a 131 call _key_expansion_256a
134 # aeskeygenassist $0x2, %xmm0, %xmm1 132 AESKEYGENASSIST 0x2 %xmm0 %xmm1
135 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
136 call _key_expansion_256b 133 call _key_expansion_256b
137 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 134 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
138 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
139 call _key_expansion_256a 135 call _key_expansion_256a
140 # aeskeygenassist $0x4, %xmm0, %xmm1 136 AESKEYGENASSIST 0x4 %xmm0 %xmm1
141 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
142 call _key_expansion_256b 137 call _key_expansion_256b
143 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 138 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
144 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
145 call _key_expansion_256a 139 call _key_expansion_256a
146 # aeskeygenassist $0x8, %xmm0, %xmm1 140 AESKEYGENASSIST 0x8 %xmm0 %xmm1
147 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
148 call _key_expansion_256b 141 call _key_expansion_256b
149 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 142 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
150 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
151 call _key_expansion_256a 143 call _key_expansion_256a
152 # aeskeygenassist $0x10, %xmm0, %xmm1 144 AESKEYGENASSIST 0x10 %xmm0 %xmm1
153 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
154 call _key_expansion_256b 145 call _key_expansion_256b
155 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 146 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
156 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
157 call _key_expansion_256a 147 call _key_expansion_256a
158 # aeskeygenassist $0x20, %xmm0, %xmm1 148 AESKEYGENASSIST 0x20 %xmm0 %xmm1
159 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
160 call _key_expansion_256b 149 call _key_expansion_256b
161 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 150 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
162 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
163 call _key_expansion_256a 151 call _key_expansion_256a
164 jmp .Ldec_key 152 jmp .Ldec_key
165.Lenc_key192: 153.Lenc_key192:
166 movq 0x10(%rsi), %xmm2 # other user key 154 movq 0x10(%rsi), %xmm2 # other user key
167 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 155 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
168 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
169 call _key_expansion_192a 156 call _key_expansion_192a
170 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 157 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
171 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
172 call _key_expansion_192b 158 call _key_expansion_192b
173 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 159 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
174 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
175 call _key_expansion_192a 160 call _key_expansion_192a
176 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 161 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
177 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
178 call _key_expansion_192b 162 call _key_expansion_192b
179 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 163 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
180 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
181 call _key_expansion_192a 164 call _key_expansion_192a
182 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 165 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
183 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
184 call _key_expansion_192b 166 call _key_expansion_192b
185 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 167 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
186 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
187 call _key_expansion_192a 168 call _key_expansion_192a
188 # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 169 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
189 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
190 call _key_expansion_192b 170 call _key_expansion_192b
191 jmp .Ldec_key 171 jmp .Ldec_key
192.Lenc_key128: 172.Lenc_key128:
193 # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 173 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
194 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
195 call _key_expansion_128 174 call _key_expansion_128
196 # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 175 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
197 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
198 call _key_expansion_128 176 call _key_expansion_128
199 # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 177 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
200 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
201 call _key_expansion_128 178 call _key_expansion_128
202 # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 179 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
203 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
204 call _key_expansion_128 180 call _key_expansion_128
205 # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 181 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
206 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
207 call _key_expansion_128 182 call _key_expansion_128
208 # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 183 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
209 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
210 call _key_expansion_128 184 call _key_expansion_128
211 # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 185 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
212 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
213 call _key_expansion_128 186 call _key_expansion_128
214 # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 187 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
215 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
216 call _key_expansion_128 188 call _key_expansion_128
217 # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 189 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
218 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
219 call _key_expansion_128 190 call _key_expansion_128
220 # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 191 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
221 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
222 call _key_expansion_128 192 call _key_expansion_128
223.Ldec_key: 193.Ldec_key:
224 sub $0x10, %rcx 194 sub $0x10, %rcx
@@ -231,8 +201,7 @@ ENTRY(aesni_set_key)
231.align 4 201.align 4
232.Ldec_key_loop: 202.Ldec_key_loop:
233 movaps (%rdi), %xmm0 203 movaps (%rdi), %xmm0
234 # aesimc %xmm0, %xmm1 204 AESIMC %xmm0 %xmm1
235 .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
236 movaps %xmm1, (%rsi) 205 movaps %xmm1, (%rsi)
237 add $0x10, %rdi 206 add $0x10, %rdi
238 sub $0x10, %rsi 207 sub $0x10, %rsi
@@ -274,51 +243,37 @@ _aesni_enc1:
274 je .Lenc192 243 je .Lenc192
275 add $0x20, TKEYP 244 add $0x20, TKEYP
276 movaps -0x60(TKEYP), KEY 245 movaps -0x60(TKEYP), KEY
277 # aesenc KEY, STATE 246 AESENC KEY STATE
278 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
279 movaps -0x50(TKEYP), KEY 247 movaps -0x50(TKEYP), KEY
280 # aesenc KEY, STATE 248 AESENC KEY STATE
281 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
282.align 4 249.align 4
283.Lenc192: 250.Lenc192:
284 movaps -0x40(TKEYP), KEY 251 movaps -0x40(TKEYP), KEY
285 # aesenc KEY, STATE 252 AESENC KEY STATE
286 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
287 movaps -0x30(TKEYP), KEY 253 movaps -0x30(TKEYP), KEY
288 # aesenc KEY, STATE 254 AESENC KEY STATE
289 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
290.align 4 255.align 4
291.Lenc128: 256.Lenc128:
292 movaps -0x20(TKEYP), KEY 257 movaps -0x20(TKEYP), KEY
293 # aesenc KEY, STATE 258 AESENC KEY STATE
294 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
295 movaps -0x10(TKEYP), KEY 259 movaps -0x10(TKEYP), KEY
296 # aesenc KEY, STATE 260 AESENC KEY STATE
297 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
298 movaps (TKEYP), KEY 261 movaps (TKEYP), KEY
299 # aesenc KEY, STATE 262 AESENC KEY STATE
300 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
301 movaps 0x10(TKEYP), KEY 263 movaps 0x10(TKEYP), KEY
302 # aesenc KEY, STATE 264 AESENC KEY STATE
303 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
304 movaps 0x20(TKEYP), KEY 265 movaps 0x20(TKEYP), KEY
305 # aesenc KEY, STATE 266 AESENC KEY STATE
306 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
307 movaps 0x30(TKEYP), KEY 267 movaps 0x30(TKEYP), KEY
308 # aesenc KEY, STATE 268 AESENC KEY STATE
309 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
310 movaps 0x40(TKEYP), KEY 269 movaps 0x40(TKEYP), KEY
311 # aesenc KEY, STATE 270 AESENC KEY STATE
312 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
313 movaps 0x50(TKEYP), KEY 271 movaps 0x50(TKEYP), KEY
314 # aesenc KEY, STATE 272 AESENC KEY STATE
315 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
316 movaps 0x60(TKEYP), KEY 273 movaps 0x60(TKEYP), KEY
317 # aesenc KEY, STATE 274 AESENC KEY STATE
318 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
319 movaps 0x70(TKEYP), KEY 275 movaps 0x70(TKEYP), KEY
320 # aesenclast KEY, STATE # last round 276 AESENCLAST KEY STATE
321 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
322 ret 277 ret
323 278
324/* 279/*
@@ -353,135 +308,79 @@ _aesni_enc4:
353 je .L4enc192 308 je .L4enc192
354 add $0x20, TKEYP 309 add $0x20, TKEYP
355 movaps -0x60(TKEYP), KEY 310 movaps -0x60(TKEYP), KEY
356 # aesenc KEY, STATE1 311 AESENC KEY STATE1
357 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 312 AESENC KEY STATE2
358 # aesenc KEY, STATE2 313 AESENC KEY STATE3
359 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 314 AESENC KEY STATE4
360 # aesenc KEY, STATE3
361 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
362 # aesenc KEY, STATE4
363 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
364 movaps -0x50(TKEYP), KEY 315 movaps -0x50(TKEYP), KEY
365 # aesenc KEY, STATE1 316 AESENC KEY STATE1
366 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 317 AESENC KEY STATE2
367 # aesenc KEY, STATE2 318 AESENC KEY STATE3
368 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 319 AESENC KEY STATE4
369 # aesenc KEY, STATE3
370 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
371 # aesenc KEY, STATE4
372 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
373#.align 4 320#.align 4
374.L4enc192: 321.L4enc192:
375 movaps -0x40(TKEYP), KEY 322 movaps -0x40(TKEYP), KEY
376 # aesenc KEY, STATE1 323 AESENC KEY STATE1
377 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 324 AESENC KEY STATE2
378 # aesenc KEY, STATE2 325 AESENC KEY STATE3
379 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 326 AESENC KEY STATE4
380 # aesenc KEY, STATE3
381 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
382 # aesenc KEY, STATE4
383 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
384 movaps -0x30(TKEYP), KEY 327 movaps -0x30(TKEYP), KEY
385 # aesenc KEY, STATE1 328 AESENC KEY STATE1
386 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 329 AESENC KEY STATE2
387 # aesenc KEY, STATE2 330 AESENC KEY STATE3
388 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 331 AESENC KEY STATE4
389 # aesenc KEY, STATE3
390 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
391 # aesenc KEY, STATE4
392 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
393#.align 4 332#.align 4
394.L4enc128: 333.L4enc128:
395 movaps -0x20(TKEYP), KEY 334 movaps -0x20(TKEYP), KEY
396 # aesenc KEY, STATE1 335 AESENC KEY STATE1
397 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 336 AESENC KEY STATE2
398 # aesenc KEY, STATE2 337 AESENC KEY STATE3
399 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 338 AESENC KEY STATE4
400 # aesenc KEY, STATE3
401 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
402 # aesenc KEY, STATE4
403 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
404 movaps -0x10(TKEYP), KEY 339 movaps -0x10(TKEYP), KEY
405 # aesenc KEY, STATE1 340 AESENC KEY STATE1
406 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 341 AESENC KEY STATE2
407 # aesenc KEY, STATE2 342 AESENC KEY STATE3
408 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 343 AESENC KEY STATE4
409 # aesenc KEY, STATE3
410 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
411 # aesenc KEY, STATE4
412 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
413 movaps (TKEYP), KEY 344 movaps (TKEYP), KEY
414 # aesenc KEY, STATE1 345 AESENC KEY STATE1
415 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 346 AESENC KEY STATE2
416 # aesenc KEY, STATE2 347 AESENC KEY STATE3
417 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 348 AESENC KEY STATE4
418 # aesenc KEY, STATE3
419 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
420 # aesenc KEY, STATE4
421 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
422 movaps 0x10(TKEYP), KEY 349 movaps 0x10(TKEYP), KEY
423 # aesenc KEY, STATE1 350 AESENC KEY STATE1
424 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 351 AESENC KEY STATE2
425 # aesenc KEY, STATE2 352 AESENC KEY STATE3
426 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 353 AESENC KEY STATE4
427 # aesenc KEY, STATE3
428 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
429 # aesenc KEY, STATE4
430 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
431 movaps 0x20(TKEYP), KEY 354 movaps 0x20(TKEYP), KEY
432 # aesenc KEY, STATE1 355 AESENC KEY STATE1
433 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 356 AESENC KEY STATE2
434 # aesenc KEY, STATE2 357 AESENC KEY STATE3
435 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 358 AESENC KEY STATE4
436 # aesenc KEY, STATE3
437 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
438 # aesenc KEY, STATE4
439 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
440 movaps 0x30(TKEYP), KEY 359 movaps 0x30(TKEYP), KEY
441 # aesenc KEY, STATE1 360 AESENC KEY STATE1
442 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 361 AESENC KEY STATE2
443 # aesenc KEY, STATE2 362 AESENC KEY STATE3
444 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 363 AESENC KEY STATE4
445 # aesenc KEY, STATE3
446 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
447 # aesenc KEY, STATE4
448 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
449 movaps 0x40(TKEYP), KEY 364 movaps 0x40(TKEYP), KEY
450 # aesenc KEY, STATE1 365 AESENC KEY STATE1
451 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 366 AESENC KEY STATE2
452 # aesenc KEY, STATE2 367 AESENC KEY STATE3
453 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 368 AESENC KEY STATE4
454 # aesenc KEY, STATE3
455 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
456 # aesenc KEY, STATE4
457 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
458 movaps 0x50(TKEYP), KEY 369 movaps 0x50(TKEYP), KEY
459 # aesenc KEY, STATE1 370 AESENC KEY STATE1
460 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 371 AESENC KEY STATE2
461 # aesenc KEY, STATE2 372 AESENC KEY STATE3
462 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 373 AESENC KEY STATE4
463 # aesenc KEY, STATE3
464 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
465 # aesenc KEY, STATE4
466 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
467 movaps 0x60(TKEYP), KEY 374 movaps 0x60(TKEYP), KEY
468 # aesenc KEY, STATE1 375 AESENC KEY STATE1
469 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 376 AESENC KEY STATE2
470 # aesenc KEY, STATE2 377 AESENC KEY STATE3
471 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 378 AESENC KEY STATE4
472 # aesenc KEY, STATE3
473 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
474 # aesenc KEY, STATE4
475 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
476 movaps 0x70(TKEYP), KEY 379 movaps 0x70(TKEYP), KEY
477 # aesenclast KEY, STATE1 # last round 380 AESENCLAST KEY STATE1 # last round
478 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 381 AESENCLAST KEY STATE2
479 # aesenclast KEY, STATE2 382 AESENCLAST KEY STATE3
480 .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 383 AESENCLAST KEY STATE4
481 # aesenclast KEY, STATE3
482 .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
483 # aesenclast KEY, STATE4
484 .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
485 ret 384 ret
486 385
487/* 386/*
@@ -518,51 +417,37 @@ _aesni_dec1:
518 je .Ldec192 417 je .Ldec192
519 add $0x20, TKEYP 418 add $0x20, TKEYP
520 movaps -0x60(TKEYP), KEY 419 movaps -0x60(TKEYP), KEY
521 # aesdec KEY, STATE 420 AESDEC KEY STATE
522 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
523 movaps -0x50(TKEYP), KEY 421 movaps -0x50(TKEYP), KEY
524 # aesdec KEY, STATE 422 AESDEC KEY STATE
525 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
526.align 4 423.align 4
527.Ldec192: 424.Ldec192:
528 movaps -0x40(TKEYP), KEY 425 movaps -0x40(TKEYP), KEY
529 # aesdec KEY, STATE 426 AESDEC KEY STATE
530 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
531 movaps -0x30(TKEYP), KEY 427 movaps -0x30(TKEYP), KEY
532 # aesdec KEY, STATE 428 AESDEC KEY STATE
533 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
534.align 4 429.align 4
535.Ldec128: 430.Ldec128:
536 movaps -0x20(TKEYP), KEY 431 movaps -0x20(TKEYP), KEY
537 # aesdec KEY, STATE 432 AESDEC KEY STATE
538 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
539 movaps -0x10(TKEYP), KEY 433 movaps -0x10(TKEYP), KEY
540 # aesdec KEY, STATE 434 AESDEC KEY STATE
541 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
542 movaps (TKEYP), KEY 435 movaps (TKEYP), KEY
543 # aesdec KEY, STATE 436 AESDEC KEY STATE
544 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
545 movaps 0x10(TKEYP), KEY 437 movaps 0x10(TKEYP), KEY
546 # aesdec KEY, STATE 438 AESDEC KEY STATE
547 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
548 movaps 0x20(TKEYP), KEY 439 movaps 0x20(TKEYP), KEY
549 # aesdec KEY, STATE 440 AESDEC KEY STATE
550 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
551 movaps 0x30(TKEYP), KEY 441 movaps 0x30(TKEYP), KEY
552 # aesdec KEY, STATE 442 AESDEC KEY STATE
553 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
554 movaps 0x40(TKEYP), KEY 443 movaps 0x40(TKEYP), KEY
555 # aesdec KEY, STATE 444 AESDEC KEY STATE
556 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
557 movaps 0x50(TKEYP), KEY 445 movaps 0x50(TKEYP), KEY
558 # aesdec KEY, STATE 446 AESDEC KEY STATE
559 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
560 movaps 0x60(TKEYP), KEY 447 movaps 0x60(TKEYP), KEY
561 # aesdec KEY, STATE 448 AESDEC KEY STATE
562 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
563 movaps 0x70(TKEYP), KEY 449 movaps 0x70(TKEYP), KEY
564 # aesdeclast KEY, STATE # last round 450 AESDECLAST KEY STATE
565 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
566 ret 451 ret
567 452
568/* 453/*
@@ -597,135 +482,79 @@ _aesni_dec4:
597 je .L4dec192 482 je .L4dec192
598 add $0x20, TKEYP 483 add $0x20, TKEYP
599 movaps -0x60(TKEYP), KEY 484 movaps -0x60(TKEYP), KEY
600 # aesdec KEY, STATE1 485 AESDEC KEY STATE1
601 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 486 AESDEC KEY STATE2
602 # aesdec KEY, STATE2 487 AESDEC KEY STATE3
603 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 488 AESDEC KEY STATE4
604 # aesdec KEY, STATE3
605 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
606 # aesdec KEY, STATE4
607 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
608 movaps -0x50(TKEYP), KEY 489 movaps -0x50(TKEYP), KEY
609 # aesdec KEY, STATE1 490 AESDEC KEY STATE1
610 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 491 AESDEC KEY STATE2
611 # aesdec KEY, STATE2 492 AESDEC KEY STATE3
612 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 493 AESDEC KEY STATE4
613 # aesdec KEY, STATE3
614 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
615 # aesdec KEY, STATE4
616 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
617.align 4 494.align 4
618.L4dec192: 495.L4dec192:
619 movaps -0x40(TKEYP), KEY 496 movaps -0x40(TKEYP), KEY
620 # aesdec KEY, STATE1 497 AESDEC KEY STATE1
621 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 498 AESDEC KEY STATE2
622 # aesdec KEY, STATE2 499 AESDEC KEY STATE3
623 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 500 AESDEC KEY STATE4
624 # aesdec KEY, STATE3
625 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
626 # aesdec KEY, STATE4
627 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
628 movaps -0x30(TKEYP), KEY 501 movaps -0x30(TKEYP), KEY
629 # aesdec KEY, STATE1 502 AESDEC KEY STATE1
630 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 503 AESDEC KEY STATE2
631 # aesdec KEY, STATE2 504 AESDEC KEY STATE3
632 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 505 AESDEC KEY STATE4
633 # aesdec KEY, STATE3
634 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
635 # aesdec KEY, STATE4
636 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
637.align 4 506.align 4
638.L4dec128: 507.L4dec128:
639 movaps -0x20(TKEYP), KEY 508 movaps -0x20(TKEYP), KEY
640 # aesdec KEY, STATE1 509 AESDEC KEY STATE1
641 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 510 AESDEC KEY STATE2
642 # aesdec KEY, STATE2 511 AESDEC KEY STATE3
643 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 512 AESDEC KEY STATE4
644 # aesdec KEY, STATE3
645 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
646 # aesdec KEY, STATE4
647 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
648 movaps -0x10(TKEYP), KEY 513 movaps -0x10(TKEYP), KEY
649 # aesdec KEY, STATE1 514 AESDEC KEY STATE1
650 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 515 AESDEC KEY STATE2
651 # aesdec KEY, STATE2 516 AESDEC KEY STATE3
652 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 517 AESDEC KEY STATE4
653 # aesdec KEY, STATE3
654 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
655 # aesdec KEY, STATE4
656 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
657 movaps (TKEYP), KEY 518 movaps (TKEYP), KEY
658 # aesdec KEY, STATE1 519 AESDEC KEY STATE1
659 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 520 AESDEC KEY STATE2
660 # aesdec KEY, STATE2 521 AESDEC KEY STATE3
661 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 522 AESDEC KEY STATE4
662 # aesdec KEY, STATE3
663 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
664 # aesdec KEY, STATE4
665 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
666 movaps 0x10(TKEYP), KEY 523 movaps 0x10(TKEYP), KEY
667 # aesdec KEY, STATE1 524 AESDEC KEY STATE1
668 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 525 AESDEC KEY STATE2
669 # aesdec KEY, STATE2 526 AESDEC KEY STATE3
670 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 527 AESDEC KEY STATE4
671 # aesdec KEY, STATE3
672 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
673 # aesdec KEY, STATE4
674 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
675 movaps 0x20(TKEYP), KEY 528 movaps 0x20(TKEYP), KEY
676 # aesdec KEY, STATE1 529 AESDEC KEY STATE1
677 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 530 AESDEC KEY STATE2
678 # aesdec KEY, STATE2 531 AESDEC KEY STATE3
679 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 532 AESDEC KEY STATE4
680 # aesdec KEY, STATE3
681 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
682 # aesdec KEY, STATE4
683 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
684 movaps 0x30(TKEYP), KEY 533 movaps 0x30(TKEYP), KEY
685 # aesdec KEY, STATE1 534 AESDEC KEY STATE1
686 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 535 AESDEC KEY STATE2
687 # aesdec KEY, STATE2 536 AESDEC KEY STATE3
688 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 537 AESDEC KEY STATE4
689 # aesdec KEY, STATE3
690 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
691 # aesdec KEY, STATE4
692 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
693 movaps 0x40(TKEYP), KEY 538 movaps 0x40(TKEYP), KEY
694 # aesdec KEY, STATE1 539 AESDEC KEY STATE1
695 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 540 AESDEC KEY STATE2
696 # aesdec KEY, STATE2 541 AESDEC KEY STATE3
697 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 542 AESDEC KEY STATE4
698 # aesdec KEY, STATE3
699 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
700 # aesdec KEY, STATE4
701 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
702 movaps 0x50(TKEYP), KEY 543 movaps 0x50(TKEYP), KEY
703 # aesdec KEY, STATE1 544 AESDEC KEY STATE1
704 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 545 AESDEC KEY STATE2
705 # aesdec KEY, STATE2 546 AESDEC KEY STATE3
706 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 547 AESDEC KEY STATE4
707 # aesdec KEY, STATE3
708 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
709 # aesdec KEY, STATE4
710 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
711 movaps 0x60(TKEYP), KEY 548 movaps 0x60(TKEYP), KEY
712 # aesdec KEY, STATE1 549 AESDEC KEY STATE1
713 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 550 AESDEC KEY STATE2
714 # aesdec KEY, STATE2 551 AESDEC KEY STATE3
715 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 552 AESDEC KEY STATE4
716 # aesdec KEY, STATE3
717 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
718 # aesdec KEY, STATE4
719 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
720 movaps 0x70(TKEYP), KEY 553 movaps 0x70(TKEYP), KEY
721 # aesdeclast KEY, STATE1 # last round 554 AESDECLAST KEY STATE1 # last round
722 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 555 AESDECLAST KEY STATE2
723 # aesdeclast KEY, STATE2 556 AESDECLAST KEY STATE3
724 .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 557 AESDECLAST KEY STATE4
725 # aesdeclast KEY, STATE3
726 .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
727 # aesdeclast KEY, STATE4
728 .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
729 ret 558 ret
730 559
731/* 560/*
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 585edebe12cf..49c552c060e9 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -82,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
82 return -EINVAL; 82 return -EINVAL;
83 } 83 }
84 84
85 if (irq_fpu_usable()) 85 if (!irq_fpu_usable())
86 err = crypto_aes_expand_key(ctx, in_key, key_len); 86 err = crypto_aes_expand_key(ctx, in_key, key_len);
87 else { 87 else {
88 kernel_fpu_begin(); 88 kernel_fpu_begin();
@@ -103,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
103{ 103{
104 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); 104 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
105 105
106 if (irq_fpu_usable()) 106 if (!irq_fpu_usable())
107 crypto_aes_encrypt_x86(ctx, dst, src); 107 crypto_aes_encrypt_x86(ctx, dst, src);
108 else { 108 else {
109 kernel_fpu_begin(); 109 kernel_fpu_begin();
@@ -116,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
116{ 116{
117 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); 117 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
118 118
119 if (irq_fpu_usable()) 119 if (!irq_fpu_usable())
120 crypto_aes_decrypt_x86(ctx, dst, src); 120 crypto_aes_decrypt_x86(ctx, dst, src);
121 else { 121 else {
122 kernel_fpu_begin(); 122 kernel_fpu_begin();
@@ -342,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req)
342 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); 342 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
343 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 343 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
344 344
345 if (irq_fpu_usable()) { 345 if (!irq_fpu_usable()) {
346 struct ablkcipher_request *cryptd_req = 346 struct ablkcipher_request *cryptd_req =
347 ablkcipher_request_ctx(req); 347 ablkcipher_request_ctx(req);
348 memcpy(cryptd_req, req, sizeof(*req)); 348 memcpy(cryptd_req, req, sizeof(*req));
@@ -363,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req)
363 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); 363 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
364 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 364 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
365 365
366 if (irq_fpu_usable()) { 366 if (!irq_fpu_usable()) {
367 struct ablkcipher_request *cryptd_req = 367 struct ablkcipher_request *cryptd_req =
368 ablkcipher_request_ctx(req); 368 ablkcipher_request_ctx(req);
369 memcpy(cryptd_req, req, sizeof(*req)); 369 memcpy(cryptd_req, req, sizeof(*req));
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000000..1eb7f90cb7b9
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains accelerated part of ghash
4 * implementation. More information about PCLMULQDQ can be found at:
5 *
6 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/inst.h>
21
22.data
23
24.align 16
25.Lbswap_mask:
26 .octa 0x000102030405060708090a0b0c0d0e0f
27.Lpoly:
28 .octa 0xc2000000000000000000000000000001
29.Ltwo_one:
30 .octa 0x00000001000000000000000000000001
31
32#define DATA %xmm0
33#define SHASH %xmm1
34#define T1 %xmm2
35#define T2 %xmm3
36#define T3 %xmm4
37#define BSWAP %xmm5
38#define IN1 %xmm6
39
40.text
41
42/*
43 * __clmul_gf128mul_ble: internal ABI
44 * input:
45 * DATA: operand1
46 * SHASH: operand2, hash_key << 1 mod poly
47 * output:
48 * DATA: operand1 * operand2 mod poly
49 * changed:
50 * T1
51 * T2
52 * T3
53 */
54__clmul_gf128mul_ble:
55 movaps DATA, T1
56 pshufd $0b01001110, DATA, T2
57 pshufd $0b01001110, SHASH, T3
58 pxor DATA, T2
59 pxor SHASH, T3
60
61 PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
62 PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
63 PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
64 pxor DATA, T2
65 pxor T1, T2 # T2 = a0 * b1 + a1 * b0
66
67 movaps T2, T3
68 pslldq $8, T3
69 psrldq $8, T2
70 pxor T3, DATA
71 pxor T2, T1 # <T1:DATA> is result of
72 # carry-less multiplication
73
74 # first phase of the reduction
75 movaps DATA, T3
76 psllq $1, T3
77 pxor DATA, T3
78 psllq $5, T3
79 pxor DATA, T3
80 psllq $57, T3
81 movaps T3, T2
82 pslldq $8, T2
83 psrldq $8, T3
84 pxor T2, DATA
85 pxor T3, T1
86
87 # second phase of the reduction
88 movaps DATA, T2
89 psrlq $5, T2
90 pxor DATA, T2
91 psrlq $1, T2
92 pxor DATA, T2
93 psrlq $1, T2
94 pxor T2, T1
95 pxor T1, DATA
96 ret
97
98/* void clmul_ghash_mul(char *dst, const be128 *shash) */
99ENTRY(clmul_ghash_mul)
100 movups (%rdi), DATA
101 movups (%rsi), SHASH
102 movaps .Lbswap_mask, BSWAP
103 PSHUFB_XMM BSWAP DATA
104 call __clmul_gf128mul_ble
105 PSHUFB_XMM BSWAP DATA
106 movups DATA, (%rdi)
107 ret
108
109/*
110 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
111 * const be128 *shash);
112 */
113ENTRY(clmul_ghash_update)
114 cmp $16, %rdx
115 jb .Lupdate_just_ret # check length
116 movaps .Lbswap_mask, BSWAP
117 movups (%rdi), DATA
118 movups (%rcx), SHASH
119 PSHUFB_XMM BSWAP DATA
120.align 4
121.Lupdate_loop:
122 movups (%rsi), IN1
123 PSHUFB_XMM BSWAP IN1
124 pxor IN1, DATA
125 call __clmul_gf128mul_ble
126 sub $16, %rdx
127 add $16, %rsi
128 cmp $16, %rdx
129 jge .Lupdate_loop
130 PSHUFB_XMM BSWAP DATA
131 movups DATA, (%rdi)
132.Lupdate_just_ret:
133 ret
134
135/*
136 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
137 *
138 * Calculate hash_key << 1 mod poly
139 */
140ENTRY(clmul_ghash_setkey)
141 movaps .Lbswap_mask, BSWAP
142 movups (%rsi), %xmm0
143 PSHUFB_XMM BSWAP %xmm0
144 movaps %xmm0, %xmm1
145 psllq $1, %xmm0
146 psrlq $63, %xmm1
147 movaps %xmm1, %xmm2
148 pslldq $8, %xmm1
149 psrldq $8, %xmm2
150 por %xmm1, %xmm0
151 # reduction
152 pshufd $0b00100100, %xmm2, %xmm1
153 pcmpeqd .Ltwo_one, %xmm1
154 pand .Lpoly, %xmm1
155 pxor %xmm1, %xmm0
156 movups %xmm0, (%rdi)
157 ret
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000000..cbcc8d8ea93a
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains glue code.
4 *
5 * Copyright (c) 2009 Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License version 2 as published
10 * by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/crypto.h>
17#include <crypto/algapi.h>
18#include <crypto/cryptd.h>
19#include <crypto/gf128mul.h>
20#include <crypto/internal/hash.h>
21#include <asm/i387.h>
22
23#define GHASH_BLOCK_SIZE 16
24#define GHASH_DIGEST_SIZE 16
25
26void clmul_ghash_mul(char *dst, const be128 *shash);
27
28void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
29 const be128 *shash);
30
31void clmul_ghash_setkey(be128 *shash, const u8 *key);
32
33struct ghash_async_ctx {
34 struct cryptd_ahash *cryptd_tfm;
35};
36
37struct ghash_ctx {
38 be128 shash;
39};
40
41struct ghash_desc_ctx {
42 u8 buffer[GHASH_BLOCK_SIZE];
43 u32 bytes;
44};
45
46static int ghash_init(struct shash_desc *desc)
47{
48 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
49
50 memset(dctx, 0, sizeof(*dctx));
51
52 return 0;
53}
54
55static int ghash_setkey(struct crypto_shash *tfm,
56 const u8 *key, unsigned int keylen)
57{
58 struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
59
60 if (keylen != GHASH_BLOCK_SIZE) {
61 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
62 return -EINVAL;
63 }
64
65 clmul_ghash_setkey(&ctx->shash, key);
66
67 return 0;
68}
69
70static int ghash_update(struct shash_desc *desc,
71 const u8 *src, unsigned int srclen)
72{
73 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
74 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
75 u8 *dst = dctx->buffer;
76
77 kernel_fpu_begin();
78 if (dctx->bytes) {
79 int n = min(srclen, dctx->bytes);
80 u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
81
82 dctx->bytes -= n;
83 srclen -= n;
84
85 while (n--)
86 *pos++ ^= *src++;
87
88 if (!dctx->bytes)
89 clmul_ghash_mul(dst, &ctx->shash);
90 }
91
92 clmul_ghash_update(dst, src, srclen, &ctx->shash);
93 kernel_fpu_end();
94
95 if (srclen & 0xf) {
96 src += srclen - (srclen & 0xf);
97 srclen &= 0xf;
98 dctx->bytes = GHASH_BLOCK_SIZE - srclen;
99 while (srclen--)
100 *dst++ ^= *src++;
101 }
102
103 return 0;
104}
105
106static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
107{
108 u8 *dst = dctx->buffer;
109
110 if (dctx->bytes) {
111 u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
112
113 while (dctx->bytes--)
114 *tmp++ ^= 0;
115
116 kernel_fpu_begin();
117 clmul_ghash_mul(dst, &ctx->shash);
118 kernel_fpu_end();
119 }
120
121 dctx->bytes = 0;
122}
123
124static int ghash_final(struct shash_desc *desc, u8 *dst)
125{
126 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
127 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
128 u8 *buf = dctx->buffer;
129
130 ghash_flush(ctx, dctx);
131 memcpy(dst, buf, GHASH_BLOCK_SIZE);
132
133 return 0;
134}
135
136static struct shash_alg ghash_alg = {
137 .digestsize = GHASH_DIGEST_SIZE,
138 .init = ghash_init,
139 .update = ghash_update,
140 .final = ghash_final,
141 .setkey = ghash_setkey,
142 .descsize = sizeof(struct ghash_desc_ctx),
143 .base = {
144 .cra_name = "__ghash",
145 .cra_driver_name = "__ghash-pclmulqdqni",
146 .cra_priority = 0,
147 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
148 .cra_blocksize = GHASH_BLOCK_SIZE,
149 .cra_ctxsize = sizeof(struct ghash_ctx),
150 .cra_module = THIS_MODULE,
151 .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
152 },
153};
154
155static int ghash_async_init(struct ahash_request *req)
156{
157 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
158 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
159 struct ahash_request *cryptd_req = ahash_request_ctx(req);
160 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
161
162 if (!irq_fpu_usable()) {
163 memcpy(cryptd_req, req, sizeof(*req));
164 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
165 return crypto_ahash_init(cryptd_req);
166 } else {
167 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
168 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
169
170 desc->tfm = child;
171 desc->flags = req->base.flags;
172 return crypto_shash_init(desc);
173 }
174}
175
176static int ghash_async_update(struct ahash_request *req)
177{
178 struct ahash_request *cryptd_req = ahash_request_ctx(req);
179
180 if (!irq_fpu_usable()) {
181 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
182 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
183 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
184
185 memcpy(cryptd_req, req, sizeof(*req));
186 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
187 return crypto_ahash_update(cryptd_req);
188 } else {
189 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
190 return shash_ahash_update(req, desc);
191 }
192}
193
194static int ghash_async_final(struct ahash_request *req)
195{
196 struct ahash_request *cryptd_req = ahash_request_ctx(req);
197
198 if (!irq_fpu_usable()) {
199 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
200 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
201 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
202
203 memcpy(cryptd_req, req, sizeof(*req));
204 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
205 return crypto_ahash_final(cryptd_req);
206 } else {
207 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
208 return crypto_shash_final(desc, req->result);
209 }
210}
211
212static int ghash_async_digest(struct ahash_request *req)
213{
214 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
215 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
216 struct ahash_request *cryptd_req = ahash_request_ctx(req);
217 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
218
219 if (!irq_fpu_usable()) {
220 memcpy(cryptd_req, req, sizeof(*req));
221 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
222 return crypto_ahash_digest(cryptd_req);
223 } else {
224 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
225 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
226
227 desc->tfm = child;
228 desc->flags = req->base.flags;
229 return shash_ahash_digest(req, desc);
230 }
231}
232
233static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
234 unsigned int keylen)
235{
236 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
237 struct crypto_ahash *child = &ctx->cryptd_tfm->base;
238 int err;
239
240 crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
241 crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
242 & CRYPTO_TFM_REQ_MASK);
243 err = crypto_ahash_setkey(child, key, keylen);
244 crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
245 & CRYPTO_TFM_RES_MASK);
246
247 return 0;
248}
249
250static int ghash_async_init_tfm(struct crypto_tfm *tfm)
251{
252 struct cryptd_ahash *cryptd_tfm;
253 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
254
255 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
256 if (IS_ERR(cryptd_tfm))
257 return PTR_ERR(cryptd_tfm);
258 ctx->cryptd_tfm = cryptd_tfm;
259 crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
260 sizeof(struct ahash_request) +
261 crypto_ahash_reqsize(&cryptd_tfm->base));
262
263 return 0;
264}
265
266static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
267{
268 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
269
270 cryptd_free_ahash(ctx->cryptd_tfm);
271}
272
273static struct ahash_alg ghash_async_alg = {
274 .init = ghash_async_init,
275 .update = ghash_async_update,
276 .final = ghash_async_final,
277 .setkey = ghash_async_setkey,
278 .digest = ghash_async_digest,
279 .halg = {
280 .digestsize = GHASH_DIGEST_SIZE,
281 .base = {
282 .cra_name = "ghash",
283 .cra_driver_name = "ghash-clmulni",
284 .cra_priority = 400,
285 .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
286 .cra_blocksize = GHASH_BLOCK_SIZE,
287 .cra_type = &crypto_ahash_type,
288 .cra_module = THIS_MODULE,
289 .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
290 .cra_init = ghash_async_init_tfm,
291 .cra_exit = ghash_async_exit_tfm,
292 },
293 },
294};
295
296static int __init ghash_pclmulqdqni_mod_init(void)
297{
298 int err;
299
300 if (!cpu_has_pclmulqdq) {
301 printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
302 " detected.\n");
303 return -ENODEV;
304 }
305
306 err = crypto_register_shash(&ghash_alg);
307 if (err)
308 goto err_out;
309 err = crypto_register_ahash(&ghash_async_alg);
310 if (err)
311 goto err_shash;
312
313 return 0;
314
315err_shash:
316 crypto_unregister_shash(&ghash_alg);
317err_out:
318 return err;
319}
320
321static void __exit ghash_pclmulqdqni_mod_exit(void)
322{
323 crypto_unregister_ahash(&ghash_async_alg);
324 crypto_unregister_shash(&ghash_alg);
325}
326
327module_init(ghash_pclmulqdqni_mod_init);
328module_exit(ghash_pclmulqdqni_mod_exit);
329
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
332 "acclerated by PCLMULQDQ-NI");
333MODULE_ALIAS("ghash");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 74619c4f9fda..53147ad85b96 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -21,8 +21,8 @@
21#define __AUDIT_ARCH_LE 0x40000000 21#define __AUDIT_ARCH_LE 0x40000000
22 22
23#ifndef CONFIG_AUDITSYSCALL 23#ifndef CONFIG_AUDITSYSCALL
24#define sysexit_audit int_ret_from_sys_call 24#define sysexit_audit ia32_ret_from_sys_call
25#define sysretl_audit int_ret_from_sys_call 25#define sysretl_audit ia32_ret_from_sys_call
26#endif 26#endif
27 27
28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
@@ -39,12 +39,12 @@
39 .endm 39 .endm
40 40
41 /* clobbers %eax */ 41 /* clobbers %eax */
42 .macro CLEAR_RREGS _r9=rax 42 .macro CLEAR_RREGS offset=0, _r9=rax
43 xorl %eax,%eax 43 xorl %eax,%eax
44 movq %rax,R11(%rsp) 44 movq %rax,\offset+R11(%rsp)
45 movq %rax,R10(%rsp) 45 movq %rax,\offset+R10(%rsp)
46 movq %\_r9,R9(%rsp) 46 movq %\_r9,\offset+R9(%rsp)
47 movq %rax,R8(%rsp) 47 movq %rax,\offset+R8(%rsp)
48 .endm 48 .endm
49 49
50 /* 50 /*
@@ -172,6 +172,10 @@ sysexit_from_sys_call:
172 movl RIP-R11(%rsp),%edx /* User %eip */ 172 movl RIP-R11(%rsp),%edx /* User %eip */
173 CFI_REGISTER rip,rdx 173 CFI_REGISTER rip,rdx
174 RESTORE_ARGS 1,24,1,1,1,1 174 RESTORE_ARGS 1,24,1,1,1,1
175 xorq %r8,%r8
176 xorq %r9,%r9
177 xorq %r10,%r10
178 xorq %r11,%r11
175 popfq 179 popfq
176 CFI_ADJUST_CFA_OFFSET -8 180 CFI_ADJUST_CFA_OFFSET -8
177 /*CFI_RESTORE rflags*/ 181 /*CFI_RESTORE rflags*/
@@ -200,9 +204,9 @@ sysexit_from_sys_call:
200 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ 204 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
201 .endm 205 .endm
202 206
203 .macro auditsys_exit exit,ebpsave=RBP 207 .macro auditsys_exit exit
204 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 208 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
205 jnz int_ret_from_sys_call 209 jnz ia32_ret_from_sys_call
206 TRACE_IRQS_ON 210 TRACE_IRQS_ON
207 sti 211 sti
208 movl %eax,%esi /* second arg, syscall return value */ 212 movl %eax,%esi /* second arg, syscall return value */
@@ -213,13 +217,13 @@ sysexit_from_sys_call:
213 call audit_syscall_exit 217 call audit_syscall_exit
214 GET_THREAD_INFO(%r10) 218 GET_THREAD_INFO(%r10)
215 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ 219 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
216 movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
217 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 220 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
218 cli 221 cli
219 TRACE_IRQS_OFF 222 TRACE_IRQS_OFF
220 testl %edi,TI_flags(%r10) 223 testl %edi,TI_flags(%r10)
221 jnz int_with_check 224 jz \exit
222 jmp \exit 225 CLEAR_RREGS -ARGOFFSET
226 jmp int_with_check
223 .endm 227 .endm
224 228
225sysenter_auditsys: 229sysenter_auditsys:
@@ -329,6 +333,9 @@ sysretl_from_sys_call:
329 CFI_REGISTER rip,rcx 333 CFI_REGISTER rip,rcx
330 movl EFLAGS-ARGOFFSET(%rsp),%r11d 334 movl EFLAGS-ARGOFFSET(%rsp),%r11d
331 /*CFI_REGISTER rflags,r11*/ 335 /*CFI_REGISTER rflags,r11*/
336 xorq %r10,%r10
337 xorq %r9,%r9
338 xorq %r8,%r8
332 TRACE_IRQS_ON 339 TRACE_IRQS_ON
333 movl RSP-ARGOFFSET(%rsp),%esp 340 movl RSP-ARGOFFSET(%rsp),%esp
334 CFI_RESTORE rsp 341 CFI_RESTORE rsp
@@ -343,7 +350,7 @@ cstar_auditsys:
343 jmp cstar_dispatch 350 jmp cstar_dispatch
344 351
345sysretl_audit: 352sysretl_audit:
346 auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */ 353 auditsys_exit sysretl_from_sys_call
347#endif 354#endif
348 355
349cstar_tracesys: 356cstar_tracesys:
@@ -353,7 +360,7 @@ cstar_tracesys:
353#endif 360#endif
354 xchgl %r9d,%ebp 361 xchgl %r9d,%ebp
355 SAVE_REST 362 SAVE_REST
356 CLEAR_RREGS r9 363 CLEAR_RREGS 0, r9
357 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 364 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
358 movq %rsp,%rdi /* &pt_regs -> arg1 */ 365 movq %rsp,%rdi /* &pt_regs -> arg1 */
359 call syscall_trace_enter 366 call syscall_trace_enter
@@ -425,6 +432,8 @@ ia32_do_call:
425 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative 432 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
426ia32_sysret: 433ia32_sysret:
427 movq %rax,RAX-ARGOFFSET(%rsp) 434 movq %rax,RAX-ARGOFFSET(%rsp)
435ia32_ret_from_sys_call:
436 CLEAR_RREGS -ARGOFFSET
428 jmp int_ret_from_sys_call 437 jmp int_ret_from_sys_call
429 438
430ia32_tracesys: 439ia32_tracesys:
@@ -442,8 +451,8 @@ END(ia32_syscall)
442 451
443ia32_badsys: 452ia32_badsys:
444 movq $0,ORIG_RAX-ARGOFFSET(%rsp) 453 movq $0,ORIG_RAX-ARGOFFSET(%rsp)
445 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 454 movq $-ENOSYS,%rax
446 jmp int_ret_from_sys_call 455 jmp ia32_sysret
447 456
448quiet_ni_syscall: 457quiet_ni_syscall:
449 movq $-ENOSYS,%rax 458 movq $-ENOSYS,%rax
@@ -644,7 +653,7 @@ ia32_sys_call_table:
644 .quad compat_sys_writev 653 .quad compat_sys_writev
645 .quad sys_getsid 654 .quad sys_getsid
646 .quad sys_fdatasync 655 .quad sys_fdatasync
647 .quad sys32_sysctl /* sysctl */ 656 .quad compat_sys_sysctl /* sysctl */
648 .quad sys_mlock /* 150 */ 657 .quad sys_mlock /* 150 */
649 .quad sys_munlock 658 .quad sys_munlock
650 .quad sys_mlockall 659 .quad sys_mlockall
@@ -687,7 +696,7 @@ ia32_sys_call_table:
687 .quad quiet_ni_syscall /* streams2 */ 696 .quad quiet_ni_syscall /* streams2 */
688 .quad stub32_vfork /* 190 */ 697 .quad stub32_vfork /* 190 */
689 .quad compat_sys_getrlimit 698 .quad compat_sys_getrlimit
690 .quad sys32_mmap2 699 .quad sys_mmap_pgoff
691 .quad sys32_truncate64 700 .quad sys32_truncate64
692 .quad sys32_ftruncate64 701 .quad sys32_ftruncate64
693 .quad sys32_stat64 /* 195 */ 702 .quad sys32_stat64 /* 195 */
@@ -832,4 +841,5 @@ ia32_sys_call_table:
832 .quad compat_sys_pwritev 841 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg
835ia32_syscall_end: 845ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 9f5527198825..422572c77923 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -155,9 +155,6 @@ struct mmap_arg_struct {
155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) 155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
156{ 156{
157 struct mmap_arg_struct a; 157 struct mmap_arg_struct a;
158 struct file *file = NULL;
159 unsigned long retval;
160 struct mm_struct *mm ;
161 158
162 if (copy_from_user(&a, arg, sizeof(a))) 159 if (copy_from_user(&a, arg, sizeof(a)))
163 return -EFAULT; 160 return -EFAULT;
@@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
165 if (a.offset & ~PAGE_MASK) 162 if (a.offset & ~PAGE_MASK)
166 return -EINVAL; 163 return -EINVAL;
167 164
168 if (!(a.flags & MAP_ANONYMOUS)) { 165 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
169 file = fget(a.fd);
170 if (!file)
171 return -EBADF;
172 }
173
174 mm = current->mm;
175 down_write(&mm->mmap_sem);
176 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
177 a.offset>>PAGE_SHIFT); 166 a.offset>>PAGE_SHIFT);
178 if (file)
179 fput(file);
180
181 up_write(&mm->mmap_sem);
182
183 return retval;
184} 167}
185 168
186asmlinkage long sys32_mprotect(unsigned long start, size_t len, 169asmlinkage long sys32_mprotect(unsigned long start, size_t len,
@@ -434,62 +417,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
434 return ret; 417 return ret;
435} 418}
436 419
437#ifdef CONFIG_SYSCTL_SYSCALL
438struct sysctl_ia32 {
439 unsigned int name;
440 int nlen;
441 unsigned int oldval;
442 unsigned int oldlenp;
443 unsigned int newval;
444 unsigned int newlen;
445 unsigned int __unused[4];
446};
447
448
449asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
450{
451 struct sysctl_ia32 a32;
452 mm_segment_t old_fs = get_fs();
453 void __user *oldvalp, *newvalp;
454 size_t oldlen;
455 int __user *namep;
456 long ret;
457
458 if (copy_from_user(&a32, args32, sizeof(a32)))
459 return -EFAULT;
460
461 /*
462 * We need to pre-validate these because we have to disable
463 * address checking before calling do_sysctl() because of
464 * OLDLEN but we can't run the risk of the user specifying bad
465 * addresses here. Well, since we're dealing with 32 bit
466 * addresses, we KNOW that access_ok() will always succeed, so
467 * this is an expensive NOP, but so what...
468 */
469 namep = compat_ptr(a32.name);
470 oldvalp = compat_ptr(a32.oldval);
471 newvalp = compat_ptr(a32.newval);
472
473 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
474 || !access_ok(VERIFY_WRITE, namep, 0)
475 || !access_ok(VERIFY_WRITE, oldvalp, 0)
476 || !access_ok(VERIFY_WRITE, newvalp, 0))
477 return -EFAULT;
478
479 set_fs(KERNEL_DS);
480 lock_kernel();
481 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
482 newvalp, (size_t) a32.newlen);
483 unlock_kernel();
484 set_fs(old_fs);
485
486 if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
487 return -EFAULT;
488
489 return ret;
490}
491#endif
492
493/* warning: next two assume little endian */ 420/* warning: next two assume little endian */
494asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, 421asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
495 u32 poslo, u32 poshi) 422 u32 poslo, u32 poshi)
@@ -539,30 +466,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
539 return ret; 466 return ret;
540} 467}
541 468
542asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
543 unsigned long prot, unsigned long flags,
544 unsigned long fd, unsigned long pgoff)
545{
546 struct mm_struct *mm = current->mm;
547 unsigned long error;
548 struct file *file = NULL;
549
550 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
551 if (!(flags & MAP_ANONYMOUS)) {
552 file = fget(fd);
553 if (!file)
554 return -EBADF;
555 }
556
557 down_write(&mm->mmap_sem);
558 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
559 up_write(&mm->mmap_sem);
560
561 if (file)
562 fput(file);
563 return error;
564}
565
566asmlinkage long sys32_olduname(struct oldold_utsname __user *name) 469asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
567{ 470{
568 char *arch = "x86_64"; 471 char *arch = "x86_64";
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..9f828f87ca35 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
10header-y += sigcontext32.h 10header-y += sigcontext32.h
11header-y += ucontext.h 11header-y += ucontext.h
12header-y += processor-flags.h 12header-y += processor-flags.h
13header-y += hw_breakpoint.h
13 14
14unifdef-y += e820.h 15unifdef-y += e820.h
15unifdef-y += ist.h 16unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e397aa84..7a15588e45d4 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
17 17
18#include <linux/user.h> 18#include <linux/user.h>
19#include <linux/elfcore.h> 19#include <linux/elfcore.h>
20#include <asm/debugreg.h>
20 21
21/* 22/*
22 * fill in the user structure for an a.out core dump 23 * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
32 >> PAGE_SHIFT; 33 >> PAGE_SHIFT;
33 dump->u_dsize -= dump->u_tsize; 34 dump->u_dsize -= dump->u_tsize;
34 dump->u_ssize = 0; 35 dump->u_ssize = 0;
35 dump->u_debugreg[0] = current->thread.debugreg0; 36 aout_dump_debugregs(dump);
36 dump->u_debugreg[1] = current->thread.debugreg1;
37 dump->u_debugreg[2] = current->thread.debugreg2;
38 dump->u_debugreg[3] = current->thread.debugreg3;
39 dump->u_debugreg[4] = 0;
40 dump->u_debugreg[5] = 0;
41 dump->u_debugreg[6] = current->thread.debugreg6;
42 dump->u_debugreg[7] = current->thread.debugreg7;
43 37
44 if (dump->start_stack < TASK_SIZE) 38 if (dump->start_stack < TASK_SIZE)
45 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) 39 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..56f462cf22d2 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
118extern unsigned long acpi_wakeup_address; 118extern unsigned long acpi_wakeup_address;
119 119
120/* early initialization routine */ 120/* early initialization routine */
121extern void acpi_reserve_bootmem(void); 121extern void acpi_reserve_wakeup_memory(void);
122 122
123/* 123/*
124 * Check if the CPU can handle C2 and deeper 124 * Check if the CPU can handle C2 and deeper
@@ -142,6 +142,32 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
142 return max_cstate; 142 return max_cstate;
143} 143}
144 144
145static inline bool arch_has_acpi_pdc(void)
146{
147 struct cpuinfo_x86 *c = &cpu_data(0);
148 return (c->x86_vendor == X86_VENDOR_INTEL ||
149 c->x86_vendor == X86_VENDOR_CENTAUR);
150}
151
152static inline void arch_acpi_set_pdc_bits(u32 *buf)
153{
154 struct cpuinfo_x86 *c = &cpu_data(0);
155
156 buf[2] |= ACPI_PDC_C_CAPABILITY_SMP;
157
158 if (cpu_has(c, X86_FEATURE_EST))
159 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
160
161 if (cpu_has(c, X86_FEATURE_ACPI))
162 buf[2] |= ACPI_PDC_T_FFH;
163
164 /*
165 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
166 */
167 if (!cpu_has(c, X86_FEATURE_MWAIT))
168 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
169}
170
145#else /* !CONFIG_ACPI */ 171#else /* !CONFIG_ACPI */
146 172
147#define acpi_lapic 0 173#define acpi_lapic 0
@@ -158,6 +184,7 @@ struct bootnode;
158 184
159#ifdef CONFIG_ACPI_NUMA 185#ifdef CONFIG_ACPI_NUMA
160extern int acpi_numa; 186extern int acpi_numa;
187extern int acpi_get_nodes(struct bootnode *physnodes);
161extern int acpi_scan_nodes(unsigned long start, unsigned long end); 188extern int acpi_scan_nodes(unsigned long start, unsigned long end);
162#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 189#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
163extern void acpi_fake_nodes(const struct bootnode *fake_nodes, 190extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e2077d343c33..b97f786a48d5 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -1,17 +1,13 @@
1#ifdef __ASSEMBLY__ 1#ifdef __ASSEMBLY__
2 2
3#ifdef CONFIG_X86_32 3#include <asm/asm.h>
4# define X86_ALIGN .long
5#else
6# define X86_ALIGN .quad
7#endif
8 4
9#ifdef CONFIG_SMP 5#ifdef CONFIG_SMP
10 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
111: lock 71: lock
12 .section .smp_locks,"a" 8 .section .smp_locks,"a"
13 .align 4 9 _ASM_ALIGN
14 X86_ALIGN 1b 10 _ASM_PTR 1b
15 .previous 11 .previous
16 .endm 12 .endm
17#else 13#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c240efc74e00..69b74a7b877f 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -84,6 +84,7 @@ static inline void alternatives_smp_switch(int smp) {}
84 " .byte " __stringify(feature) "\n" /* feature bit */ \ 84 " .byte " __stringify(feature) "\n" /* feature bit */ \
85 " .byte 662b-661b\n" /* sourcelen */ \ 85 " .byte 662b-661b\n" /* sourcelen */ \
86 " .byte 664f-663f\n" /* replacementlen */ \ 86 " .byte 664f-663f\n" /* replacementlen */ \
87 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
87 ".previous\n" \ 88 ".previous\n" \
88 ".section .altinstr_replacement, \"ax\"\n" \ 89 ".section .altinstr_replacement, \"ax\"\n" \
89 "663:\n\t" newinstr "\n664:\n" /* replacement */ \ 90 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index ac95995b7bad..5af2982133b5 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -23,18 +23,13 @@
23#include <linux/irqreturn.h> 23#include <linux/irqreturn.h>
24 24
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void); 26
27extern int amd_iommu_init_dma_ops(void);
28extern int amd_iommu_init_passthrough(void);
29extern void amd_iommu_detect(void); 27extern void amd_iommu_detect(void);
30extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 28
31extern void amd_iommu_flush_all_domains(void);
32extern void amd_iommu_flush_all_devices(void);
33extern void amd_iommu_shutdown(void);
34#else 29#else
35static inline int amd_iommu_init(void) { return -ENODEV; } 30
36static inline void amd_iommu_detect(void) { } 31static inline void amd_iommu_detect(void) { }
37static inline void amd_iommu_shutdown(void) { } 32
38#endif 33#endif
39 34
40#endif /* _ASM_X86_AMD_IOMMU_H */ 35#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
new file mode 100644
index 000000000000..4d817f9e6e77
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright (C) 2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published
7 * by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H
21
22struct amd_iommu;
23
24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
27extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31extern int amd_iommu_init_devices(void);
32extern void amd_iommu_uninit_devices(void);
33extern void amd_iommu_init_notifier(void);
34#ifndef CONFIG_AMD_IOMMU_STATS
35
36static inline void amd_iommu_stats_init(void) { }
37
38#endif /* !CONFIG_AMD_IOMMU_STATS */
39
40#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 2a2cc7a78a81..ba19ad4c47d0 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,6 +25,11 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26 26
27/* 27/*
28 * Maximum number of IOMMUs supported
29 */
30#define MAX_IOMMUS 32
31
32/*
28 * some size calculation constants 33 * some size calculation constants
29 */ 34 */
30#define DEV_TABLE_ENTRY_SIZE 32 35#define DEV_TABLE_ENTRY_SIZE 32
@@ -206,6 +211,9 @@ extern bool amd_iommu_dump;
206 printk(KERN_INFO "AMD-Vi: " format, ## arg); \ 211 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
207 } while(0); 212 } while(0);
208 213
214/* global flag if IOMMUs cache non-present entries */
215extern bool amd_iommu_np_cache;
216
209/* 217/*
210 * Make iterating over all IOMMUs easier 218 * Make iterating over all IOMMUs easier
211 */ 219 */
@@ -226,6 +234,8 @@ extern bool amd_iommu_dump;
226 * independent of their use. 234 * independent of their use.
227 */ 235 */
228struct protection_domain { 236struct protection_domain {
237 struct list_head list; /* for list of all protection domains */
238 struct list_head dev_list; /* List of all devices in this domain */
229 spinlock_t lock; /* mostly used to lock the page table*/ 239 spinlock_t lock; /* mostly used to lock the page table*/
230 u16 id; /* the domain id written to the device table */ 240 u16 id; /* the domain id written to the device table */
231 int mode; /* paging mode (0-6 levels) */ 241 int mode; /* paging mode (0-6 levels) */
@@ -233,7 +243,20 @@ struct protection_domain {
233 unsigned long flags; /* flags to find out type of domain */ 243 unsigned long flags; /* flags to find out type of domain */
234 bool updated; /* complete domain flush required */ 244 bool updated; /* complete domain flush required */
235 unsigned dev_cnt; /* devices assigned to this domain */ 245 unsigned dev_cnt; /* devices assigned to this domain */
246 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
236 void *priv; /* private data */ 247 void *priv; /* private data */
248
249};
250
251/*
252 * This struct contains device specific data for the IOMMU
253 */
254struct iommu_dev_data {
255 struct list_head list; /* For domain->dev_list */
256 struct device *dev; /* Device this data belong to */
257 struct device *alias; /* The Alias Device */
258 struct protection_domain *domain; /* Domain the device is bound to */
259 atomic_t bind; /* Domain attach reverent count */
237}; 260};
238 261
239/* 262/*
@@ -291,6 +314,9 @@ struct dma_ops_domain {
291struct amd_iommu { 314struct amd_iommu {
292 struct list_head list; 315 struct list_head list;
293 316
317 /* Index within the IOMMU array */
318 int index;
319
294 /* locks the accesses to the hardware */ 320 /* locks the accesses to the hardware */
295 spinlock_t lock; 321 spinlock_t lock;
296 322
@@ -357,6 +383,21 @@ struct amd_iommu {
357extern struct list_head amd_iommu_list; 383extern struct list_head amd_iommu_list;
358 384
359/* 385/*
386 * Array with pointers to each IOMMU struct
387 * The indices are referenced in the protection domains
388 */
389extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
390
391/* Number of IOMMUs present in the system */
392extern int amd_iommus_present;
393
394/*
395 * Declarations for the global list of all protection domains
396 */
397extern spinlock_t amd_iommu_pd_lock;
398extern struct list_head amd_iommu_pd_list;
399
400/*
360 * Structure defining one entry in the device table 401 * Structure defining one entry in the device table
361 */ 402 */
362struct dev_table_entry { 403struct dev_table_entry {
@@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order;
416/* largest PCI device id we expect translation requests for */ 457/* largest PCI device id we expect translation requests for */
417extern u16 amd_iommu_last_bdf; 458extern u16 amd_iommu_last_bdf;
418 459
419/* data structures for protection domain handling */
420extern struct protection_domain **amd_iommu_pd_table;
421
422/* allocation bitmap for domain ids */ 460/* allocation bitmap for domain ids */
423extern unsigned long *amd_iommu_pd_alloc_bitmap; 461extern unsigned long *amd_iommu_pd_alloc_bitmap;
424 462
425/* will be 1 if device isolation is enabled */
426extern bool amd_iommu_isolate;
427
428/* 463/*
429 * If true, the addresses will be flushed on unmap time, not when 464 * If true, the addresses will be flushed on unmap time, not when
430 * they are reused 465 * they are reused
@@ -462,11 +497,6 @@ struct __iommu_counter {
462#define ADD_STATS_COUNTER(name, x) 497#define ADD_STATS_COUNTER(name, x)
463#define SUB_STATS_COUNTER(name, x) 498#define SUB_STATS_COUNTER(name, x)
464 499
465static inline void amd_iommu_stats_init(void) { }
466
467#endif /* CONFIG_AMD_IOMMU_STATS */ 500#endif /* CONFIG_AMD_IOMMU_STATS */
468 501
469/* some function prototypes */
470extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
471
472#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ 502#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 474d80d3e6cc..b4ac2cdcb64f 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -297,20 +297,20 @@ struct apic {
297 int disable_esr; 297 int disable_esr;
298 298
299 int dest_logical; 299 int dest_logical;
300 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); 300 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
301 unsigned long (*check_apicid_present)(int apicid); 301 unsigned long (*check_apicid_present)(int apicid);
302 302
303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); 303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
304 void (*init_apic_ldr)(void); 304 void (*init_apic_ldr)(void);
305 305
306 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); 306 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
307 307
308 void (*setup_apic_routing)(void); 308 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 309 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid); 310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu); 311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 312 int (*cpu_present_to_apicid)(int mps_cpu);
313 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); 313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 314 void (*setup_portio_remap)(void);
315 int (*check_phys_apicid_present)(int phys_apicid); 315 int (*check_phys_apicid_present)(int phys_apicid);
316 void (*enable_apic_mode)(void); 316 void (*enable_apic_mode)(void);
@@ -488,6 +488,8 @@ static inline unsigned int read_apic_id(void)
488 488
489extern void default_setup_apic_routing(void); 489extern void default_setup_apic_routing(void);
490 490
491extern struct apic apic_noop;
492
491#ifdef CONFIG_X86_32 493#ifdef CONFIG_X86_32
492 494
493extern struct apic apic_default; 495extern struct apic apic_default;
@@ -532,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
532 return (unsigned int)(mask1 & mask2 & mask3); 534 return (unsigned int)(mask1 & mask2 & mask3);
533} 535}
534 536
535static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) 537static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
536{ 538{
537 return physid_isset(apicid, bitmap); 539 return physid_isset(apicid, *map);
538} 540}
539 541
540static inline unsigned long default_check_apicid_present(int bit) 542static inline unsigned long default_check_apicid_present(int bit)
@@ -542,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit)
542 return physid_isset(bit, phys_cpu_present_map); 544 return physid_isset(bit, phys_cpu_present_map);
543} 545}
544 546
545static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map) 547static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
546{ 548{
547 return phys_map; 549 *retmap = *phys_map;
548} 550}
549 551
550/* Mapping from cpu number to logical apicid */ 552/* Mapping from cpu number to logical apicid */
@@ -583,11 +585,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
583extern int default_check_phys_apicid_present(int phys_apicid); 585extern int default_check_phys_apicid_present(int phys_apicid);
584#endif 586#endif
585 587
586static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
587{
588 return physid_mask_of_physid(phys_apicid);
589}
590
591#endif /* CONFIG_X86_LOCAL_APIC */ 588#endif /* CONFIG_X86_LOCAL_APIC */
592 589
593#ifdef CONFIG_X86_32 590#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3b62da926de9..7fe3b3060f08 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -11,6 +11,12 @@
11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
12#define APIC_DEFAULT_PHYS_BASE 0xfee00000 12#define APIC_DEFAULT_PHYS_BASE 0xfee00000
13 13
14/*
15 * This is the IO-APIC register space as specified
16 * by Intel docs:
17 */
18#define IO_APIC_SLOT_SIZE 1024
19
14#define APIC_ID 0x20 20#define APIC_ID 0x20
15 21
16#define APIC_LVR 0x30 22#define APIC_LVR 0x30
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
deleted file mode 100644
index 82f613c607ce..000000000000
--- a/arch/x86/include/asm/apicnum.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _ASM_X86_APICNUM_H
2#define _ASM_X86_APICNUM_H
3
4/* define MAX_IO_APICS */
5#ifdef CONFIG_X86_32
6# define MAX_IO_APICS 64
7#else
8# define MAX_IO_APICS 128
9# define MAX_LOCAL_APIC 32768
10#endif
11
12#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h
new file mode 100644
index 000000000000..d370ee36a182
--- /dev/null
+++ b/arch/x86/include/asm/asm-offsets.h
@@ -0,0 +1 @@
#include <generated/asm-offsets.h>
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index d9cf1cd156d2..f654d1bb17fb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -22,14 +22,14 @@ do { \
22 ".popsection" \ 22 ".popsection" \
23 : : "i" (__FILE__), "i" (__LINE__), \ 23 : : "i" (__FILE__), "i" (__LINE__), \
24 "i" (sizeof(struct bug_entry))); \ 24 "i" (sizeof(struct bug_entry))); \
25 for (;;) ; \ 25 unreachable(); \
26} while (0) 26} while (0)
27 27
28#else 28#else
29#define BUG() \ 29#define BUG() \
30do { \ 30do { \
31 asm volatile("ud2"); \ 31 asm volatile("ud2"); \
32 for (;;) ; \ 32 unreachable(); \
33} while (0) 33} while (0)
34#endif 34#endif
35 35
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d3be8f..2f9047cfaaca 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@
9 9
10#define __read_mostly __attribute__((__section__(".data.read_mostly"))) 10#define __read_mostly __attribute__((__section__(".data.read_mostly")))
11 11
12#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
13#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
14
12#ifdef CONFIG_X86_VSMP 15#ifdef CONFIG_X86_VSMP
13/* vSMP Internode cacheline shift */
14#define INTERNODE_CACHE_SHIFT (12)
15#ifdef CONFIG_SMP 16#ifdef CONFIG_SMP
16#define __cacheline_aligned_in_smp \ 17#define __cacheline_aligned_in_smp \
17 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ 18 __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
18 __page_aligned_data 19 __page_aligned_data
19#endif 20#endif
20#endif 21#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { } 12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma, 13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { } 14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15static inline void flush_dcache_page(struct page *page) { } 16static inline void flush_dcache_page(struct page *page) { }
16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } 18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
@@ -176,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
176#ifdef CONFIG_DEBUG_RODATA 177#ifdef CONFIG_DEBUG_RODATA
177void mark_rodata_ro(void); 178void mark_rodata_ro(void);
178extern const int rodata_test_data; 179extern const int rodata_test_data;
180extern int kernel_set_to_readonly;
179void set_kernel_text_rw(void); 181void set_kernel_text_rw(void);
180void set_kernel_text_ro(void); 182void set_kernel_text_ro(void);
181#else 183#else
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb62aa7..0918654305af 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@ struct cal_chipset_ops {
62extern int use_calgary; 62extern int use_calgary;
63 63
64#ifdef CONFIG_CALGARY_IOMMU 64#ifdef CONFIG_CALGARY_IOMMU
65extern int calgary_iommu_init(void);
66extern void detect_calgary(void); 65extern void detect_calgary(void);
67#else 66#else
68static inline int calgary_iommu_init(void) { return 1; }
69static inline void detect_calgary(void) { return; } 67static inline void detect_calgary(void) { return; }
70#endif 68#endif
71 69
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ee1931be6593..ffb9bb6b6c37 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -8,14 +8,50 @@
8 * you need to test for the feature in boot_cpu_data. 8 * you need to test for the feature in boot_cpu_data.
9 */ 9 */
10 10
11#define xchg(ptr, v) \ 11extern void __xchg_wrong_size(void);
12 ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) 12
13/*
14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
15 * Note 2: xchg has side effect, so that attribute volatile is necessary,
16 * but generally the primitive is invalid, *ptr is output argument. --ANK
17 */
13 18
14struct __xchg_dummy { 19struct __xchg_dummy {
15 unsigned long a[100]; 20 unsigned long a[100];
16}; 21};
17#define __xg(x) ((struct __xchg_dummy *)(x)) 22#define __xg(x) ((struct __xchg_dummy *)(x))
18 23
24#define __xchg(x, ptr, size) \
25({ \
26 __typeof(*(ptr)) __x = (x); \
27 switch (size) { \
28 case 1: \
29 asm volatile("xchgb %b0,%1" \
30 : "=q" (__x) \
31 : "m" (*__xg(ptr)), "0" (__x) \
32 : "memory"); \
33 break; \
34 case 2: \
35 asm volatile("xchgw %w0,%1" \
36 : "=r" (__x) \
37 : "m" (*__xg(ptr)), "0" (__x) \
38 : "memory"); \
39 break; \
40 case 4: \
41 asm volatile("xchgl %0,%1" \
42 : "=r" (__x) \
43 : "m" (*__xg(ptr)), "0" (__x) \
44 : "memory"); \
45 break; \
46 default: \
47 __xchg_wrong_size(); \
48 } \
49 __x; \
50})
51
52#define xchg(ptr, v) \
53 __xchg((v), (ptr), sizeof(*ptr))
54
19/* 55/*
20 * The semantics of XCHGCMP8B are a bit strange, this is why 56 * The semantics of XCHGCMP8B are a bit strange, this is why
21 * there is a loop and the loading of %%eax and %%edx has to 57 * there is a loop and the loading of %%eax and %%edx has to
@@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr,
71 (unsigned int)((value) >> 32)) \ 107 (unsigned int)((value) >> 32)) \
72 : __set_64bit(ptr, ll_low((value)), ll_high((value)))) 108 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
73 109
74/* 110extern void __cmpxchg_wrong_size(void);
75 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
76 * Note 2: xchg has side effect, so that attribute volatile is necessary,
77 * but generally the primitive is invalid, *ptr is output argument. --ANK
78 */
79static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
80 int size)
81{
82 switch (size) {
83 case 1:
84 asm volatile("xchgb %b0,%1"
85 : "=q" (x)
86 : "m" (*__xg(ptr)), "0" (x)
87 : "memory");
88 break;
89 case 2:
90 asm volatile("xchgw %w0,%1"
91 : "=r" (x)
92 : "m" (*__xg(ptr)), "0" (x)
93 : "memory");
94 break;
95 case 4:
96 asm volatile("xchgl %0,%1"
97 : "=r" (x)
98 : "m" (*__xg(ptr)), "0" (x)
99 : "memory");
100 break;
101 }
102 return x;
103}
104 111
105/* 112/*
106 * Atomic compare and exchange. Compare OLD with MEM, if identical, 113 * Atomic compare and exchange. Compare OLD with MEM, if identical,
107 * store NEW in MEM. Return the initial value in MEM. Success is 114 * store NEW in MEM. Return the initial value in MEM. Success is
108 * indicated by comparing RETURN with OLD. 115 * indicated by comparing RETURN with OLD.
109 */ 116 */
117#define __raw_cmpxchg(ptr, old, new, size, lock) \
118({ \
119 __typeof__(*(ptr)) __ret; \
120 __typeof__(*(ptr)) __old = (old); \
121 __typeof__(*(ptr)) __new = (new); \
122 switch (size) { \
123 case 1: \
124 asm volatile(lock "cmpxchgb %b1,%2" \
125 : "=a"(__ret) \
126 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
127 : "memory"); \
128 break; \
129 case 2: \
130 asm volatile(lock "cmpxchgw %w1,%2" \
131 : "=a"(__ret) \
132 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
133 : "memory"); \
134 break; \
135 case 4: \
136 asm volatile(lock "cmpxchgl %1,%2" \
137 : "=a"(__ret) \
138 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
139 : "memory"); \
140 break; \
141 default: \
142 __cmpxchg_wrong_size(); \
143 } \
144 __ret; \
145})
146
147#define __cmpxchg(ptr, old, new, size) \
148 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
149
150#define __sync_cmpxchg(ptr, old, new, size) \
151 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
152
153#define __cmpxchg_local(ptr, old, new, size) \
154 __raw_cmpxchg((ptr), (old), (new), (size), "")
110 155
111#ifdef CONFIG_X86_CMPXCHG 156#ifdef CONFIG_X86_CMPXCHG
112#define __HAVE_ARCH_CMPXCHG 1 157#define __HAVE_ARCH_CMPXCHG 1
113#define cmpxchg(ptr, o, n) \ 158
114 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ 159#define cmpxchg(ptr, old, new) \
115 (unsigned long)(n), \ 160 __cmpxchg((ptr), (old), (new), sizeof(*ptr))
116 sizeof(*(ptr)))) 161
117#define sync_cmpxchg(ptr, o, n) \ 162#define sync_cmpxchg(ptr, old, new) \
118 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ 163 __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
119 (unsigned long)(n), \ 164
120 sizeof(*(ptr)))) 165#define cmpxchg_local(ptr, old, new) \
121#define cmpxchg_local(ptr, o, n) \ 166 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
122 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
123 (unsigned long)(n), \
124 sizeof(*(ptr))))
125#endif 167#endif
126 168
127#ifdef CONFIG_X86_CMPXCHG64 169#ifdef CONFIG_X86_CMPXCHG64
@@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
133 (unsigned long long)(n))) 175 (unsigned long long)(n)))
134#endif 176#endif
135 177
136static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
137 unsigned long new, int size)
138{
139 unsigned long prev;
140 switch (size) {
141 case 1:
142 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
143 : "=a"(prev)
144 : "q"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 2:
148 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 4:
154 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163/*
164 * Always use locked operations when touching memory shared with a
165 * hypervisor, since the system may be SMP even if the guest kernel
166 * isn't.
167 */
168static inline unsigned long __sync_cmpxchg(volatile void *ptr,
169 unsigned long old,
170 unsigned long new, int size)
171{
172 unsigned long prev;
173 switch (size) {
174 case 1:
175 asm volatile("lock; cmpxchgb %b1,%2"
176 : "=a"(prev)
177 : "q"(new), "m"(*__xg(ptr)), "0"(old)
178 : "memory");
179 return prev;
180 case 2:
181 asm volatile("lock; cmpxchgw %w1,%2"
182 : "=a"(prev)
183 : "r"(new), "m"(*__xg(ptr)), "0"(old)
184 : "memory");
185 return prev;
186 case 4:
187 asm volatile("lock; cmpxchgl %1,%2"
188 : "=a"(prev)
189 : "r"(new), "m"(*__xg(ptr)), "0"(old)
190 : "memory");
191 return prev;
192 }
193 return old;
194}
195
196static inline unsigned long __cmpxchg_local(volatile void *ptr,
197 unsigned long old,
198 unsigned long new, int size)
199{
200 unsigned long prev;
201 switch (size) {
202 case 1:
203 asm volatile("cmpxchgb %b1,%2"
204 : "=a"(prev)
205 : "q"(new), "m"(*__xg(ptr)), "0"(old)
206 : "memory");
207 return prev;
208 case 2:
209 asm volatile("cmpxchgw %w1,%2"
210 : "=a"(prev)
211 : "r"(new), "m"(*__xg(ptr)), "0"(old)
212 : "memory");
213 return prev;
214 case 4:
215 asm volatile("cmpxchgl %1,%2"
216 : "=a"(prev)
217 : "r"(new), "m"(*__xg(ptr)), "0"(old)
218 : "memory");
219 return prev;
220 }
221 return old;
222}
223
224static inline unsigned long long __cmpxchg64(volatile void *ptr, 178static inline unsigned long long __cmpxchg64(volatile void *ptr,
225 unsigned long long old, 179 unsigned long long old,
226 unsigned long long new) 180 unsigned long long new)
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e0de8c..485ae415faec 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,9 +3,6 @@
3 3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */ 4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5 5
6#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
7 (ptr), sizeof(*(ptr))))
8
9#define __xg(x) ((volatile long *)(x)) 6#define __xg(x) ((volatile long *)(x))
10 7
11static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) 8static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
@@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
15 12
16#define _set_64bit set_64bit 13#define _set_64bit set_64bit
17 14
15extern void __xchg_wrong_size(void);
16extern void __cmpxchg_wrong_size(void);
17
18/* 18/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
20 * Note 2: xchg has side effect, so that attribute volatile is necessary, 20 * Note 2: xchg has side effect, so that attribute volatile is necessary,
21 * but generally the primitive is invalid, *ptr is output argument. --ANK 21 * but generally the primitive is invalid, *ptr is output argument. --ANK
22 */ 22 */
23static inline unsigned long __xchg(unsigned long x, volatile void *ptr, 23#define __xchg(x, ptr, size) \
24 int size) 24({ \
25{ 25 __typeof(*(ptr)) __x = (x); \
26 switch (size) { 26 switch (size) { \
27 case 1: 27 case 1: \
28 asm volatile("xchgb %b0,%1" 28 asm volatile("xchgb %b0,%1" \
29 : "=q" (x) 29 : "=q" (__x) \
30 : "m" (*__xg(ptr)), "0" (x) 30 : "m" (*__xg(ptr)), "0" (__x) \
31 : "memory"); 31 : "memory"); \
32 break; 32 break; \
33 case 2: 33 case 2: \
34 asm volatile("xchgw %w0,%1" 34 asm volatile("xchgw %w0,%1" \
35 : "=r" (x) 35 : "=r" (__x) \
36 : "m" (*__xg(ptr)), "0" (x) 36 : "m" (*__xg(ptr)), "0" (__x) \
37 : "memory"); 37 : "memory"); \
38 break; 38 break; \
39 case 4: 39 case 4: \
40 asm volatile("xchgl %k0,%1" 40 asm volatile("xchgl %k0,%1" \
41 : "=r" (x) 41 : "=r" (__x) \
42 : "m" (*__xg(ptr)), "0" (x) 42 : "m" (*__xg(ptr)), "0" (__x) \
43 : "memory"); 43 : "memory"); \
44 break; 44 break; \
45 case 8: 45 case 8: \
46 asm volatile("xchgq %0,%1" 46 asm volatile("xchgq %0,%1" \
47 : "=r" (x) 47 : "=r" (__x) \
48 : "m" (*__xg(ptr)), "0" (x) 48 : "m" (*__xg(ptr)), "0" (__x) \
49 : "memory"); 49 : "memory"); \
50 break; 50 break; \
51 } 51 default: \
52 return x; 52 __xchg_wrong_size(); \
53} 53 } \
54 __x; \
55})
56
57#define xchg(ptr, v) \
58 __xchg((v), (ptr), sizeof(*ptr))
59
60#define __HAVE_ARCH_CMPXCHG 1
54 61
55/* 62/*
56 * Atomic compare and exchange. Compare OLD with MEM, if identical, 63 * Atomic compare and exchange. Compare OLD with MEM, if identical,
57 * store NEW in MEM. Return the initial value in MEM. Success is 64 * store NEW in MEM. Return the initial value in MEM. Success is
58 * indicated by comparing RETURN with OLD. 65 * indicated by comparing RETURN with OLD.
59 */ 66 */
67#define __raw_cmpxchg(ptr, old, new, size, lock) \
68({ \
69 __typeof__(*(ptr)) __ret; \
70 __typeof__(*(ptr)) __old = (old); \
71 __typeof__(*(ptr)) __new = (new); \
72 switch (size) { \
73 case 1: \
74 asm volatile(lock "cmpxchgb %b1,%2" \
75 : "=a"(__ret) \
76 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
77 : "memory"); \
78 break; \
79 case 2: \
80 asm volatile(lock "cmpxchgw %w1,%2" \
81 : "=a"(__ret) \
82 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
83 : "memory"); \
84 break; \
85 case 4: \
86 asm volatile(lock "cmpxchgl %k1,%2" \
87 : "=a"(__ret) \
88 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
89 : "memory"); \
90 break; \
91 case 8: \
92 asm volatile(lock "cmpxchgq %1,%2" \
93 : "=a"(__ret) \
94 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
95 : "memory"); \
96 break; \
97 default: \
98 __cmpxchg_wrong_size(); \
99 } \
100 __ret; \
101})
60 102
61#define __HAVE_ARCH_CMPXCHG 1 103#define __cmpxchg(ptr, old, new, size) \
104 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
62 105
63static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, 106#define __sync_cmpxchg(ptr, old, new, size) \
64 unsigned long new, int size) 107 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
65{
66 unsigned long prev;
67 switch (size) {
68 case 1:
69 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
70 : "=a"(prev)
71 : "q"(new), "m"(*__xg(ptr)), "0"(old)
72 : "memory");
73 return prev;
74 case 2:
75 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
76 : "=a"(prev)
77 : "r"(new), "m"(*__xg(ptr)), "0"(old)
78 : "memory");
79 return prev;
80 case 4:
81 asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
82 : "=a"(prev)
83 : "r"(new), "m"(*__xg(ptr)), "0"(old)
84 : "memory");
85 return prev;
86 case 8:
87 asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
88 : "=a"(prev)
89 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90 : "memory");
91 return prev;
92 }
93 return old;
94}
95 108
96/* 109#define __cmpxchg_local(ptr, old, new, size) \
97 * Always use locked operations when touching memory shared with a 110 __raw_cmpxchg((ptr), (old), (new), (size), "")
98 * hypervisor, since the system may be SMP even if the guest kernel
99 * isn't.
100 */
101static inline unsigned long __sync_cmpxchg(volatile void *ptr,
102 unsigned long old,
103 unsigned long new, int size)
104{
105 unsigned long prev;
106 switch (size) {
107 case 1:
108 asm volatile("lock; cmpxchgb %b1,%2"
109 : "=a"(prev)
110 : "q"(new), "m"(*__xg(ptr)), "0"(old)
111 : "memory");
112 return prev;
113 case 2:
114 asm volatile("lock; cmpxchgw %w1,%2"
115 : "=a"(prev)
116 : "r"(new), "m"(*__xg(ptr)), "0"(old)
117 : "memory");
118 return prev;
119 case 4:
120 asm volatile("lock; cmpxchgl %1,%2"
121 : "=a"(prev)
122 : "r"(new), "m"(*__xg(ptr)), "0"(old)
123 : "memory");
124 return prev;
125 }
126 return old;
127}
128 111
129static inline unsigned long __cmpxchg_local(volatile void *ptr, 112#define cmpxchg(ptr, old, new) \
130 unsigned long old, 113 __cmpxchg((ptr), (old), (new), sizeof(*ptr))
131 unsigned long new, int size) 114
132{ 115#define sync_cmpxchg(ptr, old, new) \
133 unsigned long prev; 116 __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
134 switch (size) { 117
135 case 1: 118#define cmpxchg_local(ptr, old, new) \
136 asm volatile("cmpxchgb %b1,%2" 119 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
137 : "=a"(prev)
138 : "q"(new), "m"(*__xg(ptr)), "0"(old)
139 : "memory");
140 return prev;
141 case 2:
142 asm volatile("cmpxchgw %w1,%2"
143 : "=a"(prev)
144 : "r"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 4:
148 asm volatile("cmpxchgl %k1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 8:
154 asm volatile("cmpxchgq %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162 120
163#define cmpxchg(ptr, o, n) \
164 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
165 (unsigned long)(n), sizeof(*(ptr))))
166#define cmpxchg64(ptr, o, n) \ 121#define cmpxchg64(ptr, o, n) \
167({ \ 122({ \
168 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ 123 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
169 cmpxchg((ptr), (o), (n)); \ 124 cmpxchg((ptr), (o), (n)); \
170}) 125})
171#define cmpxchg_local(ptr, o, n) \ 126
172 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
173 (unsigned long)(n), \
174 sizeof(*(ptr))))
175#define sync_cmpxchg(ptr, o, n) \
176 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
177 (unsigned long)(n), \
178 sizeof(*(ptr))))
179#define cmpxchg64_local(ptr, o, n) \ 127#define cmpxchg64_local(ptr, o, n) \
180({ \ 128({ \
181 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ 129 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9cfc88b97742..637e1ec963c3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -153,6 +153,7 @@
153#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ 153#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
154#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ 154#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
155#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ 155#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
156#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
156 157
157/* 158/*
158 * Auxiliary flags: Linux defined - For features scattered in various 159 * Auxiliary flags: Linux defined - For features scattered in various
@@ -248,6 +249,7 @@ extern const char * const x86_power_flags[32];
248#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) 249#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
249#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 250#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
250#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 251#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
252#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
251 253
252#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 254#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
253# define cpu_has_invlpg 1 255# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e2..8240f76b531e 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -18,6 +18,7 @@
18#define DR_TRAP1 (0x2) /* db1 */ 18#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */ 19#define DR_TRAP2 (0x4) /* db2 */
20#define DR_TRAP3 (0x8) /* db3 */ 20#define DR_TRAP3 (0x8) /* db3 */
21#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
21 22
22#define DR_STEP (0x4000) /* single-step */ 23#define DR_STEP (0x4000) /* single-step */
23#define DR_SWITCH (0x8000) /* task switch */ 24#define DR_SWITCH (0x8000) /* task switch */
@@ -49,6 +50,8 @@
49 50
50#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ 51#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
51#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ 52#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
53#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
54#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
52#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ 55#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
53 56
54#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ 57#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
@@ -67,4 +70,34 @@
67#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ 70#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
68#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ 71#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
69 72
73/*
74 * HW breakpoint additions
75 */
76#ifdef __KERNEL__
77
78DECLARE_PER_CPU(unsigned long, cpu_dr7);
79
80static inline void hw_breakpoint_disable(void)
81{
82 /* Zero the control register for HW Breakpoint */
83 set_debugreg(0UL, 7);
84
85 /* Zero-out the individual HW breakpoint address registers */
86 set_debugreg(0UL, 0);
87 set_debugreg(0UL, 1);
88 set_debugreg(0UL, 2);
89 set_debugreg(0UL, 3);
90}
91
92static inline int hw_breakpoint_active(void)
93{
94 return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
95}
96
97extern void aout_dump_debugregs(struct user *dump);
98
99extern void hw_breakpoint_restore(void);
100
101#endif /* __KERNEL__ */
102
70#endif /* _ASM_X86_DEBUGREG_H */ 103#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e8de2f6f5ca5..617bd56b3070 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -288,7 +288,7 @@ static inline void load_LDT(mm_context_t *pc)
288 288
289static inline unsigned long get_desc_base(const struct desc_struct *desc) 289static inline unsigned long get_desc_base(const struct desc_struct *desc)
290{ 290{
291 return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); 291 return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
292} 292}
293 293
294static inline void set_desc_base(struct desc_struct *desc, unsigned long base) 294static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 9d6684849fd9..278441f39856 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -12,9 +12,9 @@
12#include <linux/types.h> 12#include <linux/types.h>
13 13
14/* 14/*
15 * FIXME: Acessing the desc_struct through its fields is more elegant, 15 * FIXME: Accessing the desc_struct through its fields is more elegant,
16 * and should be the one valid thing to do. However, a lot of open code 16 * and should be the one valid thing to do. However, a lot of open code
17 * still touches the a and b acessors, and doing this allow us to do it 17 * still touches the a and b accessors, and doing this allow us to do it
18 * incrementally. We keep the signature as a struct, rather than an union, 18 * incrementally. We keep the signature as a struct, rather than an union,
19 * so we can get rid of it transparently in the future -- glommer 19 * so we can get rid of it transparently in the future -- glommer
20 */ 20 */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index cee34e9ca45b..029f230ab637 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,7 +8,7 @@ struct dev_archdata {
8#ifdef CONFIG_X86_64 8#ifdef CONFIG_X86_64
9struct dma_map_ops *dma_ops; 9struct dma_map_ops *dma_ops;
10#endif 10#endif
11#ifdef CONFIG_DMAR 11#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU)
12 void *iommu; /* hook for IOMMU specific extension */ 12 void *iommu; /* hook for IOMMU specific extension */
13#endif 13#endif
14}; 14};
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 0ee770d23d0e..ac91eed21061 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -14,7 +14,14 @@
14#include <asm/swiotlb.h> 14#include <asm/swiotlb.h>
15#include <asm-generic/dma-coherent.h> 15#include <asm-generic/dma-coherent.h>
16 16
17extern dma_addr_t bad_dma_address; 17#ifdef CONFIG_ISA
18# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
19#else
20# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
21#endif
22
23#define DMA_ERROR_CODE 0
24
18extern int iommu_merge; 25extern int iommu_merge;
19extern struct device x86_dma_fallback_dev; 26extern struct device x86_dma_fallback_dev;
20extern int panic_on_overflow; 27extern int panic_on_overflow;
@@ -42,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
42 if (ops->mapping_error) 49 if (ops->mapping_error)
43 return ops->mapping_error(dev, dma_addr); 50 return ops->mapping_error(dev, dma_addr);
44 51
45 return (dma_addr == bad_dma_address); 52 return (dma_addr == DMA_ERROR_CODE);
46} 53}
47 54
48#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) 55#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -60,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
60 if (!dev->dma_mask) 67 if (!dev->dma_mask)
61 return 0; 68 return 0;
62 69
63 return addr + size <= *dev->dma_mask; 70 return addr + size - 1 <= *dev->dma_mask;
64} 71}
65 72
66static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) 73static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
@@ -124,10 +131,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
124 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) 131 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
125 return memory; 132 return memory;
126 133
127 if (!dev) { 134 if (!dev)
128 dev = &x86_dma_fallback_dev; 135 dev = &x86_dma_fallback_dev;
129 gfp |= GFP_DMA;
130 }
131 136
132 if (!is_device_dma_capable(dev)) 137 if (!is_device_dma_capable(dev))
133 return NULL; 138 return NULL;
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e614fe71..761249e396fe 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -61,6 +61,12 @@ struct e820map {
61 struct e820entry map[E820_X_MAX]; 61 struct e820entry map[E820_X_MAX];
62}; 62};
63 63
64#define ISA_START_ADDRESS 0xa0000
65#define ISA_END_ADDRESS 0x100000
66
67#define BIOS_BEGIN 0x000a0000
68#define BIOS_END 0x00100000
69
64#ifdef __KERNEL__ 70#ifdef __KERNEL__
65/* see comment in arch/x86/kernel/e820.c */ 71/* see comment in arch/x86/kernel/e820.c */
66extern struct e820map e820; 72extern struct e820map e820;
@@ -126,15 +132,18 @@ extern void e820_reserve_resources(void);
126extern void e820_reserve_resources_late(void); 132extern void e820_reserve_resources_late(void);
127extern void setup_memory_map(void); 133extern void setup_memory_map(void);
128extern char *default_machine_specific_memory_setup(void); 134extern char *default_machine_specific_memory_setup(void);
129#endif /* __KERNEL__ */
130#endif /* __ASSEMBLY__ */
131 135
132#define ISA_START_ADDRESS 0xa0000 136/*
133#define ISA_END_ADDRESS 0x100000 137 * Returns true iff the specified range [s,e) is completely contained inside
134#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) 138 * the ISA region.
139 */
140static inline bool is_ISA_range(u64 s, u64 e)
141{
142 return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
143}
135 144
136#define BIOS_BEGIN 0x000a0000 145#endif /* __KERNEL__ */
137#define BIOS_END 0x00100000 146#endif /* __ASSEMBLY__ */
138 147
139#ifdef __KERNEL__ 148#ifdef __KERNEL__
140#include <linux/ioport.h> 149#include <linux/ioport.h>
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 456a304b8172..b4501ee223ad 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -157,19 +157,6 @@ do { \
157 157
158#define compat_elf_check_arch(x) elf_check_arch_ia32(x) 158#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
159 159
160static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
161{
162 loadsegment(fs, 0);
163 loadsegment(ds, __USER32_DS);
164 loadsegment(es, __USER32_DS);
165 load_gs_index(0);
166 regs->ip = ip;
167 regs->sp = sp;
168 regs->flags = X86_EFLAGS_IF;
169 regs->cs = __USER32_CS;
170 regs->ss = __USER32_DS;
171}
172
173static inline void elf_common_init(struct thread_struct *t, 160static inline void elf_common_init(struct thread_struct *t,
174 struct pt_regs *regs, const u16 ds) 161 struct pt_regs *regs, const u16 ds)
175{ 162{
@@ -191,11 +178,8 @@ do { \
191#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
192 elf_common_init(&current->thread, regs, __USER_DS) 179 elf_common_init(&current->thread, regs, __USER_DS)
193 180
194#define compat_start_thread(regs, ip, sp) \ 181void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
195do { \ 182#define compat_start_thread start_thread_ia32
196 start_ia32_thread(regs, ip, sp); \
197 set_fs(USER_DS); \
198} while (0)
199 183
200#define COMPAT_SET_PERSONALITY(ex) \ 184#define COMPAT_SET_PERSONALITY(ex) \
201do { \ 185do { \
@@ -255,7 +239,6 @@ extern int force_personality32;
255#endif /* !CONFIG_X86_32 */ 239#endif /* !CONFIG_X86_32 */
256 240
257#define CORE_DUMP_USE_REGSET 241#define CORE_DUMP_USE_REGSET
258#define USE_ELF_CORE_DUMP
259#define ELF_EXEC_PAGESIZE 4096 242#define ELF_EXEC_PAGESIZE 4096
260 243
261/* This is the location that an ET_DYN program is loaded if exec'ed. Typical 244/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index f5693c81a1db..8e8ec663a98f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
34 smp_invalidate_interrupt) 34 smp_invalidate_interrupt)
35#endif 35#endif
36 36
37BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) 37BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
38 38
39/* 39/*
40 * every pentium local APIC has two 'local interrupts', with a 40 * every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa409d8..4ac5b0f33fc1 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
35extern int gart_iommu_aperture_disabled; 35extern int gart_iommu_aperture_disabled;
36 36
37extern void early_gart_iommu_check(void); 37extern void early_gart_iommu_check(void);
38extern void gart_iommu_init(void); 38extern int gart_iommu_init(void);
39extern void gart_iommu_shutdown(void);
40extern void __init gart_parse_options(char *); 39extern void __init gart_parse_options(char *);
41extern void gart_iommu_hole_init(void); 40extern void gart_iommu_hole_init(void);
42 41
@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
48static inline void early_gart_iommu_check(void) 47static inline void early_gart_iommu_check(void)
49{ 48{
50} 49}
51static inline void gart_iommu_init(void)
52{
53}
54static inline void gart_iommu_shutdown(void)
55{
56}
57static inline void gart_parse_options(char *options) 50static inline void gart_parse_options(char *options)
58{ 51{
59} 52}
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
index ad3c2ed75481..7cd73552a4e8 100644
--- a/arch/x86/include/asm/geode.h
+++ b/arch/x86/include/asm/geode.h
@@ -12,160 +12,7 @@
12 12
13#include <asm/processor.h> 13#include <asm/processor.h>
14#include <linux/io.h> 14#include <linux/io.h>
15 15#include <linux/cs5535.h>
16/* Generic southbridge functions */
17
18#define GEODE_DEV_PMS 0
19#define GEODE_DEV_ACPI 1
20#define GEODE_DEV_GPIO 2
21#define GEODE_DEV_MFGPT 3
22
23extern int geode_get_dev_base(unsigned int dev);
24
25/* Useful macros */
26#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
27#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
28#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
29#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
30
31/* MSRS */
32
33#define MSR_GLIU_P2D_RO0 0x10000029
34
35#define MSR_LX_GLD_MSR_CONFIG 0x48002001
36#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
37 * sheet has the wrong value */
38#define MSR_GLCP_SYS_RSTPLL 0x4C000014
39#define MSR_GLCP_DOTPLL 0x4C000015
40
41#define MSR_LBAR_SMB 0x5140000B
42#define MSR_LBAR_GPIO 0x5140000C
43#define MSR_LBAR_MFGPT 0x5140000D
44#define MSR_LBAR_ACPI 0x5140000E
45#define MSR_LBAR_PMS 0x5140000F
46
47#define MSR_DIVIL_SOFT_RESET 0x51400017
48
49#define MSR_PIC_YSEL_LOW 0x51400020
50#define MSR_PIC_YSEL_HIGH 0x51400021
51#define MSR_PIC_ZSEL_LOW 0x51400022
52#define MSR_PIC_ZSEL_HIGH 0x51400023
53#define MSR_PIC_IRQM_LPC 0x51400025
54
55#define MSR_MFGPT_IRQ 0x51400028
56#define MSR_MFGPT_NR 0x51400029
57#define MSR_MFGPT_SETUP 0x5140002B
58
59#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
60
61#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
62#define MSR_GX_MSR_PADSEL 0xC0002011
63
64/* Resource Sizes */
65
66#define LBAR_GPIO_SIZE 0xFF
67#define LBAR_MFGPT_SIZE 0x40
68#define LBAR_ACPI_SIZE 0x40
69#define LBAR_PMS_SIZE 0x80
70
71/* ACPI registers (PMS block) */
72
73/*
74 * PM1_EN is only valid when VSA is enabled for 16 bit reads.
75 * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
76 * with a 32 bit read at offset 0x0
77 */
78
79#define PM1_STS 0x00
80#define PM1_EN 0x02
81#define PM1_CNT 0x08
82#define PM2_CNT 0x0C
83#define PM_TMR 0x10
84#define PM_GPE0_STS 0x18
85#define PM_GPE0_EN 0x1C
86
87/* PMC registers (PMS block) */
88
89#define PM_SSD 0x00
90#define PM_SCXA 0x04
91#define PM_SCYA 0x08
92#define PM_OUT_SLPCTL 0x0C
93#define PM_SCLK 0x10
94#define PM_SED 0x1
95#define PM_SCXD 0x18
96#define PM_SCYD 0x1C
97#define PM_IN_SLPCTL 0x20
98#define PM_WKD 0x30
99#define PM_WKXD 0x34
100#define PM_RD 0x38
101#define PM_WKXA 0x3C
102#define PM_FSD 0x40
103#define PM_TSD 0x44
104#define PM_PSD 0x48
105#define PM_NWKD 0x4C
106#define PM_AWKD 0x50
107#define PM_SSC 0x54
108
109/* VSA2 magic values */
110
111#define VSA_VRC_INDEX 0xAC1C
112#define VSA_VRC_DATA 0xAC1E
113#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
114#define VSA_VR_SIGNATURE 0x0003
115#define VSA_VR_MEM_SIZE 0x0200
116#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
117#define GSW_VSA_SIG 0x534d /* General Software signature */
118/* GPIO */
119
120#define GPIO_OUTPUT_VAL 0x00
121#define GPIO_OUTPUT_ENABLE 0x04
122#define GPIO_OUTPUT_OPEN_DRAIN 0x08
123#define GPIO_OUTPUT_INVERT 0x0C
124#define GPIO_OUTPUT_AUX1 0x10
125#define GPIO_OUTPUT_AUX2 0x14
126#define GPIO_PULL_UP 0x18
127#define GPIO_PULL_DOWN 0x1C
128#define GPIO_INPUT_ENABLE 0x20
129#define GPIO_INPUT_INVERT 0x24
130#define GPIO_INPUT_FILTER 0x28
131#define GPIO_INPUT_EVENT_COUNT 0x2C
132#define GPIO_READ_BACK 0x30
133#define GPIO_INPUT_AUX1 0x34
134#define GPIO_EVENTS_ENABLE 0x38
135#define GPIO_LOCK_ENABLE 0x3C
136#define GPIO_POSITIVE_EDGE_EN 0x40
137#define GPIO_NEGATIVE_EDGE_EN 0x44
138#define GPIO_POSITIVE_EDGE_STS 0x48
139#define GPIO_NEGATIVE_EDGE_STS 0x4C
140
141#define GPIO_MAP_X 0xE0
142#define GPIO_MAP_Y 0xE4
143#define GPIO_MAP_Z 0xE8
144#define GPIO_MAP_W 0xEC
145
146static inline u32 geode_gpio(unsigned int nr)
147{
148 BUG_ON(nr > 28);
149 return 1 << nr;
150}
151
152extern void geode_gpio_set(u32, unsigned int);
153extern void geode_gpio_clear(u32, unsigned int);
154extern int geode_gpio_isset(u32, unsigned int);
155extern void geode_gpio_setup_event(unsigned int, int, int);
156extern void geode_gpio_set_irq(unsigned int, unsigned int);
157
158static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
159{
160 geode_gpio_setup_event(gpio, pair, 0);
161}
162
163static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
164{
165 geode_gpio_setup_event(gpio, pair, 1);
166}
167
168/* Specific geode tests */
169 16
170static inline int is_geode_gx(void) 17static inline int is_geode_gx(void)
171{ 18{
@@ -186,68 +33,4 @@ static inline int is_geode(void)
186 return (is_geode_gx() || is_geode_lx()); 33 return (is_geode_gx() || is_geode_lx());
187} 34}
188 35
189#ifdef CONFIG_MGEODE_LX
190extern int geode_has_vsa2(void);
191#else
192static inline int geode_has_vsa2(void)
193{
194 return 0;
195}
196#endif
197
198/* MFGPTs */
199
200#define MFGPT_MAX_TIMERS 8
201#define MFGPT_TIMER_ANY (-1)
202
203#define MFGPT_DOMAIN_WORKING 1
204#define MFGPT_DOMAIN_STANDBY 2
205#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
206
207#define MFGPT_CMP1 0
208#define MFGPT_CMP2 1
209
210#define MFGPT_EVENT_IRQ 0
211#define MFGPT_EVENT_NMI 1
212#define MFGPT_EVENT_RESET 3
213
214#define MFGPT_REG_CMP1 0
215#define MFGPT_REG_CMP2 2
216#define MFGPT_REG_COUNTER 4
217#define MFGPT_REG_SETUP 6
218
219#define MFGPT_SETUP_CNTEN (1 << 15)
220#define MFGPT_SETUP_CMP2 (1 << 14)
221#define MFGPT_SETUP_CMP1 (1 << 13)
222#define MFGPT_SETUP_SETUP (1 << 12)
223#define MFGPT_SETUP_STOPEN (1 << 11)
224#define MFGPT_SETUP_EXTEN (1 << 10)
225#define MFGPT_SETUP_REVEN (1 << 5)
226#define MFGPT_SETUP_CLKSEL (1 << 4)
227
228static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
229{
230 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
231 outw(value, base + reg + (timer * 8));
232}
233
234static inline u16 geode_mfgpt_read(int timer, u16 reg)
235{
236 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
237 return inw(base + reg + (timer * 8));
238}
239
240extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
241extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
242extern int geode_mfgpt_alloc_timer(int timer, int domain);
243
244#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
245#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
246
247#ifdef CONFIG_GEODE_MFGPT_TIMER
248extern int __init mfgpt_timer_setup(void);
249#else
250static inline int mfgpt_timer_setup(void) { return 0; }
251#endif
252
253#endif /* _ASM_X86_GEODE_H */ 36#endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f01043..0f8576427cfe 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,7 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs; 17 unsigned int apic_pending_irqs;
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
@@ -20,11 +20,11 @@ typedef struct {
20 unsigned int irq_call_count; 20 unsigned int irq_call_count;
21 unsigned int irq_tlb_count; 21 unsigned int irq_tlb_count;
22#endif 22#endif
23#ifdef CONFIG_X86_MCE 23#ifdef CONFIG_X86_THERMAL_VECTOR
24 unsigned int irq_thermal_count; 24 unsigned int irq_thermal_count;
25# ifdef CONFIG_X86_MCE_THRESHOLD 25#endif
26#ifdef CONFIG_X86_MCE_THRESHOLD
26 unsigned int irq_threshold_count; 27 unsigned int irq_threshold_count;
27# endif
28#endif 28#endif
29} ____cacheline_aligned irq_cpustat_t; 29} ____cacheline_aligned irq_cpustat_t;
30 30
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1c22cb05ad6a..5d89fd2a3690 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -65,11 +65,12 @@
65/* hpet memory map physical address */ 65/* hpet memory map physical address */
66extern unsigned long hpet_address; 66extern unsigned long hpet_address;
67extern unsigned long force_hpet_address; 67extern unsigned long force_hpet_address;
68extern u8 hpet_blockid;
68extern int hpet_force_user; 69extern int hpet_force_user;
69extern int is_hpet_enabled(void); 70extern int is_hpet_enabled(void);
70extern int hpet_enable(void); 71extern int hpet_enable(void);
71extern void hpet_disable(void); 72extern void hpet_disable(void);
72extern unsigned long hpet_readl(unsigned long a); 73extern unsigned int hpet_readl(unsigned int a);
73extern void force_hpet_resume(void); 74extern void force_hpet_resume(void);
74 75
75extern void hpet_msi_unmask(unsigned int irq); 76extern void hpet_msi_unmask(unsigned int irq);
@@ -78,9 +79,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
78extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); 79extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
79 80
80#ifdef CONFIG_PCI_MSI 81#ifdef CONFIG_PCI_MSI
81extern int arch_setup_hpet_msi(unsigned int irq); 82extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
82#else 83#else
83static inline int arch_setup_hpet_msi(unsigned int irq) 84static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
84{ 85{
85 return -EINVAL; 86 return -EINVAL;
86} 87}
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..0675a7c4c20e
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
1#ifndef _I386_HW_BREAKPOINT_H
2#define _I386_HW_BREAKPOINT_H
3
4#ifdef __KERNEL__
5#define __ARCH_HW_BREAKPOINT_H
6
7/*
8 * The name should probably be something dealt in
9 * a higher level. While dealing with the user
10 * (display/resolving)
11 */
12struct arch_hw_breakpoint {
13 char *name; /* Contains name of the symbol to set bkpt */
14 unsigned long address;
15 u8 len;
16 u8 type;
17};
18
19#include <linux/kdebug.h>
20#include <linux/percpu.h>
21#include <linux/list.h>
22
23/* Available HW breakpoint length encodings */
24#define X86_BREAKPOINT_LEN_1 0x40
25#define X86_BREAKPOINT_LEN_2 0x44
26#define X86_BREAKPOINT_LEN_4 0x4c
27#define X86_BREAKPOINT_LEN_EXECUTE 0x40
28
29#ifdef CONFIG_X86_64
30#define X86_BREAKPOINT_LEN_8 0x48
31#endif
32
33/* Available HW breakpoint type encodings */
34
35/* trigger on instruction execute */
36#define X86_BREAKPOINT_EXECUTE 0x80
37/* trigger on memory write */
38#define X86_BREAKPOINT_WRITE 0x81
39/* trigger on memory read or write */
40#define X86_BREAKPOINT_RW 0x83
41
42/* Total number of available HW breakpoint registers */
43#define HBP_NUM 4
44
45struct perf_event;
46struct pmu;
47
48extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
49extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
50 struct task_struct *tsk);
51extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
52 unsigned long val, void *data);
53
54
55int arch_install_hw_breakpoint(struct perf_event *bp);
56void arch_uninstall_hw_breakpoint(struct perf_event *bp);
57void hw_breakpoint_pmu_read(struct perf_event *bp);
58void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
59
60extern void
61arch_fill_perf_breakpoint(struct perf_event *bp);
62
63unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
64int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
65
66extern int arch_bp_generic_fields(int x86_len, int x86_type,
67 int *gen_len, int *gen_type);
68
69extern struct pmu perf_ops_bp;
70
71#endif /* __KERNEL__ */
72#endif /* _I386_HW_BREAKPOINT_H */
73
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ba180d93b08c..eeac829a0f44 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,7 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void x86_platform_ipi(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void); 32extern void perf_pending_interrupt(void);
33 33
@@ -79,14 +79,33 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
79 int ioapic, int ioapic_pin, 79 int ioapic, int ioapic_pin,
80 int trigger, int polarity) 80 int trigger, int polarity)
81{ 81{
82 irq_attr->ioapic = ioapic; 82 irq_attr->ioapic = ioapic;
83 irq_attr->ioapic_pin = ioapic_pin; 83 irq_attr->ioapic_pin = ioapic_pin;
84 irq_attr->trigger = trigger; 84 irq_attr->trigger = trigger;
85 irq_attr->polarity = polarity; 85 irq_attr->polarity = polarity;
86} 86}
87 87
88extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, 88/*
89 struct io_apic_irq_attr *irq_attr); 89 * This is performance-critical, we want to do it O(1)
90 *
91 * Most irqs are mapped 1:1 with pins.
92 */
93struct irq_cfg {
94 struct irq_pin_list *irq_2_pin;
95 cpumask_var_t domain;
96 cpumask_var_t old_domain;
97 u8 vector;
98 u8 move_in_progress : 1;
99};
100
101extern struct irq_cfg *irq_cfg(unsigned int);
102extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
103extern void send_cleanup_vector(struct irq_cfg *);
104
105struct irq_desc;
106extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
107 unsigned int *dest_id);
108extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
90extern void setup_ioapic_dest(void); 109extern void setup_ioapic_dest(void);
91 110
92extern void enable_IO_APIC(void); 111extern void enable_IO_APIC(void);
@@ -101,7 +120,7 @@ extern void eisa_set_level_irq(unsigned int irq);
101/* SMP */ 120/* SMP */
102extern void smp_apic_timer_interrupt(struct pt_regs *); 121extern void smp_apic_timer_interrupt(struct pt_regs *);
103extern void smp_spurious_interrupt(struct pt_regs *); 122extern void smp_spurious_interrupt(struct pt_regs *);
104extern void smp_generic_interrupt(struct pt_regs *); 123extern void smp_x86_platform_ipi(struct pt_regs *);
105extern void smp_error_interrupt(struct pt_regs *); 124extern void smp_error_interrupt(struct pt_regs *);
106#ifdef CONFIG_X86_IO_APIC 125#ifdef CONFIG_X86_IO_APIC
107extern asmlinkage void smp_irq_move_cleanup_interrupt(void); 126extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb758f2..ebfb8a9e11f7 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
10#ifndef _ASM_X86_I387_H 10#ifndef _ASM_X86_I387_H
11#define _ASM_X86_I387_H 11#define _ASM_X86_I387_H
12 12
13#ifndef __ASSEMBLY__
14
13#include <linux/sched.h> 15#include <linux/sched.h>
14#include <linux/kernel_stat.h> 16#include <linux/kernel_stat.h>
15#include <linux/regset.h> 17#include <linux/regset.h>
@@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
411 } 413 }
412} 414}
413 415
416#endif /* __ASSEMBLY__ */
417
418#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
419#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
420
414#endif /* _ASM_X86_I387_H */ 421#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 000000000000..205b063e3e32
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,220 @@
1#ifndef _ASM_X86_INAT_H
2#define _ASM_X86_INAT_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23#include <asm/inat_types.h>
24
25/*
26 * Internal bits. Don't use bitmasks directly, because these bits are
27 * unstable. You should use checking functions.
28 */
29
30#define INAT_OPCODE_TABLE_SIZE 256
31#define INAT_GROUP_TABLE_SIZE 8
32
33/* Legacy last prefixes */
34#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
35#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
36#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
37/* Other Legacy prefixes */
38#define INAT_PFX_LOCK 4 /* 0xF0 */
39#define INAT_PFX_CS 5 /* 0x2E */
40#define INAT_PFX_DS 6 /* 0x3E */
41#define INAT_PFX_ES 7 /* 0x26 */
42#define INAT_PFX_FS 8 /* 0x64 */
43#define INAT_PFX_GS 9 /* 0x65 */
44#define INAT_PFX_SS 10 /* 0x36 */
45#define INAT_PFX_ADDRSZ 11 /* 0x67 */
46/* x86-64 REX prefix */
47#define INAT_PFX_REX 12 /* 0x4X */
48/* AVX VEX prefixes */
49#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
50#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
51
52#define INAT_LSTPFX_MAX 3
53#define INAT_LGCPFX_MAX 11
54
55/* Immediate size */
56#define INAT_IMM_BYTE 1
57#define INAT_IMM_WORD 2
58#define INAT_IMM_DWORD 3
59#define INAT_IMM_QWORD 4
60#define INAT_IMM_PTR 5
61#define INAT_IMM_VWORD32 6
62#define INAT_IMM_VWORD 7
63
64/* Legacy prefix */
65#define INAT_PFX_OFFS 0
66#define INAT_PFX_BITS 4
67#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
68#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
69/* Escape opcodes */
70#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
71#define INAT_ESC_BITS 2
72#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
73#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
74/* Group opcodes (1-16) */
75#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
76#define INAT_GRP_BITS 5
77#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
78#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
79/* Immediates */
80#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
81#define INAT_IMM_BITS 3
82#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
83/* Flags */
84#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
85#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
86#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
87#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
88#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
89#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
90#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
91#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
92/* Attribute making macros for attribute tables */
93#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
94#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
95#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
96#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
97
98/* Attribute search APIs */
99extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
100extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
101 insn_byte_t last_pfx,
102 insn_attr_t esc_attr);
103extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
104 insn_byte_t last_pfx,
105 insn_attr_t esc_attr);
106extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
107 insn_byte_t vex_m,
108 insn_byte_t vex_pp);
109
110/* Attribute checking functions */
111static inline int inat_is_legacy_prefix(insn_attr_t attr)
112{
113 attr &= INAT_PFX_MASK;
114 return attr && attr <= INAT_LGCPFX_MAX;
115}
116
117static inline int inat_is_address_size_prefix(insn_attr_t attr)
118{
119 return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
120}
121
122static inline int inat_is_operand_size_prefix(insn_attr_t attr)
123{
124 return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
125}
126
127static inline int inat_is_rex_prefix(insn_attr_t attr)
128{
129 return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
130}
131
132static inline int inat_last_prefix_id(insn_attr_t attr)
133{
134 if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
135 return 0;
136 else
137 return attr & INAT_PFX_MASK;
138}
139
140static inline int inat_is_vex_prefix(insn_attr_t attr)
141{
142 attr &= INAT_PFX_MASK;
143 return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
144}
145
146static inline int inat_is_vex3_prefix(insn_attr_t attr)
147{
148 return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
149}
150
151static inline int inat_is_escape(insn_attr_t attr)
152{
153 return attr & INAT_ESC_MASK;
154}
155
156static inline int inat_escape_id(insn_attr_t attr)
157{
158 return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
159}
160
161static inline int inat_is_group(insn_attr_t attr)
162{
163 return attr & INAT_GRP_MASK;
164}
165
166static inline int inat_group_id(insn_attr_t attr)
167{
168 return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
169}
170
171static inline int inat_group_common_attribute(insn_attr_t attr)
172{
173 return attr & ~INAT_GRP_MASK;
174}
175
176static inline int inat_has_immediate(insn_attr_t attr)
177{
178 return attr & INAT_IMM_MASK;
179}
180
181static inline int inat_immediate_size(insn_attr_t attr)
182{
183 return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
184}
185
186static inline int inat_has_modrm(insn_attr_t attr)
187{
188 return attr & INAT_MODRM;
189}
190
191static inline int inat_is_force64(insn_attr_t attr)
192{
193 return attr & INAT_FORCE64;
194}
195
196static inline int inat_has_second_immediate(insn_attr_t attr)
197{
198 return attr & INAT_SCNDIMM;
199}
200
201static inline int inat_has_moffset(insn_attr_t attr)
202{
203 return attr & INAT_MOFFSET;
204}
205
206static inline int inat_has_variant(insn_attr_t attr)
207{
208 return attr & INAT_VARIANT;
209}
210
211static inline int inat_accept_vex(insn_attr_t attr)
212{
213 return attr & INAT_VEXOK;
214}
215
216static inline int inat_must_vex(insn_attr_t attr)
217{
218 return attr & INAT_VEXONLY;
219}
220#endif
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 000000000000..cb3c20ce39cf
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_INAT_TYPES_H
2#define _ASM_X86_INAT_TYPES_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23
24/* Instruction attributes */
25typedef unsigned int insn_attr_t;
26typedef unsigned char insn_byte_t;
27typedef signed int insn_value_t;
28
29#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 000000000000..96c2e0ad04ca
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,184 @@
1#ifndef _ASM_X86_INSN_H
2#define _ASM_X86_INSN_H
3/*
4 * x86 instruction analysis
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2009
21 */
22
23/* insn_attr_t is defined in inat.h */
24#include <asm/inat.h>
25
26struct insn_field {
27 union {
28 insn_value_t value;
29 insn_byte_t bytes[4];
30 };
31 /* !0 if we've run insn_get_xxx() for this field */
32 unsigned char got;
33 unsigned char nbytes;
34};
35
36struct insn {
37 struct insn_field prefixes; /*
38 * Prefixes
39 * prefixes.bytes[3]: last prefix
40 */
41 struct insn_field rex_prefix; /* REX prefix */
42 struct insn_field vex_prefix; /* VEX prefix */
43 struct insn_field opcode; /*
44 * opcode.bytes[0]: opcode1
45 * opcode.bytes[1]: opcode2
46 * opcode.bytes[2]: opcode3
47 */
48 struct insn_field modrm;
49 struct insn_field sib;
50 struct insn_field displacement;
51 union {
52 struct insn_field immediate;
53 struct insn_field moffset1; /* for 64bit MOV */
54 struct insn_field immediate1; /* for 64bit imm or off16/32 */
55 };
56 union {
57 struct insn_field moffset2; /* for 64bit MOV */
58 struct insn_field immediate2; /* for 64bit imm or seg16 */
59 };
60
61 insn_attr_t attr;
62 unsigned char opnd_bytes;
63 unsigned char addr_bytes;
64 unsigned char length;
65 unsigned char x86_64;
66
67 const insn_byte_t *kaddr; /* kernel address of insn to analyze */
68 const insn_byte_t *next_byte;
69};
70
71#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
72#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
73#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
74
75#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
76#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
77#define X86_SIB_BASE(sib) ((sib) & 0x07)
78
79#define X86_REX_W(rex) ((rex) & 8)
80#define X86_REX_R(rex) ((rex) & 4)
81#define X86_REX_X(rex) ((rex) & 2)
82#define X86_REX_B(rex) ((rex) & 1)
83
84/* VEX bit flags */
85#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
86#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
87#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
88#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
89#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
90/* VEX bit fields */
91#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
92#define X86_VEX2_M 1 /* VEX2.M always 1 */
93#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
94#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
95#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
96
97/* The last prefix is needed for two-byte and three-byte opcodes */
98static inline insn_byte_t insn_last_prefix(struct insn *insn)
99{
100 return insn->prefixes.bytes[3];
101}
102
103extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
104extern void insn_get_prefixes(struct insn *insn);
105extern void insn_get_opcode(struct insn *insn);
106extern void insn_get_modrm(struct insn *insn);
107extern void insn_get_sib(struct insn *insn);
108extern void insn_get_displacement(struct insn *insn);
109extern void insn_get_immediate(struct insn *insn);
110extern void insn_get_length(struct insn *insn);
111
112/* Attribute will be determined after getting ModRM (for opcode groups) */
113static inline void insn_get_attribute(struct insn *insn)
114{
115 insn_get_modrm(insn);
116}
117
118/* Instruction uses RIP-relative addressing */
119extern int insn_rip_relative(struct insn *insn);
120
121/* Init insn for kernel text */
122static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
123{
124#ifdef CONFIG_X86_64
125 insn_init(insn, kaddr, 1);
126#else /* CONFIG_X86_32 */
127 insn_init(insn, kaddr, 0);
128#endif
129}
130
131static inline int insn_is_avx(struct insn *insn)
132{
133 if (!insn->prefixes.got)
134 insn_get_prefixes(insn);
135 return (insn->vex_prefix.value != 0);
136}
137
138static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
139{
140 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
141 return X86_VEX2_M;
142 else
143 return X86_VEX3_M(insn->vex_prefix.bytes[1]);
144}
145
146static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
147{
148 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
149 return X86_VEX_P(insn->vex_prefix.bytes[1]);
150 else
151 return X86_VEX_P(insn->vex_prefix.bytes[2]);
152}
153
154/* Offset of each field from kaddr */
155static inline int insn_offset_rex_prefix(struct insn *insn)
156{
157 return insn->prefixes.nbytes;
158}
159static inline int insn_offset_vex_prefix(struct insn *insn)
160{
161 return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
162}
163static inline int insn_offset_opcode(struct insn *insn)
164{
165 return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
166}
167static inline int insn_offset_modrm(struct insn *insn)
168{
169 return insn_offset_opcode(insn) + insn->opcode.nbytes;
170}
171static inline int insn_offset_sib(struct insn *insn)
172{
173 return insn_offset_modrm(insn) + insn->modrm.nbytes;
174}
175static inline int insn_offset_displacement(struct insn *insn)
176{
177 return insn_offset_sib(insn) + insn->sib.nbytes;
178}
179static inline int insn_offset_immediate(struct insn *insn)
180{
181 return insn_offset_displacement(insn) + insn->displacement.nbytes;
182}
183
184#endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000000..14cf526091f9
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,150 @@
1/*
2 * Generate .byte code for some instructions not supported by old
3 * binutils.
4 */
5#ifndef X86_ASM_INST_H
6#define X86_ASM_INST_H
7
8#ifdef __ASSEMBLY__
9
10 .macro XMM_NUM opd xmm
11 .ifc \xmm,%xmm0
12 \opd = 0
13 .endif
14 .ifc \xmm,%xmm1
15 \opd = 1
16 .endif
17 .ifc \xmm,%xmm2
18 \opd = 2
19 .endif
20 .ifc \xmm,%xmm3
21 \opd = 3
22 .endif
23 .ifc \xmm,%xmm4
24 \opd = 4
25 .endif
26 .ifc \xmm,%xmm5
27 \opd = 5
28 .endif
29 .ifc \xmm,%xmm6
30 \opd = 6
31 .endif
32 .ifc \xmm,%xmm7
33 \opd = 7
34 .endif
35 .ifc \xmm,%xmm8
36 \opd = 8
37 .endif
38 .ifc \xmm,%xmm9
39 \opd = 9
40 .endif
41 .ifc \xmm,%xmm10
42 \opd = 10
43 .endif
44 .ifc \xmm,%xmm11
45 \opd = 11
46 .endif
47 .ifc \xmm,%xmm12
48 \opd = 12
49 .endif
50 .ifc \xmm,%xmm13
51 \opd = 13
52 .endif
53 .ifc \xmm,%xmm14
54 \opd = 14
55 .endif
56 .ifc \xmm,%xmm15
57 \opd = 15
58 .endif
59 .endm
60
61 .macro PFX_OPD_SIZE
62 .byte 0x66
63 .endm
64
65 .macro PFX_REX opd1 opd2
66 .if (\opd1 | \opd2) & 8
67 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
68 .endif
69 .endm
70
71 .macro MODRM mod opd1 opd2
72 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
73 .endm
74
75 .macro PSHUFB_XMM xmm1 xmm2
76 XMM_NUM pshufb_opd1 \xmm1
77 XMM_NUM pshufb_opd2 \xmm2
78 PFX_OPD_SIZE
79 PFX_REX pshufb_opd1 pshufb_opd2
80 .byte 0x0f, 0x38, 0x00
81 MODRM 0xc0 pshufb_opd1 pshufb_opd2
82 .endm
83
84 .macro PCLMULQDQ imm8 xmm1 xmm2
85 XMM_NUM clmul_opd1 \xmm1
86 XMM_NUM clmul_opd2 \xmm2
87 PFX_OPD_SIZE
88 PFX_REX clmul_opd1 clmul_opd2
89 .byte 0x0f, 0x3a, 0x44
90 MODRM 0xc0 clmul_opd1 clmul_opd2
91 .byte \imm8
92 .endm
93
94 .macro AESKEYGENASSIST rcon xmm1 xmm2
95 XMM_NUM aeskeygen_opd1 \xmm1
96 XMM_NUM aeskeygen_opd2 \xmm2
97 PFX_OPD_SIZE
98 PFX_REX aeskeygen_opd1 aeskeygen_opd2
99 .byte 0x0f, 0x3a, 0xdf
100 MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
101 .byte \rcon
102 .endm
103
104 .macro AESIMC xmm1 xmm2
105 XMM_NUM aesimc_opd1 \xmm1
106 XMM_NUM aesimc_opd2 \xmm2
107 PFX_OPD_SIZE
108 PFX_REX aesimc_opd1 aesimc_opd2
109 .byte 0x0f, 0x38, 0xdb
110 MODRM 0xc0 aesimc_opd1 aesimc_opd2
111 .endm
112
113 .macro AESENC xmm1 xmm2
114 XMM_NUM aesenc_opd1 \xmm1
115 XMM_NUM aesenc_opd2 \xmm2
116 PFX_OPD_SIZE
117 PFX_REX aesenc_opd1 aesenc_opd2
118 .byte 0x0f, 0x38, 0xdc
119 MODRM 0xc0 aesenc_opd1 aesenc_opd2
120 .endm
121
122 .macro AESENCLAST xmm1 xmm2
123 XMM_NUM aesenclast_opd1 \xmm1
124 XMM_NUM aesenclast_opd2 \xmm2
125 PFX_OPD_SIZE
126 PFX_REX aesenclast_opd1 aesenclast_opd2
127 .byte 0x0f, 0x38, 0xdd
128 MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
129 .endm
130
131 .macro AESDEC xmm1 xmm2
132 XMM_NUM aesdec_opd1 \xmm1
133 XMM_NUM aesdec_opd2 \xmm2
134 PFX_OPD_SIZE
135 PFX_REX aesdec_opd1 aesdec_opd2
136 .byte 0x0f, 0x38, 0xde
137 MODRM 0xc0 aesdec_opd1 aesdec_opd2
138 .endm
139
140 .macro AESDECLAST xmm1 xmm2
141 XMM_NUM aesdeclast_opd1 \xmm1
142 XMM_NUM aesdeclast_opd2 \xmm2
143 PFX_OPD_SIZE
144 PFX_REX aesdeclast_opd1 aesdeclast_opd2
145 .byte 0x0f, 0x38, 0xdf
146 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
147 .endm
148#endif
149
150#endif
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21bbee6c..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_IOMMU_H 1#ifndef _ASM_X86_IOMMU_H
2#define _ASM_X86_IOMMU_H 2#define _ASM_X86_IOMMU_H
3 3
4extern void pci_iommu_shutdown(void);
5extern void no_iommu_init(void);
6extern struct dma_map_ops nommu_dma_ops; 4extern struct dma_map_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
8extern int iommu_detected; 6extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ddda6cbed6f4..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -34,9 +34,10 @@ static inline int irq_canonicalize(int irq)
34#ifdef CONFIG_HOTPLUG_CPU 34#ifdef CONFIG_HOTPLUG_CPU
35#include <linux/cpumask.h> 35#include <linux/cpumask.h>
36extern void fixup_irqs(void); 36extern void fixup_irqs(void);
37extern void irq_force_complete_move(int);
37#endif 38#endif
38 39
39extern void (*generic_interrupt_extension)(void); 40extern void (*x86_platform_ipi_callback)(void);
40extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
41extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
42 43
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5b21f0ec3df2..4611f085cd43 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,14 +106,14 @@
106/* 106/*
107 * Generic system vector for platform specific use 107 * Generic system vector for platform specific use
108 */ 108 */
109#define GENERIC_INTERRUPT_VECTOR 0xed 109#define X86_PLATFORM_IPI_VECTOR 0xed
110 110
111/* 111/*
112 * Performance monitoring pending work vector: 112 * Performance monitoring pending work vector:
113 */ 113 */
114#define LOCAL_PENDING_VECTOR 0xec 114#define LOCAL_PENDING_VECTOR 0xec
115 115
116#define UV_BAU_MESSAGE 0xec 116#define UV_BAU_MESSAGE 0xea
117 117
118/* 118/*
119 * Self IPI vector for machine checks 119 * Self IPI vector for machine checks
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b58e5f..f70e60071fe8 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -4,13 +4,16 @@
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
6extern struct pci_device_id k8_nb_ids[]; 6extern struct pci_device_id k8_nb_ids[];
7struct bootnode;
7 8
8extern int early_is_k8_nb(u32 value); 9extern int early_is_k8_nb(u32 value);
9extern struct pci_dev **k8_northbridges; 10extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges; 11extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void); 12extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 13extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 14extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void);
14 17
15#ifdef CONFIG_K8_NB 18#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node) 19static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..f46b79f6c16c 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE 20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS
22 24
23/* Architectural interrupt line count. */ 25/* Architectural interrupt line count. */
24#define KVM_NR_INTERRUPTS 256 26#define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
79#define KVM_IRQCHIP_PIC_MASTER 0 81#define KVM_IRQCHIP_PIC_MASTER 0
80#define KVM_IRQCHIP_PIC_SLAVE 1 82#define KVM_IRQCHIP_PIC_SLAVE 1
81#define KVM_IRQCHIP_IOAPIC 2 83#define KVM_IRQCHIP_IOAPIC 2
84#define KVM_NR_IRQCHIPS 3
82 85
83/* for KVM_GET_REGS and KVM_SET_REGS */ 86/* for KVM_GET_REGS and KVM_SET_REGS */
84struct kvm_regs { 87struct kvm_regs {
@@ -250,4 +253,35 @@ struct kvm_reinject_control {
250 __u8 pit_reinject; 253 __u8 pit_reinject;
251 __u8 reserved[31]; 254 __u8 reserved[31];
252}; 255};
256
257/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
258#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
259#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
260
261/* for KVM_GET/SET_VCPU_EVENTS */
262struct kvm_vcpu_events {
263 struct {
264 __u8 injected;
265 __u8 nr;
266 __u8 has_error_code;
267 __u8 pad;
268 __u32 error_code;
269 } exception;
270 struct {
271 __u8 injected;
272 __u8 nr;
273 __u8 soft;
274 __u8 pad;
275 } interrupt;
276 struct {
277 __u8 injected;
278 __u8 pending;
279 __u8 masked;
280 __u8 pad;
281 } nmi;
282 __u32 sipi_vector;
283 __u32 flags;
284 __u32 reserved[10];
285};
286
253#endif /* _ASM_X86_KVM_H */ 287#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7c18e1230f54 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -129,7 +129,7 @@ struct decode_cache {
129 u8 seg_override; 129 u8 seg_override;
130 unsigned int d; 130 unsigned int d;
131 unsigned long regs[NR_VCPU_REGS]; 131 unsigned long regs[NR_VCPU_REGS];
132 unsigned long eip; 132 unsigned long eip, eip_orig;
133 /* modrm */ 133 /* modrm */
134 u8 modrm; 134 u8 modrm;
135 u8 modrm_mod; 135 u8 modrm_mod;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3be000435fad..4f865e8b8540 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -354,7 +354,6 @@ struct kvm_vcpu_arch {
354 unsigned int time_offset; 354 unsigned int time_offset;
355 struct page *time_page; 355 struct page *time_page;
356 356
357 bool singlestep; /* guest is single stepped by KVM */
358 bool nmi_pending; 357 bool nmi_pending;
359 bool nmi_injected; 358 bool nmi_injected;
360 359
@@ -371,6 +370,10 @@ struct kvm_vcpu_arch {
371 u64 mcg_status; 370 u64 mcg_status;
372 u64 mcg_ctl; 371 u64 mcg_ctl;
373 u64 *mce_banks; 372 u64 *mce_banks;
373
374 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs;
376 unsigned long singlestep_rip;
374}; 377};
375 378
376struct kvm_mem_alias { 379struct kvm_mem_alias {
@@ -397,7 +400,6 @@ struct kvm_arch{
397 struct kvm_pic *vpic; 400 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 401 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 402 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 unsigned int tss_addr; 405 unsigned int tss_addr;
@@ -410,8 +412,10 @@ struct kvm_arch{
410 gpa_t ept_identity_map_addr; 412 gpa_t ept_identity_map_addr;
411 413
412 unsigned long irq_sources_bitmap; 414 unsigned long irq_sources_bitmap;
413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
414 u64 vm_init_tsc; 415 u64 vm_init_tsc;
416 s64 kvmclock_offset;
417
418 struct kvm_xen_hvm_config xen_hvm_config;
415}; 419};
416 420
417struct kvm_vm_stat { 421struct kvm_vm_stat {
@@ -461,7 +465,7 @@ struct descriptor_table {
461struct kvm_x86_ops { 465struct kvm_x86_ops {
462 int (*cpu_has_kvm_support)(void); /* __init */ 466 int (*cpu_has_kvm_support)(void); /* __init */
463 int (*disabled_by_bios)(void); /* __init */ 467 int (*disabled_by_bios)(void); /* __init */
464 void (*hardware_enable)(void *dummy); /* __init */ 468 int (*hardware_enable)(void *dummy);
465 void (*hardware_disable)(void *dummy); 469 void (*hardware_disable)(void *dummy);
466 void (*check_processor_compatibility)(void *rtn); 470 void (*check_processor_compatibility)(void *rtn);
467 int (*hardware_setup)(void); /* __init */ 471 int (*hardware_setup)(void); /* __init */
@@ -477,8 +481,8 @@ struct kvm_x86_ops {
477 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 481 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
478 void (*vcpu_put)(struct kvm_vcpu *vcpu); 482 void (*vcpu_put)(struct kvm_vcpu *vcpu);
479 483
480 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 484 void (*set_guest_debug)(struct kvm_vcpu *vcpu,
481 struct kvm_guest_debug *dbg); 485 struct kvm_guest_debug *dbg);
482 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 486 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
483 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 487 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
484 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 488 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -506,8 +510,8 @@ struct kvm_x86_ops {
506 510
507 void (*tlb_flush)(struct kvm_vcpu *vcpu); 511 void (*tlb_flush)(struct kvm_vcpu *vcpu);
508 512
509 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 513 void (*run)(struct kvm_vcpu *vcpu);
510 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 514 int (*handle_exit)(struct kvm_vcpu *vcpu);
511 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 515 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
512 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 516 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
513 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 517 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,6 +523,8 @@ struct kvm_x86_ops {
519 bool has_error_code, u32 error_code); 523 bool has_error_code, u32 error_code);
520 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 524 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 525 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
526 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
527 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
522 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 528 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 529 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 530 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -568,7 +574,7 @@ enum emulation_result {
568#define EMULTYPE_NO_DECODE (1 << 0) 574#define EMULTYPE_NO_DECODE (1 << 0)
569#define EMULTYPE_TRAP_UD (1 << 1) 575#define EMULTYPE_TRAP_UD (1 << 1)
570#define EMULTYPE_SKIP (1 << 2) 576#define EMULTYPE_SKIP (1 << 2)
571int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 577int emulate_instruction(struct kvm_vcpu *vcpu,
572 unsigned long cr2, u16 error_code, int emulation_type); 578 unsigned long cr2, u16 error_code, int emulation_type);
573void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
574void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +591,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
585 591
586struct x86_emulate_ctxt; 592struct x86_emulate_ctxt;
587 593
588int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 594int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
589 int size, unsigned port); 595 int size, unsigned port);
590int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 596int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
591 int size, unsigned long count, int down, 597 int size, unsigned long count, int down,
592 gva_t address, int rep, unsigned port); 598 gva_t address, int rep, unsigned port);
593void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 599void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -616,6 +622,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
616int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 622int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
617int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 623int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
618 624
625unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
626void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
627
619void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 628void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 629void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 630void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -796,9 +805,13 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
796#define KVM_ARCH_WANT_MMU_NOTIFIER 805#define KVM_ARCH_WANT_MMU_NOTIFIER
797int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 806int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
798int kvm_age_hva(struct kvm *kvm, unsigned long hva); 807int kvm_age_hva(struct kvm *kvm, unsigned long hva);
808void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
799int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 809int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
800int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 810int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
801int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 811int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
802int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 812int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
803 813
814void kvm_define_shared_msr(unsigned index, u32 msr);
815void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
816
804#endif /* _ASM_X86_KVM_HOST_H */ 817#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b608a64c5814..858baa061cfc 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -108,6 +108,8 @@ struct mce_log {
108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) 108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) 109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
110 110
111extern struct atomic_notifier_head x86_mce_decoder_chain;
112
111#ifdef __KERNEL__ 113#ifdef __KERNEL__
112 114
113#include <linux/percpu.h> 115#include <linux/percpu.h>
@@ -118,9 +120,11 @@ extern int mce_disabled;
118extern int mce_p5_enabled; 120extern int mce_p5_enabled;
119 121
120#ifdef CONFIG_X86_MCE 122#ifdef CONFIG_X86_MCE
121void mcheck_init(struct cpuinfo_x86 *c); 123int mcheck_init(void);
124void mcheck_cpu_init(struct cpuinfo_x86 *c);
122#else 125#else
123static inline void mcheck_init(struct cpuinfo_x86 *c) {} 126static inline int mcheck_init(void) { return 0; }
127static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
124#endif 128#endif
125 129
126#ifdef CONFIG_X86_ANCIENT_MCE 130#ifdef CONFIG_X86_ANCIENT_MCE
@@ -133,6 +137,8 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
133static inline void enable_p5_mce(void) {} 137static inline void enable_p5_mce(void) {}
134#endif 138#endif
135 139
140extern void (*x86_mce_decode_callback)(struct mce *m);
141
136void mce_setup(struct mce *m); 142void mce_setup(struct mce *m);
137void mce_log(struct mce *m); 143void mce_log(struct mce *m);
138DECLARE_PER_CPU(struct sys_device, mce_dev); 144DECLARE_PER_CPU(struct sys_device, mce_dev);
@@ -212,5 +218,11 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
212 218
213void mce_log_therm_throt_event(__u64 status); 219void mce_log_therm_throt_event(__u64 status);
214 220
221#ifdef CONFIG_X86_THERMAL_VECTOR
222extern void mcheck_intel_therm_init(void);
223#else
224static inline void mcheck_intel_therm_init(void) { }
225#endif
226
215#endif /* __KERNEL__ */ 227#endif /* __KERNEL__ */
216#endif /* _ASM_X86_MCE_H */ 228#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..c24ca9a56458 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -12,6 +12,8 @@ struct device;
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; 12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13 13
14struct microcode_ops { 14struct microcode_ops {
15 void (*init)(struct device *device);
16 void (*fini)(void);
15 enum ucode_state (*request_microcode_user) (int cpu, 17 enum ucode_state (*request_microcode_user) (int cpu,
16 const void __user *buf, size_t size); 18 const void __user *buf, size_t size);
17 19
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ede6998bd92c..91df7c51806c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
47/* 47/*
48 * generic node memory support, the following assumptions apply: 48 * generic node memory support, the following assumptions apply:
49 * 49 *
50 * 1) memory comes in 64Mb contigious chunks which are either present or not 50 * 1) memory comes in 64Mb contiguous chunks which are either present or not
51 * 2) we will not have more than 64Gb in total 51 * 2) we will not have more than 64Gb in total
52 * 52 *
53 * for now assume that 64Gb is max amount of RAM for whole system 53 * for now assume that 64Gb is max amount of RAM for whole system
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 79c94500c0bb..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
71 71
72static inline void find_smp_config(void) 72static inline void find_smp_config(void)
73{ 73{
74 x86_init.mpparse.find_smp_config(1); 74 x86_init.mpparse.find_smp_config();
75}
76
77static inline void early_find_smp_config(void)
78{
79 x86_init.mpparse.find_smp_config(0);
80} 75}
81 76
82#ifdef CONFIG_X86_MPPARSE 77#ifdef CONFIG_X86_MPPARSE
@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
89# else 84# else
90# define default_mpc_oem_bus_info NULL 85# define default_mpc_oem_bus_info NULL
91# endif 86# endif
92extern void default_find_smp_config(unsigned int reserve); 87extern void default_find_smp_config(void);
93extern void default_get_smp_config(unsigned int early); 88extern void default_get_smp_config(unsigned int early);
94#else 89#else
95static inline void early_reserve_e820_mpc_new(void) { } 90static inline void early_reserve_e820_mpc_new(void) { }
@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
97#define default_mpc_apic_id NULL 92#define default_mpc_apic_id NULL
98#define default_smp_read_mpc_oem NULL 93#define default_smp_read_mpc_oem NULL
99#define default_mpc_oem_bus_info NULL 94#define default_mpc_oem_bus_info NULL
100#define default_find_smp_config x86_init_uint_noop 95#define default_find_smp_config x86_init_noop
101#define default_get_smp_config x86_init_uint_noop 96#define default_get_smp_config x86_init_uint_noop
102#endif 97#endif
103 98
@@ -163,14 +158,16 @@ typedef struct physid_mask physid_mask_t;
163#define physids_shift_left(d, s, n) \ 158#define physids_shift_left(d, s, n) \
164 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) 159 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
165 160
166#define physids_coerce(map) ((map).mask[0]) 161static inline unsigned long physids_coerce(physid_mask_t *map)
162{
163 return map->mask[0];
164}
167 165
168#define physids_promote(physids) \ 166static inline void physids_promote(unsigned long physids, physid_mask_t *map)
169 ({ \ 167{
170 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ 168 physids_clear(*map);
171 __physid_mask.mask[0] = physids; \ 169 map->mask[0] = physids;
172 __physid_mask; \ 170}
173 })
174 171
175/* Note: will create very large stack frames if physid_mask_t is big */ 172/* Note: will create very large stack frames if physid_mask_t is big */
176#define physid_mask_of_physid(physid) \ 173#define physid_mask_of_physid(physid) \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4ffe09b2ad75..1cd58cdbc03f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -12,6 +12,7 @@
12#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ 12#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
13#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ 13#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
14#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ 14#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
15#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */
15 16
16/* EFER bits: */ 17/* EFER bits: */
17#define _EFER_SCE 0 /* SYSCALL/SYSRET */ 18#define _EFER_SCE 0 /* SYSCALL/SYSRET */
@@ -123,6 +124,7 @@
123#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 124#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
124#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff 125#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
125#define FAM10H_MMIO_CONF_BASE_SHIFT 20 126#define FAM10H_MMIO_CONF_BASE_SHIFT 20
127#define MSR_FAM10H_NODE_ID 0xc001100c
126 128
127/* K8 MSRs */ 129/* K8 MSRs */
128#define MSR_K8_TOP_MEM1 0xc001001a 130#define MSR_K8_TOP_MEM1 0xc001001a
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 7e2b6ba962ff..c5bc4c2d33f5 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -27,6 +27,18 @@ struct msr {
27 }; 27 };
28}; 28};
29 29
30struct msr_info {
31 u32 msr_no;
32 struct msr reg;
33 struct msr *msrs;
34 int err;
35};
36
37struct msr_regs_info {
38 u32 *regs;
39 int err;
40};
41
30static inline unsigned long long native_read_tscp(unsigned int *aux) 42static inline unsigned long long native_read_tscp(unsigned int *aux)
31{ 43{
32 unsigned long low, high; 44 unsigned long low, high;
@@ -240,15 +252,18 @@ do { \
240#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ 252#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
241 (u32)((val) >> 32)) 253 (u32)((val) >> 32))
242 254
243#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2)) 255#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
256
257#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
244 258
245#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) 259struct msr *msrs_alloc(void);
260void msrs_free(struct msr *msrs);
246 261
247#ifdef CONFIG_SMP 262#ifdef CONFIG_SMP
248int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 263int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
249int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 264int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
250void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); 265void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
251void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); 266void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
252int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 267int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
253int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 268int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
254int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); 269int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
@@ -264,12 +279,12 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
264 wrmsr(msr_no, l, h); 279 wrmsr(msr_no, l, h);
265 return 0; 280 return 0;
266} 281}
267static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no, 282static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
268 struct msr *msrs) 283 struct msr *msrs)
269{ 284{
270 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); 285 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
271} 286}
272static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no, 287static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
273 struct msr *msrs) 288 struct msr *msrs)
274{ 289{
275 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); 290 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 834a30295fab..3a57385d9fa7 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -120,7 +120,7 @@ extern int olpc_ec_mask_unset(uint8_t bits);
120 120
121/* GPIO assignments */ 121/* GPIO assignments */
122 122
123#define OLPC_GPIO_MIC_AC geode_gpio(1) 123#define OLPC_GPIO_MIC_AC 1
124#define OLPC_GPIO_DCON_IRQ geode_gpio(7) 124#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
125#define OLPC_GPIO_THRM_ALRM geode_gpio(10) 125#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
126#define OLPC_GPIO_SMB_CLK geode_gpio(14) 126#define OLPC_GPIO_SMB_CLK geode_gpio(14)
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..642fe34b36a2 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
49extern unsigned long init_memory_mapping(unsigned long start, 49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end); 50 unsigned long end);
51 51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); 52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
53 int acpi, int k8);
53extern void free_initmem(void); 54extern void free_initmem(void);
54 55
55#endif /* !__ASSEMBLY__ */ 56#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8aebcc41041d..dd59a85a918f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -731,34 +731,34 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
731 731
732#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) 732#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
733 733
734static inline int __raw_spin_is_locked(struct raw_spinlock *lock) 734static inline int arch_spin_is_locked(struct arch_spinlock *lock)
735{ 735{
736 return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); 736 return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
737} 737}
738 738
739static inline int __raw_spin_is_contended(struct raw_spinlock *lock) 739static inline int arch_spin_is_contended(struct arch_spinlock *lock)
740{ 740{
741 return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); 741 return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
742} 742}
743#define __raw_spin_is_contended __raw_spin_is_contended 743#define arch_spin_is_contended arch_spin_is_contended
744 744
745static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) 745static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
746{ 746{
747 PVOP_VCALL1(pv_lock_ops.spin_lock, lock); 747 PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
748} 748}
749 749
750static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock, 750static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
751 unsigned long flags) 751 unsigned long flags)
752{ 752{
753 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); 753 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
754} 754}
755 755
756static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) 756static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
757{ 757{
758 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); 758 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
759} 759}
760 760
761static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) 761static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
762{ 762{
763 PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); 763 PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
764} 764}
@@ -840,42 +840,22 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
840 840
841static inline unsigned long __raw_local_save_flags(void) 841static inline unsigned long __raw_local_save_flags(void)
842{ 842{
843 unsigned long f; 843 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
844
845 asm volatile(paravirt_alt(PARAVIRT_CALL)
846 : "=a"(f)
847 : paravirt_type(pv_irq_ops.save_fl),
848 paravirt_clobber(CLBR_EAX)
849 : "memory", "cc");
850 return f;
851} 844}
852 845
853static inline void raw_local_irq_restore(unsigned long f) 846static inline void raw_local_irq_restore(unsigned long f)
854{ 847{
855 asm volatile(paravirt_alt(PARAVIRT_CALL) 848 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
856 : "=a"(f)
857 : PV_FLAGS_ARG(f),
858 paravirt_type(pv_irq_ops.restore_fl),
859 paravirt_clobber(CLBR_EAX)
860 : "memory", "cc");
861} 849}
862 850
863static inline void raw_local_irq_disable(void) 851static inline void raw_local_irq_disable(void)
864{ 852{
865 asm volatile(paravirt_alt(PARAVIRT_CALL) 853 PVOP_VCALLEE0(pv_irq_ops.irq_disable);
866 :
867 : paravirt_type(pv_irq_ops.irq_disable),
868 paravirt_clobber(CLBR_EAX)
869 : "memory", "eax", "cc");
870} 854}
871 855
872static inline void raw_local_irq_enable(void) 856static inline void raw_local_irq_enable(void)
873{ 857{
874 asm volatile(paravirt_alt(PARAVIRT_CALL) 858 PVOP_VCALLEE0(pv_irq_ops.irq_enable);
875 :
876 : paravirt_type(pv_irq_ops.irq_enable),
877 paravirt_clobber(CLBR_EAX)
878 : "memory", "eax", "cc");
879} 859}
880 860
881static inline unsigned long __raw_local_irq_save(void) 861static inline unsigned long __raw_local_irq_save(void)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index dd0f5b32489d..b1e70d51e40c 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -318,14 +318,14 @@ struct pv_mmu_ops {
318 phys_addr_t phys, pgprot_t flags); 318 phys_addr_t phys, pgprot_t flags);
319}; 319};
320 320
321struct raw_spinlock; 321struct arch_spinlock;
322struct pv_lock_ops { 322struct pv_lock_ops {
323 int (*spin_is_locked)(struct raw_spinlock *lock); 323 int (*spin_is_locked)(struct arch_spinlock *lock);
324 int (*spin_is_contended)(struct raw_spinlock *lock); 324 int (*spin_is_contended)(struct arch_spinlock *lock);
325 void (*spin_lock)(struct raw_spinlock *lock); 325 void (*spin_lock)(struct arch_spinlock *lock);
326 void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); 326 void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
327 int (*spin_trylock)(struct raw_spinlock *lock); 327 int (*spin_trylock)(struct arch_spinlock *lock);
328 void (*spin_unlock)(struct raw_spinlock *lock); 328 void (*spin_unlock)(struct arch_spinlock *lock);
329}; 329};
330 330
331/* This contains all the paravirt structures: we get a convenient 331/* This contains all the paravirt structures: we get a convenient
@@ -494,10 +494,11 @@ int paravirt_disable_iospace(void);
494#define EXTRA_CLOBBERS 494#define EXTRA_CLOBBERS
495#define VEXTRA_CLOBBERS 495#define VEXTRA_CLOBBERS
496#else /* CONFIG_X86_64 */ 496#else /* CONFIG_X86_64 */
497/* [re]ax isn't an arg, but the return val */
497#define PVOP_VCALL_ARGS \ 498#define PVOP_VCALL_ARGS \
498 unsigned long __edi = __edi, __esi = __esi, \ 499 unsigned long __edi = __edi, __esi = __esi, \
499 __edx = __edx, __ecx = __ecx 500 __edx = __edx, __ecx = __ecx, __eax = __eax
500#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax 501#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
501 502
502#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) 503#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
503#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) 504#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
@@ -509,6 +510,7 @@ int paravirt_disable_iospace(void);
509 "=c" (__ecx) 510 "=c" (__ecx)
510#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) 511#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
511 512
513/* void functions are still allowed [re]ax for scratch */
512#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) 514#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
513#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS 515#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
514 516
@@ -583,8 +585,8 @@ int paravirt_disable_iospace(void);
583 VEXTRA_CLOBBERS, \ 585 VEXTRA_CLOBBERS, \
584 pre, post, ##__VA_ARGS__) 586 pre, post, ##__VA_ARGS__)
585 587
586#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ 588#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
587 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ 589 ____PVOP_VCALL(op.func, CLBR_RET_REG, \
588 PVOP_VCALLEE_CLOBBERS, , \ 590 PVOP_VCALLEE_CLOBBERS, , \
589 pre, post, ##__VA_ARGS__) 591 pre, post, ##__VA_ARGS__)
590 592
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988eee3a..b4bf9a942ed0 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -118,11 +118,27 @@ extern int __init pcibios_init(void);
118 118
119/* pci-mmconfig.c */ 119/* pci-mmconfig.c */
120 120
121/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
122#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
123
124struct pci_mmcfg_region {
125 struct list_head list;
126 struct resource res;
127 u64 address;
128 char __iomem *virt;
129 u16 segment;
130 u8 start_bus;
131 u8 end_bus;
132 char name[PCI_MMCFG_RESOURCE_NAME_LEN];
133};
134
121extern int __init pci_mmcfg_arch_init(void); 135extern int __init pci_mmcfg_arch_init(void);
122extern void __init pci_mmcfg_arch_free(void); 136extern void __init pci_mmcfg_arch_free(void);
137extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
138
139extern struct list_head pci_mmcfg_list;
123 140
124extern struct acpi_mcfg_allocation *pci_mmcfg_config; 141#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
125extern int pci_mmcfg_config_num;
126 142
127/* 143/*
128 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space 144 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ad7ce3fd5065..8d9f8548a870 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -28,9 +28,20 @@
28 */ 28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff 29#define ARCH_PERFMON_EVENT_MASK 0xffff
30 30
31/*
32 * filter mask to validate fixed counter events.
33 * the following filters disqualify for fixed counters:
34 * - inv
35 * - edge
36 * - cnt-mask
37 * The other filters are supported by fixed counters.
38 * The any-thread option is supported starting with v3.
39 */
40#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000
41
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 42#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 43#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 44#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ 45#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) 46 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36 47
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd360ab35..a34c785c5a63 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -16,6 +16,8 @@
16 16
17#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
18 18
19#include <asm/x86_init.h>
20
19/* 21/*
20 * ZERO_PAGE is a global shared page that is always zero: used 22 * ZERO_PAGE is a global shared page that is always zero: used
21 * for zero-mapped memory areas etc.. 23 * for zero-mapped memory areas etc..
@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
270 unsigned long new_flags) 272 unsigned long new_flags)
271{ 273{
272 /* 274 /*
273 * PAT type is always WB for ISA. So no need to check. 275 * PAT type is always WB for untracked ranges, so no need to check.
274 */ 276 */
275 if (is_ISA_range(paddr, paddr + size - 1)) 277 if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
276 return 1; 278 return 1;
277 279
278 /* 280 /*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c3429e8b2424..fc801bab1b3b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@ struct mm_struct;
30#include <linux/math64.h> 30#include <linux/math64.h>
31#include <linux/init.h> 31#include <linux/init.h>
32 32
33#define HBP_NUM 4
33/* 34/*
34 * Default implementation of macro that returns current 35 * Default implementation of macro that returns current
35 * instruction pointer ("program counter"). 36 * instruction pointer ("program counter").
@@ -180,7 +181,7 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
180 unsigned int *ecx, unsigned int *edx) 181 unsigned int *ecx, unsigned int *edx)
181{ 182{
182 /* ecx is often an input as well as an output. */ 183 /* ecx is often an input as well as an output. */
183 asm("cpuid" 184 asm volatile("cpuid"
184 : "=a" (*eax), 185 : "=a" (*eax),
185 "=b" (*ebx), 186 "=b" (*ebx),
186 "=c" (*ecx), 187 "=c" (*ecx),
@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
422extern void free_thread_xstate(struct task_struct *); 423extern void free_thread_xstate(struct task_struct *);
423extern struct kmem_cache *task_xstate_cachep; 424extern struct kmem_cache *task_xstate_cachep;
424 425
426struct perf_event;
427
425struct thread_struct { 428struct thread_struct {
426 /* Cached TLS descriptors: */ 429 /* Cached TLS descriptors: */
427 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; 430 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,10 @@ struct thread_struct {
443 unsigned long fs; 446 unsigned long fs;
444#endif 447#endif
445 unsigned long gs; 448 unsigned long gs;
446 /* Hardware debugging registers: */ 449 /* Save middle states of ptrace breakpoints */
447 unsigned long debugreg0; 450 struct perf_event *ptrace_bps[HBP_NUM];
448 unsigned long debugreg1; 451 /* Debug status used for traps, single steps, etc... */
449 unsigned long debugreg2; 452 unsigned long debugreg6;
450 unsigned long debugreg3;
451 unsigned long debugreg6;
452 unsigned long debugreg7;
453 /* Fault info: */ 453 /* Fault info: */
454 unsigned long cr2; 454 unsigned long cr2;
455 unsigned long trap_no; 455 unsigned long trap_no;
@@ -1000,7 +1000,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
1000#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) 1000#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
1001 1001
1002#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) 1002#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
1003#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ 1003extern unsigned long KSTK_ESP(struct task_struct *task);
1004#endif /* CONFIG_X86_64 */ 1004#endif /* CONFIG_X86_64 */
1005 1005
1006extern void start_thread(struct pt_regs *regs, unsigned long new_ip, 1006extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..4009f6534f52 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,18 +5,19 @@
5 5
6/* misc architecture specific prototypes */ 6/* misc architecture specific prototypes */
7 7
8extern void early_idt_handler(void); 8void early_idt_handler(void);
9 9
10extern void system_call(void); 10void system_call(void);
11extern void syscall_init(void); 11void syscall_init(void);
12 12
13extern void ia32_syscall(void); 13void ia32_syscall(void);
14extern void ia32_cstar_target(void); 14void ia32_cstar_target(void);
15extern void ia32_sysenter_target(void); 15void ia32_sysenter_target(void);
16 16
17extern void syscall32_cpu_init(void); 17void syscall32_cpu_init(void);
18 18
19extern void check_efer(void); 19void x86_configure_nx(void);
20void x86_report_nx(void);
20 21
21extern int reboot_force; 22extern int reboot_force;
22 23
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908349aa..9d369f680321 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
7 7
8#ifdef __KERNEL__ 8#ifdef __KERNEL__
9#include <asm/segment.h> 9#include <asm/segment.h>
10#include <asm/page_types.h>
10#endif 11#endif
11 12
12#ifndef __ASSEMBLY__ 13#ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
216 return regs->sp; 217 return regs->sp;
217} 218}
218 219
220/* Query offset/name of register from its name/offset */
221extern int regs_query_register_offset(const char *name);
222extern const char *regs_query_register_name(unsigned int offset);
223#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
224
225/**
226 * regs_get_register() - get register value from its offset
227 * @regs: pt_regs from which register value is gotten.
228 * @offset: offset number of the register.
229 *
230 * regs_get_register returns the value of a register. The @offset is the
231 * offset of the register in struct pt_regs address which specified by @regs.
232 * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
233 */
234static inline unsigned long regs_get_register(struct pt_regs *regs,
235 unsigned int offset)
236{
237 if (unlikely(offset > MAX_REG_OFFSET))
238 return 0;
239 return *(unsigned long *)((unsigned long)regs + offset);
240}
241
242/**
243 * regs_within_kernel_stack() - check the address in the stack
244 * @regs: pt_regs which contains kernel stack pointer.
245 * @addr: address which is checked.
246 *
247 * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
248 * If @addr is within the kernel stack, it returns true. If not, returns false.
249 */
250static inline int regs_within_kernel_stack(struct pt_regs *regs,
251 unsigned long addr)
252{
253 return ((addr & ~(THREAD_SIZE - 1)) ==
254 (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
255}
256
257/**
258 * regs_get_kernel_stack_nth() - get Nth entry of the stack
259 * @regs: pt_regs which contains kernel stack pointer.
260 * @n: stack entry number.
261 *
262 * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
263 * is specified by @regs. If the @n th entry is NOT in the kernel stack,
264 * this returns 0.
265 */
266static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
267 unsigned int n)
268{
269 unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
270 addr += n;
271 if (regs_within_kernel_stack(regs, (unsigned long)addr))
272 return *addr;
273 else
274 return 0;
275}
276
277/* Get Nth argument at function call */
278extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
279 unsigned int n);
280
219/* 281/*
220 * These are defined as per linux/ptrace.h, which see. 282 * These are defined as per linux/ptrace.h, which see.
221 */ 283 */
@@ -230,6 +292,8 @@ extern void user_enable_block_step(struct task_struct *);
230#define arch_has_block_step() (boot_cpu_data.x86 >= 6) 292#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
231#endif 293#endif
232 294
295#define ARCH_HAS_USER_SINGLE_STEP_INFO
296
233struct user_desc; 297struct user_desc;
234extern int do_get_thread_area(struct task_struct *p, int idx, 298extern int do_get_thread_area(struct task_struct *p, int idx,
235 struct user_desc __user *info); 299 struct user_desc __user *info);
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..0a5242428659 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
2#define _ASM_X86_SECTIONS_H 2#define _ASM_X86_SECTIONS_H
3 3
4#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5#include <asm/uaccess.h>
5 6
6extern char __brk_base[], __brk_limit[]; 7extern char __brk_base[], __brk_limit[];
8extern struct exception_table_entry __stop___ex_table[];
9
10#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
11extern char __end_rodata_hpage_align[];
12#endif
7 13
8#endif /* _ASM_X86_SECTIONS_H */ 14#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 72e5a4491661..04459d25e66e 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -124,7 +124,7 @@ struct sigcontext {
124 * fpstate is really (struct _fpstate *) or (struct _xstate *) 124 * fpstate is really (struct _fpstate *) or (struct _xstate *)
125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved 125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end 126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
127 * of extended memory layout. See comments at the defintion of 127 * of extended memory layout. See comments at the definition of
128 * (struct _fpx_sw_bytes) 128 * (struct _fpx_sw_bytes)
129 */ 129 */
130 void __user *fpstate; /* zero when no FPU/extended context */ 130 void __user *fpstate; /* zero when no FPU/extended context */
@@ -219,7 +219,7 @@ struct sigcontext {
219 * fpstate is really (struct _fpstate *) or (struct _xstate *) 219 * fpstate is really (struct _fpstate *) or (struct _xstate *)
220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved 220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end 221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
222 * of extended memory layout. See comments at the defintion of 222 * of extended memory layout. See comments at the definition of
223 * (struct _fpx_sw_bytes) 223 * (struct _fpx_sw_bytes)
224 */ 224 */
225 void __user *fpstate; /* zero when no FPU/extended context */ 225 void __user *fpstate; /* zero when no FPU/extended context */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 4e77853321db..3089f70c0c52 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -58,7 +58,7 @@
58#if (NR_CPUS < 256) 58#if (NR_CPUS < 256)
59#define TICKET_SHIFT 8 59#define TICKET_SHIFT 8
60 60
61static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) 61static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
62{ 62{
63 short inc = 0x0100; 63 short inc = 0x0100;
64 64
@@ -77,7 +77,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
77 : "memory", "cc"); 77 : "memory", "cc");
78} 78}
79 79
80static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) 80static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
81{ 81{
82 int tmp, new; 82 int tmp, new;
83 83
@@ -96,7 +96,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
96 return tmp; 96 return tmp;
97} 97}
98 98
99static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) 99static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
100{ 100{
101 asm volatile(UNLOCK_LOCK_PREFIX "incb %0" 101 asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
102 : "+m" (lock->slock) 102 : "+m" (lock->slock)
@@ -106,7 +106,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
106#else 106#else
107#define TICKET_SHIFT 16 107#define TICKET_SHIFT 16
108 108
109static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) 109static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
110{ 110{
111 int inc = 0x00010000; 111 int inc = 0x00010000;
112 int tmp; 112 int tmp;
@@ -127,7 +127,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
127 : "memory", "cc"); 127 : "memory", "cc");
128} 128}
129 129
130static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) 130static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
131{ 131{
132 int tmp; 132 int tmp;
133 int new; 133 int new;
@@ -149,7 +149,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
149 return tmp; 149 return tmp;
150} 150}
151 151
152static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) 152static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
153{ 153{
154 asm volatile(UNLOCK_LOCK_PREFIX "incw %0" 154 asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
155 : "+m" (lock->slock) 155 : "+m" (lock->slock)
@@ -158,14 +158,14 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
158} 158}
159#endif 159#endif
160 160
161static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) 161static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
162{ 162{
163 int tmp = ACCESS_ONCE(lock->slock); 163 int tmp = ACCESS_ONCE(lock->slock);
164 164
165 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); 165 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
166} 166}
167 167
168static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) 168static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
169{ 169{
170 int tmp = ACCESS_ONCE(lock->slock); 170 int tmp = ACCESS_ONCE(lock->slock);
171 171
@@ -174,43 +174,43 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
174 174
175#ifndef CONFIG_PARAVIRT_SPINLOCKS 175#ifndef CONFIG_PARAVIRT_SPINLOCKS
176 176
177static inline int __raw_spin_is_locked(raw_spinlock_t *lock) 177static inline int arch_spin_is_locked(arch_spinlock_t *lock)
178{ 178{
179 return __ticket_spin_is_locked(lock); 179 return __ticket_spin_is_locked(lock);
180} 180}
181 181
182static inline int __raw_spin_is_contended(raw_spinlock_t *lock) 182static inline int arch_spin_is_contended(arch_spinlock_t *lock)
183{ 183{
184 return __ticket_spin_is_contended(lock); 184 return __ticket_spin_is_contended(lock);
185} 185}
186#define __raw_spin_is_contended __raw_spin_is_contended 186#define arch_spin_is_contended arch_spin_is_contended
187 187
188static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) 188static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
189{ 189{
190 __ticket_spin_lock(lock); 190 __ticket_spin_lock(lock);
191} 191}
192 192
193static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) 193static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
194{ 194{
195 return __ticket_spin_trylock(lock); 195 return __ticket_spin_trylock(lock);
196} 196}
197 197
198static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) 198static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
199{ 199{
200 __ticket_spin_unlock(lock); 200 __ticket_spin_unlock(lock);
201} 201}
202 202
203static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, 203static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
204 unsigned long flags) 204 unsigned long flags)
205{ 205{
206 __raw_spin_lock(lock); 206 arch_spin_lock(lock);
207} 207}
208 208
209#endif /* CONFIG_PARAVIRT_SPINLOCKS */ 209#endif /* CONFIG_PARAVIRT_SPINLOCKS */
210 210
211static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) 211static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
212{ 212{
213 while (__raw_spin_is_locked(lock)) 213 while (arch_spin_is_locked(lock))
214 cpu_relax(); 214 cpu_relax();
215} 215}
216 216
@@ -232,7 +232,7 @@ static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
232 * read_can_lock - would read_trylock() succeed? 232 * read_can_lock - would read_trylock() succeed?
233 * @lock: the rwlock in question. 233 * @lock: the rwlock in question.
234 */ 234 */
235static inline int __raw_read_can_lock(raw_rwlock_t *lock) 235static inline int arch_read_can_lock(arch_rwlock_t *lock)
236{ 236{
237 return (int)(lock)->lock > 0; 237 return (int)(lock)->lock > 0;
238} 238}
@@ -241,12 +241,12 @@ static inline int __raw_read_can_lock(raw_rwlock_t *lock)
241 * write_can_lock - would write_trylock() succeed? 241 * write_can_lock - would write_trylock() succeed?
242 * @lock: the rwlock in question. 242 * @lock: the rwlock in question.
243 */ 243 */
244static inline int __raw_write_can_lock(raw_rwlock_t *lock) 244static inline int arch_write_can_lock(arch_rwlock_t *lock)
245{ 245{
246 return (lock)->lock == RW_LOCK_BIAS; 246 return (lock)->lock == RW_LOCK_BIAS;
247} 247}
248 248
249static inline void __raw_read_lock(raw_rwlock_t *rw) 249static inline void arch_read_lock(arch_rwlock_t *rw)
250{ 250{
251 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" 251 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
252 "jns 1f\n" 252 "jns 1f\n"
@@ -255,7 +255,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw)
255 ::LOCK_PTR_REG (rw) : "memory"); 255 ::LOCK_PTR_REG (rw) : "memory");
256} 256}
257 257
258static inline void __raw_write_lock(raw_rwlock_t *rw) 258static inline void arch_write_lock(arch_rwlock_t *rw)
259{ 259{
260 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" 260 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
261 "jz 1f\n" 261 "jz 1f\n"
@@ -264,7 +264,7 @@ static inline void __raw_write_lock(raw_rwlock_t *rw)
264 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); 264 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
265} 265}
266 266
267static inline int __raw_read_trylock(raw_rwlock_t *lock) 267static inline int arch_read_trylock(arch_rwlock_t *lock)
268{ 268{
269 atomic_t *count = (atomic_t *)lock; 269 atomic_t *count = (atomic_t *)lock;
270 270
@@ -274,7 +274,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock)
274 return 0; 274 return 0;
275} 275}
276 276
277static inline int __raw_write_trylock(raw_rwlock_t *lock) 277static inline int arch_write_trylock(arch_rwlock_t *lock)
278{ 278{
279 atomic_t *count = (atomic_t *)lock; 279 atomic_t *count = (atomic_t *)lock;
280 280
@@ -284,23 +284,23 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
284 return 0; 284 return 0;
285} 285}
286 286
287static inline void __raw_read_unlock(raw_rwlock_t *rw) 287static inline void arch_read_unlock(arch_rwlock_t *rw)
288{ 288{
289 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); 289 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
290} 290}
291 291
292static inline void __raw_write_unlock(raw_rwlock_t *rw) 292static inline void arch_write_unlock(arch_rwlock_t *rw)
293{ 293{
294 asm volatile(LOCK_PREFIX "addl %1, %0" 294 asm volatile(LOCK_PREFIX "addl %1, %0"
295 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); 295 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
296} 296}
297 297
298#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) 298#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
299#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) 299#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
300 300
301#define _raw_spin_relax(lock) cpu_relax() 301#define arch_spin_relax(lock) cpu_relax()
302#define _raw_read_relax(lock) cpu_relax() 302#define arch_read_relax(lock) cpu_relax()
303#define _raw_write_relax(lock) cpu_relax() 303#define arch_write_relax(lock) cpu_relax()
304 304
305/* The {read|write|spin}_lock() on x86 are full memory barriers. */ 305/* The {read|write|spin}_lock() on x86 are full memory barriers. */
306static inline void smp_mb__after_lock(void) { } 306static inline void smp_mb__after_lock(void) { }
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 845f81c87091..dcb48b2edc11 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -5,16 +5,16 @@
5# error "please don't include this file directly" 5# error "please don't include this file directly"
6#endif 6#endif
7 7
8typedef struct raw_spinlock { 8typedef struct arch_spinlock {
9 unsigned int slock; 9 unsigned int slock;
10} raw_spinlock_t; 10} arch_spinlock_t;
11 11
12#define __RAW_SPIN_LOCK_UNLOCKED { 0 } 12#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
13 13
14typedef struct { 14typedef struct {
15 unsigned int lock; 15 unsigned int lock;
16} raw_rwlock_t; 16} arch_rwlock_t;
17 17
18#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } 18#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
19 19
20#endif /* _ASM_X86_SPINLOCK_TYPES_H */ 20#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e73815..35e89122a42f 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -5,6 +5,29 @@ extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name); 6int x86_is_stack_id(int id, char *name);
7 7
8struct thread_info;
9struct stacktrace_ops;
10
11typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo,
12 unsigned long *stack,
13 unsigned long bp,
14 const struct stacktrace_ops *ops,
15 void *data,
16 unsigned long *end,
17 int *graph);
18
19extern unsigned long
20print_context_stack(struct thread_info *tinfo,
21 unsigned long *stack, unsigned long bp,
22 const struct stacktrace_ops *ops, void *data,
23 unsigned long *end, int *graph);
24
25extern unsigned long
26print_context_stack_bp(struct thread_info *tinfo,
27 unsigned long *stack, unsigned long bp,
28 const struct stacktrace_ops *ops, void *data,
29 unsigned long *end, int *graph);
30
8/* Generic stack tracer with callbacks */ 31/* Generic stack tracer with callbacks */
9 32
10struct stacktrace_ops { 33struct stacktrace_ops {
@@ -14,6 +37,7 @@ struct stacktrace_ops {
14 void (*address)(void *data, unsigned long address, int reliable); 37 void (*address)(void *data, unsigned long address, int reliable);
15 /* On negative return stop dumping */ 38 /* On negative return stop dumping */
16 int (*stack)(void *data, char *name); 39 int (*stack)(void *data, char *name);
40 walk_stack_t walk_stack;
17}; 41};
18 42
19void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 43void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index ae907e617181..3d3e8353ee5c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 */ 177 */
178 178
179#ifndef CONFIG_KMEMCHECK 179#ifndef CONFIG_KMEMCHECK
180
181#if (__GNUC__ >= 4)
182#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
183#else
180#define memcpy(t, f, n) \ 184#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 185 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 186 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 187 : __memcpy((t), (f), (n)))
188#endif
184#else 189#else
185/* 190/*
186 * kmemcheck becomes very happy if we use the REP instructions unconditionally, 191 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
@@ -316,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
316 : __memset_generic((s), (c), (count))) 321 : __memset_generic((s), (c), (count)))
317 322
318#define __HAVE_ARCH_MEMSET 323#define __HAVE_ARCH_MEMSET
324#if (__GNUC__ >= 4)
325#define memset(s, c, count) __builtin_memset(s, c, count)
326#else
319#define memset(s, c, count) \ 327#define memset(s, c, count) \
320 (__builtin_constant_p(c) \ 328 (__builtin_constant_p(c) \
321 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ 329 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
322 (count)) \ 330 (count)) \
323 : __memset((s), (c), (count))) 331 : __memset((s), (c), (count)))
332#endif
324 333
325/* 334/*
326 * find the first occurrence of byte 'c', or 1 past the area if none 335 * find the first occurrence of byte 'c', or 1 past the area if none
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..1fecb7e61130 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
57 u16 intercept_dr_write; 57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 58 u32 intercept_exceptions;
59 u64 intercept; 59 u64 intercept;
60 u8 reserved_1[44]; 60 u8 reserved_1[42];
61 u16 pause_filter_count;
61 u64 iopm_base_pa; 62 u64 iopm_base_pa;
62 u64 msrpm_base_pa; 63 u64 msrpm_base_pa;
63 u64 tsc_offset; 64 u64 tsc_offset;
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20174fb..8085277e1b8b 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,15 +3,16 @@
3 3
4#include <linux/swiotlb.h> 4#include <linux/swiotlb.h>
5 5
6/* SWIOTLB interface */
7
8extern int swiotlb_force;
9
10#ifdef CONFIG_SWIOTLB 6#ifdef CONFIG_SWIOTLB
11extern int swiotlb; 7extern int swiotlb;
12extern void pci_swiotlb_init(void); 8extern int __init pci_swiotlb_detect(void);
9extern void __init pci_swiotlb_init(void);
13#else 10#else
14#define swiotlb 0 11#define swiotlb 0
12static inline int pci_swiotlb_detect(void)
13{
14 return 0;
15}
15static inline void pci_swiotlb_init(void) 16static inline void pci_swiotlb_init(void)
16{ 17{
17} 18}
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 72a6dcd1299b..d5f69045c100 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -30,7 +30,6 @@ struct mmap_arg_struct;
30asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); 30asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); 31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
32 32
33asmlinkage long sys32_pipe(int __user *);
34struct sigaction32; 33struct sigaction32;
35struct old_sigaction32; 34struct old_sigaction32;
36asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, 35asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
@@ -51,20 +50,12 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); 50asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); 51asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
53 52
54#ifdef CONFIG_SYSCTL_SYSCALL
55struct sysctl_ia32;
56asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
57#endif
58
59asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); 53asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
60asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); 54asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
61 55
62asmlinkage long sys32_personality(unsigned long); 56asmlinkage long sys32_personality(unsigned long);
63asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); 57asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
64 58
65asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
66 unsigned long, unsigned long, unsigned long);
67
68struct oldold_utsname; 59struct oldold_utsname;
69struct old_utsname; 60struct old_utsname;
70asmlinkage long sys32_olduname(struct oldold_utsname __user *); 61asmlinkage long sys32_olduname(struct oldold_utsname __user *);
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 372b76edd63f..8868b9420b0e 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -18,16 +18,24 @@
18/* Common in X86_32 and X86_64 */ 18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21long sys_iopl(unsigned int, struct pt_regs *);
21 22
22/* kernel/process.c */ 23/* kernel/process.c */
23int sys_fork(struct pt_regs *); 24int sys_fork(struct pt_regs *);
24int sys_vfork(struct pt_regs *); 25int sys_vfork(struct pt_regs *);
26long sys_execve(char __user *, char __user * __user *,
27 char __user * __user *, struct pt_regs *);
28long sys_clone(unsigned long, unsigned long, void __user *,
29 void __user *, struct pt_regs *);
25 30
26/* kernel/ldt.c */ 31/* kernel/ldt.c */
27asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); 32asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
28 33
29/* kernel/signal.c */ 34/* kernel/signal.c */
30long sys_rt_sigreturn(struct pt_regs *); 35long sys_rt_sigreturn(struct pt_regs *);
36long sys_sigaltstack(const stack_t __user *, stack_t __user *,
37 struct pt_regs *);
38
31 39
32/* kernel/tls.c */ 40/* kernel/tls.c */
33asmlinkage int sys_set_thread_area(struct user_desc __user *); 41asmlinkage int sys_set_thread_area(struct user_desc __user *);
@@ -35,18 +43,11 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
35 43
36/* X86_32 only */ 44/* X86_32 only */
37#ifdef CONFIG_X86_32 45#ifdef CONFIG_X86_32
38/* kernel/ioport.c */
39long sys_iopl(struct pt_regs *);
40
41/* kernel/process_32.c */
42int sys_clone(struct pt_regs *);
43int sys_execve(struct pt_regs *);
44 46
45/* kernel/signal.c */ 47/* kernel/signal.c */
46asmlinkage int sys_sigsuspend(int, int, old_sigset_t); 48asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
47asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, 49asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
48 struct old_sigaction __user *); 50 struct old_sigaction __user *);
49int sys_sigaltstack(struct pt_regs *);
50unsigned long sys_sigreturn(struct pt_regs *); 51unsigned long sys_sigreturn(struct pt_regs *);
51 52
52/* kernel/sys_i386_32.c */ 53/* kernel/sys_i386_32.c */
@@ -55,8 +56,6 @@ struct sel_arg_struct;
55struct oldold_utsname; 56struct oldold_utsname;
56struct old_utsname; 57struct old_utsname;
57 58
58asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
59 unsigned long, unsigned long, unsigned long);
60asmlinkage int old_mmap(struct mmap_arg_struct __user *); 59asmlinkage int old_mmap(struct mmap_arg_struct __user *);
61asmlinkage int old_select(struct sel_arg_struct __user *); 60asmlinkage int old_select(struct sel_arg_struct __user *);
62asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); 61asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
@@ -64,28 +63,15 @@ asmlinkage int sys_uname(struct old_utsname __user *);
64asmlinkage int sys_olduname(struct oldold_utsname __user *); 63asmlinkage int sys_olduname(struct oldold_utsname __user *);
65 64
66/* kernel/vm86_32.c */ 65/* kernel/vm86_32.c */
67int sys_vm86old(struct pt_regs *); 66int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);
68int sys_vm86(struct pt_regs *); 67int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
69 68
70#else /* CONFIG_X86_32 */ 69#else /* CONFIG_X86_32 */
71 70
72/* X86_64 only */ 71/* X86_64 only */
73/* kernel/ioport.c */
74asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
75
76/* kernel/process_64.c */ 72/* kernel/process_64.c */
77asmlinkage long sys_clone(unsigned long, unsigned long,
78 void __user *, void __user *,
79 struct pt_regs *);
80asmlinkage long sys_execve(char __user *, char __user * __user *,
81 char __user * __user *,
82 struct pt_regs *);
83long sys_arch_prctl(int, unsigned long); 73long sys_arch_prctl(int, unsigned long);
84 74
85/* kernel/signal.c */
86asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
87 struct pt_regs *);
88
89/* kernel/sys_x86_64.c */ 75/* kernel/sys_x86_64.c */
90struct new_utsname; 76struct new_utsname;
91 77
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index de10c19d9558..e529f26c3292 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
23struct tss_struct; 23struct tss_struct;
24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
25 struct tss_struct *tss); 25 struct tss_struct *tss);
26extern void show_regs_common(void);
26 27
27#ifdef CONFIG_X86_32 28#ifdef CONFIG_X86_32
28 29
@@ -128,8 +129,6 @@ do { \
128 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ 129 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
129 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ 130 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
130 "call __switch_to\n\t" \ 131 "call __switch_to\n\t" \
131 ".globl thread_return\n" \
132 "thread_return:\n\t" \
133 "movq "__percpu_arg([current_task])",%%rsi\n\t" \ 132 "movq "__percpu_arg([current_task])",%%rsi\n\t" \
134 __switch_canary \ 133 __switch_canary \
135 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 134 "movq %P[thread_info](%%rsi),%%r8\n\t" \
@@ -157,19 +156,22 @@ extern void native_load_gs_index(unsigned);
157 * Load a segment. Fall back on loading the zero 156 * Load a segment. Fall back on loading the zero
158 * segment if something goes wrong.. 157 * segment if something goes wrong..
159 */ 158 */
160#define loadsegment(seg, value) \ 159#define loadsegment(seg, value) \
161 asm volatile("\n" \ 160do { \
162 "1:\t" \ 161 unsigned short __val = (value); \
163 "movl %k0,%%" #seg "\n" \ 162 \
164 "2:\n" \ 163 asm volatile(" \n" \
165 ".section .fixup,\"ax\"\n" \ 164 "1: movl %k0,%%" #seg " \n" \
166 "3:\t" \ 165 \
167 "movl %k1, %%" #seg "\n\t" \ 166 ".section .fixup,\"ax\" \n" \
168 "jmp 2b\n" \ 167 "2: xorl %k0,%k0 \n" \
169 ".previous\n" \ 168 " jmp 1b \n" \
170 _ASM_EXTABLE(1b,3b) \ 169 ".previous \n" \
171 : :"r" (value), "r" (0) : "memory") 170 \
172 171 _ASM_EXTABLE(1b, 2b) \
172 \
173 : "+r" (__val) : : "memory"); \
174} while (0)
173 175
174/* 176/*
175 * Save a segment register away 177 * Save a segment register away
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..375c917c37d2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
84#define TIF_SECCOMP 8 /* secure computing */ 84#define TIF_SECCOMP 8 /* secure computing */
85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
86#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
86#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
87#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
88#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
107#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 108#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
108#define _TIF_SECCOMP (1 << TIF_SECCOMP) 109#define _TIF_SECCOMP (1 << TIF_SECCOMP)
109#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 110#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
111#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
110#define _TIF_NOTSC (1 << TIF_NOTSC) 112#define _TIF_NOTSC (1 << TIF_NOTSC)
111#define _TIF_IA32 (1 << TIF_IA32) 113#define _TIF_IA32 (1 << TIF_IA32)
112#define _TIF_FORK (1 << TIF_FORK) 114#define _TIF_FORK (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
142 144
143/* Only used for 64 bit */ 145/* Only used for 64 bit */
144#define _TIF_DO_NOTIFY_MASK \ 146#define _TIF_DO_NOTIFY_MASK \
145 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 147 (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
148 _TIF_USER_RETURN_NOTIFY)
146 149
147/* flags to check in __switch_to() */ 150/* flags to check in __switch_to() */
148#define _TIF_WORK_CTXSW \ 151#define _TIF_WORK_CTXSW \
149 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 152 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
150 153
151#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 154#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
152#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 155#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
153 156
154#define PREEMPT_ACTIVE 0x10000000 157#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 25a92842dd99..c5087d796587 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -35,11 +35,16 @@
35# endif 35# endif
36#endif 36#endif
37 37
38/* Node not present */ 38/*
39#define NUMA_NO_NODE (-1) 39 * to preserve the visibility of NUMA_NO_NODE definition,
40 * moved to there from here. May be used independent of
41 * CONFIG_NUMA.
42 */
43#include <linux/numa.h>
40 44
41#ifdef CONFIG_NUMA 45#ifdef CONFIG_NUMA
42#include <linux/cpumask.h> 46#include <linux/cpumask.h>
47
43#include <asm/mpspec.h> 48#include <asm/mpspec.h>
44 49
45#ifdef CONFIG_X86_32 50#ifdef CONFIG_X86_32
@@ -143,6 +148,7 @@ extern unsigned long node_remap_size[];
143 | 1*SD_BALANCE_FORK \ 148 | 1*SD_BALANCE_FORK \
144 | 0*SD_BALANCE_WAKE \ 149 | 0*SD_BALANCE_WAKE \
145 | 1*SD_WAKE_AFFINE \ 150 | 1*SD_WAKE_AFFINE \
151 | 0*SD_PREFER_LOCAL \
146 | 0*SD_SHARE_CPUPOWER \ 152 | 0*SD_SHARE_CPUPOWER \
147 | 0*SD_POWERSAVINGS_BALANCE \ 153 | 0*SD_POWERSAVINGS_BALANCE \
148 | 0*SD_SHARE_PKG_RESOURCES \ 154 | 0*SD_SHARE_PKG_RESOURCES \
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 90f06c25221d..cb507bb05d79 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -16,7 +16,6 @@ extern unsigned long initial_code;
16extern unsigned long initial_gs; 16extern unsigned long initial_gs;
17 17
18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
19#define TRAMPOLINE_BASE 0x6000
20 19
21extern unsigned long setup_trampoline(void); 20extern unsigned long setup_trampoline(void);
22extern void __init reserve_trampoline_memory(void); 21extern void __init reserve_trampoline_memory(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d2c6c930b491..abd3e0ea762a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -570,7 +570,6 @@ extern struct movsl_mask {
570#ifdef CONFIG_X86_32 570#ifdef CONFIG_X86_32
571# include "uaccess_32.h" 571# include "uaccess_32.h"
572#else 572#else
573# define ARCH_HAS_SEARCH_EXTABLE
574# include "uaccess_64.h" 573# include "uaccess_64.h"
575#endif 574#endif
576 575
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 632fb44b4cb5..0c9825e97f36 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -187,9 +187,34 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
187 187
188unsigned long __must_check copy_to_user(void __user *to, 188unsigned long __must_check copy_to_user(void __user *to,
189 const void *from, unsigned long n); 189 const void *from, unsigned long n);
190unsigned long __must_check copy_from_user(void *to, 190unsigned long __must_check _copy_from_user(void *to,
191 const void __user *from, 191 const void __user *from,
192 unsigned long n); 192 unsigned long n);
193
194
195extern void copy_from_user_overflow(void)
196#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
197 __compiletime_error("copy_from_user() buffer size is not provably correct")
198#else
199 __compiletime_warning("copy_from_user() buffer size is not provably correct")
200#endif
201;
202
203static inline unsigned long __must_check copy_from_user(void *to,
204 const void __user *from,
205 unsigned long n)
206{
207 int sz = __compiletime_object_size(to);
208 int ret = -EFAULT;
209
210 if (likely(sz == -1 || sz >= n))
211 ret = _copy_from_user(to, from, n);
212 else
213 copy_from_user_overflow();
214
215 return ret;
216}
217
193long __must_check strncpy_from_user(char *dst, const char __user *src, 218long __must_check strncpy_from_user(char *dst, const char __user *src,
194 long count); 219 long count);
195long __must_check __strncpy_from_user(char *dst, 220long __must_check __strncpy_from_user(char *dst,
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc50..46324c6a4f6e 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -19,12 +19,37 @@ __must_check unsigned long
19copy_user_generic(void *to, const void *from, unsigned len); 19copy_user_generic(void *to, const void *from, unsigned len);
20 20
21__must_check unsigned long 21__must_check unsigned long
22copy_to_user(void __user *to, const void *from, unsigned len); 22_copy_to_user(void __user *to, const void *from, unsigned len);
23__must_check unsigned long 23__must_check unsigned long
24copy_from_user(void *to, const void __user *from, unsigned len); 24_copy_from_user(void *to, const void __user *from, unsigned len);
25__must_check unsigned long 25__must_check unsigned long
26copy_in_user(void __user *to, const void __user *from, unsigned len); 26copy_in_user(void __user *to, const void __user *from, unsigned len);
27 27
28static inline unsigned long __must_check copy_from_user(void *to,
29 const void __user *from,
30 unsigned long n)
31{
32 int sz = __compiletime_object_size(to);
33 int ret = -EFAULT;
34
35 might_fault();
36 if (likely(sz == -1 || sz >= n))
37 ret = _copy_from_user(to, from, n);
38#ifdef CONFIG_DEBUG_VM
39 else
40 WARN(1, "Buffer overflow detected!\n");
41#endif
42 return ret;
43}
44
45static __always_inline __must_check
46int copy_to_user(void __user *dst, const void *src, unsigned size)
47{
48 might_fault();
49
50 return _copy_to_user(dst, src, size);
51}
52
28static __always_inline __must_check 53static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 54int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 55{
@@ -176,8 +201,11 @@ __must_check long strlen_user(const char __user *str);
176__must_check unsigned long clear_user(void __user *mem, unsigned long len); 201__must_check unsigned long clear_user(void __user *mem, unsigned long len);
177__must_check unsigned long __clear_user(void __user *mem, unsigned long len); 202__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
178 203
179__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, 204static __must_check __always_inline int
180 unsigned size); 205__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
206{
207 return copy_user_generic(dst, (__force const void *)src, size);
208}
181 209
182static __must_check __always_inline int 210static __must_check __always_inline int
183__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) 211__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..3baf379fa840 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336 344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337
345 346
346#ifdef __KERNEL__ 347#ifdef __KERNEL__
347 348
348#define NR_syscalls 337 349#define NR_syscalls 338
349 350
350#define __ARCH_WANT_IPC_PARSE_VERSION 351#define __ARCH_WANT_IPC_PARSE_VERSION
351#define __ARCH_WANT_OLD_READDIR 352#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..4843f7ba754a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_event_open 298 662#define __NR_perf_event_open 298
663__SYSCALL(__NR_perf_event_open, sys_perf_event_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664#define __NR_recvmmsg 299
665__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
664 666
665#ifndef __NO_STUBS 667#ifndef __NO_STUBS
666#define __ARCH_WANT_OLD_READDIR 668#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 7ed17ff502b9..2751f3075d8b 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -76,15 +76,6 @@ union partition_info_u {
76 }; 76 };
77}; 77};
78 78
79union uv_watchlist_u {
80 u64 val;
81 struct {
82 u64 blade : 16,
83 size : 32,
84 filler : 16;
85 };
86};
87
88enum uv_memprotect { 79enum uv_memprotect {
89 UV_MEMPROT_RESTRICT_ACCESS, 80 UV_MEMPROT_RESTRICT_ACCESS,
90 UV_MEMPROT_ALLOW_AMO, 81 UV_MEMPROT_ALLOW_AMO,
@@ -100,7 +91,7 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
100 91
101extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); 92extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
102extern s64 uv_bios_freq_base(u64, u64 *); 93extern s64 uv_bios_freq_base(u64, u64 *);
103extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, 94extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
104 unsigned long *); 95 unsigned long *);
105extern int uv_bios_mq_watchlist_free(int, int); 96extern int uv_bios_mq_watchlist_free(int, int);
106extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); 97extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 80e2984f521c..b414d2b401f6 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -55,7 +55,7 @@
55#define DESC_STATUS_SOURCE_TIMEOUT 3 55#define DESC_STATUS_SOURCE_TIMEOUT 3
56 56
57/* 57/*
58 * source side threshholds at which message retries print a warning 58 * source side thresholds at which message retries print a warning
59 */ 59 */
60#define SOURCE_TIMEOUT_LIMIT 20 60#define SOURCE_TIMEOUT_LIMIT 20
61#define DESTINATION_TIMEOUT_LIMIT 20 61#define DESTINATION_TIMEOUT_LIMIT 20
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 04eb6c958b9d..bc54fa965af3 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -19,6 +19,8 @@
19#include <asm/types.h> 19#include <asm/types.h>
20#include <asm/percpu.h> 20#include <asm/percpu.h>
21#include <asm/uv/uv_mmrs.h> 21#include <asm/uv/uv_mmrs.h>
22#include <asm/irq_vectors.h>
23#include <asm/io_apic.h>
22 24
23 25
24/* 26/*
@@ -29,20 +31,20 @@
29 * contiguous (although various IO spaces may punch holes in 31 * contiguous (although various IO spaces may punch holes in
30 * it).. 32 * it)..
31 * 33 *
32 * N - Number of bits in the node portion of a socket physical 34 * N - Number of bits in the node portion of a socket physical
33 * address. 35 * address.
34 * 36 *
35 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of 37 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
36 * routers always have low bit of 1, C/MBricks have low bit 38 * routers always have low bit of 1, C/MBricks have low bit
37 * equal to 0. Most addressing macros that target UV hub chips 39 * equal to 0. Most addressing macros that target UV hub chips
38 * right shift the NASID by 1 to exclude the always-zero bit. 40 * right shift the NASID by 1 to exclude the always-zero bit.
39 * NASIDs contain up to 15 bits. 41 * NASIDs contain up to 15 bits.
40 * 42 *
41 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead 43 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
42 * of nasids. 44 * of nasids.
43 * 45 *
44 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant 46 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
45 * of the nasid for socket usage. 47 * of the nasid for socket usage.
46 * 48 *
47 * 49 *
48 * NumaLink Global Physical Address Format: 50 * NumaLink Global Physical Address Format:
@@ -69,12 +71,12 @@
69 * 71 *
70 * 72 *
71 * APICID format 73 * APICID format
72 * NOTE!!!!!! This is the current format of the APICID. However, code 74 * NOTE!!!!!! This is the current format of the APICID. However, code
73 * should assume that this will change in the future. Use functions 75 * should assume that this will change in the future. Use functions
74 * in this file for all APICID bit manipulations and conversion. 76 * in this file for all APICID bit manipulations and conversion.
75 * 77 *
76 * 1111110000000000 78 * 1111110000000000
77 * 5432109876543210 79 * 5432109876543210
78 * pppppppppplc0cch 80 * pppppppppplc0cch
79 * sssssssssss 81 * sssssssssss
80 * 82 *
@@ -87,9 +89,9 @@
87 * Note: Processor only supports 12 bits in the APICID register. The ACPI 89 * Note: Processor only supports 12 bits in the APICID register. The ACPI
88 * tables hold all 16 bits. Software needs to be aware of this. 90 * tables hold all 16 bits. Software needs to be aware of this.
89 * 91 *
90 * Unless otherwise specified, all references to APICID refer to 92 * Unless otherwise specified, all references to APICID refer to
91 * the FULL value contained in ACPI tables, not the subset in the 93 * the FULL value contained in ACPI tables, not the subset in the
92 * processor APICID register. 94 * processor APICID register.
93 */ 95 */
94 96
95 97
@@ -114,7 +116,7 @@
114/* 116/*
115 * The largest possible NASID of a C or M brick (+ 2) 117 * The largest possible NASID of a C or M brick (+ 2)
116 */ 118 */
117#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) 119#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2)
118 120
119struct uv_scir_s { 121struct uv_scir_s {
120 struct timer_list timer; 122 struct timer_list timer;
@@ -149,16 +151,16 @@ struct uv_hub_info_s {
149}; 151};
150 152
151DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 153DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
152#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) 154#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
153#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) 155#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
154 156
155/* 157/*
156 * Local & Global MMR space macros. 158 * Local & Global MMR space macros.
157 * Note: macros are intended to be used ONLY by inline functions 159 * Note: macros are intended to be used ONLY by inline functions
158 * in this file - not by other kernel code. 160 * in this file - not by other kernel code.
159 * n - NASID (full 15-bit global nasid) 161 * n - NASID (full 15-bit global nasid)
160 * g - GNODE (full 15-bit global nasid, right shifted 1) 162 * g - GNODE (full 15-bit global nasid, right shifted 1)
161 * p - PNODE (local part of nsids, right shifted 1) 163 * p - PNODE (local part of nsids, right shifted 1)
162 */ 164 */
163#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) 165#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
164#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) 166#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
@@ -170,6 +172,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
170#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) 172#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
171#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) 173#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
172 174
175#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
176
173#define UV_GLOBAL_MMR32_PNODE_SHIFT 15 177#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
174#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 178#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
175 179
@@ -211,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
211/* 215/*
212 * Macros for converting between kernel virtual addresses, socket local physical 216 * Macros for converting between kernel virtual addresses, socket local physical
213 * addresses, and UV global physical addresses. 217 * addresses, and UV global physical addresses.
214 * Note: use the standard __pa() & __va() macros for converting 218 * Note: use the standard __pa() & __va() macros for converting
215 * between socket virtual and socket physical addresses. 219 * between socket virtual and socket physical addresses.
216 */ 220 */
217 221
218/* socket phys RAM --> UV global physical address */ 222/* socket phys RAM --> UV global physical address */
@@ -230,6 +234,40 @@ static inline unsigned long uv_gpa(void *v)
230 return uv_soc_phys_ram_to_gpa(__pa(v)); 234 return uv_soc_phys_ram_to_gpa(__pa(v));
231} 235}
232 236
237/* Top two bits indicate the requested address is in MMR space. */
238static inline int
239uv_gpa_in_mmr_space(unsigned long gpa)
240{
241 return (gpa >> 62) == 0x3UL;
242}
243
244/* UV global physical address --> socket phys RAM */
245static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
246{
247 unsigned long paddr = gpa & uv_hub_info->gpa_mask;
248 unsigned long remap_base = uv_hub_info->lowmem_remap_base;
249 unsigned long remap_top = uv_hub_info->lowmem_remap_top;
250
251 if (paddr >= remap_base && paddr < remap_base + remap_top)
252 paddr -= remap_base;
253 return paddr;
254}
255
256
257/* gnode -> pnode */
258static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
259{
260 return gpa >> uv_hub_info->m_val;
261}
262
263/* gpa -> pnode */
264static inline int uv_gpa_to_pnode(unsigned long gpa)
265{
266 unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1;
267
268 return uv_gpa_to_gnode(gpa) & n_mask;
269}
270
233/* pnode, offset --> socket virtual */ 271/* pnode, offset --> socket virtual */
234static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) 272static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
235{ 273{
@@ -249,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid)
249 * Access global MMRs using the low memory MMR32 space. This region supports 287 * Access global MMRs using the low memory MMR32 space. This region supports
250 * faster MMR access but not all MMRs are accessible in this space. 288 * faster MMR access but not all MMRs are accessible in this space.
251 */ 289 */
252static inline unsigned long *uv_global_mmr32_address(int pnode, 290static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset)
253 unsigned long offset)
254{ 291{
255 return __va(UV_GLOBAL_MMR32_BASE | 292 return __va(UV_GLOBAL_MMR32_BASE |
256 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); 293 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
257} 294}
258 295
259static inline void uv_write_global_mmr32(int pnode, unsigned long offset, 296static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val)
260 unsigned long val)
261{ 297{
262 writeq(val, uv_global_mmr32_address(pnode, offset)); 298 writeq(val, uv_global_mmr32_address(pnode, offset));
263} 299}
264 300
265static inline unsigned long uv_read_global_mmr32(int pnode, 301static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset)
266 unsigned long offset)
267{ 302{
268 return readq(uv_global_mmr32_address(pnode, offset)); 303 return readq(uv_global_mmr32_address(pnode, offset));
269} 304}
@@ -272,26 +307,42 @@ static inline unsigned long uv_read_global_mmr32(int pnode,
272 * Access Global MMR space using the MMR space located at the top of physical 307 * Access Global MMR space using the MMR space located at the top of physical
273 * memory. 308 * memory.
274 */ 309 */
275static inline unsigned long *uv_global_mmr64_address(int pnode, 310static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
276 unsigned long offset)
277{ 311{
278 return __va(UV_GLOBAL_MMR64_BASE | 312 return __va(UV_GLOBAL_MMR64_BASE |
279 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); 313 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
280} 314}
281 315
282static inline void uv_write_global_mmr64(int pnode, unsigned long offset, 316static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val)
283 unsigned long val)
284{ 317{
285 writeq(val, uv_global_mmr64_address(pnode, offset)); 318 writeq(val, uv_global_mmr64_address(pnode, offset));
286} 319}
287 320
288static inline unsigned long uv_read_global_mmr64(int pnode, 321static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset)
289 unsigned long offset)
290{ 322{
291 return readq(uv_global_mmr64_address(pnode, offset)); 323 return readq(uv_global_mmr64_address(pnode, offset));
292} 324}
293 325
294/* 326/*
327 * Global MMR space addresses when referenced by the GRU. (GRU does
328 * NOT use socket addressing).
329 */
330static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset)
331{
332 return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val);
333}
334
335static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
336{
337 writeb(val, uv_global_mmr64_address(pnode, offset));
338}
339
340static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset)
341{
342 return readb(uv_global_mmr64_address(pnode, offset));
343}
344
345/*
295 * Access hub local MMRs. Faster than using global space but only local MMRs 346 * Access hub local MMRs. Faster than using global space but only local MMRs
296 * are accessible. 347 * are accessible.
297 */ 348 */
@@ -410,21 +461,37 @@ static inline void uv_set_scir_bits(unsigned char value)
410 } 461 }
411} 462}
412 463
464static inline unsigned long uv_scir_offset(int apicid)
465{
466 return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f);
467}
468
413static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) 469static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
414{ 470{
415 if (uv_cpu_hub_info(cpu)->scir.state != value) { 471 if (uv_cpu_hub_info(cpu)->scir.state != value) {
472 uv_write_global_mmr8(uv_cpu_to_pnode(cpu),
473 uv_cpu_hub_info(cpu)->scir.offset, value);
416 uv_cpu_hub_info(cpu)->scir.state = value; 474 uv_cpu_hub_info(cpu)->scir.state = value;
417 uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
418 } 475 }
419} 476}
420 477
478static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
479{
480 return (1UL << UVH_IPI_INT_SEND_SHFT) |
481 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
482 (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
483 (vector << UVH_IPI_INT_VECTOR_SHFT);
484}
485
421static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) 486static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
422{ 487{
423 unsigned long val; 488 unsigned long val;
489 unsigned long dmode = dest_Fixed;
424 490
425 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 491 if (vector == NMI_VECTOR)
426 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | 492 dmode = dest_NMI;
427 (vector << UVH_IPI_INT_VECTOR_SHFT); 493
494 val = uv_hub_ipi_value(apicid, vector, dmode);
428 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 495 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
429} 496}
430 497
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index 9613c8c0b647..d6b17c760622 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry {
25 dest : 32; 25 dest : 32;
26}; 26};
27 27
28extern struct irq_chip uv_irq_chip; 28enum {
29 29 UV_AFFINITY_ALL,
30extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); 30 UV_AFFINITY_NODE,
31extern void arch_disable_uv_irq(int, unsigned long); 31 UV_AFFINITY_CPU
32};
32 33
33extern int uv_setup_irq(char *, int, int, unsigned long); 34extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
34extern void uv_teardown_irq(unsigned int, int, unsigned long); 35extern int uv_setup_irq(char *, int, int, unsigned long, int);
36extern void uv_teardown_irq(unsigned int);
35 37
36#endif /* _ASM_X86_UV_UV_IRQ_H */ 38#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..2b4945419a84 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -56,6 +56,7 @@
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
59#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
59 60
60 61
61#define PIN_BASED_EXT_INTR_MASK 0x00000001 62#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -144,6 +145,8 @@ enum vmcs_field {
144 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, 145 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
145 TPR_THRESHOLD = 0x0000401c, 146 TPR_THRESHOLD = 0x0000401c,
146 SECONDARY_VM_EXEC_CONTROL = 0x0000401e, 147 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
148 PLE_GAP = 0x00004020,
149 PLE_WINDOW = 0x00004022,
147 VM_INSTRUCTION_ERROR = 0x00004400, 150 VM_INSTRUCTION_ERROR = 0x00004400,
148 VM_EXIT_REASON = 0x00004402, 151 VM_EXIT_REASON = 0x00004402,
149 VM_EXIT_INTR_INFO = 0x00004404, 152 VM_EXIT_INTR_INFO = 0x00004404,
@@ -248,6 +251,7 @@ enum vmcs_field {
248#define EXIT_REASON_MSR_READ 31 251#define EXIT_REASON_MSR_READ 31
249#define EXIT_REASON_MSR_WRITE 32 252#define EXIT_REASON_MSR_WRITE 32
250#define EXIT_REASON_MWAIT_INSTRUCTION 36 253#define EXIT_REASON_MWAIT_INSTRUCTION 36
254#define EXIT_REASON_PAUSE_INSTRUCTION 40
251#define EXIT_REASON_MCE_DURING_VMENTRY 41 255#define EXIT_REASON_MCE_DURING_VMENTRY 41
252#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
253#define EXIT_REASON_APIC_ACCESS 44 257#define EXIT_REASON_APIC_ACCESS 44
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2c756fd4ab0e..ea0e8ea15e15 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -26,7 +26,7 @@ struct x86_init_mpparse {
26 void (*smp_read_mpc_oem)(struct mpc_table *mpc); 26 void (*smp_read_mpc_oem)(struct mpc_table *mpc);
27 void (*mpc_oem_pci_bus)(struct mpc_bus *m); 27 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); 28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
29 void (*find_smp_config)(unsigned int reserve); 29 void (*find_smp_config)(void);
30 void (*get_smp_config)(unsigned int early); 30 void (*get_smp_config)(unsigned int early);
31}; 31};
32 32
@@ -91,6 +91,14 @@ struct x86_init_timers {
91}; 91};
92 92
93/** 93/**
94 * struct x86_init_iommu - platform specific iommu setup
95 * @iommu_init: platform specific iommu setup
96 */
97struct x86_init_iommu {
98 int (*iommu_init)(void);
99};
100
101/**
94 * struct x86_init_ops - functions for platform specific setup 102 * struct x86_init_ops - functions for platform specific setup
95 * 103 *
96 */ 104 */
@@ -101,6 +109,7 @@ struct x86_init_ops {
101 struct x86_init_oem oem; 109 struct x86_init_oem oem;
102 struct x86_init_paging paging; 110 struct x86_init_paging paging;
103 struct x86_init_timers timers; 111 struct x86_init_timers timers;
112 struct x86_init_iommu iommu;
104}; 113};
105 114
106/** 115/**
@@ -116,11 +125,14 @@ struct x86_cpuinit_ops {
116 * @calibrate_tsc: calibrate TSC 125 * @calibrate_tsc: calibrate TSC
117 * @get_wallclock: get time from HW clock like RTC etc. 126 * @get_wallclock: get time from HW clock like RTC etc.
118 * @set_wallclock: set time back to HW clock 127 * @set_wallclock: set time back to HW clock
128 * @is_untracked_pat_range exclude from PAT logic
119 */ 129 */
120struct x86_platform_ops { 130struct x86_platform_ops {
121 unsigned long (*calibrate_tsc)(void); 131 unsigned long (*calibrate_tsc)(void);
122 unsigned long (*get_wallclock)(void); 132 unsigned long (*get_wallclock)(void);
123 int (*set_wallclock)(unsigned long nowtime); 133 int (*set_wallclock)(unsigned long nowtime);
134 void (*iommu_shutdown)(void);
135 bool (*is_untracked_pat_range)(u64 start, u64 end);
124}; 136};
125 137
126extern struct x86_init_ops x86_init; 138extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90c0edf..396ff4cc8ed4 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,31 +37,4 @@
37extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
38extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
39 39
40enum xen_domain_type {
41 XEN_NATIVE, /* running on bare hardware */
42 XEN_PV_DOMAIN, /* running in a PV domain */
43 XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
44};
45
46#ifdef CONFIG_XEN
47extern enum xen_domain_type xen_domain_type;
48#else
49#define xen_domain_type XEN_NATIVE
50#endif
51
52#define xen_domain() (xen_domain_type != XEN_NATIVE)
53#define xen_pv_domain() (xen_domain() && \
54 xen_domain_type == XEN_PV_DOMAIN)
55#define xen_hvm_domain() (xen_domain() && \
56 xen_domain_type == XEN_HVM_DOMAIN)
57
58#ifdef CONFIG_XEN_DOM0
59#include <xen/interface/xen.h>
60
61#define xen_initial_domain() (xen_pv_domain() && \
62 xen_start_info->flags & SIF_INITDOMAIN)
63#else /* !CONFIG_XEN_DOM0 */
64#define xen_initial_domain() (0)
65#endif /* CONFIG_XEN_DOM0 */
66
67#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 40#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..d87f09bc5a52 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
41obj-y += bootflag.o e820.o 41obj-y += bootflag.o e820.o
42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
43obj-y += alternative.o i8253.o pci-nommu.o 43obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
44obj-y += tsc.o io_delay.o rtc.o 44obj-y += tsc.o io_delay.o rtc.o
45 45
46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -89,7 +89,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
89obj-$(CONFIG_HPET_TIMER) += hpet.o 89obj-$(CONFIG_HPET_TIMER) += hpet.o
90 90
91obj-$(CONFIG_K8_NB) += k8.o 91obj-$(CONFIG_K8_NB) += k8.o
92obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 92obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 93obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 94
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..6f35260bb3ef 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o 4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
5 5
6ifneq ($(CONFIG_ACPI_PROCESSOR),) 6ifneq ($(CONFIG_ACPI_PROCESSOR),)
7obj-y += cstate.o processor.o 7obj-y += cstate.o
8endif 8endif
9 9
10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin 10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..fb1035cd9a6a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
624 } 624 }
625 625
626 hpet_address = hpet_tbl->address.address; 626 hpet_address = hpet_tbl->address.address;
627 hpet_blockid = hpet_tbl->sequence;
627 628
628 /* 629 /*
629 * Some broken BIOSes advertise HPET at 0x0. We really do not 630 * Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1123,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1122 if (!acpi_sci_override_gsi) 1123 if (!acpi_sci_override_gsi)
1123 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1124 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
1124 1125
1125 /* Fill in identity legacy mapings where no override */ 1126 /* Fill in identity legacy mappings where no override */
1126 mp_config_acpi_legacy_irqs(); 1127 mp_config_acpi_legacy_irqs();
1127 1128
1128 count = 1129 count =
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 59cdfa4686b2..2e837f5080fe 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
48 * P4, Core and beyond CPUs 48 * P4, Core and beyond CPUs
49 */ 49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL && 50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) 51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
52 flags->bm_control = 0; 52 flags->bm_control = 0;
53} 53}
54EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d296f4a195c9..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,100 +0,0 @@
1/*
2 * Copyright (C) 2005 Intel Corporation
3 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
4 * - Added _PDC for platforms with Intel CPUs
5 */
6
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/init.h>
10#include <linux/acpi.h>
11
12#include <acpi/processor.h>
13#include <asm/acpi.h>
14
15static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
16{
17 struct acpi_object_list *obj_list;
18 union acpi_object *obj;
19 u32 *buf;
20
21 /* allocate and initialize pdc. It will be used later. */
22 obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
23 if (!obj_list) {
24 printk(KERN_ERR "Memory allocation error\n");
25 return;
26 }
27
28 obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
29 if (!obj) {
30 printk(KERN_ERR "Memory allocation error\n");
31 kfree(obj_list);
32 return;
33 }
34
35 buf = kmalloc(12, GFP_KERNEL);
36 if (!buf) {
37 printk(KERN_ERR "Memory allocation error\n");
38 kfree(obj);
39 kfree(obj_list);
40 return;
41 }
42
43 buf[0] = ACPI_PDC_REVISION_ID;
44 buf[1] = 1;
45 buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
46
47 /*
48 * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
49 * that OSPM is capable of native ACPI throttling software
50 * coordination using BIOS supplied _TSD info.
51 */
52 buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
53 if (cpu_has(c, X86_FEATURE_EST))
54 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
55
56 if (cpu_has(c, X86_FEATURE_ACPI))
57 buf[2] |= ACPI_PDC_T_FFH;
58
59 /*
60 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
61 */
62 if (!cpu_has(c, X86_FEATURE_MWAIT))
63 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
64
65 obj->type = ACPI_TYPE_BUFFER;
66 obj->buffer.length = 12;
67 obj->buffer.pointer = (u8 *) buf;
68 obj_list->count = 1;
69 obj_list->pointer = obj;
70 pr->pdc = obj_list;
71
72 return;
73}
74
75
76/* Initialize _PDC data based on the CPU vendor */
77void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
78{
79 struct cpuinfo_x86 *c = &cpu_data(pr->id);
80
81 pr->pdc = NULL;
82 if (c->x86_vendor == X86_VENDOR_INTEL)
83 init_intel_pdc(pr, c);
84
85 return;
86}
87
88EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
89
90void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
91{
92 if (pr->pdc) {
93 kfree(pr->pdc->pointer->buffer.pointer);
94 kfree(pr->pdc->pointer);
95 kfree(pr->pdc);
96 pr->pdc = NULL;
97 }
98}
99
100EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cda..060fff8f5c5b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -57,5 +57,8 @@ SECTIONS
57 *(.note*) 57 *(.note*)
58 } 58 }
59 59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
60 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); 63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
61} 64}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
78#ifndef CONFIG_64BIT 78#ifndef CONFIG_64BIT
79 store_gdt((struct desc_ptr *)&header->pmode_gdt); 79 store_gdt((struct desc_ptr *)&header->pmode_gdt);
80 80
81 header->pmode_efer_low = nx_enabled; 81 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
82 if (header->pmode_efer_low & 1) { 82 &header->pmode_efer_high))
83 /* This is strange, why not save efer, always? */ 83 header->pmode_efer_low = header->pmode_efer_high = 0;
84 rdmsr(MSR_EFER, header->pmode_efer_low,
85 header->pmode_efer_high);
86 }
87#endif /* !CONFIG_64BIT */ 84#endif /* !CONFIG_64BIT */
88 85
89 header->pmode_cr0 = read_cr0(); 86 header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
119 116
120 117
121/** 118/**
122 * acpi_reserve_bootmem - do _very_ early ACPI initialisation 119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 * 120 *
124 * We allocate a page from the first 1MB of memory for the wakeup 121 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The 122 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not 123 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages. 124 * <1MB pages.
128 */ 125 */
129void __init acpi_reserve_bootmem(void) 126void __init acpi_reserve_wakeup_memory(void)
130{ 127{
128 unsigned long mem;
129
131 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
132 printk(KERN_ERR 131 printk(KERN_ERR
133 "ACPI: Wakeup code way too big, S3 disabled.\n"); 132 "ACPI: Wakeup code way too big, S3 disabled.\n");
134 return; 133 return;
135 } 134 }
136 135
137 acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); 136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
138 137
139 if (!acpi_realmode) { 138 if (mem == -1L) {
140 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
141 return; 140 return;
142 } 141 }
143 142 acpi_realmode = (unsigned long) phys_to_virt(mem);
144 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); 143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 145}
146 146
147 147
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
162#endif 162#endif
163 if (strncmp(str, "old_ordering", 12) == 0) 163 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 164 acpi_old_suspend_ordering();
165 if (strncmp(str, "sci_force_enable", 16) == 0)
166 acpi_set_sci_en_on_resume();
165 str = strchr(str, ','); 167 str = strchr(str, ',');
166 if (str != NULL) 168 if (str != NULL)
167 str += strspn(str, ", \t"); 169 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f6a28d..23824fef789c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -19,7 +19,7 @@
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/bitops.h> 22#include <linux/bitmap.h>
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
@@ -28,6 +28,7 @@
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <asm/iommu.h> 29#include <asm/iommu.h>
30#include <asm/gart.h> 30#include <asm/gart.h>
31#include <asm/amd_iommu_proto.h>
31#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
32#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
33 34
@@ -56,20 +57,152 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
89 u16 alias = amd_iommu_alias_table[devid];
90
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* No device or no PCI device */
121 if (!dev || dev->bus != &pci_bus_type)
122 return false;
123
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
169
170void __init amd_iommu_uninit_devices(void)
171{
172 struct pci_dev *pdev = NULL;
173
174 for_each_pci_dev(pdev) {
175
176 if (!check_device(&pdev->dev))
177 continue;
178
179 iommu_uninit_device(&pdev->dev);
180 }
181}
182
183int __init amd_iommu_init_devices(void)
184{
185 struct pci_dev *pdev = NULL;
186 int ret = 0;
187
188 for_each_pci_dev(pdev) {
189
190 if (!check_device(&pdev->dev))
191 continue;
192
193 ret = iommu_init_device(&pdev->dev);
194 if (ret)
195 goto out_free;
196 }
197
198 return 0;
199
200out_free:
201
202 amd_iommu_uninit_devices();
203
204 return ret;
205}
73#ifdef CONFIG_AMD_IOMMU_STATS 206#ifdef CONFIG_AMD_IOMMU_STATS
74 207
75/* 208/*
@@ -90,7 +223,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 223DECLARE_STATS_COUNTER(total_map_requests);
91 224
92static struct dentry *stats_dir; 225static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 226static struct dentry *de_fflush;
95 227
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 228static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +240,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 240 if (stats_dir == NULL)
109 return; 241 return;
110 242
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 243 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 244 (u32 *)&amd_iommu_unmap_flush);
116 245
@@ -130,12 +259,6 @@ static void amd_iommu_stats_init(void)
130 259
131#endif 260#endif
132 261
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 262/****************************************************************************
140 * 263 *
141 * Interrupt handling functions 264 * Interrupt handling functions
@@ -199,6 +322,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 322 break;
200 case EVENT_TYPE_ILL_CMD: 323 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 326 reset_iommu_command_buffer(iommu);
203 dump_command(address); 327 dump_command(address);
204 break; 328 break;
@@ -321,11 +445,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 445 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 446 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 447
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 448 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 449 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 450}
330 451
331/* 452/*
@@ -372,26 +493,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 493out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 494 spin_unlock_irqrestore(&iommu->lock, flags);
374 495
496 if (iommu->reset_in_progress)
497 reset_iommu_command_buffer(iommu);
498
375 return 0; 499 return 0;
376} 500}
377 501
502static void iommu_flush_complete(struct protection_domain *domain)
503{
504 int i;
505
506 for (i = 0; i < amd_iommus_present; ++i) {
507 if (!domain->dev_iommu[i])
508 continue;
509
510 /*
511 * Devices of this domain are behind this IOMMU
512 * We need to wait for completion of all commands.
513 */
514 iommu_completion_wait(amd_iommus[i]);
515 }
516}
517
378/* 518/*
379 * Command send function for invalidating a device table entry 519 * Command send function for invalidating a device table entry
380 */ 520 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 521static int iommu_flush_device(struct device *dev)
382{ 522{
523 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 524 struct iommu_cmd cmd;
384 int ret; 525 u16 devid;
385 526
386 BUG_ON(iommu == NULL); 527 devid = get_device_id(dev);
528 iommu = amd_iommu_rlookup_table[devid];
387 529
530 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 531 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 532 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 533 cmd.data[0] = devid;
391 534
392 ret = iommu_queue_command(iommu, &cmd); 535 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 536}
396 537
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 538static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +571,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 571 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 572 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 573 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 574static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 575 u64 address, size_t size, int pde)
435{ 576{
436 int s = 0; 577 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 578 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 579
439 address &= PAGE_MASK; 580 address &= PAGE_MASK;
440 581
@@ -447,142 +588,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 588 s = 1;
448 } 589 }
449 590
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 591
452 return 0; 592 for (i = 0; i < amd_iommus_present; ++i) {
593 if (!domain->dev_iommu[i])
594 continue;
595
596 /*
597 * Devices of this domain are behind this IOMMU
598 * We need a TLB flush
599 */
600 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
601 domain->id, pde, s);
602 }
603
604 return;
453} 605}
454 606
455/* Flush the whole IO/TLB for a given protection domain */ 607static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 608 u64 address, size_t size)
457{ 609{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 610 __iommu_flush_pages(domain, address, size, 0);
459 611}
460 INC_STATS_COUNTER(domain_flush_single);
461 612
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 613/* Flush the whole IO/TLB for a given protection domain */
614static void iommu_flush_tlb(struct protection_domain *domain)
615{
616 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 617}
464 618
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 619/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 620static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 621{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 622 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 623}
474 624
625
475/* 626/*
476 * This function flushes one domain on one IOMMU 627 * This function flushes the DTEs for all devices in domain
477 */ 628 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 629static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 630{
480 struct iommu_cmd cmd; 631 struct iommu_dev_data *dev_data;
481 unsigned long flags; 632 unsigned long flags;
482 633
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 634 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 635
486 spin_lock_irqsave(&iommu->lock, flags); 636 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 637 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 638
489 __iommu_wait_for_completion(iommu); 639 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 640}
492 641
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 642static void iommu_flush_all_domain_devices(void)
494{ 643{
495 int i; 644 struct protection_domain *domain;
645 unsigned long flags;
496 646
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 647 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 648
499 continue; 649 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 650 iommu_flush_domain_devices(domain);
651 iommu_flush_complete(domain);
501 } 652 }
502 653
654 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
655}
656
657void amd_iommu_flush_all_devices(void)
658{
659 iommu_flush_all_domain_devices();
503} 660}
504 661
505/* 662/*
506 * This function is used to flush the IO/TLB for a given protection domain 663 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 664 * this is no issue because it is only called during resume.
508 */ 665 */
509static void iommu_flush_domain(u16 domid) 666void amd_iommu_flush_all_domains(void)
510{ 667{
511 struct amd_iommu *iommu; 668 struct protection_domain *domain;
669 unsigned long flags;
512 670
513 INC_STATS_COUNTER(domain_flush_all); 671 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 672
515 for_each_iommu(iommu) 673 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 674 spin_lock(&domain->lock);
675 iommu_flush_tlb_pde(domain);
676 iommu_flush_complete(domain);
677 spin_unlock(&domain->lock);
678 }
679
680 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 681}
518 682
519void amd_iommu_flush_all_domains(void) 683static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 684{
521 struct amd_iommu *iommu; 685 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 686
523 for_each_iommu(iommu) 687 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 688 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
689
690 amd_iommu_reset_cmd_buffer(iommu);
691 amd_iommu_flush_all_devices();
692 amd_iommu_flush_all_domains();
693
694 iommu->reset_in_progress = false;
525} 695}
526 696
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 697/****************************************************************************
698 *
699 * The functions below are used the create the page table mappings for
700 * unity mapped regions.
701 *
702 ****************************************************************************/
703
704/*
705 * This function is used to add another level to an IO page table. Adding
706 * another level increases the size of the address space by 9 bits to a size up
707 * to 64 bits.
708 */
709static bool increase_address_space(struct protection_domain *domain,
710 gfp_t gfp)
528{ 711{
529 int i; 712 u64 *pte;
530 713
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 714 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 715 /* address space already 64 bit large */
533 continue; 716 return false;
534 717
535 iommu_queue_inv_dev_entry(iommu, i); 718 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 719 if (!pte)
537 } 720 return false;
721
722 *pte = PM_LEVEL_PDE(domain->mode,
723 virt_to_phys(domain->pt_root));
724 domain->pt_root = pte;
725 domain->mode += 1;
726 domain->updated = true;
727
728 return true;
538} 729}
539 730
540static void flush_devices_by_domain(struct protection_domain *domain) 731static u64 *alloc_pte(struct protection_domain *domain,
732 unsigned long address,
733 int end_lvl,
734 u64 **pte_page,
735 gfp_t gfp)
541{ 736{
542 struct amd_iommu *iommu; 737 u64 *pte, *page;
543 int i; 738 int level;
544 739
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 740 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 741 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 742
550 iommu = amd_iommu_rlookup_table[i]; 743 level = domain->mode - 1;
551 if (!iommu) 744 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue; 745
746 while (level > end_lvl) {
747 if (!IOMMU_PTE_PRESENT(*pte)) {
748 page = (u64 *)get_zeroed_page(gfp);
749 if (!page)
750 return NULL;
751 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
752 }
753
754 level -= 1;
755
756 pte = IOMMU_PTE_PAGE(*pte);
757
758 if (pte_page && level == end_lvl)
759 *pte_page = pte;
553 760
554 iommu_queue_inv_dev_entry(iommu, i); 761 pte = &pte[PM_LEVEL_INDEX(level, address)];
555 iommu_completion_wait(iommu);
556 } 762 }
763
764 return pte;
557} 765}
558 766
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 767/*
768 * This function checks if there is a PTE for a given dma address. If
769 * there is one, it returns the pointer to it.
770 */
771static u64 *fetch_pte(struct protection_domain *domain,
772 unsigned long address, int map_size)
560{ 773{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 774 int level;
775 u64 *pte;
562 776
563 if (iommu->reset_in_progress) 777 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 778 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 779
566 iommu->reset_in_progress = true; 780 while (level > map_size) {
781 if (!IOMMU_PTE_PRESENT(*pte))
782 return NULL;
567 783
568 amd_iommu_reset_cmd_buffer(iommu); 784 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 785
572 iommu->reset_in_progress = false; 786 pte = IOMMU_PTE_PAGE(*pte);
573} 787 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 788
575void amd_iommu_flush_all_devices(void) 789 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 790 pte = NULL;
577 flush_devices_by_domain(NULL); 791 break;
578} 792 }
793 }
579 794
580/**************************************************************************** 795 return pte;
581 * 796}
582 * The functions below are used the create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 797
587/* 798/*
588 * Generic mapping functions. It maps a physical address into a DMA 799 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +865,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 865}
655 866
656/* 867/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 868 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 869 * dma_ops domain.
681 */ 870 */
@@ -704,6 +893,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 893}
705 894
706/* 895/*
896 * Init the unity mappings for a specific IOMMU in the system
897 *
898 * Basically iterates over all unity mapping entries and applies them to
899 * the default domain DMA of that IOMMU if necessary.
900 */
901static int iommu_init_unity_mappings(struct amd_iommu *iommu)
902{
903 struct unity_map_entry *entry;
904 int ret;
905
906 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
907 if (!iommu_for_unity_map(iommu, entry))
908 continue;
909 ret = dma_ops_unity_map(iommu->default_dom, entry);
910 if (ret)
911 return ret;
912 }
913
914 return 0;
915}
916
917/*
707 * Inits the unity mappings required for a specific device 918 * Inits the unity mappings required for a specific device
708 */ 919 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 920static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +951,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 951 */
741 952
742/* 953/*
743 * This function checks if there is a PTE for a given dma address. If 954 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 955 * ranges.
745 */ 956 */
746static u64 *fetch_pte(struct protection_domain *domain, 957static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 958 unsigned long start_page,
959 unsigned int pages)
748{ 960{
749 int level; 961 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 962
761 pte = IOMMU_PTE_PAGE(*pte); 963 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 964 pages = last_page - start_page;
763 965
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 966 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 967 int index = i / APERTURE_RANGE_PAGES;
766 break; 968 int page = i % APERTURE_RANGE_PAGES;
767 } 969 __set_bit(page, dom->aperture[index]->bitmap);
768 } 970 }
769
770 return pte;
771} 971}
772 972
773/* 973/*
@@ -775,11 +975,11 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 975 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 976 * failure.
777 */ 977 */
778static int alloc_new_range(struct amd_iommu *iommu, 978static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 979 bool populate, gfp_t gfp)
781{ 980{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 981 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
982 struct amd_iommu *iommu;
783 int i; 983 int i;
784 984
785#ifdef CONFIG_IOMMU_STRESS 985#ifdef CONFIG_IOMMU_STRESS
@@ -819,14 +1019,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1019 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 1020
821 /* Intialize the exclusion range if necessary */ 1021 /* Intialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 1022 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 1023 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 1024 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 1025 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 1026 unsigned long startpage;
827 iommu->exclusion_length, 1027 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 1028 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 1029 PAGE_SIZE);
1030 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1031 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1032 }
830 } 1033 }
831 1034
832 /* 1035 /*
@@ -928,7 +1131,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1131 }
929 1132
930 if (unlikely(address == -1)) 1133 if (unlikely(address == -1))
931 address = bad_dma_address; 1134 address = DMA_ERROR_CODE;
932 1135
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1136 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1137
@@ -959,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
959 1162
960 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; 1163 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
961 1164
962 iommu_area_free(range->bitmap, address, pages); 1165 bitmap_clear(range->bitmap, address, pages);
963 1166
964} 1167}
965 1168
@@ -973,6 +1176,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1176 *
974 ****************************************************************************/ 1177 ****************************************************************************/
975 1178
1179/*
1180 * This function adds a protection domain to the global protection domain list
1181 */
1182static void add_domain_to_list(struct protection_domain *domain)
1183{
1184 unsigned long flags;
1185
1186 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1187 list_add(&domain->list, &amd_iommu_pd_list);
1188 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1189}
1190
1191/*
1192 * This function removes a protection domain to the global
1193 * protection domain list
1194 */
1195static void del_domain_from_list(struct protection_domain *domain)
1196{
1197 unsigned long flags;
1198
1199 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1200 list_del(&domain->list);
1201 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1202}
1203
976static u16 domain_id_alloc(void) 1204static u16 domain_id_alloc(void)
977{ 1205{
978 unsigned long flags; 1206 unsigned long flags;
@@ -1000,26 +1228,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1228 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1229}
1002 1230
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges.
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1231static void free_pagetable(struct protection_domain *domain)
1024{ 1232{
1025 int i, j; 1233 int i, j;
@@ -1061,6 +1269,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1269 if (!dom)
1062 return; 1270 return;
1063 1271
1272 del_domain_from_list(&dom->domain);
1273
1064 free_pagetable(&dom->domain); 1274 free_pagetable(&dom->domain);
1065 1275
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1276 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1288,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also intializes the page table and the address allocator data 1288 * It also intializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1289 * structures required for the dma_ops interface
1080 */ 1290 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1291static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1292{
1083 struct dma_ops_domain *dma_dom; 1293 struct dma_ops_domain *dma_dom;
1084 1294
@@ -1091,6 +1301,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1301 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1302 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1303 goto free_dma_dom;
1304 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1305 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1306 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1307 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1312,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1312 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1313 dma_dom->target_dev = 0xffff;
1103 1314
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1315 add_domain_to_list(&dma_dom->domain);
1316
1317 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1318 goto free_dma_dom;
1106 1319
1107 /* 1320 /*
@@ -1129,22 +1342,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1342 return domain->flags & PD_DMA_OPS_MASK;
1130} 1343}
1131 1344
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1345static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1346{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1347 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1353,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1353 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1354 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1355 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1356}
1159 1357
1160 amd_iommu_pd_table[devid] = domain; 1358static void clear_dte_entry(u16 devid)
1359{
1360 /* remove entry from the device table seen by the hardware */
1361 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1362 amd_iommu_dev_table[devid].data[1] = 0;
1363 amd_iommu_dev_table[devid].data[2] = 0;
1364
1365 amd_iommu_apply_erratum_63(devid);
1366}
1367
1368static void do_attach(struct device *dev, struct protection_domain *domain)
1369{
1370 struct iommu_dev_data *dev_data;
1371 struct amd_iommu *iommu;
1372 u16 devid;
1373
1374 devid = get_device_id(dev);
1375 iommu = amd_iommu_rlookup_table[devid];
1376 dev_data = get_dev_data(dev);
1377
1378 /* Update data structures */
1379 dev_data->domain = domain;
1380 list_add(&dev_data->list, &domain->dev_list);
1381 set_dte_entry(devid, domain);
1382
1383 /* Do reference counting */
1384 domain->dev_iommu[iommu->index] += 1;
1385 domain->dev_cnt += 1;
1386
1387 /* Flush the DTE entry */
1388 iommu_flush_device(dev);
1389}
1390
1391static void do_detach(struct device *dev)
1392{
1393 struct iommu_dev_data *dev_data;
1394 struct amd_iommu *iommu;
1395 u16 devid;
1396
1397 devid = get_device_id(dev);
1398 iommu = amd_iommu_rlookup_table[devid];
1399 dev_data = get_dev_data(dev);
1400
1401 /* decrease reference counters */
1402 dev_data->domain->dev_iommu[iommu->index] -= 1;
1403 dev_data->domain->dev_cnt -= 1;
1404
1405 /* Update data structures */
1406 dev_data->domain = NULL;
1407 list_del(&dev_data->list);
1408 clear_dte_entry(devid);
1409
1410 /* Flush the DTE entry */
1411 iommu_flush_device(dev);
1161} 1412}
1162 1413
1163/* 1414/*
1164 * If a device is not yet associated with a domain, this function does 1415 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware 1416 * assigns it visible for the hardware
1166 */ 1417 */
1167static void __attach_device(struct amd_iommu *iommu, 1418static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1419 struct protection_domain *domain)
1169 u16 devid)
1170{ 1420{
1421 struct iommu_dev_data *dev_data, *alias_data;
1422
1423 dev_data = get_dev_data(dev);
1424 alias_data = get_dev_data(dev_data->alias);
1425
1426 if (!alias_data)
1427 return -EINVAL;
1428
1171 /* lock domain */ 1429 /* lock domain */
1172 spin_lock(&domain->lock); 1430 spin_lock(&domain->lock);
1173 1431
1174 /* update DTE entry */ 1432 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1433 if (alias_data->domain != NULL &&
1434 alias_data->domain != domain)
1435 return -EBUSY;
1436
1437 if (dev_data->domain != NULL &&
1438 dev_data->domain != domain)
1439 return -EBUSY;
1176 1440
1177 domain->dev_cnt += 1; 1441 /* Do real assignment */
1442 if (dev_data->alias != dev) {
1443 alias_data = get_dev_data(dev_data->alias);
1444 if (alias_data->domain == NULL)
1445 do_attach(dev_data->alias, domain);
1446
1447 atomic_inc(&alias_data->bind);
1448 }
1449
1450 if (dev_data->domain == NULL)
1451 do_attach(dev, domain);
1452
1453 atomic_inc(&dev_data->bind);
1178 1454
1179 /* ready */ 1455 /* ready */
1180 spin_unlock(&domain->lock); 1456 spin_unlock(&domain->lock);
1457
1458 return 0;
1181} 1459}
1182 1460
1183/* 1461/*
1184 * If a device is not yet associated with a domain, this function does 1462 * If a device is not yet associated with a domain, this function does
1185 * assigns it visible for the hardware 1463 * assigns it visible for the hardware
1186 */ 1464 */
1187static void attach_device(struct amd_iommu *iommu, 1465static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1466 struct protection_domain *domain)
1189 u16 devid)
1190{ 1467{
1191 unsigned long flags; 1468 unsigned long flags;
1469 int ret;
1192 1470
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1471 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1472 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1473 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1474
1197 /* 1475 /*
@@ -1199,96 +1477,125 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1477 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1478 * here to evict all dirty stuff.
1201 */ 1479 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1480 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1481
1482 return ret;
1204} 1483}
1205 1484
1206/* 1485/*
1207 * Removes a device from a protection domain (unlocked) 1486 * Removes a device from a protection domain (unlocked)
1208 */ 1487 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1488static void __detach_device(struct device *dev)
1210{ 1489{
1490 struct iommu_dev_data *dev_data = get_dev_data(dev);
1491 struct iommu_dev_data *alias_data;
1492 unsigned long flags;
1211 1493
1212 /* lock domain */ 1494 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214 1495
1215 /* remove domain from the lookup table */ 1496 spin_lock_irqsave(&dev_data->domain->lock, flags);
1216 amd_iommu_pd_table[devid] = NULL;
1217 1497
1218 /* remove entry from the device table seen by the hardware */ 1498 if (dev_data->alias != dev) {
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; 1499 alias_data = get_dev_data(dev_data->alias);
1220 amd_iommu_dev_table[devid].data[1] = 0; 1500 if (atomic_dec_and_test(&alias_data->bind))
1221 amd_iommu_dev_table[devid].data[2] = 0; 1501 do_detach(dev_data->alias);
1502 }
1222 1503
1223 /* decrease reference counter */ 1504 if (atomic_dec_and_test(&dev_data->bind))
1224 domain->dev_cnt -= 1; 1505 do_detach(dev);
1225 1506
1226 /* ready */ 1507 spin_unlock_irqrestore(&dev_data->domain->lock, flags);
1227 spin_unlock(&domain->lock);
1228 1508
1229 /* 1509 /*
1230 * If we run in passthrough mode the device must be assigned to the 1510 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain 1511 * passthrough domain if it is detached from any other domain
1232 */ 1512 */
1233 if (iommu_pass_through) { 1513 if (iommu_pass_through && dev_data->domain == NULL)
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1514 __attach_device(dev, pt_domain);
1235 __attach_device(iommu, pt_domain, devid);
1236 }
1237} 1515}
1238 1516
1239/* 1517/*
1240 * Removes a device from a protection domain (with devtable_lock held) 1518 * Removes a device from a protection domain (with devtable_lock held)
1241 */ 1519 */
1242static void detach_device(struct protection_domain *domain, u16 devid) 1520static void detach_device(struct device *dev)
1243{ 1521{
1244 unsigned long flags; 1522 unsigned long flags;
1245 1523
1246 /* lock device table */ 1524 /* lock device table */
1247 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1525 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1248 __detach_device(domain, devid); 1526 __detach_device(dev);
1249 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1527 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1250} 1528}
1251 1529
1530/*
1531 * Find out the protection domain structure for a given PCI device. This
1532 * will give us the pointer to the page table root for example.
1533 */
1534static struct protection_domain *domain_for_device(struct device *dev)
1535{
1536 struct protection_domain *dom;
1537 struct iommu_dev_data *dev_data, *alias_data;
1538 unsigned long flags;
1539 u16 devid, alias;
1540
1541 devid = get_device_id(dev);
1542 alias = amd_iommu_alias_table[devid];
1543 dev_data = get_dev_data(dev);
1544 alias_data = get_dev_data(dev_data->alias);
1545 if (!alias_data)
1546 return NULL;
1547
1548 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1549 dom = dev_data->domain;
1550 if (dom == NULL &&
1551 alias_data->domain != NULL) {
1552 __attach_device(dev, alias_data->domain);
1553 dom = alias_data->domain;
1554 }
1555
1556 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1557
1558 return dom;
1559}
1560
1252static int device_change_notifier(struct notifier_block *nb, 1561static int device_change_notifier(struct notifier_block *nb,
1253 unsigned long action, void *data) 1562 unsigned long action, void *data)
1254{ 1563{
1255 struct device *dev = data; 1564 struct device *dev = data;
1256 struct pci_dev *pdev = to_pci_dev(dev); 1565 u16 devid;
1257 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1258 struct protection_domain *domain; 1566 struct protection_domain *domain;
1259 struct dma_ops_domain *dma_domain; 1567 struct dma_ops_domain *dma_domain;
1260 struct amd_iommu *iommu; 1568 struct amd_iommu *iommu;
1261 unsigned long flags; 1569 unsigned long flags;
1262 1570
1263 if (devid > amd_iommu_last_bdf) 1571 if (!check_device(dev))
1264 goto out; 1572 return 0;
1265
1266 devid = amd_iommu_alias_table[devid];
1267
1268 iommu = amd_iommu_rlookup_table[devid];
1269 if (iommu == NULL)
1270 goto out;
1271
1272 domain = domain_for_device(devid);
1273 1573
1274 if (domain && !dma_ops_domain(domain)) 1574 devid = get_device_id(dev);
1275 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1575 iommu = amd_iommu_rlookup_table[devid];
1276 "to a non-dma-ops domain\n", dev_name(dev));
1277 1576
1278 switch (action) { 1577 switch (action) {
1279 case BUS_NOTIFY_UNBOUND_DRIVER: 1578 case BUS_NOTIFY_UNBOUND_DRIVER:
1579
1580 domain = domain_for_device(dev);
1581
1280 if (!domain) 1582 if (!domain)
1281 goto out; 1583 goto out;
1282 if (iommu_pass_through) 1584 if (iommu_pass_through)
1283 break; 1585 break;
1284 detach_device(domain, devid); 1586 detach_device(dev);
1285 break; 1587 break;
1286 case BUS_NOTIFY_ADD_DEVICE: 1588 case BUS_NOTIFY_ADD_DEVICE:
1589
1590 iommu_init_device(dev);
1591
1592 domain = domain_for_device(dev);
1593
1287 /* allocate a protection domain if a device is added */ 1594 /* allocate a protection domain if a device is added */
1288 dma_domain = find_protection_domain(devid); 1595 dma_domain = find_protection_domain(devid);
1289 if (dma_domain) 1596 if (dma_domain)
1290 goto out; 1597 goto out;
1291 dma_domain = dma_ops_domain_alloc(iommu); 1598 dma_domain = dma_ops_domain_alloc();
1292 if (!dma_domain) 1599 if (!dma_domain)
1293 goto out; 1600 goto out;
1294 dma_domain->target_dev = devid; 1601 dma_domain->target_dev = devid;
@@ -1298,11 +1605,15 @@ static int device_change_notifier(struct notifier_block *nb,
1298 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1605 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1299 1606
1300 break; 1607 break;
1608 case BUS_NOTIFY_DEL_DEVICE:
1609
1610 iommu_uninit_device(dev);
1611
1301 default: 1612 default:
1302 goto out; 1613 goto out;
1303 } 1614 }
1304 1615
1305 iommu_queue_inv_dev_entry(iommu, devid); 1616 iommu_flush_device(dev);
1306 iommu_completion_wait(iommu); 1617 iommu_completion_wait(iommu);
1307 1618
1308out: 1619out:
@@ -1313,6 +1624,11 @@ static struct notifier_block device_nb = {
1313 .notifier_call = device_change_notifier, 1624 .notifier_call = device_change_notifier,
1314}; 1625};
1315 1626
1627void amd_iommu_init_notifier(void)
1628{
1629 bus_register_notifier(&pci_bus_type, &device_nb);
1630}
1631
1316/***************************************************************************** 1632/*****************************************************************************
1317 * 1633 *
1318 * The next functions belong to the dma_ops mapping/unmapping code. 1634 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1320,106 +1636,46 @@ static struct notifier_block device_nb = {
1320 *****************************************************************************/ 1636 *****************************************************************************/
1321 1637
1322/* 1638/*
1323 * This function checks if the driver got a valid device from the caller to
1324 * avoid dereferencing invalid pointers.
1325 */
1326static bool check_device(struct device *dev)
1327{
1328 if (!dev || !dev->dma_mask)
1329 return false;
1330
1331 return true;
1332}
1333
1334/*
1335 * In this function the list of preallocated protection domains is traversed to
1336 * find the domain for a specific device
1337 */
1338static struct dma_ops_domain *find_protection_domain(u16 devid)
1339{
1340 struct dma_ops_domain *entry, *ret = NULL;
1341 unsigned long flags;
1342
1343 if (list_empty(&iommu_pd_list))
1344 return NULL;
1345
1346 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1347
1348 list_for_each_entry(entry, &iommu_pd_list, list) {
1349 if (entry->target_dev == devid) {
1350 ret = entry;
1351 break;
1352 }
1353 }
1354
1355 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1356
1357 return ret;
1358}
1359
1360/*
1361 * In the dma_ops path we only have the struct device. This function 1639 * In the dma_ops path we only have the struct device. This function
1362 * finds the corresponding IOMMU, the protection domain and the 1640 * finds the corresponding IOMMU, the protection domain and the
1363 * requestor id for a given device. 1641 * requestor id for a given device.
1364 * If the device is not yet associated with a domain this is also done 1642 * If the device is not yet associated with a domain this is also done
1365 * in this function. 1643 * in this function.
1366 */ 1644 */
1367static int get_device_resources(struct device *dev, 1645static struct protection_domain *get_domain(struct device *dev)
1368 struct amd_iommu **iommu,
1369 struct protection_domain **domain,
1370 u16 *bdf)
1371{ 1646{
1647 struct protection_domain *domain;
1372 struct dma_ops_domain *dma_dom; 1648 struct dma_ops_domain *dma_dom;
1373 struct pci_dev *pcidev; 1649 u16 devid = get_device_id(dev);
1374 u16 _bdf;
1375
1376 *iommu = NULL;
1377 *domain = NULL;
1378 *bdf = 0xffff;
1379
1380 if (dev->bus != &pci_bus_type)
1381 return 0;
1382
1383 pcidev = to_pci_dev(dev);
1384 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1385 1650
1386 /* device not translated by any IOMMU in the system? */ 1651 if (!check_device(dev))
1387 if (_bdf > amd_iommu_last_bdf) 1652 return ERR_PTR(-EINVAL);
1388 return 0;
1389 1653
1390 *bdf = amd_iommu_alias_table[_bdf]; 1654 domain = domain_for_device(dev);
1655 if (domain != NULL && !dma_ops_domain(domain))
1656 return ERR_PTR(-EBUSY);
1391 1657
1392 *iommu = amd_iommu_rlookup_table[*bdf]; 1658 if (domain != NULL)
1393 if (*iommu == NULL) 1659 return domain;
1394 return 0;
1395 *domain = domain_for_device(*bdf);
1396 if (*domain == NULL) {
1397 dma_dom = find_protection_domain(*bdf);
1398 if (!dma_dom)
1399 dma_dom = (*iommu)->default_dom;
1400 *domain = &dma_dom->domain;
1401 attach_device(*iommu, *domain, *bdf);
1402 DUMP_printk("Using protection domain %d for device %s\n",
1403 (*domain)->id, dev_name(dev));
1404 }
1405 1660
1406 if (domain_for_device(_bdf) == NULL) 1661 /* Device not bount yet - bind it */
1407 attach_device(*iommu, *domain, _bdf); 1662 dma_dom = find_protection_domain(devid);
1663 if (!dma_dom)
1664 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1665 attach_device(dev, &dma_dom->domain);
1666 DUMP_printk("Using protection domain %d for device %s\n",
1667 dma_dom->domain.id, dev_name(dev));
1408 1668
1409 return 1; 1669 return &dma_dom->domain;
1410} 1670}
1411 1671
1412static void update_device_table(struct protection_domain *domain) 1672static void update_device_table(struct protection_domain *domain)
1413{ 1673{
1414 unsigned long flags; 1674 struct iommu_dev_data *dev_data;
1415 int i;
1416 1675
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1676 list_for_each_entry(dev_data, &domain->dev_list, list) {
1418 if (amd_iommu_pd_table[i] != domain) 1677 u16 devid = get_device_id(dev_data->dev);
1419 continue; 1678 set_dte_entry(devid, domain);
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 } 1679 }
1424} 1680}
1425 1681
@@ -1429,76 +1685,13 @@ static void update_domain(struct protection_domain *domain)
1429 return; 1685 return;
1430 1686
1431 update_device_table(domain); 1687 update_device_table(domain);
1432 flush_devices_by_domain(domain); 1688 iommu_flush_domain_devices(domain);
1433 iommu_flush_domain(domain->id); 1689 iommu_flush_tlb_pde(domain);
1434 1690
1435 domain->updated = false; 1691 domain->updated = false;
1436} 1692}
1437 1693
1438/* 1694/*
1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1442 */
1443static bool increase_address_space(struct protection_domain *domain,
1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
1449 /* address space already 64 bit large */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1470{
1471 u64 *pte, *page;
1472 int level;
1473
1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1476
1477 level = domain->mode - 1;
1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1479
1480 while (level > end_lvl) {
1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1487
1488 level -= 1;
1489
1490 pte = IOMMU_PTE_PAGE(*pte);
1491
1492 if (pte_page && level == end_lvl)
1493 *pte_page = pte;
1494
1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1497
1498 return pte;
1499}
1500
1501/*
1502 * This function fetches the PTE for a given address in the aperture 1695 * This function fetches the PTE for a given address in the aperture
1503 */ 1696 */
1504static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1697static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1528,8 +1721,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1528 * This is the generic map function. It maps one 4kb page at paddr to 1721 * This is the generic map function. It maps one 4kb page at paddr to
1529 * the given address in the DMA address space for the domain. 1722 * the given address in the DMA address space for the domain.
1530 */ 1723 */
1531static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1724static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1532 struct dma_ops_domain *dom,
1533 unsigned long address, 1725 unsigned long address,
1534 phys_addr_t paddr, 1726 phys_addr_t paddr,
1535 int direction) 1727 int direction)
@@ -1542,7 +1734,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1542 1734
1543 pte = dma_ops_get_pte(dom, address); 1735 pte = dma_ops_get_pte(dom, address);
1544 if (!pte) 1736 if (!pte)
1545 return bad_dma_address; 1737 return DMA_ERROR_CODE;
1546 1738
1547 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1739 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1548 1740
@@ -1563,8 +1755,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1563/* 1755/*
1564 * The generic unmapping function for on page in the DMA address space. 1756 * The generic unmapping function for on page in the DMA address space.
1565 */ 1757 */
1566static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1758static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1567 struct dma_ops_domain *dom,
1568 unsigned long address) 1759 unsigned long address)
1569{ 1760{
1570 struct aperture_range *aperture; 1761 struct aperture_range *aperture;
@@ -1595,7 +1786,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1595 * Must be called with the domain lock held. 1786 * Must be called with the domain lock held.
1596 */ 1787 */
1597static dma_addr_t __map_single(struct device *dev, 1788static dma_addr_t __map_single(struct device *dev,
1598 struct amd_iommu *iommu,
1599 struct dma_ops_domain *dma_dom, 1789 struct dma_ops_domain *dma_dom,
1600 phys_addr_t paddr, 1790 phys_addr_t paddr,
1601 size_t size, 1791 size_t size,
@@ -1623,7 +1813,7 @@ static dma_addr_t __map_single(struct device *dev,
1623retry: 1813retry:
1624 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1814 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1625 dma_mask); 1815 dma_mask);
1626 if (unlikely(address == bad_dma_address)) { 1816 if (unlikely(address == DMA_ERROR_CODE)) {
1627 /* 1817 /*
1628 * setting next_address here will let the address 1818 * setting next_address here will let the address
1629 * allocator only scan the new allocated range in the 1819 * allocator only scan the new allocated range in the
@@ -1631,11 +1821,11 @@ retry:
1631 */ 1821 */
1632 dma_dom->next_address = dma_dom->aperture_size; 1822 dma_dom->next_address = dma_dom->aperture_size;
1633 1823
1634 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1824 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1635 goto out; 1825 goto out;
1636 1826
1637 /* 1827 /*
1638 * aperture was sucessfully enlarged by 128 MB, try 1828 * aperture was successfully enlarged by 128 MB, try
1639 * allocation again 1829 * allocation again
1640 */ 1830 */
1641 goto retry; 1831 goto retry;
@@ -1643,8 +1833,8 @@ retry:
1643 1833
1644 start = address; 1834 start = address;
1645 for (i = 0; i < pages; ++i) { 1835 for (i = 0; i < pages; ++i) {
1646 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1836 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1647 if (ret == bad_dma_address) 1837 if (ret == DMA_ERROR_CODE)
1648 goto out_unmap; 1838 goto out_unmap;
1649 1839
1650 paddr += PAGE_SIZE; 1840 paddr += PAGE_SIZE;
@@ -1655,10 +1845,10 @@ retry:
1655 ADD_STATS_COUNTER(alloced_io_mem, size); 1845 ADD_STATS_COUNTER(alloced_io_mem, size);
1656 1846
1657 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1847 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1658 iommu_flush_tlb(iommu, dma_dom->domain.id); 1848 iommu_flush_tlb(&dma_dom->domain);
1659 dma_dom->need_flush = false; 1849 dma_dom->need_flush = false;
1660 } else if (unlikely(iommu_has_npcache(iommu))) 1850 } else if (unlikely(amd_iommu_np_cache))
1661 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1851 iommu_flush_pages(&dma_dom->domain, address, size);
1662 1852
1663out: 1853out:
1664 return address; 1854 return address;
@@ -1667,20 +1857,19 @@ out_unmap:
1667 1857
1668 for (--i; i >= 0; --i) { 1858 for (--i; i >= 0; --i) {
1669 start -= PAGE_SIZE; 1859 start -= PAGE_SIZE;
1670 dma_ops_domain_unmap(iommu, dma_dom, start); 1860 dma_ops_domain_unmap(dma_dom, start);
1671 } 1861 }
1672 1862
1673 dma_ops_free_addresses(dma_dom, address, pages); 1863 dma_ops_free_addresses(dma_dom, address, pages);
1674 1864
1675 return bad_dma_address; 1865 return DMA_ERROR_CODE;
1676} 1866}
1677 1867
1678/* 1868/*
1679 * Does the reverse of the __map_single function. Must be called with 1869 * Does the reverse of the __map_single function. Must be called with
1680 * the domain lock held too 1870 * the domain lock held too
1681 */ 1871 */
1682static void __unmap_single(struct amd_iommu *iommu, 1872static void __unmap_single(struct dma_ops_domain *dma_dom,
1683 struct dma_ops_domain *dma_dom,
1684 dma_addr_t dma_addr, 1873 dma_addr_t dma_addr,
1685 size_t size, 1874 size_t size,
1686 int dir) 1875 int dir)
@@ -1688,7 +1877,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1688 dma_addr_t i, start; 1877 dma_addr_t i, start;
1689 unsigned int pages; 1878 unsigned int pages;
1690 1879
1691 if ((dma_addr == bad_dma_address) || 1880 if ((dma_addr == DMA_ERROR_CODE) ||
1692 (dma_addr + size > dma_dom->aperture_size)) 1881 (dma_addr + size > dma_dom->aperture_size))
1693 return; 1882 return;
1694 1883
@@ -1697,7 +1886,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1697 start = dma_addr; 1886 start = dma_addr;
1698 1887
1699 for (i = 0; i < pages; ++i) { 1888 for (i = 0; i < pages; ++i) {
1700 dma_ops_domain_unmap(iommu, dma_dom, start); 1889 dma_ops_domain_unmap(dma_dom, start);
1701 start += PAGE_SIZE; 1890 start += PAGE_SIZE;
1702 } 1891 }
1703 1892
@@ -1706,7 +1895,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1706 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1895 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1707 1896
1708 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1897 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1709 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1898 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1710 dma_dom->need_flush = false; 1899 dma_dom->need_flush = false;
1711 } 1900 }
1712} 1901}
@@ -1720,36 +1909,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1720 struct dma_attrs *attrs) 1909 struct dma_attrs *attrs)
1721{ 1910{
1722 unsigned long flags; 1911 unsigned long flags;
1723 struct amd_iommu *iommu;
1724 struct protection_domain *domain; 1912 struct protection_domain *domain;
1725 u16 devid;
1726 dma_addr_t addr; 1913 dma_addr_t addr;
1727 u64 dma_mask; 1914 u64 dma_mask;
1728 phys_addr_t paddr = page_to_phys(page) + offset; 1915 phys_addr_t paddr = page_to_phys(page) + offset;
1729 1916
1730 INC_STATS_COUNTER(cnt_map_single); 1917 INC_STATS_COUNTER(cnt_map_single);
1731 1918
1732 if (!check_device(dev)) 1919 domain = get_domain(dev);
1733 return bad_dma_address; 1920 if (PTR_ERR(domain) == -EINVAL)
1734
1735 dma_mask = *dev->dma_mask;
1736
1737 get_device_resources(dev, &iommu, &domain, &devid);
1738
1739 if (iommu == NULL || domain == NULL)
1740 /* device not handled by any AMD IOMMU */
1741 return (dma_addr_t)paddr; 1921 return (dma_addr_t)paddr;
1922 else if (IS_ERR(domain))
1923 return DMA_ERROR_CODE;
1742 1924
1743 if (!dma_ops_domain(domain)) 1925 dma_mask = *dev->dma_mask;
1744 return bad_dma_address;
1745 1926
1746 spin_lock_irqsave(&domain->lock, flags); 1927 spin_lock_irqsave(&domain->lock, flags);
1747 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1928
1929 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1748 dma_mask); 1930 dma_mask);
1749 if (addr == bad_dma_address) 1931 if (addr == DMA_ERROR_CODE)
1750 goto out; 1932 goto out;
1751 1933
1752 iommu_completion_wait(iommu); 1934 iommu_flush_complete(domain);
1753 1935
1754out: 1936out:
1755 spin_unlock_irqrestore(&domain->lock, flags); 1937 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1764,25 +1946,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1764 enum dma_data_direction dir, struct dma_attrs *attrs) 1946 enum dma_data_direction dir, struct dma_attrs *attrs)
1765{ 1947{
1766 unsigned long flags; 1948 unsigned long flags;
1767 struct amd_iommu *iommu;
1768 struct protection_domain *domain; 1949 struct protection_domain *domain;
1769 u16 devid;
1770 1950
1771 INC_STATS_COUNTER(cnt_unmap_single); 1951 INC_STATS_COUNTER(cnt_unmap_single);
1772 1952
1773 if (!check_device(dev) || 1953 domain = get_domain(dev);
1774 !get_device_resources(dev, &iommu, &domain, &devid)) 1954 if (IS_ERR(domain))
1775 /* device not handled by any AMD IOMMU */
1776 return;
1777
1778 if (!dma_ops_domain(domain))
1779 return; 1955 return;
1780 1956
1781 spin_lock_irqsave(&domain->lock, flags); 1957 spin_lock_irqsave(&domain->lock, flags);
1782 1958
1783 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1959 __unmap_single(domain->priv, dma_addr, size, dir);
1784 1960
1785 iommu_completion_wait(iommu); 1961 iommu_flush_complete(domain);
1786 1962
1787 spin_unlock_irqrestore(&domain->lock, flags); 1963 spin_unlock_irqrestore(&domain->lock, flags);
1788} 1964}
@@ -1814,9 +1990,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1814 struct dma_attrs *attrs) 1990 struct dma_attrs *attrs)
1815{ 1991{
1816 unsigned long flags; 1992 unsigned long flags;
1817 struct amd_iommu *iommu;
1818 struct protection_domain *domain; 1993 struct protection_domain *domain;
1819 u16 devid;
1820 int i; 1994 int i;
1821 struct scatterlist *s; 1995 struct scatterlist *s;
1822 phys_addr_t paddr; 1996 phys_addr_t paddr;
@@ -1825,25 +1999,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1825 1999
1826 INC_STATS_COUNTER(cnt_map_sg); 2000 INC_STATS_COUNTER(cnt_map_sg);
1827 2001
1828 if (!check_device(dev)) 2002 domain = get_domain(dev);
2003 if (PTR_ERR(domain) == -EINVAL)
2004 return map_sg_no_iommu(dev, sglist, nelems, dir);
2005 else if (IS_ERR(domain))
1829 return 0; 2006 return 0;
1830 2007
1831 dma_mask = *dev->dma_mask; 2008 dma_mask = *dev->dma_mask;
1832 2009
1833 get_device_resources(dev, &iommu, &domain, &devid);
1834
1835 if (!iommu || !domain)
1836 return map_sg_no_iommu(dev, sglist, nelems, dir);
1837
1838 if (!dma_ops_domain(domain))
1839 return 0;
1840
1841 spin_lock_irqsave(&domain->lock, flags); 2010 spin_lock_irqsave(&domain->lock, flags);
1842 2011
1843 for_each_sg(sglist, s, nelems, i) { 2012 for_each_sg(sglist, s, nelems, i) {
1844 paddr = sg_phys(s); 2013 paddr = sg_phys(s);
1845 2014
1846 s->dma_address = __map_single(dev, iommu, domain->priv, 2015 s->dma_address = __map_single(dev, domain->priv,
1847 paddr, s->length, dir, false, 2016 paddr, s->length, dir, false,
1848 dma_mask); 2017 dma_mask);
1849 2018
@@ -1854,7 +2023,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1854 goto unmap; 2023 goto unmap;
1855 } 2024 }
1856 2025
1857 iommu_completion_wait(iommu); 2026 iommu_flush_complete(domain);
1858 2027
1859out: 2028out:
1860 spin_unlock_irqrestore(&domain->lock, flags); 2029 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1863,7 +2032,7 @@ out:
1863unmap: 2032unmap:
1864 for_each_sg(sglist, s, mapped_elems, i) { 2033 for_each_sg(sglist, s, mapped_elems, i) {
1865 if (s->dma_address) 2034 if (s->dma_address)
1866 __unmap_single(iommu, domain->priv, s->dma_address, 2035 __unmap_single(domain->priv, s->dma_address,
1867 s->dma_length, dir); 2036 s->dma_length, dir);
1868 s->dma_address = s->dma_length = 0; 2037 s->dma_address = s->dma_length = 0;
1869 } 2038 }
@@ -1882,30 +2051,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1882 struct dma_attrs *attrs) 2051 struct dma_attrs *attrs)
1883{ 2052{
1884 unsigned long flags; 2053 unsigned long flags;
1885 struct amd_iommu *iommu;
1886 struct protection_domain *domain; 2054 struct protection_domain *domain;
1887 struct scatterlist *s; 2055 struct scatterlist *s;
1888 u16 devid;
1889 int i; 2056 int i;
1890 2057
1891 INC_STATS_COUNTER(cnt_unmap_sg); 2058 INC_STATS_COUNTER(cnt_unmap_sg);
1892 2059
1893 if (!check_device(dev) || 2060 domain = get_domain(dev);
1894 !get_device_resources(dev, &iommu, &domain, &devid)) 2061 if (IS_ERR(domain))
1895 return;
1896
1897 if (!dma_ops_domain(domain))
1898 return; 2062 return;
1899 2063
1900 spin_lock_irqsave(&domain->lock, flags); 2064 spin_lock_irqsave(&domain->lock, flags);
1901 2065
1902 for_each_sg(sglist, s, nelems, i) { 2066 for_each_sg(sglist, s, nelems, i) {
1903 __unmap_single(iommu, domain->priv, s->dma_address, 2067 __unmap_single(domain->priv, s->dma_address,
1904 s->dma_length, dir); 2068 s->dma_length, dir);
1905 s->dma_address = s->dma_length = 0; 2069 s->dma_address = s->dma_length = 0;
1906 } 2070 }
1907 2071
1908 iommu_completion_wait(iommu); 2072 iommu_flush_complete(domain);
1909 2073
1910 spin_unlock_irqrestore(&domain->lock, flags); 2074 spin_unlock_irqrestore(&domain->lock, flags);
1911} 2075}
@@ -1918,49 +2082,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1918{ 2082{
1919 unsigned long flags; 2083 unsigned long flags;
1920 void *virt_addr; 2084 void *virt_addr;
1921 struct amd_iommu *iommu;
1922 struct protection_domain *domain; 2085 struct protection_domain *domain;
1923 u16 devid;
1924 phys_addr_t paddr; 2086 phys_addr_t paddr;
1925 u64 dma_mask = dev->coherent_dma_mask; 2087 u64 dma_mask = dev->coherent_dma_mask;
1926 2088
1927 INC_STATS_COUNTER(cnt_alloc_coherent); 2089 INC_STATS_COUNTER(cnt_alloc_coherent);
1928 2090
1929 if (!check_device(dev)) 2091 domain = get_domain(dev);
2092 if (PTR_ERR(domain) == -EINVAL) {
2093 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2094 *dma_addr = __pa(virt_addr);
2095 return virt_addr;
2096 } else if (IS_ERR(domain))
1930 return NULL; 2097 return NULL;
1931 2098
1932 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2099 dma_mask = dev->coherent_dma_mask;
1933 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2100 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2101 flag |= __GFP_ZERO;
1934 2102
1935 flag |= __GFP_ZERO;
1936 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2103 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1937 if (!virt_addr) 2104 if (!virt_addr)
1938 return NULL; 2105 return NULL;
1939 2106
1940 paddr = virt_to_phys(virt_addr); 2107 paddr = virt_to_phys(virt_addr);
1941 2108
1942 if (!iommu || !domain) {
1943 *dma_addr = (dma_addr_t)paddr;
1944 return virt_addr;
1945 }
1946
1947 if (!dma_ops_domain(domain))
1948 goto out_free;
1949
1950 if (!dma_mask) 2109 if (!dma_mask)
1951 dma_mask = *dev->dma_mask; 2110 dma_mask = *dev->dma_mask;
1952 2111
1953 spin_lock_irqsave(&domain->lock, flags); 2112 spin_lock_irqsave(&domain->lock, flags);
1954 2113
1955 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2114 *dma_addr = __map_single(dev, domain->priv, paddr,
1956 size, DMA_BIDIRECTIONAL, true, dma_mask); 2115 size, DMA_BIDIRECTIONAL, true, dma_mask);
1957 2116
1958 if (*dma_addr == bad_dma_address) { 2117 if (*dma_addr == DMA_ERROR_CODE) {
1959 spin_unlock_irqrestore(&domain->lock, flags); 2118 spin_unlock_irqrestore(&domain->lock, flags);
1960 goto out_free; 2119 goto out_free;
1961 } 2120 }
1962 2121
1963 iommu_completion_wait(iommu); 2122 iommu_flush_complete(domain);
1964 2123
1965 spin_unlock_irqrestore(&domain->lock, flags); 2124 spin_unlock_irqrestore(&domain->lock, flags);
1966 2125
@@ -1980,28 +2139,19 @@ static void free_coherent(struct device *dev, size_t size,
1980 void *virt_addr, dma_addr_t dma_addr) 2139 void *virt_addr, dma_addr_t dma_addr)
1981{ 2140{
1982 unsigned long flags; 2141 unsigned long flags;
1983 struct amd_iommu *iommu;
1984 struct protection_domain *domain; 2142 struct protection_domain *domain;
1985 u16 devid;
1986 2143
1987 INC_STATS_COUNTER(cnt_free_coherent); 2144 INC_STATS_COUNTER(cnt_free_coherent);
1988 2145
1989 if (!check_device(dev)) 2146 domain = get_domain(dev);
1990 return; 2147 if (IS_ERR(domain))
1991
1992 get_device_resources(dev, &iommu, &domain, &devid);
1993
1994 if (!iommu || !domain)
1995 goto free_mem;
1996
1997 if (!dma_ops_domain(domain))
1998 goto free_mem; 2148 goto free_mem;
1999 2149
2000 spin_lock_irqsave(&domain->lock, flags); 2150 spin_lock_irqsave(&domain->lock, flags);
2001 2151
2002 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2152 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2003 2153
2004 iommu_completion_wait(iommu); 2154 iommu_flush_complete(domain);
2005 2155
2006 spin_unlock_irqrestore(&domain->lock, flags); 2156 spin_unlock_irqrestore(&domain->lock, flags);
2007 2157
@@ -2015,22 +2165,7 @@ free_mem:
2015 */ 2165 */
2016static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2166static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2017{ 2167{
2018 u16 bdf; 2168 return check_device(dev);
2019 struct pci_dev *pcidev;
2020
2021 /* No device or no PCI device */
2022 if (!dev || dev->bus != &pci_bus_type)
2023 return 0;
2024
2025 pcidev = to_pci_dev(dev);
2026
2027 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2028
2029 /* Out of our scope? */
2030 if (bdf > amd_iommu_last_bdf)
2031 return 0;
2032
2033 return 1;
2034} 2169}
2035 2170
2036/* 2171/*
@@ -2044,25 +2179,28 @@ static void prealloc_protection_domains(void)
2044{ 2179{
2045 struct pci_dev *dev = NULL; 2180 struct pci_dev *dev = NULL;
2046 struct dma_ops_domain *dma_dom; 2181 struct dma_ops_domain *dma_dom;
2047 struct amd_iommu *iommu;
2048 u16 devid; 2182 u16 devid;
2049 2183
2050 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2184 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2051 devid = calc_devid(dev->bus->number, dev->devfn); 2185
2052 if (devid > amd_iommu_last_bdf) 2186 /* Do we handle this device? */
2053 continue; 2187 if (!check_device(&dev->dev))
2054 devid = amd_iommu_alias_table[devid];
2055 if (domain_for_device(devid))
2056 continue; 2188 continue;
2057 iommu = amd_iommu_rlookup_table[devid]; 2189
2058 if (!iommu) 2190 /* Is there already any domain for it? */
2191 if (domain_for_device(&dev->dev))
2059 continue; 2192 continue;
2060 dma_dom = dma_ops_domain_alloc(iommu); 2193
2194 devid = get_device_id(&dev->dev);
2195
2196 dma_dom = dma_ops_domain_alloc();
2061 if (!dma_dom) 2197 if (!dma_dom)
2062 continue; 2198 continue;
2063 init_unity_mappings_for_device(dma_dom, devid); 2199 init_unity_mappings_for_device(dma_dom, devid);
2064 dma_dom->target_dev = devid; 2200 dma_dom->target_dev = devid;
2065 2201
2202 attach_device(&dev->dev, &dma_dom->domain);
2203
2066 list_add_tail(&dma_dom->list, &iommu_pd_list); 2204 list_add_tail(&dma_dom->list, &iommu_pd_list);
2067 } 2205 }
2068} 2206}
@@ -2091,7 +2229,7 @@ int __init amd_iommu_init_dma_ops(void)
2091 * protection domain will be assigned to the default one. 2229 * protection domain will be assigned to the default one.
2092 */ 2230 */
2093 for_each_iommu(iommu) { 2231 for_each_iommu(iommu) {
2094 iommu->default_dom = dma_ops_domain_alloc(iommu); 2232 iommu->default_dom = dma_ops_domain_alloc();
2095 if (iommu->default_dom == NULL) 2233 if (iommu->default_dom == NULL)
2096 return -ENOMEM; 2234 return -ENOMEM;
2097 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2235 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2101,15 +2239,12 @@ int __init amd_iommu_init_dma_ops(void)
2101 } 2239 }
2102 2240
2103 /* 2241 /*
2104 * If device isolation is enabled, pre-allocate the protection 2242 * Pre-allocate the protection domains for each device.
2105 * domains for each device.
2106 */ 2243 */
2107 if (amd_iommu_isolate) 2244 prealloc_protection_domains();
2108 prealloc_protection_domains();
2109 2245
2110 iommu_detected = 1; 2246 iommu_detected = 1;
2111 force_iommu = 1; 2247 swiotlb = 0;
2112 bad_dma_address = 0;
2113#ifdef CONFIG_GART_IOMMU 2248#ifdef CONFIG_GART_IOMMU
2114 gart_iommu_aperture_disabled = 1; 2249 gart_iommu_aperture_disabled = 1;
2115 gart_iommu_aperture = 0; 2250 gart_iommu_aperture = 0;
@@ -2120,8 +2255,6 @@ int __init amd_iommu_init_dma_ops(void)
2120 2255
2121 register_iommu(&amd_iommu_ops); 2256 register_iommu(&amd_iommu_ops);
2122 2257
2123 bus_register_notifier(&pci_bus_type, &device_nb);
2124
2125 amd_iommu_stats_init(); 2258 amd_iommu_stats_init();
2126 2259
2127 return 0; 2260 return 0;
@@ -2148,14 +2281,17 @@ free_domains:
2148 2281
2149static void cleanup_domain(struct protection_domain *domain) 2282static void cleanup_domain(struct protection_domain *domain)
2150{ 2283{
2284 struct iommu_dev_data *dev_data, *next;
2151 unsigned long flags; 2285 unsigned long flags;
2152 u16 devid;
2153 2286
2154 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2287 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2155 2288
2156 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2289 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2157 if (amd_iommu_pd_table[devid] == domain) 2290 struct device *dev = dev_data->dev;
2158 __detach_device(domain, devid); 2291
2292 do_detach(dev);
2293 atomic_set(&dev_data->bind, 0);
2294 }
2159 2295
2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2296 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2161} 2297}
@@ -2165,6 +2301,8 @@ static void protection_domain_free(struct protection_domain *domain)
2165 if (!domain) 2301 if (!domain)
2166 return; 2302 return;
2167 2303
2304 del_domain_from_list(domain);
2305
2168 if (domain->id) 2306 if (domain->id)
2169 domain_id_free(domain->id); 2307 domain_id_free(domain->id);
2170 2308
@@ -2183,6 +2321,9 @@ static struct protection_domain *protection_domain_alloc(void)
2183 domain->id = domain_id_alloc(); 2321 domain->id = domain_id_alloc();
2184 if (!domain->id) 2322 if (!domain->id)
2185 goto out_err; 2323 goto out_err;
2324 INIT_LIST_HEAD(&domain->dev_list);
2325
2326 add_domain_to_list(domain);
2186 2327
2187 return domain; 2328 return domain;
2188 2329
@@ -2239,26 +2380,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2239static void amd_iommu_detach_device(struct iommu_domain *dom, 2380static void amd_iommu_detach_device(struct iommu_domain *dom,
2240 struct device *dev) 2381 struct device *dev)
2241{ 2382{
2242 struct protection_domain *domain = dom->priv; 2383 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2243 struct amd_iommu *iommu; 2384 struct amd_iommu *iommu;
2244 struct pci_dev *pdev;
2245 u16 devid; 2385 u16 devid;
2246 2386
2247 if (dev->bus != &pci_bus_type) 2387 if (!check_device(dev))
2248 return; 2388 return;
2249 2389
2250 pdev = to_pci_dev(dev); 2390 devid = get_device_id(dev);
2251 2391
2252 devid = calc_devid(pdev->bus->number, pdev->devfn); 2392 if (dev_data->domain != NULL)
2253 2393 detach_device(dev);
2254 if (devid > 0)
2255 detach_device(domain, devid);
2256 2394
2257 iommu = amd_iommu_rlookup_table[devid]; 2395 iommu = amd_iommu_rlookup_table[devid];
2258 if (!iommu) 2396 if (!iommu)
2259 return; 2397 return;
2260 2398
2261 iommu_queue_inv_dev_entry(iommu, devid); 2399 iommu_flush_device(dev);
2262 iommu_completion_wait(iommu); 2400 iommu_completion_wait(iommu);
2263} 2401}
2264 2402
@@ -2266,35 +2404,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2266 struct device *dev) 2404 struct device *dev)
2267{ 2405{
2268 struct protection_domain *domain = dom->priv; 2406 struct protection_domain *domain = dom->priv;
2269 struct protection_domain *old_domain; 2407 struct iommu_dev_data *dev_data;
2270 struct amd_iommu *iommu; 2408 struct amd_iommu *iommu;
2271 struct pci_dev *pdev; 2409 int ret;
2272 u16 devid; 2410 u16 devid;
2273 2411
2274 if (dev->bus != &pci_bus_type) 2412 if (!check_device(dev))
2275 return -EINVAL; 2413 return -EINVAL;
2276 2414
2277 pdev = to_pci_dev(dev); 2415 dev_data = dev->archdata.iommu;
2278
2279 devid = calc_devid(pdev->bus->number, pdev->devfn);
2280 2416
2281 if (devid >= amd_iommu_last_bdf || 2417 devid = get_device_id(dev);
2282 devid != amd_iommu_alias_table[devid])
2283 return -EINVAL;
2284 2418
2285 iommu = amd_iommu_rlookup_table[devid]; 2419 iommu = amd_iommu_rlookup_table[devid];
2286 if (!iommu) 2420 if (!iommu)
2287 return -EINVAL; 2421 return -EINVAL;
2288 2422
2289 old_domain = domain_for_device(devid); 2423 if (dev_data->domain)
2290 if (old_domain) 2424 detach_device(dev);
2291 detach_device(old_domain, devid);
2292 2425
2293 attach_device(iommu, domain, devid); 2426 ret = attach_device(dev, domain);
2294 2427
2295 iommu_completion_wait(iommu); 2428 iommu_completion_wait(iommu);
2296 2429
2297 return 0; 2430 return ret;
2298} 2431}
2299 2432
2300static int amd_iommu_map_range(struct iommu_domain *dom, 2433static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2340,7 +2473,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2340 iova += PAGE_SIZE; 2473 iova += PAGE_SIZE;
2341 } 2474 }
2342 2475
2343 iommu_flush_domain(domain->id); 2476 iommu_flush_tlb_pde(domain);
2344} 2477}
2345 2478
2346static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2479static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2391,10 +2524,11 @@ static struct iommu_ops amd_iommu_ops = {
2391 2524
2392int __init amd_iommu_init_passthrough(void) 2525int __init amd_iommu_init_passthrough(void)
2393{ 2526{
2527 struct amd_iommu *iommu;
2394 struct pci_dev *dev = NULL; 2528 struct pci_dev *dev = NULL;
2395 u16 devid, devid2; 2529 u16 devid;
2396 2530
2397 /* allocate passthroug domain */ 2531 /* allocate passthrough domain */
2398 pt_domain = protection_domain_alloc(); 2532 pt_domain = protection_domain_alloc();
2399 if (!pt_domain) 2533 if (!pt_domain)
2400 return -ENOMEM; 2534 return -ENOMEM;
@@ -2402,20 +2536,17 @@ int __init amd_iommu_init_passthrough(void)
2402 pt_domain->mode |= PAGE_MODE_NONE; 2536 pt_domain->mode |= PAGE_MODE_NONE;
2403 2537
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2538 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406 2539
2407 devid = calc_devid(dev->bus->number, dev->devfn); 2540 if (!check_device(&dev->dev))
2408 if (devid > amd_iommu_last_bdf)
2409 continue; 2541 continue;
2410 2542
2411 devid2 = amd_iommu_alias_table[devid]; 2543 devid = get_device_id(&dev->dev);
2412 2544
2413 iommu = amd_iommu_rlookup_table[devid2]; 2545 iommu = amd_iommu_rlookup_table[devid];
2414 if (!iommu) 2546 if (!iommu)
2415 continue; 2547 continue;
2416 2548
2417 __attach_device(iommu, pt_domain, devid); 2549 attach_device(&dev->dev, pt_domain);
2418 __attach_device(iommu, pt_domain, devid2);
2419 } 2550 }
2420 2551
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2552 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d462dcc..fb490ce7dd55 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,10 +25,12 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,29 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs*/
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * Set to true if ACPI table parsing and hardware intialization went properly
142 */
143static bool amd_iommu_initialized;
144
145/*
146 * List of protection domains - used during resume
147 */
148LIST_HEAD(amd_iommu_pd_list);
149spinlock_t amd_iommu_pd_lock;
150
138/* 151/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 152 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 153 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +170,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 170struct amd_iommu **amd_iommu_rlookup_table;
158 171
159/* 172/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap 173 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
167 * to know which ones are already in use. 174 * to know which ones are already in use.
168 */ 175 */
@@ -240,7 +247,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
240 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 247 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
241} 248}
242 249
243static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) 250static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244{ 251{
245 u32 ctrl; 252 u32 ctrl;
246 253
@@ -519,6 +526,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
519 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 526 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
520} 527}
521 528
529static int get_dev_entry_bit(u16 devid, u8 bit)
530{
531 int i = (bit >> 5) & 0x07;
532 int _bit = bit & 0x1f;
533
534 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
535}
536
537
538void amd_iommu_apply_erratum_63(u16 devid)
539{
540 int sysmgt;
541
542 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
543 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
544
545 if (sysmgt == 0x01)
546 set_dev_entry_bit(devid, DEV_ENTRY_IW);
547}
548
522/* Writes the specific IOMMU for a device into the rlookup table */ 549/* Writes the specific IOMMU for a device into the rlookup table */
523static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 550static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
524{ 551{
@@ -547,6 +574,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
547 if (flags & ACPI_DEVFLAG_LINT1) 574 if (flags & ACPI_DEVFLAG_LINT1)
548 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 575 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
549 576
577 amd_iommu_apply_erratum_63(devid);
578
550 set_iommu_for_device(iommu, devid); 579 set_iommu_for_device(iommu, devid);
551} 580}
552 581
@@ -816,7 +845,18 @@ static void __init free_iommu_all(void)
816static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 845static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
817{ 846{
818 spin_lock_init(&iommu->lock); 847 spin_lock_init(&iommu->lock);
848
849 /* Add IOMMU to internal data structures */
819 list_add_tail(&iommu->list, &amd_iommu_list); 850 list_add_tail(&iommu->list, &amd_iommu_list);
851 iommu->index = amd_iommus_present++;
852
853 if (unlikely(iommu->index >= MAX_IOMMUS)) {
854 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
855 return -ENOSYS;
856 }
857
858 /* Index is fine - add IOMMU to the array */
859 amd_iommus[iommu->index] = iommu;
820 860
821 /* 861 /*
822 * Copy data from ACPI table entry to the iommu struct 862 * Copy data from ACPI table entry to the iommu struct
@@ -846,6 +886,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
846 init_iommu_from_acpi(iommu, h); 886 init_iommu_from_acpi(iommu, h);
847 init_iommu_devices(iommu); 887 init_iommu_devices(iommu);
848 888
889 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
890 amd_iommu_np_cache = true;
891
849 return pci_enable_device(iommu->dev); 892 return pci_enable_device(iommu->dev);
850} 893}
851 894
@@ -891,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table)
891 } 934 }
892 WARN_ON(p != end); 935 WARN_ON(p != end);
893 936
937 amd_iommu_initialized = true;
938
894 return 0; 939 return 0;
895} 940}
896 941
@@ -903,7 +948,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
903 * 948 *
904 ****************************************************************************/ 949 ****************************************************************************/
905 950
906static int __init iommu_setup_msi(struct amd_iommu *iommu) 951static int iommu_setup_msi(struct amd_iommu *iommu)
907{ 952{
908 int r; 953 int r;
909 954
@@ -1154,19 +1199,10 @@ static struct sys_device device_amd_iommu = {
1154 * functions. Finally it prints some information about AMD IOMMUs and 1199 * functions. Finally it prints some information about AMD IOMMUs and
1155 * the driver state and enables the hardware. 1200 * the driver state and enables the hardware.
1156 */ 1201 */
1157int __init amd_iommu_init(void) 1202static int __init amd_iommu_init(void)
1158{ 1203{
1159 int i, ret = 0; 1204 int i, ret = 0;
1160 1205
1161
1162 if (no_iommu) {
1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1164 return 0;
1165 }
1166
1167 if (!amd_iommu_detected)
1168 return -ENODEV;
1169
1170 /* 1206 /*
1171 * First parse ACPI tables to find the largest Bus/Dev/Func 1207 * First parse ACPI tables to find the largest Bus/Dev/Func
1172 * we need to handle. Upon this information the shared data 1208 * we need to handle. Upon this information the shared data
@@ -1203,15 +1239,6 @@ int __init amd_iommu_init(void)
1203 if (amd_iommu_rlookup_table == NULL) 1239 if (amd_iommu_rlookup_table == NULL)
1204 goto free; 1240 goto free;
1205 1241
1206 /*
1207 * Protection Domain table - maps devices to protection domains
1208 * This table has the same size as the rlookup_table
1209 */
1210 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1211 get_order(rlookup_table_size));
1212 if (amd_iommu_pd_table == NULL)
1213 goto free;
1214
1215 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1242 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1216 GFP_KERNEL | __GFP_ZERO, 1243 GFP_KERNEL | __GFP_ZERO,
1217 get_order(MAX_DOMAIN_ID/8)); 1244 get_order(MAX_DOMAIN_ID/8));
@@ -1233,6 +1260,8 @@ int __init amd_iommu_init(void)
1233 */ 1260 */
1234 amd_iommu_pd_alloc_bitmap[0] = 1; 1261 amd_iommu_pd_alloc_bitmap[0] = 1;
1235 1262
1263 spin_lock_init(&amd_iommu_pd_lock);
1264
1236 /* 1265 /*
1237 * now the data structures are allocated and basically initialized 1266 * now the data structures are allocated and basically initialized
1238 * start the real acpi table scan 1267 * start the real acpi table scan
@@ -1241,6 +1270,9 @@ int __init amd_iommu_init(void)
1241 if (acpi_table_parse("IVRS", init_iommu_all) != 0) 1270 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1242 goto free; 1271 goto free;
1243 1272
1273 if (!amd_iommu_initialized)
1274 goto free;
1275
1244 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1276 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1245 goto free; 1277 goto free;
1246 1278
@@ -1252,6 +1284,10 @@ int __init amd_iommu_init(void)
1252 if (ret) 1284 if (ret)
1253 goto free; 1285 goto free;
1254 1286
1287 ret = amd_iommu_init_devices();
1288 if (ret)
1289 goto free;
1290
1255 if (iommu_pass_through) 1291 if (iommu_pass_through)
1256 ret = amd_iommu_init_passthrough(); 1292 ret = amd_iommu_init_passthrough();
1257 else 1293 else
@@ -1259,32 +1295,29 @@ int __init amd_iommu_init(void)
1259 if (ret) 1295 if (ret)
1260 goto free; 1296 goto free;
1261 1297
1298 amd_iommu_init_notifier();
1299
1262 enable_iommus(); 1300 enable_iommus();
1263 1301
1264 if (iommu_pass_through) 1302 if (iommu_pass_through)
1265 goto out; 1303 goto out;
1266 1304
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1268 if (amd_iommu_isolate)
1269 printk("enabled\n");
1270 else
1271 printk("disabled\n");
1272
1273 if (amd_iommu_unmap_flush) 1305 if (amd_iommu_unmap_flush)
1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1306 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1275 else 1307 else
1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1308 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1277 1309
1310 x86_platform.iommu_shutdown = disable_iommus;
1278out: 1311out:
1279 return ret; 1312 return ret;
1280 1313
1281free: 1314free:
1315
1316 amd_iommu_uninit_devices();
1317
1282 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1318 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1283 get_order(MAX_DOMAIN_ID/8)); 1319 get_order(MAX_DOMAIN_ID/8));
1284 1320
1285 free_pages((unsigned long)amd_iommu_pd_table,
1286 get_order(rlookup_table_size));
1287
1288 free_pages((unsigned long)amd_iommu_rlookup_table, 1321 free_pages((unsigned long)amd_iommu_rlookup_table,
1289 get_order(rlookup_table_size)); 1322 get_order(rlookup_table_size));
1290 1323
@@ -1301,11 +1334,6 @@ free:
1301 goto out; 1334 goto out;
1302} 1335}
1303 1336
1304void amd_iommu_shutdown(void)
1305{
1306 disable_iommus();
1307}
1308
1309/**************************************************************************** 1337/****************************************************************************
1310 * 1338 *
1311 * Early detect code. This code runs at IOMMU detection time in the DMA 1339 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1320,16 +1348,16 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1320 1348
1321void __init amd_iommu_detect(void) 1349void __init amd_iommu_detect(void)
1322{ 1350{
1323 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1351 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1324 return; 1352 return;
1325 1353
1326 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1354 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1327 iommu_detected = 1; 1355 iommu_detected = 1;
1328 amd_iommu_detected = 1; 1356 amd_iommu_detected = 1;
1329#ifdef CONFIG_GART_IOMMU 1357 x86_init.iommu.iommu_init = amd_iommu_init;
1330 gart_iommu_aperture_disabled = 1; 1358
1331 gart_iommu_aperture = 0; 1359 /* Make sure ACS will be enabled */
1332#endif 1360 pci_request_acs();
1333 } 1361 }
1334} 1362}
1335 1363
@@ -1350,10 +1378,6 @@ static int __init parse_amd_iommu_dump(char *str)
1350static int __init parse_amd_iommu_options(char *str) 1378static int __init parse_amd_iommu_options(char *str)
1351{ 1379{
1352 for (; *str; ++str) { 1380 for (; *str; ++str) {
1353 if (strncmp(str, "isolate", 7) == 0)
1354 amd_iommu_isolate = true;
1355 if (strncmp(str, "share", 5) == 0)
1356 amd_iommu_isolate = false;
1357 if (strncmp(str, "fullflush", 9) == 0) 1381 if (strncmp(str, "fullflush", 9) == 0)
1358 amd_iommu_unmap_flush = true; 1382 amd_iommu_unmap_flush = true;
1359 } 1383 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..3704997e8b25 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
33int gart_iommu_aperture_disabled __initdata; 34int gart_iommu_aperture_disabled __initdata;
@@ -279,7 +280,8 @@ void __init early_gart_iommu_check(void)
279 * or BIOS forget to put that in reserved. 280 * or BIOS forget to put that in reserved.
280 * try to update e820 to make that region as reserved. 281 * try to update e820 to make that region as reserved.
281 */ 282 */
282 int i, fix, slot; 283 u32 agp_aper_base = 0, agp_aper_order = 0;
284 int i, fix, slot, valid_agp = 0;
283 u32 ctl; 285 u32 ctl;
284 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
285 u64 aper_base = 0, last_aper_base = 0; 287 u64 aper_base = 0, last_aper_base = 0;
@@ -289,6 +291,8 @@ void __init early_gart_iommu_check(void)
289 return; 291 return;
290 292
291 /* This is mostly duplicate of iommu_hole_init */ 293 /* This is mostly duplicate of iommu_hole_init */
294 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
295
292 fix = 0; 296 fix = 0;
293 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
294 int bus; 298 int bus;
@@ -341,10 +345,10 @@ void __init early_gart_iommu_check(void)
341 } 345 }
342 } 346 }
343 347
344 if (!fix) 348 if (valid_agp)
345 return; 349 return;
346 350
347 /* different nodes have different setting, disable them all at first*/ 351 /* disable them all at first */
348 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
349 int bus; 353 int bus;
350 int dev_base, dev_limit; 354 int dev_base, dev_limit;
@@ -400,6 +404,7 @@ void __init gart_iommu_hole_init(void)
400 404
401 iommu_detected = 1; 405 iommu_detected = 1;
402 gart_iommu_aperture = 1; 406 gart_iommu_aperture = 1;
407 x86_init.iommu.iommu_init = gart_iommu_init;
403 408
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 409 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 410 aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,8 +461,6 @@ out:
456 461
457 if (aper_alloc) { 462 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 463 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) {
460 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 464 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 465 force_iommu ||
463 valid_agp || 466 valid_agp ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..aa57c079c98f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -241,28 +241,13 @@ static int modern_apic(void)
241} 241}
242 242
243/* 243/*
244 * bare function to substitute write operation 244 * right after this call apic become NOOP driven
245 * and it's _that_ fast :) 245 * so apic->write/read doesn't do anything
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 246 */
262void apic_disable(void) 247void apic_disable(void)
263{ 248{
264 apic->read = native_apic_read_dummy; 249 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 250 apic = &apic_noop;
266} 251}
267 252
268void native_apic_wait_icr_idle(void) 253void native_apic_wait_icr_idle(void)
@@ -459,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 444 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 445 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 446 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 447 apic_write(APIC_TMICT, 0);
463 break; 448 break;
464 case CLOCK_EVT_MODE_RESUME: 449 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 450 /* Nothing to do here */
@@ -662,7 +647,7 @@ static int __init calibrate_APIC_clock(void)
662 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 647 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
663 648
664 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 649 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
665 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 650 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
666 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 651 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
667 calibration_result); 652 calibration_result);
668 653
@@ -1356,7 +1341,7 @@ void enable_x2apic(void)
1356 1341
1357 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1342 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1358 if (!(msr & X2APIC_ENABLE)) { 1343 if (!(msr & X2APIC_ENABLE)) {
1359 pr_info("Enabling x2apic\n"); 1344 printk_once(KERN_INFO "Enabling x2apic\n");
1360 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1345 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1361 } 1346 }
1362} 1347}
@@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void)
1392 unsigned long flags; 1377 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL; 1378 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0; 1379 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0; 1380 int dmar_table_init_ret;
1396 1381
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init(); 1382 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret) 1383 if (dmar_table_init_ret && !x2apic_supported())
1400 pr_debug("dmar_table_init() failed with %d:\n", 1384 return;
1401 dmar_table_init_ret);
1402#endif
1403 1385
1404 ioapic_entries = alloc_ioapic_entries(); 1386 ioapic_entries = alloc_ioapic_entries();
1405 if (!ioapic_entries) { 1387 if (!ioapic_entries) {
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..eacbd2b31d27 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -306,10 +306,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
306 if (cpumask_test_cpu(cpu, cpu_online_mask)) 306 if (cpumask_test_cpu(cpu, cpu_online_mask))
307 break; 307 break;
308 } 308 }
309 if (cpu < nr_cpu_ids) 309 return per_cpu(x86_cpu_to_apicid, cpu);
310 return per_cpu(x86_cpu_to_apicid, cpu);
311
312 return BAD_APICID;
313} 310}
314 311
315struct apic apic_physflat = { 312struct apic apic_physflat = {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..e31b9ffe25f5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver via
5 * probe routine.
6 *
7 * Though in case if apic is disabled (for some reason) we try
8 * to not uglify the caller's code and allow to call (some) apic routines
9 * like self-ipi, etc...
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * NOOP apic should not ever be
76 * enabled via probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we would be really "pedantic"
85 * we should pass read_apic_id() here
86 * but since NOOP suppose APIC ID = 0
87 * lets save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..cb804c5091b9 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
35#endif 35#endif
36} 36}
37 37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 39{
40 return 0; 40 return 0;
41} 41}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 93 return BAD_APICID;
94} 94}
95 95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */ 96/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu) 97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{ 98{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106 return cpu_physical_id(cpu); 101 return cpu_physical_id(cpu);
107} 102}
108 103
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
110{ 105{
111 /* For clustered we don't have a good way to do this yet - hack */ 106 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL); 107 physids_promote(0xFFL, retmap);
113} 108}
114 109
115static int bigsmp_check_phys_apicid_present(int phys_apicid) 110static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -136,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
136 if (cpumask_test_cpu(cpu, cpu_online_mask)) 131 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break; 132 break;
138 } 133 }
139 if (cpu < nr_cpu_ids) 134 return bigsmp_cpu_to_logical_apicid(cpu);
140 return bigsmp_cpu_to_logical_apicid(cpu);
141
142 return BAD_APICID;
143} 135}
144 136
145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -230,7 +222,7 @@ struct apic apic_bigsmp = {
230 .apicid_to_node = bigsmp_apicid_to_node, 222 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, 223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, 225 .apicid_to_cpu_present = physid_set_mask_of_physid,
234 .setup_portio_remap = NULL, 226 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present, 227 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL, 228 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..dd2b5f264643 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
27 * 27 *
28 * http://www.unisys.com 28 * http://www.unisys.com
29 */ 29 */
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
30#include <linux/notifier.h> 33#include <linux/notifier.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/cpumask.h> 35#include <linux/cpumask.h>
@@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr)
223 mip_addr = val; 226 mip_addr = val;
224 mip = (struct mip_reg *)val; 227 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip); 228 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", 229 pr_debug("host_reg = 0x%lx\n",
227 (unsigned long)host_reg); 230 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", 231 pr_debug("mip_reg = 0x%lx\n",
229 (unsigned long)mip_reg); 232 (unsigned long)mip_reg);
230 success++; 233 success++;
231 break; 234 break;
@@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void)
401 if (!es7000_plat) 404 if (!es7000_plat)
402 return; 405 return;
403 406
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); 407 pr_info("Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); 408 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC; 409 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID; 410 es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -466,11 +469,11 @@ static const struct cpumask *es7000_target_cpus(void)
466 return cpumask_of(smp_processor_id()); 469 return cpumask_of(smp_processor_id());
467} 470}
468 471
469static unsigned long 472static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{ 473{
472 return 0; 474 return 0;
473} 475}
476
474static unsigned long es7000_check_apicid_present(int bit) 477static unsigned long es7000_check_apicid_present(int bit)
475{ 478{
476 return physid_isset(bit, phys_cpu_present_map); 479 return physid_isset(bit, phys_cpu_present_map);
@@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void)
514{ 517{
515 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 518 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
516 519
517 printk(KERN_INFO 520 pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
519 (apic_version[apic] == 0x14) ? 521 (apic_version[apic] == 0x14) ?
520 "Physical Cluster" : "Logical Cluster", 522 "Physical Cluster" : "Logical Cluster",
521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 523 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
@@ -539,14 +541,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
539 541
540static int cpu_id; 542static int cpu_id;
541 543
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) 544static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
543{ 545{
544 physid_mask_t mask; 546 physid_set_mask_of_physid(cpu_id, retmap);
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id; 547 ++cpu_id;
548
549 return mask;
550} 548}
551 549
552/* Mapping from cpu number to logical apicid */ 550/* Mapping from cpu number to logical apicid */
@@ -561,10 +559,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
561#endif 559#endif
562} 560}
563 561
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) 562static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
565{ 563{
566 /* For clustered we don't have a good way to do this yet - hack */ 564 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff); 565 physids_promote(0xFFL, retmap);
568} 566}
569 567
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid) 568static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..de00c4619a55 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h> 62#include <asm/hw_irq.h>
63#include <asm/uv/uv_hub.h>
64#include <asm/uv/uv_irq.h>
65 63
66#include <asm/apic.h> 64#include <asm/apic.h>
67 65
@@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 return pin; 138 return pin;
141} 139}
142 140
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
148struct irq_cfg {
149 struct irq_pin_list *irq_2_pin;
150 cpumask_var_t domain;
151 cpumask_var_t old_domain;
152 unsigned move_cleanup_count;
153 u8 vector;
154 u8 move_in_progress : 1;
155};
156
157/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
158#ifdef CONFIG_SPARSE_IRQ 142#ifdef CONFIG_SPARSE_IRQ
159static struct irq_cfg irq_cfgx[] = { 143static struct irq_cfg irq_cfgx[] = {
@@ -209,7 +193,7 @@ int __init arch_early_irq_init(void)
209} 193}
210 194
211#ifdef CONFIG_SPARSE_IRQ 195#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 196struct irq_cfg *irq_cfg(unsigned int irq)
213{ 197{
214 struct irq_cfg *cfg = NULL; 198 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 199 struct irq_desc *desc;
@@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 345/* end for move_irq_desc */
362 346
363#else 347#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 348struct irq_cfg *irq_cfg(unsigned int irq)
365{ 349{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 350 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 351}
@@ -555,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
555 add_pin_to_irq_node(cfg, node, newapic, newpin); 539 add_pin_to_irq_node(cfg, node, newapic, newpin);
556} 540}
557 541
542static void __io_apic_modify_irq(struct irq_pin_list *entry,
543 int mask_and, int mask_or,
544 void (*final)(struct irq_pin_list *entry))
545{
546 unsigned int reg, pin;
547
548 pin = entry->pin;
549 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
550 reg &= mask_and;
551 reg |= mask_or;
552 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
553 if (final)
554 final(entry);
555}
556
558static void io_apic_modify_irq(struct irq_cfg *cfg, 557static void io_apic_modify_irq(struct irq_cfg *cfg,
559 int mask_and, int mask_or, 558 int mask_and, int mask_or,
560 void (*final)(struct irq_pin_list *entry)) 559 void (*final)(struct irq_pin_list *entry))
561{ 560{
562 int pin;
563 struct irq_pin_list *entry; 561 struct irq_pin_list *entry;
564 562
565 for_each_irq_pin(entry, cfg->irq_2_pin) { 563 for_each_irq_pin(entry, cfg->irq_2_pin)
566 unsigned int reg; 564 __io_apic_modify_irq(entry, mask_and, mask_or, final);
567 pin = entry->pin; 565}
568 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 566
569 reg &= mask_and; 567static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
570 reg |= mask_or; 568{
571 io_apic_modify(entry->apic, 0x10 + pin * 2, reg); 569 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
572 if (final) 570 IO_APIC_REDIR_MASKED, NULL);
573 final(entry); 571}
574 } 572
573static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
574{
575 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
576 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
575} 577}
576 578
577static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) 579static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 597 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
596} 598}
597 599
598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
599{
600 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
601 IO_APIC_REDIR_MASKED, NULL);
602}
603
604static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
605{
606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
608}
609
610static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 600static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
611{ 601{
612 struct irq_cfg *cfg = desc->chip_data; 602 struct irq_cfg *cfg = desc->chip_data;
@@ -1177,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1177 int cpu, err; 1167 int cpu, err;
1178 cpumask_var_t tmp_mask; 1168 cpumask_var_t tmp_mask;
1179 1169
1180 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1170 if (cfg->move_in_progress)
1181 return -EBUSY; 1171 return -EBUSY;
1182 1172
1183 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1173 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1237,8 +1227,7 @@ next:
1237 return err; 1227 return err;
1238} 1228}
1239 1229
1240static int 1230int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1241assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1242{ 1231{
1243 int err; 1232 int err;
1244 unsigned long flags; 1233 unsigned long flags;
@@ -1599,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1599 struct irq_desc *desc; 1588 struct irq_desc *desc;
1600 unsigned int irq; 1589 unsigned int irq;
1601 1590
1602 if (apic_verbosity == APIC_QUIET)
1603 return;
1604
1605 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1591 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1606 for (i = 0; i < nr_ioapics; i++) 1592 for (i = 0; i < nr_ioapics; i++)
1607 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1593 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1708,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base)
1708{ 1694{
1709 int i; 1695 int i;
1710 1696
1711 if (apic_verbosity == APIC_QUIET)
1712 return;
1713
1714 printk(KERN_DEBUG); 1697 printk(KERN_DEBUG);
1715 1698
1716 for (i = 0; i < 8; i++) 1699 for (i = 0; i < 8; i++)
@@ -1724,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1724 unsigned int i, v, ver, maxlvt; 1707 unsigned int i, v, ver, maxlvt;
1725 u64 icr; 1708 u64 icr;
1726 1709
1727 if (apic_verbosity == APIC_QUIET)
1728 return;
1729
1730 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1710 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1731 smp_processor_id(), hard_smp_processor_id()); 1711 smp_processor_id(), hard_smp_processor_id());
1732 v = apic_read(APIC_ID); 1712 v = apic_read(APIC_ID);
@@ -1824,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1824 printk("\n"); 1804 printk("\n");
1825} 1805}
1826 1806
1827__apicdebuginit(void) print_all_local_APICs(void) 1807__apicdebuginit(void) print_local_APICs(int maxcpu)
1828{ 1808{
1829 int cpu; 1809 int cpu;
1830 1810
1811 if (!maxcpu)
1812 return;
1813
1831 preempt_disable(); 1814 preempt_disable();
1832 for_each_online_cpu(cpu) 1815 for_each_online_cpu(cpu) {
1816 if (cpu >= maxcpu)
1817 break;
1833 smp_call_function_single(cpu, print_local_APIC, NULL, 1); 1818 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1819 }
1834 preempt_enable(); 1820 preempt_enable();
1835} 1821}
1836 1822
@@ -1839,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void)
1839 unsigned int v; 1825 unsigned int v;
1840 unsigned long flags; 1826 unsigned long flags;
1841 1827
1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) 1828 if (!nr_legacy_irqs)
1843 return; 1829 return;
1844 1830
1845 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1831 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1866,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void)
1866 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1852 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1867} 1853}
1868 1854
1869__apicdebuginit(int) print_all_ICs(void) 1855static int __initdata show_lapic = 1;
1856static __init int setup_show_lapic(char *arg)
1857{
1858 int num = -1;
1859
1860 if (strcmp(arg, "all") == 0) {
1861 show_lapic = CONFIG_NR_CPUS;
1862 } else {
1863 get_option(&arg, &num);
1864 if (num >= 0)
1865 show_lapic = num;
1866 }
1867
1868 return 1;
1869}
1870__setup("show_lapic=", setup_show_lapic);
1871
1872__apicdebuginit(int) print_ICs(void)
1870{ 1873{
1874 if (apic_verbosity == APIC_QUIET)
1875 return 0;
1876
1871 print_PIC(); 1877 print_PIC();
1872 1878
1873 /* don't print out if apic is not there */ 1879 /* don't print out if apic is not there */
1874 if (!cpu_has_apic && !apic_from_smp_config()) 1880 if (!cpu_has_apic && !apic_from_smp_config())
1875 return 0; 1881 return 0;
1876 1882
1877 print_all_local_APICs(); 1883 print_local_APICs(show_lapic);
1878 print_IO_APIC(); 1884 print_IO_APIC();
1879 1885
1880 return 0; 1886 return 0;
1881} 1887}
1882 1888
1883fs_initcall(print_all_ICs); 1889fs_initcall(print_ICs);
1884 1890
1885 1891
1886/* Where if anywhere is the i8259 connect in external int mode */ 1892/* Where if anywhere is the i8259 connect in external int mode */
@@ -2031,7 +2037,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2031 * This is broken; anything with a real cpu count has to 2037 * This is broken; anything with a real cpu count has to
2032 * circumvent this idiocy regardless. 2038 * circumvent this idiocy regardless.
2033 */ 2039 */
2034 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 2040 apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
2035 2041
2036 /* 2042 /*
2037 * Set the IOAPIC ID to the value stored in the MPC table. 2043 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2058,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2058 * system must have a unique ID or we get lots of nice 2064 * system must have a unique ID or we get lots of nice
2059 * 'stuck on smp_invalidate_needed IPI wait' messages. 2065 * 'stuck on smp_invalidate_needed IPI wait' messages.
2060 */ 2066 */
2061 if (apic->check_apicid_used(phys_id_present_map, 2067 if (apic->check_apicid_used(&phys_id_present_map,
2062 mp_ioapics[apic_id].apicid)) { 2068 mp_ioapics[apic_id].apicid)) {
2063 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2069 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2064 apic_id, mp_ioapics[apic_id].apicid); 2070 apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2079,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2073 mp_ioapics[apic_id].apicid = i; 2079 mp_ioapics[apic_id].apicid = i;
2074 } else { 2080 } else {
2075 physid_mask_t tmp; 2081 physid_mask_t tmp;
2076 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); 2082 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
2077 apic_printk(APIC_VERBOSE, "Setting %d in the " 2083 apic_printk(APIC_VERBOSE, "Setting %d in the "
2078 "phys_id_present_map\n", 2084 "phys_id_present_map\n",
2079 mp_ioapics[apic_id].apicid); 2085 mp_ioapics[apic_id].apicid);
@@ -2228,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2234 */
2229 2235
2230#ifdef CONFIG_SMP 2236#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2237void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2238{
2233 cpumask_var_t cleanup_mask; 2239 cpumask_var_t cleanup_mask;
2234 2240
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2241 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2242 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2243 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2244 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2245 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2246 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2247 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2248 free_cpumask_var(cleanup_mask);
2247 } 2249 }
@@ -2272,31 +2274,30 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2272 } 2274 }
2273} 2275}
2274 2276
2275static int
2276assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2277
2278/* 2277/*
2279 * Either sets desc->affinity to a valid value, and returns 2278 * Either sets desc->affinity to a valid value, and returns
2280 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and 2279 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2281 * leaves desc->affinity untouched. 2280 * leaves desc->affinity untouched.
2282 */ 2281 */
2283static unsigned int 2282unsigned int
2284set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 2283set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
2284 unsigned int *dest_id)
2285{ 2285{
2286 struct irq_cfg *cfg; 2286 struct irq_cfg *cfg;
2287 unsigned int irq; 2287 unsigned int irq;
2288 2288
2289 if (!cpumask_intersects(mask, cpu_online_mask)) 2289 if (!cpumask_intersects(mask, cpu_online_mask))
2290 return BAD_APICID; 2290 return -1;
2291 2291
2292 irq = desc->irq; 2292 irq = desc->irq;
2293 cfg = desc->chip_data; 2293 cfg = desc->chip_data;
2294 if (assign_irq_vector(irq, cfg, mask)) 2294 if (assign_irq_vector(irq, cfg, mask))
2295 return BAD_APICID; 2295 return -1;
2296 2296
2297 cpumask_copy(desc->affinity, mask); 2297 cpumask_copy(desc->affinity, mask);
2298 2298
2299 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2299 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2300 return 0;
2300} 2301}
2301 2302
2302static int 2303static int
@@ -2312,12 +2313,11 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2312 cfg = desc->chip_data; 2313 cfg = desc->chip_data;
2313 2314
2314 spin_lock_irqsave(&ioapic_lock, flags); 2315 spin_lock_irqsave(&ioapic_lock, flags);
2315 dest = set_desc_affinity(desc, mask); 2316 ret = set_desc_affinity(desc, mask, &dest);
2316 if (dest != BAD_APICID) { 2317 if (!ret) {
2317 /* Only the high 8 bits are valid. */ 2318 /* Only the high 8 bits are valid. */
2318 dest = SET_APIC_LOGICAL_ID(dest); 2319 dest = SET_APIC_LOGICAL_ID(dest);
2319 __target_IO_APIC_irq(irq, dest, cfg); 2320 __target_IO_APIC_irq(irq, dest, cfg);
2320 ret = 0;
2321 } 2321 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2322 spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2323
@@ -2432,9 +2432,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2432 continue; 2432 continue;
2433 2433
2434 cfg = irq_cfg(irq); 2434 cfg = irq_cfg(irq);
2435 spin_lock(&desc->lock); 2435 raw_spin_lock(&desc->lock);
2436 if (!cfg->move_cleanup_count)
2437 goto unlock;
2438 2436
2439 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2437 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2440 goto unlock; 2438 goto unlock;
@@ -2452,29 +2450,40 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2452 goto unlock; 2450 goto unlock;
2453 } 2451 }
2454 __get_cpu_var(vector_irq)[vector] = -1; 2452 __get_cpu_var(vector_irq)[vector] = -1;
2455 cfg->move_cleanup_count--;
2456unlock: 2453unlock:
2457 spin_unlock(&desc->lock); 2454 raw_spin_unlock(&desc->lock);
2458 } 2455 }
2459 2456
2460 irq_exit(); 2457 irq_exit();
2461} 2458}
2462 2459
2463static void irq_complete_move(struct irq_desc **descp) 2460static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2464{ 2461{
2465 struct irq_desc *desc = *descp; 2462 struct irq_desc *desc = *descp;
2466 struct irq_cfg *cfg = desc->chip_data; 2463 struct irq_cfg *cfg = desc->chip_data;
2467 unsigned vector, me; 2464 unsigned me;
2468 2465
2469 if (likely(!cfg->move_in_progress)) 2466 if (likely(!cfg->move_in_progress))
2470 return; 2467 return;
2471 2468
2472 vector = ~get_irq_regs()->orig_ax;
2473 me = smp_processor_id(); 2469 me = smp_processor_id();
2474 2470
2475 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2471 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2476 send_cleanup_vector(cfg); 2472 send_cleanup_vector(cfg);
2477} 2473}
2474
2475static void irq_complete_move(struct irq_desc **descp)
2476{
2477 __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
2478}
2479
2480void irq_force_complete_move(int irq)
2481{
2482 struct irq_desc *desc = irq_to_desc(irq);
2483 struct irq_cfg *cfg = desc->chip_data;
2484
2485 __irq_complete_move(&desc, cfg->vector);
2486}
2478#else 2487#else
2479static inline void irq_complete_move(struct irq_desc **descp) {} 2488static inline void irq_complete_move(struct irq_desc **descp) {}
2480#endif 2489#endif
@@ -2490,6 +2499,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2499
2491atomic_t irq_mis_count; 2500atomic_t irq_mis_count;
2492 2501
2502/*
2503 * IO-APIC versions below 0x20 don't support EOI register.
2504 * For the record, here is the information about various versions:
2505 * 0Xh 82489DX
2506 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2507 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2508 * 30h-FFh Reserved
2509 *
2510 * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
2511 * version as 0x2. This is an error with documentation and these ICH chips
2512 * use io-apic's of version 0x20.
2513 *
2514 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2515 * Otherwise, we simulate the EOI message manually by changing the trigger
2516 * mode to edge and then back to level, with RTE being masked during this.
2517*/
2518static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2519{
2520 struct irq_pin_list *entry;
2521
2522 for_each_irq_pin(entry, cfg->irq_2_pin) {
2523 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2524 /*
2525 * Intr-remapping uses pin number as the virtual vector
2526 * in the RTE. Actual vector is programmed in
2527 * intr-remapping table entry. Hence for the io-apic
2528 * EOI we use the pin number.
2529 */
2530 if (irq_remapped(irq))
2531 io_apic_eoi(entry->apic, entry->pin);
2532 else
2533 io_apic_eoi(entry->apic, cfg->vector);
2534 } else {
2535 __mask_and_edge_IO_APIC_irq(entry);
2536 __unmask_and_level_IO_APIC_irq(entry);
2537 }
2538 }
2539}
2540
2541static void eoi_ioapic_irq(struct irq_desc *desc)
2542{
2543 struct irq_cfg *cfg;
2544 unsigned long flags;
2545 unsigned int irq;
2546
2547 irq = desc->irq;
2548 cfg = desc->chip_data;
2549
2550 spin_lock_irqsave(&ioapic_lock, flags);
2551 __eoi_ioapic_irq(irq, cfg);
2552 spin_unlock_irqrestore(&ioapic_lock, flags);
2553}
2554
2493static void ack_apic_level(unsigned int irq) 2555static void ack_apic_level(unsigned int irq)
2494{ 2556{
2495 struct irq_desc *desc = irq_to_desc(irq); 2557 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2587,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2587 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2588 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2589 * The idea is from Manfred Spraul. --macro
2590 *
2591 * Also in the case when cpu goes offline, fixup_irqs() will forward
2592 * any unhandled interrupt on the offlined cpu to the new cpu
2593 * destination that is handling the corresponding interrupt. This
2594 * interrupt forwarding is done via IPI's. Hence, in this case also
2595 * level-triggered io-apic interrupt will be seen as an edge
2596 * interrupt in the IRR. And we can't rely on the cpu's EOI
2597 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
2598 * corresponding to the level-triggered interrupt. Hence on IO-APIC's
2599 * supporting EOI register, we do an explicit EOI to clear the
2600 * remote IRR and on IO-APIC's which don't have an EOI register,
2601 * we use the above logic (mask+edge followed by unmask+level) from
2602 * Manfred Spraul to clear the remote IRR.
2528 */ 2603 */
2529 cfg = desc->chip_data; 2604 cfg = desc->chip_data;
2530 i = cfg->vector; 2605 i = cfg->vector;
@@ -2536,6 +2611,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2611 */
2537 ack_APIC_irq(); 2612 ack_APIC_irq();
2538 2613
2614 /*
2615 * Tail end of clearing remote IRR bit (either by delivering the EOI
2616 * message via io-apic EOI register write or simulating it using
2617 * mask+edge followed by unnask+level logic) manually when the
2618 * level triggered interrupt is seen as the edge triggered interrupt
2619 * at the cpu.
2620 */
2621 if (!(v & (1 << (i & 0x1f)))) {
2622 atomic_inc(&irq_mis_count);
2623
2624 eoi_ioapic_irq(desc);
2625 }
2626
2539 /* Now we can move and renable the irq */ 2627 /* Now we can move and renable the irq */
2540 if (unlikely(do_unmask_irq)) { 2628 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2629 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2657,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2657 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2658 unmask_IO_APIC_irq_desc(desc);
2571 } 2659 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2660}
2582 2661
2583#ifdef CONFIG_INTR_REMAP 2662#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2663static void ir_ack_apic_edge(unsigned int irq)
2608{ 2664{
2609 ack_APIC_irq(); 2665 ack_APIC_irq();
@@ -3157,6 +3213,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3213 continue;
3158 3214
3159 desc_new = move_irq_desc(desc_new, node); 3215 desc_new = move_irq_desc(desc_new, node);
3216 cfg_new = desc_new->chip_data;
3160 3217
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3218 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3219 irq = new;
@@ -3211,7 +3268,8 @@ void destroy_irq(unsigned int irq)
3211 * MSI message composition 3268 * MSI message composition
3212 */ 3269 */
3213#ifdef CONFIG_PCI_MSI 3270#ifdef CONFIG_PCI_MSI
3214static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3271static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3272 struct msi_msg *msg, u8 hpet_id)
3215{ 3273{
3216 struct irq_cfg *cfg; 3274 struct irq_cfg *cfg;
3217 int err; 3275 int err;
@@ -3245,7 +3303,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3245 irte.dest_id = IRTE_DEST(dest); 3303 irte.dest_id = IRTE_DEST(dest);
3246 3304
3247 /* Set source-id of interrupt request */ 3305 /* Set source-id of interrupt request */
3248 set_msi_sid(&irte, pdev); 3306 if (pdev)
3307 set_msi_sid(&irte, pdev);
3308 else
3309 set_hpet_sid(&irte, hpet_id);
3249 3310
3250 modify_irte(irq, &irte); 3311 modify_irte(irq, &irte);
3251 3312
@@ -3291,8 +3352,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3291 struct msi_msg msg; 3352 struct msi_msg msg;
3292 unsigned int dest; 3353 unsigned int dest;
3293 3354
3294 dest = set_desc_affinity(desc, mask); 3355 if (set_desc_affinity(desc, mask, &dest))
3295 if (dest == BAD_APICID)
3296 return -1; 3356 return -1;
3297 3357
3298 cfg = desc->chip_data; 3358 cfg = desc->chip_data;
@@ -3324,8 +3384,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3324 if (get_irte(irq, &irte)) 3384 if (get_irte(irq, &irte))
3325 return -1; 3385 return -1;
3326 3386
3327 dest = set_desc_affinity(desc, mask); 3387 if (set_desc_affinity(desc, mask, &dest))
3328 if (dest == BAD_APICID)
3329 return -1; 3388 return -1;
3330 3389
3331 irte.vector = cfg->vector; 3390 irte.vector = cfg->vector;
@@ -3410,7 +3469,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3410 int ret; 3469 int ret;
3411 struct msi_msg msg; 3470 struct msi_msg msg;
3412 3471
3413 ret = msi_compose_msg(dev, irq, &msg); 3472 ret = msi_compose_msg(dev, irq, &msg, -1);
3414 if (ret < 0) 3473 if (ret < 0)
3415 return ret; 3474 return ret;
3416 3475
@@ -3507,8 +3566,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3507 struct msi_msg msg; 3566 struct msi_msg msg;
3508 unsigned int dest; 3567 unsigned int dest;
3509 3568
3510 dest = set_desc_affinity(desc, mask); 3569 if (set_desc_affinity(desc, mask, &dest))
3511 if (dest == BAD_APICID)
3512 return -1; 3570 return -1;
3513 3571
3514 cfg = desc->chip_data; 3572 cfg = desc->chip_data;
@@ -3543,7 +3601,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3543 int ret; 3601 int ret;
3544 struct msi_msg msg; 3602 struct msi_msg msg;
3545 3603
3546 ret = msi_compose_msg(NULL, irq, &msg); 3604 ret = msi_compose_msg(NULL, irq, &msg, -1);
3547 if (ret < 0) 3605 if (ret < 0)
3548 return ret; 3606 return ret;
3549 dmar_msi_write(irq, &msg); 3607 dmar_msi_write(irq, &msg);
@@ -3563,8 +3621,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3563 struct msi_msg msg; 3621 struct msi_msg msg;
3564 unsigned int dest; 3622 unsigned int dest;
3565 3623
3566 dest = set_desc_affinity(desc, mask); 3624 if (set_desc_affinity(desc, mask, &dest))
3567 if (dest == BAD_APICID)
3568 return -1; 3625 return -1;
3569 3626
3570 cfg = desc->chip_data; 3627 cfg = desc->chip_data;
@@ -3583,6 +3640,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3583 3640
3584#endif /* CONFIG_SMP */ 3641#endif /* CONFIG_SMP */
3585 3642
3643static struct irq_chip ir_hpet_msi_type = {
3644 .name = "IR-HPET_MSI",
3645 .unmask = hpet_msi_unmask,
3646 .mask = hpet_msi_mask,
3647#ifdef CONFIG_INTR_REMAP
3648 .ack = ir_ack_apic_edge,
3649#ifdef CONFIG_SMP
3650 .set_affinity = ir_set_msi_irq_affinity,
3651#endif
3652#endif
3653 .retrigger = ioapic_retrigger_irq,
3654};
3655
3586static struct irq_chip hpet_msi_type = { 3656static struct irq_chip hpet_msi_type = {
3587 .name = "HPET_MSI", 3657 .name = "HPET_MSI",
3588 .unmask = hpet_msi_unmask, 3658 .unmask = hpet_msi_unmask,
@@ -3594,20 +3664,36 @@ static struct irq_chip hpet_msi_type = {
3594 .retrigger = ioapic_retrigger_irq, 3664 .retrigger = ioapic_retrigger_irq,
3595}; 3665};
3596 3666
3597int arch_setup_hpet_msi(unsigned int irq) 3667int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3598{ 3668{
3599 int ret; 3669 int ret;
3600 struct msi_msg msg; 3670 struct msi_msg msg;
3601 struct irq_desc *desc = irq_to_desc(irq); 3671 struct irq_desc *desc = irq_to_desc(irq);
3602 3672
3603 ret = msi_compose_msg(NULL, irq, &msg); 3673 if (intr_remapping_enabled) {
3674 struct intel_iommu *iommu = map_hpet_to_ir(id);
3675 int index;
3676
3677 if (!iommu)
3678 return -1;
3679
3680 index = alloc_irte(iommu, irq, 1);
3681 if (index < 0)
3682 return -1;
3683 }
3684
3685 ret = msi_compose_msg(NULL, irq, &msg, id);
3604 if (ret < 0) 3686 if (ret < 0)
3605 return ret; 3687 return ret;
3606 3688
3607 hpet_msi_write(irq, &msg); 3689 hpet_msi_write(irq, &msg);
3608 desc->status |= IRQ_MOVE_PCNTXT; 3690 desc->status |= IRQ_MOVE_PCNTXT;
3609 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3691 if (irq_remapped(irq))
3610 "edge"); 3692 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3693 handle_edge_irq, "edge");
3694 else
3695 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3696 handle_edge_irq, "edge");
3611 3697
3612 return 0; 3698 return 0;
3613} 3699}
@@ -3641,8 +3727,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3641 struct irq_cfg *cfg; 3727 struct irq_cfg *cfg;
3642 unsigned int dest; 3728 unsigned int dest;
3643 3729
3644 dest = set_desc_affinity(desc, mask); 3730 if (set_desc_affinity(desc, mask, &dest))
3645 if (dest == BAD_APICID)
3646 return -1; 3731 return -1;
3647 3732
3648 cfg = desc->chip_data; 3733 cfg = desc->chip_data;
@@ -3708,75 +3793,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3793}
3709#endif /* CONFIG_HT_IRQ */ 3794#endif /* CONFIG_HT_IRQ */
3710 3795
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
3761 * longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
3778#endif /* CONFIG_X86_64 */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3796int __init io_apic_get_redir_entries (int ioapic)
3781{ 3797{
3782 union IO_APIC_reg_01 reg_01; 3798 union IO_APIC_reg_01 reg_01;
@@ -3944,7 +3960,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 3960 */
3945 3961
3946 if (physids_empty(apic_id_map)) 3962 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 3963 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 3964
3949 spin_lock_irqsave(&ioapic_lock, flags); 3965 spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 3966 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3960,10 +3976,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 3976 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 3977 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 3978 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 3979 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 3980
3965 for (i = 0; i < get_physical_broadcast(); i++) { 3981 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 3982 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 3983 break;
3968 } 3984 }
3969 3985
@@ -3976,7 +3992,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 3992 apic_id = i;
3977 } 3993 }
3978 3994
3979 tmp = apic->apicid_to_cpu_present(apic_id); 3995 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 3996 physids_or(apic_id_map, apic_id_map, tmp);
3981 3997
3982 if (reg_00.bits.ID != apic_id) { 3998 if (reg_00.bits.ID != apic_id) {
@@ -4106,7 +4122,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4122 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4123 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4124 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4125 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4126 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4127 }
4112 4128
@@ -4140,18 +4156,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4156#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4157fake_ioapic_page:
4142#endif 4158#endif
4143 ioapic_phys = (unsigned long) 4159 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4160 ioapic_phys = __pa(ioapic_phys);
4146 } 4161 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4162 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4163 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4164 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4165 ioapic_phys);
4151 idx++; 4166 idx++;
4152 4167
4153 ioapic_res->start = ioapic_phys; 4168 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4169 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4170 ioapic_res++;
4156 } 4171 }
4157} 4172}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 45404379d173..4ada42c3dabb 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,8 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_t backtrace_mask __read_mostly; 42/* For reliability, we're prepared to waste bits here. */
43static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
43 44
44/* nmi_active: 45/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 46 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
414 } 415 }
415 416
416 /* We can be called before check_nmi_watchdog, hence NULL check. */ 417 /* We can be called before check_nmi_watchdog, hence NULL check. */
417 if (cpumask_test_cpu(cpu, &backtrace_mask)) { 418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
419 420
420 spin_lock(&lock); 421 spin_lock(&lock);
@@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
422 show_regs(regs); 423 show_regs(regs);
423 dump_stack(); 424 dump_stack();
424 spin_unlock(&lock); 425 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, &backtrace_mask); 426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
426 427
427 rc = 1; 428 rc = 1;
428 } 429 }
@@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void)
558{ 559{
559 int i; 560 int i;
560 561
561 cpumask_copy(&backtrace_mask, cpu_online_mask); 562 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
562 563
563 printk(KERN_INFO "sending NMI to all CPUs:\n"); 564 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR); 565 apic->send_IPI_all(NMI_VECTOR);
565 566
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 567 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 568 for (i = 0; i < 10 * 1000; i++) {
568 if (cpumask_empty(&backtrace_mask)) 569 if (cpumask_empty(to_cpumask(backtrace_mask)))
569 break; 570 break;
570 mdelay(1); 571 mdelay(1);
571 } 572 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2b8505..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
264static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
265{ 265{
266 /* 266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270
271 /*
272 * get boot-time SMP configuration: 267 * get boot-time SMP configuration:
273 */ 268 */
274 if (smp_found_config) 269 if (smp_found_config)
@@ -334,10 +329,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
334 return cpu_all_mask; 329 return cpu_all_mask;
335} 330}
336 331
337static inline unsigned long 332static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
338numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
339{ 333{
340 return physid_isset(apicid, bitmap); 334 return physid_isset(apicid, *map);
341} 335}
342 336
343static inline unsigned long numaq_check_apicid_present(int bit) 337static inline unsigned long numaq_check_apicid_present(int bit)
@@ -371,10 +365,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
371 return apic != 0 && irq == 0; 365 return apic != 0 && irq == 0;
372} 366}
373 367
374static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 368static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
375{ 369{
376 /* We don't have a good way to do this yet - hack */ 370 /* We don't have a good way to do this yet - hack */
377 return physids_promote(0xFUL); 371 return physids_promote(0xFUL, retmap);
378} 372}
379 373
380static inline int numaq_cpu_to_logical_apicid(int cpu) 374static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -402,12 +396,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
402 return logical_apicid >> 4; 396 return logical_apicid >> 4;
403} 397}
404 398
405static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 399static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
406{ 400{
407 int node = numaq_apicid_to_node(logical_apicid); 401 int node = numaq_apicid_to_node(logical_apicid);
408 int cpu = __ffs(logical_apicid & 0xf); 402 int cpu = __ffs(logical_apicid & 0xf);
409 403
410 return physid_mask_of_physid(cpu + 4*node); 404 physid_set_mask_of_physid(cpu + 4*node, retmap);
411} 405}
412 406
413/* Where the IO area was mapped on multiquad, always 0 otherwise */ 407/* Where the IO area was mapped on multiquad, always 0 otherwise */
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..1a6559f6768c 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -108,7 +108,7 @@ struct apic apic_default = {
108 .apicid_to_node = default_apicid_to_node, 108 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 110 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 111 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 112 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 113 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 114 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..cf69c59f4910 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
148 break; 148 break;
149 } 149 }
150 150
151 if (cpu < nr_cpu_ids) 151 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152 return per_cpu(x86_cpu_to_logical_apicid, cpu);
153
154 return BAD_APICID;
155} 152}
156 153
157static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..8972f38c5ced 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
146 break; 146 break;
147 } 147 }
148 148
149 if (cpu < nr_cpu_ids) 149 return per_cpu(x86_cpu_to_apicid, cpu);
150 return per_cpu(x86_cpu_to_apicid, cpu);
151
152 return BAD_APICID;
153} 150}
154 151
155static unsigned int x2apic_phys_get_apic_id(unsigned long x) 152static unsigned int x2apic_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f5f5886a6b53..5f92494dab61 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -30,10 +30,22 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/ipi.h> 31#include <asm/ipi.h>
32#include <asm/smp.h> 32#include <asm/smp.h>
33#include <asm/x86_init.h>
33 34
34DEFINE_PER_CPU(int, x2apic_extra_bits); 35DEFINE_PER_CPU(int, x2apic_extra_bits);
35 36
36static enum uv_system_type uv_system_type; 37static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr;
39
40static inline bool is_GRU_range(u64 start, u64 end)
41{
42 return start >= gru_start_paddr && end <= gru_end_paddr;
43}
44
45static bool uv_is_untracked_pat_range(u64 start, u64 end)
46{
47 return is_ISA_range(start, end) || is_GRU_range(start, end);
48}
37 49
38static int early_get_nodeid(void) 50static int early_get_nodeid(void)
39{ 51{
@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 62{
51 if (!strcmp(oem_id, "SGI")) { 63 if (!strcmp(oem_id, "SGI")) {
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
52 if (!strcmp(oem_table_id, "UVL")) 65 if (!strcmp(oem_table_id, "UVL"))
53 uv_system_type = UV_LEGACY_APIC; 66 uv_system_type = UV_LEGACY_APIC;
54 else if (!strcmp(oem_table_id, "UVX")) 67 else if (!strcmp(oem_table_id, "UVX"))
@@ -212,10 +225,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
212 if (cpumask_test_cpu(cpu, cpu_online_mask)) 225 if (cpumask_test_cpu(cpu, cpu_online_mask))
213 break; 226 break;
214 } 227 }
215 if (cpu < nr_cpu_ids) 228 return per_cpu(x86_cpu_to_apicid, cpu);
216 return per_cpu(x86_cpu_to_apicid, cpu);
217
218 return BAD_APICID;
219} 229}
220 230
221static unsigned int x2apic_get_apic_id(unsigned long x) 231static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -352,14 +362,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
352 362
353 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { 363 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
354 alias.v = uv_read_local_mmr(redir_addrs[i].alias); 364 alias.v = uv_read_local_mmr(redir_addrs[i].alias);
355 if (alias.s.base == 0) { 365 if (alias.s.enable && alias.s.base == 0) {
356 *size = (1UL << alias.s.m_alias); 366 *size = (1UL << alias.s.m_alias);
357 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); 367 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
358 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; 368 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
359 return; 369 return;
360 } 370 }
361 } 371 }
362 BUG(); 372 *base = *size = 0;
363} 373}
364 374
365enum map_type {map_wb, map_uc}; 375enum map_type {map_wb, map_uc};
@@ -385,8 +395,12 @@ static __init void map_gru_high(int max_pnode)
385 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; 395 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
386 396
387 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 397 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
388 if (gru.s.enable) 398 if (gru.s.enable) {
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 399 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
400 gru_start_paddr = ((u64)gru.s.base << shift);
401 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
402
403 }
390} 404}
391 405
392static __init void map_mmr_high(int max_pnode) 406static __init void map_mmr_high(int max_pnode)
@@ -409,6 +423,12 @@ static __init void map_mmioh_high(int max_pnode)
409 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 423 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
410} 424}
411 425
426static __init void map_low_mmrs(void)
427{
428 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
429 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
430}
431
412static __init void uv_rtc_init(void) 432static __init void uv_rtc_init(void)
413{ 433{
414 long status; 434 long status;
@@ -550,6 +570,8 @@ void __init uv_system_init(void)
550 unsigned long mmr_base, present, paddr; 570 unsigned long mmr_base, present, paddr;
551 unsigned short pnode_mask; 571 unsigned short pnode_mask;
552 572
573 map_low_mmrs();
574
553 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 575 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
554 m_val = m_n_config.s.m_skt; 576 m_val = m_n_config.s.m_skt;
555 n_val = m_n_config.s.n_skt; 577 n_val = m_n_config.s.n_skt;
@@ -607,8 +629,10 @@ void __init uv_system_init(void)
607 uv_rtc_init(); 629 uv_rtc_init();
608 630
609 for_each_present_cpu(cpu) { 631 for_each_present_cpu(cpu) {
632 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
633
610 nid = cpu_to_node(cpu); 634 nid = cpu_to_node(cpu);
611 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 635 pnode = uv_apicid_to_pnode(apicid);
612 blade = boot_pnode_to_blade(pnode); 636 blade = boot_pnode_to_blade(pnode);
613 lcpu = uv_blade_info[blade].nr_possible_cpus; 637 lcpu = uv_blade_info[blade].nr_possible_cpus;
614 uv_blade_info[blade].nr_possible_cpus++; 638 uv_blade_info[blade].nr_possible_cpus++;
@@ -619,25 +643,23 @@ void __init uv_system_init(void)
619 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 643 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
620 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 644 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
621 uv_cpu_hub_info(cpu)->m_val = m_val; 645 uv_cpu_hub_info(cpu)->m_val = m_val;
622 uv_cpu_hub_info(cpu)->n_val = m_val; 646 uv_cpu_hub_info(cpu)->n_val = n_val;
623 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 647 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
624 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 648 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
625 uv_cpu_hub_info(cpu)->pnode = pnode; 649 uv_cpu_hub_info(cpu)->pnode = pnode;
626 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 650 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
627 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 651 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
628 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 652 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
629 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 653 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
630 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 654 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
631 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 655 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
632 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 656 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
633 uv_node_to_blade[nid] = blade; 657 uv_node_to_blade[nid] = blade;
634 uv_cpu_to_blade[cpu] = blade; 658 uv_cpu_to_blade[cpu] = blade;
635 max_pnode = max(pnode, max_pnode); 659 max_pnode = max(pnode, max_pnode);
636 660
637 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 661 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
638 "lcpu %d, blade %d\n", 662 cpu, apicid, pnode, nid, lcpu, blade);
639 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
640 lcpu, blade);
641 } 663 }
642 664
643 /* Add blade/pnode info for nodes without cpus */ 665 /* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..b5b6b23bce53 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 63a88e1f987d..b0206a211b09 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -101,21 +101,17 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
101} 101}
102 102
103int 103int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, 104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset) 105 unsigned long *intr_mmr_offset)
106{ 106{
107 union uv_watchlist_u size_blade;
108 u64 watchlist; 107 u64 watchlist;
109 s64 ret; 108 s64 ret;
110 109
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /* 110 /*
115 * bios returns watchlist number or negative error number. 111 * bios returns watchlist number or negative error number.
116 */ 112 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, 113 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset, 114 mq_size, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0); 115 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS) 116 if (ret < BIOS_STATUS_SUCCESS)
121 return ret; 117 return ret;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..1d2cb383410e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c965e5212714..468489b57aae 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -74,6 +74,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
74 unsigned int eax, ebx, ecx, edx, sub_index; 74 unsigned int eax, ebx, ecx, edx, sub_index;
75 unsigned int ht_mask_width, core_plus_mask_width; 75 unsigned int ht_mask_width, core_plus_mask_width;
76 unsigned int core_select_mask, core_level_siblings; 76 unsigned int core_select_mask, core_level_siblings;
77 static bool printed;
77 78
78 if (c->cpuid_level < 0xb) 79 if (c->cpuid_level < 0xb)
79 return; 80 return;
@@ -127,12 +128,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
127 128
128 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 129 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
129 130
130 131 if (!printed) {
131 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 132 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
132 c->phys_proc_id); 133 c->phys_proc_id);
133 if (c->x86_max_cores > 1) 134 if (c->x86_max_cores > 1)
134 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 135 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
135 c->cpu_core_id); 136 c->cpu_core_id);
137 printed = 1;
138 }
136 return; 139 return;
137#endif 140#endif
138} 141}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..e485825130d2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid)
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same. 257 * Assumption: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */ 258 */
261#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{ 261{
264#ifdef CONFIG_PCI 262 unsigned long long value;
265 u32 t, cpn; 263 u32 nodes, cores_per_node;
266 u8 n, n_id;
267 int cpu = smp_processor_id(); 264 int cpu = smp_processor_id();
268 265
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
267 return;
268
269 /* fixup topology information only once for a core */ 269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return; 271 return;
272 272
273 /* check for multi-node processor on boot cpu */ 273 rdmsrl(MSR_FAM10H_NODE_ID, value);
274 t = read_pci_config(0, 24, 3, 0xe8); 274
275 if (!(t & (1 << 29))) 275 nodes = ((value >> 3) & 7) + 1;
276 if (nodes == 1)
276 return; 277 return;
277 278
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 279 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes;
279 281
280 /* cores per node: each internal node has half the number of cores */ 282 /* store NodeID, use llc_shared_map to store sibling info */
281 cpn = c->x86_max_cores >> 1; 283 per_cpu(cpu_llc_id, cpu) = value & 7;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306 284
307 /* fixup core id to be in range from 0 to cpn */ 285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */
308 c->cpu_core_id = c->cpu_core_id % cpn; 286 c->cpu_core_id = c->cpu_core_id % cores_per_node;
309#endif
310} 287}
311#endif 288#endif
312 289
@@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
375 node = nearby_node(apicid); 352 node = nearby_node(apicid);
376 } 353 }
377 numa_set_node(cpu, node); 354 numa_set_node(cpu, node);
378
379 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
380#endif 355#endif
381} 356}
382 357
@@ -535,7 +510,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 510 }
536 } 511 }
537 512
538 display_cacheinfo(c); 513 cpu_detect_cache_sizes(c);
539 514
540 /* Multi core CPU? */ 515 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 516 if (c->extended_cpuid_level >= 0x80000008) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3192f22f2fdd..4868e4a951ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -432,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
432#ifdef CONFIG_X86_HT 427#ifdef CONFIG_X86_HT
433 u32 eax, ebx, ecx, edx; 428 u32 eax, ebx, ecx, edx;
434 int index_msb, core_bits; 429 int index_msb, core_bits;
430 static bool printed;
435 431
436 if (!cpu_has(c, X86_FEATURE_HT)) 432 if (!cpu_has(c, X86_FEATURE_HT))
437 return; 433 return;
@@ -447,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
447 smp_num_siblings = (ebx & 0xff0000) >> 16; 443 smp_num_siblings = (ebx & 0xff0000) >> 16;
448 444
449 if (smp_num_siblings == 1) { 445 if (smp_num_siblings == 1) {
450 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 446 printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
451 goto out; 447 goto out;
452 } 448 }
453 449
@@ -474,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
474 ((1 << core_bits) - 1); 470 ((1 << core_bits) - 1);
475 471
476out: 472out:
477 if ((c->x86_max_cores * smp_num_siblings) > 1) { 473 if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
478 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 474 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
479 c->phys_proc_id); 475 c->phys_proc_id);
480 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 476 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
481 c->cpu_core_id); 477 c->cpu_core_id);
478 printed = 1;
482 } 479 }
483#endif 480#endif
484} 481}
@@ -659,24 +656,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 656 const struct cpu_dev *const *cdev;
660 int count = 0; 657 int count = 0;
661 658
659#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 660 printk(KERN_INFO "KERNEL supported cpus:\n");
661#endif
662
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 664 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 665
667 if (count >= X86_VENDOR_NUM) 666 if (count >= X86_VENDOR_NUM)
668 break; 667 break;
669 cpu_devs[count] = cpudev; 668 cpu_devs[count] = cpudev;
670 count++; 669 count++;
671 670
672 for (j = 0; j < 2; j++) { 671#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 672 {
674 continue; 673 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 674
676 cpudev->c_ident[j]); 675 for (j = 0; j < 2; j++) {
676 if (!cpudev->c_ident[j])
677 continue;
678 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
679 cpudev->c_ident[j]);
680 }
677 } 681 }
682#endif
678 } 683 }
679
680 early_identify_cpu(&boot_cpu_data); 684 early_identify_cpu(&boot_cpu_data);
681} 685}
682 686
@@ -837,10 +841,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 841 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 842 }
839 843
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 844 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 845 mcheck_cpu_init(c);
843#endif
844 846
845 select_idle_routine(c); 847 select_idle_routine(c);
846 848
@@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void)
1115 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) 1117 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1116 panic("CPU#%d already initialized!\n", cpu); 1118 panic("CPU#%d already initialized!\n", cpu);
1117 1119
1118 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1120 pr_debug("Initializing CPU#%d\n", cpu);
1119 1121
1120 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1122 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1121 1123
@@ -1136,7 +1138,7 @@ void __cpuinit cpu_init(void)
1136 wrmsrl(MSR_KERNEL_GS_BASE, 0); 1138 wrmsrl(MSR_KERNEL_GS_BASE, 0);
1137 barrier(); 1139 barrier();
1138 1140
1139 check_efer(); 1141 x86_configure_nx();
1140 if (cpu != 0) 1142 if (cpu != 0)
1141 enable_x2apic(); 1143 enable_x2apic();
1142 1144
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 43eb3465dda7..1b1920fa7c80 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd)
190 190
191static void drv_read(struct drv_cmd *cmd) 191static void drv_read(struct drv_cmd *cmd)
192{ 192{
193 int err;
193 cmd->val = 0; 194 cmd->val = 0;
194 195
195 smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); 196 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
197 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
196} 198}
197 199
198static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
@@ -526,15 +528,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
526 528
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) 529static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{ 530{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf 531 /* Intel Xeon Processor 7100 Series Specification Update
532 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an 533 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause 534 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/ 535 * Both Processor Cores to Lock Up. */
533 if (c->x86_vendor == X86_VENDOR_INTEL) { 536 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) && 537 if ((c->x86 == 15) &&
535 (c->x86_model == 6) && 538 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable()) 539 (c->x86_mask == 8)) {
540 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
541 "Xeon(R) 7100 Errata AL30, processors may "
542 "lock up on frequency changes: disabling "
543 "acpi-cpufreq.\n");
537 return -ENODEV; 544 return -ENODEV;
545 }
538 } 546 }
539 return 0; 547 return 0;
540} 548}
@@ -549,13 +557,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
549 unsigned int result = 0; 557 unsigned int result = 0;
550 struct cpuinfo_x86 *c = &cpu_data(policy->cpu); 558 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
551 struct acpi_processor_performance *perf; 559 struct acpi_processor_performance *perf;
560#ifdef CONFIG_SMP
561 static int blacklisted;
562#endif
552 563
553 dprintk("acpi_cpufreq_cpu_init\n"); 564 dprintk("acpi_cpufreq_cpu_init\n");
554 565
555#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c); 567 if (blacklisted)
557 if (result) 568 return blacklisted;
558 return result; 569 blacklisted = acpi_cpufreq_blacklist(c);
570 if (blacklisted)
571 return blacklisted;
559#endif 572#endif
560 573
561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 574 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
@@ -753,14 +766,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
753}; 766};
754 767
755static struct cpufreq_driver acpi_cpufreq_driver = { 768static struct cpufreq_driver acpi_cpufreq_driver = {
756 .verify = acpi_cpufreq_verify, 769 .verify = acpi_cpufreq_verify,
757 .target = acpi_cpufreq_target, 770 .target = acpi_cpufreq_target,
758 .init = acpi_cpufreq_cpu_init, 771 .bios_limit = acpi_processor_get_bios_limit,
759 .exit = acpi_cpufreq_cpu_exit, 772 .init = acpi_cpufreq_cpu_init,
760 .resume = acpi_cpufreq_resume, 773 .exit = acpi_cpufreq_cpu_exit,
761 .name = "acpi-cpufreq", 774 .resume = acpi_cpufreq_resume,
762 .owner = THIS_MODULE, 775 .name = "acpi-cpufreq",
763 .attr = acpi_cpufreq_attr, 776 .owner = THIS_MODULE,
777 .attr = acpi_cpufreq_attr,
764}; 778};
765 779
766static int __init acpi_cpufreq_init(void) 780static int __init acpi_cpufreq_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad9..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); 813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break; 814 break;
815 case 1 ... 15: 815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V1; 816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) { 817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2; 818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]"; 819 cpuname = "C3 'Samuel 2' [C5B]";
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
885 885
886 /* Find ACPI data for processor */ 886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, 887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, 888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr); 889 NULL, (void *)&pr);
890 890
891 /* Check ACPI support for C3 state */ 891 /* Check ACPI support for C3 state */
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..cb01dac267d3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
164 } 164 }
165 165
166 /* cpuinfo and default policy values */ 166 /* cpuinfo and default policy values */
167 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 167 policy->cpuinfo.transition_latency = 200000;
168 policy->cur = busfreq * max_multiplier; 168 policy->cur = busfreq * max_multiplier;
169 169
170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
714}; 714};
715 715
716static struct cpufreq_driver powernow_driver = { 716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .exit = powernow_cpu_exit, 721 .bios_limit = acpi_processor_get_bios_limit,
722 .name = "powernow-k7", 722#endif
723 .owner = THIS_MODULE, 723 .init = powernow_cpu_init,
724 .attr = powernow_table_attr, 724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
725}; 728};
726 729
727static int __init powernow_init(void) 730static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6394aa5c7985..f125e5c551c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
1022 * set it to 1 to avoid problems in the future. 1022 * set it to 1 to avoid problems in the future.
1023 * For all others it's a BIOS bug. 1023 * For all others it's a BIOS bug.
1024 */ 1024 */
1025 if (!boot_cpu_data.x86 == 0x11) 1025 if (boot_cpu_data.x86 != 0x11)
1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1027 "latency\n"); 1027 "latency\n");
1028 max_latency = 1; 1028 max_latency = 1;
@@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1118static int powernowk8_target(struct cpufreq_policy *pol, 1118static int powernowk8_target(struct cpufreq_policy *pol,
1119 unsigned targfreq, unsigned relation) 1119 unsigned targfreq, unsigned relation)
1120{ 1120{
1121 cpumask_t oldmask; 1121 cpumask_var_t oldmask;
1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1123 u32 checkfid; 1123 u32 checkfid;
1124 u32 checkvid; 1124 u32 checkvid;
@@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1131 checkfid = data->currfid; 1131 checkfid = data->currfid;
1132 checkvid = data->currvid; 1132 checkvid = data->currvid;
1133 1133
1134 /* only run on specific CPU from here on */ 1134 /* only run on specific CPU from here on. */
1135 oldmask = current->cpus_allowed; 1135 /* This is poor form: use a workqueue or smp_call_function_single */
1136 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1136 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1137 return -ENOMEM;
1138
1139 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1140 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1137 1141
1138 if (smp_processor_id() != pol->cpu) { 1142 if (smp_processor_id() != pol->cpu) {
1139 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1143 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1193 ret = 0; 1197 ret = 0;
1194 1198
1195err_out: 1199err_out:
1196 set_cpus_allowed_ptr(current, &oldmask); 1200 set_cpus_allowed_ptr(current, oldmask);
1201 free_cpumask_var(oldmask);
1197 return ret; 1202 return ret;
1198} 1203}
1199 1204
@@ -1393,14 +1398,15 @@ static struct freq_attr *powernow_k8_attr[] = {
1393}; 1398};
1394 1399
1395static struct cpufreq_driver cpufreq_amd64_driver = { 1400static struct cpufreq_driver cpufreq_amd64_driver = {
1396 .verify = powernowk8_verify, 1401 .verify = powernowk8_verify,
1397 .target = powernowk8_target, 1402 .target = powernowk8_target,
1398 .init = powernowk8_cpu_init, 1403 .bios_limit = acpi_processor_get_bios_limit,
1399 .exit = __devexit_p(powernowk8_cpu_exit), 1404 .init = powernowk8_cpu_init,
1400 .get = powernowk8_get, 1405 .exit = __devexit_p(powernowk8_cpu_exit),
1401 .name = "powernow-k8", 1406 .get = powernowk8_get,
1402 .owner = THIS_MODULE, 1407 .name = "powernow-k8",
1403 .attr = powernow_k8_attr, 1408 .owner = THIS_MODULE,
1409 .attr = powernow_k8_attr,
1404}; 1410};
1405 1411
1406/* driver entry point for init */ 1412/* driver entry point for init */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f6..2ce8e0b5cc54 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 39
40/* speedstep_processor 40/* speedstep_processor
41 */ 41 */
42static unsigned int speedstep_processor; 42static enum speedstep_processor speedstep_processor;
43 43
44static u32 pmbase; 44static u32 pmbase;
45 45
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
232 return 0; 232 return 0;
233} 233}
234 234
235struct get_freq_data { 235static void get_freq_data(void *_speed)
236 unsigned int speed;
237 unsigned int processor;
238};
239
240static void get_freq_data(void *_data)
241{ 236{
242 struct get_freq_data *data = _data; 237 unsigned int *speed = _speed;
243 238
244 data->speed = speedstep_get_frequency(data->processor); 239 *speed = speedstep_get_frequency(speedstep_processor);
245} 240}
246 241
247static unsigned int speedstep_get(unsigned int cpu) 242static unsigned int speedstep_get(unsigned int cpu)
248{ 243{
249 struct get_freq_data data = { .processor = cpu }; 244 unsigned int speed;
250 245
251 /* You're supposed to ensure CPU is online. */ 246 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) 247 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
253 BUG(); 248 BUG();
254 249
255 dprintk("detected %u kHz as current frequency\n", data.speed); 250 dprintk("detected %u kHz as current frequency\n", speed);
256 return data.speed; 251 return speed;
257} 252}
258 253
259/** 254/**
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..ad0083abfa23 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -34,7 +34,7 @@ static int relaxed_check;
34 * GET PROCESSOR CORE SPEED IN KHZ * 34 * GET PROCESSOR CORE SPEED IN KHZ *
35 *********************************************************************/ 35 *********************************************************************/
36 36
37static unsigned int pentium3_get_frequency(unsigned int processor) 37static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
38{ 38{
39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
40 struct { 40 struct {
@@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void)
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */ 229/* Warning: may get called from smp_call_function_single. */
230unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(enum speedstep_processor processor)
231{ 231{
232 switch (processor) { 232 switch (processor) {
233 case SPEEDSTEP_CPU_PCORE: 233 case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
380 * DETECT SPEEDSTEP SPEEDS * 380 * DETECT SPEEDSTEP SPEEDS *
381 *********************************************************************/ 381 *********************************************************************/
382 382
383unsigned int speedstep_get_freqs(unsigned int processor, 383unsigned int speedstep_get_freqs(enum speedstep_processor processor,
384 unsigned int *low_speed, 384 unsigned int *low_speed,
385 unsigned int *high_speed, 385 unsigned int *high_speed,
386 unsigned int *transition_latency, 386 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
11 11
12 12
13/* processors */ 13/* processors */
14 14enum speedstep_processor {
15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ 16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ 17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ 18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected 19/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 20 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_frequency() call. */ 21 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ 22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ 23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ 24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -31,10 +31,10 @@
31 31
32 32
33/* detect a speedstep-capable processor */ 33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void); 34extern enum speedstep_processor speedstep_detect_processor(void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
42 * SPEEDSTEP_LOW; the second argument is zero so that no 42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated. 43 * cpufreq_notify_transition calls are initiated.
44 */ 44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor, 45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed, 46 unsigned int *low_speed,
47 unsigned int *high_speed, 47 unsigned int *high_speed,
48 unsigned int *transition_latency, 48 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..04d73c114e49 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -35,7 +35,7 @@ static int smi_cmd;
35static unsigned int smi_sig; 35static unsigned int smi_sig;
36 36
37/* info about the processor */ 37/* info about the processor */
38static unsigned int speedstep_processor; 38static enum speedstep_processor speedstep_processor;
39 39
40/* 40/*
41 * There are only two frequency states for each processor. Values 41 * There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formally known as the GX2) */ 373 /* Handle the GX (Formally known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..879666f4d871 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,7 +70,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
74 sched_clock_stable = 1; 73 sched_clock_stable = 1;
75 } 74 }
76 75
@@ -263,11 +262,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
263 /* Don't do the funky fallback heuristics the AMD version employs 262 /* Don't do the funky fallback heuristics the AMD version employs
264 for now. */ 263 for now. */
265 node = apicid_to_node[apicid]; 264 node = apicid_to_node[apicid];
266 if (node == NUMA_NO_NODE || !node_online(node)) 265 if (node == NUMA_NO_NODE)
267 node = first_node(node_online_map); 266 node = first_node(node_online_map);
267 else if (!node_online(node)) {
268 /* reuse the value from init_cpu_to_node() */
269 node = cpu_to_node(cpu);
270 }
268 numa_set_node(cpu, node); 271 numa_set_node(cpu, node);
269
270 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
271#endif 272#endif
272} 273}
273 274
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index f5ccb4fa5a5d..fc6c8ef92dcc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
@@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 108 { 0x00, 0, 0}
106}; 109};
107 110
@@ -488,22 +491,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488#endif 491#endif
489 } 492 }
490 493
491 if (trace)
492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
493 else if (l1i)
494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
495
496 if (l1d)
497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
498 else
499 printk(KERN_CONT "\n");
500
501 if (l2)
502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
503
504 if (l3)
505 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
506
507 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 494 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
508 495
509 return l2; 496 return l2;
@@ -520,18 +507,19 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
520{ 507{
521 struct _cpuid4_info *this_leaf, *sibling_leaf; 508 struct _cpuid4_info *this_leaf, *sibling_leaf;
522 unsigned long num_threads_sharing; 509 unsigned long num_threads_sharing;
523 int index_msb, i; 510 int index_msb, i, sibling;
524 struct cpuinfo_x86 *c = &cpu_data(cpu); 511 struct cpuinfo_x86 *c = &cpu_data(cpu);
525 512
526 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 513 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
527 struct cpuinfo_x86 *d; 514 for_each_cpu(i, c->llc_shared_map) {
528 for_each_online_cpu(i) {
529 if (!per_cpu(ici_cpuid4_info, i)) 515 if (!per_cpu(ici_cpuid4_info, i))
530 continue; 516 continue;
531 d = &cpu_data(i);
532 this_leaf = CPUID4_INFO_IDX(i, index); 517 this_leaf = CPUID4_INFO_IDX(i, index);
533 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), 518 for_each_cpu(sibling, c->llc_shared_map) {
534 d->llc_shared_map); 519 if (!cpu_online(sibling))
520 continue;
521 set_bit(sibling, this_leaf->shared_cpu_map);
522 }
535 } 523 }
536 return; 524 return;
537 } 525 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 472763d92098..73734baa50f2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
74 m->finished = 0; 74 m->finished = 0;
75} 75}
76 76
77static cpumask_t mce_inject_cpumask; 77static cpumask_var_t mce_inject_cpumask;
78 78
79static int mce_raise_notify(struct notifier_block *self, 79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data) 80 unsigned long val, void *data)
@@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self,
82 struct die_args *args = (struct die_args *)data; 82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id(); 83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm); 84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) 85 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE; 86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask); 87 cpumask_clear_cpu(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION) 88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs); 89 raise_exception(m, args->regs);
90 else if (m->status) 90 else if (m->status)
@@ -148,22 +148,22 @@ static void raise_mce(struct mce *m)
148 unsigned long start; 148 unsigned long start;
149 int cpu; 149 int cpu;
150 get_online_cpus(); 150 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map; 151 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
152 cpu_clear(get_cpu(), mce_inject_cpumask); 152 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) { 153 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu); 154 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished || 155 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask); 157 cpumask_clear_cpu(cpu, mce_inject_cpumask);
158 } 158 }
159 if (!cpus_empty(mce_inject_cpumask)) 159 if (!cpumask_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); 160 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies; 161 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) { 162 while (!cpumask_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) { 163 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR 164 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n", 165 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask)); 166 *cpumask_bits(mce_inject_cpumask));
167 break; 167 break;
168 } 168 }
169 cpu_relax(); 169 cpu_relax();
@@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
210 210
211static int inject_init(void) 211static int inject_init(void)
212{ 212{
213 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
214 return -ENOMEM;
213 printk(KERN_INFO "Machine check injector initialized\n"); 215 printk(KERN_INFO "Machine check injector initialized\n");
214 mce_chrdev_ops.write = mce_write; 216 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb); 217 register_die_notifier(&mce_raise_nb);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 183c3457d2f4..a8aacd4b513c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h>
51
49int mce_disabled __read_mostly; 52int mce_disabled __read_mostly;
50 53
51#define MISC_MCELOG_MINOR 227 54#define MISC_MCELOG_MINOR 227
@@ -85,6 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 88static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 89static int cpu_missing;
87 90
91/*
92 * CPU/chipset specific EDAC code can register a notifier call here to print
93 * MCE errors in a human-readable form.
94 */
95ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
96EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
97
98static int default_decode_mce(struct notifier_block *nb, unsigned long val,
99 void *data)
100{
101 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
102 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
103
104 return NOTIFY_STOP;
105}
106
107static struct notifier_block mce_dec_nb = {
108 .notifier_call = default_decode_mce,
109 .priority = -1,
110};
88 111
89/* MCA banks polled by the period polling timer for corrected events */ 112/* MCA banks polled by the period polling timer for corrected events */
90DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 113DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -129,6 +152,9 @@ void mce_log(struct mce *mce)
129{ 152{
130 unsigned next, entry; 153 unsigned next, entry;
131 154
155 /* Emit the trace record: */
156 trace_mce_record(mce);
157
132 mce->finished = 0; 158 mce->finished = 0;
133 wmb(); 159 wmb();
134 for (;;) { 160 for (;;) {
@@ -165,46 +191,46 @@ void mce_log(struct mce *mce)
165 set_bit(0, &mce_need_notify); 191 set_bit(0, &mce_need_notify);
166} 192}
167 193
168void __weak decode_mce(struct mce *m)
169{
170 return;
171}
172
173static void print_mce(struct mce *m) 194static void print_mce(struct mce *m)
174{ 195{
175 printk(KERN_EMERG 196 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
176 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
177 m->extcpu, m->mcgstatus, m->bank, m->status); 197 m->extcpu, m->mcgstatus, m->bank, m->status);
198
178 if (m->ip) { 199 if (m->ip) {
179 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 200 pr_emerg("RIP%s %02x:<%016Lx> ",
180 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 201 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
181 m->cs, m->ip); 202 m->cs, m->ip);
203
182 if (m->cs == __KERNEL_CS) 204 if (m->cs == __KERNEL_CS)
183 print_symbol("{%s}", m->ip); 205 print_symbol("{%s}", m->ip);
184 printk(KERN_CONT "\n"); 206 pr_cont("\n");
185 } 207 }
186 printk(KERN_EMERG "TSC %llx ", m->tsc); 208
209 pr_emerg("TSC %llx ", m->tsc);
187 if (m->addr) 210 if (m->addr)
188 printk(KERN_CONT "ADDR %llx ", m->addr); 211 pr_cont("ADDR %llx ", m->addr);
189 if (m->misc) 212 if (m->misc)
190 printk(KERN_CONT "MISC %llx ", m->misc); 213 pr_cont("MISC %llx ", m->misc);
191 printk(KERN_CONT "\n"); 214
192 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 215 pr_cont("\n");
193 m->cpuvendor, m->cpuid, m->time, m->socketid, 216 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
194 m->apicid); 217 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
195 218
196 decode_mce(m); 219 /*
220 * Print out human-readable details about the MCE error,
221 * (if the CPU has an implementation for that)
222 */
223 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
197} 224}
198 225
199static void print_mce_head(void) 226static void print_mce_head(void)
200{ 227{
201 printk(KERN_EMERG "\nHARDWARE ERROR\n"); 228 pr_emerg("\nHARDWARE ERROR\n");
202} 229}
203 230
204static void print_mce_tail(void) 231static void print_mce_tail(void)
205{ 232{
206 printk(KERN_EMERG "This is not a software problem!\n" 233 pr_emerg("This is not a software problem!\n");
207 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
208} 234}
209 235
210#define PANIC_TIMEOUT 5 /* 5 seconds */ 236#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -218,6 +244,7 @@ static atomic_t mce_fake_paniced;
218static void wait_for_panic(void) 244static void wait_for_panic(void)
219{ 245{
220 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 246 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
247
221 preempt_disable(); 248 preempt_disable();
222 local_irq_enable(); 249 local_irq_enable();
223 while (timeout-- > 0) 250 while (timeout-- > 0)
@@ -285,6 +312,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
285static int msr_to_offset(u32 msr) 312static int msr_to_offset(u32 msr)
286{ 313{
287 unsigned bank = __get_cpu_var(injectm.bank); 314 unsigned bank = __get_cpu_var(injectm.bank);
315
288 if (msr == rip_msr) 316 if (msr == rip_msr)
289 return offsetof(struct mce, ip); 317 return offsetof(struct mce, ip);
290 if (msr == MSR_IA32_MCx_STATUS(bank)) 318 if (msr == MSR_IA32_MCx_STATUS(bank))
@@ -1108,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1108static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1136static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1109static DEFINE_PER_CPU(struct timer_list, mce_timer); 1137static DEFINE_PER_CPU(struct timer_list, mce_timer);
1110 1138
1111static void mcheck_timer(unsigned long data) 1139static void mce_start_timer(unsigned long data)
1112{ 1140{
1113 struct timer_list *t = &per_cpu(mce_timer, data); 1141 struct timer_list *t = &per_cpu(mce_timer, data);
1114 int *n; 1142 int *n;
@@ -1173,7 +1201,7 @@ int mce_notify_irq(void)
1173} 1201}
1174EXPORT_SYMBOL_GPL(mce_notify_irq); 1202EXPORT_SYMBOL_GPL(mce_notify_irq);
1175 1203
1176static int mce_banks_init(void) 1204static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1177{ 1205{
1178 int i; 1206 int i;
1179 1207
@@ -1192,7 +1220,7 @@ static int mce_banks_init(void)
1192/* 1220/*
1193 * Initialize Machine Checks for a CPU. 1221 * Initialize Machine Checks for a CPU.
1194 */ 1222 */
1195static int __cpuinit mce_cap_init(void) 1223static int __cpuinit __mcheck_cpu_cap_init(void)
1196{ 1224{
1197 unsigned b; 1225 unsigned b;
1198 u64 cap; 1226 u64 cap;
@@ -1200,7 +1228,8 @@ static int __cpuinit mce_cap_init(void)
1200 rdmsrl(MSR_IA32_MCG_CAP, cap); 1228 rdmsrl(MSR_IA32_MCG_CAP, cap);
1201 1229
1202 b = cap & MCG_BANKCNT_MASK; 1230 b = cap & MCG_BANKCNT_MASK;
1203 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1231 if (!banks)
1232 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1204 1233
1205 if (b > MAX_NR_BANKS) { 1234 if (b > MAX_NR_BANKS) {
1206 printk(KERN_WARNING 1235 printk(KERN_WARNING
@@ -1213,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
1213 WARN_ON(banks != 0 && b != banks); 1242 WARN_ON(banks != 0 && b != banks);
1214 banks = b; 1243 banks = b;
1215 if (!mce_banks) { 1244 if (!mce_banks) {
1216 int err = mce_banks_init(); 1245 int err = __mcheck_cpu_mce_banks_init();
1217 1246
1218 if (err) 1247 if (err)
1219 return err; 1248 return err;
@@ -1229,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
1229 return 0; 1258 return 0;
1230} 1259}
1231 1260
1232static void mce_init(void) 1261static void __mcheck_cpu_init_generic(void)
1233{ 1262{
1234 mce_banks_t all_banks; 1263 mce_banks_t all_banks;
1235 u64 cap; 1264 u64 cap;
@@ -1258,7 +1287,7 @@ static void mce_init(void)
1258} 1287}
1259 1288
1260/* Add per CPU specific workarounds here */ 1289/* Add per CPU specific workarounds here */
1261static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1290static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1262{ 1291{
1263 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1292 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1264 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1293 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1326,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1326 return 0; 1355 return 0;
1327} 1356}
1328 1357
1329static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1358static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1330{ 1359{
1331 if (c->x86 != 5) 1360 if (c->x86 != 5)
1332 return; 1361 return;
@@ -1340,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1340 } 1369 }
1341} 1370}
1342 1371
1343static void mce_cpu_features(struct cpuinfo_x86 *c) 1372static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1344{ 1373{
1345 switch (c->x86_vendor) { 1374 switch (c->x86_vendor) {
1346 case X86_VENDOR_INTEL: 1375 case X86_VENDOR_INTEL:
@@ -1354,18 +1383,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1354 } 1383 }
1355} 1384}
1356 1385
1357static void mce_init_timer(void) 1386static void __mcheck_cpu_init_timer(void)
1358{ 1387{
1359 struct timer_list *t = &__get_cpu_var(mce_timer); 1388 struct timer_list *t = &__get_cpu_var(mce_timer);
1360 int *n = &__get_cpu_var(mce_next_interval); 1389 int *n = &__get_cpu_var(mce_next_interval);
1361 1390
1391 setup_timer(t, mce_start_timer, smp_processor_id());
1392
1362 if (mce_ignore_ce) 1393 if (mce_ignore_ce)
1363 return; 1394 return;
1364 1395
1365 *n = check_interval * HZ; 1396 *n = check_interval * HZ;
1366 if (!*n) 1397 if (!*n)
1367 return; 1398 return;
1368 setup_timer(t, mcheck_timer, smp_processor_id());
1369 t->expires = round_jiffies(jiffies + *n); 1399 t->expires = round_jiffies(jiffies + *n);
1370 add_timer_on(t, smp_processor_id()); 1400 add_timer_on(t, smp_processor_id());
1371} 1401}
@@ -1385,27 +1415,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1385 * Called for each booted CPU to set up machine checks. 1415 * Called for each booted CPU to set up machine checks.
1386 * Must be called with preempt off: 1416 * Must be called with preempt off:
1387 */ 1417 */
1388void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1418void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1389{ 1419{
1390 if (mce_disabled) 1420 if (mce_disabled)
1391 return; 1421 return;
1392 1422
1393 mce_ancient_init(c); 1423 __mcheck_cpu_ancient_init(c);
1394 1424
1395 if (!mce_available(c)) 1425 if (!mce_available(c))
1396 return; 1426 return;
1397 1427
1398 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1428 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1399 mce_disabled = 1; 1429 mce_disabled = 1;
1400 return; 1430 return;
1401 } 1431 }
1402 1432
1403 machine_check_vector = do_machine_check; 1433 machine_check_vector = do_machine_check;
1404 1434
1405 mce_init(); 1435 __mcheck_cpu_init_generic();
1406 mce_cpu_features(c); 1436 __mcheck_cpu_init_vendor(c);
1407 mce_init_timer(); 1437 __mcheck_cpu_init_timer();
1408 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1438 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1439
1409} 1440}
1410 1441
1411/* 1442/*
@@ -1625,6 +1656,15 @@ static int __init mcheck_enable(char *str)
1625} 1656}
1626__setup("mce", mcheck_enable); 1657__setup("mce", mcheck_enable);
1627 1658
1659int __init mcheck_init(void)
1660{
1661 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1662
1663 mcheck_intel_therm_init();
1664
1665 return 0;
1666}
1667
1628/* 1668/*
1629 * Sysfs support 1669 * Sysfs support
1630 */ 1670 */
@@ -1633,7 +1673,7 @@ __setup("mce", mcheck_enable);
1633 * Disable machine checks on suspend and shutdown. We can't really handle 1673 * Disable machine checks on suspend and shutdown. We can't really handle
1634 * them later. 1674 * them later.
1635 */ 1675 */
1636static int mce_disable(void) 1676static int mce_disable_error_reporting(void)
1637{ 1677{
1638 int i; 1678 int i;
1639 1679
@@ -1648,12 +1688,12 @@ static int mce_disable(void)
1648 1688
1649static int mce_suspend(struct sys_device *dev, pm_message_t state) 1689static int mce_suspend(struct sys_device *dev, pm_message_t state)
1650{ 1690{
1651 return mce_disable(); 1691 return mce_disable_error_reporting();
1652} 1692}
1653 1693
1654static int mce_shutdown(struct sys_device *dev) 1694static int mce_shutdown(struct sys_device *dev)
1655{ 1695{
1656 return mce_disable(); 1696 return mce_disable_error_reporting();
1657} 1697}
1658 1698
1659/* 1699/*
@@ -1663,8 +1703,8 @@ static int mce_shutdown(struct sys_device *dev)
1663 */ 1703 */
1664static int mce_resume(struct sys_device *dev) 1704static int mce_resume(struct sys_device *dev)
1665{ 1705{
1666 mce_init(); 1706 __mcheck_cpu_init_generic();
1667 mce_cpu_features(&current_cpu_data); 1707 __mcheck_cpu_init_vendor(&current_cpu_data);
1668 1708
1669 return 0; 1709 return 0;
1670} 1710}
@@ -1674,8 +1714,8 @@ static void mce_cpu_restart(void *data)
1674 del_timer_sync(&__get_cpu_var(mce_timer)); 1714 del_timer_sync(&__get_cpu_var(mce_timer));
1675 if (!mce_available(&current_cpu_data)) 1715 if (!mce_available(&current_cpu_data))
1676 return; 1716 return;
1677 mce_init(); 1717 __mcheck_cpu_init_generic();
1678 mce_init_timer(); 1718 __mcheck_cpu_init_timer();
1679} 1719}
1680 1720
1681/* Reinit MCEs after user configuration changes */ 1721/* Reinit MCEs after user configuration changes */
@@ -1701,7 +1741,7 @@ static void mce_enable_ce(void *all)
1701 cmci_reenable(); 1741 cmci_reenable();
1702 cmci_recheck(); 1742 cmci_recheck();
1703 if (all) 1743 if (all)
1704 mce_init_timer(); 1744 __mcheck_cpu_init_timer();
1705} 1745}
1706 1746
1707static struct sysdev_class mce_sysclass = { 1747static struct sysdev_class mce_sysclass = {
@@ -1889,7 +1929,7 @@ error2:
1889 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1929 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1890error: 1930error:
1891 while (--i >= 0) 1931 while (--i >= 0)
1892 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1932 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1893 1933
1894 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1934 sysdev_unregister(&per_cpu(mce_dev, cpu));
1895 1935
@@ -1914,13 +1954,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1914} 1954}
1915 1955
1916/* Make sure there are no machine checks on offlined CPUs. */ 1956/* Make sure there are no machine checks on offlined CPUs. */
1917static void mce_disable_cpu(void *h) 1957static void __cpuinit mce_disable_cpu(void *h)
1918{ 1958{
1919 unsigned long action = *(unsigned long *)h; 1959 unsigned long action = *(unsigned long *)h;
1920 int i; 1960 int i;
1921 1961
1922 if (!mce_available(&current_cpu_data)) 1962 if (!mce_available(&current_cpu_data))
1923 return; 1963 return;
1964
1924 if (!(action & CPU_TASKS_FROZEN)) 1965 if (!(action & CPU_TASKS_FROZEN))
1925 cmci_clear(); 1966 cmci_clear();
1926 for (i = 0; i < banks; i++) { 1967 for (i = 0; i < banks; i++) {
@@ -1931,7 +1972,7 @@ static void mce_disable_cpu(void *h)
1931 } 1972 }
1932} 1973}
1933 1974
1934static void mce_reenable_cpu(void *h) 1975static void __cpuinit mce_reenable_cpu(void *h)
1935{ 1976{
1936 unsigned long action = *(unsigned long *)h; 1977 unsigned long action = *(unsigned long *)h;
1937 int i; 1978 int i;
@@ -1976,9 +2017,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1976 break; 2017 break;
1977 case CPU_DOWN_FAILED: 2018 case CPU_DOWN_FAILED:
1978 case CPU_DOWN_FAILED_FROZEN: 2019 case CPU_DOWN_FAILED_FROZEN:
1979 t->expires = round_jiffies(jiffies + 2020 if (!mce_ignore_ce && check_interval) {
2021 t->expires = round_jiffies(jiffies +
1980 __get_cpu_var(mce_next_interval)); 2022 __get_cpu_var(mce_next_interval));
1981 add_timer_on(t, cpu); 2023 add_timer_on(t, cpu);
2024 }
1982 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2025 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1983 break; 2026 break;
1984 case CPU_POST_DEAD: 2027 case CPU_POST_DEAD:
@@ -2010,7 +2053,7 @@ static __init void mce_init_banks(void)
2010 } 2053 }
2011} 2054}
2012 2055
2013static __init int mce_init_device(void) 2056static __init int mcheck_init_device(void)
2014{ 2057{
2015 int err; 2058 int err;
2016 int i = 0; 2059 int i = 0;
@@ -2038,7 +2081,7 @@ static __init int mce_init_device(void)
2038 return err; 2081 return err;
2039} 2082}
2040 2083
2041device_initcall(mce_init_device); 2084device_initcall(mcheck_init_device);
2042 2085
2043/* 2086/*
2044 * Old style boot options parsing. Only for compatibility. 2087 * Old style boot options parsing. Only for compatibility.
@@ -2086,7 +2129,7 @@ static int fake_panic_set(void *data, u64 val)
2086DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2129DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2087 fake_panic_set, "%llu\n"); 2130 fake_panic_set, "%llu\n");
2088 2131
2089static int __init mce_debugfs_init(void) 2132static int __init mcheck_debugfs_init(void)
2090{ 2133{
2091 struct dentry *dmce, *ffake_panic; 2134 struct dentry *dmce, *ffake_panic;
2092 2135
@@ -2100,5 +2143,5 @@ static int __init mce_debugfs_init(void)
2100 2143
2101 return 0; 2144 return 0;
2102} 2145}
2103late_initcall(mce_debugfs_init); 2146late_initcall(mcheck_debugfs_init);
2104#endif 2147#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 889f665fe93d..7c785634af2b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/sched.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/processor.h> 13#include <asm/processor.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..81c499eceb21 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,14 +256,34 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259/* Thermal monitoring depends on APIC, ACPI and clock modulation */
260static int intel_thermal_supported(struct cpuinfo_x86 *c)
261{
262 if (!cpu_has_apic)
263 return 0;
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return 0;
266 return 1;
267}
268
269void __init mcheck_intel_therm_init(void)
270{
271 /*
272 * This function is only called on boot CPU. Save the init thermal
273 * LVT value on BSP and use that value to restore APs' thermal LVT
274 * entry BIOS programmed later
275 */
276 if (intel_thermal_supported(&boot_cpu_data))
277 lvtthmr_init = apic_read(APIC_LVTTHMR);
278}
279
257void intel_init_thermal(struct cpuinfo_x86 *c) 280void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 281{
259 unsigned int cpu = smp_processor_id(); 282 unsigned int cpu = smp_processor_id();
260 int tm2 = 0; 283 int tm2 = 0;
261 u32 l, h; 284 u32 l, h;
262 285
263 /* Thermal monitoring depends on ACPI and clock modulation*/ 286 if (!intel_thermal_supported(c))
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return; 287 return;
266 288
267 /* 289 /*
@@ -270,7 +292,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 292 * since it might be delivered via SMI already:
271 */ 293 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 294 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 295
296 /*
297 * The initial value of thermal LVT entries on all APs always reads
298 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
299 * sequence to them and LVT registers are reset to 0s except for
300 * the mask bits which are set to 1s when APs receive INIT IPI.
301 * Always restore the value that BIOS has programmed on AP based on
302 * BSP's info we saved since BIOS is always setting the same value
303 * for all threads/cores
304 */
305 apic_write(APIC_LVTTHMR, lvtthmr_init);
306
307 h = lvtthmr_init;
308
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 309 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 310 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 311 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
@@ -312,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
312 l = apic_read(APIC_LVTTHMR); 347 l = apic_read(APIC_LVTTHMR);
313 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 348 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
314 349
315 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 350 printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
316 cpu, tm2 ? "TM2" : "TM1"); 351 tm2 ? "TM2" : "TM1");
317 352
318 /* enable thermal throttle processing */ 353 /* enable thermal throttle processing */
319 atomic_set(&therm_throt_en, 1); 354 atomic_set(&therm_throt_en, 1);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 315738c74aad..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
170 return start1 - start2; 170 return start1 - start2;
171} 171}
172 172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
173#define BIOS_BUG_MSG KERN_WARNING \ 208#define BIOS_BUG_MSG KERN_WARNING \
174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
175 210
@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 subtract_range(range, extra_remove_base, 258 subtract_range(range, extra_remove_base,
224 extra_remove_base + extra_remove_size - 1); 259 extra_remove_base + extra_remove_size - 1);
225 260
226 /* get new range num */
227 nr_range = 0;
228 for (i = 0; i < RANGE_NUM; i++) {
229 if (!range[i].end)
230 continue;
231 nr_range++;
232 }
233 if (debug_print) { 261 if (debug_print) {
234 printk(KERN_DEBUG "After UC checking\n"); 262 printk(KERN_DEBUG "After UC checking\n");
235 for (i = 0; i < nr_range; i++) 263 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end)
265 continue;
236 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
237 range[i].start, range[i].end + 1); 267 range[i].start, range[i].end + 1);
268 }
238 } 269 }
239 270
240 /* sort the ranges */ 271 /* sort the ranges */
241 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 272 nr_range = clean_sort_range(range, RANGE_NUM);
242 if (debug_print) { 273 if (debug_print) {
243 printk(KERN_DEBUG "After sorting\n"); 274 printk(KERN_DEBUG "After sorting\n");
244 for (i = 0; i < nr_range; i++) 275 for (i = 0; i < nr_range; i++)
@@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void)
689 continue; 720 continue;
690 if (!size) 721 if (!size)
691 type = MTRR_NUM_TYPES; 722 type = MTRR_NUM_TYPES;
692 if (type == MTRR_TYPE_WRPROT)
693 type = MTRR_TYPE_UNCACHABLE;
694 num[type]++; 723 num[type]++;
695 } 724 }
696 725
@@ -846,7 +875,7 @@ int __init mtrr_cleanup(unsigned address_bits)
846 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
847 876
848 range_sums = sum_ranges(range, nr_range); 877 range_sums = sum_ranges(range, nr_range);
849 printk(KERN_INFO "total RAM coverred: %ldM\n", 878 printk(KERN_INFO "total RAM covered: %ldM\n",
850 range_sums >> (20 - PAGE_SHIFT)); 879 range_sums >> (20 - PAGE_SHIFT));
851 880
852 if (mtrr_chunk_size && mtrr_gran_size) { 881 if (mtrr_chunk_size && mtrr_gran_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index f04e72527604..e006e56f699c 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -4,6 +4,7 @@
4#include <linux/proc_fs.h> 4#include <linux/proc_fs.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h>
7#include <linux/init.h> 8#include <linux/init.h>
8 9
9#define LINE_SIZE 80 10#define LINE_SIZE 80
@@ -96,17 +97,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
96 unsigned long long base, size; 97 unsigned long long base, size;
97 char *ptr; 98 char *ptr;
98 char line[LINE_SIZE]; 99 char line[LINE_SIZE];
100 int length;
99 size_t linelen; 101 size_t linelen;
100 102
101 if (!capable(CAP_SYS_ADMIN)) 103 if (!capable(CAP_SYS_ADMIN))
102 return -EPERM; 104 return -EPERM;
103 if (!len)
104 return -EINVAL;
105 105
106 memset(line, 0, LINE_SIZE); 106 memset(line, 0, LINE_SIZE);
107 if (len > LINE_SIZE) 107
108 len = LINE_SIZE; 108 length = len;
109 if (copy_from_user(line, buf, len - 1)) 109 length--;
110
111 if (length > LINE_SIZE - 1)
112 length = LINE_SIZE - 1;
113
114 if (length < 0)
115 return -EINVAL;
116
117 if (copy_from_user(line, buf, length))
110 return -EFAULT; 118 return -EFAULT;
111 119
112 linelen = strlen(line); 120 linelen = strlen(line);
@@ -126,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
126 return -EINVAL; 134 return -EINVAL;
127 135
128 base = simple_strtoull(line + 5, &ptr, 0); 136 base = simple_strtoull(line + 5, &ptr, 0);
129 while (isspace(*ptr)) 137 ptr = skip_spaces(ptr);
130 ptr++;
131 138
132 if (strncmp(ptr, "size=", 5)) 139 if (strncmp(ptr, "size=", 5))
133 return -EINVAL; 140 return -EINVAL;
@@ -135,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
135 size = simple_strtoull(ptr + 5, &ptr, 0); 142 size = simple_strtoull(ptr + 5, &ptr, 0);
136 if ((base & 0xfff) || (size & 0xfff)) 143 if ((base & 0xfff) || (size & 0xfff))
137 return -EINVAL; 144 return -EINVAL;
138 while (isspace(*ptr)) 145 ptr = skip_spaces(ptr);
139 ptr++;
140 146
141 if (strncmp(ptr, "type=", 5)) 147 if (strncmp(ptr, "type=", 5))
142 return -EINVAL; 148 return -EINVAL;
143 ptr += 5; 149 ptr = skip_spaces(ptr + 5);
144 while (isspace(*ptr))
145 ptr++;
146 150
147 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 151 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
148 if (strcmp(ptr, mtrr_strings[i])) 152 if (strcmp(ptr, mtrr_strings[i]))
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..d616c06e99b4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -77,6 +77,18 @@ struct cpu_hw_events {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
@@ -102,6 +114,8 @@ struct x86_pmu {
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
155 return hw_event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
173static u64 intel_pmu_event_map(int hw_event) 228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[hw_event]; 230 return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 245 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 246 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 247
193static const u64 nehalem_hw_cache_event_ids 248static __initconst u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX] 249 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX] 250 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
281 }, 336 },
282}; 337};
283 338
284static const u64 core2_hw_cache_event_ids 339static __initconst u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX] 340 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX] 341 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
372 }, 427 },
373}; 428};
374 429
375static const u64 atom_hw_cache_event_ids 430static __initconst u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX] 431 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX] 432 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
481 return hw_event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static __initconst u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX] 540 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX] 541 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 987 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 989
990 hwc->idx = -1;
991
935 /* 992 /*
936 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
937 */ 994 */
@@ -1229,7 +1286,7 @@ x86_perf_event_set_period(struct perf_event *event,
1229 return 0; 1286 return 0;
1230 1287
1231 /* 1288 /*
1232 * If we are way outside a reasoable range then just skip forward: 1289 * If we are way outside a reasonable range then just skip forward:
1233 */ 1290 */
1234 if (unlikely(left <= -period)) { 1291 if (unlikely(left <= -period)) {
1235 left = period; 1292 left = period;
@@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1334 x86_pmu_enable_event(hwc, idx); 1391 x86_pmu_enable_event(hwc, idx);
1335} 1392}
1336 1393
1337static int 1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{ 1395{
1340 unsigned int hw_event; 1396 unsigned int hw_event;
1341 1397
@@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1349 if (!x86_pmu.num_events_fixed) 1405 if (!x86_pmu.num_events_fixed)
1350 return -1; 1406 return -1;
1351 1407
1408 /*
1409 * fixed counters do not take all possible filters
1410 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1412 return -1;
1413
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1360} 1422}
1361 1423
1362/* 1424/*
1363 * Find a PMC slot for the freshly enabled / scheduled in event: 1425 * generic counter allocator: get next free counter
1364 */ 1426 */
1365static int x86_pmu_enable(struct perf_event *event) 1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1429{
1430 int idx;
1431
1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435
1436/*
1437 * intel-specific counter allocator: check event constraints
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444
1445 if (!event_constraints)
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1465{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw;
1369 int idx; 1466 int idx;
1370 1467
1371 idx = fixed_mode_idx(event, hwc); 1468 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1470 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1471 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1472 return -EAGAIN;
1376 1473
1377 hwc->config_base = 0; 1474 hwc->config_base = 0;
1378 hwc->event_base = 0; 1475 hwc->event_base = 0;
1379 hwc->idx = idx; 1476 hwc->idx = idx;
1380 } else if (idx >= 0) { 1477 } else if (idx >= 0) {
1381 /* 1478 /*
@@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event)
1396 } else { 1493 } else {
1397 idx = hwc->idx; 1494 idx = hwc->idx;
1398 /* Try to get the previous generic event again */ 1495 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1497try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_events); 1499 if (idx == -1)
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN; 1500 return -EAGAIN;
1405 1501
1406 set_bit(idx, cpuc->used_mask); 1502 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1503 hwc->idx = idx;
1408 } 1504 }
1409 hwc->config_base = x86_pmu.eventsel; 1505 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr; 1506 hwc->event_base = x86_pmu.perfctr;
1411 } 1507 }
1412 1508
1509 return idx;
1510}
1511
1512/*
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1413 perf_events_lapic_init(); 1525 perf_events_lapic_init();
1414 1526
1415 x86_pmu.disable(hwc, idx); 1527 x86_pmu.disable(hwc, idx);
@@ -1520,6 +1632,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1520 1632
1521 data.period = event->hw.last_period; 1633 data.period = event->hw.last_period;
1522 data.addr = 0; 1634 data.addr = 0;
1635 data.raw = NULL;
1523 regs.ip = 0; 1636 regs.ip = 0;
1524 1637
1525 /* 1638 /*
@@ -1637,6 +1750,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1637 u64 val; 1750 u64 val;
1638 1751
1639 data.addr = 0; 1752 data.addr = 0;
1753 data.raw = NULL;
1640 1754
1641 cpuc = &__get_cpu_var(cpu_hw_events); 1755 cpuc = &__get_cpu_var(cpu_hw_events);
1642 1756
@@ -1682,6 +1796,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1682 u64 ack, status; 1796 u64 ack, status;
1683 1797
1684 data.addr = 0; 1798 data.addr = 0;
1799 data.raw = NULL;
1685 1800
1686 cpuc = &__get_cpu_var(cpu_hw_events); 1801 cpuc = &__get_cpu_var(cpu_hw_events);
1687 1802
@@ -1745,6 +1860,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1745 u64 val; 1860 u64 val;
1746 1861
1747 data.addr = 0; 1862 data.addr = 0;
1863 data.raw = NULL;
1748 1864
1749 cpuc = &__get_cpu_var(cpu_hw_events); 1865 cpuc = &__get_cpu_var(cpu_hw_events);
1750 1866
@@ -1852,7 +1968,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1968 .priority = 1
1853}; 1969};
1854 1970
1855static struct x86_pmu p6_pmu = { 1971static __initconst struct x86_pmu p6_pmu = {
1856 .name = "p6", 1972 .name = "p6",
1857 .handle_irq = p6_pmu_handle_irq, 1973 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all, 1974 .disable_all = p6_pmu_disable_all,
@@ -1877,9 +1993,10 @@ static struct x86_pmu p6_pmu = {
1877 */ 1993 */
1878 .event_bits = 32, 1994 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1, 1995 .event_mask = (1ULL << 32) - 1,
1996 .get_event_idx = intel_get_event_idx,
1880}; 1997};
1881 1998
1882static struct x86_pmu intel_pmu = { 1999static __initconst struct x86_pmu intel_pmu = {
1883 .name = "Intel", 2000 .name = "Intel",
1884 .handle_irq = intel_pmu_handle_irq, 2001 .handle_irq = intel_pmu_handle_irq,
1885 .disable_all = intel_pmu_disable_all, 2002 .disable_all = intel_pmu_disable_all,
@@ -1900,9 +2017,10 @@ static struct x86_pmu intel_pmu = {
1900 .max_period = (1ULL << 31) - 1, 2017 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts, 2018 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts, 2019 .disable_bts = intel_pmu_disable_bts,
2020 .get_event_idx = intel_get_event_idx,
1903}; 2021};
1904 2022
1905static struct x86_pmu amd_pmu = { 2023static __initconst struct x86_pmu amd_pmu = {
1906 .name = "AMD", 2024 .name = "AMD",
1907 .handle_irq = amd_pmu_handle_irq, 2025 .handle_irq = amd_pmu_handle_irq,
1908 .disable_all = amd_pmu_disable_all, 2026 .disable_all = amd_pmu_disable_all,
@@ -1920,9 +2038,10 @@ static struct x86_pmu amd_pmu = {
1920 .apic = 1, 2038 .apic = 1,
1921 /* use highest bit to detect overflow */ 2039 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1, 2040 .max_period = (1ULL << 47) - 1,
2041 .get_event_idx = gen_get_event_idx,
1923}; 2042};
1924 2043
1925static int p6_pmu_init(void) 2044static __init int p6_pmu_init(void)
1926{ 2045{
1927 switch (boot_cpu_data.x86_model) { 2046 switch (boot_cpu_data.x86_model) {
1928 case 1: 2047 case 1:
@@ -1932,10 +2051,12 @@ static int p6_pmu_init(void)
1932 case 7: 2051 case 7:
1933 case 8: 2052 case 8:
1934 case 11: /* Pentium III */ 2053 case 11: /* Pentium III */
2054 event_constraints = intel_p6_event_constraints;
1935 break; 2055 break;
1936 case 9: 2056 case 9:
1937 case 13: 2057 case 13:
1938 /* Pentium M */ 2058 /* Pentium M */
2059 event_constraints = intel_p6_event_constraints;
1939 break; 2060 break;
1940 default: 2061 default:
1941 pr_cont("unsupported p6 CPU model %d ", 2062 pr_cont("unsupported p6 CPU model %d ",
@@ -1945,16 +2066,10 @@ static int p6_pmu_init(void)
1945 2066
1946 x86_pmu = p6_pmu; 2067 x86_pmu = p6_pmu;
1947 2068
1948 if (!cpu_has_apic) {
1949 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1950 pr_info("no hardware sampling interrupt available.\n");
1951 x86_pmu.apic = 0;
1952 }
1953
1954 return 0; 2069 return 0;
1955} 2070}
1956 2071
1957static int intel_pmu_init(void) 2072static __init int intel_pmu_init(void)
1958{ 2073{
1959 union cpuid10_edx edx; 2074 union cpuid10_edx edx;
1960 union cpuid10_eax eax; 2075 union cpuid10_eax eax;
@@ -2007,12 +2122,14 @@ static int intel_pmu_init(void)
2007 sizeof(hw_cache_event_ids)); 2122 sizeof(hw_cache_event_ids));
2008 2123
2009 pr_cont("Core2 events, "); 2124 pr_cont("Core2 events, ");
2125 event_constraints = intel_core_event_constraints;
2010 break; 2126 break;
2011 default: 2127 default:
2012 case 26: 2128 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2129 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids)); 2130 sizeof(hw_cache_event_ids));
2015 2131
2132 event_constraints = intel_nehalem_event_constraints;
2016 pr_cont("Nehalem/Corei7 events, "); 2133 pr_cont("Nehalem/Corei7 events, ");
2017 break; 2134 break;
2018 case 28: 2135 case 28:
@@ -2025,7 +2142,7 @@ static int intel_pmu_init(void)
2025 return 0; 2142 return 0;
2026} 2143}
2027 2144
2028static int amd_pmu_init(void) 2145static __init int amd_pmu_init(void)
2029{ 2146{
2030 /* Performance-monitoring supported from K7 and later: */ 2147 /* Performance-monitoring supported from K7 and later: */
2031 if (boot_cpu_data.x86 < 6) 2148 if (boot_cpu_data.x86 < 6)
@@ -2040,6 +2157,16 @@ static int amd_pmu_init(void)
2040 return 0; 2157 return 0;
2041} 2158}
2042 2159
2160static void __init pmu_check_apic(void)
2161{
2162 if (cpu_has_apic)
2163 return;
2164
2165 x86_pmu.apic = 0;
2166 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
2167 pr_info("no hardware sampling interrupt available.\n");
2168}
2169
2043void __init init_hw_perf_events(void) 2170void __init init_hw_perf_events(void)
2044{ 2171{
2045 int err; 2172 int err;
@@ -2061,6 +2188,8 @@ void __init init_hw_perf_events(void)
2061 return; 2188 return;
2062 } 2189 }
2063 2190
2191 pmu_check_apic();
2192
2064 pr_cont("%s PMU driver.\n", x86_pmu.name); 2193 pr_cont("%s PMU driver.\n", x86_pmu.name);
2065 2194
2066 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 2195 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2105,11 +2234,47 @@ static const struct pmu pmu = {
2105 .unthrottle = x86_pmu_unthrottle, 2234 .unthrottle = x86_pmu_unthrottle,
2106}; 2235};
2107 2236
2237static int
2238validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2239{
2240 struct hw_perf_event fake_event = event->hw;
2241
2242 if (event->pmu && event->pmu != &pmu)
2243 return 0;
2244
2245 return x86_schedule_event(cpuc, &fake_event) >= 0;
2246}
2247
2248static int validate_group(struct perf_event *event)
2249{
2250 struct perf_event *sibling, *leader = event->group_leader;
2251 struct cpu_hw_events fake_pmu;
2252
2253 memset(&fake_pmu, 0, sizeof(fake_pmu));
2254
2255 if (!validate_event(&fake_pmu, leader))
2256 return -ENOSPC;
2257
2258 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2259 if (!validate_event(&fake_pmu, sibling))
2260 return -ENOSPC;
2261 }
2262
2263 if (!validate_event(&fake_pmu, event))
2264 return -ENOSPC;
2265
2266 return 0;
2267}
2268
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 2269const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 2270{
2110 int err; 2271 int err;
2111 2272
2112 err = __hw_perf_event_init(event); 2273 err = __hw_perf_event_init(event);
2274 if (!err) {
2275 if (event->group_leader != event)
2276 err = validate_group(event);
2277 }
2113 if (err) { 2278 if (err) {
2114 if (event->destroy) 2279 if (event->destroy)
2115 event->destroy(event); 2280 event->destroy(event);
@@ -2132,7 +2297,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2132 2297
2133static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 2298static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2134static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 2299static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2135static DEFINE_PER_CPU(int, in_nmi_frame); 2300static DEFINE_PER_CPU(int, in_ignored_frame);
2136 2301
2137 2302
2138static void 2303static void
@@ -2148,8 +2313,9 @@ static void backtrace_warning(void *data, char *msg)
2148 2313
2149static int backtrace_stack(void *data, char *name) 2314static int backtrace_stack(void *data, char *name)
2150{ 2315{
2151 per_cpu(in_nmi_frame, smp_processor_id()) = 2316 per_cpu(in_ignored_frame, smp_processor_id()) =
2152 x86_is_stack_id(NMI_STACK, name); 2317 x86_is_stack_id(NMI_STACK, name) ||
2318 x86_is_stack_id(DEBUG_STACK, name);
2153 2319
2154 return 0; 2320 return 0;
2155} 2321}
@@ -2158,7 +2324,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2158{ 2324{
2159 struct perf_callchain_entry *entry = data; 2325 struct perf_callchain_entry *entry = data;
2160 2326
2161 if (per_cpu(in_nmi_frame, smp_processor_id())) 2327 if (per_cpu(in_ignored_frame, smp_processor_id()))
2162 return; 2328 return;
2163 2329
2164 if (reliable) 2330 if (reliable)
@@ -2170,6 +2336,7 @@ static const struct stacktrace_ops backtrace_ops = {
2170 .warning_symbol = backtrace_warning_symbol, 2336 .warning_symbol = backtrace_warning_symbol,
2171 .stack = backtrace_stack, 2337 .stack = backtrace_stack,
2172 .address = backtrace_address, 2338 .address = backtrace_address,
2339 .walk_stack = print_context_stack_bp,
2173}; 2340};
2174 2341
2175#include "../dumpstack.h" 2342#include "../dumpstack.h"
@@ -2180,7 +2347,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2180 callchain_store(entry, PERF_CONTEXT_KERNEL); 2347 callchain_store(entry, PERF_CONTEXT_KERNEL);
2181 callchain_store(entry, regs->ip); 2348 callchain_store(entry, regs->ip);
2182 2349
2183 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 2350 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2184} 2351}
2185 2352
2186/* 2353/*
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..898df9719afb 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 712 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 713 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 715 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 716 return;
717 wd_ops = &k7_wd_ops; 717 wd_ops = &k7_wd_ops;
718 break; 718 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..cb27fd6136c9 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 116{
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 119
123 cpu = iminor(file->f_path.dentry->d_inode); 120 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 121 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 122 return -ENXIO; /* No such CPU */
126 goto out; 123
127 }
128 c = &cpu_data(cpu); 124 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 125 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 126 return -EIO; /* CPUID not supported */
131out: 127
132 unlock_kernel(); 128 return 0;
133 return ret;
134} 129}
135 130
136/* 131/*
@@ -192,7 +187,8 @@ static int __init cpuid_init(void)
192 int i, err = 0; 187 int i, err = 0;
193 i = 0; 188 i = 0;
194 189
195 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { 190 if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
191 "cpu/cpuid", &cpuid_fops)) {
196 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", 192 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
197 CPUID_MAJOR); 193 CPUID_MAJOR);
198 err = -EBUSY; 194 err = -EBUSY;
@@ -221,7 +217,7 @@ out_class:
221 } 217 }
222 class_destroy(cpuid_class); 218 class_destroy(cpuid_class);
223out_chrdev: 219out_chrdev:
224 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 220 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
225out: 221out:
226 return err; 222 return err;
227} 223}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..a4849c10a77e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h> 30#include <asm/x86_init.h>
31
32 31
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 33
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
106#endif 105#endif
107 106
108#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
109 pci_iommu_shutdown(); 108 x86_platform.iommu_shutdown();
110#endif 109#endif
111 110
112 crash_save_cpu(regs, safe_smp_processor_id()); 111 crash_save_cpu(regs, safe_smp_processor_id());
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index f7cdb3b457aa..cd97ce18c29d 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -16,6 +16,22 @@ static void *kdump_buf_page;
16/* Stores the physical address of elf header of crash image. */ 16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18 18
19static inline bool is_crashed_pfn_valid(unsigned long pfn)
20{
21#ifndef CONFIG_X86_PAE
22 /*
23 * non-PAE kdump kernel executed from a PAE one will crop high pte
24 * bits and poke unwanted space counting again from address 0, we
25 * don't want that. pte must fit into unsigned long. In fact the
26 * test checks high 12 bits for being zero (pfn will be shifted left
27 * by PAGE_SHIFT).
28 */
29 return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
30#else
31 return true;
32#endif
33}
34
19/** 35/**
20 * copy_oldmem_page - copy one page from "oldmem" 36 * copy_oldmem_page - copy one page from "oldmem"
21 * @pfn: page frame number to be copied 37 * @pfn: page frame number to be copied
@@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
41 if (!csize) 57 if (!csize)
42 return 0; 58 return 0;
43 59
60 if (!is_crashed_pfn_valid(pfn))
61 return -EFAULT;
62
44 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 63 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
45 64
46 if (!userbuf) { 65 if (!userbuf) {
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..c56bc2873030 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,6 +109,30 @@ print_context_stack(struct thread_info *tinfo,
109 } 109 }
110 return bp; 110 return bp;
111} 111}
112EXPORT_SYMBOL_GPL(print_context_stack);
113
114unsigned long
115print_context_stack_bp(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long bp,
117 const struct stacktrace_ops *ops, void *data,
118 unsigned long *end, int *graph)
119{
120 struct stack_frame *frame = (struct stack_frame *)bp;
121 unsigned long *ret_addr = &frame->return_address;
122
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr;
125
126 if (__kernel_text_address(addr)) {
127 ops->address(data, addr, 1);
128 frame = frame->next_frame;
129 ret_addr = &frame->return_address;
130 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
131 }
132 }
133 return (unsigned long)frame;
134}
135EXPORT_SYMBOL_GPL(print_context_stack_bp);
112 136
113 137
114static void 138static void
@@ -141,10 +165,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
141} 165}
142 166
143static const struct stacktrace_ops print_trace_ops = { 167static const struct stacktrace_ops print_trace_ops = {
144 .warning = print_trace_warning, 168 .warning = print_trace_warning,
145 .warning_symbol = print_trace_warning_symbol, 169 .warning_symbol = print_trace_warning_symbol,
146 .stack = print_trace_stack, 170 .stack = print_trace_stack,
147 .address = print_trace_address, 171 .address = print_trace_address,
172 .walk_stack = print_context_stack,
148}; 173};
149 174
150void 175void
@@ -188,7 +213,7 @@ void dump_stack(void)
188} 213}
189EXPORT_SYMBOL(dump_stack); 214EXPORT_SYMBOL(dump_stack);
190 215
191static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 216static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
192static int die_owner = -1; 217static int die_owner = -1;
193static unsigned int die_nest_count; 218static unsigned int die_nest_count;
194 219
@@ -207,11 +232,11 @@ unsigned __kprobes long oops_begin(void)
207 /* racy, but better than risking deadlock. */ 232 /* racy, but better than risking deadlock. */
208 raw_local_irq_save(flags); 233 raw_local_irq_save(flags);
209 cpu = smp_processor_id(); 234 cpu = smp_processor_id();
210 if (!__raw_spin_trylock(&die_lock)) { 235 if (!arch_spin_trylock(&die_lock)) {
211 if (cpu == die_owner) 236 if (cpu == die_owner)
212 /* nested oops. should stop eventually */; 237 /* nested oops. should stop eventually */;
213 else 238 else
214 __raw_spin_lock(&die_lock); 239 arch_spin_lock(&die_lock);
215 } 240 }
216 die_nest_count++; 241 die_nest_count++;
217 die_owner = cpu; 242 die_owner = cpu;
@@ -231,7 +256,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
231 die_nest_count--; 256 die_nest_count--;
232 if (!die_nest_count) 257 if (!die_nest_count)
233 /* Nest count reaches zero, release the lock. */ 258 /* Nest count reaches zero, release the lock. */
234 __raw_spin_unlock(&die_lock); 259 arch_spin_unlock(&die_lock);
235 raw_local_irq_restore(flags); 260 raw_local_irq_restore(flags);
236 oops_exit(); 261 oops_exit();
237 262
@@ -268,11 +293,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 293
269 show_registers(regs); 294 show_registers(regs);
270#ifdef CONFIG_X86_32 295#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 296 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 297 sp = regs->sp;
275 ss = regs->ss & 0xffff; 298 ss = regs->ss & 0xffff;
299 } else {
300 sp = kernel_stack_pointer(regs);
301 savesegment(ss, ss);
276 } 302 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 303 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 304 print_symbol("%s", regs->ip);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 81086c227ab7..4fd1420faffa 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,12 +14,6 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void 17extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 18show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl); 19 unsigned long *stack, unsigned long bp, char *log_lvl);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..ae775ca47b25 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
@@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 35
36 if (!stack) { 36 if (!stack) {
37 unsigned long dummy; 37 unsigned long dummy;
38
38 stack = &dummy; 39 stack = &dummy;
39 if (task && task != current) 40 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 41 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 58
58 context = (struct thread_info *) 59 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 60 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 61 bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 62
63 stack = (unsigned long *)context->previous_esp; 63 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 64 if (!stack)
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace);
72 72
73void 73void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 75 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 76{
77 unsigned long *stack; 77 unsigned long *stack;
78 int i; 78 int i;
@@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip)
156 156
157 return ud2 == 0x0b0f; 157 return ud2 == 0x0b0f;
158} 158}
159
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..0ad9597073f5 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,26 +10,28 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33 35
34int x86_is_stack_id(int id, char *name) 36int x86_is_stack_id(int id, char *name)
35{ 37{
@@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name)
37} 39}
38 40
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 42 unsigned *usedp, char **idp)
41{ 43{
42 unsigned k; 44 unsigned k;
43 45
@@ -101,6 +103,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
101 return NULL; 103 return NULL;
102} 104}
103 105
106static inline int
107in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
108 unsigned long *irq_stack_end)
109{
110 return (stack >= irq_stack && stack < irq_stack_end);
111}
112
113/*
114 * We are returning from the irq stack and go to the previous one.
115 * If the previous stack is also in the irq stack, then bp in the first
116 * frame of the irq stack points to the previous, interrupted one.
117 * Otherwise we have another level of indirection: We first save
118 * the bp of the previous stack, then we switch the stack to the irq one
119 * and save a new bp that links to the previous one.
120 * (See save_args())
121 */
122static inline unsigned long
123fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
124 unsigned long *irq_stack, unsigned long *irq_stack_end)
125{
126#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp;
128
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end))
130 return (unsigned long)frame->next_frame;
131#endif
132 return bp;
133}
134
104/* 135/*
105 * x86-64 can have up to three kernel stacks: 136 * x86-64 can have up to three kernel stacks:
106 * process stack 137 * process stack
@@ -157,8 +188,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
157 if (ops->stack(data, id) < 0) 188 if (ops->stack(data, id) < 0)
158 break; 189 break;
159 190
160 bp = print_context_stack(tinfo, stack, bp, ops, 191 bp = ops->walk_stack(tinfo, stack, bp, ops,
161 data, estack_end, &graph); 192 data, estack_end, &graph);
162 ops->stack(data, "<EOE>"); 193 ops->stack(data, "<EOE>");
163 /* 194 /*
164 * We link to the next stack via the 195 * We link to the next stack via the
@@ -173,7 +204,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 irq_stack = irq_stack_end - 204 irq_stack = irq_stack_end -
174 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); 205 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
175 206
176 if (stack >= irq_stack && stack < irq_stack_end) { 207 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
177 if (ops->stack(data, "IRQ") < 0) 208 if (ops->stack(data, "IRQ") < 0)
178 break; 209 break;
179 bp = print_context_stack(tinfo, stack, bp, 210 bp = print_context_stack(tinfo, stack, bp,
@@ -184,6 +215,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
184 * pointer (index -1 to end) in the IRQ stack: 215 * pointer (index -1 to end) in the IRQ stack:
185 */ 216 */
186 stack = (unsigned long *) (irq_stack_end[-1]); 217 stack = (unsigned long *) (irq_stack_end[-1]);
218 bp = fixup_bp_irq_link(bp, stack, irq_stack,
219 irq_stack_end);
187 irq_stack_end = NULL; 220 irq_stack_end = NULL;
188 ops->stack(data, "EOI"); 221 ops->stack(data, "EOI");
189 continue; 222 continue;
@@ -202,21 +235,24 @@ EXPORT_SYMBOL(dump_trace);
202 235
203void 236void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 237show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 238 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 239{
240 unsigned long *irq_stack_end;
241 unsigned long *irq_stack;
207 unsigned long *stack; 242 unsigned long *stack;
243 int cpu;
208 int i; 244 int i;
209 const int cpu = smp_processor_id(); 245
210 unsigned long *irq_stack_end = 246 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 247 cpu = smp_processor_id();
212 unsigned long *irq_stack = 248
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 249 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
250 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 251
215 /* 252 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 253 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 254 * back trace for this cpu:
218 */ 255 */
219
220 if (sp == NULL) { 256 if (sp == NULL) {
221 if (task) 257 if (task)
222 sp = (unsigned long *)task->thread.sp; 258 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +276,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 276 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 277 touch_nmi_watchdog();
242 } 278 }
279 preempt_enable();
280
243 printk("\n"); 281 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 282 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 283}
@@ -303,4 +341,3 @@ int is_valid_bugaddr(unsigned long ip)
303 341
304 return ud2 == 0x0b0f; 342 return ud2 == 0x0b0f;
305} 343}
306
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 85419bb7d4ab..05ed7ab2ca48 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -724,7 +724,7 @@ core_initcall(e820_mark_nvs_memory);
724/* 724/*
725 * Early reserved memory areas. 725 * Early reserved memory areas.
726 */ 726 */
727#define MAX_EARLY_RES 20 727#define MAX_EARLY_RES 32
728 728
729struct early_res { 729struct early_res {
730 u64 start, end; 730 u64 start, end;
@@ -732,7 +732,16 @@ struct early_res {
732 char overlap_ok; 732 char overlap_ok;
733}; 733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = { 734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 735 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
736#ifdef CONFIG_X86_32
737 /*
738 * But first pinch a few for the stack/trampoline stuff
739 * FIXME: Don't need the extra page at 4K, but need to fix
740 * trampoline before removing it. (see the GDT stuff)
741 */
742 { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 },
743#endif
744
736 {} 745 {}
737}; 746};
738 747
@@ -1378,8 +1387,8 @@ static unsigned long ram_alignment(resource_size_t pos)
1378 if (mb < 16) 1387 if (mb < 16)
1379 return 1024*1024; 1388 return 1024*1024;
1380 1389
1381 /* To 32MB for anything above that */ 1390 /* To 64MB for anything above that */
1382 return 32*1024*1024; 1391 return 64*1024*1024;
1383} 1392}
1384 1393
1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1) 1394#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 41fd965c80c6..b9c830c12b4a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -206,8 +206,11 @@ static int __init setup_early_printk(char *buf)
206 206
207 while (*buf != '\0') { 207 while (*buf != '\0') {
208 if (!strncmp(buf, "serial", 6)) { 208 if (!strncmp(buf, "serial", 6)) {
209 early_serial_init(buf + 6); 209 buf += 6;
210 early_serial_init(buf);
210 early_console_register(&early_serial_console, keep); 211 early_console_register(&early_serial_console, keep);
212 if (!strncmp(buf, ",ttyS", 5))
213 buf += 5;
211 } 214 }
212 if (!strncmp(buf, "ttyS", 4)) { 215 if (!strncmp(buf, "ttyS", 4)) {
213 early_serial_init(buf + 4); 216 early_serial_init(buf + 4);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index ad5bd988fb79..cdcfb122f256 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -454,8 +454,10 @@ void __init efi_init(void)
454 if (add_efi_memmap) 454 if (add_efi_memmap)
455 do_add_efi_memmap(); 455 do_add_efi_memmap();
456 456
457#ifdef CONFIG_X86_32
457 x86_platform.get_wallclock = efi_get_time; 458 x86_platform.get_wallclock = efi_get_time;
458 x86_platform.set_wallclock = efi_set_rtc_mmss; 459 x86_platform.set_wallclock = efi_set_rtc_mmss;
460#endif
459 461
460 /* Setup for EFI runtime service */ 462 /* Setup for EFI runtime service */
461 reboot_type = BOOT_EFI; 463 reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..44a8e0dc6737 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,26 +717,69 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
711 */ 727 */
712#define PTREGSCALL(name) \ 728#define PTREGSCALL0(name) \
713 ALIGN; \ 729 ALIGN; \
714ptregs_##name: \ 730ptregs_##name: \
715 leal 4(%esp),%eax; \ 731 leal 4(%esp),%eax; \
716 jmp sys_##name; 732 jmp sys_##name;
717 733
718PTREGSCALL(iopl) 734#define PTREGSCALL1(name) \
719PTREGSCALL(fork) 735 ALIGN; \
720PTREGSCALL(clone) 736ptregs_##name: \
721PTREGSCALL(vfork) 737 leal 4(%esp),%edx; \
722PTREGSCALL(execve) 738 movl (PT_EBX+4)(%esp),%eax; \
723PTREGSCALL(sigaltstack) 739 jmp sys_##name;
724PTREGSCALL(sigreturn) 740
725PTREGSCALL(rt_sigreturn) 741#define PTREGSCALL2(name) \
726PTREGSCALL(vm86) 742 ALIGN; \
727PTREGSCALL(vm86old) 743ptregs_##name: \
744 leal 4(%esp),%ecx; \
745 movl (PT_ECX+4)(%esp),%edx; \
746 movl (PT_EBX+4)(%esp),%eax; \
747 jmp sys_##name;
748
749#define PTREGSCALL3(name) \
750 ALIGN; \
751ptregs_##name: \
752 leal 4(%esp),%eax; \
753 pushl %eax; \
754 movl PT_EDX(%eax),%ecx; \
755 movl PT_ECX(%eax),%edx; \
756 movl PT_EBX(%eax),%eax; \
757 call sys_##name; \
758 addl $4,%esp; \
759 ret
760
761PTREGSCALL1(iopl)
762PTREGSCALL0(fork)
763PTREGSCALL0(vfork)
764PTREGSCALL3(execve)
765PTREGSCALL2(sigaltstack)
766PTREGSCALL0(sigreturn)
767PTREGSCALL0(rt_sigreturn)
768PTREGSCALL2(vm86)
769PTREGSCALL1(vm86old)
770
771/* Clone is an oddball. The 4th arg is in %edi */
772 ALIGN;
773ptregs_clone:
774 leal 4(%esp),%eax
775 pushl %eax
776 pushl PT_EDI(%eax)
777 movl PT_EDX(%eax),%ecx
778 movl PT_ECX(%eax),%edx
779 movl PT_EBX(%eax),%eax
780 call sys_clone
781 addl $8,%esp
782 ret
728 783
729.macro FIXUP_ESPFIX_STACK 784.macro FIXUP_ESPFIX_STACK
730/* 785/*
@@ -814,6 +869,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 869ENDPROC(common_interrupt)
815 CFI_ENDPROC 870 CFI_ENDPROC
816 871
872/*
873 * Irq entries should be protected against kprobes
874 */
875 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 876#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 877ENTRY(name) \
819 RING0_INT_FRAME; \ 878 RING0_INT_FRAME; \
@@ -980,16 +1039,16 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1039 jmp error_code
981 CFI_ENDPROC 1040 CFI_ENDPROC
982END(spurious_interrupt_bug) 1041END(spurious_interrupt_bug)
1042/*
1043 * End of kprobes section
1044 */
1045 .popsection
983 1046
984ENTRY(kernel_thread_helper) 1047ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1048 pushl $0 # fake return address for unwinder
986 CFI_STARTPROC 1049 CFI_STARTPROC
987 movl %edx,%eax 1050 movl %edi,%eax
988 push %edx 1051 call *%esi
989 CFI_ADJUST_CFA_OFFSET 4
990 call *%ebx
991 push %eax
992 CFI_ADJUST_CFA_OFFSET 4
993 call do_exit 1052 call do_exit
994 ud2 # padding for call trace 1053 ud2 # padding for call trace
995 CFI_ENDPROC 1054 CFI_ENDPROC
@@ -1185,17 +1244,14 @@ END(ftrace_graph_caller)
1185 1244
1186.globl return_to_handler 1245.globl return_to_handler
1187return_to_handler: 1246return_to_handler:
1188 pushl $0
1189 pushl %eax 1247 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1248 pushl %edx
1192 movl %ebp, %eax 1249 movl %ebp, %eax
1193 call ftrace_return_to_handler 1250 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1251 movl %eax, %ecx
1195 popl %edx 1252 popl %edx
1196 popl %ecx
1197 popl %eax 1253 popl %eax
1198 ret 1254 jmp *%ecx
1199#endif 1255#endif
1200 1256
1201.section .rodata,"a" 1257.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f8f358..0697ff139837 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
969#endif 977#endif
970apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
971 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
972apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
973 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
974 982
975#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
976apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1068,10 +1076,10 @@ ENTRY(\sym)
1068 TRACE_IRQS_OFF 1076 TRACE_IRQS_OFF
1069 movq %rsp,%rdi /* pt_regs pointer */ 1077 movq %rsp,%rdi /* pt_regs pointer */
1070 xorl %esi,%esi /* no error code */ 1078 xorl %esi,%esi /* no error code */
1071 PER_CPU(init_tss, %rbp) 1079 PER_CPU(init_tss, %r12)
1072 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1073 call \do_sym 1081 call \do_sym
1074 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1075 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1076 CFI_ENDPROC 1084 CFI_ENDPROC
1077END(\sym) 1085END(\sym)
@@ -1158,63 +1166,20 @@ bad_gs:
1158 jmp 2b 1166 jmp 2b
1159 .previous 1167 .previous
1160 1168
1161/* 1169ENTRY(kernel_thread_helper)
1162 * Create a kernel thread.
1163 *
1164 * C extern interface:
1165 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1166 *
1167 * asm input arguments:
1168 * rdi: fn, rsi: arg, rdx: flags
1169 */
1170ENTRY(kernel_thread)
1171 CFI_STARTPROC
1172 FAKE_STACK_FRAME $child_rip
1173 SAVE_ALL
1174
1175 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1176 movq %rdx,%rdi
1177 orq kernel_thread_flags(%rip),%rdi
1178 movq $-1, %rsi
1179 movq %rsp, %rdx
1180
1181 xorl %r8d,%r8d
1182 xorl %r9d,%r9d
1183
1184 # clone now
1185 call do_fork
1186 movq %rax,RAX(%rsp)
1187 xorl %edi,%edi
1188
1189 /*
1190 * It isn't worth to check for reschedule here,
1191 * so internally to the x86_64 port you can rely on kernel_thread()
1192 * not to reschedule the child before returning, this avoids the need
1193 * of hacks for example to fork off the per-CPU idle tasks.
1194 * [Hopefully no generic code relies on the reschedule -AK]
1195 */
1196 RESTORE_ALL
1197 UNFAKE_STACK_FRAME
1198 ret
1199 CFI_ENDPROC
1200END(kernel_thread)
1201
1202ENTRY(child_rip)
1203 pushq $0 # fake return address 1170 pushq $0 # fake return address
1204 CFI_STARTPROC 1171 CFI_STARTPROC
1205 /* 1172 /*
1206 * Here we are in the child and the registers are set as they were 1173 * Here we are in the child and the registers are set as they were
1207 * at kernel_thread() invocation in the parent. 1174 * at kernel_thread() invocation in the parent.
1208 */ 1175 */
1209 movq %rdi, %rax 1176 call *%rsi
1210 movq %rsi, %rdi
1211 call *%rax
1212 # exit 1177 # exit
1213 mov %eax, %edi 1178 mov %eax, %edi
1214 call do_exit 1179 call do_exit
1215 ud2 # padding for call trace 1180 ud2 # padding for call trace
1216 CFI_ENDPROC 1181 CFI_ENDPROC
1217END(child_rip) 1182END(kernel_thread_helper)
1218 1183
1219/* 1184/*
1220 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1491,12 +1456,17 @@ error_kernelspace:
1491 leaq irq_return(%rip),%rcx 1456 leaq irq_return(%rip),%rcx
1492 cmpq %rcx,RIP+8(%rsp) 1457 cmpq %rcx,RIP+8(%rsp)
1493 je error_swapgs 1458 je error_swapgs
1494 movl %ecx,%ecx /* zero extend */ 1459 movl %ecx,%eax /* zero extend */
1495 cmpq %rcx,RIP+8(%rsp) 1460 cmpq %rax,RIP+8(%rsp)
1496 je error_swapgs 1461 je bstep_iret
1497 cmpq $gs_change,RIP+8(%rsp) 1462 cmpq $gs_change,RIP+8(%rsp)
1498 je error_swapgs 1463 je error_swapgs
1499 jmp error_sti 1464 jmp error_sti
1465
1466bstep_iret:
1467 /* Fix truncated RIP */
1468 movq %rcx,RIP+8(%rsp)
1469 jmp error_swapgs
1500END(error_entry) 1470END(error_entry)
1501 1471
1502 1472
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -187,9 +189,26 @@ static void wait_for_nmi(void)
187 nmi_wait_count++; 189 nmi_wait_count++;
188} 190}
189 191
192static inline int
193within(unsigned long addr, unsigned long start, unsigned long end)
194{
195 return addr >= start && addr < end;
196}
197
190static int 198static int
191do_ftrace_mod_code(unsigned long ip, void *new_code) 199do_ftrace_mod_code(unsigned long ip, void *new_code)
192{ 200{
201 /*
202 * On x86_64, kernel text mappings are mapped read-only with
203 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
204 * of the kernel text mapping to modify the kernel text.
205 *
206 * For 32bit kernels, these mappings are same and we can use
207 * kernel identity mapping to modify code.
208 */
209 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
210 ip = (unsigned long)__va(__pa(ip));
211
193 mod_code_ip = (void *)ip; 212 mod_code_ip = (void *)ip;
194 mod_code_newcode = new_code; 213 mod_code_newcode = new_code;
195 214
@@ -336,15 +355,15 @@ int __init ftrace_dyn_arch_init(void *data)
336 355
337 switch (faulted) { 356 switch (faulted) {
338 case 0: 357 case 0:
339 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 358 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
340 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 359 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
341 break; 360 break;
342 case 1: 361 case 1:
343 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 362 pr_info("converting mcount calls to 66 66 66 66 90\n");
344 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 363 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
345 break; 364 break;
346 case 2: 365 case 2:
347 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 366 pr_info("converting mcount calls to jmp . + 5\n");
348 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 367 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
349 break; 368 break;
350 } 369 }
@@ -468,82 +487,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
468 487
469#ifdef CONFIG_FTRACE_SYSCALLS 488#ifdef CONFIG_FTRACE_SYSCALLS
470 489
471extern unsigned long __start_syscalls_metadata[];
472extern unsigned long __stop_syscalls_metadata[];
473extern unsigned long *sys_call_table; 490extern unsigned long *sys_call_table;
474 491
475static struct syscall_metadata **syscalls_metadata; 492unsigned long __init arch_syscall_addr(int nr)
476
477static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
478{
479 struct syscall_metadata *start;
480 struct syscall_metadata *stop;
481 char str[KSYM_SYMBOL_LEN];
482
483
484 start = (struct syscall_metadata *)__start_syscalls_metadata;
485 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
486 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
487
488 for ( ; start < stop; start++) {
489 if (start->name && !strcmp(start->name, str))
490 return start;
491 }
492 return NULL;
493}
494
495struct syscall_metadata *syscall_nr_to_meta(int nr)
496{
497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
498 return NULL;
499
500 return syscalls_metadata[nr];
501}
502
503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{ 493{
526 syscalls_metadata[num]->exit_id = id; 494 return (unsigned long)(&sys_call_table)[nr];
527}
528
529static int __init arch_init_ftrace_syscalls(void)
530{
531 int i;
532 struct syscall_metadata *meta;
533 unsigned long **psys_syscall_table = &sys_call_table;
534
535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
536 NR_syscalls, GFP_KERNEL);
537 if (!syscalls_metadata) {
538 WARN_ON(1);
539 return -ENOMEM;
540 }
541
542 for (i = 0; i < NR_syscalls; i++) {
543 meta = find_syscall_meta(psys_syscall_table[i]);
544 syscalls_metadata[i] = meta;
545 }
546 return 0;
547} 495}
548arch_initcall(arch_init_ftrace_syscalls);
549#endif 496#endif
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public License
8 * as published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/ioport.h>
14#include <linux/io.h>
15#include <asm/msr.h>
16#include <asm/geode.h>
17
18static struct {
19 char *name;
20 u32 msr;
21 int size;
22 u32 base;
23} lbars[] = {
24 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
25 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
26 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
27 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
28};
29
30static void __init init_lbars(void)
31{
32 u32 lo, hi;
33 int i;
34
35 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
36 rdmsr(lbars[i].msr, lo, hi);
37 if (hi & 0x01)
38 lbars[i].base = lo & 0x0000ffff;
39
40 if (lbars[i].base == 0)
41 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
42 lbars[i].name);
43 }
44}
45
46int geode_get_dev_base(unsigned int dev)
47{
48 BUG_ON(dev >= ARRAY_SIZE(lbars));
49 return lbars[dev].base;
50}
51EXPORT_SYMBOL_GPL(geode_get_dev_base);
52
53/* === GPIO API === */
54
55void geode_gpio_set(u32 gpio, unsigned int reg)
56{
57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
58
59 if (!base)
60 return;
61
62 /* low bank register */
63 if (gpio & 0xFFFF)
64 outl(gpio & 0xFFFF, base + reg);
65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
69}
70EXPORT_SYMBOL_GPL(geode_gpio_set);
71
72void geode_gpio_clear(u32 gpio, unsigned int reg)
73{
74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
75
76 if (!base)
77 return;
78
79 /* low bank register */
80 if (gpio & 0xFFFF)
81 outl((gpio & 0xFFFF) << 16, base + reg);
82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
86}
87EXPORT_SYMBOL_GPL(geode_gpio_clear);
88
89int geode_gpio_isset(u32 gpio, unsigned int reg)
90{
91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
93
94 if (!base)
95 return 0;
96
97 /* low bank register */
98 if (gpio & 0xFFFF) {
99 val = inl(base + reg) & (gpio & 0xFFFF);
100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
111}
112EXPORT_SYMBOL_GPL(geode_gpio_isset);
113
114void geode_gpio_set_irq(unsigned int group, unsigned int irq)
115{
116 u32 lo, hi;
117
118 if (group > 7 || irq > 15)
119 return;
120
121 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
122
123 lo &= ~(0xF << (group * 4));
124 lo |= (irq & 0xF) << (group * 4);
125
126 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
127}
128EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
129
130void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
131{
132 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
133 u32 offset, shift, val;
134
135 if (gpio >= 24)
136 offset = GPIO_MAP_W;
137 else if (gpio >= 16)
138 offset = GPIO_MAP_Z;
139 else if (gpio >= 8)
140 offset = GPIO_MAP_Y;
141 else
142 offset = GPIO_MAP_X;
143
144 shift = (gpio % 8) * 4;
145
146 val = inl(base + offset);
147
148 /* Clear whatever was there before */
149 val &= ~(0xF << shift);
150
151 /* And set the new value */
152
153 val |= ((pair & 7) << shift);
154
155 /* Set the PME bit if this is a PME event */
156
157 if (pme)
158 val |= (1 << (shift + 3));
159
160 outl(val, base + offset);
161}
162EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
163
164int geode_has_vsa2(void)
165{
166 static int has_vsa2 = -1;
167
168 if (has_vsa2 == -1) {
169 u16 val;
170
171 /*
172 * The VSA has virtual registers that we can query for a
173 * signature.
174 */
175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
177
178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
180 }
181
182 return has_vsa2;
183}
184EXPORT_SYMBOL_GPL(geode_has_vsa2);
185
186static int __init geode_southbridge_init(void)
187{
188 if (!is_geode())
189 return -ENODEV;
190
191 init_lbars();
192 (void) mfgpt_timer_setup();
193 return 0;
194}
195
196postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 4f8e2507e8f3..5051b94c9069 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,8 +29,6 @@ static void __init i386_default_early_setup(void)
29 29
30void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
31{ 31{
32 reserve_trampoline_memory();
33
34 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
35 33
36#ifdef CONFIG_BLK_DEV_INITRD 34#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd778fd9..b5a9896ca1e7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_trampoline_memory();
102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 102
105#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fd39eaf83b84..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h>
22#include <asm/cpufeature.h>
21#include <asm/percpu.h> 23#include <asm/percpu.h>
22 24
23/* Physical address */ 25/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
297 orl %edx,%eax 299 orl %edx,%eax
298 movl %eax,%cr4 300 movl %eax,%cr4
299 301
300 btl $5, %eax # check if PAE is enabled 302 testb $X86_CR4_PAE, %al # check if PAE is enabled
301 jnc 6f 303 jz 6f
302 304
303 /* Check if extended functions are implemented */ 305 /* Check if extended functions are implemented */
304 movl $0x80000000, %eax 306 movl $0x80000000, %eax
305 cpuid 307 cpuid
306 cmpl $0x80000000, %eax 308 /* Value must be in the range 0x80000001 to 0x8000ffff */
307 jbe 6f 309 subl $0x80000001, %eax
310 cmpl $(0x8000ffff-0x80000001), %eax
311 ja 6f
308 mov $0x80000001, %eax 312 mov $0x80000001, %eax
309 cpuid 313 cpuid
310 /* Execute Disable bit supported? */ 314 /* Execute Disable bit supported? */
311 btl $20, %edx 315 btl $(X86_FEATURE_NX & 31), %edx
312 jnc 6f 316 jnc 6f
313 317
314 /* Setup EFER (Extended Feature Enable Register) */ 318 /* Setup EFER (Extended Feature Enable Register) */
315 movl $0xc0000080, %ecx 319 movl $MSR_EFER, %ecx
316 rdmsr 320 rdmsr
317 321
318 btsl $11, %eax 322 btsl $_EFER_NX, %eax
319 /* Make changes effective */ 323 /* Make changes effective */
320 wrmsr 324 wrmsr
321 325
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd928fcd5..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
262 .quad x86_64_start_kernel 262 .quad x86_64_start_kernel
263 ENTRY(initial_gs) 263 ENTRY(initial_gs)
264 .quad INIT_PER_CPU_VAR(irq_stack_union) 264 .quad INIT_PER_CPU_VAR(irq_stack_union)
265 __FINITDATA
266 265
267 ENTRY(stack_start) 266 ENTRY(stack_start)
268 .quad init_thread_union+THREAD_SIZE-8 267 .quad init_thread_union+THREAD_SIZE-8
269 .word 0 268 .word 0
269 __FINITDATA
270 270
271bad_address: 271bad_address:
272 jmp bad_address 272 jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
340 i = i + 1 ; \ 340 i = i + 1 ; \
341 .endr 341 .endr
342 342
343 .data
343 /* 344 /*
344 * This default setting generates an ident mapping at address 0x100000 345 * This default setting generates an ident mapping at address 0x100000
345 * and a mapping for the kernel that precisely maps virtual address 346 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..ba6e65884603 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,7 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */
36#ifdef CONFIG_PCI_MSI 37#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 38static unsigned long hpet_num_timers;
38#endif 39#endif
@@ -47,12 +48,12 @@ struct hpet_dev {
47 char name[10]; 48 char name[10];
48}; 49};
49 50
50unsigned long hpet_readl(unsigned long a) 51inline unsigned int hpet_readl(unsigned int a)
51{ 52{
52 return readl(hpet_virt_address + a); 53 return readl(hpet_virt_address + a);
53} 54}
54 55
55static inline void hpet_writel(unsigned long d, unsigned long a) 56static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 57{
57 writel(d, hpet_virt_address + a); 58 writel(d, hpet_virt_address + a);
58} 59}
@@ -167,7 +168,7 @@ do { \
167 168
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 169static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 170
170static void hpet_reserve_platform_timers(unsigned long id) 171static void hpet_reserve_platform_timers(unsigned int id)
171{ 172{
172 struct hpet __iomem *hpet = hpet_virt_address; 173 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 174 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +206,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 206
206} 207}
207#else 208#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 209static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 210#endif
210 211
211/* 212/*
@@ -246,7 +247,7 @@ static void hpet_reset_counter(void)
246 247
247static void hpet_start_counter(void) 248static void hpet_start_counter(void)
248{ 249{
249 unsigned long cfg = hpet_readl(HPET_CFG); 250 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 251 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 252 hpet_writel(cfg, HPET_CFG);
252} 253}
@@ -271,7 +272,7 @@ static void hpet_resume_counter(void)
271 272
272static void hpet_enable_legacy_int(void) 273static void hpet_enable_legacy_int(void)
273{ 274{
274 unsigned long cfg = hpet_readl(HPET_CFG); 275 unsigned int cfg = hpet_readl(HPET_CFG);
275 276
276 cfg |= HPET_CFG_LEGACY; 277 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 278 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +315,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 315static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 316 struct clock_event_device *evt, int timer)
316{ 317{
317 unsigned long cfg, cmp, now; 318 unsigned int cfg, cmp, now;
318 uint64_t delta; 319 uint64_t delta;
319 320
320 switch (mode) { 321 switch (mode) {
@@ -323,7 +324,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 324 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 325 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 326 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 327 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 328 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 329 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 330 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +340,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 340 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 341 * Publication # 24674)
341 */ 342 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 343 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 344 hpet_start_counter();
344 hpet_print_config(); 345 hpet_print_config();
345 break; 346 break;
@@ -383,13 +384,24 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 384 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 385
385 /* 386 /*
386 * We need to read back the CMP register to make sure that 387 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 388 * implementations (ATI chipsets) which seem to delay the
388 * counter. 389 * transfer of the compare register into the internal compare
390 * logic. With small deltas this might actually be too late as
391 * the counter could already be higher than the compare value
392 * at that point and we would wait for the next hpet interrupt
393 * forever. We found out that reading the CMP register back
394 * forces the transfer so we can rely on the comparison with
395 * the counter register below. If the read back from the
396 * compare register does not match the value we programmed
397 * then we might have a real hardware problem. We can not do
398 * much about it here, but at least alert the user/admin with
399 * a prominent warning.
389 */ 400 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 401 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
402 KERN_WARNING "hpet: compare register read back failed.\n");
391 403
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 404 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 405}
394 406
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 407static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +427,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 427void hpet_msi_unmask(unsigned int irq)
416{ 428{
417 struct hpet_dev *hdev = get_irq_data(irq); 429 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 430 unsigned int cfg;
419 431
420 /* unmask it */ 432 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 433 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +437,7 @@ void hpet_msi_unmask(unsigned int irq)
425 437
426void hpet_msi_mask(unsigned int irq) 438void hpet_msi_mask(unsigned int irq)
427{ 439{
428 unsigned long cfg; 440 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 441 struct hpet_dev *hdev = get_irq_data(irq);
430 442
431 /* mask it */ 443 /* mask it */
@@ -467,7 +479,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 479
468static int hpet_setup_msi_irq(unsigned int irq) 480static int hpet_setup_msi_irq(unsigned int irq)
469{ 481{
470 if (arch_setup_hpet_msi(irq)) { 482 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 483 destroy_irq(irq);
472 return -EINVAL; 484 return -EINVAL;
473 } 485 }
@@ -584,6 +596,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 596 unsigned int num_timers_used = 0;
585 int i; 597 int i;
586 598
599 if (boot_cpu_has(X86_FEATURE_ARAT))
600 return;
587 id = hpet_readl(HPET_ID); 601 id = hpet_readl(HPET_ID);
588 602
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 603 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +612,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 612
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 613 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 614 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 615 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 616
603 /* Only consider HPET timer with MSI support */ 617 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 618 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +827,7 @@ static int hpet_clocksource_register(void)
813 */ 827 */
814int __init hpet_enable(void) 828int __init hpet_enable(void)
815{ 829{
816 unsigned long id; 830 unsigned int id;
817 int i; 831 int i;
818 832
819 if (!is_hpet_capable()) 833 if (!is_hpet_capable())
@@ -872,10 +886,8 @@ int __init hpet_enable(void)
872 886
873 if (id & HPET_ID_LEGSUP) { 887 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 888 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 889 return 1;
877 } 890 }
878 hpet_msi_capability_lookup(0);
879 return 0; 891 return 0;
880 892
881out_nohpet: 893out_nohpet:
@@ -908,9 +920,17 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 920 if (!hpet_virt_address)
909 return -ENODEV; 921 return -ENODEV;
910 922
923 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
924 hpet_msi_capability_lookup(2);
925 else
926 hpet_msi_capability_lookup(0);
927
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 928 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 929 hpet_print_config();
913 930
931 if (boot_cpu_has(X86_FEATURE_ARAT))
932 return 0;
933
914 for_each_online_cpu(cpu) { 934 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 935 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 936 }
@@ -925,7 +945,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 945void hpet_disable(void)
926{ 946{
927 if (is_hpet_capable()) { 947 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 948 unsigned int cfg = hpet_readl(HPET_CFG);
929 949
930 if (hpet_legacy_int_enabled) { 950 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 951 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +985,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 985static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 986static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 987static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 988static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 989static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 990static unsigned long hpet_pie_limit;
971 991
972static rtc_irq_handler irq_handler; 992static rtc_irq_handler irq_handler;
@@ -1017,7 +1037,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1037 */
1018int hpet_rtc_timer_init(void) 1038int hpet_rtc_timer_init(void)
1019{ 1039{
1020 unsigned long cfg, cnt, delta, flags; 1040 unsigned int cfg, cnt, delta;
1041 unsigned long flags;
1021 1042
1022 if (!is_hpet_enabled()) 1043 if (!is_hpet_enabled())
1023 return 0; 1044 return 0;
@@ -1027,7 +1048,7 @@ int hpet_rtc_timer_init(void)
1027 1048
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1049 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1050 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1051 hpet_default_delta = clc;
1031 } 1052 }
1032 1053
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1054 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1134,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1134 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1135 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1136 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1137 hpet_pie_delta = clc;
1117 } 1138 }
1118 return 1; 1139 return 1;
1119} 1140}
@@ -1127,7 +1148,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1148
1128static void hpet_rtc_timer_reinit(void) 1149static void hpet_rtc_timer_reinit(void)
1129{ 1150{
1130 unsigned long cfg, delta; 1151 unsigned int cfg, delta;
1131 int lost_ints = -1; 1152 int lost_ints = -1;
1132 1153
1133 if (unlikely(!hpet_rtc_flags)) { 1154 if (unlikely(!hpet_rtc_flags)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..05d5fec64a94
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,554 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register for each cpus
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type)
236{
237 /* Len */
238 switch (x86_len) {
239 case X86_BREAKPOINT_LEN_1:
240 *gen_len = HW_BREAKPOINT_LEN_1;
241 break;
242 case X86_BREAKPOINT_LEN_2:
243 *gen_len = HW_BREAKPOINT_LEN_2;
244 break;
245 case X86_BREAKPOINT_LEN_4:
246 *gen_len = HW_BREAKPOINT_LEN_4;
247 break;
248#ifdef CONFIG_X86_64
249 case X86_BREAKPOINT_LEN_8:
250 *gen_len = HW_BREAKPOINT_LEN_8;
251 break;
252#endif
253 default:
254 return -EINVAL;
255 }
256
257 /* Type */
258 switch (x86_type) {
259 case X86_BREAKPOINT_EXECUTE:
260 *gen_type = HW_BREAKPOINT_X;
261 break;
262 case X86_BREAKPOINT_WRITE:
263 *gen_type = HW_BREAKPOINT_W;
264 break;
265 case X86_BREAKPOINT_RW:
266 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
267 break;
268 default:
269 return -EINVAL;
270 }
271
272 return 0;
273}
274
275
276static int arch_build_bp_info(struct perf_event *bp)
277{
278 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
279
280 info->address = bp->attr.bp_addr;
281
282 /* Len */
283 switch (bp->attr.bp_len) {
284 case HW_BREAKPOINT_LEN_1:
285 info->len = X86_BREAKPOINT_LEN_1;
286 break;
287 case HW_BREAKPOINT_LEN_2:
288 info->len = X86_BREAKPOINT_LEN_2;
289 break;
290 case HW_BREAKPOINT_LEN_4:
291 info->len = X86_BREAKPOINT_LEN_4;
292 break;
293#ifdef CONFIG_X86_64
294 case HW_BREAKPOINT_LEN_8:
295 info->len = X86_BREAKPOINT_LEN_8;
296 break;
297#endif
298 default:
299 return -EINVAL;
300 }
301
302 /* Type */
303 switch (bp->attr.bp_type) {
304 case HW_BREAKPOINT_W:
305 info->type = X86_BREAKPOINT_WRITE;
306 break;
307 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
308 info->type = X86_BREAKPOINT_RW;
309 break;
310 case HW_BREAKPOINT_X:
311 info->type = X86_BREAKPOINT_EXECUTE;
312 break;
313 default:
314 return -EINVAL;
315 }
316
317 return 0;
318}
319/*
320 * Validate the arch-specific HW Breakpoint register settings
321 */
322int arch_validate_hwbkpt_settings(struct perf_event *bp,
323 struct task_struct *tsk)
324{
325 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
326 unsigned int align;
327 int ret;
328
329
330 ret = arch_build_bp_info(bp);
331 if (ret)
332 return ret;
333
334 ret = -EINVAL;
335
336 if (info->type == X86_BREAKPOINT_EXECUTE)
337 /*
338 * Ptrace-refactoring code
339 * For now, we'll allow instruction breakpoint only for user-space
340 * addresses
341 */
342 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
343 info->len != X86_BREAKPOINT_EXECUTE)
344 return ret;
345
346 switch (info->len) {
347 case X86_BREAKPOINT_LEN_1:
348 align = 0;
349 break;
350 case X86_BREAKPOINT_LEN_2:
351 align = 1;
352 break;
353 case X86_BREAKPOINT_LEN_4:
354 align = 3;
355 break;
356#ifdef CONFIG_X86_64
357 case X86_BREAKPOINT_LEN_8:
358 align = 7;
359 break;
360#endif
361 default:
362 return ret;
363 }
364
365 ret = arch_store_info(bp);
366
367 if (ret < 0)
368 return ret;
369 /*
370 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len.
372 */
373 if (info->address & align)
374 return -EINVAL;
375
376 /* Check that the virtual address is in the proper range */
377 if (tsk) {
378 if (!arch_check_va_in_userspace(info->address, info->len))
379 return -EFAULT;
380 } else {
381 if (!arch_check_va_in_kernelspace(info->address, info->len))
382 return -EFAULT;
383 }
384
385 return 0;
386}
387
388/*
389 * Dump the debug register contents to the user.
390 * We can't dump our per cpu values because it
391 * may contain cpu wide breakpoint, something that
392 * doesn't belong to the current task.
393 *
394 * TODO: include non-ptrace user breakpoints (perf)
395 */
396void aout_dump_debugregs(struct user *dump)
397{
398 int i;
399 int dr7 = 0;
400 struct perf_event *bp;
401 struct arch_hw_breakpoint *info;
402 struct thread_struct *thread = &current->thread;
403
404 for (i = 0; i < HBP_NUM; i++) {
405 bp = thread->ptrace_bps[i];
406
407 if (bp && !bp->attr.disabled) {
408 dump->u_debugreg[i] = bp->attr.bp_addr;
409 info = counter_arch_bp(bp);
410 dr7 |= encode_dr7(i, info->len, info->type);
411 } else {
412 dump->u_debugreg[i] = 0;
413 }
414 }
415
416 dump->u_debugreg[4] = 0;
417 dump->u_debugreg[5] = 0;
418 dump->u_debugreg[6] = current->thread.debugreg6;
419
420 dump->u_debugreg[7] = dr7;
421}
422EXPORT_SYMBOL_GPL(aout_dump_debugregs);
423
424/*
425 * Release the user breakpoints used by ptrace
426 */
427void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
428{
429 int i;
430 struct thread_struct *t = &tsk->thread;
431
432 for (i = 0; i < HBP_NUM; i++) {
433 unregister_hw_breakpoint(t->ptrace_bps[i]);
434 t->ptrace_bps[i] = NULL;
435 }
436}
437
438void hw_breakpoint_restore(void)
439{
440 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
441 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
442 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
443 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
444 set_debugreg(current->thread.debugreg6, 6);
445 set_debugreg(__get_cpu_var(cpu_dr7), 7);
446}
447EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
448
449/*
450 * Handle debug exception notifications.
451 *
452 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
453 *
454 * NOTIFY_DONE returned if one of the following conditions is true.
455 * i) When the causative address is from user-space and the exception
456 * is a valid one, i.e. not triggered as a result of lazy debug register
457 * switching
458 * ii) When there are more bits than trap<n> set in DR6 register (such
459 * as BD, BS or BT) indicating that more than one debug condition is
460 * met and requires some more action in do_debug().
461 *
462 * NOTIFY_STOP returned for all other cases
463 *
464 */
465static int __kprobes hw_breakpoint_handler(struct die_args *args)
466{
467 int i, cpu, rc = NOTIFY_STOP;
468 struct perf_event *bp;
469 unsigned long dr7, dr6;
470 unsigned long *dr6_p;
471
472 /* The DR6 value is pointed by args->err */
473 dr6_p = (unsigned long *)ERR_PTR(args->err);
474 dr6 = *dr6_p;
475
476 /* Do an early return if no trap bits are set in DR6 */
477 if ((dr6 & DR_TRAP_BITS) == 0)
478 return NOTIFY_DONE;
479
480 get_debugreg(dr7, 7);
481 /* Disable breakpoints during exception handling */
482 set_debugreg(0UL, 7);
483 /*
484 * Assert that local interrupts are disabled
485 * Reset the DRn bits in the virtualized register value.
486 * The ptrace trigger routine will add in whatever is needed.
487 */
488 current->thread.debugreg6 &= ~DR_TRAP_BITS;
489 cpu = get_cpu();
490
491 /* Handle all the breakpoints that were triggered */
492 for (i = 0; i < HBP_NUM; ++i) {
493 if (likely(!(dr6 & (DR_TRAP0 << i))))
494 continue;
495
496 /*
497 * The counter may be concurrently released but that can only
498 * occur from a call_rcu() path. We can then safely fetch
499 * the breakpoint, use its callback, touch its counter
500 * while we are in an rcu_read_lock() path.
501 */
502 rcu_read_lock();
503
504 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling
510 */
511 (*dr6_p) &= ~(DR_TRAP0 << i);
512 /*
513 * bp can be NULL due to lazy debug register switching
514 * or due to concurrent perf counter removing.
515 */
516 if (!bp) {
517 rcu_read_unlock();
518 break;
519 }
520
521 perf_bp_event(bp, args->regs);
522
523 rcu_read_unlock();
524 }
525 if (dr6 & (~DR_TRAP_BITS))
526 rc = NOTIFY_DONE;
527
528 set_debugreg(dr7, 7);
529 put_cpu();
530
531 return rc;
532}
533
534/*
535 * Handle debug exception notifications.
536 */
537int __kprobes hw_breakpoint_exceptions_notify(
538 struct notifier_block *unused, unsigned long val, void *data)
539{
540 if (val != DIE_DEBUG)
541 return NOTIFY_DONE;
542
543 return hw_breakpoint_handler(data);
544}
545
546void hw_breakpoint_pmu_read(struct perf_event *bp)
547{
548 /* TODO */
549}
550
551void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
552{
553 /* TODO */
554}
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 1736c5a725aa..9c3bd4a2050e 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -15,8 +15,10 @@ EXPORT_SYMBOL(mcount);
15 * the export, but dont use it from C code, it is used 15 * the export, but dont use it from C code, it is used
16 * by assembly code and is not using C calling convention! 16 * by assembly code and is not using C calling convention!
17 */ 17 */
18#ifndef CONFIG_X86_CMPXCHG64
18extern void cmpxchg8b_emu(void); 19extern void cmpxchg8b_emu(void);
19EXPORT_SYMBOL(cmpxchg8b_emu); 20EXPORT_SYMBOL(cmpxchg8b_emu);
21#endif
20 22
21/* Networking helper routines. */ 23/* Networking helper routines. */
22EXPORT_SYMBOL(csum_partial_copy_generic); 24EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..8eec0ec59af2 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
103 * on system-call entry - see also fork() and the signal handling 103 * on system-call entry - see also fork() and the signal handling
104 * code. 104 * code.
105 */ 105 */
106static int do_iopl(unsigned int level, struct pt_regs *regs) 106long sys_iopl(unsigned int level, struct pt_regs *regs)
107{ 107{
108 unsigned int old = (regs->flags >> 12) & 3; 108 unsigned int old = (regs->flags >> 12) & 3;
109 struct thread_struct *t = &current->thread;
109 110
110 if (level > 3) 111 if (level > 3)
111 return -EINVAL; 112 return -EINVAL;
@@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
115 return -EPERM; 116 return -EPERM;
116 } 117 }
117 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); 118 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
118
119 return 0;
120}
121
122#ifdef CONFIG_X86_32
123long sys_iopl(struct pt_regs *regs)
124{
125 unsigned int level = regs->bx;
126 struct thread_struct *t = &current->thread;
127 int rc;
128
129 rc = do_iopl(level, regs);
130 if (rc < 0)
131 goto out;
132
133 t->iopl = level << 12; 119 t->iopl = level << 12;
134 set_iopl_mask(t->iopl); 120 set_iopl_mask(t->iopl);
135out: 121
136 return rc; 122 return 0;
137}
138#else
139asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
140{
141 return do_iopl(level, regs);
142} 123}
143#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 74656d1d4e30..91fd0c70a18a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -63,19 +63,19 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT"); 66 seq_printf(p, "%*s: ", prec, "PMI");
67 for_each_online_cpu(j) 67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n"); 69 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j) 71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v)
149 if (!desc) 149 if (!desc)
150 return 0; 150 return 0;
151 151
152 spin_lock_irqsave(&desc->lock, flags); 152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j) 153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j); 154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action; 155 action = desc->action;
@@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v)
170 170
171 seq_putc(p, '\n'); 171 seq_putc(p, '\n');
172out: 172out:
173 spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0; 174 return 0;
175} 175}
176 176
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
251} 251}
252 252
253/* 253/*
254 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
255 */ 255 */
256void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
257{ 257{
258 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
259 259
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
263 263
264 irq_enter(); 264 irq_enter();
265 265
266 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
267 267
268 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
269 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
270 270
271 irq_exit(); 271 irq_exit();
272 272
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
274} 274}
275 275
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
296 /* interrupt's are disabled at this point */
297 raw_spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
307 * Complete the irq move. This cpu is going down and for
308 * non intr-remapping case, we can't wait till this interrupt
309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 raw_spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
338 * We can remove mdelay() and then send spuriuous interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 raw_spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupt's are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..d5932226614f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
200 /* self generated IPI for local APIC timer */ 200 /* self generated IPI for local APIC timer */
201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
202 202
203 /* generic IPI for platform specific use */ 203 /* IPI for X86 platform specific use */
204 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 204 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
205 205
206 /* IPI vectors for APIC spurious and error interrupts */ 206 /* IPI vectors for APIC spurious and error interrupts */
207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..dd74fe7273b1 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45 45
46#include <asm/debugreg.h>
46#include <asm/apicdef.h> 47#include <asm/apicdef.h>
47#include <asm/system.h> 48#include <asm/system.h>
48 49
@@ -85,10 +86,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
85 gdb_regs[GDB_DS] = regs->ds; 86 gdb_regs[GDB_DS] = regs->ds;
86 gdb_regs[GDB_ES] = regs->es; 87 gdb_regs[GDB_ES] = regs->es;
87 gdb_regs[GDB_CS] = regs->cs; 88 gdb_regs[GDB_CS] = regs->cs;
88 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 89 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 90 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp; 91 if (user_mode_vm(regs)) {
92 gdb_regs[GDB_SS] = regs->ss;
93 gdb_regs[GDB_SP] = regs->sp;
94 } else {
95 gdb_regs[GDB_SS] = __KERNEL_DS;
96 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
97 }
92#else 98#else
93 gdb_regs[GDB_R8] = regs->r8; 99 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 100 gdb_regs[GDB_R9] = regs->r9;
@@ -101,7 +107,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 107 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 108 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 109 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp; 110 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
105#endif 111#endif
106} 112}
107 113
@@ -220,8 +226,7 @@ static void kgdb_correct_hw_break(void)
220 dr7 |= ((breakinfo[breakno].len << 2) | 226 dr7 |= ((breakinfo[breakno].len << 2) |
221 breakinfo[breakno].type) << 227 breakinfo[breakno].type) <<
222 ((breakno << 2) + 16); 228 ((breakno << 2) + 16);
223 if (breakno >= 0 && breakno <= 3) 229 set_debugreg(breakinfo[breakno].addr, breakno);
224 set_debugreg(breakinfo[breakno].addr, breakno);
225 230
226 } else { 231 } else {
227 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 232 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
@@ -395,7 +400,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
395 /* set the trace bit if we're stepping */ 400 /* set the trace bit if we're stepping */
396 if (remcomInBuffer[0] == 's') { 401 if (remcomInBuffer[0] == 's') {
397 linux_regs->flags |= X86_EFLAGS_TF; 402 linux_regs->flags |= X86_EFLAGS_TF;
398 kgdb_single_step = 1;
399 atomic_set(&kgdb_cpu_doing_single_step, 403 atomic_set(&kgdb_cpu_doing_single_step,
400 raw_smp_processor_id()); 404 raw_smp_processor_id());
401 } 405 }
@@ -434,6 +438,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 438 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 439 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 440 args->err, "c", "", regs);
441 /*
442 * Reset the BS bit in dr6 (pointed by args->err) to
443 * denote completion of processing
444 */
445 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 446
438 return NOTIFY_STOP; 447 return NOTIFY_STOP;
439} 448}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..5b8c7505b3bc 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
51 52
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/alternative.h> 57#include <asm/alternative.h>
58#include <asm/insn.h>
59#include <asm/debugreg.h>
57 60
58void jprobe_return_end(void); 61void jprobe_return_end(void);
59 62
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 63DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 64DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 65
63#ifdef CONFIG_X86_64 66#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 67
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 68#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 97 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 99};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 100#undef W
154 101
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 102struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
244 } 191 }
245} 192}
246 193
194/* Recover the probed instruction at addr for further analysis. */
195static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
196{
197 struct kprobe *kp;
198 kp = get_kprobe((void *)addr);
199 if (!kp)
200 return -EINVAL;
201
202 /*
203 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn.
208 *
209 * On the other hand, kp->opcode has a copy of the first byte of
210 * the probed instruction, which is overwritten by int3. And
211 * the instruction at kp->addr is not modified by kprobes except
212 * for the first byte, we can recover the original instruction
213 * from it and kp->opcode.
214 */
215 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
216 buf[0] = kp->opcode;
217 return 0;
218}
219
220/* Dummy buffers for kallsyms_lookup */
221static char __dummy_buf[KSYM_NAME_LEN];
222
223/* Check if paddr is at an instruction boundary */
224static int __kprobes can_probe(unsigned long paddr)
225{
226 int ret;
227 unsigned long addr, offset = 0;
228 struct insn insn;
229 kprobe_opcode_t buf[MAX_INSN_SIZE];
230
231 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
232 return 0;
233
234 /* Decode instructions */
235 addr = paddr - offset;
236 while (addr < paddr) {
237 kernel_insn_init(&insn, (void *)addr);
238 insn_get_opcode(&insn);
239
240 /*
241 * Check if the instruction has been modified by another
242 * kprobe, in which case we replace the breakpoint by the
243 * original instruction in our buffer.
244 */
245 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
246 ret = recover_probed_instruction(buf, addr);
247 if (ret)
248 /*
249 * Another debugging subsystem might insert
250 * this breakpoint. In that case, we can't
251 * recover it.
252 */
253 return 0;
254 kernel_insn_init(&insn, buf);
255 }
256 insn_get_length(&insn);
257 addr += insn.length;
258 }
259
260 return (addr == paddr);
261}
262
247/* 263/*
248 * Returns non-zero if opcode modifies the interrupt flag. 264 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 265 */
@@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
277static void __kprobes fix_riprel(struct kprobe *p) 293static void __kprobes fix_riprel(struct kprobe *p)
278{ 294{
279#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
280 u8 *insn = p->ainsn.insn; 296 struct insn insn;
281 s64 disp; 297 kernel_insn_init(&insn, p->ainsn.insn);
282 int need_modrm;
283
284 /* Skip legacy instruction prefixes. */
285 while (1) {
286 switch (*insn) {
287 case 0x66:
288 case 0x67:
289 case 0x2e:
290 case 0x3e:
291 case 0x26:
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 }
301 break;
302 }
303 298
304 /* Skip REX instruction prefix. */ 299 if (insn_rip_relative(&insn)) {
305 if (is_REX_prefix(insn)) 300 s64 newdisp;
306 ++insn; 301 u8 *disp;
307 302 insn_get_displacement(&insn);
308 if (*insn == 0x0f) { 303 /*
309 /* Two-byte opcode. */ 304 * The copied instruction uses the %rip-relative addressing
310 ++insn; 305 * mode. Adjust the displacement for the difference between
311 need_modrm = test_bit(*insn, 306 * the original location of this instruction and the location
312 (unsigned long *)twobyte_has_modrm); 307 * of the copy that will actually be run. The tricky bit here
313 } else 308 * is making sure that the sign extension happens correctly in
314 /* One-byte opcode. */ 309 * this calculation, since we need a signed 32-bit result to
315 need_modrm = test_bit(*insn, 310 * be sign-extended to 64 bits when it's added to the %rip
316 (unsigned long *)onebyte_has_modrm); 311 * value and yield the same 64-bit result that the sign-
317 312 * extension of the original signed 32-bit displacement would
318 if (need_modrm) { 313 * have given.
319 u8 modrm = *++insn; 314 */
320 if ((modrm & 0xc7) == 0x05) { 315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
321 /* %rip+disp32 addressing mode */ 316 (u8 *) p->ainsn.insn;
322 /* Displacement follows ModRM byte. */ 317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
323 ++insn; 318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
324 /* 319 *(s32 *) disp = (s32) newdisp;
325 * The copied instruction uses the %rip-relative
326 * addressing mode. Adjust the displacement for the
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 320 }
343#endif 321#endif
344} 322}
@@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 337
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 338int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 339{
340 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 342 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 343 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 344 if (!p->ainsn.insn)
@@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 452{
473 switch (kcb->kprobe_status) { 453 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 454 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 455 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb); 456 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb); 457 set_current_kprobe(p, regs, kcb);
@@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
491 kcb->kprobe_status = KPROBE_REENTER; 460 kcb->kprobe_status = KPROBE_REENTER;
492 break; 461 break;
493 case KPROBE_HIT_SS: 462 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 463 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 464 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 465 * codepath should strictly reside in .kprobes.text section.
497 return 0; 466 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 467 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 468 */
500 * to, or just after, single-stepping of a probed 469 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 470 p->addr);
502 * reside in .kprobes.text section. Raise a warning 471 dump_kprobe(p);
503 * to highlight this peculiar case. 472 BUG();
504 */
505 }
506 default: 473 default:
507 /* impossible cases */ 474 /* impossible cases */
508 WARN_ON(1); 475 WARN_ON(1);
@@ -514,7 +481,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
514 481
515/* 482/*
516 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 483 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
517 * remain disabled thorough out this function. 484 * remain disabled throughout this function.
518 */ 485 */
519static int __kprobes kprobe_handler(struct pt_regs *regs) 486static int __kprobes kprobe_handler(struct pt_regs *regs)
520{ 487{
@@ -851,7 +818,7 @@ no_change:
851 818
852/* 819/*
853 * Interrupts are disabled on entry as trap1 is an interrupt gate and they 820 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
854 * remain disabled thoroughout this function. 821 * remain disabled throughout this function.
855 */ 822 */
856static int __kprobes post_kprobe_handler(struct pt_regs *regs) 823static int __kprobes post_kprobe_handler(struct pt_regs *regs)
857{ 824{
@@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 934 ret = NOTIFY_STOP;
968 break; 935 break;
969 case DIE_DEBUG: 936 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 937 if (post_kprobe_handler(args->regs)) {
938 /*
939 * Reset the BS bit in dr6 (pointed by args->err) to
940 * denote completion of processing
941 */
942 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 943 ret = NOTIFY_STOP;
944 }
972 break; 945 break;
973 case DIE_GPF: 946 case DIE_GPF:
974 /* 947 /*
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -157,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
157{ 158{
158 int error; 159 int error;
159 160
160 if (nx_enabled) 161 set_pages_x(image->control_code_page, 1);
161 set_pages_x(image->control_code_page, 1);
162 error = machine_kexec_alloc_page_tables(image); 162 error = machine_kexec_alloc_page_tables(image);
163 if (error) 163 if (error)
164 return error; 164 return error;
@@ -172,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
172 */ 172 */
173void machine_kexec_cleanup(struct kimage *image) 173void machine_kexec_cleanup(struct kimage *image)
174{ 174{
175 if (nx_enabled) 175 set_pages_nx(image->control_code_page, 1);
176 set_pages_nx(image->control_code_page, 1);
177 machine_kexec_free_page_tables(image); 176 machine_kexec_free_page_tables(image);
178} 177}
179 178
@@ -202,6 +201,7 @@ void machine_kexec(struct kimage *image)
202 201
203 /* Interrupts aren't acceptable while we reboot */ 202 /* Interrupts aren't acceptable while we reboot */
204 local_irq_disable(); 203 local_irq_disable();
204 hw_breakpoint_disable();
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/debugreg.h>
21 22
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 23static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 24 unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
282 283
283 /* Interrupts aren't acceptable while we reboot */ 284 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 285 local_irq_disable();
286 hw_breakpoint_disable();
285 287
286 if (image->preserve_context) { 288 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 289#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
3 *
4 * Copyright (C) 2006, Advanced Micro Devices, Inc.
5 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of version 2 of the GNU General Public License
9 * as published by the Free Software Foundation.
10 *
11 * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
12 */
13
14/*
15 * We are using the 32.768kHz input clock - it's the only one that has the
16 * ranges we find desirable. The following table lists the suitable
17 * divisors and the associated Hz, minimum interval and the maximum interval:
18 *
19 * Divisor Hz Min Delta (s) Max Delta (s)
20 * 1 32768 .00048828125 2.000
21 * 2 16384 .0009765625 4.000
22 * 4 8192 .001953125 8.000
23 * 8 4096 .00390625 16.000
24 * 16 2048 .0078125 32.000
25 * 32 1024 .015625 64.000
26 * 64 512 .03125 128.000
27 * 128 256 .0625 256.000
28 * 256 128 .125 512.000
29 */
30
31#include <linux/kernel.h>
32#include <linux/interrupt.h>
33#include <linux/module.h>
34#include <asm/geode.h>
35
36#define MFGPT_DEFAULT_IRQ 7
37
38static struct mfgpt_timer_t {
39 unsigned int avail:1;
40} mfgpt_timers[MFGPT_MAX_TIMERS];
41
42/* Selected from the table above */
43
44#define MFGPT_DIVISOR 16
45#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
46#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
47#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
48
49/* Allow for disabling of MFGPTs */
50static int disable;
51static int __init mfgpt_disable(char *s)
52{
53 disable = 1;
54 return 1;
55}
56__setup("nomfgpt", mfgpt_disable);
57
58/* Reset the MFGPT timers. This is required by some broken BIOSes which already
59 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
60 * affected at least (0.99 is OK with MFGPT workaround left to off).
61 */
62static int __init mfgpt_fix(char *s)
63{
64 u32 val, dummy;
65
66 /* The following udocumented bit resets the MFGPT timers */
67 val = 0xFF; dummy = 0;
68 wrmsr(MSR_MFGPT_SETUP, val, dummy);
69 return 1;
70}
71__setup("mfgptfix", mfgpt_fix);
72
73/*
74 * Check whether any MFGPTs are available for the kernel to use. In most
75 * cases, firmware that uses AMD's VSA code will claim all timers during
76 * bootup; we certainly don't want to take them if they're already in use.
77 * In other cases (such as with VSAless OpenFirmware), the system firmware
78 * leaves timers available for us to use.
79 */
80
81
82static int timers = -1;
83
84static void geode_mfgpt_detect(void)
85{
86 int i;
87 u16 val;
88
89 timers = 0;
90
91 if (disable) {
92 printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
93 goto done;
94 }
95
96 if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
97 printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
98 goto done;
99 }
100
101 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
102 val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
103 if (!(val & MFGPT_SETUP_SETUP)) {
104 mfgpt_timers[i].avail = 1;
105 timers++;
106 }
107 }
108
109done:
110 printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
111}
112
113int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
114{
115 u32 msr, mask, value, dummy;
116 int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
117
118 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
119 return -EIO;
120
121 /*
122 * The register maps for these are described in sections 6.17.1.x of
123 * the AMD Geode CS5536 Companion Device Data Book.
124 */
125 switch (event) {
126 case MFGPT_EVENT_RESET:
127 /*
128 * XXX: According to the docs, we cannot reset timers above
129 * 6; that is, resets for 7 and 8 will be ignored. Is this
130 * a problem? -dilinger
131 */
132 msr = MSR_MFGPT_NR;
133 mask = 1 << (timer + 24);
134 break;
135
136 case MFGPT_EVENT_NMI:
137 msr = MSR_MFGPT_NR;
138 mask = 1 << (timer + shift);
139 break;
140
141 case MFGPT_EVENT_IRQ:
142 msr = MSR_MFGPT_IRQ;
143 mask = 1 << (timer + shift);
144 break;
145
146 default:
147 return -EIO;
148 }
149
150 rdmsr(msr, value, dummy);
151
152 if (enable)
153 value |= mask;
154 else
155 value &= ~mask;
156
157 wrmsr(msr, value, dummy);
158 return 0;
159}
160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
161
162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
163{
164 u32 zsel, lpc, dummy;
165 int shift;
166
167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
168 return -EIO;
169
170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
181 return -EIO;
182
183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
188
189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
190 if (*irq < 1 || *irq == 2 || *irq > 15)
191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
199 if (enable) {
200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
202 }
203
204 return 0;
205}
206
207static int mfgpt_get(int timer)
208{
209 mfgpt_timers[timer].avail = 0;
210 printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
211 return timer;
212}
213
214int geode_mfgpt_alloc_timer(int timer, int domain)
215{
216 int i;
217
218 if (timers == -1) {
219 /* timers haven't been detected yet */
220 geode_mfgpt_detect();
221 }
222
223 if (!timers)
224 return -1;
225
226 if (timer >= MFGPT_MAX_TIMERS)
227 return -1;
228
229 if (timer < 0) {
230 /* Try to find an available timer */
231 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
232 if (mfgpt_timers[i].avail)
233 return mfgpt_get(i);
234
235 if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
236 break;
237 }
238 } else {
239 /* If they requested a specific timer, try to honor that */
240 if (mfgpt_timers[timer].avail)
241 return mfgpt_get(timer);
242 }
243
244 /* No timers available - too bad */
245 return -1;
246}
247EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
248
249
250#ifdef CONFIG_GEODE_MFGPT_TIMER
251
252/*
253 * The MFPGT timers on the CS5536 provide us with suitable timers to use
254 * as clock event sources - not as good as a HPET or APIC, but certainly
255 * better than the PIT. This isn't a general purpose MFGPT driver, but
256 * a simplified one designed specifically to act as a clock event source.
257 * For full details about the MFGPT, please consult the CS5536 data sheet.
258 */
259
260#include <linux/clocksource.h>
261#include <linux/clockchips.h>
262
263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
264static u16 mfgpt_event_clock;
265
266static int irq;
267static int __init mfgpt_setup(char *str)
268{
269 get_option(&str, &irq);
270 return 1;
271}
272__setup("mfgpt_irq=", mfgpt_setup);
273
274static void mfgpt_disable_timer(u16 clock)
275{
276 /* avoid races by clearing CMP1 and CMP2 unconditionally */
277 geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
278 MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
279}
280
281static int mfgpt_next_event(unsigned long, struct clock_event_device *);
282static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
283
284static struct clock_event_device mfgpt_clockevent = {
285 .name = "mfgpt-timer",
286 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event,
289 .rating = 250,
290 .cpumask = cpu_all_mask,
291 .shift = 32
292};
293
294static void mfgpt_start_timer(u16 delta)
295{
296 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
297 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
298
299 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
300 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
301}
302
303static void mfgpt_set_mode(enum clock_event_mode mode,
304 struct clock_event_device *evt)
305{
306 mfgpt_disable_timer(mfgpt_event_clock);
307
308 if (mode == CLOCK_EVT_MODE_PERIODIC)
309 mfgpt_start_timer(MFGPT_PERIODIC);
310
311 mfgpt_tick_mode = mode;
312}
313
314static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
315{
316 mfgpt_start_timer(delta);
317 return 0;
318}
319
320static irqreturn_t mfgpt_tick(int irq, void *dev_id)
321{
322 u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
323
324 /* See if the interrupt was for us */
325 if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
326 return IRQ_NONE;
327
328 /* Turn off the clock (and clear the event) */
329 mfgpt_disable_timer(mfgpt_event_clock);
330
331 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
332 return IRQ_HANDLED;
333
334 /* Clear the counter */
335 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
336
337 /* Restart the clock in periodic mode */
338
339 if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
340 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
341 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
342 }
343
344 mfgpt_clockevent.event_handler(&mfgpt_clockevent);
345 return IRQ_HANDLED;
346}
347
348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer"
352};
353
354int __init mfgpt_timer_setup(void)
355{
356 int timer, ret;
357 u16 val;
358
359 timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
360 if (timer < 0) {
361 printk(KERN_ERR
362 "mfgpt-timer: Could not allocate a MFPGT timer\n");
363 return -ENODEV;
364 }
365
366 mfgpt_event_clock = timer;
367
368 /* Set up the IRQ on the MFGPT side */
369 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
370 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
371 return -EIO;
372 }
373
374 /* And register it with the kernel */
375 ret = setup_irq(irq, &mfgptirq);
376
377 if (ret) {
378 printk(KERN_ERR
379 "mfgpt-timer: Unable to set up the interrupt.\n");
380 goto err;
381 }
382
383 /* Set the clock scale and enable the event mode for CMP2 */
384 val = MFGPT_SCALE | (3 << 8);
385
386 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
387
388 /* Set up the clock event */
389 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
390 mfgpt_clockevent.shift);
391 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
392 &mfgpt_clockevent);
393 mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
394 &mfgpt_clockevent);
395
396 printk(KERN_INFO
397 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
398 timer, irq);
399 clockevents_register_device(&mfgpt_clockevent);
400
401 return 0;
402
403err:
404 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
405 printk(KERN_ERR
406 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
407 return -EIO;
408}
409
410#endif
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 366baa179913..37542b67c57e 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
16#include <linux/firmware.h> 19#include <linux/firmware.h>
17#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
18#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -33,6 +36,9 @@ MODULE_LICENSE("GPL v2");
33#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 36#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
34#define UCODE_UCODE_TYPE 0x00000001 37#define UCODE_UCODE_TYPE 0x00000001
35 38
39const struct firmware *firmware;
40static int supported_cpu;
41
36struct equiv_cpu_entry { 42struct equiv_cpu_entry {
37 u32 installed_cpu; 43 u32 installed_cpu;
38 u32 fixed_errata_mask; 44 u32 fixed_errata_mask;
@@ -71,17 +77,14 @@ static struct equiv_cpu_entry *equiv_cpu_table;
71 77
72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 78static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
73{ 79{
74 struct cpuinfo_x86 *c = &cpu_data(cpu);
75 u32 dummy; 80 u32 dummy;
76 81
77 memset(csig, 0, sizeof(*csig)); 82 if (!supported_cpu)
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86);
81 return -1; 83 return -1;
82 } 84
85 memset(csig, 0, sizeof(*csig));
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 88 return 0;
86} 89}
87 90
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 106 i++;
104 } 107 }
105 108
106 if (!equiv_cpu_id) { 109 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 110 return 0;
110 }
111 111
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 112 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 113 return 0;
117 }
118 114
119 /* ucode might be chipset specific -- currently we don't support this */ 115 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 117 pr_err("CPU%d: loading of chipset specific code not yet supported\n",
122 "specific code not yet supported\n", cpu); 118 cpu);
123 return 0; 119 return 0;
124 } 120 }
125 121
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
148 144
149 /* check current patch id and patch's id for match */ 145 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 146 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 148 cpu, mc_amd->hdr.patch_id);
153 return -1; 149 return -1;
154 } 150 }
155 151
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 153 uci->cpu_sig.rev = rev;
160 154
161 return 0; 155 return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 172 return NULL;
179 173
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 175 pr_err("error: invalid type field in container file section header\n");
182 "container file section header\n");
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 180
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 181 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 182 pr_err("error: size mismatch\n");
193 return NULL; 183 return NULL;
194 } 184 }
195 185
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 208 size = buf_pos[2];
219 209
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 211 pr_err("error: invalid type field in container file section header\n");
222 "container file section header\n");
223 return 0; 212 return 0;
224 } 213 }
225 214
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 216 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 217 pr_err("failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 218 return 0;
231 } 219 }
232 220
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 247
260 offset = install_equiv_cpu_table(ucode_ptr); 248 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 249 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 250 pr_err("failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 251 return UCODE_ERROR;
265 } 252 }
266 253
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
291 if (!leftover) { 278 if (!leftover) {
292 vfree(uci->mc); 279 vfree(uci->mc);
293 uci->mc = new_mc; 280 uci->mc = new_mc;
294 pr_debug("microcode: CPU%d found a matching microcode " 281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
295 "update with version 0x%x (current=0x%x)\n",
296 cpu, new_rev, uci->cpu_sig.rev); 282 cpu, new_rev, uci->cpu_sig.rev);
297 } else { 283 } else {
298 vfree(new_mc); 284 vfree(new_mc);
@@ -308,27 +294,26 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
308 294
309static enum ucode_state request_microcode_fw(int cpu, struct device *device) 295static enum ucode_state request_microcode_fw(int cpu, struct device *device)
310{ 296{
311 const char *fw_name = "amd-ucode/microcode_amd.bin";
312 const struct firmware *firmware;
313 enum ucode_state ret; 297 enum ucode_state ret;
314 298
315 if (request_firmware(&firmware, fw_name, device)) { 299 if (firmware == NULL)
316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
317 return UCODE_NFOUND; 300 return UCODE_NFOUND;
301
302 if (*(u32 *)firmware->data != UCODE_MAGIC) {
303 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
304 *(u32 *)firmware->data);
305 return UCODE_ERROR;
318 } 306 }
319 307
320 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 308 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
321 309
322 release_firmware(firmware);
323
324 return ret; 310 return ret;
325} 311}
326 312
327static enum ucode_state 313static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size) 314request_microcode_user(int cpu, const void __user *buf, size_t size)
329{ 315{
330 printk(KERN_INFO "microcode: AMD microcode update via " 316 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
331 "/dev/cpu/microcode not supported\n");
332 return UCODE_ERROR; 317 return UCODE_ERROR;
333} 318}
334 319
@@ -340,7 +325,31 @@ static void microcode_fini_cpu_amd(int cpu)
340 uci->mc = NULL; 325 uci->mc = NULL;
341} 326}
342 327
328void init_microcode_amd(struct device *device)
329{
330 const char *fw_name = "amd-ucode/microcode_amd.bin";
331 struct cpuinfo_x86 *c = &boot_cpu_data;
332
333 WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
334
335 if (c->x86 < 0x10) {
336 pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
337 return;
338 }
339 supported_cpu = 1;
340
341 if (request_firmware(&firmware, fw_name, device))
342 pr_err("failed to load file %s\n", fw_name);
343}
344
345void fini_microcode_amd(void)
346{
347 release_firmware(firmware);
348}
349
343static struct microcode_ops microcode_amd_ops = { 350static struct microcode_ops microcode_amd_ops = {
351 .init = init_microcode_amd,
352 .fini = fini_microcode_amd,
344 .request_microcode_user = request_microcode_user, 353 .request_microcode_user = request_microcode_user,
345 .request_microcode_fw = request_microcode_fw, 354 .request_microcode_fw = request_microcode_fw,
346 .collect_cpu_info = collect_cpu_info_amd, 355 .collect_cpu_info = collect_cpu_info_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..0c8632433090 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,10 +70,12 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/platform_device.h> 76#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 77#include <linux/miscdevice.h>
75#include <linux/capability.h> 78#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 79#include <linux/kernel.h>
78#include <linux/module.h> 80#include <linux/module.h>
79#include <linux/mutex.h> 81#include <linux/mutex.h>
@@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 203
202static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 205{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 207}
207 208
@@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
211 ssize_t ret = -EINVAL; 212 ssize_t ret = -EINVAL;
212 213
213 if ((len >> PAGE_SHIFT) > totalram_pages) { 214 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); 215 pr_err("too much data (max %ld pages)\n", totalram_pages);
215 return ret; 216 return ret;
216 } 217 }
217 218
@@ -246,7 +247,7 @@ static int __init microcode_dev_init(void)
246 247
247 error = misc_register(&microcode_dev); 248 error = misc_register(&microcode_dev);
248 if (error) { 249 if (error) {
249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); 250 pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
250 return error; 251 return error;
251 } 252 }
252 253
@@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
361 if (!uci->mc) 362 if (!uci->mc)
362 return UCODE_NFOUND; 363 return UCODE_NFOUND;
363 364
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu); 365 pr_debug("CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu); 366 apply_microcode_on_target(cpu);
366 367
367 return UCODE_OK; 368 return UCODE_OK;
@@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 382 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
382 383
383 if (ustate == UCODE_OK) { 384 if (ustate == UCODE_OK) {
384 pr_debug("microcode: CPU%d updated upon init\n", cpu); 385 pr_debug("CPU%d updated upon init\n", cpu);
385 apply_microcode_on_target(cpu); 386 apply_microcode_on_target(cpu);
386 } 387 }
387 388
@@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
408 if (!cpu_online(cpu)) 409 if (!cpu_online(cpu))
409 return 0; 410 return 0;
410 411
411 pr_debug("microcode: CPU%d added\n", cpu); 412 pr_debug("CPU%d added\n", cpu);
412 413
413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 414 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
414 if (err) 415 if (err)
@@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
427 if (!cpu_online(cpu)) 428 if (!cpu_online(cpu))
428 return 0; 429 return 0;
429 430
430 pr_debug("microcode: CPU%d removed\n", cpu); 431 pr_debug("CPU%d removed\n", cpu);
431 microcode_fini_cpu(cpu); 432 microcode_fini_cpu(cpu);
432 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 433 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
433 return 0; 434 return 0;
@@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
475 microcode_update_cpu(cpu); 476 microcode_update_cpu(cpu);
476 case CPU_DOWN_FAILED: 477 case CPU_DOWN_FAILED:
477 case CPU_DOWN_FAILED_FROZEN: 478 case CPU_DOWN_FAILED_FROZEN:
478 pr_debug("microcode: CPU%d added\n", cpu); 479 pr_debug("CPU%d added\n", cpu);
479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 480 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
480 pr_err("microcode: Failed to create group for CPU%d\n", cpu); 481 pr_err("Failed to create group for CPU%d\n", cpu);
481 break; 482 break;
482 case CPU_DOWN_PREPARE: 483 case CPU_DOWN_PREPARE:
483 case CPU_DOWN_PREPARE_FROZEN: 484 case CPU_DOWN_PREPARE_FROZEN:
484 /* Suspend is in progress, only remove the interface */ 485 /* Suspend is in progress, only remove the interface */
485 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 486 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
486 pr_debug("microcode: CPU%d removed\n", cpu); 487 pr_debug("CPU%d removed\n", cpu);
487 break; 488 break;
488 case CPU_DEAD: 489 case CPU_DEAD:
489 case CPU_UP_CANCELED_FROZEN: 490 case CPU_UP_CANCELED_FROZEN:
@@ -509,7 +510,7 @@ static int __init microcode_init(void)
509 microcode_ops = init_amd_microcode(); 510 microcode_ops = init_amd_microcode();
510 511
511 if (!microcode_ops) { 512 if (!microcode_ops) {
512 pr_err("microcode: no support for this CPU vendor\n"); 513 pr_err("no support for this CPU vendor\n");
513 return -ENODEV; 514 return -ENODEV;
514 } 515 }
515 516
@@ -520,6 +521,9 @@ static int __init microcode_init(void)
520 return PTR_ERR(microcode_pdev); 521 return PTR_ERR(microcode_pdev);
521 } 522 }
522 523
524 if (microcode_ops->init)
525 microcode_ops->init(&microcode_pdev->dev);
526
523 get_online_cpus(); 527 get_online_cpus();
524 mutex_lock(&microcode_mutex); 528 mutex_lock(&microcode_mutex);
525 529
@@ -540,8 +544,7 @@ static int __init microcode_init(void)
540 register_hotcpu_notifier(&mc_cpu_notifier); 544 register_hotcpu_notifier(&mc_cpu_notifier);
541 545
542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 546 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
543 " <tigran@aivazian.fsnet.co.uk>," 547 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
544 " Peter Oruba\n");
545 548
546 return 0; 549 return 0;
547} 550}
@@ -563,6 +566,9 @@ static void __exit microcode_exit(void)
563 566
564 platform_device_unregister(microcode_pdev); 567 platform_device_unregister(microcode_pdev);
565 568
569 if (microcode_ops->fini)
570 microcode_ops->fini();
571
566 microcode_ops = NULL; 572 microcode_ops = NULL;
567 573
568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 574 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..ebd193e476ca 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/firmware.h> 76#include <linux/firmware.h>
74#include <linux/uaccess.h> 77#include <linux/uaccess.h>
75#include <linux/kernel.h> 78#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
146 149
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) { 151 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel " 152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
150 "processor\n", cpu_num);
151 return -1; 153 return -1;
152 } 154 }
153 155
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
165 /* get the current revision from MSR 0x8B */ 167 /* get the current revision from MSR 0x8B */
166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
167 169
168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
169 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
170 172
171 return 0; 173 return 0;
172} 174}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
194 data_size = get_datasize(mc_header); 196 data_size = get_datasize(mc_header);
195 197
196 if (data_size + MC_HEADER_SIZE > total_size) { 198 if (data_size + MC_HEADER_SIZE > total_size) {
197 printk(KERN_ERR "microcode: error! " 199 pr_err("error! Bad data size in microcode data file\n");
198 "Bad data size in microcode data file\n");
199 return -EINVAL; 200 return -EINVAL;
200 } 201 }
201 202
202 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 203 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
203 printk(KERN_ERR "microcode: error! " 204 pr_err("error! Unknown microcode update format\n");
204 "Unknown microcode update format\n");
205 return -EINVAL; 205 return -EINVAL;
206 } 206 }
207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
208 if (ext_table_size) { 208 if (ext_table_size) {
209 if ((ext_table_size < EXT_HEADER_SIZE) 209 if ((ext_table_size < EXT_HEADER_SIZE)
210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
211 printk(KERN_ERR "microcode: error! " 211 pr_err("error! Small exttable size in microcode data file\n");
212 "Small exttable size in microcode data file\n");
213 return -EINVAL; 212 return -EINVAL;
214 } 213 }
215 ext_header = mc + MC_HEADER_SIZE + data_size; 214 ext_header = mc + MC_HEADER_SIZE + data_size;
216 if (ext_table_size != exttable_size(ext_header)) { 215 if (ext_table_size != exttable_size(ext_header)) {
217 printk(KERN_ERR "microcode: error! " 216 pr_err("error! Bad exttable size in microcode data file\n");
218 "Bad exttable size in microcode data file\n");
219 return -EFAULT; 217 return -EFAULT;
220 } 218 }
221 ext_sigcount = ext_header->count; 219 ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
230 while (i--) 228 while (i--)
231 ext_table_sum += ext_tablep[i]; 229 ext_table_sum += ext_tablep[i];
232 if (ext_table_sum) { 230 if (ext_table_sum) {
233 printk(KERN_WARNING "microcode: aborting, " 231 pr_warning("aborting, bad extended signature table checksum\n");
234 "bad extended signature table checksum\n");
235 return -EINVAL; 232 return -EINVAL;
236 } 233 }
237 } 234 }
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
242 while (i--) 239 while (i--)
243 orig_sum += ((int *)mc)[i]; 240 orig_sum += ((int *)mc)[i];
244 if (orig_sum) { 241 if (orig_sum) {
245 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 242 pr_err("aborting, bad checksum\n");
246 return -EINVAL; 243 return -EINVAL;
247 } 244 }
248 if (!ext_table_size) 245 if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
255 - (mc_header->sig + mc_header->pf + mc_header->cksum) 252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
256 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
257 if (sum) { 254 if (sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 255 pr_err("aborting, bad checksum\n");
259 return -EINVAL; 256 return -EINVAL;
260 } 257 }
261 } 258 }
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 324 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
328 325
329 if (val[1] != mc_intel->hdr.rev) { 326 if (val[1] != mc_intel->hdr.rev) {
330 printk(KERN_ERR "microcode: CPU%d update " 327 pr_err("CPU%d update to revision 0x%x failed\n",
331 "to revision 0x%x failed\n", 328 cpu_num, mc_intel->hdr.rev);
332 cpu_num, mc_intel->hdr.rev);
333 return -1; 329 return -1;
334 } 330 }
335 printk(KERN_INFO "microcode: CPU%d updated to revision " 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n",
336 "0x%x, date = %04x-%02x-%02x \n",
337 cpu_num, val[1], 332 cpu_num, val[1],
338 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
339 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 357
363 mc_size = get_totalsize(&mc_header); 358 mc_size = get_totalsize(&mc_header);
364 if (!mc_size || mc_size > leftover) { 359 if (!mc_size || mc_size > leftover) {
365 printk(KERN_ERR "microcode: error!" 360 pr_err("error! Bad data in microcode data file\n");
366 "Bad data in microcode data file\n");
367 break; 361 break;
368 } 362 }
369 363
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 vfree(uci->mc); 399 vfree(uci->mc);
406 uci->mc = (struct microcode_intel *)new_mc; 400 uci->mc = (struct microcode_intel *)new_mc;
407 401
408 pr_debug("microcode: CPU%d found a matching microcode update with" 402 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
409 " version 0x%x (current=0x%x)\n", 403 cpu, new_rev, uci->cpu_sig.rev);
410 cpu, new_rev, uci->cpu_sig.rev);
411out: 404out:
412 return state; 405 return state;
413} 406}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
429 c->x86, c->x86_model, c->x86_mask); 422 c->x86, c->x86_model, c->x86_mask);
430 423
431 if (request_firmware(&firmware, name, device)) { 424 if (request_firmware(&firmware, name, device)) {
432 pr_debug("microcode: data file %s load failed\n", name); 425 pr_debug("data file %s load failed\n", name);
433 return UCODE_NFOUND; 426 return UCODE_NFOUND;
434 } 427 }
435 428
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..40b54ceb68b5 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
667 */ 667 */
668} 668}
669 669
670static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 671{
672 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
673#ifdef CONFIG_X86_32
674 /*
675 * We cannot access to MPC table to compute table size yet,
676 * as only few megabytes from the bottom is mapped now.
677 * PC-9800's MPC table places on the very last of physical
678 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
679 * yields BUG() in reserve_bootmem.
680 * also need to make sure physptr is below than max_low_pfn
681 * we don't need reserve the area above max_low_pfn
682 */
683 unsigned long end = max_low_pfn * PAGE_SIZE;
684 673
685 if (mpf->physptr < end) { 674 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
686 if (mpf->physptr + size > end)
687 size = end - mpf->physptr;
688 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
689 }
690#else
691 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
692#endif
693} 675}
694 676
695static int __init smp_scan_config(unsigned long base, unsigned long length, 677static int __init smp_scan_config(unsigned long base, unsigned long length)
696 unsigned reserve)
697{ 678{
698 unsigned int *bp = phys_to_virt(base); 679 unsigned int *bp = phys_to_virt(base);
699 struct mpf_intel *mpf; 680 struct mpf_intel *mpf;
681 unsigned long mem;
700 682
701 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 683 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
702 bp, length); 684 bp, length);
@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 699 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
718 mpf, (u64)virt_to_phys(mpf)); 700 mpf, (u64)virt_to_phys(mpf));
719 701
720 if (!reserve) 702 mem = virt_to_phys(mpf);
721 return 1; 703 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
722 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
723 BOOTMEM_DEFAULT);
724 if (mpf->physptr) 704 if (mpf->physptr)
725 smp_reserve_bootmem(mpf); 705 smp_reserve_memory(mpf);
726 706
727 return 1; 707 return 1;
728 } 708 }
@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
732 return 0; 712 return 0;
733} 713}
734 714
735void __init default_find_smp_config(unsigned int reserve) 715void __init default_find_smp_config(void)
736{ 716{
737 unsigned int address; 717 unsigned int address;
738 718
@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
744 * 2) Scan the top 1K of base RAM 724 * 2) Scan the top 1K of base RAM
745 * 3) Scan the 64K of bios 725 * 3) Scan the 64K of bios
746 */ 726 */
747 if (smp_scan_config(0x0, 0x400, reserve) || 727 if (smp_scan_config(0x0, 0x400) ||
748 smp_scan_config(639 * 0x400, 0x400, reserve) || 728 smp_scan_config(639 * 0x400, 0x400) ||
749 smp_scan_config(0xF0000, 0x10000, reserve)) 729 smp_scan_config(0xF0000, 0x10000))
750 return; 730 return;
751 /* 731 /*
752 * If it is an SMP machine we should know now, unless the 732 * If it is an SMP machine we should know now, unless the
@@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve)
767 747
768 address = get_bios_ebda(); 748 address = get_bios_ebda();
769 if (address) 749 if (address)
770 smp_scan_config(address, 0x400, reserve); 750 smp_scan_config(address, 0x400);
771} 751}
772 752
773#ifdef CONFIG_X86_IO_APIC 753#ifdef CONFIG_X86_IO_APIC
@@ -965,9 +945,6 @@ void __init early_reserve_e820_mpc_new(void)
965{ 945{
966 if (enable_update_mptable && alloc_mptable) { 946 if (enable_update_mptable && alloc_mptable) {
967 u64 startt = 0; 947 u64 startt = 0;
968#ifdef CONFIG_X86_TRAMPOLINE
969 startt = TRAMPOLINE_BASE;
970#endif
971 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); 948 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
972 } 949 }
973} 950}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..4bd93c9b2b27 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -172,23 +172,18 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
172 172
173static int msr_open(struct inode *inode, struct file *file) 173static int msr_open(struct inode *inode, struct file *file)
174{ 174{
175 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 175 unsigned int cpu;
176 struct cpuinfo_x86 *c = &cpu_data(cpu); 176 struct cpuinfo_x86 *c;
177 int ret = 0;
178 177
179 lock_kernel();
180 cpu = iminor(file->f_path.dentry->d_inode); 178 cpu = iminor(file->f_path.dentry->d_inode);
179 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
180 return -ENXIO; /* No such CPU */
181 181
182 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
183 ret = -ENXIO; /* No such CPU */
184 goto out;
185 }
186 c = &cpu_data(cpu); 182 c = &cpu_data(cpu);
187 if (!cpu_has(c, X86_FEATURE_MSR)) 183 if (!cpu_has(c, X86_FEATURE_MSR))
188 ret = -EIO; /* MSR not supported */ 184 return -EIO; /* MSR not supported */
189out: 185
190 unlock_kernel(); 186 return 0;
191 return ret;
192} 187}
193 188
194/* 189/*
@@ -251,7 +246,7 @@ static int __init msr_init(void)
251 int i, err = 0; 246 int i, err = 0;
252 i = 0; 247 i = 0;
253 248
254 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { 249 if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
255 printk(KERN_ERR "msr: unable to get major %d for msr\n", 250 printk(KERN_ERR "msr: unable to get major %d for msr\n",
256 MSR_MAJOR); 251 MSR_MAJOR);
257 err = -EBUSY; 252 err = -EBUSY;
@@ -279,7 +274,7 @@ out_class:
279 msr_device_destroy(i); 274 msr_device_destroy(i);
280 class_destroy(msr_class); 275 class_destroy(msr_class);
281out_chrdev: 276out_chrdev:
282 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 277 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
283out: 278out:
284 return err; 279 return err;
285} 280}
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 4006c522adc7..9d1d263f786f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -212,7 +212,7 @@ static int __init olpc_init(void)
212 unsigned char *romsig; 212 unsigned char *romsig;
213 213
214 /* The ioremap check is dangerous; limit what we run it on */ 214 /* The ioremap check is dangerous; limit what we run it on */
215 if (!is_geode() || geode_has_vsa2()) 215 if (!is_geode() || cs5535_has_vsa2())
216 return 0; 216 return 0;
217 217
218 spin_lock_init(&ec_lock); 218 spin_lock_init(&ec_lock);
@@ -244,7 +244,7 @@ static int __init olpc_init(void)
244 (unsigned char *) &olpc_platform_info.ecver, 1); 244 (unsigned char *) &olpc_platform_info.ecver, 1);
245 245
246 /* check to see if the VSA exists */ 246 /* check to see if the VSA exists */
247 if (geode_has_vsa2()) 247 if (cs5535_has_vsa2())
248 olpc_platform_info.flags |= OLPC_F_VSA; 248 olpc_platform_info.flags |= OLPC_F_VSA;
249 249
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..676b8c77a976 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,9 +8,9 @@
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
9 9
10static inline void 10static inline void
11default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) 11default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
12{ 12{
13 __raw_spin_lock(lock); 13 arch_spin_lock(lock);
14} 14}
15 15
16struct pv_lock_ops pv_lock_ops = { 16struct pv_lock_ops pv_lock_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..2bbde6078143 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h> 32#include <linux/crash_dump.h>
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include <linux/bitops.h> 34#include <linux/bitmap.h>
35#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
36#include <linux/pci.h> 36#include <linux/pci.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -211,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
211 212
212 spin_lock_irqsave(&tbl->it_lock, flags); 213 spin_lock_irqsave(&tbl->it_lock, flags);
213 214
214 iommu_area_reserve(tbl->it_map, index, npages); 215 bitmap_set(tbl->it_map, index, npages);
215 216
216 spin_unlock_irqrestore(&tbl->it_lock, flags); 217 spin_unlock_irqrestore(&tbl->it_lock, flags);
217} 218}
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -305,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
305 303
306 spin_lock_irqsave(&tbl->it_lock, flags); 304 spin_lock_irqsave(&tbl->it_lock, flags);
307 305
308 iommu_area_free(tbl->it_map, entry, npages); 306 bitmap_clear(tbl->it_map, entry, npages);
309 307
310 spin_unlock_irqrestore(&tbl->it_lock, flags); 308 spin_unlock_irqrestore(&tbl->it_lock, flags);
311} 309}
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 64b838eac18c..75e14e21f61a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
11#include <asm/gart.h> 11#include <asm/gart.h>
12#include <asm/calgary.h> 12#include <asm/calgary.h>
13#include <asm/amd_iommu.h> 13#include <asm/amd_iommu.h>
14#include <asm/x86_init.h>
14 15
15static int forbid_dac __read_mostly; 16static int forbid_dac __read_mostly;
16 17
17struct dma_map_ops *dma_ops; 18struct dma_map_ops *dma_ops = &nommu_dma_ops;
18EXPORT_SYMBOL(dma_ops); 19EXPORT_SYMBOL(dma_ops);
19 20
20static int iommu_sac_force __read_mostly; 21static int iommu_sac_force __read_mostly;
@@ -35,22 +36,17 @@ int iommu_detected __read_mostly = 0;
35 36
36/* 37/*
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 38 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
38 * If this variable is 1, IOMMU implementations do no DMA ranslation for 39 * If this variable is 1, IOMMU implementations do no DMA translation for
39 * devices and allow every device to access to whole physical memory. This is 40 * devices and allow every device to access to whole physical memory. This is
40 * useful if a user want to use an IOMMU only for KVM device assignment to 41 * useful if a user want to use an IOMMU only for KVM device assignment to
41 * guests and not for driver dma translation. 42 * guests and not for driver dma translation.
42 */ 43 */
43int iommu_pass_through __read_mostly; 44int iommu_pass_through __read_mostly;
44 45
45dma_addr_t bad_dma_address __read_mostly = 0; 46/* Dummy device used for NULL arguments (normally ISA). */
46EXPORT_SYMBOL(bad_dma_address);
47
48/* Dummy device used for NULL arguments (normally ISA). Better would
49 be probably a smaller DMA mask, but this is bug-to-bug compatible
50 to older i386. */
51struct device x86_dma_fallback_dev = { 47struct device x86_dma_fallback_dev = {
52 .init_name = "fallback device", 48 .init_name = "fallback device",
53 .coherent_dma_mask = DMA_BIT_MASK(32), 49 .coherent_dma_mask = ISA_DMA_BIT_MASK,
54 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, 50 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
55}; 51};
56EXPORT_SYMBOL(x86_dma_fallback_dev); 52EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -128,19 +124,18 @@ void __init pci_iommu_alloc(void)
128 /* free the range so iommu could get some range less than 4G */ 124 /* free the range so iommu could get some range less than 4G */
129 dma32_free_bootmem(); 125 dma32_free_bootmem();
130#endif 126#endif
127 if (pci_swiotlb_detect())
128 goto out;
131 129
132 /*
133 * The order of these functions is important for
134 * fall-back/fail-over reasons
135 */
136 gart_iommu_hole_init(); 130 gart_iommu_hole_init();
137 131
138 detect_calgary(); 132 detect_calgary();
139 133
140 detect_intel_iommu(); 134 detect_intel_iommu();
141 135
136 /* needs to be called after gart_iommu_hole_init */
142 amd_iommu_detect(); 137 amd_iommu_detect();
143 138out:
144 pci_swiotlb_init(); 139 pci_swiotlb_init();
145} 140}
146 141
@@ -216,7 +211,7 @@ static __init int iommu_setup(char *p)
216 if (!strncmp(p, "allowdac", 8)) 211 if (!strncmp(p, "allowdac", 8))
217 forbid_dac = 0; 212 forbid_dac = 0;
218 if (!strncmp(p, "nodac", 5)) 213 if (!strncmp(p, "nodac", 5))
219 forbid_dac = -1; 214 forbid_dac = 1;
220 if (!strncmp(p, "usedac", 6)) { 215 if (!strncmp(p, "usedac", 6)) {
221 forbid_dac = -1; 216 forbid_dac = -1;
222 return 1; 217 return 1;
@@ -291,27 +286,19 @@ static int __init pci_iommu_init(void)
291#ifdef CONFIG_PCI 286#ifdef CONFIG_PCI
292 dma_debug_add_bus(&pci_bus_type); 287 dma_debug_add_bus(&pci_bus_type);
293#endif 288#endif
289 x86_init.iommu.iommu_init();
294 290
295 calgary_iommu_init(); 291 if (swiotlb) {
296 292 printk(KERN_INFO "PCI-DMA: "
297 intel_iommu_init(); 293 "Using software bounce buffering for IO (SWIOTLB)\n");
294 swiotlb_print_info();
295 } else
296 swiotlb_free();
298 297
299 amd_iommu_init();
300
301 gart_iommu_init();
302
303 no_iommu_init();
304 return 0; 298 return 0;
305} 299}
306
307void pci_iommu_shutdown(void)
308{
309 gart_iommu_shutdown();
310
311 amd_iommu_shutdown();
312}
313/* Must execute after PCI subsystem */ 300/* Must execute after PCI subsystem */
314fs_initcall(pci_iommu_init); 301rootfs_initcall(pci_iommu_init);
315 302
316#ifdef CONFIG_PCI 303#ifdef CONFIG_PCI
317/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ 304/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 98a827ee9ed7..34de53b46f87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -16,13 +16,14 @@
16#include <linux/agp_backend.h> 16#include <linux/agp_backend.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/sched.h>
19#include <linux/string.h> 20#include <linux/string.h>
20#include <linux/spinlock.h> 21#include <linux/spinlock.h>
21#include <linux/pci.h> 22#include <linux/pci.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/topology.h> 24#include <linux/topology.h>
24#include <linux/interrupt.h> 25#include <linux/interrupt.h>
25#include <linux/bitops.h> 26#include <linux/bitmap.h>
26#include <linux/kdebug.h> 27#include <linux/kdebug.h>
27#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
@@ -38,6 +39,7 @@
38#include <asm/swiotlb.h> 39#include <asm/swiotlb.h>
39#include <asm/dma.h> 40#include <asm/dma.h>
40#include <asm/k8.h> 41#include <asm/k8.h>
42#include <asm/x86_init.h>
41 43
42static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 44static unsigned long iommu_bus_base; /* GART remapping area (physical) */
43static unsigned long iommu_size; /* size of remapping area bytes */ 45static unsigned long iommu_size; /* size of remapping area bytes */
@@ -45,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
45 47
46static u32 *iommu_gatt_base; /* Remapping table */ 48static u32 *iommu_gatt_base; /* Remapping table */
47 49
50static dma_addr_t bad_dma_addr;
51
48/* 52/*
49 * If this is disabled the IOMMU will use an optimized flushing strategy 53 * If this is disabled the IOMMU will use an optimized flushing strategy
50 * of only flushing when an mapping is reused. With it true the GART is 54 * of only flushing when an mapping is reused. With it true the GART is
@@ -91,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
91 95
92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 96 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
93 PAGE_SIZE) >> PAGE_SHIFT; 97 PAGE_SIZE) >> PAGE_SHIFT;
94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 98 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
95 PAGE_SIZE) >> PAGE_SHIFT; 99 PAGE_SIZE) >> PAGE_SHIFT;
96 100
97 spin_lock_irqsave(&iommu_bitmap_lock, flags); 101 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -122,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
122 unsigned long flags; 126 unsigned long flags;
123 127
124 spin_lock_irqsave(&iommu_bitmap_lock, flags); 128 spin_lock_irqsave(&iommu_bitmap_lock, flags);
125 iommu_area_free(iommu_gart_bitmap, offset, size); 129 bitmap_clear(iommu_gart_bitmap, offset, size);
126 if (offset >= next_bit) 130 if (offset >= next_bit)
127 next_bit = offset + size; 131 next_bit = offset + size;
128 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 132 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -215,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
215 if (panic_on_overflow) 219 if (panic_on_overflow)
216 panic("dma_map_area overflow %lu bytes\n", size); 220 panic("dma_map_area overflow %lu bytes\n", size);
217 iommu_full(dev, size, dir); 221 iommu_full(dev, size, dir);
218 return bad_dma_address; 222 return bad_dma_addr;
219 } 223 }
220 224
221 for (i = 0; i < npages; i++) { 225 for (i = 0; i < npages; i++) {
@@ -293,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
293 int i; 297 int i;
294 298
295#ifdef CONFIG_IOMMU_DEBUG 299#ifdef CONFIG_IOMMU_DEBUG
296 printk(KERN_DEBUG "dma_map_sg overflow\n"); 300 pr_debug("dma_map_sg overflow\n");
297#endif 301#endif
298 302
299 for_each_sg(sg, s, nents, i) { 303 for_each_sg(sg, s, nents, i) {
@@ -301,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
301 305
302 if (nonforced_iommu(dev, addr, s->length)) { 306 if (nonforced_iommu(dev, addr, s->length)) {
303 addr = dma_map_area(dev, addr, s->length, dir, 0); 307 addr = dma_map_area(dev, addr, s->length, dir, 0);
304 if (addr == bad_dma_address) { 308 if (addr == bad_dma_addr) {
305 if (i > 0) 309 if (i > 0)
306 gart_unmap_sg(dev, sg, i, dir, NULL); 310 gart_unmap_sg(dev, sg, i, dir, NULL);
307 nents = 0; 311 nents = 0;
@@ -388,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
388 if (!dev) 392 if (!dev)
389 dev = &x86_dma_fallback_dev; 393 dev = &x86_dma_fallback_dev;
390 394
391 out = 0; 395 out = 0;
392 start = 0; 396 start = 0;
393 start_sg = sgmap = sg; 397 start_sg = sg;
394 seg_size = 0; 398 sgmap = sg;
395 max_seg_size = dma_get_max_seg_size(dev); 399 seg_size = 0;
396 ps = NULL; /* shut up gcc */ 400 max_seg_size = dma_get_max_seg_size(dev);
401 ps = NULL; /* shut up gcc */
402
397 for_each_sg(sg, s, nents, i) { 403 for_each_sg(sg, s, nents, i) {
398 dma_addr_t addr = sg_phys(s); 404 dma_addr_t addr = sg_phys(s);
399 405
@@ -416,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
416 sgmap, pages, need) < 0) 422 sgmap, pages, need) < 0)
417 goto error; 423 goto error;
418 out++; 424 out++;
419 seg_size = 0; 425
420 sgmap = sg_next(sgmap); 426 seg_size = 0;
421 pages = 0; 427 sgmap = sg_next(sgmap);
422 start = i; 428 pages = 0;
423 start_sg = s; 429 start = i;
430 start_sg = s;
424 } 431 }
425 } 432 }
426 433
@@ -454,7 +461,7 @@ error:
454 461
455 iommu_full(dev, pages << PAGE_SHIFT, dir); 462 iommu_full(dev, pages << PAGE_SHIFT, dir);
456 for_each_sg(sg, s, nents, i) 463 for_each_sg(sg, s, nents, i)
457 s->dma_address = bad_dma_address; 464 s->dma_address = bad_dma_addr;
458 return 0; 465 return 0;
459} 466}
460 467
@@ -478,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
478 DMA_BIDIRECTIONAL, align_mask); 485 DMA_BIDIRECTIONAL, align_mask);
479 486
480 flush_gart(); 487 flush_gart();
481 if (paddr != bad_dma_address) { 488 if (paddr != bad_dma_addr) {
482 *dma_addr = paddr; 489 *dma_addr = paddr;
483 return page_address(page); 490 return page_address(page);
484 } 491 }
@@ -498,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
498 free_pages((unsigned long)vaddr, get_order(size)); 505 free_pages((unsigned long)vaddr, get_order(size));
499} 506}
500 507
508static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
509{
510 return (dma_addr == bad_dma_addr);
511}
512
501static int no_agp; 513static int no_agp;
502 514
503static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 515static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -514,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
514 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 526 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
515 527
516 if (iommu_size < 64*1024*1024) { 528 if (iommu_size < 64*1024*1024) {
517 printk(KERN_WARNING 529 pr_warning(
518 "PCI-DMA: Warning: Small IOMMU %luMB." 530 "PCI-DMA: Warning: Small IOMMU %luMB."
519 " Consider increasing the AGP aperture in BIOS\n", 531 " Consider increasing the AGP aperture in BIOS\n",
520 iommu_size >> 20); 532 iommu_size >> 20);
@@ -569,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
569 aperture_alloc = aper_alloc; 581 aperture_alloc = aper_alloc;
570} 582}
571 583
572static int gart_resume(struct sys_device *dev) 584static void gart_fixup_northbridges(struct sys_device *dev)
573{ 585{
574 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 586 int i;
575 587
576 if (fix_up_north_bridges) { 588 if (!fix_up_north_bridges)
577 int i; 589 return;
578 590
579 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 591 pr_info("PCI-DMA: Restoring GART aperture settings\n");
580 592
581 for (i = 0; i < num_k8_northbridges; i++) { 593 for (i = 0; i < num_k8_northbridges; i++) {
582 struct pci_dev *dev = k8_northbridges[i]; 594 struct pci_dev *dev = k8_northbridges[i];
583 595
584 /* 596 /*
585 * Don't enable translations just yet. That is the next 597 * Don't enable translations just yet. That is the next
586 * step. Restore the pre-suspend aperture settings. 598 * step. Restore the pre-suspend aperture settings.
587 */ 599 */
588 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 600 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
589 aperture_order << 1); 601 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
590 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
591 aperture_alloc >> 25);
592 }
593 } 602 }
603}
604
605static int gart_resume(struct sys_device *dev)
606{
607 pr_info("PCI-DMA: Resuming GART IOMMU\n");
608
609 gart_fixup_northbridges(dev);
594 610
595 enable_gart_translations(); 611 enable_gart_translations();
596 612
@@ -603,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
603} 619}
604 620
605static struct sysdev_class gart_sysdev_class = { 621static struct sysdev_class gart_sysdev_class = {
606 .name = "gart", 622 .name = "gart",
607 .suspend = gart_suspend, 623 .suspend = gart_suspend,
608 .resume = gart_resume, 624 .resume = gart_resume,
609 625
610}; 626};
611 627
612static struct sys_device device_gart = { 628static struct sys_device device_gart = {
613 .id = 0, 629 .cls = &gart_sysdev_class,
614 .cls = &gart_sysdev_class,
615}; 630};
616 631
617/* 632/*
@@ -626,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
626 void *gatt; 641 void *gatt;
627 int i, error; 642 int i, error;
628 643
629 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 644 pr_info("PCI-DMA: Disabling AGP.\n");
645
630 aper_size = aper_base = info->aper_size = 0; 646 aper_size = aper_base = info->aper_size = 0;
631 dev = NULL; 647 dev = NULL;
632 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
@@ -644,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
644 } 660 }
645 if (!aper_base) 661 if (!aper_base)
646 goto nommu; 662 goto nommu;
663
647 info->aper_base = aper_base; 664 info->aper_base = aper_base;
648 info->aper_size = aper_size >> 20; 665 info->aper_size = aper_size >> 20;
649 666
@@ -666,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
666 683
667 flush_gart(); 684 flush_gart();
668 685
669 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 686 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
670 aper_base, aper_size>>10); 687 aper_base, aper_size>>10);
671 688
672 return 0; 689 return 0;
673 690
674 nommu: 691 nommu:
675 /* Should not happen anymore */ 692 /* Should not happen anymore */
676 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 693 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
677 "falling back to iommu=soft.\n"); 694 "falling back to iommu=soft.\n");
678 return -1; 695 return -1;
679} 696}
@@ -685,14 +702,16 @@ static struct dma_map_ops gart_dma_ops = {
685 .unmap_page = gart_unmap_page, 702 .unmap_page = gart_unmap_page,
686 .alloc_coherent = gart_alloc_coherent, 703 .alloc_coherent = gart_alloc_coherent,
687 .free_coherent = gart_free_coherent, 704 .free_coherent = gart_free_coherent,
705 .mapping_error = gart_mapping_error,
688}; 706};
689 707
690void gart_iommu_shutdown(void) 708static void gart_iommu_shutdown(void)
691{ 709{
692 struct pci_dev *dev; 710 struct pci_dev *dev;
693 int i; 711 int i;
694 712
695 if (no_agp && (dma_ops != &gart_dma_ops)) 713 /* don't shutdown it if there is AGP installed */
714 if (!no_agp)
696 return; 715 return;
697 716
698 for (i = 0; i < num_k8_northbridges; i++) { 717 for (i = 0; i < num_k8_northbridges; i++) {
@@ -707,7 +726,7 @@ void gart_iommu_shutdown(void)
707 } 726 }
708} 727}
709 728
710void __init gart_iommu_init(void) 729int __init gart_iommu_init(void)
711{ 730{
712 struct agp_kern_info info; 731 struct agp_kern_info info;
713 unsigned long iommu_start; 732 unsigned long iommu_start;
@@ -717,7 +736,7 @@ void __init gart_iommu_init(void)
717 long i; 736 long i;
718 737
719 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 738 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
720 return; 739 return 0;
721 740
722#ifndef CONFIG_AGP_AMD64 741#ifndef CONFIG_AGP_AMD64
723 no_agp = 1; 742 no_agp = 1;
@@ -729,35 +748,28 @@ void __init gart_iommu_init(void)
729 (agp_copy_info(agp_bridge, &info) < 0); 748 (agp_copy_info(agp_bridge, &info) < 0);
730#endif 749#endif
731 750
732 if (swiotlb)
733 return;
734
735 /* Did we detect a different HW IOMMU? */
736 if (iommu_detected && !gart_iommu_aperture)
737 return;
738
739 if (no_iommu || 751 if (no_iommu ||
740 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 752 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
741 !gart_iommu_aperture || 753 !gart_iommu_aperture ||
742 (no_agp && init_k8_gatt(&info) < 0)) { 754 (no_agp && init_k8_gatt(&info) < 0)) {
743 if (max_pfn > MAX_DMA32_PFN) { 755 if (max_pfn > MAX_DMA32_PFN) {
744 printk(KERN_WARNING "More than 4GB of memory " 756 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
745 "but GART IOMMU not available.\n"); 757 pr_warning("falling back to iommu=soft.\n");
746 printk(KERN_WARNING "falling back to iommu=soft.\n");
747 } 758 }
748 return; 759 return 0;
749 } 760 }
750 761
751 /* need to map that range */ 762 /* need to map that range */
752 aper_size = info.aper_size << 20; 763 aper_size = info.aper_size << 20;
753 aper_base = info.aper_base; 764 aper_base = info.aper_base;
754 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 765 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
766
755 if (end_pfn > max_low_pfn_mapped) { 767 if (end_pfn > max_low_pfn_mapped) {
756 start_pfn = (aper_base>>PAGE_SHIFT); 768 start_pfn = (aper_base>>PAGE_SHIFT);
757 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 769 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
758 } 770 }
759 771
760 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 772 pr_info("PCI-DMA: using GART IOMMU.\n");
761 iommu_size = check_iommu_size(info.aper_base, aper_size); 773 iommu_size = check_iommu_size(info.aper_base, aper_size);
762 iommu_pages = iommu_size >> PAGE_SHIFT; 774 iommu_pages = iommu_size >> PAGE_SHIFT;
763 775
@@ -772,8 +784,7 @@ void __init gart_iommu_init(void)
772 784
773 ret = dma_debug_resize_entries(iommu_pages); 785 ret = dma_debug_resize_entries(iommu_pages);
774 if (ret) 786 if (ret)
775 printk(KERN_DEBUG 787 pr_debug("PCI-DMA: Cannot trace all the entries\n");
776 "PCI-DMA: Cannot trace all the entries\n");
777 } 788 }
778#endif 789#endif
779 790
@@ -781,17 +792,16 @@ void __init gart_iommu_init(void)
781 * Out of IOMMU space handling. 792 * Out of IOMMU space handling.
782 * Reserve some invalid pages at the beginning of the GART. 793 * Reserve some invalid pages at the beginning of the GART.
783 */ 794 */
784 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 795 bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
785 796
786 agp_memory_reserved = iommu_size; 797 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
787 printk(KERN_INFO
788 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
789 iommu_size >> 20); 798 iommu_size >> 20);
790 799
791 iommu_start = aper_size - iommu_size; 800 agp_memory_reserved = iommu_size;
792 iommu_bus_base = info.aper_base + iommu_start; 801 iommu_start = aper_size - iommu_size;
793 bad_dma_address = iommu_bus_base; 802 iommu_bus_base = info.aper_base + iommu_start;
794 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 803 bad_dma_addr = iommu_bus_base;
804 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
795 805
796 /* 806 /*
797 * Unmap the IOMMU part of the GART. The alias of the page is 807 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -813,7 +823,7 @@ void __init gart_iommu_init(void)
813 * the pages as Not-Present: 823 * the pages as Not-Present:
814 */ 824 */
815 wbinvd(); 825 wbinvd();
816 826
817 /* 827 /*
818 * Now all caches are flushed and we can safely enable 828 * Now all caches are flushed and we can safely enable
819 * GART hardware. Doing it early leaves the possibility 829 * GART hardware. Doing it early leaves the possibility
@@ -837,6 +847,10 @@ void __init gart_iommu_init(void)
837 847
838 flush_gart(); 848 flush_gart();
839 dma_ops = &gart_dma_ops; 849 dma_ops = &gart_dma_ops;
850 x86_platform.iommu_shutdown = gart_iommu_shutdown;
851 swiotlb = 0;
852
853 return 0;
840} 854}
841 855
842void __init gart_parse_options(char *p) 856void __init gart_parse_options(char *p)
@@ -855,7 +869,7 @@ void __init gart_parse_options(char *p)
855#endif 869#endif
856 if (isdigit(*p) && get_option(&p, &arg)) 870 if (isdigit(*p) && get_option(&p, &arg))
857 iommu_size = arg; 871 iommu_size = arg;
858 if (!strncmp(p, "fullflush", 8)) 872 if (!strncmp(p, "fullflush", 9))
859 iommu_fullflush = 1; 873 iommu_fullflush = 1;
860 if (!strncmp(p, "nofullflush", 11)) 874 if (!strncmp(p, "nofullflush", 11))
861 iommu_fullflush = 0; 875 iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..22be12b60a8f 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
33 dma_addr_t bus = page_to_phys(page) + offset; 33 dma_addr_t bus = page_to_phys(page) + offset;
34 WARN_ON(size == 0); 34 WARN_ON(size == 0);
35 if (!check_addr("map_single", dev, bus, size)) 35 if (!check_addr("map_single", dev, bus, size))
36 return bad_dma_address; 36 return DMA_ERROR_CODE;
37 flush_write_buffers(); 37 flush_write_buffers();
38 return bus; 38 return bus;
39} 39}
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
103 .sync_sg_for_device = nommu_sync_sg_for_device, 103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1, 104 .is_phys = 1,
105}; 105};
106
107void __init no_iommu_init(void)
108{
109 if (dma_ops)
110 return;
111
112 force_iommu = 0; /* no HW IOMMU */
113 dma_ops = &nommu_dma_ops;
114}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..7d2829dde20e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
42 .dma_supported = NULL, 42 .dma_supported = NULL,
43}; 43};
44 44
45void __init pci_swiotlb_init(void) 45/*
46 * pci_swiotlb_detect - set swiotlb to 1 if necessary
47 *
48 * This returns non-zero if we are forced to use swiotlb (by the boot
49 * option).
50 */
51int __init pci_swiotlb_detect(void)
46{ 52{
53 int use_swiotlb = swiotlb | swiotlb_force;
54
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 55 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) 57 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
50 swiotlb = 1; 58 swiotlb = 1;
51#endif 59#endif
52 if (swiotlb_force) 60 if (swiotlb_force)
53 swiotlb = 1; 61 swiotlb = 1;
62
63 return use_swiotlb;
64}
65
66void __init pci_swiotlb_init(void)
67{
54 if (swiotlb) { 68 if (swiotlb) {
55 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); 69 swiotlb_init(0);
56 swiotlb_init();
57 dma_ops = &swiotlb_dma_ops; 70 dma_ops = &swiotlb_dma_ops;
58 } 71 }
59} 72}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..c6ee241c8a98 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,11 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h>
14#include <linux/utsname.h>
12#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 17#include <asm/system.h>
14#include <asm/apic.h> 18#include <asm/apic.h>
15#include <asm/syscalls.h> 19#include <asm/syscalls.h>
@@ -17,6 +21,7 @@
17#include <asm/uaccess.h> 21#include <asm/uaccess.h>
18#include <asm/i387.h> 22#include <asm/i387.h>
19#include <asm/ds.h> 23#include <asm/ds.h>
24#include <asm/debugreg.h>
20 25
21unsigned long idle_halt; 26unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 27EXPORT_SYMBOL(idle_halt);
@@ -87,6 +92,25 @@ void exit_thread(void)
87 } 92 }
88} 93}
89 94
95void show_regs_common(void)
96{
97 const char *board, *product;
98
99 board = dmi_get_system_info(DMI_BOARD_NAME);
100 if (!board)
101 board = "";
102 product = dmi_get_system_info(DMI_PRODUCT_NAME);
103 if (!product)
104 product = "";
105
106 printk(KERN_CONT "\n");
107 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
108 current->pid, current->comm, print_tainted(),
109 init_utsname()->release,
110 (int)strcspn(init_utsname()->version, " "),
111 init_utsname()->version, board, product);
112}
113
90void flush_thread(void) 114void flush_thread(void)
91{ 115{
92 struct task_struct *tsk = current; 116 struct task_struct *tsk = current;
@@ -103,14 +127,7 @@ void flush_thread(void)
103 } 127 }
104#endif 128#endif
105 129
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 130 flush_ptrace_hw_breakpoint(tsk);
107
108 tsk->thread.debugreg0 = 0;
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 131 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 132 /*
116 * Forget coprocessor state.. 133 * Forget coprocessor state..
@@ -192,16 +209,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 209 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 210 update_debugctlmsr(next->debugctlmsr);
194 211
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 212 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 213 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 214 /* prev and next are different */
@@ -224,6 +231,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
224 */ 231 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 232 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 } 233 }
234 propagate_user_return_notify(prev_p, next_p);
227} 235}
228 236
229int sys_fork(struct pt_regs *regs) 237int sys_fork(struct pt_regs *regs)
@@ -247,6 +255,76 @@ int sys_vfork(struct pt_regs *regs)
247 NULL, NULL); 255 NULL, NULL);
248} 256}
249 257
258long
259sys_clone(unsigned long clone_flags, unsigned long newsp,
260 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
261{
262 if (!newsp)
263 newsp = regs->sp;
264 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
265}
266
267/*
268 * This gets run with %si containing the
269 * function to call, and %di containing
270 * the "args".
271 */
272extern void kernel_thread_helper(void);
273
274/*
275 * Create a kernel thread
276 */
277int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
278{
279 struct pt_regs regs;
280
281 memset(&regs, 0, sizeof(regs));
282
283 regs.si = (unsigned long) fn;
284 regs.di = (unsigned long) arg;
285
286#ifdef CONFIG_X86_32
287 regs.ds = __USER_DS;
288 regs.es = __USER_DS;
289 regs.fs = __KERNEL_PERCPU;
290 regs.gs = __KERNEL_STACK_CANARY;
291#endif
292
293 regs.orig_ax = -1;
294 regs.ip = (unsigned long) kernel_thread_helper;
295 regs.cs = __KERNEL_CS | get_kernel_rpl();
296 regs.flags = X86_EFLAGS_IF | 0x2;
297
298 /* Ok, create the new process.. */
299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
300}
301EXPORT_SYMBOL(kernel_thread);
302
303/*
304 * sys_execve() executes a new program.
305 */
306long sys_execve(char __user *name, char __user * __user *argv,
307 char __user * __user *envp, struct pt_regs *regs)
308{
309 long error;
310 char *filename;
311
312 filename = getname(name);
313 error = PTR_ERR(filename);
314 if (IS_ERR(filename))
315 return error;
316 error = do_execve(filename, argv, envp, regs);
317
318#ifdef CONFIG_X86_32
319 if (error == 0) {
320 /* Make sure we don't return using sysenter.. */
321 set_thread_flag(TIF_IRET);
322 }
323#endif
324
325 putname(filename);
326 return error;
327}
250 328
251/* 329/*
252 * Idle related variables and functions 330 * Idle related variables and functions
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..37ad1e046aae 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/user.h> 24#include <linux/user.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/utsname.h>
27#include <linux/delay.h> 26#include <linux/delay.h>
28#include <linux/reboot.h> 27#include <linux/reboot.h>
29#include <linux/init.h> 28#include <linux/init.h>
@@ -35,7 +34,6 @@
35#include <linux/tick.h> 34#include <linux/tick.h>
36#include <linux/percpu.h> 35#include <linux/percpu.h>
37#include <linux/prctl.h> 36#include <linux/prctl.h>
38#include <linux/dmi.h>
39#include <linux/ftrace.h> 37#include <linux/ftrace.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/io.h> 39#include <linux/io.h>
@@ -58,6 +56,7 @@
58#include <asm/idle.h> 56#include <asm/idle.h>
59#include <asm/syscalls.h> 57#include <asm/syscalls.h>
60#include <asm/ds.h> 58#include <asm/ds.h>
59#include <asm/debugreg.h>
61 60
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 62
@@ -127,39 +126,29 @@ void __show_regs(struct pt_regs *regs, int all)
127 unsigned long d0, d1, d2, d3, d6, d7; 126 unsigned long d0, d1, d2, d3, d6, d7;
128 unsigned long sp; 127 unsigned long sp;
129 unsigned short ss, gs; 128 unsigned short ss, gs;
130 const char *board;
131 129
132 if (user_mode_vm(regs)) { 130 if (user_mode_vm(regs)) {
133 sp = regs->sp; 131 sp = regs->sp;
134 ss = regs->ss & 0xffff; 132 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 133 gs = get_user_gs(regs);
136 } else { 134 } else {
137 sp = (unsigned long) (&regs->sp); 135 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 136 savesegment(ss, ss);
139 savesegment(gs, gs); 137 savesegment(gs, gs);
140 } 138 }
141 139
142 printk("\n"); 140 show_regs_common();
143 141
144 board = dmi_get_system_info(DMI_PRODUCT_NAME); 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
145 if (!board)
146 board = "";
147 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
148 task_pid_nr(current), current->comm,
149 print_tainted(), init_utsname()->release,
150 (int)strcspn(init_utsname()->version, " "),
151 init_utsname()->version, board);
152
153 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
154 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
155 smp_processor_id()); 144 smp_processor_id());
156 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
157 146
158 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
159 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
160 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
161 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
162 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
163 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
164 153
165 if (!all) 154 if (!all)
@@ -169,61 +158,28 @@ void __show_regs(struct pt_regs *regs, int all)
169 cr2 = read_cr2(); 158 cr2 = read_cr2();
170 cr3 = read_cr3(); 159 cr3 = read_cr3();
171 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
172 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
173 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
174 163
175 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
176 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
177 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
178 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
179 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
180 d0, d1, d2, d3); 169 d0, d1, d2, d3);
181 170
182 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
183 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
184 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
185 d6, d7); 174 d6, d7);
186} 175}
187 176
188void show_regs(struct pt_regs *regs) 177void show_regs(struct pt_regs *regs)
189{ 178{
190 __show_regs(regs, 1); 179 show_registers(regs);
191 show_trace(NULL, regs, &regs->sp, regs->bp); 180 show_trace(NULL, regs, &regs->sp, regs->bp);
192} 181}
193 182
194/*
195 * This gets run with %bx containing the
196 * function to call, and %dx containing
197 * the "args".
198 */
199extern void kernel_thread_helper(void);
200
201/*
202 * Create a kernel thread
203 */
204int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
205{
206 struct pt_regs regs;
207
208 memset(&regs, 0, sizeof(regs));
209
210 regs.bx = (unsigned long) fn;
211 regs.dx = (unsigned long) arg;
212
213 regs.ds = __USER_DS;
214 regs.es = __USER_DS;
215 regs.fs = __KERNEL_PERCPU;
216 regs.gs = __KERNEL_STACK_CANARY;
217 regs.orig_ax = -1;
218 regs.ip = (unsigned long) kernel_thread_helper;
219 regs.cs = __KERNEL_CS | get_kernel_rpl();
220 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
221
222 /* Ok, create the new process.. */
223 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
224}
225EXPORT_SYMBOL(kernel_thread);
226
227void release_thread(struct task_struct *dead_task) 183void release_thread(struct task_struct *dead_task)
228{ 184{
229 BUG_ON(dead_task->mm); 185 BUG_ON(dead_task->mm);
@@ -259,7 +215,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 215
260 task_user_gs(p) = get_user_gs(regs); 216 task_user_gs(p) = get_user_gs(regs);
261 217
218 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 219 tsk = current;
220 err = -ENOMEM;
221
222 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
223
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 224 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 225 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 226 IO_BITMAP_BYTES, GFP_KERNEL);
@@ -430,46 +391,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
430 return prev_p; 391 return prev_p;
431} 392}
432 393
433int sys_clone(struct pt_regs *regs)
434{
435 unsigned long clone_flags;
436 unsigned long newsp;
437 int __user *parent_tidptr, *child_tidptr;
438
439 clone_flags = regs->bx;
440 newsp = regs->cx;
441 parent_tidptr = (int __user *)regs->dx;
442 child_tidptr = (int __user *)regs->di;
443 if (!newsp)
444 newsp = regs->sp;
445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
446}
447
448/*
449 * sys_execve() executes a new program.
450 */
451int sys_execve(struct pt_regs *regs)
452{
453 int error;
454 char *filename;
455
456 filename = getname((char __user *) regs->bx);
457 error = PTR_ERR(filename);
458 if (IS_ERR(filename))
459 goto out;
460 error = do_execve(filename,
461 (char __user * __user *) regs->cx,
462 (char __user * __user *) regs->dx,
463 regs);
464 if (error == 0) {
465 /* Make sure we don't return using sysenter.. */
466 set_thread_flag(TIF_IRET);
467 }
468 putname(filename);
469out:
470 return error;
471}
472
473#define top_esp (THREAD_SIZE - sizeof(unsigned long)) 394#define top_esp (THREAD_SIZE - sizeof(unsigned long))
474#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 395#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
475 396
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b683170..f9e033150cdf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/user.h> 27#include <linux/user.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/utsname.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/module.h> 30#include <linux/module.h>
32#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
38#include <linux/uaccess.h> 37#include <linux/uaccess.h>
39#include <linux/io.h> 38#include <linux/io.h>
40#include <linux/ftrace.h> 39#include <linux/ftrace.h>
41#include <linux/dmi.h>
42 40
43#include <asm/pgtable.h> 41#include <asm/pgtable.h>
44#include <asm/system.h> 42#include <asm/system.h>
@@ -52,14 +50,13 @@
52#include <asm/idle.h> 50#include <asm/idle.h>
53#include <asm/syscalls.h> 51#include <asm/syscalls.h>
54#include <asm/ds.h> 52#include <asm/ds.h>
53#include <asm/debugreg.h>
55 54
56asmlinkage extern void ret_from_fork(void); 55asmlinkage extern void ret_from_fork(void);
57 56
58DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
59static DEFINE_PER_CPU(unsigned char, is_idle); 58static DEFINE_PER_CPU(unsigned char, is_idle);
60 59
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 60static ATOMIC_NOTIFIER_HEAD(idle_notifier);
64 61
65void idle_notifier_register(struct notifier_block *n) 62void idle_notifier_register(struct notifier_block *n)
@@ -162,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all)
162 unsigned long d0, d1, d2, d3, d6, d7; 159 unsigned long d0, d1, d2, d3, d6, d7;
163 unsigned int fsindex, gsindex; 160 unsigned int fsindex, gsindex;
164 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
165 const char *board; 162
166 163 show_regs_common();
167 printk("\n"); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
168 print_modules();
169 board = dmi_get_system_info(DMI_PRODUCT_NAME);
170 if (!board)
171 board = "";
172 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
173 current->pid, current->comm, print_tainted(),
174 init_utsname()->release,
175 (int)strcspn(init_utsname()->version, " "),
176 init_utsname()->version, board);
177 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
178 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
179 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
180 regs->sp, regs->flags); 167 regs->sp, regs->flags);
181 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
182 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
183 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
184 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
185 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
186 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
187 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
188 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
189 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
190 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
191 178
192 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -207,27 +194,26 @@ void __show_regs(struct pt_regs *regs, int all)
207 cr3 = read_cr3(); 194 cr3 = read_cr3();
208 cr4 = read_cr4(); 195 cr4 = read_cr4();
209 196
210 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
211 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
212 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
213 es, cr0); 200 es, cr0);
214 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
215 cr4); 202 cr4);
216 203
217 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
218 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
219 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
220 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
221 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
222 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
223 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
224 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
225} 212}
226 213
227void show_regs(struct pt_regs *regs) 214void show_regs(struct pt_regs *regs)
228{ 215{
229 printk(KERN_INFO "CPU %d:", smp_processor_id()); 216 show_registers(regs);
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 217 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 218}
233 219
@@ -285,8 +271,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
285 *childregs = *regs; 271 *childregs = *regs;
286 272
287 childregs->ax = 0; 273 childregs->ax = 0;
288 childregs->sp = sp; 274 if (user_mode(regs))
289 if (sp == ~0UL) 275 childregs->sp = sp;
276 else
290 childregs->sp = (unsigned long)childregs; 277 childregs->sp = (unsigned long)childregs;
291 278
292 p->thread.sp = (unsigned long) childregs; 279 p->thread.sp = (unsigned long) childregs;
@@ -297,12 +284,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
297 284
298 p->thread.fs = me->thread.fs; 285 p->thread.fs = me->thread.fs;
299 p->thread.gs = me->thread.gs; 286 p->thread.gs = me->thread.gs;
287 p->thread.io_bitmap_ptr = NULL;
300 288
301 savesegment(gs, p->thread.gsindex); 289 savesegment(gs, p->thread.gsindex);
302 savesegment(fs, p->thread.fsindex); 290 savesegment(fs, p->thread.fsindex);
303 savesegment(es, p->thread.es); 291 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 292 savesegment(ds, p->thread.ds);
305 293
294 err = -ENOMEM;
295 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
296
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 297 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 298 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 299 if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +332,46 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 332 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 333 p->thread.io_bitmap_max = 0;
343 } 334 }
335
344 return err; 336 return err;
345} 337}
346 338
347void 339static void
348start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 340start_thread_common(struct pt_regs *regs, unsigned long new_ip,
341 unsigned long new_sp,
342 unsigned int _cs, unsigned int _ss, unsigned int _ds)
349{ 343{
350 loadsegment(fs, 0); 344 loadsegment(fs, 0);
351 loadsegment(es, 0); 345 loadsegment(es, _ds);
352 loadsegment(ds, 0); 346 loadsegment(ds, _ds);
353 load_gs_index(0); 347 load_gs_index(0);
354 regs->ip = new_ip; 348 regs->ip = new_ip;
355 regs->sp = new_sp; 349 regs->sp = new_sp;
356 percpu_write(old_rsp, new_sp); 350 percpu_write(old_rsp, new_sp);
357 regs->cs = __USER_CS; 351 regs->cs = _cs;
358 regs->ss = __USER_DS; 352 regs->ss = _ss;
359 regs->flags = 0x200; 353 regs->flags = X86_EFLAGS_IF;
360 set_fs(USER_DS); 354 set_fs(USER_DS);
361 /* 355 /*
362 * Free the old FP and other extended state 356 * Free the old FP and other extended state
363 */ 357 */
364 free_thread_xstate(current); 358 free_thread_xstate(current);
365} 359}
366EXPORT_SYMBOL_GPL(start_thread); 360
361void
362start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
363{
364 start_thread_common(regs, new_ip, new_sp,
365 __USER_CS, __USER_DS, 0);
366}
367
368#ifdef CONFIG_IA32_EMULATION
369void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
370{
371 start_thread_common(regs, new_ip, new_sp,
372 __USER32_CS, __USER32_DS, __USER32_DS);
373}
374#endif
367 375
368/* 376/*
369 * switch_to(x,y) should switch tasks from x to y. 377 * switch_to(x,y) should switch tasks from x to y.
@@ -495,26 +503,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 503 */
496 if (preload_fpu) 504 if (preload_fpu)
497 __math_state_restore(); 505 __math_state_restore();
498 return prev_p;
499}
500 506
501/* 507 return prev_p;
502 * sys_execve() executes a new program.
503 */
504asmlinkage
505long sys_execve(char __user *name, char __user * __user *argv,
506 char __user * __user *envp, struct pt_regs *regs)
507{
508 long error;
509 char *filename;
510
511 filename = getname(name);
512 error = PTR_ERR(filename);
513 if (IS_ERR(filename))
514 return error;
515 error = do_execve(filename, argv, envp, regs);
516 putname(filename);
517 return error;
518} 508}
519 509
520void set_personality_64bit(void) 510void set_personality_64bit(void)
@@ -531,15 +521,6 @@ void set_personality_64bit(void)
531 current->personality &= ~READ_IMPLIES_EXEC; 521 current->personality &= ~READ_IMPLIES_EXEC;
532} 522}
533 523
534asmlinkage long
535sys_clone(unsigned long clone_flags, unsigned long newsp,
536 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
537{
538 if (!newsp)
539 newsp = regs->sp;
540 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
541}
542
543unsigned long get_wchan(struct task_struct *p) 524unsigned long get_wchan(struct task_struct *p)
544{ 525{
545 unsigned long stack; 526 unsigned long stack;
@@ -664,3 +645,8 @@ long sys_arch_prctl(int code, unsigned long addr)
664 return do_arch_prctl(current, code, addr); 645 return do_arch_prctl(current, code, addr);
665} 646}
666 647
648unsigned long KSTK_ESP(struct task_struct *task)
649{
650 return (test_tsk_thread_flag(task, TIF_IA32)) ?
651 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
652}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..017d937639fe 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/perf_event.h>
26#include <linux/hw_breakpoint.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
34#include <asm/prctl.h> 36#include <asm/prctl.h>
35#include <asm/proto.h> 37#include <asm/proto.h>
36#include <asm/ds.h> 38#include <asm/ds.h>
39#include <asm/hw_breakpoint.h>
37 40
38#include "tls.h" 41#include "tls.h"
39 42
@@ -49,6 +52,118 @@ enum x86_regset {
49 REGSET_IOPERM32, 52 REGSET_IOPERM32,
50}; 53};
51 54
55struct pt_regs_offset {
56 const char *name;
57 int offset;
58};
59
60#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
61#define REG_OFFSET_END {.name = NULL, .offset = 0}
62
63static const struct pt_regs_offset regoffset_table[] = {
64#ifdef CONFIG_X86_64
65 REG_OFFSET_NAME(r15),
66 REG_OFFSET_NAME(r14),
67 REG_OFFSET_NAME(r13),
68 REG_OFFSET_NAME(r12),
69 REG_OFFSET_NAME(r11),
70 REG_OFFSET_NAME(r10),
71 REG_OFFSET_NAME(r9),
72 REG_OFFSET_NAME(r8),
73#endif
74 REG_OFFSET_NAME(bx),
75 REG_OFFSET_NAME(cx),
76 REG_OFFSET_NAME(dx),
77 REG_OFFSET_NAME(si),
78 REG_OFFSET_NAME(di),
79 REG_OFFSET_NAME(bp),
80 REG_OFFSET_NAME(ax),
81#ifdef CONFIG_X86_32
82 REG_OFFSET_NAME(ds),
83 REG_OFFSET_NAME(es),
84 REG_OFFSET_NAME(fs),
85 REG_OFFSET_NAME(gs),
86#endif
87 REG_OFFSET_NAME(orig_ax),
88 REG_OFFSET_NAME(ip),
89 REG_OFFSET_NAME(cs),
90 REG_OFFSET_NAME(flags),
91 REG_OFFSET_NAME(sp),
92 REG_OFFSET_NAME(ss),
93 REG_OFFSET_END,
94};
95
96/**
97 * regs_query_register_offset() - query register offset from its name
98 * @name: the name of a register
99 *
100 * regs_query_register_offset() returns the offset of a register in struct
101 * pt_regs from its name. If the name is invalid, this returns -EINVAL;
102 */
103int regs_query_register_offset(const char *name)
104{
105 const struct pt_regs_offset *roff;
106 for (roff = regoffset_table; roff->name != NULL; roff++)
107 if (!strcmp(roff->name, name))
108 return roff->offset;
109 return -EINVAL;
110}
111
112/**
113 * regs_query_register_name() - query register name from its offset
114 * @offset: the offset of a register in struct pt_regs.
115 *
116 * regs_query_register_name() returns the name of a register from its
117 * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
118 */
119const char *regs_query_register_name(unsigned int offset)
120{
121 const struct pt_regs_offset *roff;
122 for (roff = regoffset_table; roff->name != NULL; roff++)
123 if (roff->offset == offset)
124 return roff->name;
125 return NULL;
126}
127
128static const int arg_offs_table[] = {
129#ifdef CONFIG_X86_32
130 [0] = offsetof(struct pt_regs, ax),
131 [1] = offsetof(struct pt_regs, dx),
132 [2] = offsetof(struct pt_regs, cx)
133#else /* CONFIG_X86_64 */
134 [0] = offsetof(struct pt_regs, di),
135 [1] = offsetof(struct pt_regs, si),
136 [2] = offsetof(struct pt_regs, dx),
137 [3] = offsetof(struct pt_regs, cx),
138 [4] = offsetof(struct pt_regs, r8),
139 [5] = offsetof(struct pt_regs, r9)
140#endif
141};
142
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
52/* 167/*
53 * does not yet catch signals sent when the child dies. 168 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 169 * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 252 return 0;
138} 253}
139 254
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 255#else /* CONFIG_X86_64 */
146 256
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 257#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 376 return 0;
267} 377}
268 378
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 379#endif /* CONFIG_X86_32 */
279 380
280static unsigned long get_flags(struct task_struct *task) 381static unsigned long get_flags(struct task_struct *task)
@@ -408,14 +509,14 @@ static int genregs_get(struct task_struct *target,
408{ 509{
409 if (kbuf) { 510 if (kbuf) {
410 unsigned long *k = kbuf; 511 unsigned long *k = kbuf;
411 while (count > 0) { 512 while (count >= sizeof(*k)) {
412 *k++ = getreg(target, pos); 513 *k++ = getreg(target, pos);
413 count -= sizeof(*k); 514 count -= sizeof(*k);
414 pos += sizeof(*k); 515 pos += sizeof(*k);
415 } 516 }
416 } else { 517 } else {
417 unsigned long __user *u = ubuf; 518 unsigned long __user *u = ubuf;
418 while (count > 0) { 519 while (count >= sizeof(*u)) {
419 if (__put_user(getreg(target, pos), u++)) 520 if (__put_user(getreg(target, pos), u++))
420 return -EFAULT; 521 return -EFAULT;
421 count -= sizeof(*u); 522 count -= sizeof(*u);
@@ -434,14 +535,14 @@ static int genregs_set(struct task_struct *target,
434 int ret = 0; 535 int ret = 0;
435 if (kbuf) { 536 if (kbuf) {
436 const unsigned long *k = kbuf; 537 const unsigned long *k = kbuf;
437 while (count > 0 && !ret) { 538 while (count >= sizeof(*k) && !ret) {
438 ret = putreg(target, pos, *k++); 539 ret = putreg(target, pos, *k++);
439 count -= sizeof(*k); 540 count -= sizeof(*k);
440 pos += sizeof(*k); 541 pos += sizeof(*k);
441 } 542 }
442 } else { 543 } else {
443 const unsigned long __user *u = ubuf; 544 const unsigned long __user *u = ubuf;
444 while (count > 0 && !ret) { 545 while (count >= sizeof(*u) && !ret) {
445 unsigned long word; 546 unsigned long word;
446 ret = __get_user(word, u++); 547 ret = __get_user(word, u++);
447 if (ret) 548 if (ret)
@@ -454,99 +555,237 @@ static int genregs_set(struct task_struct *target,
454 return ret; 555 return ret;
455} 556}
456 557
558static void ptrace_triggered(struct perf_event *bp, int nmi,
559 struct perf_sample_data *data,
560 struct pt_regs *regs)
561{
562 int i;
563 struct thread_struct *thread = &(current->thread);
564
565 /*
566 * Store in the virtual DR6 register the fact that the breakpoint
567 * was hit so the thread's debugger will see it.
568 */
569 for (i = 0; i < HBP_NUM; i++) {
570 if (thread->ptrace_bps[i] == bp)
571 break;
572 }
573
574 thread->debugreg6 |= (DR_TRAP0 << i);
575}
576
457/* 577/*
458 * This function is trivial and will be inlined by the compiler. 578 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 579 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 580 *
461 */ 581 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 582static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 583{
464 switch (n) { 584 int i;
465 case 0: return child->thread.debugreg0; 585 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 586 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 587
468 case 3: return child->thread.debugreg3; 588 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 589 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 590 info = counter_arch_bp(bp[i]);
591 dr7 |= encode_dr7(i, info->len, info->type);
592 }
471 } 593 }
472 return 0; 594
595 return dr7;
473} 596}
474 597
475static int ptrace_set_debugreg(struct task_struct *child, 598static int
476 int n, unsigned long data) 599ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
600 struct task_struct *tsk, int disabled)
477{ 601{
478 int i; 602 int err;
603 int gen_len, gen_type;
604 struct perf_event_attr attr;
479 605
480 if (unlikely(n == 4 || n == 5)) 606 /*
481 return -EIO; 607 * We shoud have at least an inactive breakpoint at this
608 * slot. It means the user is writing dr7 without having
609 * written the address register first
610 */
611 if (!bp)
612 return -EINVAL;
482 613
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 614 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 615 if (err)
616 return err;
485 617
486 switch (n) { 618 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 619 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 620 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 621 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 622
492 case 6: 623 return modify_user_hw_breakpoint(bp, &attr);
493 if ((data & ~0xffffffffUL) != 0) 624}
494 return -EIO;
495 child->thread.debugreg6 = data;
496 break;
497 625
498 case 7: 626/*
627 * Handle ptrace writes to debug register 7.
628 */
629static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
630{
631 struct thread_struct *thread = &(tsk->thread);
632 unsigned long old_dr7;
633 int i, orig_ret = 0, rc = 0;
634 int enabled, second_pass = 0;
635 unsigned len, type;
636 struct perf_event *bp;
637
638 data &= ~DR_CONTROL_RESERVED;
639 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
640restore:
641 /*
642 * Loop through all the hardware breakpoints, making the
643 * appropriate changes to each.
644 */
645 for (i = 0; i < HBP_NUM; i++) {
646 enabled = decode_dr7(data, i, &len, &type);
647 bp = thread->ptrace_bps[i];
648
649 if (!enabled) {
650 if (bp) {
651 /*
652 * Don't unregister the breakpoints right-away,
653 * unless all register_user_hw_breakpoint()
654 * requests have succeeded. This prevents
655 * any window of opportunity for debug
656 * register grabbing by other users.
657 */
658 if (!second_pass)
659 continue;
660
661 rc = ptrace_modify_breakpoint(bp, len, type,
662 tsk, 1);
663 if (rc)
664 break;
665 }
666 continue;
667 }
668
669 rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
670 if (rc)
671 break;
672 }
673 /*
674 * Make a second pass to free the remaining unused breakpoints
675 * or to restore the original breakpoints if an error occurred.
676 */
677 if (!second_pass) {
678 second_pass = 1;
679 if (rc < 0) {
680 orig_ret = rc;
681 data = old_dr7;
682 }
683 goto restore;
684 }
685 return ((orig_ret < 0) ? orig_ret : rc);
686}
687
688/*
689 * Handle PTRACE_PEEKUSR calls for the debug register area.
690 */
691static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
692{
693 struct thread_struct *thread = &(tsk->thread);
694 unsigned long val = 0;
695
696 if (n < HBP_NUM) {
697 struct perf_event *bp;
698 bp = thread->ptrace_bps[n];
699 if (!bp)
700 return 0;
701 val = bp->hw.info.address;
702 } else if (n == 6) {
703 val = thread->debugreg6;
704 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps);
706 }
707 return val;
708}
709
710static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
711 unsigned long addr)
712{
713 struct perf_event *bp;
714 struct thread_struct *t = &tsk->thread;
715 struct perf_event_attr attr;
716
717 if (!t->ptrace_bps[nr]) {
718 hw_breakpoint_init(&attr);
499 /* 719 /*
500 * Sanity-check data. Take one half-byte at once with 720 * Put stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 721 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 722 */
529#ifdef CONFIG_X86_32 723 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 724 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 725 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 726 attr.disabled = 1;
533#endif 727
534 data &= ~DR_CONTROL_RESERVED; 728 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 729
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 730 /*
537 return -EIO; 731 * CHECKME: the previous code returned -EIO if the addr wasn't
538 child->thread.debugreg7 = data; 732 * a valid task virtual addr. The new one will return -EINVAL in
539 if (data) 733 * this case.
540 set_tsk_thread_flag(child, TIF_DEBUG); 734 * -EINVAL may be what we want for in-kernel breakpoints users,
541 else 735 * but -EIO looks better for ptrace, since we refuse a register
542 clear_tsk_thread_flag(child, TIF_DEBUG); 736 * writing for the user. And anyway this is the previous
543 break; 737 * behaviour.
738 */
739 if (IS_ERR(bp))
740 return PTR_ERR(bp);
741
742 t->ptrace_bps[nr] = bp;
743 } else {
744 int err;
745
746 bp = t->ptrace_bps[nr];
747
748 attr = bp->attr;
749 attr.bp_addr = addr;
750 err = modify_user_hw_breakpoint(bp, &attr);
751 if (err)
752 return err;
544 } 753 }
545 754
755
546 return 0; 756 return 0;
547} 757}
548 758
549/* 759/*
760 * Handle PTRACE_POKEUSR calls for the debug register area.
761 */
762int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
763{
764 struct thread_struct *thread = &(tsk->thread);
765 int rc = 0;
766
767 /* There are no DR4 or DR5 registers */
768 if (n == 4 || n == 5)
769 return -EIO;
770
771 if (n == 6) {
772 thread->debugreg6 = val;
773 goto ret_path;
774 }
775 if (n < HBP_NUM) {
776 rc = ptrace_set_breakpoint_addr(tsk, n, val);
777 if (rc)
778 return rc;
779 }
780 /* All that's left is DR7 */
781 if (n == 7)
782 rc = ptrace_write_dr7(tsk, val);
783
784ret_path:
785 return rc;
786}
787
788/*
550 * These access the current or another (stopped) task's io permission 789 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 790 * bitmap for debugging or core dump.
552 */ 791 */
@@ -1219,14 +1458,14 @@ static int genregs32_get(struct task_struct *target,
1219{ 1458{
1220 if (kbuf) { 1459 if (kbuf) {
1221 compat_ulong_t *k = kbuf; 1460 compat_ulong_t *k = kbuf;
1222 while (count > 0) { 1461 while (count >= sizeof(*k)) {
1223 getreg32(target, pos, k++); 1462 getreg32(target, pos, k++);
1224 count -= sizeof(*k); 1463 count -= sizeof(*k);
1225 pos += sizeof(*k); 1464 pos += sizeof(*k);
1226 } 1465 }
1227 } else { 1466 } else {
1228 compat_ulong_t __user *u = ubuf; 1467 compat_ulong_t __user *u = ubuf;
1229 while (count > 0) { 1468 while (count >= sizeof(*u)) {
1230 compat_ulong_t word; 1469 compat_ulong_t word;
1231 getreg32(target, pos, &word); 1470 getreg32(target, pos, &word);
1232 if (__put_user(word, u++)) 1471 if (__put_user(word, u++))
@@ -1247,14 +1486,14 @@ static int genregs32_set(struct task_struct *target,
1247 int ret = 0; 1486 int ret = 0;
1248 if (kbuf) { 1487 if (kbuf) {
1249 const compat_ulong_t *k = kbuf; 1488 const compat_ulong_t *k = kbuf;
1250 while (count > 0 && !ret) { 1489 while (count >= sizeof(*k) && !ret) {
1251 ret = putreg32(target, pos, *k++); 1490 ret = putreg32(target, pos, *k++);
1252 count -= sizeof(*k); 1491 count -= sizeof(*k);
1253 pos += sizeof(*k); 1492 pos += sizeof(*k);
1254 } 1493 }
1255 } else { 1494 } else {
1256 const compat_ulong_t __user *u = ubuf; 1495 const compat_ulong_t __user *u = ubuf;
1257 while (count > 0 && !ret) { 1496 while (count >= sizeof(*u) && !ret) {
1258 compat_ulong_t word; 1497 compat_ulong_t word;
1259 ret = __get_user(word, u++); 1498 ret = __get_user(word, u++);
1260 if (ret) 1499 if (ret)
@@ -1437,21 +1676,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1437#endif 1676#endif
1438} 1677}
1439 1678
1440void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, 1679static void fill_sigtrap_info(struct task_struct *tsk,
1441 int error_code, int si_code) 1680 struct pt_regs *regs,
1681 int error_code, int si_code,
1682 struct siginfo *info)
1442{ 1683{
1443 struct siginfo info;
1444
1445 tsk->thread.trap_no = 1; 1684 tsk->thread.trap_no = 1;
1446 tsk->thread.error_code = error_code; 1685 tsk->thread.error_code = error_code;
1447 1686
1448 memset(&info, 0, sizeof(info)); 1687 memset(info, 0, sizeof(*info));
1449 info.si_signo = SIGTRAP; 1688 info->si_signo = SIGTRAP;
1450 info.si_code = si_code; 1689 info->si_code = si_code;
1690 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
1691}
1451 1692
1452 /* User-mode ip? */ 1693void user_single_step_siginfo(struct task_struct *tsk,
1453 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1694 struct pt_regs *regs,
1695 struct siginfo *info)
1696{
1697 fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
1698}
1454 1699
1700void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1701 int error_code, int si_code)
1702{
1703 struct siginfo info;
1704
1705 fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
1455 /* Send us the fake SIGTRAP */ 1706 /* Send us the fake SIGTRAP */
1456 force_sig_info(SIGTRAP, &info, tsk); 1707 force_sig_info(SIGTRAP, &info, tsk);
1457} 1708}
@@ -1516,29 +1767,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1516 1767
1517asmregparm void syscall_trace_leave(struct pt_regs *regs) 1768asmregparm void syscall_trace_leave(struct pt_regs *regs)
1518{ 1769{
1770 bool step;
1771
1519 if (unlikely(current->audit_context)) 1772 if (unlikely(current->audit_context))
1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1773 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1521 1774
1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1775 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1523 trace_sys_exit(regs, regs->ax); 1776 trace_sys_exit(regs, regs->ax);
1524 1777
1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1526 tracehook_report_syscall_exit(regs, 0);
1527
1528 /* 1778 /*
1529 * If TIF_SYSCALL_EMU is set, we only get here because of 1779 * If TIF_SYSCALL_EMU is set, we only get here because of
1530 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1780 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1531 * We already reported this syscall instruction in 1781 * We already reported this syscall instruction in
1532 * syscall_trace_enter(), so don't do any more now. 1782 * syscall_trace_enter().
1533 */
1534 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1535 return;
1536
1537 /*
1538 * If we are single-stepping, synthesize a trap to follow the
1539 * system call instruction.
1540 */ 1783 */
1541 if (test_thread_flag(TIF_SINGLESTEP) && 1784 step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
1542 tracehook_consider_fatal_signal(current, SIGTRAP)) 1785 !test_thread_flag(TIF_SYSCALL_EMU);
1543 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1786 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1787 tracehook_report_syscall_exit(regs, step);
1544} 1788}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..18093d7498f0 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 499{
500 struct pci_dev *nb_ht; 500 struct pci_dev *nb_ht;
501 unsigned int devfn; 501 unsigned int devfn;
502 u32 node;
502 u32 val; 503 u32 val;
503 504
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 505 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 508 return;
508 509
509 pci_read_config_dword(nb_ht, 0x60, &val); 510 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 511 node = val & 7;
512 /*
513 * Some hardware may return an invalid node ID,
514 * so check it first:
515 */
516 if (node_online(node))
517 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 518 pci_dev_put(nb_ht);
512} 519}
513 520
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 27349f92a6d7..1545bc0c9845 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h>
7#include <linux/tboot.h> 8#include <linux/tboot.h>
8#include <acpi/reboot.h> 9#include <acpi/reboot.h>
9#include <asm/io.h> 10#include <asm/io.h>
@@ -22,7 +23,7 @@
22# include <linux/ctype.h> 23# include <linux/ctype.h>
23# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
24#else 25#else
25# include <asm/iommu.h> 26# include <asm/x86_init.h>
26#endif 27#endif
27 28
28/* 29/*
@@ -258,6 +259,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
258 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
259 }, 260 },
260 }, 261 },
262 { /* Handle problems with rebooting on ASUS P4S800 */
263 .callback = set_bios_reboot,
264 .ident = "ASUS P4S800",
265 .matches = {
266 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
267 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
268 },
269 },
261 { } 270 { }
262}; 271};
263 272
@@ -435,6 +444,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
435 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), 444 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
436 }, 445 },
437 }, 446 },
447 { /* Handle problems with rebooting on Apple Macmini3,1 */
448 .callback = set_pci_reboot,
449 .ident = "Apple Macmini3,1",
450 .matches = {
451 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
452 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
453 },
454 },
438 { } 455 { }
439}; 456};
440 457
@@ -613,7 +630,7 @@ void native_machine_shutdown(void)
613#endif 630#endif
614 631
615#ifdef CONFIG_X86_64 632#ifdef CONFIG_X86_64
616 pci_iommu_shutdown(); 633 x86_platform.iommu_shutdown();
617#endif 634#endif
618} 635}
619 636
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..fda313ebbb03 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -12,7 +12,7 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/geode.h> 15#include <linux/cs5535.h>
16 16
17static void cs5530a_warm_reset(struct pci_dev *dev) 17static void cs5530a_warm_reset(struct pci_dev *dev)
18{ 18{
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e09f0e2c14b5..f7b8b9894b22 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,6 +73,7 @@
73 73
74#include <asm/mtrr.h> 74#include <asm/mtrr.h>
75#include <asm/apic.h> 75#include <asm/apic.h>
76#include <asm/trampoline.h>
76#include <asm/e820.h> 77#include <asm/e820.h>
77#include <asm/mpspec.h> 78#include <asm/mpspec.h>
78#include <asm/setup.h> 79#include <asm/setup.h>
@@ -106,9 +107,11 @@
106#include <asm/percpu.h> 107#include <asm/percpu.h>
107#include <asm/topology.h> 108#include <asm/topology.h>
108#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/k8.h>
109#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 112#include <asm/numa_64.h>
111#endif 113#endif
114#include <asm/mce.h>
112 115
113/* 116/*
114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -247,7 +250,7 @@ EXPORT_SYMBOL(edd);
247 * from boot_params into a safe place. 250 * from boot_params into a safe place.
248 * 251 *
249 */ 252 */
250static inline void copy_edd(void) 253static inline void __init copy_edd(void)
251{ 254{
252 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 255 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
253 sizeof(edd.mbr_signature)); 256 sizeof(edd.mbr_signature));
@@ -256,7 +259,7 @@ static inline void copy_edd(void)
256 edd.edd_info_nr = boot_params.eddbuf_entries; 259 edd.edd_info_nr = boot_params.eddbuf_entries;
257} 260}
258#else 261#else
259static inline void copy_edd(void) 262static inline void __init copy_edd(void)
260{ 263{
261} 264}
262#endif 265#endif
@@ -486,42 +489,11 @@ static void __init reserve_early_setup_data(void)
486 489
487#ifdef CONFIG_KEXEC 490#ifdef CONFIG_KEXEC
488 491
489/**
490 * Reserve @size bytes of crashkernel memory at any suitable offset.
491 *
492 * @size: Size of the crashkernel memory to reserve.
493 * Returns the base address on success, and -1ULL on failure.
494 */
495static
496unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
497{
498 const unsigned long long alignment = 16<<20; /* 16M */
499 unsigned long long start = 0LL;
500
501 while (1) {
502 int ret;
503
504 start = find_e820_area(start, ULONG_MAX, size, alignment);
505 if (start == -1ULL)
506 return start;
507
508 /* try to reserve it */
509 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
510 if (ret >= 0)
511 return start;
512
513 start += alignment;
514 }
515}
516
517static inline unsigned long long get_total_mem(void) 492static inline unsigned long long get_total_mem(void)
518{ 493{
519 unsigned long long total; 494 unsigned long long total;
520 495
521 total = max_low_pfn - min_low_pfn; 496 total = max_pfn - min_low_pfn;
522#ifdef CONFIG_HIGHMEM
523 total += highend_pfn - highstart_pfn;
524#endif
525 497
526 return total << PAGE_SHIFT; 498 return total << PAGE_SHIFT;
527} 499}
@@ -541,21 +513,25 @@ static void __init reserve_crashkernel(void)
541 513
542 /* 0 means: find the address automatically */ 514 /* 0 means: find the address automatically */
543 if (crash_base <= 0) { 515 if (crash_base <= 0) {
544 crash_base = find_and_reserve_crashkernel(crash_size); 516 const unsigned long long alignment = 16<<20; /* 16M */
517
518 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
519 alignment);
545 if (crash_base == -1ULL) { 520 if (crash_base == -1ULL) {
546 pr_info("crashkernel reservation failed. " 521 pr_info("crashkernel reservation failed - No suitable area found.\n");
547 "No suitable area found.\n");
548 return; 522 return;
549 } 523 }
550 } else { 524 } else {
551 ret = reserve_bootmem_generic(crash_base, crash_size, 525 unsigned long long start;
552 BOOTMEM_EXCLUSIVE); 526
553 if (ret < 0) { 527 start = find_e820_area(crash_base, ULONG_MAX, crash_size,
554 pr_info("crashkernel reservation failed - " 528 1<<20);
555 "memory is in use\n"); 529 if (start != crash_base) {
530 pr_info("crashkernel reservation failed - memory is in use.\n");
556 return; 531 return;
557 } 532 }
558 } 533 }
534 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
559 535
560 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 536 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
561 "for crashkernel (System RAM: %ldMB)\n", 537 "for crashkernel (System RAM: %ldMB)\n",
@@ -660,6 +636,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
660 }, 636 },
661 }, 637 },
662 { 638 {
639 .callback = dmi_low_memory_corruption,
640 .ident = "Phoenix/MSC BIOS",
641 .matches = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
643 },
644 },
645 {
663 /* 646 /*
664 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 647 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
665 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 648 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
@@ -691,6 +674,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
691 674
692void __init setup_arch(char **cmdline_p) 675void __init setup_arch(char **cmdline_p)
693{ 676{
677 int acpi = 0;
678 int k8 = 0;
679
694#ifdef CONFIG_X86_32 680#ifdef CONFIG_X86_32
695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 681 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
696 visws_early_detect(); 682 visws_early_detect();
@@ -783,21 +769,18 @@ void __init setup_arch(char **cmdline_p)
783 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 769 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
784 *cmdline_p = command_line; 770 *cmdline_p = command_line;
785 771
786#ifdef CONFIG_X86_64
787 /* 772 /*
788 * Must call this twice: Once just to detect whether hardware doesn't 773 * x86_configure_nx() is called before parse_early_param() to detect
789 * support NX (so that the early EHCI debug console setup can safely 774 * whether hardware doesn't support NX (so that the early EHCI debug
790 * call set_fixmap(), and then again after parsing early parameters to 775 * console setup can safely call set_fixmap()). It may then be called
791 * honor the respective command line option. 776 * again from within noexec_setup() during parsing early parameters
777 * to honor the respective command line option.
792 */ 778 */
793 check_efer(); 779 x86_configure_nx();
794#endif
795 780
796 parse_early_param(); 781 parse_early_param();
797 782
798#ifdef CONFIG_X86_64 783 x86_report_nx();
799 check_efer();
800#endif
801 784
802 /* Must be before kernel pagetables are setup */ 785 /* Must be before kernel pagetables are setup */
803 vmi_activate(); 786 vmi_activate();
@@ -893,6 +876,20 @@ void __init setup_arch(char **cmdline_p)
893 876
894 reserve_brk(); 877 reserve_brk();
895 878
879 /*
880 * Find and reserve possible boot-time SMP configuration:
881 */
882 find_smp_config();
883
884 reserve_trampoline_memory();
885
886#ifdef CONFIG_ACPI_SLEEP
887 /*
888 * Reserve low memory region for sleep support.
889 * even before init_memory_mapping
890 */
891 acpi_reserve_wakeup_memory();
892#endif
896 init_gbpages(); 893 init_gbpages();
897 894
898 /* max_pfn_mapped is updated here */ 895 /* max_pfn_mapped is updated here */
@@ -919,6 +916,8 @@ void __init setup_arch(char **cmdline_p)
919 916
920 reserve_initrd(); 917 reserve_initrd();
921 918
919 reserve_crashkernel();
920
922 vsmp_init(); 921 vsmp_init();
923 922
924 io_delay_init(); 923 io_delay_init();
@@ -934,23 +933,15 @@ void __init setup_arch(char **cmdline_p)
934 /* 933 /*
935 * Parse SRAT to discover nodes. 934 * Parse SRAT to discover nodes.
936 */ 935 */
937 acpi_numa_init(); 936 acpi = acpi_numa_init();
938#endif 937#endif
939 938
940 initmem_init(0, max_pfn); 939#ifdef CONFIG_K8_NUMA
941 940 if (!acpi)
942#ifdef CONFIG_ACPI_SLEEP 941 k8 = !k8_numa_init(0, max_pfn);
943 /*
944 * Reserve low memory region for sleep support.
945 */
946 acpi_reserve_bootmem();
947#endif 942#endif
948 /*
949 * Find and reserve possible boot-time SMP configuration:
950 */
951 find_smp_config();
952 943
953 reserve_crashkernel(); 944 initmem_init(0, max_pfn, acpi, k8);
954 945
955#ifdef CONFIG_X86_64 946#ifdef CONFIG_X86_64
956 /* 947 /*
@@ -1024,6 +1015,8 @@ void __init setup_arch(char **cmdline_p)
1024#endif 1015#endif
1025#endif 1016#endif
1026 x86_init.oem.banner(); 1017 x86_init.oem.banner();
1018
1019 mcheck_init();
1027} 1020}
1028 1021
1029#ifdef CONFIG_X86_32 1022#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..35abcb8b00e9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/module.h> 4#include <linux/module.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,9 +22,9 @@
20#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
21 23
22#ifdef CONFIG_DEBUG_PER_CPU_MAPS 24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
23# define DBG(x...) printk(KERN_DEBUG x) 25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
24#else 26#else
25# define DBG(x...) 27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
26#endif 28#endif
27 29
28DEFINE_PER_CPU(int, cpu_number); 30DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
116 } else { 118 } else {
117 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 119 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
118 size, align, goal); 120 size, align, goal);
119 pr_debug("per cpu data for cpu%d %lu bytes on node%d at " 121 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
120 "%016lx\n", cpu, size, node, __pa(ptr)); 122 cpu, size, node, __pa(ptr));
121 } 123 }
122 return ptr; 124 return ptr;
123#else 125#else
@@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void)
198 pcpu_cpu_distance, 200 pcpu_cpu_distance,
199 pcpu_fc_alloc, pcpu_fc_free); 201 pcpu_fc_alloc, pcpu_fc_free);
200 if (rc < 0) 202 if (rc < 0)
201 pr_warning("PERCPU: %s allocator failed (%d), " 203 pr_warning("%s allocator failed (%d), falling back to page size\n",
202 "falling back to page size\n",
203 pcpu_fc_names[pcpu_chosen_fc], rc); 204 pcpu_fc_names[pcpu_chosen_fc], rc);
204 } 205 }
205 if (rc < 0) 206 if (rc < 0)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..4fd173cd8e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -544,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
544} 545}
545#endif /* CONFIG_X86_32 */ 546#endif /* CONFIG_X86_32 */
546 547
547#ifdef CONFIG_X86_32 548long
548int sys_sigaltstack(struct pt_regs *regs)
549{
550 const stack_t __user *uss = (const stack_t __user *)regs->bx;
551 stack_t __user *uoss = (stack_t __user *)regs->cx;
552
553 return do_sigaltstack(uss, uoss, regs->sp);
554}
555#else /* !CONFIG_X86_32 */
556asmlinkage long
557sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 549sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
558 struct pt_regs *regs) 550 struct pt_regs *regs)
559{ 551{
560 return do_sigaltstack(uss, uoss, regs->sp); 552 return do_sigaltstack(uss, uoss, regs->sp);
561} 553}
562#endif /* CONFIG_X86_32 */
563 554
564/* 555/*
565 * Do a signal return; undo the signal stack. 556 * Do a signal return; undo the signal stack.
@@ -799,15 +790,6 @@ static void do_signal(struct pt_regs *regs)
799 790
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 791 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 792 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 793 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 794 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 795 /*
@@ -872,6 +854,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
872 if (current->replacement_session_keyring) 854 if (current->replacement_session_keyring)
873 key_replace_session_keyring(); 855 key_replace_session_keyring();
874 } 856 }
857 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
858 fire_user_return_notifiers();
875 859
876#ifdef CONFIG_X86_32 860#ifdef CONFIG_X86_32
877 clear_thread_flag(TIF_IRET); 861 clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..678d0b8c26f3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -671,6 +671,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
671 complete(&c_idle->done); 671 complete(&c_idle->done);
672} 672}
673 673
674/* reduce the number of lines printed when booting a large cpu count system */
675static void __cpuinit announce_cpu(int cpu, int apicid)
676{
677 static int current_node = -1;
678 int node = cpu_to_node(cpu);
679
680 if (system_state == SYSTEM_BOOTING) {
681 if (node != current_node) {
682 if (current_node > (-1))
683 pr_cont(" Ok.\n");
684 current_node = node;
685 pr_info("Booting Node %3d, Processors ", node);
686 }
687 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
688 return;
689 } else
690 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
691 node, cpu, apicid);
692}
693
674/* 694/*
675 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 695 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
676 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 696 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -687,7 +707,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 707 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
688 }; 708 };
689 709
690 INIT_WORK(&c_idle.work, do_fork_idle); 710 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
691 711
692 alternatives_smp_switch(1); 712 alternatives_smp_switch(1);
693 713
@@ -713,6 +733,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
713 733
714 if (IS_ERR(c_idle.idle)) { 734 if (IS_ERR(c_idle.idle)) {
715 printk("failed fork for CPU %d\n", cpu); 735 printk("failed fork for CPU %d\n", cpu);
736 destroy_work_on_stack(&c_idle.work);
716 return PTR_ERR(c_idle.idle); 737 return PTR_ERR(c_idle.idle);
717 } 738 }
718 739
@@ -736,9 +757,8 @@ do_rest:
736 /* start_ip had better be page-aligned! */ 757 /* start_ip had better be page-aligned! */
737 start_ip = setup_trampoline(); 758 start_ip = setup_trampoline();
738 759
739 /* So we see what's up */ 760 /* So we see what's up */
740 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", 761 announce_cpu(cpu, apicid);
741 cpu, apicid, start_ip);
742 762
743 /* 763 /*
744 * This grunge runs the startup process for 764 * This grunge runs the startup process for
@@ -787,21 +807,17 @@ do_rest:
787 udelay(100); 807 udelay(100);
788 } 808 }
789 809
790 if (cpumask_test_cpu(cpu, cpu_callin_mask)) { 810 if (cpumask_test_cpu(cpu, cpu_callin_mask))
791 /* number CPUs logically, starting from 1 (BSP is 0) */ 811 pr_debug("CPU%d: has booted.\n", cpu);
792 pr_debug("OK.\n"); 812 else {
793 printk(KERN_INFO "CPU%d: ", cpu);
794 print_cpu_info(&cpu_data(cpu));
795 pr_debug("CPU has booted.\n");
796 } else {
797 boot_error = 1; 813 boot_error = 1;
798 if (*((volatile unsigned char *)trampoline_base) 814 if (*((volatile unsigned char *)trampoline_base)
799 == 0xA5) 815 == 0xA5)
800 /* trampoline started but...? */ 816 /* trampoline started but...? */
801 printk(KERN_ERR "Stuck ??\n"); 817 pr_err("CPU%d: Stuck ??\n", cpu);
802 else 818 else
803 /* trampoline code not run */ 819 /* trampoline code not run */
804 printk(KERN_ERR "Not responding.\n"); 820 pr_err("CPU%d: Not responding.\n", cpu);
805 if (apic->inquire_remote_apic) 821 if (apic->inquire_remote_apic)
806 apic->inquire_remote_apic(apicid); 822 apic->inquire_remote_apic(apicid);
807 } 823 }
@@ -831,6 +847,7 @@ do_rest:
831 smpboot_restore_warm_reset_vector(); 847 smpboot_restore_warm_reset_vector();
832 } 848 }
833 849
850 destroy_work_on_stack(&c_idle.work);
834 return boot_error; 851 return boot_error;
835} 852}
836 853
@@ -1250,16 +1267,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1267void cpu_disable_common(void)
1251{ 1268{
1252 int cpu = smp_processor_id(); 1269 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1270
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1271 remove_siblinginfo(cpu);
1264 1272
1265 /* It's now safe to remove this processor from the online map */ 1273 /* It's now safe to remove this processor from the online map */
@@ -1300,14 +1308,16 @@ void native_cpu_die(unsigned int cpu)
1300 for (i = 0; i < 10; i++) { 1308 for (i = 0; i < 10; i++) {
1301 /* They ack this in play_dead by setting CPU_DEAD */ 1309 /* They ack this in play_dead by setting CPU_DEAD */
1302 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1310 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1303 printk(KERN_INFO "CPU %d is now offline\n", cpu); 1311 if (system_state == SYSTEM_RUNNING)
1312 pr_info("CPU %u is now offline\n", cpu);
1313
1304 if (1 == num_online_cpus()) 1314 if (1 == num_online_cpus())
1305 alternatives_smp_switch(0); 1315 alternatives_smp_switch(0);
1306 return; 1316 return;
1307 } 1317 }
1308 msleep(100); 1318 msleep(100);
1309 } 1319 }
1310 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1320 pr_err("CPU %u didn't die...\n", cpu);
1311} 1321}
1312 1322
1313void play_dead_common(void) 1323void play_dead_common(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..922eefbb3f6c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning, 56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol, 57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 58 .stack = save_stack_stack,
59 .address = save_stack_address, 59 .address = save_stack_address,
60 .walk_stack = print_context_stack,
60}; 61};
61 62
62static const struct stacktrace_ops save_stack_ops_nosched = { 63static const struct stacktrace_ops save_stack_ops_nosched = {
63 .warning = save_stack_warning, 64 .warning = save_stack_warning,
64 .warning_symbol = save_stack_warning_symbol, 65 .warning_symbol = save_stack_warning_symbol,
65 .stack = save_stack_stack, 66 .stack = save_stack_stack,
66 .address = save_stack_address_nosched, 67 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack,
67}; 69};
68 70
69/* 71/*
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..dee1ff7cba58 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,31 +24,6 @@
24 24
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
28 unsigned long prot, unsigned long flags,
29 unsigned long fd, unsigned long pgoff)
30{
31 int error = -EBADF;
32 struct file *file = NULL;
33 struct mm_struct *mm = current->mm;
34
35 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
36 if (!(flags & MAP_ANONYMOUS)) {
37 file = fget(fd);
38 if (!file)
39 goto out;
40 }
41
42 down_write(&mm->mmap_sem);
43 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
44 up_write(&mm->mmap_sem);
45
46 if (file)
47 fput(file);
48out:
49 return error;
50}
51
52/* 27/*
53 * Perform the select(nd, in, out, ex, tv) and mmap() system 28 * Perform the select(nd, in, out, ex, tv) and mmap() system
54 * calls. Linux/i386 didn't use to be able to handle more than 29 * calls. Linux/i386 didn't use to be able to handle more than
@@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
77 if (a.offset & ~PAGE_MASK) 52 if (a.offset & ~PAGE_MASK)
78 goto out; 53 goto out;
79 54
80 err = sys_mmap2(a.addr, a.len, a.prot, a.flags, 55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
81 a.fd, a.offset >> PAGE_SHIFT); 56 a.fd, a.offset >> PAGE_SHIFT);
82out: 57out:
83 return err; 58 return err;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..8aa2057efd12 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
23 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file;
27
28 error = -EINVAL; 26 error = -EINVAL;
29 if (off & ~PAGE_MASK) 27 if (off & ~PAGE_MASK)
30 goto out; 28 goto out;
31 29
32 error = -EBADF; 30 error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
33 file = NULL;
34 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
35 if (!(flags & MAP_ANONYMOUS)) {
36 file = fget(fd);
37 if (!file)
38 goto out;
39 }
40 down_write(&current->mm->mmap_sem);
41 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
42 up_write(&current->mm->mmap_sem);
43
44 if (file)
45 fput(file);
46out: 31out:
47 return error; 32 return error;
48} 33}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..15228b5d3eb7 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap_pgoff
195 .long sys_truncate64 195 .long sys_truncate64
196 .long sys_ftruncate64 196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */ 197 .long sys_stat64 /* 195 */
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dcb00d278512..be2573448ed9 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs)
38#ifdef CONFIG_FRAME_POINTER 38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long)); 39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else 40#else
41 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp =
42 (unsigned long *)kernel_stack_pointer(regs);
42 /* 43 /*
43 * Return address is either directly at stack pointer 44 * Return address is either directly at stack pointer
44 * or above a saved flags. Eflags has bits 22-31 zero, 45 * or above a saved flags. Eflags has bits 22-31 zero,
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 503c1f2e8835..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -23,8 +23,6 @@
23static struct bau_control **uv_bau_table_bases __read_mostly; 23static struct bau_control **uv_bau_table_bases __read_mostly;
24static int uv_bau_retry_limit __read_mostly; 24static int uv_bau_retry_limit __read_mostly;
25 25
26/* position of pnode (which is nasid>>1): */
27static int uv_nshift __read_mostly;
28/* base pnode in this partition */ 26/* base pnode in this partition */
29static int uv_partition_base_pnode __read_mostly; 27static int uv_partition_base_pnode __read_mostly;
30 28
@@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode)
723 BUG_ON(!adp); 721 BUG_ON(!adp);
724 722
725 pa = uv_gpa(adp); /* need the real nasid*/ 723 pa = uv_gpa(adp); /* need the real nasid*/
726 n = pa >> uv_nshift; 724 n = uv_gpa_to_pnode(pa);
727 m = pa & uv_mmask; 725 m = pa & uv_mmask;
728 726
729 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 727 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
778 * need the pnode of where the memory was really allocated 776 * need the pnode of where the memory was really allocated
779 */ 777 */
780 pa = uv_gpa(pqp); 778 pa = uv_gpa(pqp);
781 pn = pa >> uv_nshift; 779 pn = uv_gpa_to_pnode(pa);
782 uv_write_global_mmr64(pnode, 780 uv_write_global_mmr64(pnode,
783 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 781 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
784 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 782 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
@@ -819,10 +817,8 @@ static int __init uv_init_blade(int blade)
819 */ 817 */
820 apicid = blade_to_first_apicid(blade); 818 apicid = blade_to_first_apicid(blade);
821 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
822 if ((pa & 0xff) != UV_BAU_MESSAGE) { 820 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
823 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
824 ((apicid << 32) | UV_BAU_MESSAGE)); 821 ((apicid << 32) | UV_BAU_MESSAGE));
825 }
826 return 0; 822 return 0;
827} 823}
828 824
@@ -843,8 +839,7 @@ static int __init uv_bau_init(void)
843 GFP_KERNEL, cpu_to_node(cur_cpu)); 839 GFP_KERNEL, cpu_to_node(cur_cpu));
844 840
845 uv_bau_retry_limit = 1; 841 uv_bau_retry_limit = 1;
846 uv_nshift = uv_hub_info->n_val; 842 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
847 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
848 nblades = uv_num_possible_blades(); 843 nblades = uv_num_possible_blades();
849 844
850 uv_bau_table_bases = (struct bau_control **) 845 uv_bau_table_bases = (struct bau_control **)
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 699f7eeb896a..c652ef62742d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,22 +3,28 @@
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
7#define __trampinit
8#define __trampinitdata
9#else
10#define __trampinit __cpuinit
11#define __trampinitdata __cpuinitdata
12#endif
13
6/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
7unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base;
8 16
9void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
10{ 18{
11#ifdef CONFIG_X86_32 19 unsigned long mem;
12 /* 20
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */ 21 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, 22 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
21 "TRAMPOLINE"); 23 if (mem == -1L)
24 panic("Cannot allocate trampoline\n");
25
26 trampoline_base = __va(mem);
27 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
22} 28}
23 29
24/* 30/*
@@ -26,7 +32,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 32 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 33 * has made sure it's suitably aligned.
28 */ 34 */
29unsigned long __cpuinit setup_trampoline(void) 35unsigned long __trampinit setup_trampoline(void)
30{ 36{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 38 return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 596d54c660a5..3af2dff58b21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,8 +32,12 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP
36.section .rodata, "a", @progbits
37#else
35/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ 38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
36__CPUINITRODATA 39__CPUINITRODATA
40#endif
37.code16 41.code16
38 42
39ENTRY(trampoline_data) 43ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */ 537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 539 return;
540 540
541 /* DR6 may or may not be cleared by the CPU */
542 set_debugreg(0, 6);
541 /* 543 /*
542 * The processor cleared BTF, so don't mark that we need it set. 544 * The processor cleared BTF, so don't mark that we need it set.
543 */ 545 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 546 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 547 tsk->thread.debugctlmsr = 0;
546 548
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 549 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 550 tsk->thread.debugreg6 = dr6;
551
552 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
553 SIGTRAP) == NOTIFY_STOP)
549 return; 554 return;
550 555
551 /* It's safe to allow irq's after DR6 has been saved */ 556 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 557 preempt_conditional_sti(regs);
553 558
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 559 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 560 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 561 error_code, 1);
557 goto clear_dr7; 562 return;
558 } 563 }
559 564
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 565 /*
569 * Single-stepping through TF: make sure we ignore any events in 566 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 567 * kernel space, but re-enable TF when returning to user mode.
568 *
569 * We already checked v86 mode above, so we can check for kernel mode
570 * by just checking the CPL of CS.
571 */ 571 */
572 if (condition & DR_STEP) { 572 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 573 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 574 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
575 regs->flags &= ~X86_EFLAGS_TF;
575 } 576 }
576 577 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 578 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 579 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 580 preempt_conditional_cli(regs);
588 return;
589 581
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 582 return;
604} 583}
605 584
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cd982f48e23e..597683aa5ba0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason)
763{ 763{
764 if (!tsc_unstable) { 764 if (!tsc_unstable) {
765 tsc_unstable = 1; 765 tsc_unstable = 1;
766 sched_clock_stable = 0;
766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 767 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
767 /* Change only the rating, when not registered */ 768 /* Change only the rating, when not registered */
768 if (clocksource_tsc.mult) 769 if (clocksource_tsc.mult)
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..0aa5fed8b9e6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count;
33 * we want to have the fastest, inlined, non-debug version 33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
39static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
@@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void)
62 * previous TSC that was measured (possibly on 62 * previous TSC that was measured (possibly on
63 * another CPU) and update the previous TSC timestamp. 63 * another CPU) and update the previous TSC timestamp.
64 */ 64 */
65 __raw_spin_lock(&sync_lock); 65 arch_spin_lock(&sync_lock);
66 prev = last_tsc; 66 prev = last_tsc;
67 rdtsc_barrier(); 67 rdtsc_barrier();
68 now = get_cycles(); 68 now = get_cycles();
69 rdtsc_barrier(); 69 rdtsc_barrier();
70 last_tsc = now; 70 last_tsc = now;
71 __raw_spin_unlock(&sync_lock); 71 arch_spin_unlock(&sync_lock);
72 72
73 /* 73 /*
74 * Be nice every now and then (and also check whether 74 * Be nice every now and then (and also check whether
@@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void)
87 * we saw a time-warp of the TSC going backwards: 87 * we saw a time-warp of the TSC going backwards:
88 */ 88 */
89 if (unlikely(prev > now)) { 89 if (unlikely(prev > now)) {
90 __raw_spin_lock(&sync_lock); 90 arch_spin_lock(&sync_lock);
91 max_warp = max(max_warp, prev - now); 91 max_warp = max(max_warp, prev - now);
92 nr_warps++; 92 nr_warps++;
93 __raw_spin_unlock(&sync_lock); 93 arch_spin_unlock(&sync_lock);
94 } 94 }
95 } 95 }
96 WARN(!(now-start), 96 WARN(!(now-start),
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..ece73d8e3240 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
12#include <linux/irq.h> 13#include <linux/irq.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 16#include <asm/uv/uv_irq.h>
17#include <asm/uv/uv_hub.h>
18
19/* MMR offset and pnode of hub sourcing interrupts for a given irq */
20struct uv_irq_2_mmr_pnode{
21 struct rb_node list;
22 unsigned long offset;
23 int pnode;
24 int irq;
25};
26
27static spinlock_t uv_irq_lock;
28static struct rb_root uv_irq_root;
29
30static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 31
17static void uv_noop(unsigned int irq) 32static void uv_noop(unsigned int irq)
18{ 33{
@@ -39,25 +54,213 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 54 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 55 .eoi = uv_ack_apic,
41 .end = uv_noop, 56 .end = uv_noop,
57 .set_affinity = uv_set_irq_affinity,
42}; 58};
43 59
44/* 60/*
61 * Add offset and pnode information of the hub sourcing interrupts to the
62 * rb tree for a specific irq.
63 */
64static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
65{
66 struct rb_node **link = &uv_irq_root.rb_node;
67 struct rb_node *parent = NULL;
68 struct uv_irq_2_mmr_pnode *n;
69 struct uv_irq_2_mmr_pnode *e;
70 unsigned long irqflags;
71
72 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
73 uv_blade_to_memory_nid(blade));
74 if (!n)
75 return -ENOMEM;
76
77 n->irq = irq;
78 n->offset = offset;
79 n->pnode = uv_blade_to_pnode(blade);
80 spin_lock_irqsave(&uv_irq_lock, irqflags);
81 /* Find the right place in the rbtree: */
82 while (*link) {
83 parent = *link;
84 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
85
86 if (unlikely(irq == e->irq)) {
87 /* irq entry exists */
88 e->pnode = uv_blade_to_pnode(blade);
89 e->offset = offset;
90 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
91 kfree(n);
92 return 0;
93 }
94
95 if (irq < e->irq)
96 link = &(*link)->rb_left;
97 else
98 link = &(*link)->rb_right;
99 }
100
101 /* Insert the node into the rbtree. */
102 rb_link_node(&n->list, parent, link);
103 rb_insert_color(&n->list, &uv_irq_root);
104
105 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
106 return 0;
107}
108
109/* Retrieve offset and pnode information from the rb tree for a specific irq */
110int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
111{
112 struct uv_irq_2_mmr_pnode *e;
113 struct rb_node *n;
114 unsigned long irqflags;
115
116 spin_lock_irqsave(&uv_irq_lock, irqflags);
117 n = uv_irq_root.rb_node;
118 while (n) {
119 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
120
121 if (e->irq == irq) {
122 *offset = e->offset;
123 *pnode = e->pnode;
124 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
125 return 0;
126 }
127
128 if (irq < e->irq)
129 n = n->rb_left;
130 else
131 n = n->rb_right;
132 }
133 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
134 return -1;
135}
136
137/*
138 * Re-target the irq to the specified CPU and enable the specified MMR located
139 * on the specified blade to allow the sending of MSIs to the specified CPU.
140 */
141static int
142arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 unsigned long mmr_offset, int restrict)
144{
145 const struct cpumask *eligible_cpu = cpumask_of(cpu);
146 struct irq_desc *desc = irq_to_desc(irq);
147 struct irq_cfg *cfg;
148 int mmr_pnode;
149 unsigned long mmr_value;
150 struct uv_IO_APIC_route_entry *entry;
151 int err;
152
153 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
154 sizeof(unsigned long));
155
156 cfg = irq_cfg(irq);
157
158 err = assign_irq_vector(irq, cfg, eligible_cpu);
159 if (err != 0)
160 return err;
161
162 if (restrict == UV_AFFINITY_CPU)
163 desc->status |= IRQ_NO_BALANCING;
164 else
165 desc->status |= IRQ_MOVE_PCNTXT;
166
167 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
168 irq_name);
169
170 mmr_value = 0;
171 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
172 entry->vector = cfg->vector;
173 entry->delivery_mode = apic->irq_delivery_mode;
174 entry->dest_mode = apic->irq_dest_mode;
175 entry->polarity = 0;
176 entry->trigger = 0;
177 entry->mask = 0;
178 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
179
180 mmr_pnode = uv_blade_to_pnode(mmr_blade);
181 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
182
183 if (cfg->move_in_progress)
184 send_cleanup_vector(cfg);
185
186 return irq;
187}
188
189/*
190 * Disable the specified MMR located on the specified blade so that MSIs are
191 * longer allowed to be sent.
192 */
193static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
194{
195 unsigned long mmr_value;
196 struct uv_IO_APIC_route_entry *entry;
197
198 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
199 sizeof(unsigned long));
200
201 mmr_value = 0;
202 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
203 entry->mask = 1;
204
205 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
206}
207
208static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
209{
210 struct irq_desc *desc = irq_to_desc(irq);
211 struct irq_cfg *cfg = desc->chip_data;
212 unsigned int dest;
213 unsigned long mmr_value;
214 struct uv_IO_APIC_route_entry *entry;
215 unsigned long mmr_offset;
216 unsigned mmr_pnode;
217
218 if (set_desc_affinity(desc, mask, &dest))
219 return -1;
220
221 mmr_value = 0;
222 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
223
224 entry->vector = cfg->vector;
225 entry->delivery_mode = apic->irq_delivery_mode;
226 entry->dest_mode = apic->irq_dest_mode;
227 entry->polarity = 0;
228 entry->trigger = 0;
229 entry->mask = 0;
230 entry->dest = dest;
231
232 /* Get previously stored MMR and pnode of hub sourcing interrupts */
233 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
234 return -1;
235
236 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
237
238 if (cfg->move_in_progress)
239 send_cleanup_vector(cfg);
240
241 return 0;
242}
243
244/*
45 * Set up a mapping of an available irq and vector, and enable the specified 245 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 246 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 247 * interrupt is raised.
48 */ 248 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 249int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 250 unsigned long mmr_offset, int restrict)
51{ 251{
52 int irq; 252 int irq, ret;
53 int ret; 253
254 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 255
55 irq = create_irq();
56 if (irq <= 0) 256 if (irq <= 0)
57 return -EBUSY; 257 return -EBUSY;
58 258
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 259 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 260 restrict);
261 if (ret == irq)
262 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
263 else
61 destroy_irq(irq); 264 destroy_irq(irq);
62 265
63 return ret; 266 return ret;
@@ -71,9 +274,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 274 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 275 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 276 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 277void uv_teardown_irq(unsigned int irq)
75{ 278{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 279 struct uv_irq_2_mmr_pnode *e;
280 struct rb_node *n;
281 unsigned long irqflags;
282
283 spin_lock_irqsave(&uv_irq_lock, irqflags);
284 n = uv_irq_root.rb_node;
285 while (n) {
286 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
287 if (e->irq == irq) {
288 arch_disable_uv_irq(e->pnode, e->offset);
289 rb_erase(n, &uv_irq_root);
290 kfree(e);
291 break;
292 }
293 if (irq < e->irq)
294 n = n->rb_left;
295 else
296 n = n->rb_right;
297 }
298 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 299 destroy_irq(irq);
78} 300}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 301EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..3c84aa001c11 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
74 */ 74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 75static struct uv_rtc_timer_head **blade_info __read_mostly;
76 76
77static int uv_rtc_enable; 77static int uv_rtc_evt_enable;
78 78
79/* 79/*
80 * Hardware interface routines 80 * Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 93 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 96}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 118 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 120
121 /* Set configuration */ 121 /* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 125
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 126 if (uv_read_rtc(NULL) <= expires)
127 return 0;
128
129 return !uv_intr_pending(pnode);
127} 130}
128 131
129/* 132/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 226
224 next_cpu = head->next_cpu; 227 next_cpu = head->next_cpu;
225 *t = expires; 228 *t = expires;
229
226 /* Will this one be next to go off? */ 230 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 231 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 232 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 235 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 236 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 237 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 238 return -ETIME;
235 } 239 }
236 } 240 }
237 241
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 248 *
245 * Returns 1 if this timer was pending. 249 * Returns 1 if this timer was pending.
246 */ 250 */
247static int uv_rtc_unset_timer(int cpu) 251static int uv_rtc_unset_timer(int cpu, int force)
248{ 252{
249 int pnode = uv_cpu_to_pnode(cpu); 253 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 254 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
256 260
257 spin_lock_irqsave(&head->lock, flags); 261 spin_lock_irqsave(&head->lock, flags);
258 262
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 263 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 264 rc = 1;
261 265
262 *t = ULLONG_MAX; 266 if (rc) {
263 267 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 268 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 269 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 270 uv_rtc_find_next_timer(head, pnode);
271 }
267 272
268 spin_unlock_irqrestore(&head->lock, flags); 273 spin_unlock_irqrestore(&head->lock, flags);
269 274
@@ -310,32 +315,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 315 break;
311 case CLOCK_EVT_MODE_UNUSED: 316 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 317 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 318 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 319 break;
315 } 320 }
316} 321}
317 322
318static void uv_rtc_interrupt(void) 323static void uv_rtc_interrupt(void)
319{ 324{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 325 int cpu = smp_processor_id();
326 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 327
323 if (!ced || !ced->event_handler) 328 if (!ced || !ced->event_handler)
324 return; 329 return;
325 330
326 if (uv_rtc_unset_timer(cpu) != 1) 331 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 332 return;
328 333
329 ced->event_handler(ced); 334 ced->event_handler(ced);
330} 335}
331 336
332static int __init uv_enable_rtc(char *str) 337static int __init uv_enable_evt_rtc(char *str)
333{ 338{
334 uv_rtc_enable = 1; 339 uv_rtc_evt_enable = 1;
335 340
336 return 1; 341 return 1;
337} 342}
338__setup("uvrtc", uv_enable_rtc); 343__setup("uvrtcevt", uv_enable_evt_rtc);
339 344
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 345static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 346{
@@ -350,27 +355,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 355{
351 int rc; 356 int rc;
352 357
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 358 if (!is_uv_system())
354 return -ENODEV; 359 return -ENODEV;
355 360
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 361 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 362 clocksource_uv.shift);
360 363
364 /* If single blade, prefer tsc */
365 if (uv_num_possible_blades() == 1)
366 clocksource_uv.rating = 250;
367
361 rc = clocksource_register(&clocksource_uv); 368 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 369 if (rc)
363 generic_interrupt_extension = NULL; 370 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
371 else
372 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
373 sn_rtc_cycles_per_second/(unsigned long)1E6);
374
375 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 376 return rc;
365 }
366 377
367 /* Setup and register clockevents */ 378 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 379 rc = uv_rtc_allocate_timers();
369 if (rc) { 380 if (rc)
370 clocksource_unregister(&clocksource_uv); 381 goto error;
371 generic_interrupt_extension = NULL; 382
372 return rc; 383 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 384
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 385 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 386 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +393,19 @@ static __init int uv_rtc_setup_clock(void)
383 393
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 394 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 395 if (rc) {
386 clocksource_unregister(&clocksource_uv); 396 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 397 uv_rtc_deallocate_timers();
398 goto error;
389 } 399 }
390 400
401 printk(KERN_INFO "UV RTC clockevents registered\n");
402
403 return 0;
404
405error:
406 clocksource_unregister(&clocksource_uv);
407 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
408
391 return rc; 409 return rc;
392} 410}
393arch_initcall(uv_rtc_setup_clock); 411arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553a1b17..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 183 return;
184 } 184 }
185 185
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 186 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 188 /*
189 * Validate version 189 * Validate version
@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
197 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
198} 198}
199 199
200static void __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(void)
201{ 201{
202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 486}
487 487
488static struct irq_chip cobalt_irq_type = { 488static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 489 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 490 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 491 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 492 .enable = enable_cobalt_irq,
@@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 523}
524 524
525static struct irq_chip piix4_master_irq_type = { 525static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 526 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 527 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 528 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 529 .end = end_piix4_master_irq,
@@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = {
531 531
532 532
533static struct irq_chip piix4_virtual_irq_type = { 533static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 534 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq, 535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq, 536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq, 537 .disable = disable_8259A_irq,
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..5ffb5622f793 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -197,9 +197,8 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200int sys_vm86old(struct pt_regs *regs) 200int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 202 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 203 * this avoids wasting of stack space.
205 * This remains on the stack until we 204 * This remains on the stack until we
@@ -227,7 +226,7 @@ out:
227} 226}
228 227
229 228
230int sys_vm86(struct pt_regs *regs) 229int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
231{ 230{
232 struct kernel_vm86_struct info; /* declare this _on top_, 231 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 232 * this avoids wasting of stack space.
@@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs)
239 struct vm86plus_struct __user *v86; 238 struct vm86plus_struct __user *v86;
240 239
241 tsk = current; 240 tsk = current;
242 switch (regs->bx) { 241 switch (cmd) {
243 case VM86_REQUEST_IRQ: 242 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 243 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 244 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 245 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); 246 ret = do_vm86_irq_handling(cmd, (int)arg);
248 goto out; 247 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 248 case VM86_PLUS_INSTALL_CHECK:
250 /* 249 /*
@@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs)
261 ret = -EPERM; 260 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 261 if (tsk->thread.saved_sp0)
263 goto out; 262 goto out;
264 v86 = (struct vm86plus_struct __user *)regs->cx; 263 v86 = (struct vm86plus_struct __user *)arg;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 265 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 266 sizeof(info.regs));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 31e6f6cfe53e..d430e4c30193 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -648,7 +648,7 @@ static inline int __init activate_vmi(void)
648 648
649 pv_info.paravirt_enabled = 1; 649 pv_info.paravirt_enabled = 1;
650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
651 pv_info.name = "vmi"; 651 pv_info.name = "vmi [deprecated]";
652 652
653 pv_init_ops.patch = vmi_patch; 653 pv_init_ops.patch = vmi_patch;
654 654
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..74c92bb194df 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 226 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 227 evt->cpumask = cpumask_of(cpu);
228 228
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 230 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 231 clockevents_register_device(evt);
232} 232}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ecb92717c412..44879df55696 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
41jiffies_64 = jiffies; 41jiffies_64 = jiffies;
42#endif 42#endif
43 43
44#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
45/*
46 * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
47 * we retain large page mappings for boundaries spanning kernel text, rodata
48 * and data sections.
49 *
50 * However, kernel identity mappings will have different RWX permissions
51 * to the pages mapping to text and to the pages padding (which are freed) the
52 * text section. Hence kernel identity mappings will be broken to smaller
53 * pages. For 64-bit, kernel text and kernel identity mappings are different,
54 * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
55 * as well as retain 2MB large page mappings for kernel text.
56 */
57#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
58
59#define X64_ALIGN_DEBUG_RODATA_END \
60 . = ALIGN(HPAGE_SIZE); \
61 __end_rodata_hpage_align = .;
62
63#else
64
65#define X64_ALIGN_DEBUG_RODATA_BEGIN
66#define X64_ALIGN_DEBUG_RODATA_END
67
68#endif
69
44PHDRS { 70PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(7); /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
90 116
91 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
92 118
119 X64_ALIGN_DEBUG_RODATA_BEGIN
93 RO_DATA(PAGE_SIZE) 120 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END
94 122
95 /* Data */ 123 /* Data */
96 .data : AT(ADDR(.data) - LOAD_OFFSET) { 124 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
107 135
108 PAGE_ALIGNED_DATA(PAGE_SIZE) 136 PAGE_ALIGNED_DATA(PAGE_SIZE)
109 137
110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 138 CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
111 139
112 DATA_DATA 140 DATA_DATA
113 CONSTRUCTORS 141 CONSTRUCTORS
114 142
115 /* rarely changed data like cpu maps */ 143 /* rarely changed data like cpu maps */
116 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) 144 READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
117 145
118 /* End of data section */ 146 /* End of data section */
119 _edata = .; 147 _edata = .;
@@ -137,12 +165,12 @@ SECTIONS
137 *(.vsyscall_0) 165 *(.vsyscall_0)
138 } :user 166 } :user
139 167
140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 168 . = ALIGN(L1_CACHE_BYTES);
141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 169 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
142 *(.vsyscall_fn) 170 *(.vsyscall_fn)
143 } 171 }
144 172
145 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 173 . = ALIGN(L1_CACHE_BYTES);
146 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { 174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
147 *(.vsyscall_gtod_data) 175 *(.vsyscall_gtod_data)
148 } 176 }
@@ -166,7 +194,7 @@ SECTIONS
166 } 194 }
167 vgetcpu_mode = VVIRT(.vgetcpu_mode); 195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
168 196
169 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(L1_CACHE_BYTES);
170 .jiffies : AT(VLOAD(.jiffies)) { 198 .jiffies : AT(VLOAD(.jiffies)) {
171 *(.jiffies) 199 *(.jiffies)
172 } 200 }
@@ -291,9 +319,7 @@ SECTIONS
291 __brk_limit = .; 319 __brk_limit = .;
292 } 320 }
293 321
294 .end : AT(ADDR(.end) - LOAD_OFFSET) { 322 _end = .;
295 _end = .;
296 }
297 323
298 STABS_DEBUG 324 STABS_DEBUG
299 DWARF_DEBUG 325 DWARF_DEBUG
@@ -305,6 +331,9 @@ SECTIONS
305 331
306 332
307#ifdef CONFIG_X86_32 333#ifdef CONFIG_X86_32
334/*
335 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
336 */
308. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 337. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
309 "kernel image bigger than KERNEL_IMAGE_SIZE"); 338 "kernel image bigger than KERNEL_IMAGE_SIZE");
310#else 339#else
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..9055e5872ff0 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..619f7f88b8cc 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -17,8 +17,6 @@
17EXPORT_SYMBOL(mcount); 17EXPORT_SYMBOL(mcount);
18#endif 18#endif
19 19
20EXPORT_SYMBOL(kernel_thread);
21
22EXPORT_SYMBOL(__get_user_1); 20EXPORT_SYMBOL(__get_user_1);
23EXPORT_SYMBOL(__get_user_2); 21EXPORT_SYMBOL(__get_user_2);
24EXPORT_SYMBOL(__get_user_4); 22EXPORT_SYMBOL(__get_user_4);
@@ -30,9 +28,8 @@ EXPORT_SYMBOL(__put_user_8);
30 28
31EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic);
32EXPORT_SYMBOL(__copy_user_nocache); 30EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 31EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 32EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 33
37EXPORT_SYMBOL(copy_page); 34EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 35EXPORT_SYMBOL(clear_page);
@@ -57,4 +54,6 @@ EXPORT_SYMBOL(__memcpy);
57 54
58EXPORT_SYMBOL(empty_zero_page); 55EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 56EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 57#ifndef CONFIG_PARAVIRT
58EXPORT_SYMBOL(native_load_gs_index);
59#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a2c2ed..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -13,11 +13,15 @@
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/pat.h>
16#include <asm/tsc.h> 17#include <asm/tsc.h>
18#include <asm/iommu.h>
17 19
18void __cpuinit x86_init_noop(void) { } 20void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { } 21void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { } 22void __init x86_init_pgd_noop(pgd_t *unused) { }
23int __init iommu_init_noop(void) { return 0; }
24void iommu_shutdown_noop(void) { }
21 25
22/* 26/*
23 * The platform setup functions are preset with the default functions 27 * The platform setup functions are preset with the default functions
@@ -62,6 +66,10 @@ struct x86_init_ops x86_init __initdata = {
62 .tsc_pre_init = x86_init_noop, 66 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init, 67 .timer_init = hpet_time_init,
64 }, 68 },
69
70 .iommu = {
71 .iommu_init = iommu_init_noop,
72 },
65}; 73};
66 74
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -72,4 +80,6 @@ struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc, 80 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time, 81 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss, 82 .set_wallclock = mach_set_rtc_mmss,
83 .iommu_shutdown = iommu_shutdown_noop,
84 .is_untracked_pat_range = is_ISA_range,
75}; 85};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..4cd498332466 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
31 ---help--- 32 ---help---
32 Support hosting fully virtualized guest machines using hardware 33 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 34 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..7e8faea4651e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -75,6 +75,8 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define No64 (1<<28)
78/* Source 2 operand type */ 80/* Source 2 operand type */
79#define Src2None (0<<29) 81#define Src2None (0<<29)
80#define Src2CL (1<<29) 82#define Src2CL (1<<29)
@@ -92,19 +94,23 @@ static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 94 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 99 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 104 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 109 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 114 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -133,7 +139,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 139 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 140 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 141 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 142 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
143 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 144 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 145 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 146 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -158,7 +165,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 165 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 166 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 167 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 168 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 169 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 170 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 171 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +192,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 192 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 193 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 194 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 195 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 196 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 197 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 198 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,7 +205,7 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 205 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 206 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 207 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 208 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 209 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 211 /* 0xF0 - 0xF7 */
@@ -244,11 +251,13 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 251 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 253 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 254 ImplicitOps | Stack, ImplicitOps | Stack,
255 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 256 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 257 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 258 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 259 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 261 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 262 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 263 ModRM, 0,
@@ -613,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 622{
614 int rc = 0; 623 int rc = 0;
615 624
625 /* x86 instructions are limited to 15 bytes. */
626 if (eip + size - ctxt->decode.eip_orig > 15)
627 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 628 eip += ctxt->cs_base;
617 while (size--) { 629 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 630 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -871,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 883 /* Shadow copy of register state. Committed on successful emulation. */
872 884
873 memset(c, 0, sizeof(struct decode_cache)); 885 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 886 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 887 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 888 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 889
@@ -962,6 +974,11 @@ done_prefixes:
962 } 974 }
963 } 975 }
964 976
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");;
979 return -1;
980 }
981
965 if (c->d & Group) { 982 if (c->d & Group) {
966 group = c->d & GroupMask; 983 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 984 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1186,6 +1203,69 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1186 return rc; 1203 return rc;
1187} 1204}
1188 1205
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{
1208 struct decode_cache *c = &ctxt->decode;
1209 struct kvm_segment segment;
1210
1211 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1212
1213 c->src.val = segment.selector;
1214 emulate_push(ctxt);
1215}
1216
1217static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1218 struct x86_emulate_ops *ops, int seg)
1219{
1220 struct decode_cache *c = &ctxt->decode;
1221 unsigned long selector;
1222 int rc;
1223
1224 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1225 if (rc != 0)
1226 return rc;
1227
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
1229 return rc;
1230}
1231
1232static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1233{
1234 struct decode_cache *c = &ctxt->decode;
1235 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1236 int reg = VCPU_REGS_RAX;
1237
1238 while (reg <= VCPU_REGS_RDI) {
1239 (reg == VCPU_REGS_RSP) ?
1240 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1241
1242 emulate_push(ctxt);
1243 ++reg;
1244 }
1245}
1246
1247static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops)
1249{
1250 struct decode_cache *c = &ctxt->decode;
1251 int rc = 0;
1252 int reg = VCPU_REGS_RDI;
1253
1254 while (reg >= VCPU_REGS_RAX) {
1255 if (reg == VCPU_REGS_RSP) {
1256 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1257 c->op_bytes);
1258 --reg;
1259 }
1260
1261 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1262 if (rc != 0)
1263 break;
1264 --reg;
1265 }
1266 return rc;
1267}
1268
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1269static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1270 struct x86_emulate_ops *ops)
1191{ 1271{
@@ -1707,18 +1787,45 @@ special_insn:
1707 add: /* add */ 1787 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1788 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1789 break;
1790 case 0x06: /* push es */
1791 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1792 break;
1793 case 0x07: /* pop es */
1794 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1795 if (rc != 0)
1796 goto done;
1797 break;
1710 case 0x08 ... 0x0d: 1798 case 0x08 ... 0x0d:
1711 or: /* or */ 1799 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1800 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1801 break;
1802 case 0x0e: /* push cs */
1803 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1804 break;
1714 case 0x10 ... 0x15: 1805 case 0x10 ... 0x15:
1715 adc: /* adc */ 1806 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1807 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1808 break;
1809 case 0x16: /* push ss */
1810 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1811 break;
1812 case 0x17: /* pop ss */
1813 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1814 if (rc != 0)
1815 goto done;
1816 break;
1718 case 0x18 ... 0x1d: 1817 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1818 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1819 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1820 break;
1821 case 0x1e: /* push ds */
1822 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1823 break;
1824 case 0x1f: /* pop ds */
1825 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1826 if (rc != 0)
1827 goto done;
1828 break;
1722 case 0x20 ... 0x25: 1829 case 0x20 ... 0x25:
1723 and: /* and */ 1830 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1831 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1857,14 @@ special_insn:
1750 if (rc != 0) 1857 if (rc != 0)
1751 goto done; 1858 goto done;
1752 break; 1859 break;
1860 case 0x60: /* pusha */
1861 emulate_pusha(ctxt);
1862 break;
1863 case 0x61: /* popa */
1864 rc = emulate_popa(ctxt, ops);
1865 if (rc != 0)
1866 goto done;
1867 break;
1753 case 0x63: /* movsxd */ 1868 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 1869 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 1870 goto cannot_emulate;
@@ -1761,7 +1876,7 @@ special_insn:
1761 break; 1876 break;
1762 case 0x6c: /* insb */ 1877 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 1878 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1879 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 1880 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 1881 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 1882 c->rep_prefix ?
@@ -1777,7 +1892,7 @@ special_insn:
1777 return 0; 1892 return 0;
1778 case 0x6e: /* outsb */ 1893 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 1894 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1895 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 1896 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 1897 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 1898 c->rep_prefix ?
@@ -2070,7 +2185,7 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2185 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2186 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2187 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2189 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2190 port) != 0) {
2076 c->eip = saved_eip; 2191 c->eip = saved_eip;
@@ -2297,6 +2412,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2412 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2413 c->dst.type = OP_NONE;
2299 break; 2414 break;
2415 case 0xa0: /* push fs */
2416 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2417 break;
2418 case 0xa1: /* pop fs */
2419 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2420 if (rc != 0)
2421 goto done;
2422 break;
2300 case 0xa3: 2423 case 0xa3:
2301 bt: /* bt */ 2424 bt: /* bt */
2302 c->dst.type = OP_NONE; 2425 c->dst.type = OP_NONE;
@@ -2308,6 +2431,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2431 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2432 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2433 break;
2434 case 0xa8: /* push gs */
2435 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2436 break;
2437 case 0xa9: /* pop gs */
2438 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2439 if (rc != 0)
2440 goto done;
2441 break;
2311 case 0xab: 2442 case 0xab:
2312 bts: /* bts */ 2443 bts: /* bts */
2313 /* only subword offset */ 2444 /* only subword offset */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 82ad523b4901..296aba49472a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -29,6 +29,8 @@
29 * Based on QEMU and Xen. 29 * Based on QEMU and Xen.
30 */ 30 */
31 31
32#define pr_fmt(fmt) "pit: " fmt
33
32#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
33 35
34#include "irq.h" 36#include "irq.h"
@@ -116,7 +118,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
116 * itself with the initial count and continues counting 118 * itself with the initial count and continues counting
117 * from there. 119 * from there.
118 */ 120 */
119 remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); 121 remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
120 elapsed = ps->pit_timer.period - ktime_to_ns(remaining); 122 elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
121 elapsed = mod_64(elapsed, ps->pit_timer.period); 123 elapsed = mod_64(elapsed, ps->pit_timer.period);
122 124
@@ -262,7 +264,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
262 264
263static void destroy_pit_timer(struct kvm_timer *pt) 265static void destroy_pit_timer(struct kvm_timer *pt)
264{ 266{
265 pr_debug("pit: execute del timer!\n"); 267 pr_debug("execute del timer!\n");
266 hrtimer_cancel(&pt->timer); 268 hrtimer_cancel(&pt->timer);
267} 269}
268 270
@@ -284,7 +286,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284 286
285 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 287 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
286 288
287 pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); 289 pr_debug("create pit timer, interval is %llu nsec\n", interval);
288 290
289 /* TODO The new value only affected after the retriggered */ 291 /* TODO The new value only affected after the retriggered */
290 hrtimer_cancel(&pt->timer); 292 hrtimer_cancel(&pt->timer);
@@ -309,7 +311,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
309 311
310 WARN_ON(!mutex_is_locked(&ps->lock)); 312 WARN_ON(!mutex_is_locked(&ps->lock));
311 313
312 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 314 pr_debug("load_count val is %d, channel is %d\n", val, channel);
313 315
314 /* 316 /*
315 * The largest possible initial count is 0; this is equivalent 317 * The largest possible initial count is 0; this is equivalent
@@ -395,8 +397,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
395 mutex_lock(&pit_state->lock); 397 mutex_lock(&pit_state->lock);
396 398
397 if (val != 0) 399 if (val != 0)
398 pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", 400 pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
399 (unsigned int)addr, len, val); 401 (unsigned int)addr, len, val);
400 402
401 if (addr == 3) { 403 if (addr == 3) {
402 channel = val >> 6; 404 channel = val >> 6;
@@ -688,10 +690,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 690 struct kvm_vcpu *vcpu;
689 int i; 691 int i;
690 692
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 694 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 695
696 /* 696 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 697 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..d057c0cbd245 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 39 if (s != &s->pics_state->pics[0])
40 irq += 8; 40 irq += 8;
41 /*
42 * We are dropping lock while calling ack notifiers since ack
43 * notifier callbacks for assigned devices call into PIC recursively.
44 * Other interrupt may be delivered to PIC while lock is dropped but
45 * it should be safe since PIC state is already updated at this stage.
46 */
47 spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock);
42} 50}
43 51
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 184static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 185{
178 s->isr |= 1 << irq; 186 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 187 /*
185 * We don't clear a level sensitive interrupt here 188 * We don't clear a level sensitive interrupt here
186 */ 189 */
187 if (!(s->elcr & (1 << irq))) 190 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 191 s->irr &= ~(1 << irq);
192
193 if (s->auto_eoi) {
194 if (s->rotate_on_auto_eoi)
195 s->priority_add = (irq + 1) & 7;
196 pic_clear_isr(s, irq);
197 }
198
189} 199}
190 200
191int kvm_pic_read_irq(struct kvm *kvm) 201int kvm_pic_read_irq(struct kvm *kvm)
@@ -225,22 +235,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
225 235
226void kvm_pic_reset(struct kvm_kpic_state *s) 236void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 237{
228 int irq, irqbase, n; 238 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 239 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 240 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
241 u8 irr = s->irr, isr = s->imr;
231 242
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 243 s->last_irr = 0;
245 s->irr = 0; 244 s->irr = 0;
246 s->imr = 0; 245 s->imr = 0;
@@ -256,6 +255,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 255 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 256 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 257 s->init4 = 0;
258
259 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
260 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
261 if (irr & (1 << irq) || isr & (1 << irq)) {
262 pic_clear_isr(s, irq);
263 }
264 }
259} 265}
260 266
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 267static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 304 priority = get_priority(s, s->isr);
299 if (priority != 8) { 305 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 306 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 307 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 308 s->priority_add = (irq + 1) & 7;
309 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 310 pic_update_irq(s->pics_state);
305 } 311 }
306 break; 312 break;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..be399e207d57 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -71,6 +71,7 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
@@ -85,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 86
86static inline int irqchip_in_kernel(struct kvm *kvm) 87static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 88{
88 return pic_irqchip(kvm) != NULL; 89 int ret;
90
91 ret = (pic_irqchip(kvm) != NULL);
92 smp_rmb();
93 return ret;
89} 94}
90 95
91void kvm_pic_reset(struct kvm_kpic_state *s); 96void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1ae5ceba7eb2..3063a0c4858b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,7 +32,6 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
37#include "irq.h" 36#include "irq.h"
38#include "trace.h" 37#include "trace.h"
@@ -471,11 +470,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 470 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 471 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 472 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 473 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 474 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 475}
480 476
481static void apic_send_ipi(struct kvm_lapic *apic) 477static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +500,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 500 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 501 irq.vector);
506 502
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 503 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 504}
511 505
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 506static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -521,7 +515,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
521 if (apic_get_reg(apic, APIC_TMICT) == 0) 515 if (apic_get_reg(apic, APIC_TMICT) == 0)
522 return 0; 516 return 0;
523 517
524 remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); 518 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
525 if (ktime_to_ns(remaining) < 0) 519 if (ktime_to_ns(remaining) < 0)
526 remaining = ktime_set(0, 0); 520 remaining = ktime_set(0, 0);
527 521
@@ -664,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
664{ 658{
665 ktime_t now = apic->lapic_timer.timer.base->get_time(); 659 ktime_t now = apic->lapic_timer.timer.base->get_time();
666 660
667 apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * 661 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
668 APIC_BUS_CYCLE_NS * apic->divide_count; 662 APIC_BUS_CYCLE_NS * apic->divide_count;
669 atomic_set(&apic->lapic_timer.pending, 0); 663 atomic_set(&apic->lapic_timer.pending, 0);
670 664
@@ -1156,6 +1150,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1156 hrtimer_cancel(&apic->lapic_timer.timer); 1150 hrtimer_cancel(&apic->lapic_timer.timer);
1157 update_divide_count(apic); 1151 update_divide_count(apic);
1158 start_apic_timer(apic); 1152 start_apic_timer(apic);
1153 apic->irr_pending = true;
1159} 1154}
1160 1155
1161void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1156void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eca41ae9f453..4c3e5b2314cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -156,6 +156,8 @@ module_param(oos_shadow, bool, 0644);
156#define CREATE_TRACE_POINTS 156#define CREATE_TRACE_POINTS
157#include "mmutrace.h" 157#include "mmutrace.h"
158 158
159#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
160
159#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 161#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
160 162
161struct kvm_rmap_desc { 163struct kvm_rmap_desc {
@@ -634,9 +636,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
634 if (*spte & shadow_accessed_mask) 636 if (*spte & shadow_accessed_mask)
635 kvm_set_pfn_accessed(pfn); 637 kvm_set_pfn_accessed(pfn);
636 if (is_writeble_pte(*spte)) 638 if (is_writeble_pte(*spte))
637 kvm_release_pfn_dirty(pfn); 639 kvm_set_pfn_dirty(pfn);
638 else
639 kvm_release_pfn_clean(pfn);
640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
641 if (!*rmapp) { 641 if (!*rmapp) {
642 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 642 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -748,7 +748,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
748 return write_protected; 748 return write_protected;
749} 749}
750 750
751static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 751static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
752 unsigned long data)
752{ 753{
753 u64 *spte; 754 u64 *spte;
754 int need_tlb_flush = 0; 755 int need_tlb_flush = 0;
@@ -763,8 +764,47 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
763 return need_tlb_flush; 764 return need_tlb_flush;
764} 765}
765 766
767static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
768 unsigned long data)
769{
770 int need_flush = 0;
771 u64 *spte, new_spte;
772 pte_t *ptep = (pte_t *)data;
773 pfn_t new_pfn;
774
775 WARN_ON(pte_huge(*ptep));
776 new_pfn = pte_pfn(*ptep);
777 spte = rmap_next(kvm, rmapp, NULL);
778 while (spte) {
779 BUG_ON(!is_shadow_present_pte(*spte));
780 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
781 need_flush = 1;
782 if (pte_write(*ptep)) {
783 rmap_remove(kvm, spte);
784 __set_spte(spte, shadow_trap_nonpresent_pte);
785 spte = rmap_next(kvm, rmapp, NULL);
786 } else {
787 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
788 new_spte |= (u64)new_pfn << PAGE_SHIFT;
789
790 new_spte &= ~PT_WRITABLE_MASK;
791 new_spte &= ~SPTE_HOST_WRITEABLE;
792 if (is_writeble_pte(*spte))
793 kvm_set_pfn_dirty(spte_to_pfn(*spte));
794 __set_spte(spte, new_spte);
795 spte = rmap_next(kvm, rmapp, spte);
796 }
797 }
798 if (need_flush)
799 kvm_flush_remote_tlbs(kvm);
800
801 return 0;
802}
803
766static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 804static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
767 int (*handler)(struct kvm *kvm, unsigned long *rmapp)) 805 unsigned long data,
806 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
807 unsigned long data))
768{ 808{
769 int i, j; 809 int i, j;
770 int retval = 0; 810 int retval = 0;
@@ -786,13 +826,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
786 if (hva >= start && hva < end) { 826 if (hva >= start && hva < end) {
787 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 827 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
788 828
789 retval |= handler(kvm, &memslot->rmap[gfn_offset]); 829 retval |= handler(kvm, &memslot->rmap[gfn_offset],
830 data);
790 831
791 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 832 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
792 int idx = gfn_offset; 833 int idx = gfn_offset;
793 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 834 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
794 retval |= handler(kvm, 835 retval |= handler(kvm,
795 &memslot->lpage_info[j][idx].rmap_pde); 836 &memslot->lpage_info[j][idx].rmap_pde,
837 data);
796 } 838 }
797 } 839 }
798 } 840 }
@@ -802,10 +844,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
802 844
803int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 845int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
804{ 846{
805 return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); 847 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
806} 848}
807 849
808static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) 850void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
851{
852 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
853}
854
855static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
856 unsigned long data)
809{ 857{
810 u64 *spte; 858 u64 *spte;
811 int young = 0; 859 int young = 0;
@@ -841,13 +889,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
841 gfn = unalias_gfn(vcpu->kvm, gfn); 889 gfn = unalias_gfn(vcpu->kvm, gfn);
842 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 890 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
843 891
844 kvm_unmap_rmapp(vcpu->kvm, rmapp); 892 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
845 kvm_flush_remote_tlbs(vcpu->kvm); 893 kvm_flush_remote_tlbs(vcpu->kvm);
846} 894}
847 895
848int kvm_age_hva(struct kvm *kvm, unsigned long hva) 896int kvm_age_hva(struct kvm *kvm, unsigned long hva)
849{ 897{
850 return kvm_handle_hva(kvm, hva, kvm_age_rmapp); 898 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
851} 899}
852 900
853#ifdef MMU_DEBUG 901#ifdef MMU_DEBUG
@@ -1756,7 +1804,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1756 unsigned pte_access, int user_fault, 1804 unsigned pte_access, int user_fault,
1757 int write_fault, int dirty, int level, 1805 int write_fault, int dirty, int level,
1758 gfn_t gfn, pfn_t pfn, bool speculative, 1806 gfn_t gfn, pfn_t pfn, bool speculative,
1759 bool can_unsync) 1807 bool can_unsync, bool reset_host_protection)
1760{ 1808{
1761 u64 spte; 1809 u64 spte;
1762 int ret = 0; 1810 int ret = 0;
@@ -1783,6 +1831,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1783 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1831 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1784 kvm_is_mmio_pfn(pfn)); 1832 kvm_is_mmio_pfn(pfn));
1785 1833
1834 if (reset_host_protection)
1835 spte |= SPTE_HOST_WRITEABLE;
1836
1786 spte |= (u64)pfn << PAGE_SHIFT; 1837 spte |= (u64)pfn << PAGE_SHIFT;
1787 1838
1788 if ((pte_access & ACC_WRITE_MASK) 1839 if ((pte_access & ACC_WRITE_MASK)
@@ -1828,7 +1879,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1828 unsigned pt_access, unsigned pte_access, 1879 unsigned pt_access, unsigned pte_access,
1829 int user_fault, int write_fault, int dirty, 1880 int user_fault, int write_fault, int dirty,
1830 int *ptwrite, int level, gfn_t gfn, 1881 int *ptwrite, int level, gfn_t gfn,
1831 pfn_t pfn, bool speculative) 1882 pfn_t pfn, bool speculative,
1883 bool reset_host_protection)
1832{ 1884{
1833 int was_rmapped = 0; 1885 int was_rmapped = 0;
1834 int was_writeble = is_writeble_pte(*sptep); 1886 int was_writeble = is_writeble_pte(*sptep);
@@ -1860,7 +1912,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1860 } 1912 }
1861 1913
1862 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 1914 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1863 dirty, level, gfn, pfn, speculative, true)) { 1915 dirty, level, gfn, pfn, speculative, true,
1916 reset_host_protection)) {
1864 if (write_fault) 1917 if (write_fault)
1865 *ptwrite = 1; 1918 *ptwrite = 1;
1866 kvm_x86_ops->tlb_flush(vcpu); 1919 kvm_x86_ops->tlb_flush(vcpu);
@@ -1877,8 +1930,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1877 page_header_update_slot(vcpu->kvm, sptep, gfn); 1930 page_header_update_slot(vcpu->kvm, sptep, gfn);
1878 if (!was_rmapped) { 1931 if (!was_rmapped) {
1879 rmap_count = rmap_add(vcpu, sptep, gfn); 1932 rmap_count = rmap_add(vcpu, sptep, gfn);
1880 if (!is_rmap_spte(*sptep)) 1933 kvm_release_pfn_clean(pfn);
1881 kvm_release_pfn_clean(pfn);
1882 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1934 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1883 rmap_recycle(vcpu, sptep, gfn); 1935 rmap_recycle(vcpu, sptep, gfn);
1884 } else { 1936 } else {
@@ -1909,7 +1961,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1909 if (iterator.level == level) { 1961 if (iterator.level == level) {
1910 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1962 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1911 0, write, 1, &pt_write, 1963 0, write, 1, &pt_write,
1912 level, gfn, pfn, false); 1964 level, gfn, pfn, false, true);
1913 ++vcpu->stat.pf_fixed; 1965 ++vcpu->stat.pf_fixed;
1914 break; 1966 break;
1915 } 1967 }
@@ -2737,7 +2789,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2737 if (r) 2789 if (r)
2738 goto out; 2790 goto out;
2739 2791
2740 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2792 er = emulate_instruction(vcpu, cr2, error_code, 0);
2741 2793
2742 switch (er) { 2794 switch (er) {
2743 case EMULATE_DONE: 2795 case EMULATE_DONE:
@@ -2748,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2748 case EMULATE_FAIL: 2800 case EMULATE_FAIL:
2749 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2750 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2803 vcpu->run->internal.ndata = 0;
2751 return 0; 2804 return 0;
2752 default: 2805 default:
2753 BUG(); 2806 BUG();
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d2fec9c12d22..58a0f1e88596 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -273,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
273 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) 273 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
274 return; 274 return;
275 kvm_get_pfn(pfn); 275 kvm_get_pfn(pfn);
276 /*
277 * we call mmu_set_spte() with reset_host_protection = true beacuse that
278 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
279 */
276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 280 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
277 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 281 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
278 gpte_to_gfn(gpte), pfn, true); 282 gpte_to_gfn(gpte), pfn, true, true);
279} 283}
280 284
281/* 285/*
@@ -308,7 +312,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
308 user_fault, write_fault, 312 user_fault, write_fault,
309 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 313 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
310 ptwrite, level, 314 ptwrite, level,
311 gw->gfn, pfn, false); 315 gw->gfn, pfn, false, true);
312 break; 316 break;
313 } 317 }
314 318
@@ -451,8 +455,6 @@ out_unlock:
451static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 455static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
452{ 456{
453 struct kvm_shadow_walk_iterator iterator; 457 struct kvm_shadow_walk_iterator iterator;
454 pt_element_t gpte;
455 gpa_t pte_gpa = -1;
456 int level; 458 int level;
457 u64 *sptep; 459 u64 *sptep;
458 int need_flush = 0; 460 int need_flush = 0;
@@ -463,14 +465,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
463 level = iterator.level; 465 level = iterator.level;
464 sptep = iterator.sptep; 466 sptep = iterator.sptep;
465 467
466 /* FIXME: properly handle invlpg on large guest pages */
467 if (level == PT_PAGE_TABLE_LEVEL || 468 if (level == PT_PAGE_TABLE_LEVEL ||
468 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 469 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
469 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 470 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
470 struct kvm_mmu_page *sp = page_header(__pa(sptep));
471
472 pte_gpa = (sp->gfn << PAGE_SHIFT);
473 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
474 471
475 if (is_shadow_present_pte(*sptep)) { 472 if (is_shadow_present_pte(*sptep)) {
476 rmap_remove(vcpu->kvm, sptep); 473 rmap_remove(vcpu->kvm, sptep);
@@ -489,18 +486,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
489 if (need_flush) 486 if (need_flush)
490 kvm_flush_remote_tlbs(vcpu->kvm); 487 kvm_flush_remote_tlbs(vcpu->kvm);
491 spin_unlock(&vcpu->kvm->mmu_lock); 488 spin_unlock(&vcpu->kvm->mmu_lock);
492
493 if (pte_gpa == -1)
494 return;
495 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
496 sizeof(pt_element_t)))
497 return;
498 if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
499 if (mmu_topup_memory_caches(vcpu))
500 return;
501 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
502 sizeof(pt_element_t), 0);
503 }
504} 489}
505 490
506static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 491static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -558,6 +543,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
558static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 543static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
559{ 544{
560 int i, offset, nr_present; 545 int i, offset, nr_present;
546 bool reset_host_protection;
561 547
562 offset = nr_present = 0; 548 offset = nr_present = 0;
563 549
@@ -595,9 +581,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
595 581
596 nr_present++; 582 nr_present++;
597 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 583 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
584 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
585 pte_access &= ~ACC_WRITE_MASK;
586 reset_host_protection = 0;
587 } else {
588 reset_host_protection = 1;
589 }
598 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 590 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
599 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 591 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
600 spte_to_pfn(sp->spt[i]), true, false); 592 spte_to_pfn(sp->spt[i]), true, false,
593 reset_host_protection);
601 } 594 }
602 595
603 return !nr_present; 596 return !nr_present;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6c79a14a3b6f..1d9b33843c80 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
49#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 50
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 51#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 52#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
53 54
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 55#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 56
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 57static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 59 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 77 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 78 u64 vmcb_msrpm;
87 79
80 /* A VMEXIT is required but not yet emulated */
81 bool exit_required;
82
88 /* cache for intercepts of the guest */ 83 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 84 u16 intercept_cr_read;
90 u16 intercept_cr_write; 85 u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
112 u32 *msrpm; 107 u32 *msrpm;
113 108
114 struct nested_state nested; 109 struct nested_state nested;
110
111 bool nmi_singlestep;
115}; 112};
116 113
117/* enable NPT for AMD64 and X86 with PAE */ 114/* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 283 struct vcpu_svm *svm = to_svm(vcpu);
287 284
288 if (!svm->next_rip) { 285 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 286 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 287 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 288 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 289 return;
@@ -316,7 +313,7 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 313 cpu_svm_disable();
317} 314}
318 315
319static void svm_hardware_enable(void *garbage) 316static int svm_hardware_enable(void *garbage)
320{ 317{
321 318
322 struct svm_cpu_data *sd; 319 struct svm_cpu_data *sd;
@@ -325,16 +322,21 @@ static void svm_hardware_enable(void *garbage)
325 struct desc_struct *gdt; 322 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 323 int me = raw_smp_processor_id();
327 324
325 rdmsrl(MSR_EFER, efer);
326 if (efer & EFER_SVME)
327 return -EBUSY;
328
328 if (!has_svm()) { 329 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 330 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 331 me);
332 return -EINVAL;
331 } 333 }
332 sd = per_cpu(svm_data, me); 334 sd = per_cpu(svm_data, me);
333 335
334 if (!sd) { 336 if (!sd) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 337 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 338 me);
337 return; 339 return -EINVAL;
338 } 340 }
339 341
340 sd->asid_generation = 1; 342 sd->asid_generation = 1;
@@ -345,11 +347,11 @@ static void svm_hardware_enable(void *garbage)
345 gdt = (struct desc_struct *)gdt_descr.base; 347 gdt = (struct desc_struct *)gdt_descr.base;
346 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 348 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 349
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 350 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 351
351 wrmsrl(MSR_VM_HSAVE_PA, 352 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
352 page_to_pfn(sd->save_area) << PAGE_SHIFT); 353
354 return 0;
353} 355}
354 356
355static void svm_cpu_uninit(int cpu) 357static void svm_cpu_uninit(int cpu)
@@ -475,7 +477,7 @@ static __init int svm_hardware_setup(void)
475 kvm_enable_efer_bits(EFER_SVME); 477 kvm_enable_efer_bits(EFER_SVME);
476 } 478 }
477 479
478 for_each_online_cpu(cpu) { 480 for_each_possible_cpu(cpu) {
479 r = svm_cpu_init(cpu); 481 r = svm_cpu_init(cpu);
480 if (r) 482 if (r)
481 goto err; 483 goto err;
@@ -509,7 +511,7 @@ static __exit void svm_hardware_unsetup(void)
509{ 511{
510 int cpu; 512 int cpu;
511 513
512 for_each_online_cpu(cpu) 514 for_each_possible_cpu(cpu)
513 svm_cpu_uninit(cpu); 515 svm_cpu_uninit(cpu);
514 516
515 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 517 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -624,11 +626,12 @@ static void init_vmcb(struct vcpu_svm *svm)
624 save->rip = 0x0000fff0; 626 save->rip = 0x0000fff0;
625 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 627 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
626 628
627 /* 629 /* This is the guest-visible cr0 value.
628 * cr0 val on cpu init should be 0x60000010, we enable cpu 630 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
629 * cache by default. the orderly way is to enable cache in bios.
630 */ 631 */
631 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 632 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
633 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
634
632 save->cr4 = X86_CR4_PAE; 635 save->cr4 = X86_CR4_PAE;
633 /* rdx = ?? */ 636 /* rdx = ?? */
634 637
@@ -643,8 +646,6 @@ static void init_vmcb(struct vcpu_svm *svm)
643 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| 646 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
644 INTERCEPT_CR3_MASK); 647 INTERCEPT_CR3_MASK);
645 save->g_pat = 0x0007040600070406ULL; 648 save->g_pat = 0x0007040600070406ULL;
646 /* enable caching because the QEMU Bios doesn't enable it */
647 save->cr0 = X86_CR0_ET;
648 save->cr3 = 0; 649 save->cr3 = 0;
649 save->cr4 = 0; 650 save->cr4 = 0;
650 } 651 }
@@ -653,6 +654,11 @@ static void init_vmcb(struct vcpu_svm *svm)
653 svm->nested.vmcb = 0; 654 svm->nested.vmcb = 0;
654 svm->vcpu.arch.hflags = 0; 655 svm->vcpu.arch.hflags = 0;
655 656
657 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
658 control->pause_filter_count = 3000;
659 control->intercept |= (1ULL << INTERCEPT_PAUSE);
660 }
661
656 enable_gif(svm); 662 enable_gif(svm);
657} 663}
658 664
@@ -757,15 +763,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
757 int i; 763 int i;
758 764
759 if (unlikely(cpu != vcpu->cpu)) { 765 if (unlikely(cpu != vcpu->cpu)) {
760 u64 tsc_this, delta; 766 u64 delta;
761 767
762 /* 768 /*
763 * Make sure that the guest sees a monotonically 769 * Make sure that the guest sees a monotonically
764 * increasing TSC. 770 * increasing TSC.
765 */ 771 */
766 rdtscll(tsc_this); 772 delta = vcpu->arch.host_tsc - native_read_tsc();
767 delta = vcpu->arch.host_tsc - tsc_this;
768 svm->vmcb->control.tsc_offset += delta; 773 svm->vmcb->control.tsc_offset += delta;
774 if (is_nested(svm))
775 svm->nested.hsave->control.tsc_offset += delta;
769 vcpu->cpu = cpu; 776 vcpu->cpu = cpu;
770 kvm_migrate_timers(vcpu); 777 kvm_migrate_timers(vcpu);
771 svm->asid_generation = 0; 778 svm->asid_generation = 0;
@@ -784,7 +791,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
784 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 791 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
785 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 792 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
786 793
787 rdtscll(vcpu->arch.host_tsc); 794 vcpu->arch.host_tsc = native_read_tsc();
788} 795}
789 796
790static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 797static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1042,7 +1049,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1042 svm->vmcb->control.intercept_exceptions &= 1049 svm->vmcb->control.intercept_exceptions &=
1043 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1050 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1044 1051
1045 if (vcpu->arch.singlestep) 1052 if (svm->nmi_singlestep)
1046 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1053 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1047 1054
1048 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1055 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1057,26 +1064,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1057 vcpu->guest_debug = 0; 1064 vcpu->guest_debug = 0;
1058} 1065}
1059 1066
1060static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1067static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1061{ 1068{
1062 int old_debug = vcpu->guest_debug;
1063 struct vcpu_svm *svm = to_svm(vcpu); 1069 struct vcpu_svm *svm = to_svm(vcpu);
1064 1070
1065 vcpu->guest_debug = dbg->control;
1066
1067 update_db_intercept(vcpu);
1068
1069 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1071 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1070 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1072 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1071 else 1073 else
1072 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1074 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1073 1075
1074 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 1076 update_db_intercept(vcpu);
1075 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1076 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1077 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1078
1079 return 0;
1080} 1077}
1081 1078
1082static void load_host_msrs(struct kvm_vcpu *vcpu) 1079static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1177,7 +1174,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1177 } 1174 }
1178} 1175}
1179 1176
1180static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1177static int pf_interception(struct vcpu_svm *svm)
1181{ 1178{
1182 u64 fault_address; 1179 u64 fault_address;
1183 u32 error_code; 1180 u32 error_code;
@@ -1191,17 +1188,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1191 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1188 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1192} 1189}
1193 1190
1194static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1191static int db_interception(struct vcpu_svm *svm)
1195{ 1192{
1193 struct kvm_run *kvm_run = svm->vcpu.run;
1194
1196 if (!(svm->vcpu.guest_debug & 1195 if (!(svm->vcpu.guest_debug &
1197 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1196 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1198 !svm->vcpu.arch.singlestep) { 1197 !svm->nmi_singlestep) {
1199 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1198 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1200 return 1; 1199 return 1;
1201 } 1200 }
1202 1201
1203 if (svm->vcpu.arch.singlestep) { 1202 if (svm->nmi_singlestep) {
1204 svm->vcpu.arch.singlestep = false; 1203 svm->nmi_singlestep = false;
1205 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1204 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1206 svm->vmcb->save.rflags &= 1205 svm->vmcb->save.rflags &=
1207 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1206 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1220,25 +1219,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1220 return 1; 1219 return 1;
1221} 1220}
1222 1221
1223static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1222static int bp_interception(struct vcpu_svm *svm)
1224{ 1223{
1224 struct kvm_run *kvm_run = svm->vcpu.run;
1225
1225 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1226 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1226 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1227 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1227 kvm_run->debug.arch.exception = BP_VECTOR; 1228 kvm_run->debug.arch.exception = BP_VECTOR;
1228 return 0; 1229 return 0;
1229} 1230}
1230 1231
1231static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1232static int ud_interception(struct vcpu_svm *svm)
1232{ 1233{
1233 int er; 1234 int er;
1234 1235
1235 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 1236 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
1236 if (er != EMULATE_DONE) 1237 if (er != EMULATE_DONE)
1237 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1238 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1238 return 1; 1239 return 1;
1239} 1240}
1240 1241
1241static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1242static int nm_interception(struct vcpu_svm *svm)
1242{ 1243{
1243 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1244 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1244 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) 1245 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1248,7 +1249,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1248 return 1; 1249 return 1;
1249} 1250}
1250 1251
1251static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1252static int mc_interception(struct vcpu_svm *svm)
1252{ 1253{
1253 /* 1254 /*
1254 * On an #MC intercept the MCE handler is not called automatically in 1255 * On an #MC intercept the MCE handler is not called automatically in
@@ -1261,8 +1262,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1261 return 1; 1262 return 1;
1262} 1263}
1263 1264
1264static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1265static int shutdown_interception(struct vcpu_svm *svm)
1265{ 1266{
1267 struct kvm_run *kvm_run = svm->vcpu.run;
1268
1266 /* 1269 /*
1267 * VMCB is undefined after a SHUTDOWN intercept 1270 * VMCB is undefined after a SHUTDOWN intercept
1268 * so reinitialize it. 1271 * so reinitialize it.
@@ -1274,7 +1277,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1274 return 0; 1277 return 0;
1275} 1278}
1276 1279
1277static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1280static int io_interception(struct vcpu_svm *svm)
1278{ 1281{
1279 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1280 int size, in, string; 1283 int size, in, string;
@@ -1288,7 +1291,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1288 1291
1289 if (string) { 1292 if (string) {
1290 if (emulate_instruction(&svm->vcpu, 1293 if (emulate_instruction(&svm->vcpu,
1291 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) 1294 0, 0, 0) == EMULATE_DO_MMIO)
1292 return 0; 1295 return 0;
1293 return 1; 1296 return 1;
1294 } 1297 }
@@ -1298,33 +1301,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1298 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1299 1302
1300 skip_emulated_instruction(&svm->vcpu); 1303 skip_emulated_instruction(&svm->vcpu);
1301 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1304 return kvm_emulate_pio(&svm->vcpu, in, size, port);
1302} 1305}
1303 1306
1304static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1307static int nmi_interception(struct vcpu_svm *svm)
1305{ 1308{
1306 return 1; 1309 return 1;
1307} 1310}
1308 1311
1309static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1312static int intr_interception(struct vcpu_svm *svm)
1310{ 1313{
1311 ++svm->vcpu.stat.irq_exits; 1314 ++svm->vcpu.stat.irq_exits;
1312 return 1; 1315 return 1;
1313} 1316}
1314 1317
1315static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1318static int nop_on_interception(struct vcpu_svm *svm)
1316{ 1319{
1317 return 1; 1320 return 1;
1318} 1321}
1319 1322
1320static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1323static int halt_interception(struct vcpu_svm *svm)
1321{ 1324{
1322 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1323 skip_emulated_instruction(&svm->vcpu); 1326 skip_emulated_instruction(&svm->vcpu);
1324 return kvm_emulate_halt(&svm->vcpu); 1327 return kvm_emulate_halt(&svm->vcpu);
1325} 1328}
1326 1329
1327static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1330static int vmmcall_interception(struct vcpu_svm *svm)
1328{ 1331{
1329 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1330 skip_emulated_instruction(&svm->vcpu); 1333 skip_emulated_instruction(&svm->vcpu);
@@ -1375,8 +1378,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1375 1378
1376 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1377 1380
1378 if (nested_svm_exit_handled(svm)) { 1381 if (svm->nested.intercept & 1ULL) {
1379 nsvm_printk("VMexit -> INTR\n"); 1382 /*
1383 * The #vmexit can't be emulated here directly because this
1384 * code path runs with irqs and preemtion disabled. A
1385 * #vmexit emulation might sleep. Only signal request for
1386 * the #vmexit here.
1387 */
1388 svm->nested.exit_required = true;
1389 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1380 return 1; 1390 return 1;
1381 } 1391 }
1382 1392
@@ -1387,10 +1397,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1387{ 1397{
1388 struct page *page; 1398 struct page *page;
1389 1399
1390 down_read(&current->mm->mmap_sem);
1391 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1400 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1392 up_read(&current->mm->mmap_sem);
1393
1394 if (is_error_page(page)) 1401 if (is_error_page(page))
1395 goto error; 1402 goto error;
1396 1403
@@ -1529,14 +1536,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1529 } 1536 }
1530 default: { 1537 default: {
1531 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1538 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1532 nsvm_printk("exit code: 0x%x\n", exit_code);
1533 if (svm->nested.intercept & exit_bits) 1539 if (svm->nested.intercept & exit_bits)
1534 vmexit = NESTED_EXIT_DONE; 1540 vmexit = NESTED_EXIT_DONE;
1535 } 1541 }
1536 } 1542 }
1537 1543
1538 if (vmexit == NESTED_EXIT_DONE) { 1544 if (vmexit == NESTED_EXIT_DONE) {
1539 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1540 nested_svm_vmexit(svm); 1545 nested_svm_vmexit(svm);
1541 } 1546 }
1542 1547
@@ -1581,6 +1586,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1581 struct vmcb *hsave = svm->nested.hsave; 1586 struct vmcb *hsave = svm->nested.hsave;
1582 struct vmcb *vmcb = svm->vmcb; 1587 struct vmcb *vmcb = svm->vmcb;
1583 1588
1589 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1590 vmcb->control.exit_info_1,
1591 vmcb->control.exit_info_2,
1592 vmcb->control.exit_int_info,
1593 vmcb->control.exit_int_info_err);
1594
1584 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1595 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1585 if (!nested_vmcb) 1596 if (!nested_vmcb)
1586 return 1; 1597 return 1;
@@ -1614,6 +1625,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1614 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 1625 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1615 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 1626 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1616 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 1627 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1628
1629 /*
1630 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
1631 * to make sure that we do not lose injected events. So check event_inj
1632 * here and copy it to exit_int_info if it is valid.
1633 * Exit_int_info and event_inj can't be both valid because the case
1634 * below only happens on a VMRUN instruction intercept which has
1635 * no valid exit_int_info set.
1636 */
1637 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
1638 struct vmcb_control_area *nc = &nested_vmcb->control;
1639
1640 nc->exit_int_info = vmcb->control.event_inj;
1641 nc->exit_int_info_err = vmcb->control.event_inj_err;
1642 }
1643
1617 nested_vmcb->control.tlb_ctl = 0; 1644 nested_vmcb->control.tlb_ctl = 0;
1618 nested_vmcb->control.event_inj = 0; 1645 nested_vmcb->control.event_inj = 0;
1619 nested_vmcb->control.event_inj_err = 0; 1646 nested_vmcb->control.event_inj_err = 0;
@@ -1625,10 +1652,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1625 /* Restore the original control entries */ 1652 /* Restore the original control entries */
1626 copy_vmcb_control_area(vmcb, hsave); 1653 copy_vmcb_control_area(vmcb, hsave);
1627 1654
1628 /* Kill any pending exceptions */
1629 if (svm->vcpu.arch.exception.pending == true)
1630 nsvm_printk("WARNING: Pending Exception\n");
1631
1632 kvm_clear_exception_queue(&svm->vcpu); 1655 kvm_clear_exception_queue(&svm->vcpu);
1633 kvm_clear_interrupt_queue(&svm->vcpu); 1656 kvm_clear_interrupt_queue(&svm->vcpu);
1634 1657
@@ -1699,6 +1722,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1699 /* nested_vmcb is our indicator if nested SVM is activated */ 1722 /* nested_vmcb is our indicator if nested SVM is activated */
1700 svm->nested.vmcb = svm->vmcb->save.rax; 1723 svm->nested.vmcb = svm->vmcb->save.rax;
1701 1724
1725 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1726 nested_vmcb->save.rip,
1727 nested_vmcb->control.int_ctl,
1728 nested_vmcb->control.event_inj,
1729 nested_vmcb->control.nested_ctl);
1730
1702 /* Clear internal status */ 1731 /* Clear internal status */
1703 kvm_clear_exception_queue(&svm->vcpu); 1732 kvm_clear_exception_queue(&svm->vcpu);
1704 kvm_clear_interrupt_queue(&svm->vcpu); 1733 kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1786,28 +1815,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1786 svm->nested.intercept = nested_vmcb->control.intercept; 1815 svm->nested.intercept = nested_vmcb->control.intercept;
1787 1816
1788 force_new_asid(&svm->vcpu); 1817 force_new_asid(&svm->vcpu);
1789 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1790 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1791 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 1818 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1792 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1793 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1794 nested_vmcb->control.int_ctl);
1795 }
1796 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 1819 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1797 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 1820 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1798 else 1821 else
1799 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 1822 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1800 1823
1801 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1802 nested_vmcb->control.exit_int_info,
1803 nested_vmcb->control.int_state);
1804
1805 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 1824 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1806 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 1825 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1807 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 1826 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1808 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1809 nsvm_printk("Injecting Event: 0x%x\n",
1810 nested_vmcb->control.event_inj);
1811 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1827 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1812 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1828 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1813 1829
@@ -1834,7 +1850,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1834 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1850 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1835} 1851}
1836 1852
1837static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1853static int vmload_interception(struct vcpu_svm *svm)
1838{ 1854{
1839 struct vmcb *nested_vmcb; 1855 struct vmcb *nested_vmcb;
1840 1856
@@ -1854,7 +1870,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1854 return 1; 1870 return 1;
1855} 1871}
1856 1872
1857static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1873static int vmsave_interception(struct vcpu_svm *svm)
1858{ 1874{
1859 struct vmcb *nested_vmcb; 1875 struct vmcb *nested_vmcb;
1860 1876
@@ -1874,10 +1890,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1874 return 1; 1890 return 1;
1875} 1891}
1876 1892
1877static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1893static int vmrun_interception(struct vcpu_svm *svm)
1878{ 1894{
1879 nsvm_printk("VMrun\n");
1880
1881 if (nested_svm_check_permissions(svm)) 1895 if (nested_svm_check_permissions(svm))
1882 return 1; 1896 return 1;
1883 1897
@@ -1904,7 +1918,7 @@ failed:
1904 return 1; 1918 return 1;
1905} 1919}
1906 1920
1907static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1921static int stgi_interception(struct vcpu_svm *svm)
1908{ 1922{
1909 if (nested_svm_check_permissions(svm)) 1923 if (nested_svm_check_permissions(svm))
1910 return 1; 1924 return 1;
@@ -1917,7 +1931,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1917 return 1; 1931 return 1;
1918} 1932}
1919 1933
1920static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1934static int clgi_interception(struct vcpu_svm *svm)
1921{ 1935{
1922 if (nested_svm_check_permissions(svm)) 1936 if (nested_svm_check_permissions(svm))
1923 return 1; 1937 return 1;
@@ -1934,10 +1948,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1934 return 1; 1948 return 1;
1935} 1949}
1936 1950
1937static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1951static int invlpga_interception(struct vcpu_svm *svm)
1938{ 1952{
1939 struct kvm_vcpu *vcpu = &svm->vcpu; 1953 struct kvm_vcpu *vcpu = &svm->vcpu;
1940 nsvm_printk("INVLPGA\n"); 1954
1955 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
1956 vcpu->arch.regs[VCPU_REGS_RAX]);
1941 1957
1942 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 1958 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1943 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 1959 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1947,15 +1963,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1947 return 1; 1963 return 1;
1948} 1964}
1949 1965
1950static int invalid_op_interception(struct vcpu_svm *svm, 1966static int skinit_interception(struct vcpu_svm *svm)
1951 struct kvm_run *kvm_run)
1952{ 1967{
1968 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
1969
1953 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1970 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1954 return 1; 1971 return 1;
1955} 1972}
1956 1973
1957static int task_switch_interception(struct vcpu_svm *svm, 1974static int invalid_op_interception(struct vcpu_svm *svm)
1958 struct kvm_run *kvm_run) 1975{
1976 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1977 return 1;
1978}
1979
1980static int task_switch_interception(struct vcpu_svm *svm)
1959{ 1981{
1960 u16 tss_selector; 1982 u16 tss_selector;
1961 int reason; 1983 int reason;
@@ -2005,14 +2027,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
2005 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2027 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
2006} 2028}
2007 2029
2008static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2030static int cpuid_interception(struct vcpu_svm *svm)
2009{ 2031{
2010 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2032 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2011 kvm_emulate_cpuid(&svm->vcpu); 2033 kvm_emulate_cpuid(&svm->vcpu);
2012 return 1; 2034 return 1;
2013} 2035}
2014 2036
2015static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2037static int iret_interception(struct vcpu_svm *svm)
2016{ 2038{
2017 ++svm->vcpu.stat.nmi_window_exits; 2039 ++svm->vcpu.stat.nmi_window_exits;
2018 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2040 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2020,26 +2042,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2020 return 1; 2042 return 1;
2021} 2043}
2022 2044
2023static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2045static int invlpg_interception(struct vcpu_svm *svm)
2024{ 2046{
2025 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 2047 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2026 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2048 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2027 return 1; 2049 return 1;
2028} 2050}
2029 2051
2030static int emulate_on_interception(struct vcpu_svm *svm, 2052static int emulate_on_interception(struct vcpu_svm *svm)
2031 struct kvm_run *kvm_run)
2032{ 2053{
2033 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 2054 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2034 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2055 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2035 return 1; 2056 return 1;
2036} 2057}
2037 2058
2038static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2059static int cr8_write_interception(struct vcpu_svm *svm)
2039{ 2060{
2061 struct kvm_run *kvm_run = svm->vcpu.run;
2062
2040 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2063 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2041 /* instruction emulation calls kvm_set_cr8() */ 2064 /* instruction emulation calls kvm_set_cr8() */
2042 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 2065 emulate_instruction(&svm->vcpu, 0, 0, 0);
2043 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2066 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2044 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2067 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2045 return 1; 2068 return 1;
@@ -2056,10 +2079,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2056 2079
2057 switch (ecx) { 2080 switch (ecx) {
2058 case MSR_IA32_TSC: { 2081 case MSR_IA32_TSC: {
2059 u64 tsc; 2082 u64 tsc_offset;
2060 2083
2061 rdtscll(tsc); 2084 if (is_nested(svm))
2062 *data = svm->vmcb->control.tsc_offset + tsc; 2085 tsc_offset = svm->nested.hsave->control.tsc_offset;
2086 else
2087 tsc_offset = svm->vmcb->control.tsc_offset;
2088
2089 *data = tsc_offset + native_read_tsc();
2063 break; 2090 break;
2064 } 2091 }
2065 case MSR_K6_STAR: 2092 case MSR_K6_STAR:
@@ -2121,7 +2148,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2121 return 0; 2148 return 0;
2122} 2149}
2123 2150
2124static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2151static int rdmsr_interception(struct vcpu_svm *svm)
2125{ 2152{
2126 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2153 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2127 u64 data; 2154 u64 data;
@@ -2145,10 +2172,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2145 2172
2146 switch (ecx) { 2173 switch (ecx) {
2147 case MSR_IA32_TSC: { 2174 case MSR_IA32_TSC: {
2148 u64 tsc; 2175 u64 tsc_offset = data - native_read_tsc();
2176 u64 g_tsc_offset = 0;
2177
2178 if (is_nested(svm)) {
2179 g_tsc_offset = svm->vmcb->control.tsc_offset -
2180 svm->nested.hsave->control.tsc_offset;
2181 svm->nested.hsave->control.tsc_offset = tsc_offset;
2182 }
2183
2184 svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
2149 2185
2150 rdtscll(tsc);
2151 svm->vmcb->control.tsc_offset = data - tsc;
2152 break; 2186 break;
2153 } 2187 }
2154 case MSR_K6_STAR: 2188 case MSR_K6_STAR:
@@ -2207,7 +2241,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2207 return 0; 2241 return 0;
2208} 2242}
2209 2243
2210static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2244static int wrmsr_interception(struct vcpu_svm *svm)
2211{ 2245{
2212 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2246 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2213 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2247 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2223,17 +2257,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2223 return 1; 2257 return 1;
2224} 2258}
2225 2259
2226static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2260static int msr_interception(struct vcpu_svm *svm)
2227{ 2261{
2228 if (svm->vmcb->control.exit_info_1) 2262 if (svm->vmcb->control.exit_info_1)
2229 return wrmsr_interception(svm, kvm_run); 2263 return wrmsr_interception(svm);
2230 else 2264 else
2231 return rdmsr_interception(svm, kvm_run); 2265 return rdmsr_interception(svm);
2232} 2266}
2233 2267
2234static int interrupt_window_interception(struct vcpu_svm *svm, 2268static int interrupt_window_interception(struct vcpu_svm *svm)
2235 struct kvm_run *kvm_run)
2236{ 2269{
2270 struct kvm_run *kvm_run = svm->vcpu.run;
2271
2237 svm_clear_vintr(svm); 2272 svm_clear_vintr(svm);
2238 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2273 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2239 /* 2274 /*
@@ -2251,8 +2286,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2251 return 1; 2286 return 1;
2252} 2287}
2253 2288
2254static int (*svm_exit_handlers[])(struct vcpu_svm *svm, 2289static int pause_interception(struct vcpu_svm *svm)
2255 struct kvm_run *kvm_run) = { 2290{
2291 kvm_vcpu_on_spin(&(svm->vcpu));
2292 return 1;
2293}
2294
2295static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2256 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2296 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2257 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2297 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2258 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2298 [SVM_EXIT_READ_CR4] = emulate_on_interception,
@@ -2287,6 +2327,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2287 [SVM_EXIT_CPUID] = cpuid_interception, 2327 [SVM_EXIT_CPUID] = cpuid_interception,
2288 [SVM_EXIT_IRET] = iret_interception, 2328 [SVM_EXIT_IRET] = iret_interception,
2289 [SVM_EXIT_INVD] = emulate_on_interception, 2329 [SVM_EXIT_INVD] = emulate_on_interception,
2330 [SVM_EXIT_PAUSE] = pause_interception,
2290 [SVM_EXIT_HLT] = halt_interception, 2331 [SVM_EXIT_HLT] = halt_interception,
2291 [SVM_EXIT_INVLPG] = invlpg_interception, 2332 [SVM_EXIT_INVLPG] = invlpg_interception,
2292 [SVM_EXIT_INVLPGA] = invlpga_interception, 2333 [SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2300,26 +2341,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2300 [SVM_EXIT_VMSAVE] = vmsave_interception, 2341 [SVM_EXIT_VMSAVE] = vmsave_interception,
2301 [SVM_EXIT_STGI] = stgi_interception, 2342 [SVM_EXIT_STGI] = stgi_interception,
2302 [SVM_EXIT_CLGI] = clgi_interception, 2343 [SVM_EXIT_CLGI] = clgi_interception,
2303 [SVM_EXIT_SKINIT] = invalid_op_interception, 2344 [SVM_EXIT_SKINIT] = skinit_interception,
2304 [SVM_EXIT_WBINVD] = emulate_on_interception, 2345 [SVM_EXIT_WBINVD] = emulate_on_interception,
2305 [SVM_EXIT_MONITOR] = invalid_op_interception, 2346 [SVM_EXIT_MONITOR] = invalid_op_interception,
2306 [SVM_EXIT_MWAIT] = invalid_op_interception, 2347 [SVM_EXIT_MWAIT] = invalid_op_interception,
2307 [SVM_EXIT_NPF] = pf_interception, 2348 [SVM_EXIT_NPF] = pf_interception,
2308}; 2349};
2309 2350
2310static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2351static int handle_exit(struct kvm_vcpu *vcpu)
2311{ 2352{
2312 struct vcpu_svm *svm = to_svm(vcpu); 2353 struct vcpu_svm *svm = to_svm(vcpu);
2354 struct kvm_run *kvm_run = vcpu->run;
2313 u32 exit_code = svm->vmcb->control.exit_code; 2355 u32 exit_code = svm->vmcb->control.exit_code;
2314 2356
2315 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2357 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2316 2358
2359 if (unlikely(svm->nested.exit_required)) {
2360 nested_svm_vmexit(svm);
2361 svm->nested.exit_required = false;
2362
2363 return 1;
2364 }
2365
2317 if (is_nested(svm)) { 2366 if (is_nested(svm)) {
2318 int vmexit; 2367 int vmexit;
2319 2368
2320 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2369 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
2321 exit_code, svm->vmcb->control.exit_info_1, 2370 svm->vmcb->control.exit_info_1,
2322 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2371 svm->vmcb->control.exit_info_2,
2372 svm->vmcb->control.exit_int_info,
2373 svm->vmcb->control.exit_int_info_err);
2323 2374
2324 vmexit = nested_svm_exit_special(svm); 2375 vmexit = nested_svm_exit_special(svm);
2325 2376
@@ -2369,7 +2420,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2369 return 0; 2420 return 0;
2370 } 2421 }
2371 2422
2372 return svm_exit_handlers[exit_code](svm, kvm_run); 2423 return svm_exit_handlers[exit_code](svm);
2373} 2424}
2374 2425
2375static void reload_tss(struct kvm_vcpu *vcpu) 2426static void reload_tss(struct kvm_vcpu *vcpu)
@@ -2446,20 +2497,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2446 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2497 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2447} 2498}
2448 2499
2500static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
2501{
2502 struct vcpu_svm *svm = to_svm(vcpu);
2503
2504 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
2505}
2506
2507static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2508{
2509 struct vcpu_svm *svm = to_svm(vcpu);
2510
2511 if (masked) {
2512 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2513 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2514 } else {
2515 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2516 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
2517 }
2518}
2519
2449static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 2520static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2450{ 2521{
2451 struct vcpu_svm *svm = to_svm(vcpu); 2522 struct vcpu_svm *svm = to_svm(vcpu);
2452 struct vmcb *vmcb = svm->vmcb; 2523 struct vmcb *vmcb = svm->vmcb;
2453 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2524 int ret;
2454 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2525
2455 gif_set(svm) && 2526 if (!gif_set(svm) ||
2456 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); 2527 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2528 return 0;
2529
2530 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
2531
2532 if (is_nested(svm))
2533 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
2534
2535 return ret;
2457} 2536}
2458 2537
2459static void enable_irq_window(struct kvm_vcpu *vcpu) 2538static void enable_irq_window(struct kvm_vcpu *vcpu)
2460{ 2539{
2461 struct vcpu_svm *svm = to_svm(vcpu); 2540 struct vcpu_svm *svm = to_svm(vcpu);
2462 nsvm_printk("Trying to open IRQ window\n");
2463 2541
2464 nested_svm_intr(svm); 2542 nested_svm_intr(svm);
2465 2543
@@ -2484,7 +2562,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2484 /* Something prevents NMI from been injected. Single step over 2562 /* Something prevents NMI from been injected. Single step over
2485 possible problem (IRET or exception injection or interrupt 2563 possible problem (IRET or exception injection or interrupt
2486 shadow) */ 2564 shadow) */
2487 vcpu->arch.singlestep = true; 2565 svm->nmi_singlestep = true;
2488 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2566 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2489 update_db_intercept(vcpu); 2567 update_db_intercept(vcpu);
2490} 2568}
@@ -2574,13 +2652,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2574#define R "e" 2652#define R "e"
2575#endif 2653#endif
2576 2654
2577static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2655static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2578{ 2656{
2579 struct vcpu_svm *svm = to_svm(vcpu); 2657 struct vcpu_svm *svm = to_svm(vcpu);
2580 u16 fs_selector; 2658 u16 fs_selector;
2581 u16 gs_selector; 2659 u16 gs_selector;
2582 u16 ldt_selector; 2660 u16 ldt_selector;
2583 2661
2662 /*
2663 * A vmexit emulation is required before the vcpu can be executed
2664 * again.
2665 */
2666 if (unlikely(svm->nested.exit_required))
2667 return;
2668
2584 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2669 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2585 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2670 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2586 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 2671 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2879,6 +2964,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2879 .queue_exception = svm_queue_exception, 2964 .queue_exception = svm_queue_exception,
2880 .interrupt_allowed = svm_interrupt_allowed, 2965 .interrupt_allowed = svm_interrupt_allowed,
2881 .nmi_allowed = svm_nmi_allowed, 2966 .nmi_allowed = svm_nmi_allowed,
2967 .get_nmi_mask = svm_get_nmi_mask,
2968 .set_nmi_mask = svm_set_nmi_mask,
2882 .enable_nmi_window = enable_nmi_window, 2969 .enable_nmi_window = enable_nmi_window,
2883 .enable_irq_window = enable_irq_window, 2970 .enable_irq_window = enable_irq_window,
2884 .update_cr8_intercept = update_cr8_intercept, 2971 .update_cr8_intercept = update_cr8_intercept,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..816e0449db0b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -349,6 +349,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
349 __entry->coalesced ? " (coalesced)" : "") 349 __entry->coalesced ? " (coalesced)" : "")
350); 350);
351 351
352/*
353 * Tracepoint for nested VMRUN
354 */
355TRACE_EVENT(kvm_nested_vmrun,
356 TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
357 __u32 event_inj, bool npt),
358 TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
359
360 TP_STRUCT__entry(
361 __field( __u64, rip )
362 __field( __u64, vmcb )
363 __field( __u64, nested_rip )
364 __field( __u32, int_ctl )
365 __field( __u32, event_inj )
366 __field( bool, npt )
367 ),
368
369 TP_fast_assign(
370 __entry->rip = rip;
371 __entry->vmcb = vmcb;
372 __entry->nested_rip = nested_rip;
373 __entry->int_ctl = int_ctl;
374 __entry->event_inj = event_inj;
375 __entry->npt = npt;
376 ),
377
378 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
379 "event_inj: 0x%08x npt: %s\n",
380 __entry->rip, __entry->vmcb, __entry->nested_rip,
381 __entry->int_ctl, __entry->event_inj,
382 __entry->npt ? "on" : "off")
383);
384
385/*
386 * Tracepoint for #VMEXIT while nested
387 */
388TRACE_EVENT(kvm_nested_vmexit,
389 TP_PROTO(__u64 rip, __u32 exit_code,
390 __u64 exit_info1, __u64 exit_info2,
391 __u32 exit_int_info, __u32 exit_int_info_err),
392 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
393 exit_int_info, exit_int_info_err),
394
395 TP_STRUCT__entry(
396 __field( __u64, rip )
397 __field( __u32, exit_code )
398 __field( __u64, exit_info1 )
399 __field( __u64, exit_info2 )
400 __field( __u32, exit_int_info )
401 __field( __u32, exit_int_info_err )
402 ),
403
404 TP_fast_assign(
405 __entry->rip = rip;
406 __entry->exit_code = exit_code;
407 __entry->exit_info1 = exit_info1;
408 __entry->exit_info2 = exit_info2;
409 __entry->exit_int_info = exit_int_info;
410 __entry->exit_int_info_err = exit_int_info_err;
411 ),
412 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
413 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
414 __entry->rip,
415 ftrace_print_symbols_seq(p, __entry->exit_code,
416 kvm_x86_ops->exit_reasons_str),
417 __entry->exit_info1, __entry->exit_info2,
418 __entry->exit_int_info, __entry->exit_int_info_err)
419);
420
421/*
422 * Tracepoint for #VMEXIT reinjected to the guest
423 */
424TRACE_EVENT(kvm_nested_vmexit_inject,
425 TP_PROTO(__u32 exit_code,
426 __u64 exit_info1, __u64 exit_info2,
427 __u32 exit_int_info, __u32 exit_int_info_err),
428 TP_ARGS(exit_code, exit_info1, exit_info2,
429 exit_int_info, exit_int_info_err),
430
431 TP_STRUCT__entry(
432 __field( __u32, exit_code )
433 __field( __u64, exit_info1 )
434 __field( __u64, exit_info2 )
435 __field( __u32, exit_int_info )
436 __field( __u32, exit_int_info_err )
437 ),
438
439 TP_fast_assign(
440 __entry->exit_code = exit_code;
441 __entry->exit_info1 = exit_info1;
442 __entry->exit_info2 = exit_info2;
443 __entry->exit_int_info = exit_int_info;
444 __entry->exit_int_info_err = exit_int_info_err;
445 ),
446
447 TP_printk("reason: %s ext_inf1: 0x%016llx "
448 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
449 ftrace_print_symbols_seq(p, __entry->exit_code,
450 kvm_x86_ops->exit_reasons_str),
451 __entry->exit_info1, __entry->exit_info2,
452 __entry->exit_int_info, __entry->exit_int_info_err)
453);
454
455/*
456 * Tracepoint for nested #vmexit because of interrupt pending
457 */
458TRACE_EVENT(kvm_nested_intr_vmexit,
459 TP_PROTO(__u64 rip),
460 TP_ARGS(rip),
461
462 TP_STRUCT__entry(
463 __field( __u64, rip )
464 ),
465
466 TP_fast_assign(
467 __entry->rip = rip
468 ),
469
470 TP_printk("rip: 0x%016llx\n", __entry->rip)
471);
472
473/*
474 * Tracepoint for nested #vmexit because of interrupt pending
475 */
476TRACE_EVENT(kvm_invlpga,
477 TP_PROTO(__u64 rip, int asid, u64 address),
478 TP_ARGS(rip, asid, address),
479
480 TP_STRUCT__entry(
481 __field( __u64, rip )
482 __field( int, asid )
483 __field( __u64, address )
484 ),
485
486 TP_fast_assign(
487 __entry->rip = rip;
488 __entry->asid = asid;
489 __entry->address = address;
490 ),
491
492 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
493 __entry->rip, __entry->asid, __entry->address)
494);
495
496/*
497 * Tracepoint for nested #vmexit because of interrupt pending
498 */
499TRACE_EVENT(kvm_skinit,
500 TP_PROTO(__u64 rip, __u32 slb),
501 TP_ARGS(rip, slb),
502
503 TP_STRUCT__entry(
504 __field( __u64, rip )
505 __field( __u32, slb )
506 ),
507
508 TP_fast_assign(
509 __entry->rip = rip;
510 __entry->slb = slb;
511 ),
512
513 TP_printk("rip: 0x%016llx slb: 0x%08x\n",
514 __entry->rip, __entry->slb)
515);
516
352#endif /* _TRACE_KVM_H */ 517#endif /* _TRACE_KVM_H */
353 518
354/* This part must be outside protection */ 519/* This part must be outside protection */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f3812014bd0b..d4918d6fc924 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,12 +61,37 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 63
64/*
65 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive
67 * executions of PAUSE in a loop. Also indicate if ple enabled.
68 * According to test, this time is usually small than 41 cycles.
69 * ple_window: upper bound on the amount of time a guest is allowed to execute
70 * in a PAUSE loop. Tests indicate that most spinlocks are held for
71 * less than 2^12 cycles
72 * Time is measured based on a counter that runs at the same rate as the TSC,
73 * refer SDM volume 3b section 21.6.13 & 22.1.3.
74 */
75#define KVM_VMX_DEFAULT_PLE_GAP 41
76#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
77static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
78module_param(ple_gap, int, S_IRUGO);
79
80static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
81module_param(ple_window, int, S_IRUGO);
82
64struct vmcs { 83struct vmcs {
65 u32 revision_id; 84 u32 revision_id;
66 u32 abort; 85 u32 abort;
67 char data[0]; 86 char data[0];
68}; 87};
69 88
89struct shared_msr_entry {
90 unsigned index;
91 u64 data;
92 u64 mask;
93};
94
70struct vcpu_vmx { 95struct vcpu_vmx {
71 struct kvm_vcpu vcpu; 96 struct kvm_vcpu vcpu;
72 struct list_head local_vcpus_link; 97 struct list_head local_vcpus_link;
@@ -74,13 +99,12 @@ struct vcpu_vmx {
74 int launched; 99 int launched;
75 u8 fail; 100 u8 fail;
76 u32 idt_vectoring_info; 101 u32 idt_vectoring_info;
77 struct kvm_msr_entry *guest_msrs; 102 struct shared_msr_entry *guest_msrs;
78 struct kvm_msr_entry *host_msrs;
79 int nmsrs; 103 int nmsrs;
80 int save_nmsrs; 104 int save_nmsrs;
81 int msr_offset_efer;
82#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
83 int msr_offset_kernel_gs_base; 106 u64 msr_host_kernel_gs_base;
107 u64 msr_guest_kernel_gs_base;
84#endif 108#endif
85 struct vmcs *vmcs; 109 struct vmcs *vmcs;
86 struct { 110 struct {
@@ -88,7 +112,6 @@ struct vcpu_vmx {
88 u16 fs_sel, gs_sel, ldt_sel; 112 u16 fs_sel, gs_sel, ldt_sel;
89 int gs_ldt_reload_needed; 113 int gs_ldt_reload_needed;
90 int fs_reload_needed; 114 int fs_reload_needed;
91 int guest_efer_loaded;
92 } host_state; 115 } host_state;
93 struct { 116 struct {
94 int vm86_active; 117 int vm86_active;
@@ -107,7 +130,6 @@ struct vcpu_vmx {
107 } rmode; 130 } rmode;
108 int vpid; 131 int vpid;
109 bool emulation_required; 132 bool emulation_required;
110 enum emulation_result invalid_state_emulation_result;
111 133
112 /* Support for vnmi-less CPUs */ 134 /* Support for vnmi-less CPUs */
113 int soft_vnmi_blocked; 135 int soft_vnmi_blocked;
@@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field {
176 VMX_SEGMENT_FIELD(LDTR), 198 VMX_SEGMENT_FIELD(LDTR),
177}; 199};
178 200
201static u64 host_efer;
202
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 203static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180 204
181/* 205/*
@@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
184 */ 208 */
185static const u32 vmx_msr_index[] = { 209static const u32 vmx_msr_index[] = {
186#ifdef CONFIG_X86_64 210#ifdef CONFIG_X86_64
187 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
188#endif 212#endif
189 MSR_EFER, MSR_K6_STAR, 213 MSR_EFER, MSR_K6_STAR,
190}; 214};
191#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
192 216
193static void load_msrs(struct kvm_msr_entry *e, int n)
194{
195 int i;
196
197 for (i = 0; i < n; ++i)
198 wrmsrl(e[i].index, e[i].data);
199}
200
201static void save_msrs(struct kvm_msr_entry *e, int n)
202{
203 int i;
204
205 for (i = 0; i < n; ++i)
206 rdmsrl(e[i].index, e[i].data);
207}
208
209static inline int is_page_fault(u32 intr_info) 217static inline int is_page_fault(u32 intr_info)
210{ 218{
211 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 219 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
320 SECONDARY_EXEC_UNRESTRICTED_GUEST; 328 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321} 329}
322 330
331static inline int cpu_has_vmx_ple(void)
332{
333 return vmcs_config.cpu_based_2nd_exec_ctrl &
334 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
335}
336
323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
324{ 338{
325 return flexpriority_enabled && 339 return flexpriority_enabled &&
@@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
348 int i; 362 int i;
349 363
350 for (i = 0; i < vmx->nmsrs; ++i) 364 for (i = 0; i < vmx->nmsrs; ++i)
351 if (vmx->guest_msrs[i].index == msr) 365 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
352 return i; 366 return i;
353 return -1; 367 return -1;
354} 368}
@@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
379 : : "a" (&operand), "c" (ext) : "cc", "memory"); 393 : : "a" (&operand), "c" (ext) : "cc", "memory");
380} 394}
381 395
382static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 396static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
383{ 397{
384 int i; 398 int i;
385 399
@@ -570,17 +584,12 @@ static void reload_tss(void)
570 load_TR_desc(); 584 load_TR_desc();
571} 585}
572 586
573static void load_transition_efer(struct vcpu_vmx *vmx) 587static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
574{ 588{
575 int efer_offset = vmx->msr_offset_efer;
576 u64 host_efer;
577 u64 guest_efer; 589 u64 guest_efer;
578 u64 ignore_bits; 590 u64 ignore_bits;
579 591
580 if (efer_offset < 0) 592 guest_efer = vmx->vcpu.arch.shadow_efer;
581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584 593
585 /* 594 /*
586 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 595 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
593 if (guest_efer & EFER_LMA) 602 if (guest_efer & EFER_LMA)
594 ignore_bits &= ~(u64)EFER_SCE; 603 ignore_bits &= ~(u64)EFER_SCE;
595#endif 604#endif
596 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
597 return;
598
599 vmx->host_state.guest_efer_loaded = 1;
600 guest_efer &= ~ignore_bits; 605 guest_efer &= ~ignore_bits;
601 guest_efer |= host_efer & ignore_bits; 606 guest_efer |= host_efer & ignore_bits;
602 wrmsrl(MSR_EFER, guest_efer); 607 vmx->guest_msrs[efer_offset].data = guest_efer;
603 vmx->vcpu.stat.efer_reload++; 608 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
604} 609 return true;
605
606static void reload_host_efer(struct vcpu_vmx *vmx)
607{
608 if (vmx->host_state.guest_efer_loaded) {
609 vmx->host_state.guest_efer_loaded = 0;
610 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
611 }
612} 610}
613 611
614static void vmx_save_host_state(struct kvm_vcpu *vcpu) 612static void vmx_save_host_state(struct kvm_vcpu *vcpu)
615{ 613{
616 struct vcpu_vmx *vmx = to_vmx(vcpu); 614 struct vcpu_vmx *vmx = to_vmx(vcpu);
615 int i;
617 616
618 if (vmx->host_state.loaded) 617 if (vmx->host_state.loaded)
619 return; 618 return;
@@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
650#endif 649#endif
651 650
652#ifdef CONFIG_X86_64 651#ifdef CONFIG_X86_64
653 if (is_long_mode(&vmx->vcpu)) 652 if (is_long_mode(&vmx->vcpu)) {
654 save_msrs(vmx->host_msrs + 653 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
655 vmx->msr_offset_kernel_gs_base, 1); 654 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
656 655 }
657#endif 656#endif
658 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 657 for (i = 0; i < vmx->save_nmsrs; ++i)
659 load_transition_efer(vmx); 658 kvm_set_shared_msr(vmx->guest_msrs[i].index,
659 vmx->guest_msrs[i].data,
660 vmx->guest_msrs[i].mask);
660} 661}
661 662
662static void __vmx_load_host_state(struct vcpu_vmx *vmx) 663static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
684 local_irq_restore(flags); 685 local_irq_restore(flags);
685 } 686 }
686 reload_tss(); 687 reload_tss();
687 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 688#ifdef CONFIG_X86_64
688 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 689 if (is_long_mode(&vmx->vcpu)) {
689 reload_host_efer(vmx); 690 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
691 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
692 }
693#endif
690} 694}
691 695
692static void vmx_load_host_state(struct vcpu_vmx *vmx) 696static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -709,7 +713,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
709 if (vcpu->cpu != cpu) { 713 if (vcpu->cpu != cpu) {
710 vcpu_clear(vmx); 714 vcpu_clear(vmx);
711 kvm_migrate_timers(vcpu); 715 kvm_migrate_timers(vcpu);
712 vpid_sync_vcpu_all(vmx); 716 set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
713 local_irq_disable(); 717 local_irq_disable();
714 list_add(&vmx->local_vcpus_link, 718 list_add(&vmx->local_vcpus_link,
715 &per_cpu(vcpus_on_cpu, cpu)); 719 &per_cpu(vcpus_on_cpu, cpu));
@@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
877/* 881/*
878 * Swap MSR entry in host/guest MSR entry array. 882 * Swap MSR entry in host/guest MSR entry array.
879 */ 883 */
880#ifdef CONFIG_X86_64
881static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 884static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
882{ 885{
883 struct kvm_msr_entry tmp; 886 struct shared_msr_entry tmp;
884 887
885 tmp = vmx->guest_msrs[to]; 888 tmp = vmx->guest_msrs[to];
886 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 889 vmx->guest_msrs[to] = vmx->guest_msrs[from];
887 vmx->guest_msrs[from] = tmp; 890 vmx->guest_msrs[from] = tmp;
888 tmp = vmx->host_msrs[to];
889 vmx->host_msrs[to] = vmx->host_msrs[from];
890 vmx->host_msrs[from] = tmp;
891} 891}
892#endif
893 892
894/* 893/*
895 * Set up the vmcs to automatically save and restore system 894 * Set up the vmcs to automatically save and restore system
@@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
898 */ 897 */
899static void setup_msrs(struct vcpu_vmx *vmx) 898static void setup_msrs(struct vcpu_vmx *vmx)
900{ 899{
901 int save_nmsrs; 900 int save_nmsrs, index;
902 unsigned long *msr_bitmap; 901 unsigned long *msr_bitmap;
903 902
904 vmx_load_host_state(vmx); 903 vmx_load_host_state(vmx);
905 save_nmsrs = 0; 904 save_nmsrs = 0;
906#ifdef CONFIG_X86_64 905#ifdef CONFIG_X86_64
907 if (is_long_mode(&vmx->vcpu)) { 906 if (is_long_mode(&vmx->vcpu)) {
908 int index;
909
910 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 907 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
911 if (index >= 0) 908 if (index >= 0)
912 move_msr_up(vmx, index, save_nmsrs++); 909 move_msr_up(vmx, index, save_nmsrs++);
@@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
916 index = __find_msr_index(vmx, MSR_CSTAR); 913 index = __find_msr_index(vmx, MSR_CSTAR);
917 if (index >= 0) 914 if (index >= 0)
918 move_msr_up(vmx, index, save_nmsrs++); 915 move_msr_up(vmx, index, save_nmsrs++);
919 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
920 if (index >= 0)
921 move_msr_up(vmx, index, save_nmsrs++);
922 /* 916 /*
923 * MSR_K6_STAR is only needed on long mode guests, and only 917 * MSR_K6_STAR is only needed on long mode guests, and only
924 * if efer.sce is enabled. 918 * if efer.sce is enabled.
@@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
928 move_msr_up(vmx, index, save_nmsrs++); 922 move_msr_up(vmx, index, save_nmsrs++);
929 } 923 }
930#endif 924#endif
931 vmx->save_nmsrs = save_nmsrs; 925 index = __find_msr_index(vmx, MSR_EFER);
926 if (index >= 0 && update_transition_efer(vmx, index))
927 move_msr_up(vmx, index, save_nmsrs++);
932 928
933#ifdef CONFIG_X86_64 929 vmx->save_nmsrs = save_nmsrs;
934 vmx->msr_offset_kernel_gs_base =
935 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
936#endif
937 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
938 930
939 if (cpu_has_vmx_msr_bitmap()) { 931 if (cpu_has_vmx_msr_bitmap()) {
940 if (is_long_mode(&vmx->vcpu)) 932 if (is_long_mode(&vmx->vcpu))
@@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
976static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 968static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
977{ 969{
978 u64 data; 970 u64 data;
979 struct kvm_msr_entry *msr; 971 struct shared_msr_entry *msr;
980 972
981 if (!pdata) { 973 if (!pdata) {
982 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 974 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
991 case MSR_GS_BASE: 983 case MSR_GS_BASE:
992 data = vmcs_readl(GUEST_GS_BASE); 984 data = vmcs_readl(GUEST_GS_BASE);
993 break; 985 break;
986 case MSR_KERNEL_GS_BASE:
987 vmx_load_host_state(to_vmx(vcpu));
988 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
989 break;
990#endif
994 case MSR_EFER: 991 case MSR_EFER:
995 return kvm_get_msr_common(vcpu, msr_index, pdata); 992 return kvm_get_msr_common(vcpu, msr_index, pdata);
996#endif
997 case MSR_IA32_TSC: 993 case MSR_IA32_TSC:
998 data = guest_read_tsc(); 994 data = guest_read_tsc();
999 break; 995 break;
@@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1007 data = vmcs_readl(GUEST_SYSENTER_ESP); 1003 data = vmcs_readl(GUEST_SYSENTER_ESP);
1008 break; 1004 break;
1009 default: 1005 default:
1006 vmx_load_host_state(to_vmx(vcpu));
1010 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1007 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1011 if (msr) { 1008 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu)); 1009 vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1025static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1026{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1027 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1028 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1029 u64 host_tsc;
1033 int ret = 0; 1030 int ret = 0;
1034 1031
@@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1041 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1042 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1043 break;
1044 case MSR_KERNEL_GS_BASE:
1045 vmx_load_host_state(vmx);
1046 vmx->msr_guest_kernel_gs_base = data;
1047 break;
1047#endif 1048#endif
1048 case MSR_IA32_SYSENTER_CS: 1049 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1050 vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1097 } 1098 }
1098} 1099}
1099 1100
1100static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1101static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1101{ 1102{
1102 int old_debug = vcpu->guest_debug;
1103 unsigned long flags;
1104
1105 vcpu->guest_debug = dbg->control;
1106 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1107 vcpu->guest_debug = 0;
1108
1109 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1103 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1110 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); 1104 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1111 else 1105 else
1112 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 1106 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1113 1107
1114 flags = vmcs_readl(GUEST_RFLAGS);
1115 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1116 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1117 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1118 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1119 vmcs_writel(GUEST_RFLAGS, flags);
1120
1121 update_exception_bitmap(vcpu); 1108 update_exception_bitmap(vcpu);
1122
1123 return 0;
1124} 1109}
1125 1110
1126static __init int cpu_has_kvm_support(void) 1111static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void)
1139 /* locked but not enabled */ 1124 /* locked but not enabled */
1140} 1125}
1141 1126
1142static void hardware_enable(void *garbage) 1127static int hardware_enable(void *garbage)
1143{ 1128{
1144 int cpu = raw_smp_processor_id(); 1129 int cpu = raw_smp_processor_id();
1145 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1130 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1146 u64 old; 1131 u64 old;
1147 1132
1133 if (read_cr4() & X86_CR4_VMXE)
1134 return -EBUSY;
1135
1148 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1136 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1149 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1137 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1150 if ((old & (FEATURE_CONTROL_LOCKED | 1138 if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage)
1159 asm volatile (ASM_VMX_VMXON_RAX 1147 asm volatile (ASM_VMX_VMXON_RAX
1160 : : "a"(&phys_addr), "m"(phys_addr) 1148 : : "a"(&phys_addr), "m"(phys_addr)
1161 : "memory", "cc"); 1149 : "memory", "cc");
1150
1151 ept_sync_global();
1152
1153 return 0;
1162} 1154}
1163 1155
1164static void vmclear_local_vcpus(void) 1156static void vmclear_local_vcpus(void)
@@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1250 SECONDARY_EXEC_WBINVD_EXITING | 1242 SECONDARY_EXEC_WBINVD_EXITING |
1251 SECONDARY_EXEC_ENABLE_VPID | 1243 SECONDARY_EXEC_ENABLE_VPID |
1252 SECONDARY_EXEC_ENABLE_EPT | 1244 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1245 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1254 if (adjust_vmx_controls(min2, opt2, 1247 if (adjust_vmx_controls(min2, opt2,
1255 MSR_IA32_VMX_PROCBASED_CTLS2, 1248 MSR_IA32_VMX_PROCBASED_CTLS2,
1256 &_cpu_based_2nd_exec_control) < 0) 1249 &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1337,17 @@ static void free_kvm_area(void)
1344{ 1337{
1345 int cpu; 1338 int cpu;
1346 1339
1347 for_each_online_cpu(cpu) 1340 for_each_possible_cpu(cpu) {
1348 free_vmcs(per_cpu(vmxarea, cpu)); 1341 free_vmcs(per_cpu(vmxarea, cpu));
1342 per_cpu(vmxarea, cpu) = NULL;
1343 }
1349} 1344}
1350 1345
1351static __init int alloc_kvm_area(void) 1346static __init int alloc_kvm_area(void)
1352{ 1347{
1353 int cpu; 1348 int cpu;
1354 1349
1355 for_each_online_cpu(cpu) { 1350 for_each_possible_cpu(cpu) {
1356 struct vmcs *vmcs; 1351 struct vmcs *vmcs;
1357 1352
1358 vmcs = alloc_vmcs_cpu(cpu); 1353 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1389,9 @@ static __init int hardware_setup(void)
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 1389 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages(); 1390 kvm_disable_largepages();
1396 1391
1392 if (!cpu_has_vmx_ple())
1393 ple_gap = 0;
1394
1397 return alloc_kvm_area(); 1395 return alloc_kvm_area();
1398} 1396}
1399 1397
@@ -1536,8 +1534,16 @@ continue_rmode:
1536static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1534static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1537{ 1535{
1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1536 struct vcpu_vmx *vmx = to_vmx(vcpu);
1539 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1537 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1538
1539 if (!msr)
1540 return;
1540 1541
1542 /*
1543 * Force kernel_gs_base reloading before EFER changes, as control
1544 * of this msr depends on is_long_mode().
1545 */
1546 vmx_load_host_state(to_vmx(vcpu));
1541 vcpu->arch.shadow_efer = efer; 1547 vcpu->arch.shadow_efer = efer;
1542 if (!msr) 1548 if (!msr)
1543 return; 1549 return;
@@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1727 vmcs_write64(EPT_POINTER, eptp); 1733 vmcs_write64(EPT_POINTER, eptp);
1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1734 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1729 vcpu->kvm->arch.ept_identity_map_addr; 1735 vcpu->kvm->arch.ept_identity_map_addr;
1736 ept_load_pdptrs(vcpu);
1730 } 1737 }
1731 1738
1732 vmx_flush_tlb(vcpu); 1739 vmx_flush_tlb(vcpu);
@@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2302 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2309 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2303 if (vmx->vpid == 0) 2310 if (vmx->vpid == 0)
2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2311 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2305 if (!enable_ept) 2312 if (!enable_ept) {
2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2313 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2314 enable_unrestricted_guest = 0;
2315 }
2307 if (!enable_unrestricted_guest) 2316 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2317 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2318 if (!ple_gap)
2319 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2320 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2310 } 2321 }
2311 2322
2323 if (ple_gap) {
2324 vmcs_write32(PLE_GAP, ple_gap);
2325 vmcs_write32(PLE_WINDOW, ple_window);
2326 }
2327
2312 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 2328 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2313 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2329 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2314 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2330 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2376 if (wrmsr_safe(index, data_low, data_high) < 0) 2392 if (wrmsr_safe(index, data_low, data_high) < 0)
2377 continue; 2393 continue;
2378 data = data_low | ((u64)data_high << 32); 2394 data = data_low | ((u64)data_high << 32);
2379 vmx->host_msrs[j].index = index; 2395 vmx->guest_msrs[j].index = i;
2380 vmx->host_msrs[j].reserved = 0; 2396 vmx->guest_msrs[j].data = 0;
2381 vmx->host_msrs[j].data = data; 2397 vmx->guest_msrs[j].mask = -1ull;
2382 vmx->guest_msrs[j] = vmx->host_msrs[j];
2383 ++vmx->nmsrs; 2398 ++vmx->nmsrs;
2384 } 2399 }
2385 2400
@@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2510 if (vmx->vpid != 0) 2525 if (vmx->vpid != 0)
2511 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2512 2527
2513 vmx->vcpu.arch.cr0 = 0x60000010; 2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2514 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2515 vmx_set_cr4(&vmx->vcpu, 0); 2530 vmx_set_cr4(&vmx->vcpu, 0);
2516 vmx_set_efer(&vmx->vcpu, 0); 2531 vmx_set_efer(&vmx->vcpu, 0);
@@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2627 GUEST_INTR_STATE_NMI)); 2642 GUEST_INTR_STATE_NMI));
2628} 2643}
2629 2644
2645static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2646{
2647 if (!cpu_has_virtual_nmis())
2648 return to_vmx(vcpu)->soft_vnmi_blocked;
2649 else
2650 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2651 GUEST_INTR_STATE_NMI);
2652}
2653
2654static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2655{
2656 struct vcpu_vmx *vmx = to_vmx(vcpu);
2657
2658 if (!cpu_has_virtual_nmis()) {
2659 if (vmx->soft_vnmi_blocked != masked) {
2660 vmx->soft_vnmi_blocked = masked;
2661 vmx->vnmi_blocked_time = 0;
2662 }
2663 } else {
2664 if (masked)
2665 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2666 GUEST_INTR_STATE_NMI);
2667 else
2668 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2669 GUEST_INTR_STATE_NMI);
2670 }
2671}
2672
2630static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 2673static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2631{ 2674{
2632 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 2675 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2659 * Cause the #SS fault with 0 error code in VM86 mode. 2702 * Cause the #SS fault with 0 error code in VM86 mode.
2660 */ 2703 */
2661 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2704 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2662 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2705 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
2663 return 1; 2706 return 1;
2664 /* 2707 /*
2665 * Forward all other exceptions that are valid in real mode. 2708 * Forward all other exceptions that are valid in real mode.
@@ -2710,15 +2753,16 @@ static void kvm_machine_check(void)
2710#endif 2753#endif
2711} 2754}
2712 2755
2713static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2756static int handle_machine_check(struct kvm_vcpu *vcpu)
2714{ 2757{
2715 /* already handled by vcpu_run */ 2758 /* already handled by vcpu_run */
2716 return 1; 2759 return 1;
2717} 2760}
2718 2761
2719static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2762static int handle_exception(struct kvm_vcpu *vcpu)
2720{ 2763{
2721 struct vcpu_vmx *vmx = to_vmx(vcpu); 2764 struct vcpu_vmx *vmx = to_vmx(vcpu);
2765 struct kvm_run *kvm_run = vcpu->run;
2722 u32 intr_info, ex_no, error_code; 2766 u32 intr_info, ex_no, error_code;
2723 unsigned long cr2, rip, dr6; 2767 unsigned long cr2, rip, dr6;
2724 u32 vect_info; 2768 u32 vect_info;
@@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2728 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2772 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2729 2773
2730 if (is_machine_check(intr_info)) 2774 if (is_machine_check(intr_info))
2731 return handle_machine_check(vcpu, kvm_run); 2775 return handle_machine_check(vcpu);
2732 2776
2733 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2777 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2734 !is_page_fault(intr_info)) 2778 !is_page_fault(intr_info)) {
2735 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2779 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2736 "intr info 0x%x\n", __func__, vect_info, intr_info); 2780 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2781 vcpu->run->internal.ndata = 2;
2782 vcpu->run->internal.data[0] = vect_info;
2783 vcpu->run->internal.data[1] = intr_info;
2784 return 0;
2785 }
2737 2786
2738 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2787 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2739 return 1; /* already handled by vmx_vcpu_run() */ 2788 return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 } 2793 }
2745 2794
2746 if (is_invalid_opcode(intr_info)) { 2795 if (is_invalid_opcode(intr_info)) {
2747 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 2796 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
2748 if (er != EMULATE_DONE) 2797 if (er != EMULATE_DONE)
2749 kvm_queue_exception(vcpu, UD_VECTOR); 2798 kvm_queue_exception(vcpu, UD_VECTOR);
2750 return 1; 2799 return 1;
@@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2803 return 0; 2852 return 0;
2804} 2853}
2805 2854
2806static int handle_external_interrupt(struct kvm_vcpu *vcpu, 2855static int handle_external_interrupt(struct kvm_vcpu *vcpu)
2807 struct kvm_run *kvm_run)
2808{ 2856{
2809 ++vcpu->stat.irq_exits; 2857 ++vcpu->stat.irq_exits;
2810 return 1; 2858 return 1;
2811} 2859}
2812 2860
2813static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2861static int handle_triple_fault(struct kvm_vcpu *vcpu)
2814{ 2862{
2815 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2863 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2816 return 0; 2864 return 0;
2817} 2865}
2818 2866
2819static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2867static int handle_io(struct kvm_vcpu *vcpu)
2820{ 2868{
2821 unsigned long exit_qualification; 2869 unsigned long exit_qualification;
2822 int size, in, string; 2870 int size, in, string;
@@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2827 string = (exit_qualification & 16) != 0; 2875 string = (exit_qualification & 16) != 0;
2828 2876
2829 if (string) { 2877 if (string) {
2830 if (emulate_instruction(vcpu, 2878 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2831 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2832 return 0; 2879 return 0;
2833 return 1; 2880 return 1;
2834 } 2881 }
@@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 port = exit_qualification >> 16; 2885 port = exit_qualification >> 16;
2839 2886
2840 skip_emulated_instruction(vcpu); 2887 skip_emulated_instruction(vcpu);
2841 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2888 return kvm_emulate_pio(vcpu, in, size, port);
2842} 2889}
2843 2890
2844static void 2891static void
@@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2852 hypercall[2] = 0xc1; 2899 hypercall[2] = 0xc1;
2853} 2900}
2854 2901
2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2902static int handle_cr(struct kvm_vcpu *vcpu)
2856{ 2903{
2857 unsigned long exit_qualification, val; 2904 unsigned long exit_qualification, val;
2858 int cr; 2905 int cr;
@@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2887 return 1; 2934 return 1;
2888 if (cr8_prev <= cr8) 2935 if (cr8_prev <= cr8)
2889 return 1; 2936 return 1;
2890 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2937 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2891 return 0; 2938 return 0;
2892 } 2939 }
2893 }; 2940 };
@@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2922 default: 2969 default:
2923 break; 2970 break;
2924 } 2971 }
2925 kvm_run->exit_reason = 0; 2972 vcpu->run->exit_reason = 0;
2926 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 2973 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2927 (int)(exit_qualification >> 4) & 3, cr); 2974 (int)(exit_qualification >> 4) & 3, cr);
2928 return 0; 2975 return 0;
2929} 2976}
2930 2977
2931static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2978static int handle_dr(struct kvm_vcpu *vcpu)
2932{ 2979{
2933 unsigned long exit_qualification; 2980 unsigned long exit_qualification;
2934 unsigned long val; 2981 unsigned long val;
@@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2944 * guest debugging itself. 2991 * guest debugging itself.
2945 */ 2992 */
2946 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 2993 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2947 kvm_run->debug.arch.dr6 = vcpu->arch.dr6; 2994 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
2948 kvm_run->debug.arch.dr7 = dr; 2995 vcpu->run->debug.arch.dr7 = dr;
2949 kvm_run->debug.arch.pc = 2996 vcpu->run->debug.arch.pc =
2950 vmcs_readl(GUEST_CS_BASE) + 2997 vmcs_readl(GUEST_CS_BASE) +
2951 vmcs_readl(GUEST_RIP); 2998 vmcs_readl(GUEST_RIP);
2952 kvm_run->debug.arch.exception = DB_VECTOR; 2999 vcpu->run->debug.arch.exception = DB_VECTOR;
2953 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3000 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
2954 return 0; 3001 return 0;
2955 } else { 3002 } else {
2956 vcpu->arch.dr7 &= ~DR7_GD; 3003 vcpu->arch.dr7 &= ~DR7_GD;
@@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016 return 1; 3063 return 1;
3017} 3064}
3018 3065
3019static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3066static int handle_cpuid(struct kvm_vcpu *vcpu)
3020{ 3067{
3021 kvm_emulate_cpuid(vcpu); 3068 kvm_emulate_cpuid(vcpu);
3022 return 1; 3069 return 1;
3023} 3070}
3024 3071
3025static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3072static int handle_rdmsr(struct kvm_vcpu *vcpu)
3026{ 3073{
3027 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3074 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3028 u64 data; 3075 u64 data;
@@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3041 return 1; 3088 return 1;
3042} 3089}
3043 3090
3044static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3091static int handle_wrmsr(struct kvm_vcpu *vcpu)
3045{ 3092{
3046 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3093 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
@@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3058 return 1; 3105 return 1;
3059} 3106}
3060 3107
3061static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, 3108static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3062 struct kvm_run *kvm_run)
3063{ 3109{
3064 return 1; 3110 return 1;
3065} 3111}
3066 3112
3067static int handle_interrupt_window(struct kvm_vcpu *vcpu, 3113static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3068 struct kvm_run *kvm_run)
3069{ 3114{
3070 u32 cpu_based_vm_exec_control; 3115 u32 cpu_based_vm_exec_control;
3071 3116
@@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3081 * possible 3126 * possible
3082 */ 3127 */
3083 if (!irqchip_in_kernel(vcpu->kvm) && 3128 if (!irqchip_in_kernel(vcpu->kvm) &&
3084 kvm_run->request_interrupt_window && 3129 vcpu->run->request_interrupt_window &&
3085 !kvm_cpu_has_interrupt(vcpu)) { 3130 !kvm_cpu_has_interrupt(vcpu)) {
3086 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3131 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3087 return 0; 3132 return 0;
3088 } 3133 }
3089 return 1; 3134 return 1;
3090} 3135}
3091 3136
3092static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3137static int handle_halt(struct kvm_vcpu *vcpu)
3093{ 3138{
3094 skip_emulated_instruction(vcpu); 3139 skip_emulated_instruction(vcpu);
3095 return kvm_emulate_halt(vcpu); 3140 return kvm_emulate_halt(vcpu);
3096} 3141}
3097 3142
3098static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3143static int handle_vmcall(struct kvm_vcpu *vcpu)
3099{ 3144{
3100 skip_emulated_instruction(vcpu); 3145 skip_emulated_instruction(vcpu);
3101 kvm_emulate_hypercall(vcpu); 3146 kvm_emulate_hypercall(vcpu);
3102 return 1; 3147 return 1;
3103} 3148}
3104 3149
3105static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3150static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3106{ 3151{
3107 kvm_queue_exception(vcpu, UD_VECTOR); 3152 kvm_queue_exception(vcpu, UD_VECTOR);
3108 return 1; 3153 return 1;
3109} 3154}
3110 3155
3111static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3156static int handle_invlpg(struct kvm_vcpu *vcpu)
3112{ 3157{
3113 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3158 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3114 3159
@@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3117 return 1; 3162 return 1;
3118} 3163}
3119 3164
3120static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3165static int handle_wbinvd(struct kvm_vcpu *vcpu)
3121{ 3166{
3122 skip_emulated_instruction(vcpu); 3167 skip_emulated_instruction(vcpu);
3123 /* TODO: Add support for VT-d/pass-through device */ 3168 /* TODO: Add support for VT-d/pass-through device */
3124 return 1; 3169 return 1;
3125} 3170}
3126 3171
3127static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3172static int handle_apic_access(struct kvm_vcpu *vcpu)
3128{ 3173{
3129 unsigned long exit_qualification; 3174 unsigned long exit_qualification;
3130 enum emulation_result er; 3175 enum emulation_result er;
@@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3133 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3178 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3134 offset = exit_qualification & 0xffful; 3179 offset = exit_qualification & 0xffful;
3135 3180
3136 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3181 er = emulate_instruction(vcpu, 0, 0, 0);
3137 3182
3138 if (er != EMULATE_DONE) { 3183 if (er != EMULATE_DONE) {
3139 printk(KERN_ERR 3184 printk(KERN_ERR
@@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3144 return 1; 3189 return 1;
3145} 3190}
3146 3191
3147static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3192static int handle_task_switch(struct kvm_vcpu *vcpu)
3148{ 3193{
3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3194 struct vcpu_vmx *vmx = to_vmx(vcpu);
3150 unsigned long exit_qualification; 3195 unsigned long exit_qualification;
@@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3198 return 1; 3243 return 1;
3199} 3244}
3200 3245
3201static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3246static int handle_ept_violation(struct kvm_vcpu *vcpu)
3202{ 3247{
3203 unsigned long exit_qualification; 3248 unsigned long exit_qualification;
3204 gpa_t gpa; 3249 gpa_t gpa;
@@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3219 vmcs_readl(GUEST_LINEAR_ADDRESS)); 3264 vmcs_readl(GUEST_LINEAR_ADDRESS));
3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3265 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3221 (long unsigned int)exit_qualification); 3266 (long unsigned int)exit_qualification);
3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3267 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 3268 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3224 return 0; 3269 return 0;
3225 } 3270 }
3226 3271
@@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3290 } 3335 }
3291} 3336}
3292 3337
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3338static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3294{ 3339{
3295 u64 sptes[4]; 3340 u64 sptes[4];
3296 int nr_sptes, i; 3341 int nr_sptes, i;
@@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 3351 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 3352 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308 3353
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3354 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 3355 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311 3356
3312 return 0; 3357 return 0;
3313} 3358}
3314 3359
3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3360static int handle_nmi_window(struct kvm_vcpu *vcpu)
3316{ 3361{
3317 u32 cpu_based_vm_exec_control; 3362 u32 cpu_based_vm_exec_control;
3318 3363
@@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3325 return 1; 3370 return 1;
3326} 3371}
3327 3372
3328static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, 3373static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3329 struct kvm_run *kvm_run)
3330{ 3374{
3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3375 struct vcpu_vmx *vmx = to_vmx(vcpu);
3332 enum emulation_result err = EMULATE_DONE; 3376 enum emulation_result err = EMULATE_DONE;
3333 3377 int ret = 1;
3334 local_irq_enable();
3335 preempt_enable();
3336 3378
3337 while (!guest_state_valid(vcpu)) { 3379 while (!guest_state_valid(vcpu)) {
3338 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3380 err = emulate_instruction(vcpu, 0, 0, 0);
3339 3381
3340 if (err == EMULATE_DO_MMIO) 3382 if (err == EMULATE_DO_MMIO) {
3341 break; 3383 ret = 0;
3384 goto out;
3385 }
3342 3386
3343 if (err != EMULATE_DONE) { 3387 if (err != EMULATE_DONE) {
3344 kvm_report_emulation_failure(vcpu, "emulation failure"); 3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3345 break; 3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0;
3392 ret = 0;
3393 goto out;
3346 } 3394 }
3347 3395
3348 if (signal_pending(current)) 3396 if (signal_pending(current))
3349 break; 3397 goto out;
3350 if (need_resched()) 3398 if (need_resched())
3351 schedule(); 3399 schedule();
3352 } 3400 }
3353 3401
3354 preempt_disable(); 3402 vmx->emulation_required = 0;
3355 local_irq_disable(); 3403out:
3404 return ret;
3405}
3356 3406
3357 vmx->invalid_state_emulation_result = err; 3407/*
3408 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
3409 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
3410 */
3411static int handle_pause(struct kvm_vcpu *vcpu)
3412{
3413 skip_emulated_instruction(vcpu);
3414 kvm_vcpu_on_spin(vcpu);
3415
3416 return 1;
3358} 3417}
3359 3418
3360/* 3419/*
@@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3362 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3363 * to be done to userspace and return 0. 3422 * to be done to userspace and return 0.
3364 */ 3423 */
3365static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 3424static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3366 struct kvm_run *kvm_run) = {
3367 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 3425 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3368 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 3426 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
3369 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 3427 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3452 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3397}; 3456};
3398 3457
3399static const int kvm_vmx_max_exit_handlers = 3458static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers =
3403 * The guest has exited. See if we can fix it or if we need userspace 3462 * The guest has exited. See if we can fix it or if we need userspace
3404 * assistance. 3463 * assistance.
3405 */ 3464 */
3406static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3465static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3407{ 3466{
3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3467 struct vcpu_vmx *vmx = to_vmx(vcpu);
3409 u32 exit_reason = vmx->exit_reason; 3468 u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3411 3470
3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3471 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3413 3472
3414 /* If we need to emulate an MMIO from handle_invalid_guest_state 3473 /* If guest state is invalid, start emulating */
3415 * we just return 0 */ 3474 if (vmx->emulation_required && emulate_invalid_guest_state)
3416 if (vmx->emulation_required && emulate_invalid_guest_state) { 3475 return handle_invalid_guest_state(vcpu);
3417 if (guest_state_valid(vcpu))
3418 vmx->emulation_required = 0;
3419 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3420 }
3421 3476
3422 /* Access CR3 don't cause VMExit in paging mode, so we need 3477 /* Access CR3 don't cause VMExit in paging mode, so we need
3423 * to sync with guest real CR3. */ 3478 * to sync with guest real CR3. */
@@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3480 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3426 3481
3427 if (unlikely(vmx->fail)) { 3482 if (unlikely(vmx->fail)) {
3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3483 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3429 kvm_run->fail_entry.hardware_entry_failure_reason 3484 vcpu->run->fail_entry.hardware_entry_failure_reason
3430 = vmcs_read32(VM_INSTRUCTION_ERROR); 3485 = vmcs_read32(VM_INSTRUCTION_ERROR);
3431 return 0; 3486 return 0;
3432 } 3487 }
@@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3459 3514
3460 if (exit_reason < kvm_vmx_max_exit_handlers 3515 if (exit_reason < kvm_vmx_max_exit_handlers
3461 && kvm_vmx_exit_handlers[exit_reason]) 3516 && kvm_vmx_exit_handlers[exit_reason])
3462 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3517 return kvm_vmx_exit_handlers[exit_reason](vcpu);
3463 else { 3518 else {
3464 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3519 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3465 kvm_run->hw.hardware_exit_reason = exit_reason; 3520 vcpu->run->hw.hardware_exit_reason = exit_reason;
3466 } 3521 }
3467 return 0; 3522 return 0;
3468} 3523}
@@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3600#define Q "l" 3655#define Q "l"
3601#endif 3656#endif
3602 3657
3603static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3658static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3604{ 3659{
3605 struct vcpu_vmx *vmx = to_vmx(vcpu); 3660 struct vcpu_vmx *vmx = to_vmx(vcpu);
3606 3661
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3611 /* Record the guest's net vcpu time for enforced NMI injections. */ 3662 /* Record the guest's net vcpu time for enforced NMI injections. */
3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3663 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3613 vmx->entry_time = ktime_get(); 3664 vmx->entry_time = ktime_get();
3614 3665
3615 /* Handle invalid guest state instead of entering VMX */ 3666 /* Don't enter VMX if guest state is invalid, let the exit handler
3616 if (vmx->emulation_required && emulate_invalid_guest_state) { 3667 start emulation until we arrive back to a valid state */
3617 handle_invalid_guest_state(vcpu, kvm_run); 3668 if (vmx->emulation_required && emulate_invalid_guest_state)
3618 return; 3669 return;
3619 }
3620 3670
3621 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 3671 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3622 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 3672 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3775 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 3825 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3776 spin_unlock(&vmx_vpid_lock); 3826 spin_unlock(&vmx_vpid_lock);
3777 vmx_free_vmcs(vcpu); 3827 vmx_free_vmcs(vcpu);
3778 kfree(vmx->host_msrs);
3779 kfree(vmx->guest_msrs); 3828 kfree(vmx->guest_msrs);
3780 kvm_vcpu_uninit(vcpu); 3829 kvm_vcpu_uninit(vcpu);
3781 kmem_cache_free(kvm_vcpu_cache, vmx); 3830 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3802 goto uninit_vcpu; 3851 goto uninit_vcpu;
3803 } 3852 }
3804 3853
3805 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3806 if (!vmx->host_msrs)
3807 goto free_guest_msrs;
3808
3809 vmx->vmcs = alloc_vmcs(); 3854 vmx->vmcs = alloc_vmcs();
3810 if (!vmx->vmcs) 3855 if (!vmx->vmcs)
3811 goto free_msrs; 3856 goto free_msrs;
@@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3836free_vmcs: 3881free_vmcs:
3837 free_vmcs(vmx->vmcs); 3882 free_vmcs(vmx->vmcs);
3838free_msrs: 3883free_msrs:
3839 kfree(vmx->host_msrs);
3840free_guest_msrs:
3841 kfree(vmx->guest_msrs); 3884 kfree(vmx->guest_msrs);
3842uninit_vcpu: 3885uninit_vcpu:
3843 kvm_vcpu_uninit(&vmx->vcpu); 3886 kvm_vcpu_uninit(&vmx->vcpu);
@@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3973 .queue_exception = vmx_queue_exception, 4016 .queue_exception = vmx_queue_exception,
3974 .interrupt_allowed = vmx_interrupt_allowed, 4017 .interrupt_allowed = vmx_interrupt_allowed,
3975 .nmi_allowed = vmx_nmi_allowed, 4018 .nmi_allowed = vmx_nmi_allowed,
4019 .get_nmi_mask = vmx_get_nmi_mask,
4020 .set_nmi_mask = vmx_set_nmi_mask,
3976 .enable_nmi_window = enable_nmi_window, 4021 .enable_nmi_window = enable_nmi_window,
3977 .enable_irq_window = enable_irq_window, 4022 .enable_irq_window = enable_irq_window,
3978 .update_cr8_intercept = update_cr8_intercept, 4023 .update_cr8_intercept = update_cr8_intercept,
@@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
3987 4032
3988static int __init vmx_init(void) 4033static int __init vmx_init(void)
3989{ 4034{
3990 int r; 4035 int r, i;
4036
4037 rdmsrl_safe(MSR_EFER, &host_efer);
4038
4039 for (i = 0; i < NR_VMX_MSR; ++i)
4040 kvm_define_shared_msr(i, vmx_msr_index[i]);
3991 4041
3992 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 4042 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3993 if (!vmx_io_bitmap_a) 4043 if (!vmx_io_bitmap_a)
@@ -4049,8 +4099,6 @@ static int __init vmx_init(void)
4049 if (bypass_guest_pf) 4099 if (bypass_guest_pf)
4050 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 4100 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4051 4101
4052 ept_sync_global();
4053
4054 return 0; 4102 return 0;
4055 4103
4056out3: 4104out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index be451ee44249..6651dbf58675 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,13 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h>
40#include <trace/events/kvm.h> 41#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE 42#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS 43#define CREATE_TRACE_POINTS
43#include "trace.h" 44#include "trace.h"
44 45
46#include <asm/debugreg.h>
45#include <asm/uaccess.h> 47#include <asm/uaccess.h>
46#include <asm/msr.h> 48#include <asm/msr.h>
47#include <asm/desc.h> 49#include <asm/desc.h>
@@ -87,6 +89,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
87int ignore_msrs = 0; 89int ignore_msrs = 0;
88module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 90module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
89 91
92#define KVM_NR_SHARED_MSRS 16
93
94struct kvm_shared_msrs_global {
95 int nr;
96 struct kvm_shared_msr {
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100};
101
102struct kvm_shared_msrs {
103 struct user_return_notifier urn;
104 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS];
106};
107
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
109static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
110
90struct kvm_stats_debugfs_item debugfs_entries[] = { 111struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "pf_fixed", VCPU_STAT(pf_fixed) }, 112 { "pf_fixed", VCPU_STAT(pf_fixed) },
92 { "pf_guest", VCPU_STAT(pf_guest) }, 113 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +144,72 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
123 { NULL } 144 { NULL }
124}; 145};
125 146
147static void kvm_on_user_return(struct user_return_notifier *urn)
148{
149 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn);
153
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot];
156 if (global->value != locals->current_value[slot]) {
157 wrmsrl(global->msr, global->value);
158 locals->current_value[slot] = global->value;
159 }
160 }
161 locals->registered = false;
162 user_return_notifier_unregister(urn);
163}
164
165void kvm_define_shared_msr(unsigned slot, u32 msr)
166{
167 int cpu;
168 u64 value;
169
170 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr;
173 rdmsrl_safe(msr, &value);
174 shared_msrs_global.msrs[slot].value = value;
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179
180static void kvm_shared_msr_cpu_online(void)
181{
182 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184
185 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value;
187}
188
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192
193 if (((value ^ smsr->current_value[slot]) & mask) == 0)
194 return;
195 smsr->current_value[slot] = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value);
197 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn);
200 smsr->registered = true;
201 }
202}
203EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
204
205static void drop_user_return_notifiers(void *ignore)
206{
207 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
208
209 if (smsr->registered)
210 kvm_on_user_return(&smsr->urn);
211}
212
126unsigned long segment_base(u16 selector) 213unsigned long segment_base(u16 selector)
127{ 214{
128 struct descriptor_table gdt; 215 struct descriptor_table gdt;
@@ -484,16 +571,19 @@ static inline u32 bit(int bitno)
484 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 571 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
485 * 572 *
486 * This list is modified at module load time to reflect the 573 * This list is modified at module load time to reflect the
487 * capabilities of the host cpu. 574 * capabilities of the host cpu. This capabilities test skips MSRs that are
575 * kvm-specific. Those are put in the beginning of the list.
488 */ 576 */
577
578#define KVM_SAVE_MSRS_BEGIN 2
489static u32 msrs_to_save[] = { 579static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
490 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
491 MSR_K6_STAR, 582 MSR_K6_STAR,
492#ifdef CONFIG_X86_64 583#ifdef CONFIG_X86_64
493 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 584 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
494#endif 585#endif
495 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 586 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
496 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
497}; 587};
498 588
499static unsigned num_msrs_to_save; 589static unsigned num_msrs_to_save;
@@ -677,7 +767,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
677 /* With all the info we got, fill in the values */ 767 /* With all the info we got, fill in the values */
678 768
679 vcpu->hv_clock.system_time = ts.tv_nsec + 769 vcpu->hv_clock.system_time = ts.tv_nsec +
680 (NSEC_PER_SEC * (u64)ts.tv_sec); 770 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
771
681 /* 772 /*
682 * The interface expects us to write an even number signaling that the 773 * The interface expects us to write an even number signaling that the
683 * update is finished. Since the guest won't see the intermediate 774 * update is finished. Since the guest won't see the intermediate
@@ -835,6 +926,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
835 return 0; 926 return 0;
836} 927}
837 928
929static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
930{
931 struct kvm *kvm = vcpu->kvm;
932 int lm = is_long_mode(vcpu);
933 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
934 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
935 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
936 : kvm->arch.xen_hvm_config.blob_size_32;
937 u32 page_num = data & ~PAGE_MASK;
938 u64 page_addr = data & PAGE_MASK;
939 u8 *page;
940 int r;
941
942 r = -E2BIG;
943 if (page_num >= blob_size)
944 goto out;
945 r = -ENOMEM;
946 page = kzalloc(PAGE_SIZE, GFP_KERNEL);
947 if (!page)
948 goto out;
949 r = -EFAULT;
950 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
951 goto out_free;
952 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
953 goto out_free;
954 r = 0;
955out_free:
956 kfree(page);
957out:
958 return r;
959}
960
838int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 961int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
839{ 962{
840 switch (msr) { 963 switch (msr) {
@@ -950,6 +1073,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
950 "0x%x data 0x%llx\n", msr, data); 1073 "0x%x data 0x%llx\n", msr, data);
951 break; 1074 break;
952 default: 1075 default:
1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1077 return xen_hvm_config(vcpu, data);
953 if (!ignore_msrs) { 1078 if (!ignore_msrs) {
954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1079 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
955 msr, data); 1080 msr, data);
@@ -1224,6 +1349,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1224 case KVM_CAP_PIT2: 1349 case KVM_CAP_PIT2:
1225 case KVM_CAP_PIT_STATE2: 1350 case KVM_CAP_PIT_STATE2:
1226 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1351 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1352 case KVM_CAP_XEN_HVM:
1353 case KVM_CAP_ADJUST_CLOCK:
1354 case KVM_CAP_VCPU_EVENTS:
1227 r = 1; 1355 r = 1;
1228 break; 1356 break;
1229 case KVM_CAP_COALESCED_MMIO: 1357 case KVM_CAP_COALESCED_MMIO:
@@ -1238,8 +1366,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1238 case KVM_CAP_NR_MEMSLOTS: 1366 case KVM_CAP_NR_MEMSLOTS:
1239 r = KVM_MEMORY_SLOTS; 1367 r = KVM_MEMORY_SLOTS;
1240 break; 1368 break;
1241 case KVM_CAP_PV_MMU: 1369 case KVM_CAP_PV_MMU: /* obsolete */
1242 r = !tdp_enabled; 1370 r = 0;
1243 break; 1371 break;
1244 case KVM_CAP_IOMMU: 1372 case KVM_CAP_IOMMU:
1245 r = iommu_found(); 1373 r = iommu_found();
@@ -1326,6 +1454,12 @@ out:
1326void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1454void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1327{ 1455{
1328 kvm_x86_ops->vcpu_load(vcpu, cpu); 1456 kvm_x86_ops->vcpu_load(vcpu, cpu);
1457 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1458 unsigned long khz = cpufreq_quick_get(cpu);
1459 if (!khz)
1460 khz = tsc_khz;
1461 per_cpu(cpu_tsc_khz, cpu) = khz;
1462 }
1329 kvm_request_guest_time_update(vcpu); 1463 kvm_request_guest_time_update(vcpu);
1330} 1464}
1331 1465
@@ -1591,6 +1725,8 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1591 1725
1592 if (cpuid->nent < 1) 1726 if (cpuid->nent < 1)
1593 goto out; 1727 goto out;
1728 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1729 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1594 r = -ENOMEM; 1730 r = -ENOMEM;
1595 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1731 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1596 if (!cpuid_entries) 1732 if (!cpuid_entries)
@@ -1690,7 +1826,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1690 unsigned bank_num = mcg_cap & 0xff, bank; 1826 unsigned bank_num = mcg_cap & 0xff, bank;
1691 1827
1692 r = -EINVAL; 1828 r = -EINVAL;
1693 if (!bank_num) 1829 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
1694 goto out; 1830 goto out;
1695 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 1831 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1696 goto out; 1832 goto out;
@@ -1757,6 +1893,65 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1757 return 0; 1893 return 0;
1758} 1894}
1759 1895
1896static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1897 struct kvm_vcpu_events *events)
1898{
1899 vcpu_load(vcpu);
1900
1901 events->exception.injected = vcpu->arch.exception.pending;
1902 events->exception.nr = vcpu->arch.exception.nr;
1903 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
1904 events->exception.error_code = vcpu->arch.exception.error_code;
1905
1906 events->interrupt.injected = vcpu->arch.interrupt.pending;
1907 events->interrupt.nr = vcpu->arch.interrupt.nr;
1908 events->interrupt.soft = vcpu->arch.interrupt.soft;
1909
1910 events->nmi.injected = vcpu->arch.nmi_injected;
1911 events->nmi.pending = vcpu->arch.nmi_pending;
1912 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
1913
1914 events->sipi_vector = vcpu->arch.sipi_vector;
1915
1916 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
1917 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
1918
1919 vcpu_put(vcpu);
1920}
1921
1922static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1923 struct kvm_vcpu_events *events)
1924{
1925 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
1926 | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
1927 return -EINVAL;
1928
1929 vcpu_load(vcpu);
1930
1931 vcpu->arch.exception.pending = events->exception.injected;
1932 vcpu->arch.exception.nr = events->exception.nr;
1933 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
1934 vcpu->arch.exception.error_code = events->exception.error_code;
1935
1936 vcpu->arch.interrupt.pending = events->interrupt.injected;
1937 vcpu->arch.interrupt.nr = events->interrupt.nr;
1938 vcpu->arch.interrupt.soft = events->interrupt.soft;
1939 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
1940 kvm_pic_clear_isr_ack(vcpu->kvm);
1941
1942 vcpu->arch.nmi_injected = events->nmi.injected;
1943 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
1944 vcpu->arch.nmi_pending = events->nmi.pending;
1945 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
1946
1947 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
1948 vcpu->arch.sipi_vector = events->sipi_vector;
1949
1950 vcpu_put(vcpu);
1951
1952 return 0;
1953}
1954
1760long kvm_arch_vcpu_ioctl(struct file *filp, 1955long kvm_arch_vcpu_ioctl(struct file *filp,
1761 unsigned int ioctl, unsigned long arg) 1956 unsigned int ioctl, unsigned long arg)
1762{ 1957{
@@ -1767,6 +1962,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1767 1962
1768 switch (ioctl) { 1963 switch (ioctl) {
1769 case KVM_GET_LAPIC: { 1964 case KVM_GET_LAPIC: {
1965 r = -EINVAL;
1966 if (!vcpu->arch.apic)
1967 goto out;
1770 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1968 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1771 1969
1772 r = -ENOMEM; 1970 r = -ENOMEM;
@@ -1782,6 +1980,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1782 break; 1980 break;
1783 } 1981 }
1784 case KVM_SET_LAPIC: { 1982 case KVM_SET_LAPIC: {
1983 r = -EINVAL;
1984 if (!vcpu->arch.apic)
1985 goto out;
1785 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1986 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1786 r = -ENOMEM; 1987 r = -ENOMEM;
1787 if (!lapic) 1988 if (!lapic)
@@ -1908,6 +2109,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1908 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2109 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1909 break; 2110 break;
1910 } 2111 }
2112 case KVM_GET_VCPU_EVENTS: {
2113 struct kvm_vcpu_events events;
2114
2115 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2116
2117 r = -EFAULT;
2118 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2119 break;
2120 r = 0;
2121 break;
2122 }
2123 case KVM_SET_VCPU_EVENTS: {
2124 struct kvm_vcpu_events events;
2125
2126 r = -EFAULT;
2127 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2128 break;
2129
2130 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2131 break;
2132 }
1911 default: 2133 default:
1912 r = -EINVAL; 2134 r = -EINVAL;
1913 } 2135 }
@@ -2036,9 +2258,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2036 sizeof(struct kvm_pic_state)); 2258 sizeof(struct kvm_pic_state));
2037 break; 2259 break;
2038 case KVM_IRQCHIP_IOAPIC: 2260 case KVM_IRQCHIP_IOAPIC:
2039 memcpy(&chip->chip.ioapic, 2261 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2040 ioapic_irqchip(kvm),
2041 sizeof(struct kvm_ioapic_state));
2042 break; 2262 break;
2043 default: 2263 default:
2044 r = -EINVAL; 2264 r = -EINVAL;
@@ -2068,11 +2288,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2068 spin_unlock(&pic_irqchip(kvm)->lock); 2288 spin_unlock(&pic_irqchip(kvm)->lock);
2069 break; 2289 break;
2070 case KVM_IRQCHIP_IOAPIC: 2290 case KVM_IRQCHIP_IOAPIC:
2071 mutex_lock(&kvm->irq_lock); 2291 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2072 memcpy(ioapic_irqchip(kvm),
2073 &chip->chip.ioapic,
2074 sizeof(struct kvm_ioapic_state));
2075 mutex_unlock(&kvm->irq_lock);
2076 break; 2292 break;
2077 default: 2293 default:
2078 r = -EINVAL; 2294 r = -EINVAL;
@@ -2180,7 +2396,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2180{ 2396{
2181 struct kvm *kvm = filp->private_data; 2397 struct kvm *kvm = filp->private_data;
2182 void __user *argp = (void __user *)arg; 2398 void __user *argp = (void __user *)arg;
2183 int r = -EINVAL; 2399 int r = -ENOTTY;
2184 /* 2400 /*
2185 * This union makes it completely explicit to gcc-3.x 2401 * This union makes it completely explicit to gcc-3.x
2186 * that these two variables' stack usage should be 2402 * that these two variables' stack usage should be
@@ -2242,25 +2458,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
2242 if (r) 2458 if (r)
2243 goto out; 2459 goto out;
2244 break; 2460 break;
2245 case KVM_CREATE_IRQCHIP: 2461 case KVM_CREATE_IRQCHIP: {
2462 struct kvm_pic *vpic;
2463
2464 mutex_lock(&kvm->lock);
2465 r = -EEXIST;
2466 if (kvm->arch.vpic)
2467 goto create_irqchip_unlock;
2246 r = -ENOMEM; 2468 r = -ENOMEM;
2247 kvm->arch.vpic = kvm_create_pic(kvm); 2469 vpic = kvm_create_pic(kvm);
2248 if (kvm->arch.vpic) { 2470 if (vpic) {
2249 r = kvm_ioapic_init(kvm); 2471 r = kvm_ioapic_init(kvm);
2250 if (r) { 2472 if (r) {
2251 kfree(kvm->arch.vpic); 2473 kfree(vpic);
2252 kvm->arch.vpic = NULL; 2474 goto create_irqchip_unlock;
2253 goto out;
2254 } 2475 }
2255 } else 2476 } else
2256 goto out; 2477 goto create_irqchip_unlock;
2478 smp_wmb();
2479 kvm->arch.vpic = vpic;
2480 smp_wmb();
2257 r = kvm_setup_default_irq_routing(kvm); 2481 r = kvm_setup_default_irq_routing(kvm);
2258 if (r) { 2482 if (r) {
2483 mutex_lock(&kvm->irq_lock);
2259 kfree(kvm->arch.vpic); 2484 kfree(kvm->arch.vpic);
2260 kfree(kvm->arch.vioapic); 2485 kfree(kvm->arch.vioapic);
2261 goto out; 2486 kvm->arch.vpic = NULL;
2487 kvm->arch.vioapic = NULL;
2488 mutex_unlock(&kvm->irq_lock);
2262 } 2489 }
2490 create_irqchip_unlock:
2491 mutex_unlock(&kvm->lock);
2263 break; 2492 break;
2493 }
2264 case KVM_CREATE_PIT: 2494 case KVM_CREATE_PIT:
2265 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2495 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2266 goto create_pit; 2496 goto create_pit;
@@ -2290,10 +2520,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2290 goto out; 2520 goto out;
2291 if (irqchip_in_kernel(kvm)) { 2521 if (irqchip_in_kernel(kvm)) {
2292 __s32 status; 2522 __s32 status;
2293 mutex_lock(&kvm->irq_lock);
2294 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2523 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2295 irq_event.irq, irq_event.level); 2524 irq_event.irq, irq_event.level);
2296 mutex_unlock(&kvm->irq_lock);
2297 if (ioctl == KVM_IRQ_LINE_STATUS) { 2525 if (ioctl == KVM_IRQ_LINE_STATUS) {
2298 irq_event.status = status; 2526 irq_event.status = status;
2299 if (copy_to_user(argp, &irq_event, 2527 if (copy_to_user(argp, &irq_event,
@@ -2419,6 +2647,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
2419 r = 0; 2647 r = 0;
2420 break; 2648 break;
2421 } 2649 }
2650 case KVM_XEN_HVM_CONFIG: {
2651 r = -EFAULT;
2652 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2653 sizeof(struct kvm_xen_hvm_config)))
2654 goto out;
2655 r = -EINVAL;
2656 if (kvm->arch.xen_hvm_config.flags)
2657 goto out;
2658 r = 0;
2659 break;
2660 }
2661 case KVM_SET_CLOCK: {
2662 struct timespec now;
2663 struct kvm_clock_data user_ns;
2664 u64 now_ns;
2665 s64 delta;
2666
2667 r = -EFAULT;
2668 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2669 goto out;
2670
2671 r = -EINVAL;
2672 if (user_ns.flags)
2673 goto out;
2674
2675 r = 0;
2676 ktime_get_ts(&now);
2677 now_ns = timespec_to_ns(&now);
2678 delta = user_ns.clock - now_ns;
2679 kvm->arch.kvmclock_offset = delta;
2680 break;
2681 }
2682 case KVM_GET_CLOCK: {
2683 struct timespec now;
2684 struct kvm_clock_data user_ns;
2685 u64 now_ns;
2686
2687 ktime_get_ts(&now);
2688 now_ns = timespec_to_ns(&now);
2689 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2690 user_ns.flags = 0;
2691
2692 r = -EFAULT;
2693 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2694 goto out;
2695 r = 0;
2696 break;
2697 }
2698
2422 default: 2699 default:
2423 ; 2700 ;
2424 } 2701 }
@@ -2431,7 +2708,8 @@ static void kvm_init_msr_list(void)
2431 u32 dummy[2]; 2708 u32 dummy[2];
2432 unsigned i, j; 2709 unsigned i, j;
2433 2710
2434 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2711 /* skip the first msrs in the list. KVM-specific */
2712 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2435 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2713 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2436 continue; 2714 continue;
2437 if (j < i) 2715 if (j < i)
@@ -2755,13 +3033,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
2755} 3033}
2756 3034
2757int emulate_instruction(struct kvm_vcpu *vcpu, 3035int emulate_instruction(struct kvm_vcpu *vcpu,
2758 struct kvm_run *run,
2759 unsigned long cr2, 3036 unsigned long cr2,
2760 u16 error_code, 3037 u16 error_code,
2761 int emulation_type) 3038 int emulation_type)
2762{ 3039{
2763 int r, shadow_mask; 3040 int r, shadow_mask;
2764 struct decode_cache *c; 3041 struct decode_cache *c;
3042 struct kvm_run *run = vcpu->run;
2765 3043
2766 kvm_clear_exception_queue(vcpu); 3044 kvm_clear_exception_queue(vcpu);
2767 vcpu->arch.mmio_fault_cr2 = cr2; 3045 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2781,7 +3059,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2781 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3059 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2782 3060
2783 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3061 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2784 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3062 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2785 vcpu->arch.emulate_ctxt.mode = 3063 vcpu->arch.emulate_ctxt.mode =
2786 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3064 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2787 ? X86EMUL_MODE_REAL : cs_l 3065 ? X86EMUL_MODE_REAL : cs_l
@@ -2859,7 +3137,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2859 return EMULATE_DO_MMIO; 3137 return EMULATE_DO_MMIO;
2860 } 3138 }
2861 3139
2862 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3140 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2863 3141
2864 if (vcpu->mmio_is_write) { 3142 if (vcpu->mmio_is_write) {
2865 vcpu->mmio_needed = 0; 3143 vcpu->mmio_needed = 0;
@@ -2967,8 +3245,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2967 return r; 3245 return r;
2968} 3246}
2969 3247
2970int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3248int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2971 int size, unsigned port)
2972{ 3249{
2973 unsigned long val; 3250 unsigned long val;
2974 3251
@@ -2997,7 +3274,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2997} 3274}
2998EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3275EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2999 3276
3000int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3277int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3001 int size, unsigned long count, int down, 3278 int size, unsigned long count, int down,
3002 gva_t address, int rep, unsigned port) 3279 gva_t address, int rep, unsigned port)
3003{ 3280{
@@ -3070,9 +3347,6 @@ static void bounce_off(void *info)
3070 /* nothing */ 3347 /* nothing */
3071} 3348}
3072 3349
3073static unsigned int ref_freq;
3074static unsigned long tsc_khz_ref;
3075
3076static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3350static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3077 void *data) 3351 void *data)
3078{ 3352{
@@ -3081,14 +3355,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3081 struct kvm_vcpu *vcpu; 3355 struct kvm_vcpu *vcpu;
3082 int i, send_ipi = 0; 3356 int i, send_ipi = 0;
3083 3357
3084 if (!ref_freq)
3085 ref_freq = freq->old;
3086
3087 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3358 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3088 return 0; 3359 return 0;
3089 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3360 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3090 return 0; 3361 return 0;
3091 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3362 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3092 3363
3093 spin_lock(&kvm_lock); 3364 spin_lock(&kvm_lock);
3094 list_for_each_entry(kvm, &vm_list, vm_list) { 3365 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3125,9 +3396,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3125 .notifier_call = kvmclock_cpufreq_notifier 3396 .notifier_call = kvmclock_cpufreq_notifier
3126}; 3397};
3127 3398
3399static void kvm_timer_init(void)
3400{
3401 int cpu;
3402
3403 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3404 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3405 CPUFREQ_TRANSITION_NOTIFIER);
3406 for_each_online_cpu(cpu) {
3407 unsigned long khz = cpufreq_get(cpu);
3408 if (!khz)
3409 khz = tsc_khz;
3410 per_cpu(cpu_tsc_khz, cpu) = khz;
3411 }
3412 } else {
3413 for_each_possible_cpu(cpu)
3414 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3415 }
3416}
3417
3128int kvm_arch_init(void *opaque) 3418int kvm_arch_init(void *opaque)
3129{ 3419{
3130 int r, cpu; 3420 int r;
3131 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3421 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3132 3422
3133 if (kvm_x86_ops) { 3423 if (kvm_x86_ops) {
@@ -3159,13 +3449,7 @@ int kvm_arch_init(void *opaque)
3159 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3449 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3160 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3450 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3161 3451
3162 for_each_possible_cpu(cpu) 3452 kvm_timer_init();
3163 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3164 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3165 tsc_khz_ref = tsc_khz;
3166 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3167 CPUFREQ_TRANSITION_NOTIFIER);
3168 }
3169 3453
3170 return 0; 3454 return 0;
3171 3455
@@ -3293,7 +3577,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3293 unsigned long *rflags) 3577 unsigned long *rflags)
3294{ 3578{
3295 kvm_lmsw(vcpu, msw); 3579 kvm_lmsw(vcpu, msw);
3296 *rflags = kvm_x86_ops->get_rflags(vcpu); 3580 *rflags = kvm_get_rflags(vcpu);
3297} 3581}
3298 3582
3299unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3583unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
@@ -3331,7 +3615,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3331 switch (cr) { 3615 switch (cr) {
3332 case 0: 3616 case 0:
3333 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3617 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3334 *rflags = kvm_x86_ops->get_rflags(vcpu); 3618 *rflags = kvm_get_rflags(vcpu);
3335 break; 3619 break;
3336 case 2: 3620 case 2:
3337 vcpu->arch.cr2 = val; 3621 vcpu->arch.cr2 = val;
@@ -3451,18 +3735,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3451 * 3735 *
3452 * No need to exit to userspace if we already have an interrupt queued. 3736 * No need to exit to userspace if we already have an interrupt queued.
3453 */ 3737 */
3454static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3738static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3455 struct kvm_run *kvm_run)
3456{ 3739{
3457 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3740 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3458 kvm_run->request_interrupt_window && 3741 vcpu->run->request_interrupt_window &&
3459 kvm_arch_interrupt_allowed(vcpu)); 3742 kvm_arch_interrupt_allowed(vcpu));
3460} 3743}
3461 3744
3462static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3745static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3463 struct kvm_run *kvm_run)
3464{ 3746{
3465 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3747 struct kvm_run *kvm_run = vcpu->run;
3748
3749 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3466 kvm_run->cr8 = kvm_get_cr8(vcpu); 3750 kvm_run->cr8 = kvm_get_cr8(vcpu);
3467 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3751 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3468 if (irqchip_in_kernel(vcpu->kvm)) 3752 if (irqchip_in_kernel(vcpu->kvm))
@@ -3523,7 +3807,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3523 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3807 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3524} 3808}
3525 3809
3526static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3810static void inject_pending_event(struct kvm_vcpu *vcpu)
3527{ 3811{
3528 /* try to reinject previous events if any */ 3812 /* try to reinject previous events if any */
3529 if (vcpu->arch.exception.pending) { 3813 if (vcpu->arch.exception.pending) {
@@ -3559,11 +3843,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3559 } 3843 }
3560} 3844}
3561 3845
3562static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3846static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3563{ 3847{
3564 int r; 3848 int r;
3565 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3849 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3566 kvm_run->request_interrupt_window; 3850 vcpu->run->request_interrupt_window;
3567 3851
3568 if (vcpu->requests) 3852 if (vcpu->requests)
3569 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3853 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3584,12 +3868,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3584 kvm_x86_ops->tlb_flush(vcpu); 3868 kvm_x86_ops->tlb_flush(vcpu);
3585 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3869 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3586 &vcpu->requests)) { 3870 &vcpu->requests)) {
3587 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3871 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3588 r = 0; 3872 r = 0;
3589 goto out; 3873 goto out;
3590 } 3874 }
3591 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3875 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3592 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3876 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3593 r = 0; 3877 r = 0;
3594 goto out; 3878 goto out;
3595 } 3879 }
@@ -3613,7 +3897,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3613 goto out; 3897 goto out;
3614 } 3898 }
3615 3899
3616 inject_pending_event(vcpu, kvm_run); 3900 inject_pending_event(vcpu);
3617 3901
3618 /* enable NMI/IRQ window open exits if needed */ 3902 /* enable NMI/IRQ window open exits if needed */
3619 if (vcpu->arch.nmi_pending) 3903 if (vcpu->arch.nmi_pending)
@@ -3639,16 +3923,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3639 } 3923 }
3640 3924
3641 trace_kvm_entry(vcpu->vcpu_id); 3925 trace_kvm_entry(vcpu->vcpu_id);
3642 kvm_x86_ops->run(vcpu, kvm_run); 3926 kvm_x86_ops->run(vcpu);
3643 3927
3644 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 3928 /*
3645 set_debugreg(current->thread.debugreg0, 0); 3929 * If the guest has used debug registers, at least dr7
3646 set_debugreg(current->thread.debugreg1, 1); 3930 * will be disabled while returning to the host.
3647 set_debugreg(current->thread.debugreg2, 2); 3931 * If we don't have active breakpoints in the host, we don't
3648 set_debugreg(current->thread.debugreg3, 3); 3932 * care about the messed up debug address registers. But if
3649 set_debugreg(current->thread.debugreg6, 6); 3933 * we have some of them active, restore the old state.
3650 set_debugreg(current->thread.debugreg7, 7); 3934 */
3651 } 3935 if (hw_breakpoint_active())
3936 hw_breakpoint_restore();
3652 3937
3653 set_bit(KVM_REQ_KICK, &vcpu->requests); 3938 set_bit(KVM_REQ_KICK, &vcpu->requests);
3654 local_irq_enable(); 3939 local_irq_enable();
@@ -3680,13 +3965,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3680 3965
3681 kvm_lapic_sync_from_vapic(vcpu); 3966 kvm_lapic_sync_from_vapic(vcpu);
3682 3967
3683 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3968 r = kvm_x86_ops->handle_exit(vcpu);
3684out: 3969out:
3685 return r; 3970 return r;
3686} 3971}
3687 3972
3688 3973
3689static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3974static int __vcpu_run(struct kvm_vcpu *vcpu)
3690{ 3975{
3691 int r; 3976 int r;
3692 3977
@@ -3706,7 +3991,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3706 r = 1; 3991 r = 1;
3707 while (r > 0) { 3992 while (r > 0) {
3708 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3993 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3709 r = vcpu_enter_guest(vcpu, kvm_run); 3994 r = vcpu_enter_guest(vcpu);
3710 else { 3995 else {
3711 up_read(&vcpu->kvm->slots_lock); 3996 up_read(&vcpu->kvm->slots_lock);
3712 kvm_vcpu_block(vcpu); 3997 kvm_vcpu_block(vcpu);
@@ -3734,14 +4019,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3734 if (kvm_cpu_has_pending_timer(vcpu)) 4019 if (kvm_cpu_has_pending_timer(vcpu))
3735 kvm_inject_pending_timer_irqs(vcpu); 4020 kvm_inject_pending_timer_irqs(vcpu);
3736 4021
3737 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4022 if (dm_request_for_irq_injection(vcpu)) {
3738 r = -EINTR; 4023 r = -EINTR;
3739 kvm_run->exit_reason = KVM_EXIT_INTR; 4024 vcpu->run->exit_reason = KVM_EXIT_INTR;
3740 ++vcpu->stat.request_irq_exits; 4025 ++vcpu->stat.request_irq_exits;
3741 } 4026 }
3742 if (signal_pending(current)) { 4027 if (signal_pending(current)) {
3743 r = -EINTR; 4028 r = -EINTR;
3744 kvm_run->exit_reason = KVM_EXIT_INTR; 4029 vcpu->run->exit_reason = KVM_EXIT_INTR;
3745 ++vcpu->stat.signal_exits; 4030 ++vcpu->stat.signal_exits;
3746 } 4031 }
3747 if (need_resched()) { 4032 if (need_resched()) {
@@ -3752,7 +4037,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3752 } 4037 }
3753 4038
3754 up_read(&vcpu->kvm->slots_lock); 4039 up_read(&vcpu->kvm->slots_lock);
3755 post_kvm_run_save(vcpu, kvm_run); 4040 post_kvm_run_save(vcpu);
3756 4041
3757 vapic_exit(vcpu); 4042 vapic_exit(vcpu);
3758 4043
@@ -3785,15 +4070,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3785 if (r) 4070 if (r)
3786 goto out; 4071 goto out;
3787 } 4072 }
3788#if CONFIG_HAS_IOMEM
3789 if (vcpu->mmio_needed) { 4073 if (vcpu->mmio_needed) {
3790 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4074 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3791 vcpu->mmio_read_completed = 1; 4075 vcpu->mmio_read_completed = 1;
3792 vcpu->mmio_needed = 0; 4076 vcpu->mmio_needed = 0;
3793 4077
3794 down_read(&vcpu->kvm->slots_lock); 4078 down_read(&vcpu->kvm->slots_lock);
3795 r = emulate_instruction(vcpu, kvm_run, 4079 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3796 vcpu->arch.mmio_fault_cr2, 0,
3797 EMULTYPE_NO_DECODE); 4080 EMULTYPE_NO_DECODE);
3798 up_read(&vcpu->kvm->slots_lock); 4081 up_read(&vcpu->kvm->slots_lock);
3799 if (r == EMULATE_DO_MMIO) { 4082 if (r == EMULATE_DO_MMIO) {
@@ -3804,12 +4087,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3804 goto out; 4087 goto out;
3805 } 4088 }
3806 } 4089 }
3807#endif
3808 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4090 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3809 kvm_register_write(vcpu, VCPU_REGS_RAX, 4091 kvm_register_write(vcpu, VCPU_REGS_RAX,
3810 kvm_run->hypercall.ret); 4092 kvm_run->hypercall.ret);
3811 4093
3812 r = __vcpu_run(vcpu, kvm_run); 4094 r = __vcpu_run(vcpu);
3813 4095
3814out: 4096out:
3815 if (vcpu->sigset_active) 4097 if (vcpu->sigset_active)
@@ -3843,13 +4125,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3843#endif 4125#endif
3844 4126
3845 regs->rip = kvm_rip_read(vcpu); 4127 regs->rip = kvm_rip_read(vcpu);
3846 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4128 regs->rflags = kvm_get_rflags(vcpu);
3847
3848 /*
3849 * Don't leak debug flags in case they were set for guest debugging
3850 */
3851 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3852 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3853 4129
3854 vcpu_put(vcpu); 4130 vcpu_put(vcpu);
3855 4131
@@ -3877,12 +4153,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3877 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4153 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3878 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4154 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3879 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4155 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3880
3881#endif 4156#endif
3882 4157
3883 kvm_rip_write(vcpu, regs->rip); 4158 kvm_rip_write(vcpu, regs->rip);
3884 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4159 kvm_set_rflags(vcpu, regs->rflags);
3885
3886 4160
3887 vcpu->arch.exception.pending = false; 4161 vcpu->arch.exception.pending = false;
3888 4162
@@ -4049,7 +4323,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4049 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4323 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4050} 4324}
4051 4325
4052static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 4326static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
4053 struct desc_struct *seg_desc) 4327 struct desc_struct *seg_desc)
4054{ 4328{
4055 u32 base_addr = get_desc_base(seg_desc); 4329 u32 base_addr = get_desc_base(seg_desc);
@@ -4101,7 +4375,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4101{ 4375{
4102 return (seg != VCPU_SREG_LDTR) && 4376 return (seg != VCPU_SREG_LDTR) &&
4103 (seg != VCPU_SREG_TR) && 4377 (seg != VCPU_SREG_TR) &&
4104 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4378 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4105} 4379}
4106 4380
4107int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4381int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
@@ -4129,7 +4403,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4129{ 4403{
4130 tss->cr3 = vcpu->arch.cr3; 4404 tss->cr3 = vcpu->arch.cr3;
4131 tss->eip = kvm_rip_read(vcpu); 4405 tss->eip = kvm_rip_read(vcpu);
4132 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4406 tss->eflags = kvm_get_rflags(vcpu);
4133 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4407 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4134 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4408 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4135 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4409 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4153,7 +4427,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4153 kvm_set_cr3(vcpu, tss->cr3); 4427 kvm_set_cr3(vcpu, tss->cr3);
4154 4428
4155 kvm_rip_write(vcpu, tss->eip); 4429 kvm_rip_write(vcpu, tss->eip);
4156 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4430 kvm_set_rflags(vcpu, tss->eflags | 2);
4157 4431
4158 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4432 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4159 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4433 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4191,7 +4465,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4191 struct tss_segment_16 *tss) 4465 struct tss_segment_16 *tss)
4192{ 4466{
4193 tss->ip = kvm_rip_read(vcpu); 4467 tss->ip = kvm_rip_read(vcpu);
4194 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4468 tss->flag = kvm_get_rflags(vcpu);
4195 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4469 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4196 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4470 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4197 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4471 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4206,14 +4480,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4206 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4480 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4207 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4481 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4208 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4482 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4209 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4210} 4483}
4211 4484
4212static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4485static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4213 struct tss_segment_16 *tss) 4486 struct tss_segment_16 *tss)
4214{ 4487{
4215 kvm_rip_write(vcpu, tss->ip); 4488 kvm_rip_write(vcpu, tss->ip);
4216 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4489 kvm_set_rflags(vcpu, tss->flag | 2);
4217 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4490 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4218 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4491 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4219 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4492 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4359,8 +4632,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4359 } 4632 }
4360 4633
4361 if (reason == TASK_SWITCH_IRET) { 4634 if (reason == TASK_SWITCH_IRET) {
4362 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4635 u32 eflags = kvm_get_rflags(vcpu);
4363 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4636 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4364 } 4637 }
4365 4638
4366 /* set back link to prev task only if NT bit is set in eflags 4639 /* set back link to prev task only if NT bit is set in eflags
@@ -4368,11 +4641,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4368 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4641 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4369 old_tss_sel = 0xffff; 4642 old_tss_sel = 0xffff;
4370 4643
4371 /* set back link to prev task only if NT bit is set in eflags
4372 note that old_tss_sel is not used afetr this point */
4373 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4374 old_tss_sel = 0xffff;
4375
4376 if (nseg_desc.type & 8) 4644 if (nseg_desc.type & 8)
4377 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4645 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4378 old_tss_base, &nseg_desc); 4646 old_tss_base, &nseg_desc);
@@ -4381,8 +4649,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4381 old_tss_base, &nseg_desc); 4649 old_tss_base, &nseg_desc);
4382 4650
4383 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4651 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4384 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4652 u32 eflags = kvm_get_rflags(vcpu);
4385 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4653 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4386 } 4654 }
4387 4655
4388 if (reason != TASK_SWITCH_IRET) { 4656 if (reason != TASK_SWITCH_IRET) {
@@ -4434,8 +4702,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4434 4702
4435 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4703 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4436 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4704 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4437 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4705 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4438 load_pdptrs(vcpu, vcpu->arch.cr3); 4706 load_pdptrs(vcpu, vcpu->arch.cr3);
4707 mmu_reset_needed = 1;
4708 }
4439 4709
4440 if (mmu_reset_needed) 4710 if (mmu_reset_needed)
4441 kvm_mmu_reset_context(vcpu); 4711 kvm_mmu_reset_context(vcpu);
@@ -4476,12 +4746,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4476int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4746int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4477 struct kvm_guest_debug *dbg) 4747 struct kvm_guest_debug *dbg)
4478{ 4748{
4749 unsigned long rflags;
4479 int i, r; 4750 int i, r;
4480 4751
4481 vcpu_load(vcpu); 4752 vcpu_load(vcpu);
4482 4753
4483 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4754 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4484 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4755 r = -EBUSY;
4756 if (vcpu->arch.exception.pending)
4757 goto unlock_out;
4758 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4759 kvm_queue_exception(vcpu, DB_VECTOR);
4760 else
4761 kvm_queue_exception(vcpu, BP_VECTOR);
4762 }
4763
4764 /*
4765 * Read rflags as long as potentially injected trace flags are still
4766 * filtered out.
4767 */
4768 rflags = kvm_get_rflags(vcpu);
4769
4770 vcpu->guest_debug = dbg->control;
4771 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4772 vcpu->guest_debug = 0;
4773
4774 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4485 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4775 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4486 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4776 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4487 vcpu->arch.switch_db_regs = 4777 vcpu->arch.switch_db_regs =
@@ -4492,13 +4782,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4492 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4782 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4493 } 4783 }
4494 4784
4495 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4785 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4786 vcpu->arch.singlestep_cs =
4787 get_segment_selector(vcpu, VCPU_SREG_CS);
4788 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4789 }
4790
4791 /*
4792 * Trigger an rflags update that will inject or remove the trace
4793 * flags.
4794 */
4795 kvm_set_rflags(vcpu, rflags);
4796
4797 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4496 4798
4497 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4799 r = 0;
4498 kvm_queue_exception(vcpu, DB_VECTOR);
4499 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4500 kvm_queue_exception(vcpu, BP_VECTOR);
4501 4800
4801unlock_out:
4502 vcpu_put(vcpu); 4802 vcpu_put(vcpu);
4503 4803
4504 return r; 4804 return r;
@@ -4699,14 +4999,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4699 return kvm_x86_ops->vcpu_reset(vcpu); 4999 return kvm_x86_ops->vcpu_reset(vcpu);
4700} 5000}
4701 5001
4702void kvm_arch_hardware_enable(void *garbage) 5002int kvm_arch_hardware_enable(void *garbage)
4703{ 5003{
4704 kvm_x86_ops->hardware_enable(garbage); 5004 /*
5005 * Since this may be called from a hotplug notifcation,
5006 * we can't get the CPU frequency directly.
5007 */
5008 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5009 int cpu = raw_smp_processor_id();
5010 per_cpu(cpu_tsc_khz, cpu) = 0;
5011 }
5012
5013 kvm_shared_msr_cpu_online();
5014
5015 return kvm_x86_ops->hardware_enable(garbage);
4705} 5016}
4706 5017
4707void kvm_arch_hardware_disable(void *garbage) 5018void kvm_arch_hardware_disable(void *garbage)
4708{ 5019{
4709 kvm_x86_ops->hardware_disable(garbage); 5020 kvm_x86_ops->hardware_disable(garbage);
5021 drop_user_return_notifiers(garbage);
4710} 5022}
4711 5023
4712int kvm_arch_hardware_setup(void) 5024int kvm_arch_hardware_setup(void)
@@ -4944,8 +5256,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4944 return kvm_x86_ops->interrupt_allowed(vcpu); 5256 return kvm_x86_ops->interrupt_allowed(vcpu);
4945} 5257}
4946 5258
5259unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5260{
5261 unsigned long rflags;
5262
5263 rflags = kvm_x86_ops->get_rflags(vcpu);
5264 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5265 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5266 return rflags;
5267}
5268EXPORT_SYMBOL_GPL(kvm_get_rflags);
5269
5270void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5271{
5272 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5273 vcpu->arch.singlestep_cs ==
5274 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5275 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5276 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5277 kvm_x86_ops->set_rflags(vcpu, rflags);
5278}
5279EXPORT_SYMBOL_GPL(kvm_set_rflags);
5280
4947EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5281EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4948EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5282EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4949EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5283EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4950EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5284EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5285EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5288EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5289EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5290EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5291EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 000000000000..8df89f0a3fe6
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 3e549b8ec8c9..cffd754f3039 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,21 +2,36 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5obj-$(CONFIG_SMP) := msr.o 5inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
6inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
7quiet_cmd_inat_tables = GEN $@
8 cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
9
10$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
11 $(call cmd,inat_tables)
12
13$(obj)/inat.o: $(obj)/inat-tables.c
14
15clean-files := inat-tables.c
16
17obj-$(CONFIG_SMP) += msr-smp.o
6 18
7lib-y := delay.o 19lib-y := delay.o
8lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-$(CONFIG_KPROBES) += insn.o inat.o
11 24
12obj-y += msr-reg.o msr-reg-export.o 25obj-y += msr.o msr-reg.o msr-reg-export.o
13 26
14ifeq ($(CONFIG_X86_32),y) 27ifeq ($(CONFIG_X86_32),y)
15 obj-y += atomic64_32.o 28 obj-y += atomic64_32.o
16 lib-y += checksum_32.o 29 lib-y += checksum_32.o
17 lib-y += strstr_32.o 30 lib-y += strstr_32.o
18 lib-y += semaphore_32.o string_32.o cmpxchg8b_emu.o 31 lib-y += semaphore_32.o string_32.o
19 32ifneq ($(CONFIG_X86_CMPXCHG64),y)
33 lib-y += cmpxchg8b_emu.o
34endif
20 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
21else 36else
22 obj-y += io_64.o iomap_copy_64.o 37 obj-y += io_64.o iomap_copy_64.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85ea..cf889d4e076a 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -65,7 +65,7 @@
65 .endm 65 .endm
66 66
67/* Standard copy_to_user with segment limit checking */ 67/* Standard copy_to_user with segment limit checking */
68ENTRY(copy_to_user) 68ENTRY(_copy_to_user)
69 CFI_STARTPROC 69 CFI_STARTPROC
70 GET_THREAD_INFO(%rax) 70 GET_THREAD_INFO(%rax)
71 movq %rdi,%rcx 71 movq %rdi,%rcx
@@ -75,10 +75,10 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user) 78ENDPROC(_copy_to_user)
79 79
80/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
81ENTRY(copy_from_user) 81ENTRY(_copy_from_user)
82 CFI_STARTPROC 82 CFI_STARTPROC
83 GET_THREAD_INFO(%rax) 83 GET_THREAD_INFO(%rax)
84 movq %rsi,%rcx 84 movq %rsi,%rcx
@@ -88,7 +88,7 @@ ENTRY(copy_from_user)
88 jae bad_from_user 88 jae bad_from_user
89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
90 CFI_ENDPROC 90 CFI_ENDPROC
91ENDPROC(copy_from_user) 91ENDPROC(_copy_from_user)
92 92
93ENTRY(copy_user_generic) 93ENTRY(copy_user_generic)
94 CFI_STARTPROC 94 CFI_STARTPROC
@@ -96,12 +96,6 @@ ENTRY(copy_user_generic)
96 CFI_ENDPROC 96 CFI_ENDPROC
97ENDPROC(copy_user_generic) 97ENDPROC(copy_user_generic)
98 98
99ENTRY(__copy_from_user_inatomic)
100 CFI_STARTPROC
101 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
102 CFI_ENDPROC
103ENDPROC(__copy_from_user_inatomic)
104
105 .section .fixup,"ax" 99 .section .fixup,"ax"
106 /* must zero dest */ 100 /* must zero dest */
107ENTRY(bad_from_user) 101ENTRY(bad_from_user)
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 000000000000..46fc4ee09fc4
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,90 @@
1/*
2 * x86 instruction attribute tables
3 *
4 * Written by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 */
21#include <asm/insn.h>
22
23/* Attribute tables are generated from opcode map */
24#include "inat-tables.c"
25
26/* Attribute search APIs */
27insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
28{
29 return inat_primary_table[opcode];
30}
31
32insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
33 insn_attr_t esc_attr)
34{
35 const insn_attr_t *table;
36 insn_attr_t lpfx_attr;
37 int n, m = 0;
38
39 n = inat_escape_id(esc_attr);
40 if (last_pfx) {
41 lpfx_attr = inat_get_opcode_attribute(last_pfx);
42 m = inat_last_prefix_id(lpfx_attr);
43 }
44 table = inat_escape_tables[n][0];
45 if (!table)
46 return 0;
47 if (inat_has_variant(table[opcode]) && m) {
48 table = inat_escape_tables[n][m];
49 if (!table)
50 return 0;
51 }
52 return table[opcode];
53}
54
55insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
56 insn_attr_t grp_attr)
57{
58 const insn_attr_t *table;
59 insn_attr_t lpfx_attr;
60 int n, m = 0;
61
62 n = inat_group_id(grp_attr);
63 if (last_pfx) {
64 lpfx_attr = inat_get_opcode_attribute(last_pfx);
65 m = inat_last_prefix_id(lpfx_attr);
66 }
67 table = inat_group_tables[n][0];
68 if (!table)
69 return inat_group_common_attribute(grp_attr);
70 if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
71 table = inat_group_tables[n][m];
72 if (!table)
73 return inat_group_common_attribute(grp_attr);
74 }
75 return table[X86_MODRM_REG(modrm)] |
76 inat_group_common_attribute(grp_attr);
77}
78
79insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
80 insn_byte_t vex_p)
81{
82 const insn_attr_t *table;
83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
84 return 0;
85 table = inat_avx_tables[vex_m][vex_p];
86 if (!table)
87 return 0;
88 return table[opcode];
89}
90
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 000000000000..9f33b984d0ef
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,516 @@
1/*
2 * x86 instruction analysis
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004, 2009
19 */
20
21#include <linux/string.h>
22#include <asm/inat.h>
23#include <asm/insn.h>
24
25#define get_next(t, insn) \
26 ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
27
28#define peek_next(t, insn) \
29 ({t r; r = *(t*)insn->next_byte; r; })
30
31#define peek_nbyte_next(t, insn, n) \
32 ({t r; r = *(t*)((insn)->next_byte + n); r; })
33
34/**
35 * insn_init() - initialize struct insn
36 * @insn: &struct insn to be initialized
37 * @kaddr: address (in kernel memory) of instruction (or copy thereof)
38 * @x86_64: !0 for 64-bit kernel or 64-bit app
39 */
40void insn_init(struct insn *insn, const void *kaddr, int x86_64)
41{
42 memset(insn, 0, sizeof(*insn));
43 insn->kaddr = kaddr;
44 insn->next_byte = kaddr;
45 insn->x86_64 = x86_64 ? 1 : 0;
46 insn->opnd_bytes = 4;
47 if (x86_64)
48 insn->addr_bytes = 8;
49 else
50 insn->addr_bytes = 4;
51}
52
53/**
54 * insn_get_prefixes - scan x86 instruction prefix bytes
55 * @insn: &struct insn containing instruction
56 *
57 * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
58 * to point to the (first) opcode. No effect if @insn->prefixes.got
59 * is already set.
60 */
61void insn_get_prefixes(struct insn *insn)
62{
63 struct insn_field *prefixes = &insn->prefixes;
64 insn_attr_t attr;
65 insn_byte_t b, lb;
66 int i, nb;
67
68 if (prefixes->got)
69 return;
70
71 nb = 0;
72 lb = 0;
73 b = peek_next(insn_byte_t, insn);
74 attr = inat_get_opcode_attribute(b);
75 while (inat_is_legacy_prefix(attr)) {
76 /* Skip if same prefix */
77 for (i = 0; i < nb; i++)
78 if (prefixes->bytes[i] == b)
79 goto found;
80 if (nb == 4)
81 /* Invalid instruction */
82 break;
83 prefixes->bytes[nb++] = b;
84 if (inat_is_address_size_prefix(attr)) {
85 /* address size switches 2/4 or 4/8 */
86 if (insn->x86_64)
87 insn->addr_bytes ^= 12;
88 else
89 insn->addr_bytes ^= 6;
90 } else if (inat_is_operand_size_prefix(attr)) {
91 /* oprand size switches 2/4 */
92 insn->opnd_bytes ^= 6;
93 }
94found:
95 prefixes->nbytes++;
96 insn->next_byte++;
97 lb = b;
98 b = peek_next(insn_byte_t, insn);
99 attr = inat_get_opcode_attribute(b);
100 }
101 /* Set the last prefix */
102 if (lb && lb != insn->prefixes.bytes[3]) {
103 if (unlikely(insn->prefixes.bytes[3])) {
104 /* Swap the last prefix */
105 b = insn->prefixes.bytes[3];
106 for (i = 0; i < nb; i++)
107 if (prefixes->bytes[i] == lb)
108 prefixes->bytes[i] = b;
109 }
110 insn->prefixes.bytes[3] = lb;
111 }
112
113 /* Decode REX prefix */
114 if (insn->x86_64) {
115 b = peek_next(insn_byte_t, insn);
116 attr = inat_get_opcode_attribute(b);
117 if (inat_is_rex_prefix(attr)) {
118 insn->rex_prefix.value = b;
119 insn->rex_prefix.nbytes = 1;
120 insn->next_byte++;
121 if (X86_REX_W(b))
122 /* REX.W overrides opnd_size */
123 insn->opnd_bytes = 8;
124 }
125 }
126 insn->rex_prefix.got = 1;
127
128 /* Decode VEX prefix */
129 b = peek_next(insn_byte_t, insn);
130 attr = inat_get_opcode_attribute(b);
131 if (inat_is_vex_prefix(attr)) {
132 insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
133 if (!insn->x86_64) {
134 /*
135 * In 32-bits mode, if the [7:6] bits (mod bits of
136 * ModRM) on the second byte are not 11b, it is
137 * LDS or LES.
138 */
139 if (X86_MODRM_MOD(b2) != 3)
140 goto vex_end;
141 }
142 insn->vex_prefix.bytes[0] = b;
143 insn->vex_prefix.bytes[1] = b2;
144 if (inat_is_vex3_prefix(attr)) {
145 b2 = peek_nbyte_next(insn_byte_t, insn, 2);
146 insn->vex_prefix.bytes[2] = b2;
147 insn->vex_prefix.nbytes = 3;
148 insn->next_byte += 3;
149 if (insn->x86_64 && X86_VEX_W(b2))
150 /* VEX.W overrides opnd_size */
151 insn->opnd_bytes = 8;
152 } else {
153 insn->vex_prefix.nbytes = 2;
154 insn->next_byte += 2;
155 }
156 }
157vex_end:
158 insn->vex_prefix.got = 1;
159
160 prefixes->got = 1;
161 return;
162}
163
164/**
165 * insn_get_opcode - collect opcode(s)
166 * @insn: &struct insn containing instruction
167 *
168 * Populates @insn->opcode, updates @insn->next_byte to point past the
169 * opcode byte(s), and set @insn->attr (except for groups).
170 * If necessary, first collects any preceding (prefix) bytes.
171 * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
172 * is already 1.
173 */
174void insn_get_opcode(struct insn *insn)
175{
176 struct insn_field *opcode = &insn->opcode;
177 insn_byte_t op, pfx;
178 if (opcode->got)
179 return;
180 if (!insn->prefixes.got)
181 insn_get_prefixes(insn);
182
183 /* Get first opcode */
184 op = get_next(insn_byte_t, insn);
185 opcode->bytes[0] = op;
186 opcode->nbytes = 1;
187
188 /* Check if there is VEX prefix or not */
189 if (insn_is_avx(insn)) {
190 insn_byte_t m, p;
191 m = insn_vex_m_bits(insn);
192 p = insn_vex_p_bits(insn);
193 insn->attr = inat_get_avx_attribute(op, m, p);
194 if (!inat_accept_vex(insn->attr))
195 insn->attr = 0; /* This instruction is bad */
196 goto end; /* VEX has only 1 byte for opcode */
197 }
198
199 insn->attr = inat_get_opcode_attribute(op);
200 while (inat_is_escape(insn->attr)) {
201 /* Get escaped opcode */
202 op = get_next(insn_byte_t, insn);
203 opcode->bytes[opcode->nbytes++] = op;
204 pfx = insn_last_prefix(insn);
205 insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
206 }
207 if (inat_must_vex(insn->attr))
208 insn->attr = 0; /* This instruction is bad */
209end:
210 opcode->got = 1;
211}
212
213/**
214 * insn_get_modrm - collect ModRM byte, if any
215 * @insn: &struct insn containing instruction
216 *
217 * Populates @insn->modrm and updates @insn->next_byte to point past the
218 * ModRM byte, if any. If necessary, first collects the preceding bytes
219 * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
220 */
221void insn_get_modrm(struct insn *insn)
222{
223 struct insn_field *modrm = &insn->modrm;
224 insn_byte_t pfx, mod;
225 if (modrm->got)
226 return;
227 if (!insn->opcode.got)
228 insn_get_opcode(insn);
229
230 if (inat_has_modrm(insn->attr)) {
231 mod = get_next(insn_byte_t, insn);
232 modrm->value = mod;
233 modrm->nbytes = 1;
234 if (inat_is_group(insn->attr)) {
235 pfx = insn_last_prefix(insn);
236 insn->attr = inat_get_group_attribute(mod, pfx,
237 insn->attr);
238 }
239 }
240
241 if (insn->x86_64 && inat_is_force64(insn->attr))
242 insn->opnd_bytes = 8;
243 modrm->got = 1;
244}
245
246
247/**
248 * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
249 * @insn: &struct insn containing instruction
250 *
251 * If necessary, first collects the instruction up to and including the
252 * ModRM byte. No effect if @insn->x86_64 is 0.
253 */
254int insn_rip_relative(struct insn *insn)
255{
256 struct insn_field *modrm = &insn->modrm;
257
258 if (!insn->x86_64)
259 return 0;
260 if (!modrm->got)
261 insn_get_modrm(insn);
262 /*
263 * For rip-relative instructions, the mod field (top 2 bits)
264 * is zero and the r/m field (bottom 3 bits) is 0x5.
265 */
266 return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
267}
268
269/**
270 * insn_get_sib() - Get the SIB byte of instruction
271 * @insn: &struct insn containing instruction
272 *
273 * If necessary, first collects the instruction up to and including the
274 * ModRM byte.
275 */
276void insn_get_sib(struct insn *insn)
277{
278 insn_byte_t modrm;
279
280 if (insn->sib.got)
281 return;
282 if (!insn->modrm.got)
283 insn_get_modrm(insn);
284 if (insn->modrm.nbytes) {
285 modrm = (insn_byte_t)insn->modrm.value;
286 if (insn->addr_bytes != 2 &&
287 X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
288 insn->sib.value = get_next(insn_byte_t, insn);
289 insn->sib.nbytes = 1;
290 }
291 }
292 insn->sib.got = 1;
293}
294
295
296/**
297 * insn_get_displacement() - Get the displacement of instruction
298 * @insn: &struct insn containing instruction
299 *
300 * If necessary, first collects the instruction up to and including the
301 * SIB byte.
302 * Displacement value is sign-expanded.
303 */
304void insn_get_displacement(struct insn *insn)
305{
306 insn_byte_t mod, rm, base;
307
308 if (insn->displacement.got)
309 return;
310 if (!insn->sib.got)
311 insn_get_sib(insn);
312 if (insn->modrm.nbytes) {
313 /*
314 * Interpreting the modrm byte:
315 * mod = 00 - no displacement fields (exceptions below)
316 * mod = 01 - 1-byte displacement field
317 * mod = 10 - displacement field is 4 bytes, or 2 bytes if
318 * address size = 2 (0x67 prefix in 32-bit mode)
319 * mod = 11 - no memory operand
320 *
321 * If address size = 2...
322 * mod = 00, r/m = 110 - displacement field is 2 bytes
323 *
324 * If address size != 2...
325 * mod != 11, r/m = 100 - SIB byte exists
326 * mod = 00, SIB base = 101 - displacement field is 4 bytes
327 * mod = 00, r/m = 101 - rip-relative addressing, displacement
328 * field is 4 bytes
329 */
330 mod = X86_MODRM_MOD(insn->modrm.value);
331 rm = X86_MODRM_RM(insn->modrm.value);
332 base = X86_SIB_BASE(insn->sib.value);
333 if (mod == 3)
334 goto out;
335 if (mod == 1) {
336 insn->displacement.value = get_next(char, insn);
337 insn->displacement.nbytes = 1;
338 } else if (insn->addr_bytes == 2) {
339 if ((mod == 0 && rm == 6) || mod == 2) {
340 insn->displacement.value =
341 get_next(short, insn);
342 insn->displacement.nbytes = 2;
343 }
344 } else {
345 if ((mod == 0 && rm == 5) || mod == 2 ||
346 (mod == 0 && base == 5)) {
347 insn->displacement.value = get_next(int, insn);
348 insn->displacement.nbytes = 4;
349 }
350 }
351 }
352out:
353 insn->displacement.got = 1;
354}
355
356/* Decode moffset16/32/64 */
357static void __get_moffset(struct insn *insn)
358{
359 switch (insn->addr_bytes) {
360 case 2:
361 insn->moffset1.value = get_next(short, insn);
362 insn->moffset1.nbytes = 2;
363 break;
364 case 4:
365 insn->moffset1.value = get_next(int, insn);
366 insn->moffset1.nbytes = 4;
367 break;
368 case 8:
369 insn->moffset1.value = get_next(int, insn);
370 insn->moffset1.nbytes = 4;
371 insn->moffset2.value = get_next(int, insn);
372 insn->moffset2.nbytes = 4;
373 break;
374 }
375 insn->moffset1.got = insn->moffset2.got = 1;
376}
377
378/* Decode imm v32(Iz) */
379static void __get_immv32(struct insn *insn)
380{
381 switch (insn->opnd_bytes) {
382 case 2:
383 insn->immediate.value = get_next(short, insn);
384 insn->immediate.nbytes = 2;
385 break;
386 case 4:
387 case 8:
388 insn->immediate.value = get_next(int, insn);
389 insn->immediate.nbytes = 4;
390 break;
391 }
392}
393
394/* Decode imm v64(Iv/Ov) */
395static void __get_immv(struct insn *insn)
396{
397 switch (insn->opnd_bytes) {
398 case 2:
399 insn->immediate1.value = get_next(short, insn);
400 insn->immediate1.nbytes = 2;
401 break;
402 case 4:
403 insn->immediate1.value = get_next(int, insn);
404 insn->immediate1.nbytes = 4;
405 break;
406 case 8:
407 insn->immediate1.value = get_next(int, insn);
408 insn->immediate1.nbytes = 4;
409 insn->immediate2.value = get_next(int, insn);
410 insn->immediate2.nbytes = 4;
411 break;
412 }
413 insn->immediate1.got = insn->immediate2.got = 1;
414}
415
416/* Decode ptr16:16/32(Ap) */
417static void __get_immptr(struct insn *insn)
418{
419 switch (insn->opnd_bytes) {
420 case 2:
421 insn->immediate1.value = get_next(short, insn);
422 insn->immediate1.nbytes = 2;
423 break;
424 case 4:
425 insn->immediate1.value = get_next(int, insn);
426 insn->immediate1.nbytes = 4;
427 break;
428 case 8:
429 /* ptr16:64 is not exist (no segment) */
430 return;
431 }
432 insn->immediate2.value = get_next(unsigned short, insn);
433 insn->immediate2.nbytes = 2;
434 insn->immediate1.got = insn->immediate2.got = 1;
435}
436
437/**
438 * insn_get_immediate() - Get the immediates of instruction
439 * @insn: &struct insn containing instruction
440 *
441 * If necessary, first collects the instruction up to and including the
442 * displacement bytes.
443 * Basically, most of immediates are sign-expanded. Unsigned-value can be
444 * get by bit masking with ((1 << (nbytes * 8)) - 1)
445 */
446void insn_get_immediate(struct insn *insn)
447{
448 if (insn->immediate.got)
449 return;
450 if (!insn->displacement.got)
451 insn_get_displacement(insn);
452
453 if (inat_has_moffset(insn->attr)) {
454 __get_moffset(insn);
455 goto done;
456 }
457
458 if (!inat_has_immediate(insn->attr))
459 /* no immediates */
460 goto done;
461
462 switch (inat_immediate_size(insn->attr)) {
463 case INAT_IMM_BYTE:
464 insn->immediate.value = get_next(char, insn);
465 insn->immediate.nbytes = 1;
466 break;
467 case INAT_IMM_WORD:
468 insn->immediate.value = get_next(short, insn);
469 insn->immediate.nbytes = 2;
470 break;
471 case INAT_IMM_DWORD:
472 insn->immediate.value = get_next(int, insn);
473 insn->immediate.nbytes = 4;
474 break;
475 case INAT_IMM_QWORD:
476 insn->immediate1.value = get_next(int, insn);
477 insn->immediate1.nbytes = 4;
478 insn->immediate2.value = get_next(int, insn);
479 insn->immediate2.nbytes = 4;
480 break;
481 case INAT_IMM_PTR:
482 __get_immptr(insn);
483 break;
484 case INAT_IMM_VWORD32:
485 __get_immv32(insn);
486 break;
487 case INAT_IMM_VWORD:
488 __get_immv(insn);
489 break;
490 default:
491 break;
492 }
493 if (inat_has_second_immediate(insn->attr)) {
494 insn->immediate2.value = get_next(char, insn);
495 insn->immediate2.nbytes = 1;
496 }
497done:
498 insn->immediate.got = 1;
499}
500
501/**
502 * insn_get_length() - Get the length of instruction
503 * @insn: &struct insn containing instruction
504 *
505 * If necessary, first collects the instruction up to and including the
506 * immediates bytes.
507 */
508void insn_get_length(struct insn *insn)
509{
510 if (insn->length)
511 return;
512 if (!insn->immediate.got)
513 insn_get_immediate(insn);
514 insn->length = (unsigned char)((unsigned long)insn->next_byte
515 - (unsigned long)insn->kaddr);
516}
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
new file mode 100644
index 000000000000..a6b1b86d2253
--- /dev/null
+++ b/arch/x86/lib/msr-smp.c
@@ -0,0 +1,204 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6static void __rdmsr_on_cpu(void *info)
7{
8 struct msr_info *rv = info;
9 struct msr *reg;
10 int this_cpu = raw_smp_processor_id();
11
12 if (rv->msrs)
13 reg = per_cpu_ptr(rv->msrs, this_cpu);
14 else
15 reg = &rv->reg;
16
17 rdmsr(rv->msr_no, reg->l, reg->h);
18}
19
20static void __wrmsr_on_cpu(void *info)
21{
22 struct msr_info *rv = info;
23 struct msr *reg;
24 int this_cpu = raw_smp_processor_id();
25
26 if (rv->msrs)
27 reg = per_cpu_ptr(rv->msrs, this_cpu);
28 else
29 reg = &rv->reg;
30
31 wrmsr(rv->msr_no, reg->l, reg->h);
32}
33
34int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
35{
36 int err;
37 struct msr_info rv;
38
39 memset(&rv, 0, sizeof(rv));
40
41 rv.msr_no = msr_no;
42 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
43 *l = rv.reg.l;
44 *h = rv.reg.h;
45
46 return err;
47}
48EXPORT_SYMBOL(rdmsr_on_cpu);
49
50int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
51{
52 int err;
53 struct msr_info rv;
54
55 memset(&rv, 0, sizeof(rv));
56
57 rv.msr_no = msr_no;
58 rv.reg.l = l;
59 rv.reg.h = h;
60 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
61
62 return err;
63}
64EXPORT_SYMBOL(wrmsr_on_cpu);
65
66static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
67 struct msr *msrs,
68 void (*msr_func) (void *info))
69{
70 struct msr_info rv;
71 int this_cpu;
72
73 memset(&rv, 0, sizeof(rv));
74
75 rv.msrs = msrs;
76 rv.msr_no = msr_no;
77
78 this_cpu = get_cpu();
79
80 if (cpumask_test_cpu(this_cpu, mask))
81 msr_func(&rv);
82
83 smp_call_function_many(mask, msr_func, &rv, 1);
84 put_cpu();
85}
86
87/* rdmsr on a bunch of CPUs
88 *
89 * @mask: which CPUs
90 * @msr_no: which MSR
91 * @msrs: array of MSR values
92 *
93 */
94void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
95{
96 __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
97}
98EXPORT_SYMBOL(rdmsr_on_cpus);
99
100/*
101 * wrmsr on a bunch of CPUs
102 *
103 * @mask: which CPUs
104 * @msr_no: which MSR
105 * @msrs: array of MSR values
106 *
107 */
108void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
109{
110 __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
111}
112EXPORT_SYMBOL(wrmsr_on_cpus);
113
114/* These "safe" variants are slower and should be used when the target MSR
115 may not actually exist. */
116static void __rdmsr_safe_on_cpu(void *info)
117{
118 struct msr_info *rv = info;
119
120 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
121}
122
123static void __wrmsr_safe_on_cpu(void *info)
124{
125 struct msr_info *rv = info;
126
127 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
128}
129
130int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
131{
132 int err;
133 struct msr_info rv;
134
135 memset(&rv, 0, sizeof(rv));
136
137 rv.msr_no = msr_no;
138 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
139 *l = rv.reg.l;
140 *h = rv.reg.h;
141
142 return err ? err : rv.err;
143}
144EXPORT_SYMBOL(rdmsr_safe_on_cpu);
145
146int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
147{
148 int err;
149 struct msr_info rv;
150
151 memset(&rv, 0, sizeof(rv));
152
153 rv.msr_no = msr_no;
154 rv.reg.l = l;
155 rv.reg.h = h;
156 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
157
158 return err ? err : rv.err;
159}
160EXPORT_SYMBOL(wrmsr_safe_on_cpu);
161
162/*
163 * These variants are significantly slower, but allows control over
164 * the entire 32-bit GPR set.
165 */
166static void __rdmsr_safe_regs_on_cpu(void *info)
167{
168 struct msr_regs_info *rv = info;
169
170 rv->err = rdmsr_safe_regs(rv->regs);
171}
172
173static void __wrmsr_safe_regs_on_cpu(void *info)
174{
175 struct msr_regs_info *rv = info;
176
177 rv->err = wrmsr_safe_regs(rv->regs);
178}
179
180int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
181{
182 int err;
183 struct msr_regs_info rv;
184
185 rv.regs = regs;
186 rv.err = -EIO;
187 err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
188
189 return err ? err : rv.err;
190}
191EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
192
193int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
194{
195 int err;
196 struct msr_regs_info rv;
197
198 rv.regs = regs;
199 rv.err = -EIO;
200 err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
201
202 return err ? err : rv.err;
203}
204EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 33a1e3ca22d8..8f8eebdca7d4 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,226 +1,23 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/preempt.h> 2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h> 3#include <asm/msr.h>
5 4
6struct msr_info { 5struct msr *msrs_alloc(void)
7 u32 msr_no;
8 struct msr reg;
9 struct msr *msrs;
10 int off;
11 int err;
12};
13
14static void __rdmsr_on_cpu(void *info)
15{
16 struct msr_info *rv = info;
17 struct msr *reg;
18 int this_cpu = raw_smp_processor_id();
19
20 if (rv->msrs)
21 reg = &rv->msrs[this_cpu - rv->off];
22 else
23 reg = &rv->reg;
24
25 rdmsr(rv->msr_no, reg->l, reg->h);
26}
27
28static void __wrmsr_on_cpu(void *info)
29{
30 struct msr_info *rv = info;
31 struct msr *reg;
32 int this_cpu = raw_smp_processor_id();
33
34 if (rv->msrs)
35 reg = &rv->msrs[this_cpu - rv->off];
36 else
37 reg = &rv->reg;
38
39 wrmsr(rv->msr_no, reg->l, reg->h);
40}
41
42int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
43{
44 int err;
45 struct msr_info rv;
46
47 memset(&rv, 0, sizeof(rv));
48
49 rv.msr_no = msr_no;
50 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
51 *l = rv.reg.l;
52 *h = rv.reg.h;
53
54 return err;
55}
56EXPORT_SYMBOL(rdmsr_on_cpu);
57
58int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
59{
60 int err;
61 struct msr_info rv;
62
63 memset(&rv, 0, sizeof(rv));
64
65 rv.msr_no = msr_no;
66 rv.reg.l = l;
67 rv.reg.h = h;
68 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
69
70 return err;
71}
72EXPORT_SYMBOL(wrmsr_on_cpu);
73
74/* rdmsr on a bunch of CPUs
75 *
76 * @mask: which CPUs
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{
83 struct msr_info rv;
84 int this_cpu;
85
86 memset(&rv, 0, sizeof(rv));
87
88 rv.off = cpumask_first(mask);
89 rv.msrs = msrs;
90 rv.msr_no = msr_no;
91
92 this_cpu = get_cpu();
93
94 if (cpumask_test_cpu(this_cpu, mask))
95 __rdmsr_on_cpu(&rv);
96
97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
98 put_cpu();
99}
100EXPORT_SYMBOL(rdmsr_on_cpus);
101
102/*
103 * wrmsr on a bunch of CPUs
104 *
105 * @mask: which CPUs
106 * @msr_no: which MSR
107 * @msrs: array of MSR values
108 *
109 */
110void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
111{
112 struct msr_info rv;
113 int this_cpu;
114
115 memset(&rv, 0, sizeof(rv));
116
117 rv.off = cpumask_first(mask);
118 rv.msrs = msrs;
119 rv.msr_no = msr_no;
120
121 this_cpu = get_cpu();
122
123 if (cpumask_test_cpu(this_cpu, mask))
124 __wrmsr_on_cpu(&rv);
125
126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
127 put_cpu();
128}
129EXPORT_SYMBOL(wrmsr_on_cpus);
130
131/* These "safe" variants are slower and should be used when the target MSR
132 may not actually exist. */
133static void __rdmsr_safe_on_cpu(void *info)
134{
135 struct msr_info *rv = info;
136
137 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
138}
139
140static void __wrmsr_safe_on_cpu(void *info)
141{
142 struct msr_info *rv = info;
143
144 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
145}
146
147int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
148{ 6{
149 int err; 7 struct msr *msrs = NULL;
150 struct msr_info rv;
151 8
152 memset(&rv, 0, sizeof(rv)); 9 msrs = alloc_percpu(struct msr);
10 if (!msrs) {
11 pr_warning("%s: error allocating msrs\n", __func__);
12 return NULL;
13 }
153 14
154 rv.msr_no = msr_no; 15 return msrs;
155 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
156 *l = rv.reg.l;
157 *h = rv.reg.h;
158
159 return err ? err : rv.err;
160} 16}
161EXPORT_SYMBOL(rdmsr_safe_on_cpu); 17EXPORT_SYMBOL(msrs_alloc);
162 18
163int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 19void msrs_free(struct msr *msrs)
164{ 20{
165 int err; 21 free_percpu(msrs);
166 struct msr_info rv;
167
168 memset(&rv, 0, sizeof(rv));
169
170 rv.msr_no = msr_no;
171 rv.reg.l = l;
172 rv.reg.h = h;
173 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
174
175 return err ? err : rv.err;
176}
177EXPORT_SYMBOL(wrmsr_safe_on_cpu);
178
179/*
180 * These variants are significantly slower, but allows control over
181 * the entire 32-bit GPR set.
182 */
183struct msr_regs_info {
184 u32 *regs;
185 int err;
186};
187
188static void __rdmsr_safe_regs_on_cpu(void *info)
189{
190 struct msr_regs_info *rv = info;
191
192 rv->err = rdmsr_safe_regs(rv->regs);
193}
194
195static void __wrmsr_safe_regs_on_cpu(void *info)
196{
197 struct msr_regs_info *rv = info;
198
199 rv->err = wrmsr_safe_regs(rv->regs);
200}
201
202int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
203{
204 int err;
205 struct msr_regs_info rv;
206
207 rv.regs = regs;
208 rv.err = -EIO;
209 err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
210
211 return err ? err : rv.err;
212}
213EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
214
215int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
216{
217 int err;
218 struct msr_regs_info rv;
219
220 rv.regs = regs;
221 rv.err = -EIO;
222 err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
223
224 return err ? err : rv.err;
225} 22}
226EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); 23EXPORT_SYMBOL(msrs_free);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f118d462acc..e218d5df85ff 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user);
874 * data to the requested size using zero bytes. 874 * data to the requested size using zero bytes.
875 */ 875 */
876unsigned long 876unsigned long
877copy_from_user(void *to, const void __user *from, unsigned long n) 877_copy_from_user(void *to, const void __user *from, unsigned long n)
878{ 878{
879 if (access_ok(VERIFY_READ, from, n)) 879 if (access_ok(VERIFY_READ, from, n))
880 n = __copy_from_user(to, from, n); 880 n = __copy_from_user(to, from, n);
@@ -882,4 +882,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
882 memset(to, 0, n); 882 memset(to, 0, n);
883 return n; 883 return n;
884} 884}
885EXPORT_SYMBOL(copy_from_user); 885EXPORT_SYMBOL(_copy_from_user);
886
887void copy_from_user_overflow(void)
888{
889 WARN(1, "Buffer overflow detected!\n");
890}
891EXPORT_SYMBOL(copy_from_user_overflow);
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 000000000000..a793da5e560e
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,893 @@
1# x86 Opcode Maps
2#
3#<Opcode maps>
4# Table: table-name
5# Referrer: escaped-name
6# AVXcode: avx-code
7# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
8# (or)
9# opcode: escape # escaped-name
10# EndTable
11#
12#<group maps>
13# GrpTable: GrpXXX
14# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
15# EndTable
16#
17# AVX Superscripts
18# (VEX): this opcode can accept VEX prefix.
19# (oVEX): this opcode requires VEX prefix.
20# (o128): this opcode only supports 128bit VEX.
21# (o256): this opcode only supports 256bit VEX.
22#
23
24Table: one byte opcode
25Referrer:
26AVXcode:
27# 0x00 - 0x0f
2800: ADD Eb,Gb
2901: ADD Ev,Gv
3002: ADD Gb,Eb
3103: ADD Gv,Ev
3204: ADD AL,Ib
3305: ADD rAX,Iz
3406: PUSH ES (i64)
3507: POP ES (i64)
3608: OR Eb,Gb
3709: OR Ev,Gv
380a: OR Gb,Eb
390b: OR Gv,Ev
400c: OR AL,Ib
410d: OR rAX,Iz
420e: PUSH CS (i64)
430f: escape # 2-byte escape
44# 0x10 - 0x1f
4510: ADC Eb,Gb
4611: ADC Ev,Gv
4712: ADC Gb,Eb
4813: ADC Gv,Ev
4914: ADC AL,Ib
5015: ADC rAX,Iz
5116: PUSH SS (i64)
5217: POP SS (i64)
5318: SBB Eb,Gb
5419: SBB Ev,Gv
551a: SBB Gb,Eb
561b: SBB Gv,Ev
571c: SBB AL,Ib
581d: SBB rAX,Iz
591e: PUSH DS (i64)
601f: POP DS (i64)
61# 0x20 - 0x2f
6220: AND Eb,Gb
6321: AND Ev,Gv
6422: AND Gb,Eb
6523: AND Gv,Ev
6624: AND AL,Ib
6725: AND rAx,Iz
6826: SEG=ES (Prefix)
6927: DAA (i64)
7028: SUB Eb,Gb
7129: SUB Ev,Gv
722a: SUB Gb,Eb
732b: SUB Gv,Ev
742c: SUB AL,Ib
752d: SUB rAX,Iz
762e: SEG=CS (Prefix)
772f: DAS (i64)
78# 0x30 - 0x3f
7930: XOR Eb,Gb
8031: XOR Ev,Gv
8132: XOR Gb,Eb
8233: XOR Gv,Ev
8334: XOR AL,Ib
8435: XOR rAX,Iz
8536: SEG=SS (Prefix)
8637: AAA (i64)
8738: CMP Eb,Gb
8839: CMP Ev,Gv
893a: CMP Gb,Eb
903b: CMP Gv,Ev
913c: CMP AL,Ib
923d: CMP rAX,Iz
933e: SEG=DS (Prefix)
943f: AAS (i64)
95# 0x40 - 0x4f
9640: INC eAX (i64) | REX (o64)
9741: INC eCX (i64) | REX.B (o64)
9842: INC eDX (i64) | REX.X (o64)
9943: INC eBX (i64) | REX.XB (o64)
10044: INC eSP (i64) | REX.R (o64)
10145: INC eBP (i64) | REX.RB (o64)
10246: INC eSI (i64) | REX.RX (o64)
10347: INC eDI (i64) | REX.RXB (o64)
10448: DEC eAX (i64) | REX.W (o64)
10549: DEC eCX (i64) | REX.WB (o64)
1064a: DEC eDX (i64) | REX.WX (o64)
1074b: DEC eBX (i64) | REX.WXB (o64)
1084c: DEC eSP (i64) | REX.WR (o64)
1094d: DEC eBP (i64) | REX.WRB (o64)
1104e: DEC eSI (i64) | REX.WRX (o64)
1114f: DEC eDI (i64) | REX.WRXB (o64)
112# 0x50 - 0x5f
11350: PUSH rAX/r8 (d64)
11451: PUSH rCX/r9 (d64)
11552: PUSH rDX/r10 (d64)
11653: PUSH rBX/r11 (d64)
11754: PUSH rSP/r12 (d64)
11855: PUSH rBP/r13 (d64)
11956: PUSH rSI/r14 (d64)
12057: PUSH rDI/r15 (d64)
12158: POP rAX/r8 (d64)
12259: POP rCX/r9 (d64)
1235a: POP rDX/r10 (d64)
1245b: POP rBX/r11 (d64)
1255c: POP rSP/r12 (d64)
1265d: POP rBP/r13 (d64)
1275e: POP rSI/r14 (d64)
1285f: POP rDI/r15 (d64)
129# 0x60 - 0x6f
13060: PUSHA/PUSHAD (i64)
13161: POPA/POPAD (i64)
13262: BOUND Gv,Ma (i64)
13363: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
13464: SEG=FS (Prefix)
13565: SEG=GS (Prefix)
13666: Operand-Size (Prefix)
13767: Address-Size (Prefix)
13868: PUSH Iz (d64)
13969: IMUL Gv,Ev,Iz
1406a: PUSH Ib (d64)
1416b: IMUL Gv,Ev,Ib
1426c: INS/INSB Yb,DX
1436d: INS/INSW/INSD Yz,DX
1446e: OUTS/OUTSB DX,Xb
1456f: OUTS/OUTSW/OUTSD DX,Xz
146# 0x70 - 0x7f
14770: JO Jb
14871: JNO Jb
14972: JB/JNAE/JC Jb
15073: JNB/JAE/JNC Jb
15174: JZ/JE Jb
15275: JNZ/JNE Jb
15376: JBE/JNA Jb
15477: JNBE/JA Jb
15578: JS Jb
15679: JNS Jb
1577a: JP/JPE Jb
1587b: JNP/JPO Jb
1597c: JL/JNGE Jb
1607d: JNL/JGE Jb
1617e: JLE/JNG Jb
1627f: JNLE/JG Jb
163# 0x80 - 0x8f
16480: Grp1 Eb,Ib (1A)
16581: Grp1 Ev,Iz (1A)
16682: Grp1 Eb,Ib (1A),(i64)
16783: Grp1 Ev,Ib (1A)
16884: TEST Eb,Gb
16985: TEST Ev,Gv
17086: XCHG Eb,Gb
17187: XCHG Ev,Gv
17288: MOV Eb,Gb
17389: MOV Ev,Gv
1748a: MOV Gb,Eb
1758b: MOV Gv,Ev
1768c: MOV Ev,Sw
1778d: LEA Gv,M
1788e: MOV Sw,Ew
1798f: Grp1A (1A) | POP Ev (d64)
180# 0x90 - 0x9f
18190: NOP | PAUSE (F3) | XCHG r8,rAX
18291: XCHG rCX/r9,rAX
18392: XCHG rDX/r10,rAX
18493: XCHG rBX/r11,rAX
18594: XCHG rSP/r12,rAX
18695: XCHG rBP/r13,rAX
18796: XCHG rSI/r14,rAX
18897: XCHG rDI/r15,rAX
18998: CBW/CWDE/CDQE
19099: CWD/CDQ/CQO
1919a: CALLF Ap (i64)
1929b: FWAIT/WAIT
1939c: PUSHF/D/Q Fv (d64)
1949d: POPF/D/Q Fv (d64)
1959e: SAHF
1969f: LAHF
197# 0xa0 - 0xaf
198a0: MOV AL,Ob
199a1: MOV rAX,Ov
200a2: MOV Ob,AL
201a3: MOV Ov,rAX
202a4: MOVS/B Xb,Yb
203a5: MOVS/W/D/Q Xv,Yv
204a6: CMPS/B Xb,Yb
205a7: CMPS/W/D Xv,Yv
206a8: TEST AL,Ib
207a9: TEST rAX,Iz
208aa: STOS/B Yb,AL
209ab: STOS/W/D/Q Yv,rAX
210ac: LODS/B AL,Xb
211ad: LODS/W/D/Q rAX,Xv
212ae: SCAS/B AL,Yb
213af: SCAS/W/D/Q rAX,Xv
214# 0xb0 - 0xbf
215b0: MOV AL/R8L,Ib
216b1: MOV CL/R9L,Ib
217b2: MOV DL/R10L,Ib
218b3: MOV BL/R11L,Ib
219b4: MOV AH/R12L,Ib
220b5: MOV CH/R13L,Ib
221b6: MOV DH/R14L,Ib
222b7: MOV BH/R15L,Ib
223b8: MOV rAX/r8,Iv
224b9: MOV rCX/r9,Iv
225ba: MOV rDX/r10,Iv
226bb: MOV rBX/r11,Iv
227bc: MOV rSP/r12,Iv
228bd: MOV rBP/r13,Iv
229be: MOV rSI/r14,Iv
230bf: MOV rDI/r15,Iv
231# 0xc0 - 0xcf
232c0: Grp2 Eb,Ib (1A)
233c1: Grp2 Ev,Ib (1A)
234c2: RETN Iw (f64)
235c3: RETN
236c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
237c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
238c6: Grp11 Eb,Ib (1A)
239c7: Grp11 Ev,Iz (1A)
240c8: ENTER Iw,Ib
241c9: LEAVE (d64)
242ca: RETF Iw
243cb: RETF
244cc: INT3
245cd: INT Ib
246ce: INTO (i64)
247cf: IRET/D/Q
248# 0xd0 - 0xdf
249d0: Grp2 Eb,1 (1A)
250d1: Grp2 Ev,1 (1A)
251d2: Grp2 Eb,CL (1A)
252d3: Grp2 Ev,CL (1A)
253d4: AAM Ib (i64)
254d5: AAD Ib (i64)
255d6:
256d7: XLAT/XLATB
257d8: ESC
258d9: ESC
259da: ESC
260db: ESC
261dc: ESC
262dd: ESC
263de: ESC
264df: ESC
265# 0xe0 - 0xef
266e0: LOOPNE/LOOPNZ Jb (f64)
267e1: LOOPE/LOOPZ Jb (f64)
268e2: LOOP Jb (f64)
269e3: JrCXZ Jb (f64)
270e4: IN AL,Ib
271e5: IN eAX,Ib
272e6: OUT Ib,AL
273e7: OUT Ib,eAX
274e8: CALL Jz (f64)
275e9: JMP-near Jz (f64)
276ea: JMP-far Ap (i64)
277eb: JMP-short Jb (f64)
278ec: IN AL,DX
279ed: IN eAX,DX
280ee: OUT DX,AL
281ef: OUT DX,eAX
282# 0xf0 - 0xff
283f0: LOCK (Prefix)
284f1:
285f2: REPNE (Prefix)
286f3: REP/REPE (Prefix)
287f4: HLT
288f5: CMC
289f6: Grp3_1 Eb (1A)
290f7: Grp3_2 Ev (1A)
291f8: CLC
292f9: STC
293fa: CLI
294fb: STI
295fc: CLD
296fd: STD
297fe: Grp4 (1A)
298ff: Grp5 (1A)
299EndTable
300
301Table: 2-byte opcode (0x0f)
302Referrer: 2-byte escape
303AVXcode: 1
304# 0x0f 0x00-0x0f
30500: Grp6 (1A)
30601: Grp7 (1A)
30702: LAR Gv,Ew
30803: LSL Gv,Ew
30904:
31005: SYSCALL (o64)
31106: CLTS
31207: SYSRET (o64)
31308: INVD
31409: WBINVD
3150a:
3160b: UD2 (1B)
3170c:
3180d: NOP Ev | GrpP
3190e: FEMMS
320# 3DNow! uses the last imm byte as opcode extension.
3210f: 3DNow! Pq,Qq,Ib
322# 0x0f 0x10-0x1f
32310: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
32411: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
32512: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
32613: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
32714: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
32815: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
32916: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
33017: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
33118: Grp16 (1A)
33219:
3331a:
3341b:
3351c:
3361d:
3371e:
3381f: NOP Ev
339# 0x0f 0x20-0x2f
34020: MOV Rd,Cd
34121: MOV Rd,Dd
34222: MOV Cd,Rd
34323: MOV Dd,Rd
34424:
34525:
34626:
34727:
34828: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
34929: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
3502a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
3512b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
3522c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
3532d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
3542e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
3552f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
356# 0x0f 0x30-0x3f
35730: WRMSR
35831: RDTSC
35932: RDMSR
36033: RDPMC
36134: SYSENTER
36235: SYSEXIT
36336:
36437: GETSEC
36538: escape # 3-byte escape 1
36639:
3673a: escape # 3-byte escape 2
3683b:
3693c:
3703d:
3713e:
3723f:
373# 0x0f 0x40-0x4f
37440: CMOVO Gv,Ev
37541: CMOVNO Gv,Ev
37642: CMOVB/C/NAE Gv,Ev
37743: CMOVAE/NB/NC Gv,Ev
37844: CMOVE/Z Gv,Ev
37945: CMOVNE/NZ Gv,Ev
38046: CMOVBE/NA Gv,Ev
38147: CMOVA/NBE Gv,Ev
38248: CMOVS Gv,Ev
38349: CMOVNS Gv,Ev
3844a: CMOVP/PE Gv,Ev
3854b: CMOVNP/PO Gv,Ev
3864c: CMOVL/NGE Gv,Ev
3874d: CMOVNL/GE Gv,Ev
3884e: CMOVLE/NG Gv,Ev
3894f: CMOVNLE/G Gv,Ev
390# 0x0f 0x50-0x5f
39150: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
39251: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
39352: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
39453: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
39554: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
39655: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
39756: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
39857: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
39958: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
40059: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
4015a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
4025b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
4035c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
4045d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
4055e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
4065f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
407# 0x0f 0x60-0x6f
40860: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
40961: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
41062: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
41163: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
41264: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
41365: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
41466: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
41567: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
41668: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
41769: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
4186a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
4196b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
4206c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
4216d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
4226e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
4236f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
424# 0x0f 0x70-0x7f
42570: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
42671: Grp12 (1A)
42772: Grp13 (1A)
42873: Grp14 (1A)
42974: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
43075: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
43176: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
43277: emms/vzeroupper/vzeroall (VEX)
43378: VMREAD Ed/q,Gd/q
43479: VMWRITE Gd/q,Ed/q
4357a:
4367b:
4377c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
4387d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
4397e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
4407f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
441# 0x0f 0x80-0x8f
44280: JO Jz (f64)
44381: JNO Jz (f64)
44482: JB/JNAE/JC Jz (f64)
44583: JNB/JAE/JNC Jz (f64)
44684: JZ/JE Jz (f64)
44785: JNZ/JNE Jz (f64)
44886: JBE/JNA Jz (f64)
44987: JNBE/JA Jz (f64)
45088: JS Jz (f64)
45189: JNS Jz (f64)
4528a: JP/JPE Jz (f64)
4538b: JNP/JPO Jz (f64)
4548c: JL/JNGE Jz (f64)
4558d: JNL/JGE Jz (f64)
4568e: JLE/JNG Jz (f64)
4578f: JNLE/JG Jz (f64)
458# 0x0f 0x90-0x9f
45990: SETO Eb
46091: SETNO Eb
46192: SETB/C/NAE Eb
46293: SETAE/NB/NC Eb
46394: SETE/Z Eb
46495: SETNE/NZ Eb
46596: SETBE/NA Eb
46697: SETA/NBE Eb
46798: SETS Eb
46899: SETNS Eb
4699a: SETP/PE Eb
4709b: SETNP/PO Eb
4719c: SETL/NGE Eb
4729d: SETNL/GE Eb
4739e: SETLE/NG Eb
4749f: SETNLE/G Eb
475# 0x0f 0xa0-0xaf
476a0: PUSH FS (d64)
477a1: POP FS (d64)
478a2: CPUID
479a3: BT Ev,Gv
480a4: SHLD Ev,Gv,Ib
481a5: SHLD Ev,Gv,CL
482a6: GrpPDLK
483a7: GrpRNG
484a8: PUSH GS (d64)
485a9: POP GS (d64)
486aa: RSM
487ab: BTS Ev,Gv
488ac: SHRD Ev,Gv,Ib
489ad: SHRD Ev,Gv,CL
490ae: Grp15 (1A),(1C)
491af: IMUL Gv,Ev
492# 0x0f 0xb0-0xbf
493b0: CMPXCHG Eb,Gb
494b1: CMPXCHG Ev,Gv
495b2: LSS Gv,Mp
496b3: BTR Ev,Gv
497b4: LFS Gv,Mp
498b5: LGS Gv,Mp
499b6: MOVZX Gv,Eb
500b7: MOVZX Gv,Ew
501b8: JMPE | POPCNT Gv,Ev (F3)
502b9: Grp10 (1A)
503ba: Grp8 Ev,Ib (1A)
504bb: BTC Ev,Gv
505bc: BSF Gv,Ev
506bd: BSR Gv,Ev
507be: MOVSX Gv,Eb
508bf: MOVSX Gv,Ew
509# 0x0f 0xc0-0xcf
510c0: XADD Eb,Gb
511c1: XADD Ev,Gv
512c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
513c3: movnti Md/q,Gd/q
514c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
515c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
516c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
517c7: Grp9 (1A)
518c8: BSWAP RAX/EAX/R8/R8D
519c9: BSWAP RCX/ECX/R9/R9D
520ca: BSWAP RDX/EDX/R10/R10D
521cb: BSWAP RBX/EBX/R11/R11D
522cc: BSWAP RSP/ESP/R12/R12D
523cd: BSWAP RBP/EBP/R13/R13D
524ce: BSWAP RSI/ESI/R14/R14D
525cf: BSWAP RDI/EDI/R15/R15D
526# 0x0f 0xd0-0xdf
527d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
528d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
529d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
530d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
531d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
532d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
533d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
534d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
535d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
536d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
537da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
538db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
539dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
540dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
541de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
542df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
543# 0x0f 0xe0-0xef
544e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
545e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
546e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
547e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
548e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
549e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
550e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
551e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
552e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
553e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
554ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
555eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
556ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
557ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
558ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
559ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
560# 0x0f 0xf0-0xff
561f0: lddqu Vdq,Mdq (F2),(VEX)
562f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
563f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
564f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
565f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
566f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
567f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
568f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
569f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
570f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
571fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
572fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
573fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
574fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
575fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
576ff:
577EndTable
578
579Table: 3-byte opcode 1 (0x0f 0x38)
580Referrer: 3-byte escape 1
581AVXcode: 2
582# 0x0f 0x38 0x00-0x0f
58300: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
58401: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
58502: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
58603: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
58704: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
58805: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
58906: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
59007: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
59108: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
59209: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
5930a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
5940b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
5950c: Vpermilps /r (66),(oVEX)
5960d: Vpermilpd /r (66),(oVEX)
5970e: vtestps /r (66),(oVEX)
5980f: vtestpd /r (66),(oVEX)
599# 0x0f 0x38 0x10-0x1f
60010: pblendvb Vdq,Wdq (66)
60111:
60212:
60313:
60414: blendvps Vdq,Wdq (66)
60515: blendvpd Vdq,Wdq (66)
60616:
60717: ptest Vdq,Wdq (66),(VEX)
60818: vbroadcastss /r (66),(oVEX)
60919: vbroadcastsd /r (66),(oVEX),(o256)
6101a: vbroadcastf128 /r (66),(oVEX),(o256)
6111b:
6121c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
6131d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
6141e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
6151f:
616# 0x0f 0x38 0x20-0x2f
61720: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
61821: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
61922: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
62023: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
62124: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
62225: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
62326:
62427:
62528: pmuldq Vdq,Wdq (66),(VEX),(o128)
62629: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
6272a: movntdqa Vdq,Mdq (66),(VEX),(o128)
6282b: packusdw Vdq,Wdq (66),(VEX),(o128)
6292c: vmaskmovps(ld) /r (66),(oVEX)
6302d: vmaskmovpd(ld) /r (66),(oVEX)
6312e: vmaskmovps(st) /r (66),(oVEX)
6322f: vmaskmovpd(st) /r (66),(oVEX)
633# 0x0f 0x38 0x30-0x3f
63430: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
63531: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
63632: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
63733: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
63834: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
63935: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
64036:
64137: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
64238: pminsb Vdq,Wdq (66),(VEX),(o128)
64339: pminsd Vdq,Wdq (66),(VEX),(o128)
6443a: pminuw Vdq,Wdq (66),(VEX),(o128)
6453b: pminud Vdq,Wdq (66),(VEX),(o128)
6463c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
6473d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
6483e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
6493f: pmaxud Vdq,Wdq (66),(VEX),(o128)
650# 0x0f 0x38 0x40-0x8f
65140: pmulld Vdq,Wdq (66),(VEX),(o128)
65241: phminposuw Vdq,Wdq (66),(VEX),(o128)
65380: INVEPT Gd/q,Mdq (66)
65481: INVPID Gd/q,Mdq (66)
655# 0x0f 0x38 0x90-0xbf (FMA)
65696: vfmaddsub132pd/ps /r (66),(VEX)
65797: vfmsubadd132pd/ps /r (66),(VEX)
65898: vfmadd132pd/ps /r (66),(VEX)
65999: vfmadd132sd/ss /r (66),(VEX),(o128)
6609a: vfmsub132pd/ps /r (66),(VEX)
6619b: vfmsub132sd/ss /r (66),(VEX),(o128)
6629c: vfnmadd132pd/ps /r (66),(VEX)
6639d: vfnmadd132sd/ss /r (66),(VEX),(o128)
6649e: vfnmsub132pd/ps /r (66),(VEX)
6659f: vfnmsub132sd/ss /r (66),(VEX),(o128)
666a6: vfmaddsub213pd/ps /r (66),(VEX)
667a7: vfmsubadd213pd/ps /r (66),(VEX)
668a8: vfmadd213pd/ps /r (66),(VEX)
669a9: vfmadd213sd/ss /r (66),(VEX),(o128)
670aa: vfmsub213pd/ps /r (66),(VEX)
671ab: vfmsub213sd/ss /r (66),(VEX),(o128)
672ac: vfnmadd213pd/ps /r (66),(VEX)
673ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
674ae: vfnmsub213pd/ps /r (66),(VEX)
675af: vfnmsub213sd/ss /r (66),(VEX),(o128)
676b6: vfmaddsub231pd/ps /r (66),(VEX)
677b7: vfmsubadd231pd/ps /r (66),(VEX)
678b8: vfmadd231pd/ps /r (66),(VEX)
679b9: vfmadd231sd/ss /r (66),(VEX),(o128)
680ba: vfmsub231pd/ps /r (66),(VEX)
681bb: vfmsub231sd/ss /r (66),(VEX),(o128)
682bc: vfnmadd231pd/ps /r (66),(VEX)
683bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
684be: vfnmsub231pd/ps /r (66),(VEX)
685bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
686# 0x0f 0x38 0xc0-0xff
687db: aesimc Vdq,Wdq (66),(VEX),(o128)
688dc: aesenc Vdq,Wdq (66),(VEX),(o128)
689dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
690de: aesdec Vdq,Wdq (66),(VEX),(o128)
691df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
692f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
693f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
694EndTable
695
696Table: 3-byte opcode 2 (0x0f 0x3a)
697Referrer: 3-byte escape 2
698AVXcode: 3
699# 0x0f 0x3a 0x00-0xff
70004: vpermilps /r,Ib (66),(oVEX)
70105: vpermilpd /r,Ib (66),(oVEX)
70206: vperm2f128 /r,Ib (66),(oVEX),(o256)
70308: roundps Vdq,Wdq,Ib (66),(VEX)
70409: roundpd Vdq,Wdq,Ib (66),(VEX)
7050a: roundss Vss,Wss,Ib (66),(VEX),(o128)
7060b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
7070c: blendps Vdq,Wdq,Ib (66),(VEX)
7080d: blendpd Vdq,Wdq,Ib (66),(VEX)
7090e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
7100f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
71114: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
71215: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
71316: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
71417: extractps Ed,Vdq,Ib (66),(VEX),(o128)
71518: vinsertf128 /r,Ib (66),(oVEX),(o256)
71619: vextractf128 /r,Ib (66),(oVEX),(o256)
71720: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
71821: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
71922: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
72040: dpps Vdq,Wdq,Ib (66),(VEX)
72141: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
72242: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
72344: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
7244a: vblendvps /r,Ib (66),(oVEX)
7254b: vblendvpd /r,Ib (66),(oVEX)
7264c: vpblendvb /r,Ib (66),(oVEX),(o128)
72760: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
72861: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
72962: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
73063: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
731df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
732EndTable
733
734GrpTable: Grp1
7350: ADD
7361: OR
7372: ADC
7383: SBB
7394: AND
7405: SUB
7416: XOR
7427: CMP
743EndTable
744
745GrpTable: Grp1A
7460: POP
747EndTable
748
749GrpTable: Grp2
7500: ROL
7511: ROR
7522: RCL
7533: RCR
7544: SHL/SAL
7555: SHR
7566:
7577: SAR
758EndTable
759
760GrpTable: Grp3_1
7610: TEST Eb,Ib
7621:
7632: NOT Eb
7643: NEG Eb
7654: MUL AL,Eb
7665: IMUL AL,Eb
7676: DIV AL,Eb
7687: IDIV AL,Eb
769EndTable
770
771GrpTable: Grp3_2
7720: TEST Ev,Iz
7731:
7742: NOT Ev
7753: NEG Ev
7764: MUL rAX,Ev
7775: IMUL rAX,Ev
7786: DIV rAX,Ev
7797: IDIV rAX,Ev
780EndTable
781
782GrpTable: Grp4
7830: INC Eb
7841: DEC Eb
785EndTable
786
787GrpTable: Grp5
7880: INC Ev
7891: DEC Ev
7902: CALLN Ev (f64)
7913: CALLF Ep
7924: JMPN Ev (f64)
7935: JMPF Ep
7946: PUSH Ev (d64)
7957:
796EndTable
797
798GrpTable: Grp6
7990: SLDT Rv/Mw
8001: STR Rv/Mw
8012: LLDT Ew
8023: LTR Ew
8034: VERR Ew
8045: VERW Ew
805EndTable
806
807GrpTable: Grp7
8080: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
8091: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
8102: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
8113: LIDT Ms
8124: SMSW Mw/Rv
8135:
8146: LMSW Ew
8157: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
816EndTable
817
818GrpTable: Grp8
8194: BT
8205: BTS
8216: BTR
8227: BTC
823EndTable
824
825GrpTable: Grp9
8261: CMPXCHG8B/16B Mq/Mdq
8276: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
8287: VMPTRST Mq
829EndTable
830
831GrpTable: Grp10
832EndTable
833
834GrpTable: Grp11
8350: MOV
836EndTable
837
838GrpTable: Grp12
8392: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
8404: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
8416: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
842EndTable
843
844GrpTable: Grp13
8452: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
8464: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
8476: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
848EndTable
849
850GrpTable: Grp14
8512: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
8523: psrldq Udq,Ib (66),(11B),(VEX),(o128)
8536: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
8547: pslldq Udq,Ib (66),(11B),(VEX),(o128)
855EndTable
856
857GrpTable: Grp15
8580: fxsave
8591: fxstor
8602: ldmxcsr (VEX)
8613: stmxcsr (VEX)
8624: XSAVE
8635: XRSTOR | lfence (11B)
8646: mfence (11B)
8657: clflush | sfence (11B)
866EndTable
867
868GrpTable: Grp16
8690: prefetch NTA
8701: prefetch T0
8712: prefetch T1
8723: prefetch T2
873EndTable
874
875# AMD's Prefetch Group
876GrpTable: GrpP
8770: PREFETCH
8781: PREFETCHW
879EndTable
880
881GrpTable: GrpPDLK
8820: MONTMUL
8831: XSHA1
8842: XSHA2
885EndTable
886
887GrpTable: GrpRNG
8880: xstore-rng
8891: xcrypt-ecb
8902: xcrypt-cbc
8914: xcrypt-cfb
8925: xcrypt-ofb
893EndTable
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61b41ca3b5a2..d0474ad2a6e5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs)
35 35
36 return 0; 36 return 0;
37} 37}
38
39#ifdef CONFIG_X86_64
40/*
41 * Need to defined our own search_extable on X86_64 to work around
42 * a B stepping K8 bug.
43 */
44const struct exception_table_entry *
45search_extable(const struct exception_table_entry *first,
46 const struct exception_table_entry *last,
47 unsigned long value)
48{
49 /* B stepping K8 bug */
50 if ((value >> 32) == 0)
51 value |= 0xffffffffUL << 32;
52
53 while (first <= last) {
54 const struct exception_table_entry *mid;
55 long diff;
56
57 mid = (last - first) / 2 + first;
58 diff = mid->insn - value;
59 if (diff == 0)
60 return mid;
61 else if (diff < 0)
62 first = mid+1;
63 else
64 last = mid-1;
65 }
66 return NULL;
67}
68#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee9028cf0..f62777940dfb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
38 * Returns 0 if mmiotrace is disabled, or if the fault is not 38 * Returns 0 if mmiotrace is disabled, or if the fault is not
39 * handled by mmiotrace: 39 * handled by mmiotrace:
40 */ 40 */
41static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 41static inline int __kprobes
42kmmio_fault(struct pt_regs *regs, unsigned long addr)
42{ 43{
43 if (unlikely(is_kmmio_active())) 44 if (unlikely(is_kmmio_active()))
44 if (kmmio_handler(regs, addr) == 1) 45 if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
46 return 0; 47 return 0;
47} 48}
48 49
49static inline int notify_page_fault(struct pt_regs *regs) 50static inline int __kprobes notify_page_fault(struct pt_regs *regs)
50{ 51{
51 int ret = 0; 52 int ret = 0;
52 53
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
240 * 241 *
241 * Handle a fault on the vmalloc or module mapping area 242 * Handle a fault on the vmalloc or module mapping area
242 */ 243 */
243static noinline int vmalloc_fault(unsigned long address) 244static noinline __kprobes int vmalloc_fault(unsigned long address)
244{ 245{
245 unsigned long pgd_paddr; 246 unsigned long pgd_paddr;
246 pmd_t *pmd_k; 247 pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
357 * 358 *
358 * This assumes no large pages in there. 359 * This assumes no large pages in there.
359 */ 360 */
360static noinline int vmalloc_fault(unsigned long address) 361static noinline __kprobes int vmalloc_fault(unsigned long address)
361{ 362{
362 pgd_t *pgd, *pgd_ref; 363 pgd_t *pgd, *pgd_ref;
363 pud_t *pud, *pud_ref; 364 pud_t *pud, *pud_ref;
@@ -658,7 +659,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
658 show_fault_oops(regs, error_code, address); 659 show_fault_oops(regs, error_code, address);
659 660
660 stackend = end_of_stack(tsk); 661 stackend = end_of_stack(tsk);
661 if (*stackend != STACK_END_MAGIC) 662 if (tsk != &init_task && *stackend != STACK_END_MAGIC)
662 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 663 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
663 664
664 tsk->thread.cr2 = address; 665 tsk->thread.cr2 = address;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
860 * There are no security implications to leaving a stale TLB when 861 * There are no security implications to leaving a stale TLB when
861 * increasing the permissions on a page. 862 * increasing the permissions on a page.
862 */ 863 */
863static noinline int 864static noinline __kprobes int
864spurious_fault(unsigned long error_code, unsigned long address) 865spurious_fault(unsigned long error_code, unsigned long address)
865{ 866{
866 pgd_t *pgd; 867 pgd_t *pgd;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..d406c5239019 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
146 use_gbpages = direct_gbpages; 146 use_gbpages = direct_gbpages;
147#endif 147#endif
148 148
149 set_nx();
150 if (nx_enabled)
151 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
152
153 /* Enable PSE if available */ 149 /* Enable PSE if available */
154 if (cpu_has_pse) 150 if (cpu_has_pse)
155 set_in_cr4(X86_CR4_PSE); 151 set_in_cr4(X86_CR4_PSE);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..c973f8e2a6cf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
412 pkmap_page_table = pte; 412 pkmap_page_table = pte;
413} 413}
414 414
415static void __init add_one_highpage_init(struct page *page, int pfn) 415static void __init add_one_highpage_init(struct page *page)
416{ 416{
417 ClearPageReserved(page); 417 ClearPageReserved(page);
418 init_page_count(page); 418 init_page_count(page);
@@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
445 if (!pfn_valid(node_pfn)) 445 if (!pfn_valid(node_pfn))
446 continue; 446 continue;
447 page = pfn_to_page(node_pfn); 447 page = pfn_to_page(node_pfn);
448 add_one_highpage_init(page, node_pfn); 448 add_one_highpage_init(page);
449 } 449 }
450 450
451 return 0; 451 return 0;
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
703} 703}
704 704
705#ifndef CONFIG_NEED_MULTIPLE_NODES 705#ifndef CONFIG_NEED_MULTIPLE_NODES
706void __init initmem_init(unsigned long start_pfn, 706void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
707 unsigned long end_pfn) 707 int acpi, int k8)
708{ 708{
709#ifdef CONFIG_HIGHMEM 709#ifdef CONFIG_HIGHMEM
710 highstart_pfn = highend_pfn = max_pfn; 710 highstart_pfn = highend_pfn = max_pfn;
@@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void)
997const int rodata_test_data = 0xC3; 997const int rodata_test_data = 0xC3;
998EXPORT_SYMBOL_GPL(rodata_test_data); 998EXPORT_SYMBOL_GPL(rodata_test_data);
999 999
1000static int kernel_set_to_readonly; 1000int kernel_set_to_readonly __read_mostly;
1001 1001
1002void set_kernel_text_rw(void) 1002void set_kernel_text_rw(void)
1003{ 1003{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..5198b9bb34ef 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
568} 568}
569 569
570#ifndef CONFIG_NUMA 570#ifndef CONFIG_NUMA
571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
572 int acpi, int k8)
572{ 573{
573 unsigned long bootmap_size, bootmap; 574 unsigned long bootmap_size, bootmap;
574 575
@@ -694,12 +695,12 @@ void __init mem_init(void)
694const int rodata_test_data = 0xC3; 695const int rodata_test_data = 0xC3;
695EXPORT_SYMBOL_GPL(rodata_test_data); 696EXPORT_SYMBOL_GPL(rodata_test_data);
696 697
697static int kernel_set_to_readonly; 698int kernel_set_to_readonly;
698 699
699void set_kernel_text_rw(void) 700void set_kernel_text_rw(void)
700{ 701{
701 unsigned long start = PFN_ALIGN(_stext); 702 unsigned long start = PFN_ALIGN(_text);
702 unsigned long end = PFN_ALIGN(__start_rodata); 703 unsigned long end = PFN_ALIGN(__stop___ex_table);
703 704
704 if (!kernel_set_to_readonly) 705 if (!kernel_set_to_readonly)
705 return; 706 return;
@@ -707,13 +708,18 @@ void set_kernel_text_rw(void)
707 pr_debug("Set kernel text: %lx - %lx for read write\n", 708 pr_debug("Set kernel text: %lx - %lx for read write\n",
708 start, end); 709 start, end);
709 710
711 /*
712 * Make the kernel identity mapping for text RW. Kernel text
713 * mapping will always be RO. Refer to the comment in
714 * static_protections() in pageattr.c
715 */
710 set_memory_rw(start, (end - start) >> PAGE_SHIFT); 716 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
711} 717}
712 718
713void set_kernel_text_ro(void) 719void set_kernel_text_ro(void)
714{ 720{
715 unsigned long start = PFN_ALIGN(_stext); 721 unsigned long start = PFN_ALIGN(_text);
716 unsigned long end = PFN_ALIGN(__start_rodata); 722 unsigned long end = PFN_ALIGN(__stop___ex_table);
717 723
718 if (!kernel_set_to_readonly) 724 if (!kernel_set_to_readonly)
719 return; 725 return;
@@ -721,14 +727,21 @@ void set_kernel_text_ro(void)
721 pr_debug("Set kernel text: %lx - %lx for read only\n", 727 pr_debug("Set kernel text: %lx - %lx for read only\n",
722 start, end); 728 start, end);
723 729
730 /*
731 * Set the kernel identity mapping for text RO.
732 */
724 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 733 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
725} 734}
726 735
727void mark_rodata_ro(void) 736void mark_rodata_ro(void)
728{ 737{
729 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 738 unsigned long start = PFN_ALIGN(_text);
730 unsigned long rodata_start = 739 unsigned long rodata_start =
731 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 740 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
741 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
742 unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
743 unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
744 unsigned long data_start = (unsigned long) &_sdata;
732 745
733 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 746 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
734 (end - start) >> 10); 747 (end - start) >> 10);
@@ -751,6 +764,14 @@ void mark_rodata_ro(void)
751 printk(KERN_INFO "Testing CPA: again\n"); 764 printk(KERN_INFO "Testing CPA: again\n");
752 set_memory_ro(start, (end-start) >> PAGE_SHIFT); 765 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
753#endif 766#endif
767
768 free_init_pages("unused kernel memory",
769 (unsigned long) page_address(virt_to_page(text_end)),
770 (unsigned long)
771 page_address(virt_to_page(rodata_start)));
772 free_init_pages("unused kernel memory",
773 (unsigned long) page_address(virt_to_page(rodata_end)),
774 (unsigned long) page_address(virt_to_page(data_start)));
754} 775}
755 776
756#endif 777#endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 334e63ca7b2b..c246d259822d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -170,8 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 (unsigned long long)phys_addr, 170 (unsigned long long)phys_addr,
171 (unsigned long long)(phys_addr + size), 171 (unsigned long long)(phys_addr + size),
172 prot_val, new_prot_val); 172 prot_val, new_prot_val);
173 free_memtype(phys_addr, phys_addr + size); 173 goto err_free_memtype;
174 return NULL;
175 } 174 }
176 prot_val = new_prot_val; 175 prot_val = new_prot_val;
177 } 176 }
@@ -197,26 +196,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
197 */ 196 */
198 area = get_vm_area_caller(size, VM_IOREMAP, caller); 197 area = get_vm_area_caller(size, VM_IOREMAP, caller);
199 if (!area) 198 if (!area)
200 return NULL; 199 goto err_free_memtype;
201 area->phys_addr = phys_addr; 200 area->phys_addr = phys_addr;
202 vaddr = (unsigned long) area->addr; 201 vaddr = (unsigned long) area->addr;
203 202
204 if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { 203 if (kernel_map_sync_memtype(phys_addr, size, prot_val))
205 free_memtype(phys_addr, phys_addr + size); 204 goto err_free_area;
206 free_vm_area(area);
207 return NULL;
208 }
209 205
210 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { 206 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
211 free_memtype(phys_addr, phys_addr + size); 207 goto err_free_area;
212 free_vm_area(area);
213 return NULL;
214 }
215 208
216 ret_addr = (void __iomem *) (vaddr + offset); 209 ret_addr = (void __iomem *) (vaddr + offset);
217 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 210 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
218 211
219 return ret_addr; 212 return ret_addr;
213err_free_area:
214 free_vm_area(area);
215err_free_memtype:
216 free_memtype(phys_addr, phys_addr + size);
217 return NULL;
220} 218}
221 219
222/** 220/**
@@ -283,30 +281,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
283} 281}
284EXPORT_SYMBOL(ioremap_cache); 282EXPORT_SYMBOL(ioremap_cache);
285 283
286static void __iomem *ioremap_default(resource_size_t phys_addr,
287 unsigned long size)
288{
289 unsigned long flags;
290 void __iomem *ret;
291 int err;
292
293 /*
294 * - WB for WB-able memory and no other conflicting mappings
295 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
296 * - Inherit from confliting mappings otherwise
297 */
298 err = reserve_memtype(phys_addr, phys_addr + size,
299 _PAGE_CACHE_WB, &flags);
300 if (err < 0)
301 return NULL;
302
303 ret = __ioremap_caller(phys_addr, size, flags,
304 __builtin_return_address(0));
305
306 free_memtype(phys_addr, phys_addr + size);
307 return ret;
308}
309
310void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, 284void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
311 unsigned long prot_val) 285 unsigned long prot_val)
312{ 286{
@@ -382,7 +356,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
382 if (page_is_ram(start >> PAGE_SHIFT)) 356 if (page_is_ram(start >> PAGE_SHIFT))
383 return __va(phys); 357 return __va(phys);
384 358
385 addr = (void __force *)ioremap_default(start, PAGE_SIZE); 359 addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
386 if (addr) 360 if (addr)
387 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 361 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
388 362
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/k8.h>
26 26
27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
29
27static __init int find_northbridge(void) 30static __init int find_northbridge(void)
28{ 31{
29 int num; 32 int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
54 * need to get boot_cpu_id so can use that to create apicid_to_node 57 * need to get boot_cpu_id so can use that to create apicid_to_node
55 * in k8_scan_nodes() 58 * in k8_scan_nodes()
56 */ 59 */
57 /*
58 * Find possible boot-time SMP configuration:
59 */
60#ifdef CONFIG_X86_MPPARSE
61 early_find_smp_config();
62#endif
63#ifdef CONFIG_ACPI
64 /*
65 * Read APIC information from ACPI tables.
66 */
67 early_acpi_boot_init();
68#endif
69#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
70 /* 61 /*
71 * get boot-time SMP configuration: 62 * get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
76 early_init_lapic_mapping(); 67 early_init_lapic_mapping();
77} 68}
78 69
79int __init k8_scan_nodes(unsigned long start, unsigned long end) 70int __init k8_get_nodes(struct bootnode *physnodes)
80{ 71{
81 unsigned numnodes, cores, bits, apicid_base; 72 int i;
73 int ret = 0;
74
75 for_each_node_mask(i, nodes_parsed) {
76 physnodes[ret].start = nodes[i].start;
77 physnodes[ret].end = nodes[i].end;
78 ret++;
79 }
80 return ret;
81}
82
83int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
84{
85 unsigned long start = PFN_PHYS(start_pfn);
86 unsigned long end = PFN_PHYS(end_pfn);
87 unsigned numnodes;
82 unsigned long prevbase; 88 unsigned long prevbase;
83 struct bootnode nodes[8]; 89 int i, nb, found = 0;
84 int i, j, nb, found = 0;
85 u32 nodeid, reg; 90 u32 nodeid, reg;
86 91
87 if (!early_pci_allowed()) 92 if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
91 if (nb < 0) 96 if (nb < 0)
92 return nb; 97 return nb;
93 98
94 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 99 pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
95 100
96 reg = read_pci_config(0, nb, 0, 0x60); 101 reg = read_pci_config(0, nb, 0, 0x60);
97 numnodes = ((reg >> 4) & 0xF) + 1; 102 numnodes = ((reg >> 4) & 0xF) + 1;
98 if (numnodes <= 1) 103 if (numnodes <= 1)
99 return -1; 104 return -1;
100 105
101 printk(KERN_INFO "Number of nodes %d\n", numnodes); 106 pr_info("Number of physical nodes %d\n", numnodes);
102 107
103 memset(&nodes, 0, sizeof(nodes));
104 prevbase = 0; 108 prevbase = 0;
105 for (i = 0; i < 8; i++) { 109 for (i = 0; i < 8; i++) {
106 unsigned long base, limit; 110 unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
111 nodeid = limit & 7; 115 nodeid = limit & 7;
112 if ((base & 3) == 0) { 116 if ((base & 3) == 0) {
113 if (i < numnodes) 117 if (i < numnodes)
114 printk("Skipping disabled node %d\n", i); 118 pr_info("Skipping disabled node %d\n", i);
115 continue; 119 continue;
116 } 120 }
117 if (nodeid >= numnodes) { 121 if (nodeid >= numnodes) {
118 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 122 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
119 base, limit); 123 base, limit);
120 continue; 124 continue;
121 } 125 }
122 126
123 if (!limit) { 127 if (!limit) {
124 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", 128 pr_info("Skipping node entry %d (base %lx)\n",
125 i, base); 129 i, base);
126 continue; 130 continue;
127 } 131 }
128 if ((base >> 8) & 3 || (limit >> 8) & 3) { 132 if ((base >> 8) & 3 || (limit >> 8) & 3) {
129 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 133 pr_err("Node %d using interleaving mode %lx/%lx\n",
130 nodeid, (base>>8)&3, (limit>>8) & 3); 134 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
131 return -1; 135 return -1;
132 } 136 }
133 if (node_isset(nodeid, node_possible_map)) { 137 if (node_isset(nodeid, nodes_parsed)) {
134 printk(KERN_INFO "Node %d already present. Skipping\n", 138 pr_info("Node %d already present, skipping\n",
135 nodeid); 139 nodeid);
136 continue; 140 continue;
137 } 141 }
138 142
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
141 limit |= (1<<24)-1; 145 limit |= (1<<24)-1;
142 limit++; 146 limit++;
143 147
144 if (limit > max_pfn << PAGE_SHIFT) 148 if (limit > end)
145 limit = max_pfn << PAGE_SHIFT; 149 limit = end;
146 if (limit <= base) 150 if (limit <= base)
147 continue; 151 continue;
148 152
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
154 if (limit > end) 158 if (limit > end)
155 limit = end; 159 limit = end;
156 if (limit == base) { 160 if (limit == base) {
157 printk(KERN_ERR "Empty node %d\n", nodeid); 161 pr_err("Empty node %d\n", nodeid);
158 continue; 162 continue;
159 } 163 }
160 if (limit < base) { 164 if (limit < base) {
161 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 165 pr_err("Node %d bogus settings %lx-%lx.\n",
162 nodeid, base, limit); 166 nodeid, base, limit);
163 continue; 167 continue;
164 } 168 }
165 169
166 /* Could sort here, but pun for now. Should not happen anyroads. */ 170 /* Could sort here, but pun for now. Should not happen anyroads. */
167 if (prevbase > base) { 171 if (prevbase > base) {
168 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 172 pr_err("Node map not sorted %lx,%lx\n",
169 prevbase, base); 173 prevbase, base);
170 return -1; 174 return -1;
171 } 175 }
172 176
173 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 177 pr_info("Node %d MemBase %016lx Limit %016lx\n",
174 nodeid, base, limit); 178 nodeid, base, limit);
175 179
176 found++; 180 found++;
177 181
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
180 184
181 prevbase = base; 185 prevbase = base;
182 186
183 node_set(nodeid, node_possible_map); 187 node_set(nodeid, nodes_parsed);
184 } 188 }
185 189
186 if (!found) 190 if (!found)
187 return -1; 191 return -1;
192 return 0;
193}
194
195int __init k8_scan_nodes(void)
196{
197 unsigned int bits;
198 unsigned int cores;
199 unsigned int apicid_base;
200 int i;
188 201
202 BUG_ON(nodes_empty(nodes_parsed));
203 node_possible_map = nodes_parsed;
189 memnode_shift = compute_hash_shift(nodes, 8, NULL); 204 memnode_shift = compute_hash_shift(nodes, 8, NULL);
190 if (memnode_shift < 0) { 205 if (memnode_shift < 0) {
191 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 206 pr_err("No NUMA node hash function found. Contact maintainer\n");
192 return -1; 207 return -1;
193 } 208 }
194 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 209 pr_info("Using node hash shift of %d\n", memnode_shift);
195 210
196 /* use the coreid bits from early_identify_cpu */ 211 /* use the coreid bits from early_identify_cpu */
197 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
200 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* need to get boot_cpu_id early for system with apicid lifting */
201 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
202 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
203 printk(KERN_INFO "BSP APIC ID: %02x\n", 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
204 boot_cpu_physical_apicid);
205 apicid_base = boot_cpu_physical_apicid; 219 apicid_base = boot_cpu_physical_apicid;
206 } 220 }
207 221
208 for (i = 0; i < 8; i++) { 222 for_each_node_mask(i, node_possible_map) {
209 if (nodes[i].start == nodes[i].end) 223 int j;
210 continue;
211 224
212 e820_register_active_regions(i, 225 e820_register_active_regions(i,
213 nodes[i].start >> PAGE_SHIFT, 226 nodes[i].start >> PAGE_SHIFT,
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 4901d0dafda6..af3b6c8a436f 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -106,26 +106,25 @@ void kmemcheck_error_recall(void)
106 106
107 switch (e->type) { 107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS: 108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " 109 printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ? 110 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)", 111 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address); 112 (void *) e->address);
114 113
115 printk(KERN_INFO); 114 printk(KERN_WARNING);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i) 115 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]); 116 printk(KERN_CONT "%02x", e->memory_copy[i]);
118 printk("\n"); 117 printk(KERN_CONT "\n");
119 118
120 printk(KERN_INFO); 119 printk(KERN_WARNING);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) { 120 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) 121 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]); 122 printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
124 else 123 else
125 printk(" ?"); 124 printk(KERN_CONT " ?");
126 } 125 }
127 printk("\n"); 126 printk(KERN_CONT "\n");
128 printk(KERN_INFO "%*c\n", 2 + 2 127 printk(KERN_WARNING "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); 128 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break; 129 break;
131 case KMEMCHECK_ERROR_BUG: 130 case KMEMCHECK_ERROR_BUG:
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..c0f6198565eb 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -5,6 +5,8 @@
5 * 2008 Pekka Paalanen <pq@iki.fi> 5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/list.h> 10#include <linux/list.h>
9#include <linux/rculist.h> 11#include <linux/rculist.h>
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
@@ -136,7 +138,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
136 pte_t *pte = lookup_address(f->page, &level); 138 pte_t *pte = lookup_address(f->page, &level);
137 139
138 if (!pte) { 140 if (!pte) {
139 pr_err("kmmio: no pte for page 0x%08lx\n", f->page); 141 pr_err("no pte for page 0x%08lx\n", f->page);
140 return -1; 142 return -1;
141 } 143 }
142 144
@@ -148,7 +150,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
148 clear_pte_presence(pte, clear, &f->old_presence); 150 clear_pte_presence(pte, clear, &f->old_presence);
149 break; 151 break;
150 default: 152 default:
151 pr_err("kmmio: unexpected page level 0x%x.\n", level); 153 pr_err("unexpected page level 0x%x.\n", level);
152 return -1; 154 return -1;
153 } 155 }
154 156
@@ -170,13 +172,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
170static int arm_kmmio_fault_page(struct kmmio_fault_page *f) 172static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
171{ 173{
172 int ret; 174 int ret;
173 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); 175 WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
174 if (f->armed) { 176 if (f->armed) {
175 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", 177 pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
176 f->page, f->count, !!f->old_presence); 178 f->page, f->count, !!f->old_presence);
177 } 179 }
178 ret = clear_page_presence(f, true); 180 ret = clear_page_presence(f, true);
179 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); 181 WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
182 f->page);
180 f->armed = true; 183 f->armed = true;
181 return ret; 184 return ret;
182} 185}
@@ -203,7 +206,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
203 */ 206 */
204/* 207/*
205 * Interrupts are disabled on entry as trap3 is an interrupt gate 208 * Interrupts are disabled on entry as trap3 is an interrupt gate
206 * and they remain disabled thorough out this function. 209 * and they remain disabled throughout this function.
207 */ 210 */
208int kmmio_handler(struct pt_regs *regs, unsigned long addr) 211int kmmio_handler(struct pt_regs *regs, unsigned long addr)
209{ 212{
@@ -240,24 +243,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
240 * condition needs handling by do_page_fault(), the 243 * condition needs handling by do_page_fault(), the
241 * page really not being present is the most common. 244 * page really not being present is the most common.
242 */ 245 */
243 pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", 246 pr_debug("secondary hit for 0x%08lx CPU %d.\n",
244 addr, smp_processor_id()); 247 addr, smp_processor_id());
245 248
246 if (!faultpage->old_presence) 249 if (!faultpage->old_presence)
247 pr_info("kmmio: unexpected secondary hit for " 250 pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
248 "address 0x%08lx on CPU %d.\n", addr, 251 addr, smp_processor_id());
249 smp_processor_id());
250 } else { 252 } else {
251 /* 253 /*
252 * Prevent overwriting already in-flight context. 254 * Prevent overwriting already in-flight context.
253 * This should not happen, let's hope disarming at 255 * This should not happen, let's hope disarming at
254 * least prevents a panic. 256 * least prevents a panic.
255 */ 257 */
256 pr_emerg("kmmio: recursive probe hit on CPU %d, " 258 pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
257 "for address 0x%08lx. Ignoring.\n", 259 smp_processor_id(), addr);
258 smp_processor_id(), addr); 260 pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
259 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
260 ctx->addr);
261 disarm_kmmio_fault_page(faultpage); 261 disarm_kmmio_fault_page(faultpage);
262 } 262 }
263 goto no_kmmio_ctx; 263 goto no_kmmio_ctx;
@@ -302,7 +302,7 @@ no_kmmio:
302 302
303/* 303/*
304 * Interrupts are disabled on entry as trap1 is an interrupt gate 304 * Interrupts are disabled on entry as trap1 is an interrupt gate
305 * and they remain disabled thorough out this function. 305 * and they remain disabled throughout this function.
306 * This must always get called as the pair to kmmio_handler(). 306 * This must always get called as the pair to kmmio_handler().
307 */ 307 */
308static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) 308static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
@@ -316,8 +316,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
316 * something external causing them (f.e. using a debugger while 316 * something external causing them (f.e. using a debugger while
317 * mmio tracing enabled), or erroneous behaviour 317 * mmio tracing enabled), or erroneous behaviour
318 */ 318 */
319 pr_warning("kmmio: unexpected debug trap on CPU %d.\n", 319 pr_warning("unexpected debug trap on CPU %d.\n",
320 smp_processor_id()); 320 smp_processor_id());
321 goto out; 321 goto out;
322 } 322 }
323 323
@@ -425,7 +425,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
425 list_add_rcu(&p->list, &kmmio_probes); 425 list_add_rcu(&p->list, &kmmio_probes);
426 while (size < size_lim) { 426 while (size < size_lim) {
427 if (add_kmmio_fault_page(p->addr + size)) 427 if (add_kmmio_fault_page(p->addr + size))
428 pr_err("kmmio: Unable to set page fault.\n"); 428 pr_err("Unable to set page fault.\n");
429 size += PAGE_SIZE; 429 size += PAGE_SIZE;
430 } 430 }
431out: 431out:
@@ -490,7 +490,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
490 * 2. remove_kmmio_fault_pages() 490 * 2. remove_kmmio_fault_pages()
491 * Remove the pages from kmmio_page_table. 491 * Remove the pages from kmmio_page_table.
492 * 3. rcu_free_kmmio_fault_pages() 492 * 3. rcu_free_kmmio_fault_pages()
493 * Actally free the kmmio_fault_page structs as with RCU. 493 * Actually free the kmmio_fault_page structs as with RCU.
494 */ 494 */
495void unregister_kmmio_probe(struct kmmio_probe *p) 495void unregister_kmmio_probe(struct kmmio_probe *p)
496{ 496{
@@ -511,7 +511,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
511 511
512 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 512 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
513 if (!drelease) { 513 if (!drelease) {
514 pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); 514 pr_crit("leaking kmmio_fault_page objects.\n");
515 return; 515 return;
516 } 516 }
517 drelease->release_list = release_list; 517 drelease->release_list = release_list;
@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
540 struct die_args *arg = args; 540 struct die_args *arg = args;
541 541
542 if (val == DIE_DEBUG && (arg->err & DR_STEP)) 542 if (val == DIE_DEBUG && (arg->err & DR_STEP))
543 if (post_kmmio_handler(arg->err, arg->regs) == 1) 543 if (post_kmmio_handler(arg->err, arg->regs) == 1) {
544 /*
545 * Reset the BS bit in dr6 (pointed by args->err) to
546 * denote completion of processing
547 */
548 (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
544 return NOTIFY_STOP; 549 return NOTIFY_STOP;
550 }
545 551
546 return NOTIFY_DONE; 552 return NOTIFY_DONE;
547} 553}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 132772a8ec57..34a3291ca103 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -19,6 +19,9 @@
19 * 19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi. 20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */ 21 */
22
23#define pr_fmt(fmt) "mmiotrace: " fmt
24
22#define DEBUG 1 25#define DEBUG 1
23 26
24#include <linux/module.h> 27#include <linux/module.h>
@@ -36,8 +39,6 @@
36 39
37#include "pf_in.h" 40#include "pf_in.h"
38 41
39#define NAME "mmiotrace: "
40
41struct trap_reason { 42struct trap_reason {
42 unsigned long addr; 43 unsigned long addr;
43 unsigned long ip; 44 unsigned long ip;
@@ -96,17 +97,18 @@ static void print_pte(unsigned long address)
96 pte_t *pte = lookup_address(address, &level); 97 pte_t *pte = lookup_address(address, &level);
97 98
98 if (!pte) { 99 if (!pte) {
99 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", 100 pr_err("Error in %s: no pte for page 0x%08lx\n",
100 __func__, address); 101 __func__, address);
101 return; 102 return;
102 } 103 }
103 104
104 if (level == PG_LEVEL_2M) { 105 if (level == PG_LEVEL_2M) {
105 pr_emerg(NAME "4MB pages are not currently supported: " 106 pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
106 "0x%08lx\n", address); 107 address);
107 BUG(); 108 BUG();
108 } 109 }
109 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, 110 pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
111 address,
110 (unsigned long long)pte_val(*pte), 112 (unsigned long long)pte_val(*pte),
111 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); 113 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
112} 114}
@@ -118,22 +120,21 @@ static void print_pte(unsigned long address)
118static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) 120static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
119{ 121{
120 const struct trap_reason *my_reason = &get_cpu_var(pf_reason); 122 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
121 pr_emerg(NAME "unexpected fault for address: 0x%08lx, " 123 pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
122 "last fault for address: 0x%08lx\n", 124 addr, my_reason->addr);
123 addr, my_reason->addr);
124 print_pte(addr); 125 print_pte(addr);
125 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); 126 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
126 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); 127 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
127#ifdef __i386__ 128#ifdef __i386__
128 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", 129 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
129 regs->ax, regs->bx, regs->cx, regs->dx); 130 regs->ax, regs->bx, regs->cx, regs->dx);
130 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 131 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
131 regs->si, regs->di, regs->bp, regs->sp); 132 regs->si, regs->di, regs->bp, regs->sp);
132#else 133#else
133 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", 134 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
134 regs->ax, regs->cx, regs->dx); 135 regs->ax, regs->cx, regs->dx);
135 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", 136 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
136 regs->si, regs->di, regs->bp, regs->sp); 137 regs->si, regs->di, regs->bp, regs->sp);
137#endif 138#endif
138 put_cpu_var(pf_reason); 139 put_cpu_var(pf_reason);
139 BUG(); 140 BUG();
@@ -213,7 +214,7 @@ static void post(struct kmmio_probe *p, unsigned long condition,
213 /* this should always return the active_trace count to 0 */ 214 /* this should always return the active_trace count to 0 */
214 my_reason->active_traces--; 215 my_reason->active_traces--;
215 if (my_reason->active_traces) { 216 if (my_reason->active_traces) {
216 pr_emerg(NAME "unexpected post handler"); 217 pr_emerg("unexpected post handler");
217 BUG(); 218 BUG();
218 } 219 }
219 220
@@ -244,7 +245,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
244 }; 245 };
245 246
246 if (!trace) { 247 if (!trace) {
247 pr_err(NAME "kmalloc failed in ioremap\n"); 248 pr_err("kmalloc failed in ioremap\n");
248 return; 249 return;
249 } 250 }
250 251
@@ -282,8 +283,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
282 if (!is_enabled()) /* recheck and proper locking in *_core() */ 283 if (!is_enabled()) /* recheck and proper locking in *_core() */
283 return; 284 return;
284 285
285 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", 286 pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
286 (unsigned long long)offset, size, addr); 287 (unsigned long long)offset, size, addr);
287 if ((filter_offset) && (offset != filter_offset)) 288 if ((filter_offset) && (offset != filter_offset))
288 return; 289 return;
289 ioremap_trace_core(offset, size, addr); 290 ioremap_trace_core(offset, size, addr);
@@ -301,7 +302,7 @@ static void iounmap_trace_core(volatile void __iomem *addr)
301 struct remap_trace *tmp; 302 struct remap_trace *tmp;
302 struct remap_trace *found_trace = NULL; 303 struct remap_trace *found_trace = NULL;
303 304
304 pr_debug(NAME "Unmapping %p.\n", addr); 305 pr_debug("Unmapping %p.\n", addr);
305 306
306 spin_lock_irq(&trace_lock); 307 spin_lock_irq(&trace_lock);
307 if (!is_enabled()) 308 if (!is_enabled())
@@ -363,9 +364,8 @@ static void clear_trace_list(void)
363 * Caller also ensures is_enabled() cannot change. 364 * Caller also ensures is_enabled() cannot change.
364 */ 365 */
365 list_for_each_entry(trace, &trace_list, list) { 366 list_for_each_entry(trace, &trace_list, list) {
366 pr_notice(NAME "purging non-iounmapped " 367 pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
367 "trace @0x%08lx, size 0x%lx.\n", 368 trace->probe.addr, trace->probe.len);
368 trace->probe.addr, trace->probe.len);
369 if (!nommiotrace) 369 if (!nommiotrace)
370 unregister_kmmio_probe(&trace->probe); 370 unregister_kmmio_probe(&trace->probe);
371 } 371 }
@@ -387,7 +387,7 @@ static void enter_uniprocessor(void)
387 387
388 if (downed_cpus == NULL && 388 if (downed_cpus == NULL &&
389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { 389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
390 pr_notice(NAME "Failed to allocate mask\n"); 390 pr_notice("Failed to allocate mask\n");
391 goto out; 391 goto out;
392 } 392 }
393 393
@@ -395,20 +395,19 @@ static void enter_uniprocessor(void)
395 cpumask_copy(downed_cpus, cpu_online_mask); 395 cpumask_copy(downed_cpus, cpu_online_mask);
396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); 396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
397 if (num_online_cpus() > 1) 397 if (num_online_cpus() > 1)
398 pr_notice(NAME "Disabling non-boot CPUs...\n"); 398 pr_notice("Disabling non-boot CPUs...\n");
399 put_online_cpus(); 399 put_online_cpus();
400 400
401 for_each_cpu(cpu, downed_cpus) { 401 for_each_cpu(cpu, downed_cpus) {
402 err = cpu_down(cpu); 402 err = cpu_down(cpu);
403 if (!err) 403 if (!err)
404 pr_info(NAME "CPU%d is down.\n", cpu); 404 pr_info("CPU%d is down.\n", cpu);
405 else 405 else
406 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); 406 pr_err("Error taking CPU%d down: %d\n", cpu, err);
407 } 407 }
408out: 408out:
409 if (num_online_cpus() > 1) 409 if (num_online_cpus() > 1)
410 pr_warning(NAME "multiple CPUs still online, " 410 pr_warning("multiple CPUs still online, may miss events.\n");
411 "may miss events.\n");
412} 411}
413 412
414/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, 413/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
@@ -420,13 +419,13 @@ static void __ref leave_uniprocessor(void)
420 419
421 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) 420 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
422 return; 421 return;
423 pr_notice(NAME "Re-enabling CPUs...\n"); 422 pr_notice("Re-enabling CPUs...\n");
424 for_each_cpu(cpu, downed_cpus) { 423 for_each_cpu(cpu, downed_cpus) {
425 err = cpu_up(cpu); 424 err = cpu_up(cpu);
426 if (!err) 425 if (!err)
427 pr_info(NAME "enabled CPU%d.\n", cpu); 426 pr_info("enabled CPU%d.\n", cpu);
428 else 427 else
429 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); 428 pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
430 } 429 }
431} 430}
432 431
@@ -434,8 +433,8 @@ static void __ref leave_uniprocessor(void)
434static void enter_uniprocessor(void) 433static void enter_uniprocessor(void)
435{ 434{
436 if (num_online_cpus() > 1) 435 if (num_online_cpus() > 1)
437 pr_warning(NAME "multiple CPUs are online, may miss events. " 436 pr_warning("multiple CPUs are online, may miss events. "
438 "Suggest booting with maxcpus=1 kernel argument.\n"); 437 "Suggest booting with maxcpus=1 kernel argument.\n");
439} 438}
440 439
441static void leave_uniprocessor(void) 440static void leave_uniprocessor(void)
@@ -450,13 +449,13 @@ void enable_mmiotrace(void)
450 goto out; 449 goto out;
451 450
452 if (nommiotrace) 451 if (nommiotrace)
453 pr_info(NAME "MMIO tracing disabled.\n"); 452 pr_info("MMIO tracing disabled.\n");
454 kmmio_init(); 453 kmmio_init();
455 enter_uniprocessor(); 454 enter_uniprocessor();
456 spin_lock_irq(&trace_lock); 455 spin_lock_irq(&trace_lock);
457 atomic_inc(&mmiotrace_enabled); 456 atomic_inc(&mmiotrace_enabled);
458 spin_unlock_irq(&trace_lock); 457 spin_unlock_irq(&trace_lock);
459 pr_info(NAME "enabled.\n"); 458 pr_info("enabled.\n");
460out: 459out:
461 mutex_unlock(&mmiotrace_mutex); 460 mutex_unlock(&mmiotrace_mutex);
462} 461}
@@ -475,7 +474,7 @@ void disable_mmiotrace(void)
475 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 474 clear_trace_list(); /* guarantees: no more kmmio callbacks */
476 leave_uniprocessor(); 475 leave_uniprocessor();
477 kmmio_cleanup(); 476 kmmio_cleanup();
478 pr_info(NAME "disabled.\n"); 477 pr_info("disabled.\n");
479out: 478out:
480 mutex_unlock(&mmiotrace_mutex); 479 mutex_unlock(&mmiotrace_mutex);
481} 480}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..b20760ca7244 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
347 (ulong) node_remap_end_vaddr[nid]); 347 (ulong) node_remap_end_vaddr[nid]);
348} 348}
349 349
350void __init initmem_init(unsigned long start_pfn, 350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 unsigned long end_pfn) 351 int acpi, int k8)
352{ 352{
353 int nid; 353 int nid;
354 long kva_target_pfn; 354 long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..83bbc70d11bb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 239 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 241 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) 242 if (nodedata_phys < start || nodedata_phys >= end) {
243 free_bootmem(nodedata_phys, pgdat_size); 243 /*
244 * only need to free it if it is from other node
245 * bootmem
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
244 node_data[nodeid] = NULL; 250 node_data[nodeid] = NULL;
245 return; 251 return;
246 } 252 }
@@ -306,8 +312,71 @@ void __init numa_init_array(void)
306 312
307#ifdef CONFIG_NUMA_EMU 313#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */ 314/* Numa emulation */
315static struct bootnode nodes[MAX_NUMNODES] __initdata;
316static struct bootnode physnodes[MAX_NUMNODES] __initdata;
309static char *cmdline __initdata; 317static char *cmdline __initdata;
310 318
319static int __init setup_physnodes(unsigned long start, unsigned long end,
320 int acpi, int k8)
321{
322 int nr_nodes = 0;
323 int ret = 0;
324 int i;
325
326#ifdef CONFIG_ACPI_NUMA
327 if (acpi)
328 nr_nodes = acpi_get_nodes(physnodes);
329#endif
330#ifdef CONFIG_K8_NUMA
331 if (k8)
332 nr_nodes = k8_get_nodes(physnodes);
333#endif
334 /*
335 * Basic sanity checking on the physical node map: there may be errors
336 * if the SRAT or K8 incorrectly reported the topology or the mem=
337 * kernel parameter is used.
338 */
339 for (i = 0; i < nr_nodes; i++) {
340 if (physnodes[i].start == physnodes[i].end)
341 continue;
342 if (physnodes[i].start > end) {
343 physnodes[i].end = physnodes[i].start;
344 continue;
345 }
346 if (physnodes[i].end < start) {
347 physnodes[i].start = physnodes[i].end;
348 continue;
349 }
350 if (physnodes[i].start < start)
351 physnodes[i].start = start;
352 if (physnodes[i].end > end)
353 physnodes[i].end = end;
354 }
355
356 /*
357 * Remove all nodes that have no memory or were truncated because of the
358 * limited address range.
359 */
360 for (i = 0; i < nr_nodes; i++) {
361 if (physnodes[i].start == physnodes[i].end)
362 continue;
363 physnodes[ret].start = physnodes[i].start;
364 physnodes[ret].end = physnodes[i].end;
365 ret++;
366 }
367
368 /*
369 * If no physical topology was detected, a single node is faked to cover
370 * the entire address space.
371 */
372 if (!ret) {
373 physnodes[ret].start = start;
374 physnodes[ret].end = end;
375 ret = 1;
376 }
377 return ret;
378}
379
311/* 380/*
312 * Setups up nid to range from addr to addr + size. If the end 381 * Setups up nid to range from addr to addr + size. If the end
313 * boundary is greater than max_addr, then max_addr is used instead. 382 * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +384,9 @@ static char *cmdline __initdata;
315 * allocation past addr and -1 otherwise. addr is adjusted to be at 384 * allocation past addr and -1 otherwise. addr is adjusted to be at
316 * the end of the node. 385 * the end of the node.
317 */ 386 */
318static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 387static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
319 u64 size, u64 max_addr)
320{ 388{
321 int ret = 0; 389 int ret = 0;
322
323 nodes[nid].start = *addr; 390 nodes[nid].start = *addr;
324 *addr += size; 391 *addr += size;
325 if (*addr >= max_addr) { 392 if (*addr >= max_addr) {
@@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
335} 402}
336 403
337/* 404/*
405 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
406 * to max_addr. The return value is the number of nodes allocated.
407 */
408static int __init split_nodes_interleave(u64 addr, u64 max_addr,
409 int nr_phys_nodes, int nr_nodes)
410{
411 nodemask_t physnode_mask = NODE_MASK_NONE;
412 u64 size;
413 int big;
414 int ret = 0;
415 int i;
416
417 if (nr_nodes <= 0)
418 return -1;
419 if (nr_nodes > MAX_NUMNODES) {
420 pr_info("numa=fake=%d too large, reducing to %d\n",
421 nr_nodes, MAX_NUMNODES);
422 nr_nodes = MAX_NUMNODES;
423 }
424
425 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
426 /*
427 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder.
429 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
431 FAKE_NODE_MIN_SIZE;
432
433 size &= FAKE_NODE_MIN_HASH_MASK;
434 if (!size) {
435 pr_err("Not enough memory for each node. "
436 "NUMA emulation disabled.\n");
437 return -1;
438 }
439
440 for (i = 0; i < nr_phys_nodes; i++)
441 if (physnodes[i].start != physnodes[i].end)
442 node_set(i, physnode_mask);
443
444 /*
445 * Continue to fill physical nodes with fake nodes until there is no
446 * memory left on any of them.
447 */
448 while (nodes_weight(physnode_mask)) {
449 for_each_node_mask(i, physnode_mask) {
450 u64 end = physnodes[i].start + size;
451 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
452
453 if (ret < big)
454 end += FAKE_NODE_MIN_SIZE;
455
456 /*
457 * Continue to add memory to this fake node if its
458 * non-reserved memory is less than the per-node size.
459 */
460 while (end - physnodes[i].start -
461 e820_hole_size(physnodes[i].start, end) < size) {
462 end += FAKE_NODE_MIN_SIZE;
463 if (end > physnodes[i].end) {
464 end = physnodes[i].end;
465 break;
466 }
467 }
468
469 /*
470 * If there won't be at least FAKE_NODE_MIN_SIZE of
471 * non-reserved memory in ZONE_DMA32 for the next node,
472 * this one must extend to the boundary.
473 */
474 if (end < dma32_end && dma32_end - end -
475 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
476 end = dma32_end;
477
478 /*
479 * If there won't be enough non-reserved memory for the
480 * next node, this one must extend to the end of the
481 * physical node.
482 */
483 if (physnodes[i].end - end -
484 e820_hole_size(end, physnodes[i].end) < size)
485 end = physnodes[i].end;
486
487 /*
488 * Avoid allocating more nodes than requested, which can
489 * happen as a result of rounding down each node's size
490 * to FAKE_NODE_MIN_SIZE.
491 */
492 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
493 end = physnodes[i].end;
494
495 if (setup_node_range(ret++, &physnodes[i].start,
496 end - physnodes[i].start,
497 physnodes[i].end) < 0)
498 node_clear(i, physnode_mask);
499 }
500 }
501 return ret;
502}
503
504/*
338 * Splits num_nodes nodes up equally starting at node_start. The return value 505 * Splits num_nodes nodes up equally starting at node_start. The return value
339 * is the number of nodes split up and addr is adjusted to be at the end of the 506 * is the number of nodes split up and addr is adjusted to be at the end of the
340 * last node allocated. 507 * last node allocated.
341 */ 508 */
342static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, 509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
343 u64 max_addr, int node_start,
344 int num_nodes) 510 int num_nodes)
345{ 511{
346 unsigned int big; 512 unsigned int big;
@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
388 break; 554 break;
389 } 555 }
390 } 556 }
391 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) 557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
392 break; 558 break;
393 } 559 }
394 return i - node_start + 1; 560 return i - node_start + 1;
@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
399 * always assigned to a final node and can be asymmetric. Returns the number of 565 * always assigned to a final node and can be asymmetric. Returns the number of
400 * nodes split. 566 * nodes split.
401 */ 567 */
402static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, 568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
403 u64 max_addr, int node_start, u64 size) 569 u64 size)
404{ 570{
405 int i = node_start; 571 int i = node_start;
406 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
407 while (!setup_node_range(i++, nodes, addr, size, max_addr)) 573 while (!setup_node_range(i++, addr, size, max_addr))
408 ; 574 ;
409 return i - node_start; 575 return i - node_start;
410} 576}
@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413 * Sets up the system RAM area from start_pfn to last_pfn according to the 579 * Sets up the system RAM area from start_pfn to last_pfn according to the
414 * numa=fake command-line option. 580 * numa=fake command-line option.
415 */ 581 */
416static struct bootnode nodes[MAX_NUMNODES] __initdata; 582static int __init numa_emulation(unsigned long start_pfn,
417 583 unsigned long last_pfn, int acpi, int k8)
418static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
419{ 584{
420 u64 size, addr = start_pfn << PAGE_SHIFT; 585 u64 size, addr = start_pfn << PAGE_SHIFT;
421 u64 max_addr = last_pfn << PAGE_SHIFT; 586 u64 max_addr = last_pfn << PAGE_SHIFT;
422 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes;
423 589
424 memset(&nodes, 0, sizeof(nodes)); 590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
425 /* 591 /*
426 * If the numa=fake command-line is just a single number N, split the 592 * If the numa=fake command-line is just a single number N, split the
427 * system RAM into N fake nodes. 593 * system RAM into N fake nodes.
@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
429 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
430 long n = simple_strtol(cmdline, NULL, 0); 596 long n = simple_strtol(cmdline, NULL, 0);
431 597
432 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); 598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
433 if (num_nodes < 0) 600 if (num_nodes < 0)
434 return num_nodes; 601 return num_nodes;
435 goto out; 602 goto out;
@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
456 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; 623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
457 if (size) 624 if (size)
458 for (i = 0; i < coeff; i++, num_nodes++) 625 for (i = 0; i < coeff; i++, num_nodes++)
459 if (setup_node_range(num_nodes, nodes, 626 if (setup_node_range(num_nodes, &addr,
460 &addr, size, max_addr) < 0) 627 size, max_addr) < 0)
461 goto done; 628 goto done;
462 if (!*cmdline) 629 if (!*cmdline)
463 break; 630 break;
@@ -473,7 +640,7 @@ done:
473 if (addr < max_addr) { 640 if (addr < max_addr) {
474 if (coeff_flag && coeff < 0) { 641 if (coeff_flag && coeff < 0) {
475 /* Split remaining nodes into num-sized chunks */ 642 /* Split remaining nodes into num-sized chunks */
476 num_nodes += split_nodes_by_size(nodes, &addr, max_addr, 643 num_nodes += split_nodes_by_size(&addr, max_addr,
477 num_nodes, num); 644 num_nodes, num);
478 goto out; 645 goto out;
479 } 646 }
@@ -482,7 +649,7 @@ done:
482 /* Split remaining nodes into coeff chunks */ 649 /* Split remaining nodes into coeff chunks */
483 if (coeff <= 0) 650 if (coeff <= 0)
484 break; 651 break;
485 num_nodes += split_nodes_equally(nodes, &addr, max_addr, 652 num_nodes += split_nodes_equally(&addr, max_addr,
486 num_nodes, coeff); 653 num_nodes, coeff);
487 break; 654 break;
488 case ',': 655 case ',':
@@ -490,8 +657,8 @@ done:
490 break; 657 break;
491 default: 658 default:
492 /* Give one final node */ 659 /* Give one final node */
493 setup_node_range(num_nodes, nodes, &addr, 660 setup_node_range(num_nodes, &addr, max_addr - addr,
494 max_addr - addr, max_addr); 661 max_addr);
495 num_nodes++; 662 num_nodes++;
496 } 663 }
497 } 664 }
@@ -505,14 +672,10 @@ out:
505 } 672 }
506 673
507 /* 674 /*
508 * We need to vacate all active ranges that may have been registered by 675 * We need to vacate all active ranges that may have been registered for
509 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns 676 * the e820 memory map.
510 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
511 */ 677 */
512 remove_all_active_ranges(); 678 remove_all_active_ranges();
513#ifdef CONFIG_ACPI_NUMA
514 acpi_numa = -1;
515#endif
516 for_each_node_mask(i, node_possible_map) { 679 for_each_node_mask(i, node_possible_map) {
517 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 680 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
518 nodes[i].end >> PAGE_SHIFT); 681 nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +687,8 @@ out:
524} 687}
525#endif /* CONFIG_NUMA_EMU */ 688#endif /* CONFIG_NUMA_EMU */
526 689
527void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) 690void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
691 int acpi, int k8)
528{ 692{
529 int i; 693 int i;
530 694
@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
532 nodes_clear(node_online_map); 696 nodes_clear(node_online_map);
533 697
534#ifdef CONFIG_NUMA_EMU 698#ifdef CONFIG_NUMA_EMU
535 if (cmdline && !numa_emulation(start_pfn, last_pfn)) 699 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
536 return; 700 return;
537 nodes_clear(node_possible_map); 701 nodes_clear(node_possible_map);
538 nodes_clear(node_online_map); 702 nodes_clear(node_online_map);
539#endif 703#endif
540 704
541#ifdef CONFIG_ACPI_NUMA 705#ifdef CONFIG_ACPI_NUMA
542 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 706 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
543 last_pfn << PAGE_SHIFT)) 707 last_pfn << PAGE_SHIFT))
544 return; 708 return;
545 nodes_clear(node_possible_map); 709 nodes_clear(node_possible_map);
546 nodes_clear(node_online_map); 710 nodes_clear(node_online_map);
547#endif 711#endif
548 712
549#ifdef CONFIG_K8_NUMA 713#ifdef CONFIG_K8_NUMA
550 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 714 if (!numa_off && k8 && !k8_scan_nodes())
551 last_pfn<<PAGE_SHIFT))
552 return; 715 return;
553 nodes_clear(node_possible_map); 716 nodes_clear(node_possible_map);
554 nodes_clear(node_online_map); 717 nodes_clear(node_online_map);
@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
601early_param("numa", numa_setup); 764early_param("numa", numa_setup);
602 765
603#ifdef CONFIG_NUMA 766#ifdef CONFIG_NUMA
767
768static __init int find_near_online_node(int node)
769{
770 int n, val;
771 int min_val = INT_MAX;
772 int best_node = -1;
773
774 for_each_online_node(n) {
775 val = node_distance(node, n);
776
777 if (val < min_val) {
778 min_val = val;
779 best_node = n;
780 }
781 }
782
783 return best_node;
784}
785
604/* 786/*
605 * Setup early cpu_to_node. 787 * Setup early cpu_to_node.
606 * 788 *
@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
632 if (node == NUMA_NO_NODE) 814 if (node == NUMA_NO_NODE)
633 continue; 815 continue;
634 if (!node_online(node)) 816 if (!node_online(node))
635 continue; 817 node = find_near_online_node(node);
636 numa_set_node(cpu, node); 818 numa_set_node(cpu, node);
637 } 819 }
638} 820}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..1d4eb93d333c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 280 pgprot_val(forbidden) |= _PAGE_RW;
281 281
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /*
284 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
285 * kernel text mappings for the large page aligned text, rodata sections
286 * will be always read-only. For the kernel identity mappings covering
287 * the holes caused by this alignment can be anything that user asks.
288 *
289 * This will preserve the large page mappings for kernel text/data
290 * at no extra cost.
291 */
292 if (kernel_set_to_readonly &&
293 within(address, (unsigned long)_text,
294 (unsigned long)__end_rodata_hpage_align))
295 pgprot_val(forbidden) |= _PAGE_RW;
296#endif
297
282 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 298 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
283 299
284 return prot; 300 return prot;
@@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
1069 1085
1070int set_memory_x(unsigned long addr, int numpages) 1086int set_memory_x(unsigned long addr, int numpages)
1071{ 1087{
1088 if (!(__supported_pte_mask & _PAGE_NX))
1089 return 0;
1090
1072 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1091 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1073} 1092}
1074EXPORT_SYMBOL(set_memory_x); 1093EXPORT_SYMBOL(set_memory_x);
1075 1094
1076int set_memory_nx(unsigned long addr, int numpages) 1095int set_memory_nx(unsigned long addr, int numpages)
1077{ 1096{
1097 if (!(__supported_pte_mask & _PAGE_NX))
1098 return 0;
1099
1078 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1100 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1079} 1101}
1080EXPORT_SYMBOL(set_memory_nx); 1102EXPORT_SYMBOL(set_memory_nx);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..ae9648eb1c7f 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -20,6 +20,7 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/processor.h> 21#include <asm/processor.h>
22#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
23#include <asm/x86_init.h>
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/fcntl.h> 25#include <asm/fcntl.h>
25#include <asm/e820.h> 26#include <asm/e820.h>
@@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
355 * - _PAGE_CACHE_UC_MINUS 356 * - _PAGE_CACHE_UC_MINUS
356 * - _PAGE_CACHE_UC 357 * - _PAGE_CACHE_UC
357 * 358 *
358 * req_type will have a special case value '-1', when requester want to inherit
359 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
360 *
361 * If new_type is NULL, function will return an error if it cannot reserve the 359 * If new_type is NULL, function will return an error if it cannot reserve the
362 * region with req_type. If new_type is non-NULL, function will return 360 * region with req_type. If new_type is non-NULL, function will return
363 * available type in new_type in case of no error. In case of any error 361 * available type in new_type in case of no error. In case of any error
@@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
377 if (!pat_enabled) { 375 if (!pat_enabled) {
378 /* This is identical to page table setting without PAT */ 376 /* This is identical to page table setting without PAT */
379 if (new_type) { 377 if (new_type) {
380 if (req_type == -1) 378 if (req_type == _PAGE_CACHE_WC)
381 *new_type = _PAGE_CACHE_WB;
382 else if (req_type == _PAGE_CACHE_WC)
383 *new_type = _PAGE_CACHE_UC_MINUS; 379 *new_type = _PAGE_CACHE_UC_MINUS;
384 else 380 else
385 *new_type = req_type & _PAGE_CACHE_MASK; 381 *new_type = req_type & _PAGE_CACHE_MASK;
@@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
388 } 384 }
389 385
390 /* Low ISA region is always mapped WB in page table. No need to track */ 386 /* Low ISA region is always mapped WB in page table. No need to track */
391 if (is_ISA_range(start, end - 1)) { 387 if (x86_platform.is_untracked_pat_range(start, end)) {
392 if (new_type) 388 if (new_type)
393 *new_type = _PAGE_CACHE_WB; 389 *new_type = _PAGE_CACHE_WB;
394 return 0; 390 return 0;
@@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end)
499 return 0; 495 return 0;
500 496
501 /* Low ISA region is always mapped WB. No need to track */ 497 /* Low ISA region is always mapped WB. No need to track */
502 if (is_ISA_range(start, end - 1)) 498 if (x86_platform.is_untracked_pat_range(start, end))
503 return 0; 499 return 0;
504 500
505 is_range_ram = pat_pagerange_is_ram(start, end); 501 is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr)
582 int rettype = _PAGE_CACHE_WB; 578 int rettype = _PAGE_CACHE_WB;
583 struct memtype *entry; 579 struct memtype *entry;
584 580
585 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) 581 if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
586 return rettype; 582 return rettype;
587 583
588 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 584 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -708,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
708 if (!range_is_allowed(pfn, size)) 704 if (!range_is_allowed(pfn, size))
709 return 0; 705 return 0;
710 706
711 if (file->f_flags & O_SYNC) { 707 if (file->f_flags & O_DSYNC)
712 flags = _PAGE_CACHE_UC_MINUS; 708 flags = _PAGE_CACHE_UC_MINUS;
713 }
714 709
715#ifdef CONFIG_X86_32 710#ifdef CONFIG_X86_32
716 /* 711 /*
@@ -1018,8 +1013,10 @@ static const struct file_operations memtype_fops = {
1018 1013
1019static int __init pat_memtype_list_init(void) 1014static int __init pat_memtype_list_init(void)
1020{ 1015{
1021 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, 1016 if (pat_enabled) {
1022 NULL, &memtype_fops); 1017 debugfs_create_file("pat_memtype_list", S_IRUSR,
1018 arch_debugfs_dir, NULL, &memtype_fops);
1019 }
1023 return 0; 1020 return 0;
1024} 1021}
1025 1022
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/proto.h>
6 7
7int nx_enabled;
8
9#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
10static int disable_nx __cpuinitdata; 8static int disable_nx __cpuinitdata;
11 9
12/* 10/*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
22 if (!str) 20 if (!str)
23 return -EINVAL; 21 return -EINVAL;
24 if (!strncmp(str, "on", 2)) { 22 if (!strncmp(str, "on", 2)) {
25 __supported_pte_mask |= _PAGE_NX;
26 disable_nx = 0; 23 disable_nx = 0;
27 } else if (!strncmp(str, "off", 3)) { 24 } else if (!strncmp(str, "off", 3)) {
28 disable_nx = 1; 25 disable_nx = 1;
29 __supported_pte_mask &= ~_PAGE_NX;
30 } 26 }
27 x86_configure_nx();
31 return 0; 28 return 0;
32} 29}
33early_param("noexec", noexec_setup); 30early_param("noexec", noexec_setup);
34#endif
35 31
36#ifdef CONFIG_X86_PAE 32void __cpuinit x86_configure_nx(void)
37void __init set_nx(void)
38{ 33{
39 unsigned int v[4], l, h; 34 if (cpu_has_nx && !disable_nx)
40 35 __supported_pte_mask |= _PAGE_NX;
41 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 36 else
42 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 37 __supported_pte_mask &= ~_PAGE_NX;
38}
43 39
44 if ((v[3] & (1 << 20)) && !disable_nx) { 40void __init x86_report_nx(void)
45 rdmsr(MSR_EFER, l, h); 41{
46 l |= EFER_NX; 42 if (!cpu_has_nx) {
47 wrmsr(MSR_EFER, l, h); 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
48 nx_enabled = 1; 44 "missing in CPU or disabled in BIOS!\n");
49 __supported_pte_mask |= _PAGE_NX; 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) {
48 printk(KERN_INFO "NX (Execute Disable) protection: "
49 "disabled by kernel command line option\n");
50 } else {
51 printk(KERN_INFO "NX (Execute Disable) protection: "
52 "active\n");
50 } 53 }
51 }
52}
53#else 54#else
54void set_nx(void) 55 /* 32bit non-PAE kernel, NX cannot be used */
55{ 56 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
56} 57 "cannot be enabled: non-PAE kernel!\n");
57#endif 58#endif
58 59 }
59#ifdef CONFIG_X86_64
60void __cpuinit check_efer(void)
61{
62 unsigned long efer;
63
64 rdmsrl(MSR_EFER, efer);
65 if (!(efer & EFER_NX) || disable_nx)
66 __supported_pte_mask &= ~_PAGE_NX;
67} 60}
68#endif
69
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 6f8aa33031c7..9324f13492d5 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -267,6 +267,8 @@ int __init get_memcfg_from_srat(void)
267 e820_register_active_regions(chunk->nid, chunk->start_pfn, 267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn)); 268 min(chunk->end_pfn, max_pfn));
269 } 269 }
270 /* for out of order entries in SRAT */
271 sort_node_map();
270 272
271 for_each_online_node(nid) { 273 for_each_online_node(nid) {
272 unsigned long start = node_start_pfn[nid]; 274 unsigned long start = node_start_pfn[nid];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index dbb5381f7b3b..a27124185fc1 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
136 apicid_to_node[apic_id] = node; 136 apicid_to_node[apic_id] = node;
137 node_set(node, cpu_nodes_parsed); 137 node_set(node, cpu_nodes_parsed);
138 acpi_numa = 1; 138 acpi_numa = 1;
139 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 139 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
140 pxm, apic_id, node); 140 pxm, apic_id, node);
141} 141}
142 142
@@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
170 apicid_to_node[apic_id] = node; 170 apicid_to_node[apic_id] = node;
171 node_set(node, cpu_nodes_parsed); 171 node_set(node, cpu_nodes_parsed);
172 acpi_numa = 1; 172 acpi_numa = 1;
173 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 173 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
174 pxm, apic_id, node); 174 pxm, apic_id, node);
175} 175}
176 176
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
290 290
291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
292 start, end); 292 start, end);
293 e820_register_active_regions(node, start >> PAGE_SHIFT,
294 end >> PAGE_SHIFT);
295 293
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end); 295 update_nodes_add(node, start, end);
@@ -319,7 +317,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
319 unsigned long s = nodes[i].start >> PAGE_SHIFT; 317 unsigned long s = nodes[i].start >> PAGE_SHIFT;
320 unsigned long e = nodes[i].end >> PAGE_SHIFT; 318 unsigned long e = nodes[i].end >> PAGE_SHIFT;
321 pxmram += e - s; 319 pxmram += e - s;
322 pxmram -= absent_pages_in_range(s, e); 320 pxmram -= __absent_pages_in_range(i, s, e);
323 if ((long)pxmram < 0) 321 if ((long)pxmram < 0)
324 pxmram = 0; 322 pxmram = 0;
325 } 323 }
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
338 336
339void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
340 338
339int __init acpi_get_nodes(struct bootnode *physnodes)
340{
341 int i;
342 int ret = 0;
343
344 for_each_node_mask(i, nodes_parsed) {
345 physnodes[ret].start = nodes[i].start;
346 physnodes[ret].end = nodes[i].end;
347 ret++;
348 }
349 return ret;
350}
351
341/* Use the information discovered above to actually set up the nodes. */ 352/* Use the information discovered above to actually set up the nodes. */
342int __init acpi_scan_nodes(unsigned long start, unsigned long end) 353int __init acpi_scan_nodes(unsigned long start, unsigned long end)
343{ 354{
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
350 for (i = 0; i < MAX_NUMNODES; i++) 361 for (i = 0; i < MAX_NUMNODES; i++)
351 cutoff_node(i, start, end); 362 cutoff_node(i, start, end);
352 363
353 if (!nodes_cover_memory(nodes)) {
354 bad_srat();
355 return -1;
356 }
357
358 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 364 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
359 memblk_nodeid); 365 memblk_nodeid);
360 if (memnode_shift < 0) { 366 if (memnode_shift < 0) {
@@ -364,6 +370,16 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
364 return -1; 370 return -1;
365 } 371 }
366 372
373 for_each_node_mask(i, nodes_parsed)
374 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
375 nodes[i].end >> PAGE_SHIFT);
376 /* for out of order entries in SRAT */
377 sort_node_map();
378 if (!nodes_cover_memory(nodes)) {
379 bad_srat();
380 return -1;
381 }
382
367 /* Account for nodes with cpus and no memory */ 383 /* Account for nodes with cpus and no memory */
368 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 384 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
369 385
@@ -454,7 +470,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
454 for (i = 0; i < num_nodes; i++) 470 for (i = 0; i < num_nodes; i++)
455 if (fake_nodes[i].start != fake_nodes[i].end) 471 if (fake_nodes[i].start != fake_nodes[i].end)
456 node_set(i, nodes_parsed); 472 node_set(i, nodes_parsed);
457 WARN_ON(!nodes_cover_memory(fake_nodes));
458} 473}
459 474
460static int null_slit_node_compare(int a, int b) 475static int null_slit_node_compare(int a, int b)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df5..8565d944f7cf 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
1/* 1/*
2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> 2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
3 */ 3 */
4
5#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
6
4#include <linux/module.h> 7#include <linux/module.h>
5#include <linux/io.h> 8#include <linux/io.h>
6#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
7 10
8#define MODULE_NAME "testmmiotrace"
9
10static unsigned long mmio_address; 11static unsigned long mmio_address;
11module_param(mmio_address, ulong, 0); 12module_param(mmio_address, ulong, 0);
12MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " 13MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
30static void do_write_test(void __iomem *p) 31static void do_write_test(void __iomem *p)
31{ 32{
32 unsigned int i; 33 unsigned int i;
33 pr_info(MODULE_NAME ": write test.\n"); 34 pr_info("write test.\n");
34 mmiotrace_printk("Write test.\n"); 35 mmiotrace_printk("Write test.\n");
35 36
36 for (i = 0; i < 256; i++) 37 for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
47{ 48{
48 unsigned int i; 49 unsigned int i;
49 unsigned errs[3] = { 0 }; 50 unsigned errs[3] = { 0 };
50 pr_info(MODULE_NAME ": read test.\n"); 51 pr_info("read test.\n");
51 mmiotrace_printk("Read test.\n"); 52 mmiotrace_printk("Read test.\n");
52 53
53 for (i = 0; i < 256; i++) 54 for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
68 69
69static void do_read_far_test(void __iomem *p) 70static void do_read_far_test(void __iomem *p)
70{ 71{
71 pr_info(MODULE_NAME ": read far test.\n"); 72 pr_info("read far test.\n");
72 mmiotrace_printk("Read far test.\n"); 73 mmiotrace_printk("Read far test.\n");
73 74
74 ioread32(p + read_far); 75 ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
78{ 79{
79 void __iomem *p = ioremap_nocache(mmio_address, size); 80 void __iomem *p = ioremap_nocache(mmio_address, size);
80 if (!p) { 81 if (!p) {
81 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 82 pr_err("could not ioremap, aborting.\n");
82 return; 83 return;
83 } 84 }
84 mmiotrace_printk("ioremap returned %p.\n", p); 85 mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
94 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
95 96
96 if (mmio_address == 0) { 97 if (mmio_address == 0) {
97 pr_err(MODULE_NAME ": you have to use the module argument " 98 pr_err("you have to use the module argument mmio_address.\n");
98 "mmio_address.\n"); 99 pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
99 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
100 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
101 return -ENXIO; 100 return -ENXIO;
102 } 101 }
103 102
104 pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " 103 pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
105 "address space, and writing 16 kB of rubbish in there.\n", 104 "and writing 16 kB of rubbish in there.\n",
106 size >> 10, mmio_address); 105 size >> 10, mmio_address);
107 do_test(size); 106 do_test(size);
108 pr_info(MODULE_NAME ": All done.\n"); 107 pr_info("All done.\n");
109 return 0; 108 return 0;
110} 109}
111 110
112static void __exit cleanup(void) 111static void __exit cleanup(void)
113{ 112{
114 pr_debug(MODULE_NAME ": unloaded.\n"); 113 pr_debug("unloaded.\n");
115} 114}
116 115
117module_init(init); 116module_init(init);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..65b58e4b0b8b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
8 8
9#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
11#include <asm/cache.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/uv/uv.h> 13#include <asm/uv/uv.h>
13 14
@@ -43,7 +44,7 @@ union smp_flush_state {
43 spinlock_t tlbstate_lock; 44 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 }; 46 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp; 48} ____cacheline_internodealigned_in_smp;
48 49
49/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 044897be021f..3855096c59b8 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -41,10 +41,11 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
41} 41}
42 42
43static struct stacktrace_ops backtrace_ops = { 43static struct stacktrace_ops backtrace_ops = {
44 .warning = backtrace_warning, 44 .warning = backtrace_warning,
45 .warning_symbol = backtrace_warning_symbol, 45 .warning_symbol = backtrace_warning_symbol,
46 .stack = backtrace_stack, 46 .stack = backtrace_stack,
47 .address = backtrace_address, 47 .address = backtrace_address,
48 .walk_stack = print_context_stack,
48}; 49};
49 50
50struct frame_head { 51struct frame_head {
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index d49202e740ea..564b008a51c7 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,3 +15,8 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
18obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o
19
20ifeq ($(CONFIG_PCI_DEBUG),y)
21EXTRA_CFLAGS += -DDEBUG
22endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1014eb4bfc37..959e548a7039 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -7,6 +7,7 @@
7#include <asm/pci_x86.h> 7#include <asm/pci_x86.h>
8 8
9struct pci_root_info { 9struct pci_root_info {
10 struct acpi_device *bridge;
10 char *name; 11 char *name;
11 unsigned int res_num; 12 unsigned int res_num;
12 struct resource *res; 13 struct resource *res;
@@ -58,6 +59,30 @@ bus_has_transparent_bridge(struct pci_bus *bus)
58 return false; 59 return false;
59} 60}
60 61
62static void
63align_resource(struct acpi_device *bridge, struct resource *res)
64{
65 int align = (res->flags & IORESOURCE_MEM) ? 16 : 4;
66
67 /*
68 * Host bridge windows are not BARs, but the decoders on the PCI side
69 * that claim this address space have starting alignment and length
70 * constraints, so fix any obvious BIOS goofs.
71 */
72 if (!IS_ALIGNED(res->start, align)) {
73 dev_printk(KERN_DEBUG, &bridge->dev,
74 "host bridge window %pR invalid; "
75 "aligning start to %d-byte boundary\n", res, align);
76 res->start &= ~(align - 1);
77 }
78 if (!IS_ALIGNED(res->end + 1, align)) {
79 dev_printk(KERN_DEBUG, &bridge->dev,
80 "host bridge window %pR invalid; "
81 "aligning end to %d-byte boundary\n", res, align);
82 res->end = ALIGN(res->end, align) - 1;
83 }
84}
85
61static acpi_status 86static acpi_status
62setup_resource(struct acpi_resource *acpi_res, void *data) 87setup_resource(struct acpi_resource *acpi_res, void *data)
63{ 88{
@@ -91,11 +116,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
91 start = addr.minimum + addr.translation_offset; 116 start = addr.minimum + addr.translation_offset;
92 end = start + addr.address_length - 1; 117 end = start + addr.address_length - 1;
93 if (info->res_num >= max_root_bus_resources) { 118 if (info->res_num >= max_root_bus_resources) {
94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " 119 if (pci_probe & PCI_USE__CRS)
95 "from %s for %s due to _CRS returning more than " 120 printk(KERN_WARNING "PCI: Failed to allocate "
96 "%d resource descriptors\n", (unsigned long) start, 121 "0x%lx-0x%lx from %s for %s due to _CRS "
97 (unsigned long) end, root->name, info->name, 122 "returning more than %d resource descriptors\n",
98 max_root_bus_resources); 123 (unsigned long) start, (unsigned long) end,
124 root->name, info->name, max_root_bus_resources);
99 return AE_OK; 125 return AE_OK;
100 } 126 }
101 127
@@ -105,14 +131,28 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
105 res->start = start; 131 res->start = start;
106 res->end = end; 132 res->end = end;
107 res->child = NULL; 133 res->child = NULL;
134 align_resource(info->bridge, res);
135
136 if (!(pci_probe & PCI_USE__CRS)) {
137 dev_printk(KERN_DEBUG, &info->bridge->dev,
138 "host bridge window %pR (ignored)\n", res);
139 return AE_OK;
140 }
108 141
109 if (insert_resource(root, res)) { 142 if (insert_resource(root, res)) {
110 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 143 dev_err(&info->bridge->dev,
111 "from %s for %s\n", (unsigned long) res->start, 144 "can't allocate host bridge window %pR\n", res);
112 (unsigned long) res->end, root->name, info->name);
113 } else { 145 } else {
114 info->bus->resource[info->res_num] = res; 146 info->bus->resource[info->res_num] = res;
115 info->res_num++; 147 info->res_num++;
148 if (addr.translation_offset)
149 dev_info(&info->bridge->dev, "host bridge window %pR "
150 "(PCI address [%#llx-%#llx])\n",
151 res, res->start - addr.translation_offset,
152 res->end - addr.translation_offset);
153 else
154 dev_info(&info->bridge->dev,
155 "host bridge window %pR\n", res);
116 } 156 }
117 return AE_OK; 157 return AE_OK;
118} 158}
@@ -124,6 +164,12 @@ get_current_resources(struct acpi_device *device, int busnum,
124 struct pci_root_info info; 164 struct pci_root_info info;
125 size_t size; 165 size_t size;
126 166
167 if (!(pci_probe & PCI_USE__CRS))
168 dev_info(&device->dev,
169 "ignoring host bridge windows from ACPI; "
170 "boot with \"pci=use_crs\" to use them\n");
171
172 info.bridge = device;
127 info.bus = bus; 173 info.bus = bus;
128 info.res_num = 0; 174 info.res_num = 0;
129 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 175 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
@@ -163,8 +209,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
163#endif 209#endif
164 210
165 if (domain && !pci_domains_supported) { 211 if (domain && !pci_domains_supported) {
166 printk(KERN_WARNING "PCI: Multiple domains not supported " 212 printk(KERN_WARNING "pci_bus %04x:%02x: "
167 "(dom %d, bus %d)\n", domain, busnum); 213 "ignored (multiple domains not supported)\n",
214 domain, busnum);
168 return NULL; 215 return NULL;
169 } 216 }
170 217
@@ -188,7 +235,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
188 */ 235 */
189 sd = kzalloc(sizeof(*sd), GFP_KERNEL); 236 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
190 if (!sd) { 237 if (!sd) {
191 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); 238 printk(KERN_WARNING "pci_bus %04x:%02x: "
239 "ignored (out of memory)\n", domain, busnum);
192 return NULL; 240 return NULL;
193 } 241 }
194 242
@@ -209,9 +257,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
209 } else { 257 } else {
210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); 258 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) { 259 if (bus) {
212 if (pci_probe & PCI_USE__CRS) 260 get_current_resources(device, busnum, domain, bus);
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus); 261 bus->subordinate = pci_scan_child_bus(bus);
216 } 262 }
217 } 263 }
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 572ee9782f2a..95ecbd495955 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -6,10 +6,10 @@
6 6
7#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 8#include <asm/pci-direct.h>
9#include <asm/mpspec.h>
10#include <linux/cpumask.h>
11#endif 9#endif
12 10
11#include "bus_numa.h"
12
13/* 13/*
14 * This discovers the pcibus <-> node mapping on AMD K8. 14 * This discovers the pcibus <-> node mapping on AMD K8.
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
@@ -17,67 +17,6 @@
17 17
18#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
19 19
20/*
21 * sub bus (transparent) will use entres from 3 to store extra from root,
22 * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
23 */
24#define RES_NUM 16
25struct pci_root_info {
26 char name[12];
27 unsigned int res_num;
28 struct resource res[RES_NUM];
29 int bus_min;
30 int bus_max;
31 int node;
32 int link;
33};
34
35/* 4 at this time, it may become to 32 */
36#define PCI_ROOT_NR 4
37static int pci_root_num;
38static struct pci_root_info pci_root_info[PCI_ROOT_NR];
39
40void x86_pci_root_bus_res_quirks(struct pci_bus *b)
41{
42 int i;
43 int j;
44 struct pci_root_info *info;
45
46 /* don't go for it if _CRS is used already */
47 if (b->resource[0] != &ioport_resource ||
48 b->resource[1] != &iomem_resource)
49 return;
50
51 /* if only one root bus, don't need to anything */
52 if (pci_root_num < 2)
53 return;
54
55 for (i = 0; i < pci_root_num; i++) {
56 if (pci_root_info[i].bus_min == b->number)
57 break;
58 }
59
60 if (i == pci_root_num)
61 return;
62
63 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
64 b->number);
65
66 info = &pci_root_info[i];
67 for (j = 0; j < info->res_num; j++) {
68 struct resource *res;
69 struct resource *root;
70
71 res = &info->res[j];
72 b->resource[j] = res;
73 if (res->flags & IORESOURCE_IO)
74 root = &ioport_resource;
75 else
76 root = &iomem_resource;
77 insert_resource(root, res);
78 }
79}
80
81#define RANGE_NUM 16 20#define RANGE_NUM 16
82 21
83struct res_range { 22struct res_range {
@@ -130,52 +69,6 @@ static void __init update_range(struct res_range *range, size_t start,
130 } 69 }
131} 70}
132 71
133static void __init update_res(struct pci_root_info *info, size_t start,
134 size_t end, unsigned long flags, int merge)
135{
136 int i;
137 struct resource *res;
138
139 if (!merge)
140 goto addit;
141
142 /* try to merge it with old one */
143 for (i = 0; i < info->res_num; i++) {
144 size_t final_start, final_end;
145 size_t common_start, common_end;
146
147 res = &info->res[i];
148 if (res->flags != flags)
149 continue;
150
151 common_start = max((size_t)res->start, start);
152 common_end = min((size_t)res->end, end);
153 if (common_start > common_end + 1)
154 continue;
155
156 final_start = min((size_t)res->start, start);
157 final_end = max((size_t)res->end, end);
158
159 res->start = final_start;
160 res->end = final_end;
161 return;
162 }
163
164addit:
165
166 /* need to add that */
167 if (info->res_num >= RES_NUM)
168 return;
169
170 res = &info->res[info->res_num];
171 res->name = info->name;
172 res->flags = flags;
173 res->start = start;
174 res->end = end;
175 res->child = NULL;
176 info->res_num++;
177}
178
179struct pci_hostbridge_probe { 72struct pci_hostbridge_probe {
180 u32 bus; 73 u32 bus;
181 u32 slot; 74 u32 slot;
@@ -230,7 +123,6 @@ static int __init early_fill_mp_bus_info(void)
230 int j; 123 int j;
231 unsigned bus; 124 unsigned bus;
232 unsigned slot; 125 unsigned slot;
233 int found;
234 int node; 126 int node;
235 int link; 127 int link;
236 int def_node; 128 int def_node;
@@ -247,7 +139,7 @@ static int __init early_fill_mp_bus_info(void)
247 if (!early_pci_allowed()) 139 if (!early_pci_allowed())
248 return -1; 140 return -1;
249 141
250 found = 0; 142 found_all_numa_early = 0;
251 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 143 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
252 u32 id; 144 u32 id;
253 u16 device; 145 u16 device;
@@ -261,12 +153,12 @@ static int __init early_fill_mp_bus_info(void)
261 device = (id>>16) & 0xffff; 153 device = (id>>16) & 0xffff;
262 if (pci_probes[i].vendor == vendor && 154 if (pci_probes[i].vendor == vendor &&
263 pci_probes[i].device == device) { 155 pci_probes[i].device == device) {
264 found = 1; 156 found_all_numa_early = 1;
265 break; 157 break;
266 } 158 }
267 } 159 }
268 160
269 if (!found) 161 if (!found_all_numa_early)
270 return 0; 162 return 0;
271 163
272 pci_root_num = 0; 164 pci_root_num = 0;
@@ -488,7 +380,7 @@ static int __init early_fill_mp_bus_info(void)
488 info = &pci_root_info[i]; 380 info = &pci_root_info[i];
489 res_num = info->res_num; 381 res_num = info->res_num;
490 busnum = info->bus_min; 382 busnum = info->bus_min;
491 printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", 383 printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
492 info->bus_min, info->bus_max, info->node, info->link); 384 info->bus_min, info->bus_max, info->node, info->link);
493 for (j = 0; j < res_num; j++) { 385 for (j = 0; j < res_num; j++) {
494 res = &info->res[j]; 386 res = &info->res[j];
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
new file mode 100644
index 000000000000..f939d603adfa
--- /dev/null
+++ b/arch/x86/pci/bus_numa.c
@@ -0,0 +1,101 @@
1#include <linux/init.h>
2#include <linux/pci.h>
3
4#include "bus_numa.h"
5
6int pci_root_num;
7struct pci_root_info pci_root_info[PCI_ROOT_NR];
8int found_all_numa_early;
9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b)
11{
12 int i;
13 int j;
14 struct pci_root_info *info;
15
16 /* don't go for it if _CRS is used already */
17 if (b->resource[0] != &ioport_resource ||
18 b->resource[1] != &iomem_resource)
19 return;
20
21 if (!pci_root_num)
22 return;
23
24 /* for amd, if only one root bus, don't need to do anything */
25 if (pci_root_num < 2 && found_all_numa_early)
26 return;
27
28 for (i = 0; i < pci_root_num; i++) {
29 if (pci_root_info[i].bus_min == b->number)
30 break;
31 }
32
33 if (i == pci_root_num)
34 return;
35
36 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
37 b->number);
38
39 info = &pci_root_info[i];
40 for (j = 0; j < info->res_num; j++) {
41 struct resource *res;
42 struct resource *root;
43
44 res = &info->res[j];
45 b->resource[j] = res;
46 if (res->flags & IORESOURCE_IO)
47 root = &ioport_resource;
48 else
49 root = &iomem_resource;
50 insert_resource(root, res);
51 }
52}
53
54void __devinit update_res(struct pci_root_info *info, size_t start,
55 size_t end, unsigned long flags, int merge)
56{
57 int i;
58 struct resource *res;
59
60 if (start > end)
61 return;
62
63 if (!merge)
64 goto addit;
65
66 /* try to merge it with old one */
67 for (i = 0; i < info->res_num; i++) {
68 size_t final_start, final_end;
69 size_t common_start, common_end;
70
71 res = &info->res[i];
72 if (res->flags != flags)
73 continue;
74
75 common_start = max((size_t)res->start, start);
76 common_end = min((size_t)res->end, end);
77 if (common_start > common_end + 1)
78 continue;
79
80 final_start = min((size_t)res->start, start);
81 final_end = max((size_t)res->end, end);
82
83 res->start = final_start;
84 res->end = final_end;
85 return;
86 }
87
88addit:
89
90 /* need to add that */
91 if (info->res_num >= RES_NUM)
92 return;
93
94 res = &info->res[info->res_num];
95 res->name = info->name;
96 res->flags = flags;
97 res->start = start;
98 res->end = end;
99 res->child = NULL;
100 info->res_num++;
101}
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
new file mode 100644
index 000000000000..adbc23fe82ac
--- /dev/null
+++ b/arch/x86/pci/bus_numa.h
@@ -0,0 +1,27 @@
1#ifdef CONFIG_X86_64
2
3/*
4 * sub bus (transparent) will use entres from 3 to store extra from
5 * root, so need to make sure we have enough slot there, Should we
6 * increase PCI_BUS_NUM_RESOURCES?
7 */
8#define RES_NUM 16
9struct pci_root_info {
10 char name[12];
11 unsigned int res_num;
12 struct resource res[RES_NUM];
13 int bus_min;
14 int bus_max;
15 int node;
16 int link;
17};
18
19/* 4 at this time, it may become to 32 */
20#define PCI_ROOT_NR 4
21extern int pci_root_num;
22extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
23extern int found_all_numa_early;
24
25extern void update_res(struct pci_root_info *info, size_t start,
26 size_t end, unsigned long flags, int merge);
27#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1331fcf26143..d2552c68e94d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -410,8 +410,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
410 return bus; 410 return bus;
411} 411}
412 412
413extern u8 pci_cache_line_size;
414
415int __init pcibios_init(void) 413int __init pcibios_init(void)
416{ 414{
417 struct cpuinfo_x86 *c = &boot_cpu_data; 415 struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -422,15 +420,19 @@ int __init pcibios_init(void)
422 } 420 }
423 421
424 /* 422 /*
425 * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 423 * Set PCI cacheline size to that of the CPU if the CPU has reported it.
426 * and P4. It's also good for 386/486s (which actually have 16) 424 * (For older CPUs that don't support cpuid, we se it to 32 bytes
425 * It's also good for 386/486s (which actually have 16)
427 * as quite a few PCI devices do not support smaller values. 426 * as quite a few PCI devices do not support smaller values.
428 */ 427 */
429 pci_cache_line_size = 32 >> 2; 428 if (c->x86_clflush_size > 0) {
430 if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) 429 pci_dfl_cache_line_size = c->x86_clflush_size >> 2;
431 pci_cache_line_size = 64 >> 2; /* K7 & K8 */ 430 printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
432 else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) 431 pci_dfl_cache_line_size << 2);
433 pci_cache_line_size = 128 >> 2; /* P4 */ 432 } else {
433 pci_dfl_cache_line_size = 32 >> 2;
434 printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
435 }
434 436
435 pcibios_resource_survey(); 437 pcibios_resource_survey();
436 438
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index aaf26ae58cd5..d1067d539bee 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
12 u32 v; 12 u32 v;
13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
14 v = inl(0xcfc); 14 v = inl(0xcfc);
15 if (v != 0xffffffff)
16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
17 return v; 15 return v;
18} 16}
19 17
@@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
22 u8 v; 20 u8 v;
23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 21 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
24 v = inb(0xcfc + (offset&3)); 22 v = inb(0xcfc + (offset&3));
25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
26 return v; 23 return v;
27} 24}
28 25
@@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
31 u16 v; 28 u16 v;
32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 29 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
33 v = inw(0xcfc + (offset&2)); 30 v = inw(0xcfc + (offset&2));
34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
35 return v; 31 return v;
36} 32}
37 33
38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 34void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
39 u32 val) 35 u32 val)
40{ 36{
41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 37 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
43 outl(val, 0xcfc); 38 outl(val, 0xcfc);
44} 39}
45 40
46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 41void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
47{ 42{
48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 43 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
50 outb(val, 0xcfc + (offset&3)); 44 outb(val, 0xcfc + (offset&3));
51} 45}
52 46
53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) 47void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
54{ 48{
55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
57 outw(val, 0xcfc + (offset&2)); 50 outw(val, 0xcfc + (offset&2));
58} 51}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b22d13b0c71d..5dc9e8c63fcd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -129,7 +129,9 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
129 continue; 129 continue;
130 if (!r->start || 130 if (!r->start ||
131 pci_claim_resource(dev, idx) < 0) { 131 pci_claim_resource(dev, idx) < 0) {
132 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 132 dev_info(&dev->dev,
133 "can't reserve window %pR\n",
134 r);
133 /* 135 /*
134 * Something is wrong with the region. 136 * Something is wrong with the region.
135 * Invalidate the resource to prevent 137 * Invalidate the resource to prevent
@@ -144,16 +146,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
144 } 146 }
145} 147}
146 148
149struct pci_check_idx_range {
150 int start;
151 int end;
152};
153
147static void __init pcibios_allocate_resources(int pass) 154static void __init pcibios_allocate_resources(int pass)
148{ 155{
149 struct pci_dev *dev = NULL; 156 struct pci_dev *dev = NULL;
150 int idx, disabled; 157 int idx, disabled, i;
151 u16 command; 158 u16 command;
152 struct resource *r; 159 struct resource *r;
153 160
161 struct pci_check_idx_range idx_range[] = {
162 { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END },
163#ifdef CONFIG_PCI_IOV
164 { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END },
165#endif
166 };
167
154 for_each_pci_dev(dev) { 168 for_each_pci_dev(dev) {
155 pci_read_config_word(dev, PCI_COMMAND, &command); 169 pci_read_config_word(dev, PCI_COMMAND, &command);
156 for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { 170 for (i = 0; i < ARRAY_SIZE(idx_range); i++)
171 for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) {
157 r = &dev->resource[idx]; 172 r = &dev->resource[idx];
158 if (r->parent) /* Already allocated */ 173 if (r->parent) /* Already allocated */
159 continue; 174 continue;
@@ -164,12 +179,12 @@ static void __init pcibios_allocate_resources(int pass)
164 else 179 else
165 disabled = !(command & PCI_COMMAND_MEMORY); 180 disabled = !(command & PCI_COMMAND_MEMORY);
166 if (pass == disabled) { 181 if (pass == disabled) {
167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", 182 dev_dbg(&dev->dev,
168 (unsigned long long) r->start, 183 "BAR %d: reserving %pr (d=%d, p=%d)\n",
169 (unsigned long long) r->end, 184 idx, r, disabled, pass);
170 r->flags, disabled, pass);
171 if (pci_claim_resource(dev, idx) < 0) { 185 if (pci_claim_resource(dev, idx) < 0) {
172 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 186 dev_info(&dev->dev,
187 "can't reserve %pR\n", r);
173 /* We'll assign a new address later */ 188 /* We'll assign a new address later */
174 r->end -= r->start; 189 r->end -= r->start;
175 r->start = 0; 190 r->start = 0;
@@ -182,7 +197,7 @@ static void __init pcibios_allocate_resources(int pass)
182 /* Turn the ROM off, leave the resource region, 197 /* Turn the ROM off, leave the resource region,
183 * but keep it unregistered. */ 198 * but keep it unregistered. */
184 u32 reg; 199 u32 reg;
185 dev_dbg(&dev->dev, "disabling ROM\n"); 200 dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
186 r->flags &= ~IORESOURCE_ROM_ENABLE; 201 r->flags &= ~IORESOURCE_ROM_ENABLE;
187 pci_read_config_dword(dev, 202 pci_read_config_dword(dev,
188 dev->rom_base_reg, &reg); 203 dev->rom_base_reg, &reg);
@@ -282,6 +297,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
282 return -EINVAL; 297 return -EINVAL;
283 298
284 prot = pgprot_val(vma->vm_page_prot); 299 prot = pgprot_val(vma->vm_page_prot);
300
301 /*
302 * Return error if pat is not enabled and write_combine is requested.
303 * Caller can followup with UC MINUS request and add a WC mtrr if there
304 * is a free mtrr slot.
305 */
306 if (!pat_enabled && write_combine)
307 return -EINVAL;
308
285 if (pat_enabled && write_combine) 309 if (pat_enabled && write_combine)
286 prot |= _PAGE_CACHE_WC; 310 prot |= _PAGE_CACHE_WC;
287 else if (pat_enabled || boot_cpu_data.x86 > 3) 311 else if (pat_enabled || boot_cpu_data.x86 > 3)
diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c
new file mode 100644
index 000000000000..b7a55dc55d13
--- /dev/null
+++ b/arch/x86/pci/intel_bus.c
@@ -0,0 +1,90 @@
1/*
2 * to read io range from IOH pci conf, need to do it after mmconfig is there
3 */
4
5#include <linux/delay.h>
6#include <linux/dmi.h>
7#include <linux/pci.h>
8#include <linux/init.h>
9#include <asm/pci_x86.h>
10
11#include "bus_numa.h"
12
13static inline void print_ioh_resources(struct pci_root_info *info)
14{
15 int res_num;
16 int busnum;
17 int i;
18
19 printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n",
20 info->bus_min, info->bus_max);
21 res_num = info->res_num;
22 busnum = info->bus_min;
23 for (i = 0; i < res_num; i++) {
24 struct resource *res;
25
26 res = &info->res[i];
27 printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n",
28 busnum, i,
29 (res->flags & IORESOURCE_IO) ? "io port" :
30 "mmio",
31 res->start, res->end);
32 }
33}
34
35#define IOH_LIO 0x108
36#define IOH_LMMIOL 0x10c
37#define IOH_LMMIOH 0x110
38#define IOH_LMMIOH_BASEU 0x114
39#define IOH_LMMIOH_LIMITU 0x118
40#define IOH_LCFGBUS 0x11c
41
42static void __devinit pci_root_bus_res(struct pci_dev *dev)
43{
44 u16 word;
45 u32 dword;
46 struct pci_root_info *info;
47 u16 io_base, io_end;
48 u32 mmiol_base, mmiol_end;
49 u64 mmioh_base, mmioh_end;
50 int bus_base, bus_end;
51
52 if (pci_root_num >= PCI_ROOT_NR) {
53 printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n");
54 return;
55 }
56
57 info = &pci_root_info[pci_root_num];
58 pci_root_num++;
59
60 pci_read_config_word(dev, IOH_LCFGBUS, &word);
61 bus_base = (word & 0xff);
62 bus_end = (word & 0xff00) >> 8;
63 sprintf(info->name, "PCI Bus #%02x", bus_base);
64 info->bus_min = bus_base;
65 info->bus_max = bus_end;
66
67 pci_read_config_word(dev, IOH_LIO, &word);
68 io_base = (word & 0xf0) << (12 - 4);
69 io_end = (word & 0xf000) | 0xfff;
70 update_res(info, io_base, io_end, IORESOURCE_IO, 0);
71
72 pci_read_config_dword(dev, IOH_LMMIOL, &dword);
73 mmiol_base = (dword & 0xff00) << (24 - 8);
74 mmiol_end = (dword & 0xff000000) | 0xffffff;
75 update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0);
76
77 pci_read_config_dword(dev, IOH_LMMIOH, &dword);
78 mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10);
79 mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff);
80 pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword);
81 mmioh_base |= ((u64)(dword & 0x7ffff)) << 32;
82 pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword);
83 mmioh_end |= ((u64)(dword & 0x7ffff)) << 32;
84 update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0);
85
86 print_ioh_resources(info);
87}
88
89/* intel IOH */
90DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 602c172d3bd5..b19d1e54201e 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,48 +15,98 @@
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/sfi_acpi.h> 16#include <linux/sfi_acpi.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/sort.h> 18#include <linux/dmi.h>
19#include <asm/e820.h> 19#include <asm/e820.h>
20#include <asm/pci_x86.h> 20#include <asm/pci_x86.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22 22
23#define PREFIX "PCI: " 23#define PREFIX "PCI: "
24 24
25/* aperture is up to 256MB but BIOS may reserve less */
26#define MMCONFIG_APER_MIN (2 * 1024*1024)
27#define MMCONFIG_APER_MAX (256 * 1024*1024)
28
29/* Indicate if the mmcfg resources have been placed into the resource table. */ 25/* Indicate if the mmcfg resources have been placed into the resource table. */
30static int __initdata pci_mmcfg_resources_inserted; 26static int __initdata pci_mmcfg_resources_inserted;
31 27
32static __init int extend_mmcfg(int num) 28LIST_HEAD(pci_mmcfg_list);
29
30static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
33{ 31{
34 struct acpi_mcfg_allocation *new; 32 if (cfg->res.parent)
35 int new_num = pci_mmcfg_config_num + num; 33 release_resource(&cfg->res);
34 list_del(&cfg->list);
35 kfree(cfg);
36}
36 37
37 new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); 38static __init void free_all_mmcfg(void)
38 if (!new) 39{
39 return -1; 40 struct pci_mmcfg_region *cfg, *tmp;
40 41
41 if (pci_mmcfg_config) { 42 pci_mmcfg_arch_free();
42 memcpy(new, pci_mmcfg_config, 43 list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list)
43 sizeof(pci_mmcfg_config[0]) * new_num); 44 pci_mmconfig_remove(cfg);
44 kfree(pci_mmcfg_config); 45}
46
47static __init void list_add_sorted(struct pci_mmcfg_region *new)
48{
49 struct pci_mmcfg_region *cfg;
50
51 /* keep list sorted by segment and starting bus number */
52 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
53 if (cfg->segment > new->segment ||
54 (cfg->segment == new->segment &&
55 cfg->start_bus >= new->start_bus)) {
56 list_add_tail(&new->list, &cfg->list);
57 return;
58 }
45 } 59 }
46 pci_mmcfg_config = new; 60 list_add_tail(&new->list, &pci_mmcfg_list);
61}
47 62
48 return 0; 63static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
64 int end, u64 addr)
65{
66 struct pci_mmcfg_region *new;
67 int num_buses;
68 struct resource *res;
69
70 if (addr == 0)
71 return NULL;
72
73 new = kzalloc(sizeof(*new), GFP_KERNEL);
74 if (!new)
75 return NULL;
76
77 new->address = addr;
78 new->segment = segment;
79 new->start_bus = start;
80 new->end_bus = end;
81
82 list_add_sorted(new);
83
84 num_buses = end - start + 1;
85 res = &new->res;
86 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
87 res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
88 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
89 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
90 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
91 res->name = new->name;
92
93 printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at "
94 "%pR (base %#lx)\n", segment, start, end, &new->res,
95 (unsigned long) addr);
96
97 return new;
49} 98}
50 99
51static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) 100struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
52{ 101{
53 int i = pci_mmcfg_config_num; 102 struct pci_mmcfg_region *cfg;
54 103
55 pci_mmcfg_config_num++; 104 list_for_each_entry(cfg, &pci_mmcfg_list, list)
56 pci_mmcfg_config[i].address = addr; 105 if (cfg->segment == segment &&
57 pci_mmcfg_config[i].pci_segment = segment; 106 cfg->start_bus <= bus && bus <= cfg->end_bus)
58 pci_mmcfg_config[i].start_bus_number = start; 107 return cfg;
59 pci_mmcfg_config[i].end_bus_number = end; 108
109 return NULL;
60} 110}
61 111
62static const char __init *pci_mmcfg_e7520(void) 112static const char __init *pci_mmcfg_e7520(void)
@@ -68,11 +118,9 @@ static const char __init *pci_mmcfg_e7520(void)
68 if (win == 0x0000 || win == 0xf000) 118 if (win == 0x0000 || win == 0xf000)
69 return NULL; 119 return NULL;
70 120
71 if (extend_mmcfg(1) == -1) 121 if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL)
72 return NULL; 122 return NULL;
73 123
74 fill_one_mmcfg(win << 16, 0, 0, 255);
75
76 return "Intel Corporation E7520 Memory Controller Hub"; 124 return "Intel Corporation E7520 Memory Controller Hub";
77} 125}
78 126
@@ -114,11 +162,9 @@ static const char __init *pci_mmcfg_intel_945(void)
114 if ((pciexbar & mask) >= 0xf0000000U) 162 if ((pciexbar & mask) >= 0xf0000000U)
115 return NULL; 163 return NULL;
116 164
117 if (extend_mmcfg(1) == -1) 165 if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL)
118 return NULL; 166 return NULL;
119 167
120 fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
121
122 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; 168 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
123} 169}
124 170
@@ -127,7 +173,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
127 u32 low, high, address; 173 u32 low, high, address;
128 u64 base, msr; 174 u64 base, msr;
129 int i; 175 int i;
130 unsigned segnbits = 0, busnbits; 176 unsigned segnbits = 0, busnbits, end_bus;
131 177
132 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) 178 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
133 return NULL; 179 return NULL;
@@ -161,11 +207,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
161 busnbits = 8; 207 busnbits = 8;
162 } 208 }
163 209
164 if (extend_mmcfg(1 << segnbits) == -1) 210 end_bus = (1 << busnbits) - 1;
165 return NULL;
166
167 for (i = 0; i < (1 << segnbits); i++) 211 for (i = 0; i < (1 << segnbits); i++)
168 fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); 212 if (pci_mmconfig_add(i, 0, end_bus,
213 base + (1<<28) * i) == NULL) {
214 free_all_mmcfg();
215 return NULL;
216 }
169 217
170 return "AMD Family 10h NB"; 218 return "AMD Family 10h NB";
171} 219}
@@ -190,7 +238,7 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
190 /* 238 /*
191 * do check if amd fam10h already took over 239 * do check if amd fam10h already took over
192 */ 240 */
193 if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) 241 if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked)
194 return NULL; 242 return NULL;
195 243
196 mcp55_checked = true; 244 mcp55_checked = true;
@@ -213,16 +261,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
213 if (!(extcfg & extcfg_enable_mask)) 261 if (!(extcfg & extcfg_enable_mask))
214 continue; 262 continue;
215 263
216 if (extend_mmcfg(1) == -1)
217 continue;
218
219 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; 264 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
220 base = extcfg & extcfg_base_mask[size_index]; 265 base = extcfg & extcfg_base_mask[size_index];
221 /* base could > 4G */ 266 /* base could > 4G */
222 base <<= extcfg_base_lshift; 267 base <<= extcfg_base_lshift;
223 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; 268 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
224 end = start + extcfg_sizebus[size_index] - 1; 269 end = start + extcfg_sizebus[size_index] - 1;
225 fill_one_mmcfg(base, 0, start, end); 270 if (pci_mmconfig_add(0, start, end, base) == NULL)
271 continue;
226 mcp55_mmconf_found++; 272 mcp55_mmconf_found++;
227 } 273 }
228 274
@@ -253,45 +299,27 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
253 0x0369, pci_mmcfg_nvidia_mcp55 }, 299 0x0369, pci_mmcfg_nvidia_mcp55 },
254}; 300};
255 301
256static int __init cmp_mmcfg(const void *x1, const void *x2)
257{
258 const typeof(pci_mmcfg_config[0]) *m1 = x1;
259 const typeof(pci_mmcfg_config[0]) *m2 = x2;
260 int start1, start2;
261
262 start1 = m1->start_bus_number;
263 start2 = m2->start_bus_number;
264
265 return start1 - start2;
266}
267
268static void __init pci_mmcfg_check_end_bus_number(void) 302static void __init pci_mmcfg_check_end_bus_number(void)
269{ 303{
270 int i; 304 struct pci_mmcfg_region *cfg, *cfgx;
271 typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
272
273 /* sort them at first */
274 sort(pci_mmcfg_config, pci_mmcfg_config_num,
275 sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
276 305
277 /* last one*/ 306 /* last one*/
278 if (pci_mmcfg_config_num > 0) { 307 cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list);
279 i = pci_mmcfg_config_num - 1; 308 if (cfg)
280 cfg = &pci_mmcfg_config[i]; 309 if (cfg->end_bus < cfg->start_bus)
281 if (cfg->end_bus_number < cfg->start_bus_number) 310 cfg->end_bus = 255;
282 cfg->end_bus_number = 255;
283 }
284 311
285 /* don't overlap please */ 312 if (list_is_singular(&pci_mmcfg_list))
286 for (i = 0; i < pci_mmcfg_config_num - 1; i++) { 313 return;
287 cfg = &pci_mmcfg_config[i];
288 cfgx = &pci_mmcfg_config[i+1];
289 314
290 if (cfg->end_bus_number < cfg->start_bus_number) 315 /* don't overlap please */
291 cfg->end_bus_number = 255; 316 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
317 if (cfg->end_bus < cfg->start_bus)
318 cfg->end_bus = 255;
292 319
293 if (cfg->end_bus_number >= cfgx->start_bus_number) 320 cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
294 cfg->end_bus_number = cfgx->start_bus_number - 1; 321 if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus)
322 cfg->end_bus = cfgx->start_bus - 1;
295 } 323 }
296} 324}
297 325
@@ -306,8 +334,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
306 if (!raw_pci_ops) 334 if (!raw_pci_ops)
307 return 0; 335 return 0;
308 336
309 pci_mmcfg_config_num = 0; 337 free_all_mmcfg();
310 pci_mmcfg_config = NULL;
311 338
312 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { 339 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
313 bus = pci_mmcfg_probes[i].bus; 340 bus = pci_mmcfg_probes[i].bus;
@@ -322,45 +349,22 @@ static int __init pci_mmcfg_check_hostbridge(void)
322 name = pci_mmcfg_probes[i].probe(); 349 name = pci_mmcfg_probes[i].probe();
323 350
324 if (name) 351 if (name)
325 printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", 352 printk(KERN_INFO PREFIX "%s with MMCONFIG support\n",
326 name); 353 name);
327 } 354 }
328 355
329 /* some end_bus_number is crazy, fix it */ 356 /* some end_bus_number is crazy, fix it */
330 pci_mmcfg_check_end_bus_number(); 357 pci_mmcfg_check_end_bus_number();
331 358
332 return pci_mmcfg_config_num != 0; 359 return !list_empty(&pci_mmcfg_list);
333} 360}
334 361
335static void __init pci_mmcfg_insert_resources(void) 362static void __init pci_mmcfg_insert_resources(void)
336{ 363{
337#define PCI_MMCFG_RESOURCE_NAME_LEN 24 364 struct pci_mmcfg_region *cfg;
338 int i;
339 struct resource *res;
340 char *names;
341 unsigned num_buses;
342
343 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
344 pci_mmcfg_config_num, GFP_KERNEL);
345 if (!res) {
346 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
347 return;
348 }
349 365
350 names = (void *)&res[pci_mmcfg_config_num]; 366 list_for_each_entry(cfg, &pci_mmcfg_list, list)
351 for (i = 0; i < pci_mmcfg_config_num; i++, res++) { 367 insert_resource(&iomem_resource, &cfg->res);
352 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
353 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
354 res->name = names;
355 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
356 "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
357 cfg->start_bus_number, cfg->end_bus_number);
358 res->start = cfg->address + (cfg->start_bus_number << 20);
359 res->end = res->start + (num_buses << 20) - 1;
360 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
361 insert_resource(&iomem_resource, res);
362 names += PCI_MMCFG_RESOURCE_NAME_LEN;
363 }
364 368
365 /* Mark that the resources have been inserted. */ 369 /* Mark that the resources have been inserted. */
366 pci_mmcfg_resources_inserted = 1; 370 pci_mmcfg_resources_inserted = 1;
@@ -437,11 +441,12 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
437typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); 441typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
438 442
439static int __init is_mmconf_reserved(check_reserved_t is_reserved, 443static int __init is_mmconf_reserved(check_reserved_t is_reserved,
440 u64 addr, u64 size, int i, 444 struct pci_mmcfg_region *cfg, int with_e820)
441 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
442{ 445{
446 u64 addr = cfg->res.start;
447 u64 size = resource_size(&cfg->res);
443 u64 old_size = size; 448 u64 old_size = size;
444 int valid = 0; 449 int valid = 0, num_buses;
445 450
446 while (!is_reserved(addr, addr + size, E820_RESERVED)) { 451 while (!is_reserved(addr, addr + size, E820_RESERVED)) {
447 size >>= 1; 452 size >>= 1;
@@ -450,19 +455,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
450 } 455 }
451 456
452 if (size >= (16UL<<20) || size == old_size) { 457 if (size >= (16UL<<20) || size == old_size) {
453 printk(KERN_NOTICE 458 printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n",
454 "PCI: MCFG area at %Lx reserved in %s\n", 459 &cfg->res,
455 addr, with_e820?"E820":"ACPI motherboard resources"); 460 with_e820 ? "E820" : "ACPI motherboard resources");
456 valid = 1; 461 valid = 1;
457 462
458 if (old_size != size) { 463 if (old_size != size) {
459 /* update end_bus_number */ 464 /* update end_bus */
460 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); 465 cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
461 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " 466 num_buses = cfg->end_bus - cfg->start_bus + 1;
462 "segment %hu buses %u - %u\n", 467 cfg->res.end = cfg->res.start +
463 i, (unsigned long)cfg->address, cfg->pci_segment, 468 PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
464 (unsigned int)cfg->start_bus_number, 469 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
465 (unsigned int)cfg->end_bus_number); 470 "PCI MMCONFIG %04x [bus %02x-%02x]",
471 cfg->segment, cfg->start_bus, cfg->end_bus);
472 printk(KERN_INFO PREFIX
473 "MMCONFIG for %04x [bus%02x-%02x] "
474 "at %pR (base %#lx) (size reduced!)\n",
475 cfg->segment, cfg->start_bus, cfg->end_bus,
476 &cfg->res, (unsigned long) cfg->address);
466 } 477 }
467 } 478 }
468 479
@@ -471,45 +482,26 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
471 482
472static void __init pci_mmcfg_reject_broken(int early) 483static void __init pci_mmcfg_reject_broken(int early)
473{ 484{
474 typeof(pci_mmcfg_config[0]) *cfg; 485 struct pci_mmcfg_region *cfg;
475 int i;
476 486
477 if ((pci_mmcfg_config_num == 0) || 487 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
478 (pci_mmcfg_config == NULL) ||
479 (pci_mmcfg_config[0].address == 0))
480 return;
481
482 for (i = 0; i < pci_mmcfg_config_num; i++) {
483 int valid = 0; 488 int valid = 0;
484 u64 addr, size;
485
486 cfg = &pci_mmcfg_config[i];
487 addr = cfg->start_bus_number;
488 addr <<= 20;
489 addr += cfg->address;
490 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
491 size <<= 20;
492 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
493 "segment %hu buses %u - %u\n",
494 i, (unsigned long)cfg->address, cfg->pci_segment,
495 (unsigned int)cfg->start_bus_number,
496 (unsigned int)cfg->end_bus_number);
497 489
498 if (!early && !acpi_disabled) 490 if (!early && !acpi_disabled)
499 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); 491 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
500 492
501 if (valid) 493 if (valid)
502 continue; 494 continue;
503 495
504 if (!early) 496 if (!early)
505 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 497 printk(KERN_ERR FW_BUG PREFIX
506 " reserved in ACPI motherboard resources\n", 498 "MMCONFIG at %pR not reserved in "
507 cfg->address); 499 "ACPI motherboard resources\n", &cfg->res);
508 500
509 /* Don't try to do this check unless configuration 501 /* Don't try to do this check unless configuration
510 type 1 is available. how about type 2 ?*/ 502 type 1 is available. how about type 2 ?*/
511 if (raw_pci_ops) 503 if (raw_pci_ops)
512 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); 504 valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
513 505
514 if (!valid) 506 if (!valid)
515 goto reject; 507 goto reject;
@@ -518,34 +510,41 @@ static void __init pci_mmcfg_reject_broken(int early)
518 return; 510 return;
519 511
520reject: 512reject:
521 printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); 513 printk(KERN_INFO PREFIX "not using MMCONFIG\n");
522 pci_mmcfg_arch_free(); 514 free_all_mmcfg();
523 kfree(pci_mmcfg_config);
524 pci_mmcfg_config = NULL;
525 pci_mmcfg_config_num = 0;
526} 515}
527 516
528static int __initdata known_bridge; 517static int __initdata known_bridge;
529 518
530static int acpi_mcfg_64bit_base_addr __initdata = FALSE; 519static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
520 struct acpi_mcfg_allocation *cfg)
521{
522 int year;
531 523
532/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 524 if (cfg->address < 0xFFFFFFFF)
533struct acpi_mcfg_allocation *pci_mmcfg_config; 525 return 0;
534int pci_mmcfg_config_num;
535 526
536static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
537{
538 if (!strcmp(mcfg->header.oem_id, "SGI")) 527 if (!strcmp(mcfg->header.oem_id, "SGI"))
539 acpi_mcfg_64bit_base_addr = TRUE; 528 return 0;
540 529
541 return 0; 530 if (mcfg->header.revision >= 1) {
531 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
532 year >= 2010)
533 return 0;
534 }
535
536 printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
537 "is above 4GB, ignored\n", cfg->pci_segment,
538 cfg->start_bus_number, cfg->end_bus_number, cfg->address);
539 return -EINVAL;
542} 540}
543 541
544static int __init pci_parse_mcfg(struct acpi_table_header *header) 542static int __init pci_parse_mcfg(struct acpi_table_header *header)
545{ 543{
546 struct acpi_table_mcfg *mcfg; 544 struct acpi_table_mcfg *mcfg;
545 struct acpi_mcfg_allocation *cfg_table, *cfg;
547 unsigned long i; 546 unsigned long i;
548 int config_size; 547 int entries;
549 548
550 if (!header) 549 if (!header)
551 return -EINVAL; 550 return -EINVAL;
@@ -553,38 +552,33 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
553 mcfg = (struct acpi_table_mcfg *)header; 552 mcfg = (struct acpi_table_mcfg *)header;
554 553
555 /* how many config structures do we have */ 554 /* how many config structures do we have */
556 pci_mmcfg_config_num = 0; 555 free_all_mmcfg();
556 entries = 0;
557 i = header->length - sizeof(struct acpi_table_mcfg); 557 i = header->length - sizeof(struct acpi_table_mcfg);
558 while (i >= sizeof(struct acpi_mcfg_allocation)) { 558 while (i >= sizeof(struct acpi_mcfg_allocation)) {
559 ++pci_mmcfg_config_num; 559 entries++;
560 i -= sizeof(struct acpi_mcfg_allocation); 560 i -= sizeof(struct acpi_mcfg_allocation);
561 }; 561 };
562 if (pci_mmcfg_config_num == 0) { 562 if (entries == 0) {
563 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); 563 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
564 return -ENODEV; 564 return -ENODEV;
565 } 565 }
566 566
567 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); 567 cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
568 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); 568 for (i = 0; i < entries; i++) {
569 if (!pci_mmcfg_config) { 569 cfg = &cfg_table[i];
570 printk(KERN_WARNING PREFIX 570 if (acpi_mcfg_check_entry(mcfg, cfg)) {
571 "No memory for MCFG config tables\n"); 571 free_all_mmcfg();
572 return -ENOMEM;
573 }
574
575 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
576
577 acpi_mcfg_oem_check(mcfg);
578
579 for (i = 0; i < pci_mmcfg_config_num; ++i) {
580 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
581 !acpi_mcfg_64bit_base_addr) {
582 printk(KERN_ERR PREFIX
583 "MMCONFIG not in low 4GB of memory\n");
584 kfree(pci_mmcfg_config);
585 pci_mmcfg_config_num = 0;
586 return -ENODEV; 572 return -ENODEV;
587 } 573 }
574
575 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
576 cfg->end_bus_number, cfg->address) == NULL) {
577 printk(KERN_WARNING PREFIX
578 "no memory for MCFG entries\n");
579 free_all_mmcfg();
580 return -ENOMEM;
581 }
588 } 582 }
589 583
590 return 0; 584 return 0;
@@ -614,9 +608,7 @@ static void __init __pci_mmcfg_init(int early)
614 608
615 pci_mmcfg_reject_broken(early); 609 pci_mmcfg_reject_broken(early);
616 610
617 if ((pci_mmcfg_config_num == 0) || 611 if (list_empty(&pci_mmcfg_list))
618 (pci_mmcfg_config == NULL) ||
619 (pci_mmcfg_config[0].address == 0))
620 return; 612 return;
621 613
622 if (pci_mmcfg_arch_init()) 614 if (pci_mmcfg_arch_init())
@@ -648,9 +640,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
648 */ 640 */
649 if ((pci_mmcfg_resources_inserted == 1) || 641 if ((pci_mmcfg_resources_inserted == 1) ||
650 (pci_probe & PCI_PROBE_MMCONF) == 0 || 642 (pci_probe & PCI_PROBE_MMCONF) == 0 ||
651 (pci_mmcfg_config_num == 0) || 643 list_empty(&pci_mmcfg_list))
652 (pci_mmcfg_config == NULL) ||
653 (pci_mmcfg_config[0].address == 0))
654 return 1; 644 return 1;
655 645
656 /* 646 /*
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f10a7e94a84c..90d5fd476ed4 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -27,18 +27,10 @@ static int mmcfg_last_accessed_cpu;
27 */ 27 */
28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) 28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
29{ 29{
30 struct acpi_mcfg_allocation *cfg; 30 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
31 int cfg_num;
32
33 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
34 cfg = &pci_mmcfg_config[cfg_num];
35 if (cfg->pci_segment == seg &&
36 (cfg->start_bus_number <= bus) &&
37 (cfg->end_bus_number >= bus))
38 return cfg->address;
39 }
40 31
41 /* Fall back to type 0 */ 32 if (cfg)
33 return cfg->address;
42 return 0; 34 return 0;
43} 35}
44 36
@@ -47,7 +39,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
47 */ 39 */
48static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn) 40static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
49{ 41{
50 u32 dev_base = base | (bus << 20) | (devfn << 12); 42 u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12);
51 int cpu = smp_processor_id(); 43 int cpu = smp_processor_id();
52 if (dev_base != mmcfg_last_accessed_device || 44 if (dev_base != mmcfg_last_accessed_device ||
53 cpu != mmcfg_last_accessed_cpu) { 45 cpu != mmcfg_last_accessed_cpu) {
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 94349f8b2f96..e783841bd1d7 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -12,38 +12,15 @@
12#include <asm/e820.h> 12#include <asm/e820.h>
13#include <asm/pci_x86.h> 13#include <asm/pci_x86.h>
14 14
15/* Static virtual mapping of the MMCONFIG aperture */ 15#define PREFIX "PCI: "
16struct mmcfg_virt {
17 struct acpi_mcfg_allocation *cfg;
18 char __iomem *virt;
19};
20static struct mmcfg_virt *pci_mmcfg_virt;
21
22static char __iomem *get_virt(unsigned int seg, unsigned bus)
23{
24 struct acpi_mcfg_allocation *cfg;
25 int cfg_num;
26
27 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
28 cfg = pci_mmcfg_virt[cfg_num].cfg;
29 if (cfg->pci_segment == seg &&
30 (cfg->start_bus_number <= bus) &&
31 (cfg->end_bus_number >= bus))
32 return pci_mmcfg_virt[cfg_num].virt;
33 }
34
35 /* Fall back to type 0 */
36 return NULL;
37}
38 16
39static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 17static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
40{ 18{
41 char __iomem *addr; 19 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
42 20
43 addr = get_virt(seg, bus); 21 if (cfg && cfg->virt)
44 if (!addr) 22 return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
45 return NULL; 23 return NULL;
46 return addr + ((bus << 20) | (devfn << 12));
47} 24}
48 25
49static int pci_mmcfg_read(unsigned int seg, unsigned int bus, 26static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
@@ -109,42 +86,30 @@ static struct pci_raw_ops pci_mmcfg = {
109 .write = pci_mmcfg_write, 86 .write = pci_mmcfg_write,
110}; 87};
111 88
112static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) 89static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg)
113{ 90{
114 void __iomem *addr; 91 void __iomem *addr;
115 u64 start, size; 92 u64 start, size;
93 int num_buses;
116 94
117 start = cfg->start_bus_number; 95 start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
118 start <<= 20; 96 num_buses = cfg->end_bus - cfg->start_bus + 1;
119 start += cfg->address; 97 size = PCI_MMCFG_BUS_OFFSET(num_buses);
120 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
121 size <<= 20;
122 addr = ioremap_nocache(start, size); 98 addr = ioremap_nocache(start, size);
123 if (addr) { 99 if (addr)
124 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", 100 addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
125 start, start + size - 1);
126 addr -= cfg->start_bus_number << 20;
127 }
128 return addr; 101 return addr;
129} 102}
130 103
131int __init pci_mmcfg_arch_init(void) 104int __init pci_mmcfg_arch_init(void)
132{ 105{
133 int i; 106 struct pci_mmcfg_region *cfg;
134 pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) *
135 pci_mmcfg_config_num, GFP_KERNEL);
136 if (pci_mmcfg_virt == NULL) {
137 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
138 return 0;
139 }
140 107
141 for (i = 0; i < pci_mmcfg_config_num; ++i) { 108 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
142 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; 109 cfg->virt = mcfg_ioremap(cfg);
143 pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); 110 if (!cfg->virt) {
144 if (!pci_mmcfg_virt[i].virt) { 111 printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
145 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " 112 &cfg->res);
146 "segment %d\n",
147 pci_mmcfg_config[i].pci_segment);
148 pci_mmcfg_arch_free(); 113 pci_mmcfg_arch_free();
149 return 0; 114 return 0;
150 } 115 }
@@ -155,19 +120,12 @@ int __init pci_mmcfg_arch_init(void)
155 120
156void __init pci_mmcfg_arch_free(void) 121void __init pci_mmcfg_arch_free(void)
157{ 122{
158 int i; 123 struct pci_mmcfg_region *cfg;
159
160 if (pci_mmcfg_virt == NULL)
161 return;
162 124
163 for (i = 0; i < pci_mmcfg_config_num; ++i) { 125 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
164 if (pci_mmcfg_virt[i].virt) { 126 if (cfg->virt) {
165 iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); 127 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
166 pci_mmcfg_virt[i].virt = NULL; 128 cfg->virt = NULL;
167 pci_mmcfg_virt[i].cfg = NULL;
168 } 129 }
169 } 130 }
170
171 kfree(pci_mmcfg_virt);
172 pci_mmcfg_virt = NULL;
173} 131}
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 8aa85f17667e..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
18#include <asm/mce.h> 18#include <asm/mce.h>
19#include <asm/xcr.h> 19#include <asm/xcr.h>
20#include <asm/suspend.h> 20#include <asm/suspend.h>
21#include <asm/debugreg.h>
21 22
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23static struct saved_context saved_context; 24static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
142#endif 143#endif
143 load_TR_desc(); /* This does ltr */ 144 load_TR_desc(); /* This does ltr */
144 load_LDT(&current->active_mm->context); /* This does lldt */ 145 load_LDT(&current->active_mm->context); /* This does lldt */
145
146 /*
147 * Now maybe reload the debug registers
148 */
149 if (current->thread.debugreg7) {
150#ifdef CONFIG_X86_32
151 set_debugreg(current->thread.debugreg0, 0);
152 set_debugreg(current->thread.debugreg1, 1);
153 set_debugreg(current->thread.debugreg2, 2);
154 set_debugreg(current->thread.debugreg3, 3);
155 /* no 4 and 5 */
156 set_debugreg(current->thread.debugreg6, 6);
157 set_debugreg(current->thread.debugreg7, 7);
158#else
159 /* CONFIG_X86_64 */
160 loaddebug(&current->thread, 0);
161 loaddebug(&current->thread, 1);
162 loaddebug(&current->thread, 2);
163 loaddebug(&current->thread, 3);
164 /* no 4 and 5 */
165 loaddebug(&current->thread, 6);
166 loaddebug(&current->thread, 7);
167#endif
168 }
169
170} 146}
171 147
172/** 148/**
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644
index 000000000000..f82082677337
--- /dev/null
+++ b/arch/x86/tools/Makefile
@@ -0,0 +1,31 @@
1PHONY += posttest
2
3ifeq ($(KBUILD_VERBOSE),1)
4 posttest_verbose = -v
5else
6 posttest_verbose =
7endif
8
9ifeq ($(CONFIG_64BIT),y)
10 posttest_64bit = -y
11else
12 posttest_64bit = -n
13endif
14
15distill_awk = $(srctree)/arch/x86/tools/distill.awk
16chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
17
18quiet_cmd_posttest = TEST $@
19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
20
21posttest: $(obj)/test_get_len vmlinux
22 $(call cmd,posttest)
23
24hostprogs-y := test_get_len
25
26# -I needed for generated C source and C source which in the kernel tree.
27HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
28
29# Dependencies are also needed.
30$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
31
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644
index 000000000000..fd1ab80be0de
--- /dev/null
+++ b/arch/x86/tools/chkobjdump.awk
@@ -0,0 +1,33 @@
1# GNU objdump version checker
2#
3# Usage:
4# objdump -v | awk -f chkobjdump.awk
5BEGIN {
6 # objdump version 2.19 or later is OK for the test.
7 od_ver = 2;
8 od_sver = 19;
9}
10
11/^GNU objdump/ {
12 verstr = ""
13 for (i = 3; i <= NF; i++)
14 if (match($(i), "^[0-9]")) {
15 verstr = $(i);
16 break;
17 }
18 if (verstr == "") {
19 printf("Warning: Failed to find objdump version number.\n");
20 exit 0;
21 }
22 split(verstr, ver, ".");
23 if (ver[1] > od_ver ||
24 (ver[1] == od_ver && ver[2] >= od_sver)) {
25 exit 1;
26 } else {
27 printf("Warning: objdump version %s is older than %d.%d\n",
28 verstr, od_ver, od_sver);
29 print("Warning: Skipping posttest.");
30 # Logic is inverted, because we just skip test without error.
31 exit 0;
32 }
33}
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644
index 000000000000..c13c0ee48ab4
--- /dev/null
+++ b/arch/x86/tools/distill.awk
@@ -0,0 +1,47 @@
1#!/bin/awk -f
2# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
3# Distills the disassembly as follows:
4# - Removes all lines except the disassembled instructions.
5# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
6# into a single line.
7# - Remove bad(or prefix only) instructions
8
9BEGIN {
10 prev_addr = ""
11 prev_hex = ""
12 prev_mnemonic = ""
13 bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
14 fwait_expr = "^9b "
15 fwait_str="9b\tfwait"
16}
17
18/^ *[0-9a-f]+ <[^>]*>:/ {
19 # Symbol entry
20 printf("%s%s\n", $2, $1)
21}
22
23/^ *[0-9a-f]+:/ {
24 if (split($0, field, "\t") < 3) {
25 # This is a continuation of the same insn.
26 prev_hex = prev_hex field[2]
27 } else {
28 # Skip bad instructions
29 if (match(prev_mnemonic, bad_expr))
30 prev_addr = ""
31 # Split fwait from other f* instructions
32 if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
33 printf "%s\t%s\n", prev_addr, fwait_str
34 sub(fwait_expr, "", prev_hex)
35 }
36 if (prev_addr != "")
37 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
38 prev_addr = field[1]
39 prev_hex = field[2]
40 prev_mnemonic = field[3]
41 }
42}
43
44END {
45 if (prev_addr != "")
46 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
47}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644
index 000000000000..eaf11f52fc0b
--- /dev/null
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -0,0 +1,378 @@
1#!/bin/awk -f
2# gen-insn-attr-x86.awk: Instruction attribute table generator
3# Written by Masami Hiramatsu <mhiramat@redhat.com>
4#
5# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
6
7# Awk implementation sanity check
8function check_awk_implement() {
9 if (sprintf("%x", 0) != "0")
10 return "Your awk has a printf-format problem."
11 return ""
12}
13
14# Clear working vars
15function clear_vars() {
16 delete table
17 delete lptable2
18 delete lptable1
19 delete lptable3
20 eid = -1 # escape id
21 gid = -1 # group id
22 aid = -1 # AVX id
23 tname = ""
24}
25
26BEGIN {
27 # Implementation error checking
28 awkchecked = check_awk_implement()
29 if (awkchecked != "") {
30 print "Error: " awkchecked > "/dev/stderr"
31 print "Please try to use gawk." > "/dev/stderr"
32 exit 1
33 }
34
35 # Setup generating tables
36 print "/* x86 opcode map generated from x86-opcode-map.txt */"
37 print "/* Do not change this code. */\n"
38 ggid = 1
39 geid = 1
40 gaid = 0
41 delete etable
42 delete gtable
43 delete atable
44
45 opnd_expr = "^[A-Za-z/]"
46 ext_expr = "^\\("
47 sep_expr = "^\\|$"
48 group_expr = "^Grp[0-9A-Za-z]+"
49
50 imm_expr = "^[IJAO][a-z]"
51 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
52 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
53 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
54 imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
55 imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
56 imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
57 imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
58 imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
59 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
60 imm_flag["Ob"] = "INAT_MOFFSET"
61 imm_flag["Ov"] = "INAT_MOFFSET"
62
63 modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
64 force64_expr = "\\([df]64\\)"
65 rex_expr = "^REX(\\.[XRWB]+)*"
66 fpu_expr = "^ESC" # TODO
67
68 lprefix1_expr = "\\(66\\)"
69 lprefix2_expr = "\\(F3\\)"
70 lprefix3_expr = "\\(F2\\)"
71 max_lprefix = 4
72
73 vexok_expr = "\\(VEX\\)"
74 vexonly_expr = "\\(oVEX\\)"
75
76 prefix_expr = "\\(Prefix\\)"
77 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
78 prefix_num["REPNE"] = "INAT_PFX_REPNE"
79 prefix_num["REP/REPE"] = "INAT_PFX_REPE"
80 prefix_num["LOCK"] = "INAT_PFX_LOCK"
81 prefix_num["SEG=CS"] = "INAT_PFX_CS"
82 prefix_num["SEG=DS"] = "INAT_PFX_DS"
83 prefix_num["SEG=ES"] = "INAT_PFX_ES"
84 prefix_num["SEG=FS"] = "INAT_PFX_FS"
85 prefix_num["SEG=GS"] = "INAT_PFX_GS"
86 prefix_num["SEG=SS"] = "INAT_PFX_SS"
87 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
88 prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
89 prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
90
91 clear_vars()
92}
93
94function semantic_error(msg) {
95 print "Semantic error at " NR ": " msg > "/dev/stderr"
96 exit 1
97}
98
99function debug(msg) {
100 print "DEBUG: " msg
101}
102
103function array_size(arr, i,c) {
104 c = 0
105 for (i in arr)
106 c++
107 return c
108}
109
110/^Table:/ {
111 print "/* " $0 " */"
112 if (tname != "")
113 semantic_error("Hit Table: before EndTable:.");
114}
115
116/^Referrer:/ {
117 if (NF != 1) {
118 # escape opcode table
119 ref = ""
120 for (i = 2; i <= NF; i++)
121 ref = ref $i
122 eid = escape[ref]
123 tname = sprintf("inat_escape_table_%d", eid)
124 }
125}
126
127/^AVXcode:/ {
128 if (NF != 1) {
129 # AVX/escape opcode table
130 aid = $2
131 if (gaid <= aid)
132 gaid = aid + 1
133 if (tname == "") # AVX only opcode table
134 tname = sprintf("inat_avx_table_%d", $2)
135 }
136 if (aid == -1 && eid == -1) # primary opcode table
137 tname = "inat_primary_table"
138}
139
140/^GrpTable:/ {
141 print "/* " $0 " */"
142 if (!($2 in group))
143 semantic_error("No group: " $2 )
144 gid = group[$2]
145 tname = "inat_group_table_" gid
146}
147
148function print_table(tbl,name,fmt,n)
149{
150 print "const insn_attr_t " name " = {"
151 for (i = 0; i < n; i++) {
152 id = sprintf(fmt, i)
153 if (tbl[id])
154 print " [" id "] = " tbl[id] ","
155 }
156 print "};"
157}
158
159/^EndTable/ {
160 if (gid != -1) {
161 # print group tables
162 if (array_size(table) != 0) {
163 print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
164 "0x%x", 8)
165 gtable[gid,0] = tname
166 }
167 if (array_size(lptable1) != 0) {
168 print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
169 "0x%x", 8)
170 gtable[gid,1] = tname "_1"
171 }
172 if (array_size(lptable2) != 0) {
173 print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
174 "0x%x", 8)
175 gtable[gid,2] = tname "_2"
176 }
177 if (array_size(lptable3) != 0) {
178 print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
179 "0x%x", 8)
180 gtable[gid,3] = tname "_3"
181 }
182 } else {
183 # print primary/escaped tables
184 if (array_size(table) != 0) {
185 print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
186 "0x%02x", 256)
187 etable[eid,0] = tname
188 if (aid >= 0)
189 atable[aid,0] = tname
190 }
191 if (array_size(lptable1) != 0) {
192 print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
193 "0x%02x", 256)
194 etable[eid,1] = tname "_1"
195 if (aid >= 0)
196 atable[aid,1] = tname "_1"
197 }
198 if (array_size(lptable2) != 0) {
199 print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
200 "0x%02x", 256)
201 etable[eid,2] = tname "_2"
202 if (aid >= 0)
203 atable[aid,2] = tname "_2"
204 }
205 if (array_size(lptable3) != 0) {
206 print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
207 "0x%02x", 256)
208 etable[eid,3] = tname "_3"
209 if (aid >= 0)
210 atable[aid,3] = tname "_3"
211 }
212 }
213 print ""
214 clear_vars()
215}
216
217function add_flags(old,new) {
218 if (old && new)
219 return old " | " new
220 else if (old)
221 return old
222 else
223 return new
224}
225
226# convert operands to flags.
227function convert_operands(count,opnd, i,j,imm,mod)
228{
229 imm = null
230 mod = null
231 for (j = 1; j <= count; j++) {
232 i = opnd[j]
233 if (match(i, imm_expr) == 1) {
234 if (!imm_flag[i])
235 semantic_error("Unknown imm opnd: " i)
236 if (imm) {
237 if (i != "Ib")
238 semantic_error("Second IMM error")
239 imm = add_flags(imm, "INAT_SCNDIMM")
240 } else
241 imm = imm_flag[i]
242 } else if (match(i, modrm_expr))
243 mod = "INAT_MODRM"
244 }
245 return add_flags(imm, mod)
246}
247
248/^[0-9a-f]+\:/ {
249 if (NR == 1)
250 next
251 # get index
252 idx = "0x" substr($1, 1, index($1,":") - 1)
253 if (idx in table)
254 semantic_error("Redefine " idx " in " tname)
255
256 # check if escaped opcode
257 if ("escape" == $2) {
258 if ($3 != "#")
259 semantic_error("No escaped name")
260 ref = ""
261 for (i = 4; i <= NF; i++)
262 ref = ref $i
263 if (ref in escape)
264 semantic_error("Redefine escape (" ref ")")
265 escape[ref] = geid
266 geid++
267 table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
268 next
269 }
270
271 variant = null
272 # converts
273 i = 2
274 while (i <= NF) {
275 opcode = $(i++)
276 delete opnds
277 ext = null
278 flags = null
279 opnd = null
280 # parse one opcode
281 if (match($i, opnd_expr)) {
282 opnd = $i
283 count = split($(i++), opnds, ",")
284 flags = convert_operands(count, opnds)
285 }
286 if (match($i, ext_expr))
287 ext = $(i++)
288 if (match($i, sep_expr))
289 i++
290 else if (i < NF)
291 semantic_error($i " is not a separator")
292
293 # check if group opcode
294 if (match(opcode, group_expr)) {
295 if (!(opcode in group)) {
296 group[opcode] = ggid
297 ggid++
298 }
299 flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
300 }
301 # check force(or default) 64bit
302 if (match(ext, force64_expr))
303 flags = add_flags(flags, "INAT_FORCE64")
304
305 # check REX prefix
306 if (match(opcode, rex_expr))
307 flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
308
309 # check coprocessor escape : TODO
310 if (match(opcode, fpu_expr))
311 flags = add_flags(flags, "INAT_MODRM")
312
313 # check VEX only code
314 if (match(ext, vexonly_expr))
315 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
316
317 # check VEX only code
318 if (match(ext, vexok_expr))
319 flags = add_flags(flags, "INAT_VEXOK")
320
321 # check prefixes
322 if (match(ext, prefix_expr)) {
323 if (!prefix_num[opcode])
324 semantic_error("Unknown prefix: " opcode)
325 flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
326 }
327 if (length(flags) == 0)
328 continue
329 # check if last prefix
330 if (match(ext, lprefix1_expr)) {
331 lptable1[idx] = add_flags(lptable1[idx],flags)
332 variant = "INAT_VARIANT"
333 } else if (match(ext, lprefix2_expr)) {
334 lptable2[idx] = add_flags(lptable2[idx],flags)
335 variant = "INAT_VARIANT"
336 } else if (match(ext, lprefix3_expr)) {
337 lptable3[idx] = add_flags(lptable3[idx],flags)
338 variant = "INAT_VARIANT"
339 } else {
340 table[idx] = add_flags(table[idx],flags)
341 }
342 }
343 if (variant)
344 table[idx] = add_flags(table[idx],variant)
345}
346
347END {
348 if (awkchecked != "")
349 exit 1
350 # print escape opcode map's array
351 print "/* Escape opcode map array */"
352 print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \
353 "[INAT_LSTPFX_MAX + 1] = {"
354 for (i = 0; i < geid; i++)
355 for (j = 0; j < max_lprefix; j++)
356 if (etable[i,j])
357 print " ["i"]["j"] = "etable[i,j]","
358 print "};\n"
359 # print group opcode map's array
360 print "/* Group opcode map array */"
361 print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\
362 "[INAT_LSTPFX_MAX + 1] = {"
363 for (i = 0; i < ggid; i++)
364 for (j = 0; j < max_lprefix; j++)
365 if (gtable[i,j])
366 print " ["i"]["j"] = "gtable[i,j]","
367 print "};\n"
368 # print AVX opcode map's array
369 print "/* AVX opcode map array */"
370 print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\
371 "[INAT_LSTPFX_MAX + 1] = {"
372 for (i = 0; i < gaid; i++)
373 for (j = 0; j < max_lprefix; j++)
374 if (atable[i,j])
375 print " ["i"]["j"] = "atable[i,j]","
376 print "};"
377}
378
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644
index 000000000000..bee8d6ac2691
--- /dev/null
+++ b/arch/x86/tools/test_get_len.c
@@ -0,0 +1,173 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2009
17 */
18
19#include <stdlib.h>
20#include <stdio.h>
21#include <string.h>
22#include <assert.h>
23#include <unistd.h>
24
25#define unlikely(cond) (cond)
26
27#include <asm/insn.h>
28#include <inat.c>
29#include <insn.c>
30
31/*
32 * Test of instruction analysis in general and insn_get_length() in
33 * particular. See if insn_get_length() and the disassembler agree
34 * on the length of each instruction in an elf disassembly.
35 *
36 * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
37 */
38
39const char *prog;
40static int verbose;
41static int x86_64;
42
43static void usage(void)
44{
45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
46 " %s [-y|-n] [-v] \n", prog);
47 fprintf(stderr, "\t-y 64bit mode\n");
48 fprintf(stderr, "\t-n 32bit mode\n");
49 fprintf(stderr, "\t-v verbose mode\n");
50 exit(1);
51}
52
53static void malformed_line(const char *line, int line_nr)
54{
55 fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
56 exit(3);
57}
58
59static void dump_field(FILE *fp, const char *name, const char *indent,
60 struct insn_field *field)
61{
62 fprintf(fp, "%s.%s = {\n", indent, name);
63 fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
64 indent, field->value, field->bytes[0], field->bytes[1],
65 field->bytes[2], field->bytes[3]);
66 fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
67 field->got, field->nbytes);
68}
69
70static void dump_insn(FILE *fp, struct insn *insn)
71{
72 fprintf(fp, "Instruction = { \n");
73 dump_field(fp, "prefixes", "\t", &insn->prefixes);
74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
76 dump_field(fp, "opcode", "\t", &insn->opcode);
77 dump_field(fp, "modrm", "\t", &insn->modrm);
78 dump_field(fp, "sib", "\t", &insn->sib);
79 dump_field(fp, "displacement", "\t", &insn->displacement);
80 dump_field(fp, "immediate1", "\t", &insn->immediate1);
81 dump_field(fp, "immediate2", "\t", &insn->immediate2);
82 fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
83 insn->attr, insn->opnd_bytes, insn->addr_bytes);
84 fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
85 insn->length, insn->x86_64, insn->kaddr);
86}
87
88static void parse_args(int argc, char **argv)
89{
90 int c;
91 prog = argv[0];
92 while ((c = getopt(argc, argv, "ynv")) != -1) {
93 switch (c) {
94 case 'y':
95 x86_64 = 1;
96 break;
97 case 'n':
98 x86_64 = 0;
99 break;
100 case 'v':
101 verbose = 1;
102 break;
103 default:
104 usage();
105 }
106 }
107}
108
109#define BUFSIZE 256
110
111int main(int argc, char **argv)
112{
113 char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
114 unsigned char insn_buf[16];
115 struct insn insn;
116 int insns = 0;
117 int warnings = 0;
118
119 parse_args(argc, argv);
120
121 while (fgets(line, BUFSIZE, stdin)) {
122 char copy[BUFSIZE], *s, *tab1, *tab2;
123 int nb = 0;
124 unsigned int b;
125
126 if (line[0] == '<') {
127 /* Symbol line */
128 strcpy(sym, line);
129 continue;
130 }
131
132 insns++;
133 memset(insn_buf, 0, 16);
134 strcpy(copy, line);
135 tab1 = strchr(copy, '\t');
136 if (!tab1)
137 malformed_line(line, insns);
138 s = tab1 + 1;
139 s += strspn(s, " ");
140 tab2 = strchr(s, '\t');
141 if (!tab2)
142 malformed_line(line, insns);
143 *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
144 while (s < tab2) {
145 if (sscanf(s, "%x", &b) == 1) {
146 insn_buf[nb++] = (unsigned char) b;
147 s += 3;
148 } else
149 break;
150 }
151 /* Decode an instruction */
152 insn_init(&insn, insn_buf, x86_64);
153 insn_get_length(&insn);
154 if (insn.length != nb) {
155 warnings++;
156 fprintf(stderr, "Warning: %s found difference at %s\n",
157 prog, sym);
158 fprintf(stderr, "Warning: %s", line);
159 fprintf(stderr, "Warning: objdump says %d bytes, but "
160 "insn_get_length() says %d\n", nb,
161 insn.length);
162 if (verbose)
163 dump_insn(stderr, &insn);
164 }
165 }
166 if (warnings)
167 fprintf(stderr, "Warning: decoded and checked %d"
168 " instructions with %d warnings\n", insns, warnings);
169 else
170 fprintf(stderr, "Succeed: decoded and checked %d"
171 " instructions\n", insns);
172 return 0;
173}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f68b12..02b442e92007 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -393,7 +393,6 @@ static ctl_table abi_table2[] = {
393 393
394static ctl_table abi_root_table2[] = { 394static ctl_table abi_root_table2[] = {
395 { 395 {
396 .ctl_name = CTL_ABI,
397 .procname = "abi", 396 .procname = "abi",
398 .mode = 0555, 397 .mode = 0555,
399 .child = abi_table2 398 .child = abi_table2
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3439616d69f1..2b26dd5930c6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -27,7 +27,9 @@
27#include <linux/page-flags.h> 27#include <linux/page-flags.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/console.h> 29#include <linux/console.h>
30#include <linux/pci.h>
30 31
32#include <xen/xen.h>
31#include <xen/interface/xen.h> 33#include <xen/interface/xen.h>
32#include <xen/interface/version.h> 34#include <xen/interface/version.h>
33#include <xen/interface/physdev.h> 35#include <xen/interface/physdev.h>
@@ -138,24 +140,23 @@ static void xen_vcpu_setup(int cpu)
138 */ 140 */
139void xen_vcpu_restore(void) 141void xen_vcpu_restore(void)
140{ 142{
141 if (have_vcpu_info_placement) { 143 int cpu;
142 int cpu;
143 144
144 for_each_online_cpu(cpu) { 145 for_each_online_cpu(cpu) {
145 bool other_cpu = (cpu != smp_processor_id()); 146 bool other_cpu = (cpu != smp_processor_id());
146 147
147 if (other_cpu && 148 if (other_cpu &&
148 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) 149 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
149 BUG(); 150 BUG();
150 151
151 xen_vcpu_setup(cpu); 152 xen_setup_runstate_info(cpu);
152 153
153 if (other_cpu && 154 if (have_vcpu_info_placement)
154 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) 155 xen_vcpu_setup(cpu);
155 BUG();
156 }
157 156
158 BUG_ON(!have_vcpu_info_placement); 157 if (other_cpu &&
158 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
159 BUG();
159 } 160 }
160} 161}
161 162
@@ -178,6 +179,7 @@ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
178static void xen_cpuid(unsigned int *ax, unsigned int *bx, 179static void xen_cpuid(unsigned int *ax, unsigned int *bx,
179 unsigned int *cx, unsigned int *dx) 180 unsigned int *cx, unsigned int *dx)
180{ 181{
182 unsigned maskebx = ~0;
181 unsigned maskecx = ~0; 183 unsigned maskecx = ~0;
182 unsigned maskedx = ~0; 184 unsigned maskedx = ~0;
183 185
@@ -185,9 +187,16 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
185 * Mask out inconvenient features, to try and disable as many 187 * Mask out inconvenient features, to try and disable as many
186 * unsupported kernel subsystems as possible. 188 * unsupported kernel subsystems as possible.
187 */ 189 */
188 if (*ax == 1) { 190 switch (*ax) {
191 case 1:
189 maskecx = cpuid_leaf1_ecx_mask; 192 maskecx = cpuid_leaf1_ecx_mask;
190 maskedx = cpuid_leaf1_edx_mask; 193 maskedx = cpuid_leaf1_edx_mask;
194 break;
195
196 case 0xb:
197 /* Suppress extended topology stuff */
198 maskebx = 0;
199 break;
191 } 200 }
192 201
193 asm(XEN_EMULATE_PREFIX "cpuid" 202 asm(XEN_EMULATE_PREFIX "cpuid"
@@ -197,6 +206,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
197 "=d" (*dx) 206 "=d" (*dx)
198 : "0" (*ax), "2" (*cx)); 207 : "0" (*ax), "2" (*cx));
199 208
209 *bx &= maskebx;
200 *cx &= maskecx; 210 *cx &= maskecx;
201 *dx &= maskedx; 211 *dx &= maskedx;
202} 212}
@@ -1075,6 +1085,8 @@ asmlinkage void __init xen_start_kernel(void)
1075 * Set up some pagetable state before starting to set any ptes. 1085 * Set up some pagetable state before starting to set any ptes.
1076 */ 1086 */
1077 1087
1088 xen_init_mmu_ops();
1089
1078 /* Prevent unwanted bits from being set in PTEs. */ 1090 /* Prevent unwanted bits from being set in PTEs. */
1079 __supported_pte_mask &= ~_PAGE_GLOBAL; 1091 __supported_pte_mask &= ~_PAGE_GLOBAL;
1080 if (!xen_initial_domain()) 1092 if (!xen_initial_domain())
@@ -1082,10 +1094,8 @@ asmlinkage void __init xen_start_kernel(void)
1082 1094
1083 __supported_pte_mask |= _PAGE_IOMAP; 1095 __supported_pte_mask |= _PAGE_IOMAP;
1084 1096
1085#ifdef CONFIG_X86_64
1086 /* Work out if we support NX */ 1097 /* Work out if we support NX */
1087 check_efer(); 1098 x86_configure_nx();
1088#endif
1089 1099
1090 xen_setup_features(); 1100 xen_setup_features();
1091 1101
@@ -1099,7 +1109,6 @@ asmlinkage void __init xen_start_kernel(void)
1099 */ 1109 */
1100 xen_setup_stackprotector(); 1110 xen_setup_stackprotector();
1101 1111
1102 xen_init_mmu_ops();
1103 xen_init_irq_ops(); 1112 xen_init_irq_ops();
1104 xen_init_cpuid_mask(); 1113 xen_init_cpuid_mask();
1105 1114
@@ -1168,10 +1177,16 @@ asmlinkage void __init xen_start_kernel(void)
1168 add_preferred_console("xenboot", 0, NULL); 1177 add_preferred_console("xenboot", 0, NULL);
1169 add_preferred_console("tty", 0, NULL); 1178 add_preferred_console("tty", 0, NULL);
1170 add_preferred_console("hvc", 0, NULL); 1179 add_preferred_console("hvc", 0, NULL);
1180 } else {
1181 /* Make sure ACS will be enabled */
1182 pci_request_acs();
1171 } 1183 }
1184
1172 1185
1173 xen_raw_console_write("about to get started...\n"); 1186 xen_raw_console_write("about to get started...\n");
1174 1187
1188 xen_setup_runstate_info(0);
1189
1175 /* Start the world */ 1190 /* Start the world */
1176#ifdef CONFIG_X86_32 1191#ifdef CONFIG_X86_32
1177 i386_start_kernel(); 1192 i386_start_kernel();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3bf7b1d250ce..bf4cd6bfe959 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -185,7 +185,7 @@ static inline unsigned p2m_index(unsigned long pfn)
185} 185}
186 186
187/* Build the parallel p2m_top_mfn structures */ 187/* Build the parallel p2m_top_mfn structures */
188static void __init xen_build_mfn_list_list(void) 188void xen_build_mfn_list_list(void)
189{ 189{
190 unsigned pfn, idx; 190 unsigned pfn, idx;
191 191
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 1167d9830f5f..563d20504988 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -73,7 +73,7 @@ static __cpuinit void cpu_bringup(void)
73 73
74 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
75 75
76 cpu_set(cpu, cpu_online_map); 76 set_cpu_online(cpu, true);
77 percpu_write(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
78 wmb(); 78 wmb();
79 79
@@ -296,6 +296,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
296 (unsigned long)task_stack_page(idle) - 296 (unsigned long)task_stack_page(idle) -
297 KERNEL_STACK_OFFSET + THREAD_SIZE; 297 KERNEL_STACK_OFFSET + THREAD_SIZE;
298#endif 298#endif
299 xen_setup_runstate_info(cpu);
299 xen_setup_timer(cpu); 300 xen_setup_timer(cpu);
300 xen_init_lock_cpu(cpu); 301 xen_init_lock_cpu(cpu);
301 302
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 36a5141108df..24ded31b5aec 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -120,14 +120,14 @@ struct xen_spinlock {
120 unsigned short spinners; /* count of waiting cpus */ 120 unsigned short spinners; /* count of waiting cpus */
121}; 121};
122 122
123static int xen_spin_is_locked(struct raw_spinlock *lock) 123static int xen_spin_is_locked(struct arch_spinlock *lock)
124{ 124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126 126
127 return xl->lock != 0; 127 return xl->lock != 0;
128} 128}
129 129
130static int xen_spin_is_contended(struct raw_spinlock *lock) 130static int xen_spin_is_contended(struct arch_spinlock *lock)
131{ 131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133 133
@@ -136,7 +136,7 @@ static int xen_spin_is_contended(struct raw_spinlock *lock)
136 return xl->spinners != 0; 136 return xl->spinners != 0;
137} 137}
138 138
139static int xen_spin_trylock(struct raw_spinlock *lock) 139static int xen_spin_trylock(struct arch_spinlock *lock)
140{ 140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1; 142 u8 old = 1;
@@ -181,7 +181,7 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
181 __get_cpu_var(lock_spinners) = prev; 181 __get_cpu_var(lock_spinners) = prev;
182} 182}
183 183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) 184static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
185{ 185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev; 187 struct xen_spinlock *prev;
@@ -254,7 +254,7 @@ out:
254 return ret; 254 return ret;
255} 255}
256 256
257static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) 257static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
258{ 258{
259 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 259 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
260 unsigned timeout; 260 unsigned timeout;
@@ -291,12 +291,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
291 spin_time_accum_total(start_spin); 291 spin_time_accum_total(start_spin);
292} 292}
293 293
294static void xen_spin_lock(struct raw_spinlock *lock) 294static void xen_spin_lock(struct arch_spinlock *lock)
295{ 295{
296 __xen_spin_lock(lock, false); 296 __xen_spin_lock(lock, false);
297} 297}
298 298
299static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) 299static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
300{ 300{
301 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); 301 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
302} 302}
@@ -317,7 +317,7 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
317 } 317 }
318} 318}
319 319
320static void xen_spin_unlock(struct raw_spinlock *lock) 320static void xen_spin_unlock(struct arch_spinlock *lock)
321{ 321{
322 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 322 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
323 323
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 95be7b434724..987267f79bf5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,4 +1,5 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include <linux/clockchips.h>
2 3
3#include <xen/interface/xen.h> 4#include <xen/interface/xen.h>
4#include <xen/grant_table.h> 5#include <xen/grant_table.h>
@@ -27,6 +28,8 @@ void xen_pre_suspend(void)
27 28
28void xen_post_suspend(int suspend_cancelled) 29void xen_post_suspend(int suspend_cancelled)
29{ 30{
31 xen_build_mfn_list_list();
32
30 xen_setup_shared_info(); 33 xen_setup_shared_info();
31 34
32 if (suspend_cancelled) { 35 if (suspend_cancelled) {
@@ -44,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled)
44 47
45} 48}
46 49
50static void xen_vcpu_notify_restore(void *data)
51{
52 unsigned long reason = (unsigned long)data;
53
54 /* Boot processor notified via generic timekeeping_resume() */
55 if ( smp_processor_id() == 0)
56 return;
57
58 clockevents_notify(reason, NULL);
59}
60
47void xen_arch_resume(void) 61void xen_arch_resume(void)
48{ 62{
49 /* nothing */ 63 smp_call_function(xen_vcpu_notify_restore,
64 (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
50} 65}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 26e37b787ad3..0d3f07cd1b5f 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -100,7 +100,7 @@ bool xen_vcpu_stolen(int vcpu)
100 return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; 100 return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
101} 101}
102 102
103static void setup_runstate_info(int cpu) 103void xen_setup_runstate_info(int cpu)
104{ 104{
105 struct vcpu_register_runstate_memory_area area; 105 struct vcpu_register_runstate_memory_area area;
106 106
@@ -434,7 +434,7 @@ void xen_setup_timer(int cpu)
434 name = "<timer kasprintf failed>"; 434 name = "<timer kasprintf failed>";
435 435
436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
437 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, 437 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
438 name, NULL); 438 name, NULL);
439 439
440 evt = &per_cpu(xen_clock_events, cpu); 440 evt = &per_cpu(xen_clock_events, cpu);
@@ -442,8 +442,6 @@ void xen_setup_timer(int cpu)
442 442
443 evt->cpumask = cpumask_of(cpu); 443 evt->cpumask = cpumask_of(cpu);
444 evt->irq = irq; 444 evt->irq = irq;
445
446 setup_runstate_info(cpu);
447} 445}
448 446
449void xen_teardown_timer(int cpu) 447void xen_teardown_timer(int cpu)
@@ -494,6 +492,7 @@ __init void xen_time_init(void)
494 492
495 setup_force_cpu_cap(X86_FEATURE_TSC); 493 setup_force_cpu_cap(X86_FEATURE_TSC);
496 494
495 xen_setup_runstate_info(cpu);
497 xen_setup_timer(cpu); 496 xen_setup_timer(cpu);
498 xen_setup_cpu_clockevents(); 497 xen_setup_cpu_clockevents();
499} 498}
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 02f496a8dbaa..53adefda4275 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -96,7 +96,7 @@ ENTRY(xen_sysret32)
96 pushq $__USER32_CS 96 pushq $__USER32_CS
97 pushq %rcx 97 pushq %rcx
98 98
99 pushq $VGCF_in_syscall 99 pushq $0
1001: jmp hypercall_iret 1001: jmp hypercall_iret
101ENDPATCH(xen_sysret32) 101ENDPATCH(xen_sysret32)
102RELOC(xen_sysret32, 1b+1) 102RELOC(xen_sysret32, 1b+1)
@@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target)
151ENTRY(xen_sysenter_target) 151ENTRY(xen_sysenter_target)
152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */ 152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */
153 mov $-ENOSYS, %rax 153 mov $-ENOSYS, %rax
154 pushq $VGCF_in_syscall 154 pushq $0
155 jmp hypercall_iret 155 jmp hypercall_iret
156ENDPROC(xen_syscall32_target) 156ENDPROC(xen_syscall32_target)
157ENDPROC(xen_sysenter_target) 157ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 355fa6b99c9c..f9153a300bce 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
25 25
26void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
27void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_build_mfn_list_list(void);
28void xen_setup_machphys_mapping(void); 29void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 31void xen_ident_map_ISA(void);
@@ -41,6 +42,7 @@ void __init xen_build_dynamic_phys_to_machine(void);
41 42
42void xen_init_irq_ops(void); 43void xen_init_irq_ops(void);
43void xen_setup_timer(int cpu); 44void xen_setup_timer(int cpu);
45void xen_setup_runstate_info(int cpu);
44void xen_teardown_timer(int cpu); 46void xen_teardown_timer(int cpu);
45cycle_t xen_clocksource_read(void); 47cycle_t xen_clocksource_read(void);
46void xen_setup_cpu_clockevents(void); 48void xen_setup_cpu_clockevents(void);